├── LICENSE ├── RDPG.py ├── README.md ├── bleu.py ├── data_iterator.py ├── insepection.py ├── layers.py ├── mteval.sh ├── nmt_uni.py ├── noisy_translator.py ├── noisytrans_training.py ├── optimizer.py ├── policy.py ├── pretrain_uni.py ├── reward.py ├── run_eval.sh ├── run_train.sh ├── translate.sh ├── translate_uni.py └── utils.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Jiatao Gu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /RDPG.py: -------------------------------------------------------------------------------- 1 | """ 2 | -- Recurrent Deterministic Policy Gradient 3 | """ 4 | 5 | from nmt_uni import * 6 | 7 | import os 8 | import time, datetime 9 | import cPickle as pkl 10 | 11 | 12 | class RDPG(object): 13 | 14 | def __init__(self, 15 | trng, options, policy, config, 16 | n_in=None, n_out=None, 17 | recurrent=False, id=None): 18 | 19 | self.trng = trng 20 | self.options = options 21 | self.policy = policy 22 | self.recurrent = recurrent 23 | 24 | self.n_hidden = 512 25 | self.n_in = n_in 26 | self.n_out = n_out 27 | 28 | self.rec = 'lngru' 29 | if not n_in: 30 | self.n_in = options['readout_dim'] 31 | 32 | # ------------------------------------------------------------------------------ 33 | print 'policy network initialization' 34 | 35 | params = OrderedDict() 36 | if not self.recurrent: 37 | print 'building a feed-forward controller' 38 | params = get_layer('ff')[0](options, params, prefix='policy_net_in', 39 | nin=self.n_in, nout=self.n_hidden, scale=0.001) 40 | else: 41 | print 'building a recurrent controller' 42 | params = get_layer(self.rec)[0](options, params, prefix='policy_net_in', 43 | nin=self.n_in, dim=self.n_hidden, scale=0.001) 44 | 45 | params = get_layer('ff')[0](options, params, prefix='policy_net_out', 46 | nin=self.n_hidden, 47 | nout=self.n_out, 48 | scale=0.001) 49 | 50 | # -------------------------------------------------------------------------------- 51 | print 'critic network initialization (RNN)' 52 | params_b = OrderedDict() 53 | params_b = get_layer(self.rec)[0](options, params_b, prefix='critic_net_in', 54 | nin=self.n_in + self.n_out, 55 | dim=self.n_hidden, scale=0.001) 56 | params_b = get_layer('ff')[0](options, params_b, prefix='critic_net_out', 57 | nin=self.n_hidden, 58 | nout=1, 59 | scale=0.001) 60 | if id is not None: 61 | print 
'reload the saved model: {}'.format(id) 62 | params = load_params('.policy/{}-{}.current.npz'.format(id, self.policy['base']), params) 63 | params_b = load_params('.policy/{}-{}.current.npz'.format(id, self.policy['base']), params_b) 64 | else: 65 | id = datetime.datetime.fromtimestamp(time.time()).strftime('%y%m%d-%H%M%S') 66 | print 'start from a new model: {}'.format(id) 67 | 68 | with open('.config/conf.{}.txt'.format(id), 'w') as f: 69 | f.write('[config]\n') 70 | 71 | for c in config: 72 | f.write('{}: {}\n'.format(c, config[c])) 73 | f.write('\n') 74 | 75 | f.write('[policy]\n') 76 | 77 | for c in policy: 78 | f.write('{}: {}\n'.format(c, policy[c])) 79 | 80 | # pkl.dump([policy, config], open('.config/{}.conf'.format(id), 'w')) 81 | print 'save the config file' 82 | 83 | self.id = id 84 | self.model = '.policy/{}-{}'.format(id, self.policy['base']) 85 | 86 | # theano shared params 87 | self.tparams = init_tparams(params) 88 | self.tparams_b = init_tparams(params_b) 89 | 90 | # build the policy network 91 | self.build_actor(options=options) 92 | self.build_discriminator(options=options) 93 | 94 | def build_actor(self, options): 95 | # ============================================================================= # 96 | # Actor from Policy Network 97 | # ============================================================================= # 98 | observation = tensor.matrix('observation', dtype='float32') # batch_size x readout_dim (seq_steps=1) 99 | prev_hidden = tensor.matrix('p_hidden', dtype='float32') 100 | 101 | if not self.recurrent: 102 | hiddens = get_layer('ff')[1](self.tparams, observation, 103 | options, prefix='policy_net_in', 104 | activ='tanh') 105 | else: 106 | hiddens = get_layer(self.rec)[1](self.tparams, observation, 107 | options, prefix='policy_net_in', mask=None, 108 | one_step=True, _init_state=prev_hidden)[0] 109 | 110 | act_inps = [observation, prev_hidden] 111 | act_outs = get_layer('ff')[1](self.tparams, hiddens, options, 112 | prefix='policy_net_out', 113 | activ='tanh' 114 | ) 115 | print 'build action function [Deterministic]' 116 | self.f_action = theano.function(act_inps, act_outs, 117 | on_unused_input='ignore') # action/dist/hiddens 118 | print 'done.' 
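    # --------------------------------------------------------------------- #
    # Illustrative sketch (not part of the original code): with the
    # feed-forward controller, f_action above computes a deterministic
    # action as a = tanh(tanh(o . W_in + b_in) . W_out + b_out).  The toy
    # helper below mirrors that computation in plain numpy on dummy
    # weights; the name and the shapes are assumptions for illustration only.
    @staticmethod
    def _toy_deterministic_action(obs, W_in, b_in, W_out, b_out):
        import numpy
        hidden = numpy.tanh(numpy.dot(obs, W_in) + b_in)      # policy_net_in
        return numpy.tanh(numpy.dot(hidden, W_out) + b_out)   # policy_net_out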
119 | 120 | 121 | def build_discriminator(self, options): 122 | # ============================================================================= # 123 | # Build for End-t-End learning 124 | # ============================================================================= # 125 | observations = tensor.tensor3('observations', dtype='float32') 126 | mask = tensor.matrix('mask', dtype='float32') 127 | targets = tensor.vector('targets', dtype='float32') 128 | 129 | print 'build actor' 130 | if not self.recurrent: 131 | hiddens = get_layer('ff')[1](self.tparams, observations, 132 | options, prefix='policy_net_in', 133 | activ='tanh') 134 | else: 135 | hiddens = get_layer(self.rec)[1](self.tparams, observations, 136 | options, prefix='policy_net_in', mask=mask)[0] 137 | actions = get_layer('ff')[1](self.tparams, hiddens, options, prefix='policy_net_out', 138 | activ='tanh') # seq_steps x batch_size x n_out 139 | 140 | print 'build critic' 141 | state_action = concatenate([observations, actions], axis=-1) 142 | hiddens_b = get_layer(self.rec)[1](self.tparams_b, state_action, 143 | options, prefix='critic_net_in', mask=mask)[0] 144 | values = get_layer('ff')[1](self.tparams_b, hiddens_b, options, 145 | prefix='critic_net_out', 146 | activ='tanh')[-1, :, 0] # (batch_size, ) 147 | 148 | # =============================================================================== # 149 | # Build Deterministic Policy Gradient [Actor Parts] 150 | # =============================================================================== # 151 | inps_A = [observations, mask] 152 | loss_A = -tensor.mean(values) 153 | grad_A = tensor.grad(loss_A, wrt=itemlist(self.tparams)) 154 | grad_A = grad_clip(grad_A) 155 | outs_A = [loss_A, actions] 156 | 157 | # optimizer: Adam 158 | lr = tensor.scalar(name='lr') 159 | f_A, f_Aup = adam(lr, self.tparams, grad_A, inps_A, outs_A) 160 | 161 | # =============================================================================== # 162 | # Build Deterministic Policy Gradient [Critic Parts] 163 | # =============================================================================== # 164 | inps_B = [observations, mask, actions, targets] 165 | loss_B = tensor.mean((values - targets) ** 2) 166 | grad_B = tensor.grad(loss_B, wrt=itemlist(self.tparams_b)) 167 | grad_B = grad_clip(grad_B) 168 | outs_B = [loss_B] 169 | 170 | # optimizer: Adam 171 | lr = tensor.scalar(name='lr') 172 | f_B, f_Bup = adam(lr, self.tparams_b, grad_B, inps_B, outs_B) 173 | 174 | self.f_learner = [f_A, f_Aup, f_B, f_Bup] 175 | print 'done.' 
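    # --------------------------------------------------------------------- #
    # Usage sketch (an assumption, not part of the original code): each call
    # to adam() above returns a (gradient, update) function pair, so one
    # alternating RDPG step would look roughly like
    #
    #     f_A, f_Aup, f_B, f_Bup = agent.f_learner
    #     critic_loss = f_B(observations, mask, actions, returns)  # fit Q
    #     f_Bup(lrate)
    #     actor_loss, actions = f_A(observations, mask)            # -mean(Q)
    #     f_Aup(lrate)
    #
    # where `returns` are the episode returns used as critic regression
    # targets and `lrate` is the learning rate; the exact call signatures
    # depend on the Adam implementation in optimizer.py.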
176 | 177 | 178 | 179 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NMT-RDPG 2 | Neural machine translation with Recurrent Deterministic Policy Gradient 3 | -------------------------------------------------------------------------------- /bleu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Natural Language Toolkit: BLEU Score 3 | # 4 | # Copyright (C) 2001-2016 NLTK Project 5 | # Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim 6 | # Contributors: Dmitrijs Milajevs, Liling Tan 7 | # URL: 8 | # For license information, see LICENSE.TXT 9 | 10 | """BLEU score implementation.""" 11 | from __future__ import division 12 | 13 | import math 14 | import fractions 15 | from collections import Counter 16 | 17 | from nltk.util import ngrams 18 | 19 | try: 20 | fractions.Fraction(0, 1000, _normalize=False) 21 | from fractions import Fraction 22 | except TypeError: 23 | from nltk.compat import Fraction 24 | 25 | 26 | def sentence_bleu(references, hypothesis, weights=(0.25, 0.25, 0.25, 0.25), 27 | smoothing_function=None): 28 | """ 29 | Calculate BLEU score (Bilingual Evaluation Understudy) from 30 | Papineni, Kishore, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. 31 | "BLEU: a method for automatic evaluation of machine translation." 32 | In Proceedings of ACL. http://www.aclweb.org/anthology/P02-1040.pdf 33 | 34 | >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 35 | ... 'ensures', 'that', 'the', 'military', 'always', 36 | ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] 37 | 38 | >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops', 39 | ... 'forever', 'hearing', 'the', 'activity', 'guidebook', 40 | ... 'that', 'party', 'direct'] 41 | 42 | >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 43 | ... 'ensures', 'that', 'the', 'military', 'will', 'forever', 44 | ... 'heed', 'Party', 'commands'] 45 | 46 | >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', 47 | ... 'guarantees', 'the', 'military', 'forces', 'always', 48 | ... 'being', 'under', 'the', 'command', 'of', 'the', 49 | ... 'Party'] 50 | 51 | >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', 52 | ... 'army', 'always', 'to', 'heed', 'the', 'directions', 53 | ... 'of', 'the', 'party'] 54 | 55 | >>> sentence_bleu([reference1, reference2, reference3], hypothesis1) # doctest: +ELLIPSIS 56 | 0.5045... 57 | 58 | >>> sentence_bleu([reference1, reference2, reference3], hypothesis2) # doctest: +ELLIPSIS 59 | 0.3969... 60 | 61 | The default BLEU calculates a score for up to 4grams using uniform 62 | weights. To evaluate your translations with higher/lower order ngrams, 63 | use customized weights. E.g. when accounting for up to 6grams with uniform 64 | weights: 65 | 66 | >>> weights = (0.1666, 0.1666, 0.1666, 0.1666, 0.1666) 67 | >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) 68 | 0.45838627164939455 69 | 70 | :param references: reference sentences 71 | :type references: list(list(str)) 72 | :param hypothesis: a hypothesis sentence 73 | :type hypothesis: list(str) 74 | :param weights: weights for unigrams, bigrams, trigrams and so on 75 | :type weights: list(float) 76 | :return: The sentence-level BLEU score. 
77 | :rtype: float 78 | """ 79 | return corpus_bleu([references], [hypothesis], weights, smoothing_function) 80 | 81 | 82 | def corpus_bleu(list_of_references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25), 83 | smoothing_function=None): 84 | """ 85 | Calculate a single corpus-level BLEU score (aka. system-level BLEU) for all 86 | the hypotheses and their respective references. 87 | 88 | Instead of averaging the sentence level BLEU scores (i.e. marco-average 89 | precision), the original BLEU metric (Papineni et al. 2002) accounts for 90 | the micro-average precision (i.e. summing the numerators and denominators 91 | for each hypothesis-reference(s) pairs before the division). 92 | 93 | >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 94 | ... 'ensures', 'that', 'the', 'military', 'always', 95 | ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] 96 | >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 97 | ... 'ensures', 'that', 'the', 'military', 'will', 'forever', 98 | ... 'heed', 'Party', 'commands'] 99 | >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which', 100 | ... 'guarantees', 'the', 'military', 'forces', 'always', 101 | ... 'being', 'under', 'the', 'command', 'of', 'the', 'Party'] 102 | >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', 103 | ... 'army', 'always', 'to', 'heed', 'the', 'directions', 104 | ... 'of', 'the', 'party'] 105 | 106 | >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was', 107 | ... 'interested', 'in', 'world', 'history'] 108 | >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history', 109 | ... 'because', 'he', 'read', 'the', 'book'] 110 | 111 | >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]] 112 | >>> hypotheses = [hyp1, hyp2] 113 | >>> corpus_bleu(list_of_references, hypotheses) # doctest: +ELLIPSIS 114 | 0.5920... 115 | 116 | The example below show that corpus_bleu() is different from averaging 117 | sentence_bleu() for hypotheses 118 | 119 | >>> score1 = sentence_bleu([ref1a, ref1b, ref1c], hyp1) 120 | >>> score2 = sentence_bleu([ref2a], hyp2) 121 | >>> (score1 + score2) / 2 # doctest: +ELLIPSIS 122 | 0.6223... 123 | 124 | :param references: a corpus of lists of reference sentences, w.r.t. hypotheses 125 | :type references: list(list(list(str))) 126 | :param hypotheses: a list of hypothesis sentences 127 | :type hypotheses: list(list(str)) 128 | :param weights: weights for unigrams, bigrams, trigrams and so on 129 | :type weights: list(float) 130 | :return: The corpus-level BLEU score. 131 | :rtype: float 132 | """ 133 | # Before proceeding to compute BLEU, perform sanity checks. 134 | 135 | p_numerators = Counter() # Key = ngram order, and value = no. of ngram matches. 136 | p_denominators = Counter() # Key = ngram order, and value = no. of ngram in ref. 137 | hyp_lengths, ref_lengths = 0, 0 138 | 139 | assert len(list_of_references) == len( 140 | hypotheses), "The number of hypotheses and their reference(s) should be the same" 141 | 142 | # Iterate through each hypothesis and their corresponding references. 143 | for references, hypothesis in zip(list_of_references, hypotheses): 144 | # For each order of ngram, calculate the numerator and 145 | # denominator for the corpus-level modified precision. 146 | for i, _ in enumerate(weights, start=1): 147 | p_i = modified_precision(references, hypothesis, i) 148 | p_numerators[i] += p_i.numerator 149 | p_denominators[i] += p_i.denominator 150 | 151 | # Calculate the hypothesis length and the closest reference length. 
152 | # Adds them to the corpus-level hypothesis and reference counts. 153 | hyp_len = len(hypothesis) 154 | hyp_lengths += hyp_len 155 | ref_lengths += closest_ref_length(references, hyp_len) 156 | 157 | # Calculate corpus-level brevity penalty. 158 | bp = brevity_penalty(ref_lengths, hyp_lengths) 159 | 160 | # Collects the various precision values for the different ngram orders. 161 | p_n = [Fraction(p_numerators[i], p_denominators[i], _normalize=False) 162 | for i, _ in enumerate(weights, start=1)] 163 | 164 | # Returns 0 if there's no matching n-grams 165 | # We only need to check for p_numerators[1] == 0, since if there's 166 | # no unigrams, there won't be any higher order ngrams. 167 | if p_numerators[1] == 0: 168 | return 0, 0 169 | 170 | # Smoothen the modified precision. 171 | # Note: smooth_precision() converts values into float. 172 | if not smoothing_function: 173 | smoothing_function = SmoothingFunction().method0 174 | p_n = smoothing_function(p_n, references=references, 175 | hypothesis=hypothesis, hyp_len=hyp_len) 176 | 177 | # Calculates the overall modified precision for all ngrams. 178 | # By sum of the product of the weights and the respective *p_n* 179 | s = (w * math.log(p_i) for w, p_i in zip(weights, p_n) 180 | if p_i.numerator != 0) 181 | 182 | # return bp * math.exp(math.fsum(s)) 183 | return math.exp(math.fsum(s)), bp * math.exp(math.fsum(s)) 184 | 185 | 186 | def modified_precision(references, hypothesis, n): 187 | """ 188 | Calculate modified ngram precision. 189 | 190 | The normal precision method may lead to some wrong translations with 191 | high-precision, e.g., the translation, in which a word of reference 192 | repeats several times, has very high precision. 193 | 194 | This function only returns the Fraction object that contains the numerator 195 | and denominator necessary to calculate the corpus-level precision. 196 | To calculate the modified precision for a single pair of hypothesis and 197 | references, cast the Fraction object into a float. 198 | 199 | The famous "the the the ... " example shows that you can get BLEU precision 200 | by duplicating high frequency words. 201 | 202 | >>> reference1 = 'the cat is on the mat'.split() 203 | >>> reference2 = 'there is a cat on the mat'.split() 204 | >>> hypothesis1 = 'the the the the the the the'.split() 205 | >>> references = [reference1, reference2] 206 | >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS 207 | 0.2857... 208 | 209 | In the modified n-gram precision, a reference word will be considered 210 | exhausted after a matching hypothesis word is identified, e.g. 211 | 212 | >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 213 | ... 'ensures', 'that', 'the', 'military', 'will', 214 | ... 'forever', 'heed', 'Party', 'commands'] 215 | >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', 216 | ... 'guarantees', 'the', 'military', 'forces', 'always', 217 | ... 'being', 'under', 'the', 'command', 'of', 'the', 218 | ... 'Party'] 219 | >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', 220 | ... 'army', 'always', 'to', 'heed', 'the', 'directions', 221 | ... 
'of', 'the', 'party'] 222 | >>> hypothesis = 'of the'.split() 223 | >>> references = [reference1, reference2, reference3] 224 | >>> float(modified_precision(references, hypothesis, n=1)) 225 | 1.0 226 | >>> float(modified_precision(references, hypothesis, n=2)) 227 | 1.0 228 | 229 | An example of a normal machine translation hypothesis: 230 | 231 | >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 232 | ... 'ensures', 'that', 'the', 'military', 'always', 233 | ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] 234 | 235 | >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops', 236 | ... 'forever', 'hearing', 'the', 'activity', 'guidebook', 237 | ... 'that', 'party', 'direct'] 238 | 239 | >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 240 | ... 'ensures', 'that', 'the', 'military', 'will', 241 | ... 'forever', 'heed', 'Party', 'commands'] 242 | 243 | >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', 244 | ... 'guarantees', 'the', 'military', 'forces', 'always', 245 | ... 'being', 'under', 'the', 'command', 'of', 'the', 246 | ... 'Party'] 247 | 248 | >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', 249 | ... 'army', 'always', 'to', 'heed', 'the', 'directions', 250 | ... 'of', 'the', 'party'] 251 | >>> references = [reference1, reference2, reference3] 252 | >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS 253 | 0.9444... 254 | >>> float(modified_precision(references, hypothesis2, n=1)) # doctest: +ELLIPSIS 255 | 0.5714... 256 | >>> float(modified_precision(references, hypothesis1, n=2)) # doctest: +ELLIPSIS 257 | 0.5882352941176471 258 | >>> float(modified_precision(references, hypothesis2, n=2)) # doctest: +ELLIPSIS 259 | 0.07692... 260 | 261 | 262 | :param references: A list of reference translations. 263 | :type references: list(list(str)) 264 | :param hypothesis: A hypothesis translation. 265 | :type hypothesis: list(str) 266 | :param n: The ngram order. 267 | :type n: int 268 | :return: BLEU's modified precision for the nth order ngram. 269 | :rtype: Fraction 270 | """ 271 | # Extracts all ngrams in hypothesis. 272 | counts = Counter(ngrams(hypothesis, n)) 273 | 274 | # Extract a union of references' counts. 275 | ## max_counts = reduce(or_, [Counter(ngrams(ref, n)) for ref in references]) 276 | max_counts = {} 277 | for reference in references: 278 | reference_counts = Counter(ngrams(reference, n)) 279 | for ngram in counts: 280 | max_counts[ngram] = max(max_counts.get(ngram, 0), 281 | reference_counts[ngram]) 282 | 283 | # Assigns the intersection between hypothesis and references' counts. 284 | clipped_counts = {ngram: min(count, max_counts[ngram]) 285 | for ngram, count in counts.items()} 286 | 287 | numerator = sum(clipped_counts.values()) 288 | # Ensures that denominator is minimum 1 to avoid ZeroDivisionError. 289 | # Usually this happens when the ngram order is > len(reference). 290 | denominator = max(1, sum(counts.values())) 291 | 292 | return Fraction(numerator, denominator, _normalize=False) 293 | 294 | 295 | def closest_ref_length(references, hyp_len): 296 | """ 297 | This function finds the reference that is the closest length to the 298 | hypothesis. The closest reference length is referred to as *r* variable 299 | from the brevity penalty formula in Papineni et. al. (2002) 300 | 301 | :param references: A list of reference translations. 302 | :type references: list(list(str)) 303 | :param hypothesis: The length of the hypothesis. 
304 | :type hypothesis: int 305 | :return: The length of the reference that's closest to the hypothesis. 306 | :rtype: int 307 | """ 308 | ref_lens = (len(reference) for reference in references) 309 | closest_ref_len = min(ref_lens, key=lambda ref_len: 310 | (abs(ref_len - hyp_len), ref_len)) 311 | return closest_ref_len 312 | 313 | 314 | def brevity_penalty(closest_ref_len, hyp_len): 315 | """ 316 | Calculate brevity penalty. 317 | 318 | As the modified n-gram precision still has the problem from the short 319 | length sentence, brevity penalty is used to modify the overall BLEU 320 | score according to length. 321 | 322 | An example from the paper. There are three references with length 12, 15 323 | and 17. And a concise hypothesis of the length 12. The brevity penalty is 1. 324 | 325 | >>> reference1 = list('aaaaaaaaaaaa') # i.e. ['a'] * 12 326 | >>> reference2 = list('aaaaaaaaaaaaaaa') # i.e. ['a'] * 15 327 | >>> reference3 = list('aaaaaaaaaaaaaaaaa') # i.e. ['a'] * 17 328 | >>> hypothesis = list('aaaaaaaaaaaa') # i.e. ['a'] * 12 329 | >>> references = [reference1, reference2, reference3] 330 | >>> hyp_len = len(hypothesis) 331 | >>> closest_ref_len = closest_ref_length(references, hyp_len) 332 | >>> brevity_penalty(closest_ref_len, hyp_len) 333 | 1.0 334 | 335 | In case a hypothesis translation is shorter than the references, penalty is 336 | applied. 337 | 338 | >>> references = [['a'] * 28, ['a'] * 28] 339 | >>> hypothesis = ['a'] * 12 340 | >>> hyp_len = len(hypothesis) 341 | >>> closest_ref_len = closest_ref_length(references, hyp_len) 342 | >>> brevity_penalty(closest_ref_len, hyp_len) 343 | 0.2635971381157267 344 | 345 | The length of the closest reference is used to compute the penalty. If the 346 | length of a hypothesis is 12, and the reference lengths are 13 and 2, the 347 | penalty is applied because the hypothesis length (12) is less then the 348 | closest reference length (13). 349 | 350 | >>> references = [['a'] * 13, ['a'] * 2] 351 | >>> hypothesis = ['a'] * 12 352 | >>> hyp_len = len(hypothesis) 353 | >>> closest_ref_len = closest_ref_length(references, hyp_len) 354 | >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS 355 | 0.9200... 356 | 357 | The brevity penalty doesn't depend on reference order. More importantly, 358 | when two reference sentences are at the same distance, the shortest 359 | reference sentence length is used. 360 | 361 | >>> references = [['a'] * 13, ['a'] * 11] 362 | >>> hypothesis = ['a'] * 12 363 | >>> hyp_len = len(hypothesis) 364 | >>> closest_ref_len = closest_ref_length(references, hyp_len) 365 | >>> bp1 = brevity_penalty(closest_ref_len, hyp_len) 366 | >>> hyp_len = len(hypothesis) 367 | >>> closest_ref_len = closest_ref_length(reversed(references), hyp_len) 368 | >>> bp2 = brevity_penalty(closest_ref_len, hyp_len) 369 | >>> bp1 == bp2 == 1 370 | True 371 | 372 | A test example from mteval-v13a.pl (starting from the line 705): 373 | 374 | >>> references = [['a'] * 11, ['a'] * 8] 375 | >>> hypothesis = ['a'] * 7 376 | >>> hyp_len = len(hypothesis) 377 | >>> closest_ref_len = closest_ref_length(references, hyp_len) 378 | >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS 379 | 0.8668... 
380 | 381 | >>> references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7] 382 | >>> hypothesis = ['a'] * 7 383 | >>> hyp_len = len(hypothesis) 384 | >>> closest_ref_len = closest_ref_length(references, hyp_len) 385 | >>> brevity_penalty(closest_ref_len, hyp_len) 386 | 1.0 387 | 388 | :param hyp_len: The length of the hypothesis for a single sentence OR the 389 | sum of all the hypotheses' lengths for a corpus 390 | :type hyp_len: int 391 | :param closest_ref_len: The length of the closest reference for a single 392 | hypothesis OR the sum of all the closest references for every hypotheses. 393 | :type closest_reference_len: int 394 | :return: BLEU's brevity penalty. 395 | :rtype: float 396 | """ 397 | if hyp_len > closest_ref_len: 398 | return 1 399 | # If hypothesis is empty, brevity penalty = 0 should result in BLEU = 0.0 400 | elif hyp_len == 0: 401 | return 0 402 | else: 403 | return math.exp(1 - closest_ref_len / hyp_len) 404 | 405 | 406 | class SmoothingFunction: 407 | """ 408 | This is an implementation of the smoothing techniques 409 | for segment-level BLEU scores that was presented in 410 | Boxing Chen and Collin Cherry (2014) A Systematic Comparison of 411 | Smoothing Techniques for Sentence-Level BLEU. In WMT14. 412 | http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf 413 | """ 414 | 415 | def __init__(self, epsilon=0.1, alpha=5, k=5): 416 | """ 417 | This will initialize the parameters required for the various smoothing 418 | techniques, the default values are set to the numbers used in the 419 | experiments from Chen and Cherry (2014). 420 | 421 | >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures', 422 | ... 'that', 'the', 'military', 'always', 'obeys', 'the', 423 | ... 'commands', 'of', 'the', 'party'] 424 | >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures', 425 | ... 'that', 'the', 'military', 'will', 'forever', 'heed', 426 | ... 'Party', 'commands'] 427 | 428 | >>> chencherry = SmoothingFunction() 429 | >>> print (sentence_bleu([reference1], hypothesis1)) # doctest: +ELLIPSIS 430 | 0.4118... 431 | >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method0)) # doctest: +ELLIPSIS 432 | 0.4118... 433 | >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method1)) # doctest: +ELLIPSIS 434 | 0.4118... 435 | >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method2)) # doctest: +ELLIPSIS 436 | 0.4489... 437 | >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method3)) # doctest: +ELLIPSIS 438 | 0.4118... 439 | >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method4)) # doctest: +ELLIPSIS 440 | 0.4118... 441 | >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method5)) # doctest: +ELLIPSIS 442 | 0.4905... 443 | >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method6)) # doctest: +ELLIPSIS 444 | 0.1801... 445 | >>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method7)) # doctest: +ELLIPSIS 446 | 0.4905... 
447 | 448 | :param epsilon: the epsilon value use in method 1 449 | :type epsilon: float 450 | :param alpha: the alpha value use in method 6 451 | :type alpha: int 452 | :param k: the k value use in method 4 453 | :type k: int 454 | """ 455 | self.epsilon = epsilon 456 | self.alpha = alpha 457 | self.k = k 458 | 459 | def method0(self, p_n, *args, **kwargs): 460 | """ No smoothing. """ 461 | return p_n 462 | 463 | def method1(self, p_n, *args, **kwargs): 464 | """ 465 | Smoothing method 1: Add *epsilon* counts to precision with 0 counts. 466 | """ 467 | return [(p_i.numerator + self.epsilon) / p_i.denominator 468 | if p_i.numerator == 0 else p_i for p_i in p_n] 469 | 470 | def method2(self, p_n, *args, **kwargs): 471 | """ 472 | Smoothing method 2: Add 1 to both numerator and denominator from 473 | Chin-Yew Lin and Franz Josef Och (2004) Automatic evaluation of 474 | machine translation quality using longest common subsequence and 475 | skip-bigram statistics. In ACL04. 476 | """ 477 | return [Fraction(p_i.numerator + 1, p_i.denominator + 1, _normalize=False) for p_i in p_n] 478 | 479 | def method3(self, p_n, *args, **kwargs): 480 | """ 481 | Smoothing method 3: NIST geometric sequence smoothing 482 | The smoothing is computed by taking 1 / ( 2^k ), instead of 0, for each 483 | precision score whose matching n-gram count is null. 484 | k is 1 for the first 'n' value for which the n-gram match count is null/ 485 | For example, if the text contains: 486 | - one 2-gram match 487 | - and (consequently) two 1-gram matches 488 | the n-gram count for each individual precision score would be: 489 | - n=1 => prec_count = 2 (two unigrams) 490 | - n=2 => prec_count = 1 (one bigram) 491 | - n=3 => prec_count = 1/2 (no trigram, taking 'smoothed' value of 1 / ( 2^k ), with k=1) 492 | - n=4 => prec_count = 1/4 (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2) 493 | """ 494 | incvnt = 1 # From the mteval-v13a.pl, it's referred to as k. 495 | for i, p_i in enumerate(p_n): 496 | if p_i.numerator == 0: 497 | p_n[i] = 1 / (2 ** incvnt * p_i.denominator) 498 | incvnt += 1 499 | return p_n 500 | 501 | def method4(self, p_n, references, hypothesis, hyp_len): 502 | """ 503 | Smoothing method 4: 504 | Shorter translations may have inflated precision values due to having 505 | smaller denominators; therefore, we give them proportionally 506 | smaller smoothed counts. Instead of scaling to 1/(2^k), Chen and Cherry 507 | suggests dividing by 1/ln(len(T)), where T is the length of the translation. 508 | """ 509 | incvnt = 1 510 | for i, p_i in enumerate(p_n): 511 | if p_i.numerator == 0 and hyp_len != 0: 512 | p_n[i] = incvnt * self.k / math.log(hyp_len) # Note that this K is different from the K from NIST. 513 | incvnt += 1 514 | return p_n 515 | 516 | def method5(self, p_n, references, hypothesis, hyp_len): 517 | """ 518 | Smoothing method 5: 519 | The matched counts for similar values of n should be similar. To a 520 | calculate the n-gram matched count, it averages the n−1, n and n+1 gram 521 | matched counts. 522 | """ 523 | m = {} 524 | # Requires an precision value for an addition ngram order. 
525 | p_n_plus1 = p_n + [modified_precision(references, hypothesis, 5)] 526 | m[-1] = p_n[0] + 1 527 | for i, p_i in enumerate(p_n): 528 | p_n[i] = (m[i - 1] + p_i + p_n_plus1[i + 1]) / 3 529 | m[i] = p_n[i] 530 | return p_n 531 | 532 | def method6(self, p_n, references, hypothesis, hyp_len): 533 | """ 534 | Smoothing method 6: 535 | Interpolates the maximum likelihood estimate of the precision *p_n* with 536 | a prior estimate *pi0*. The prior is estimated by assuming that the ratio 537 | between pn and pn−1 will be the same as that between pn−1 and pn−2. 538 | """ 539 | for i, p_i in enumerate(p_n): 540 | if i in [1, 2]: # Skips the first 2 orders of ngrams. 541 | continue 542 | else: 543 | pi0 = 0 if p_n[i - 2] == 0 else p_n[i - 1] ** 2 / p_n[i - 2] 544 | # No. of ngrams in translation. 545 | l = sum(1 for _ in ngrams(hypothesis, i + 1)) 546 | p_n[i] = (p_i + self.alpha * pi0) / (l + self.alpha) 547 | return p_n 548 | 549 | def method7(self, p_n, references, hypothesis, hyp_len): 550 | """ 551 | Smoothing method 6: 552 | Interpolates the maximum likelihood estimate of the precision *p_n* with 553 | a prior estimate *pi0*. The prior is estimated by assuming that the ratio 554 | between pn and pn−1 will be the same as that between pn−1 and pn−2. 555 | """ 556 | p_n = self.method4(p_n, references, hypothesis, hyp_len) 557 | p_n = self.method5(p_n, references, hypothesis, hyp_len) 558 | return p_n -------------------------------------------------------------------------------- /data_iterator.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | import cPickle as pkl 4 | import gzip 5 | 6 | 7 | def fopen(filename, mode='r'): 8 | if filename.endswith('.gz'): 9 | return gzip.open(filename, mode) 10 | return open(filename, mode) 11 | 12 | 13 | class TextIterator: 14 | """Simple Bitext iterator.""" 15 | def __init__(self, source, target, 16 | source_dict, target_dict, 17 | batch_size=128, 18 | maxlen=100, 19 | n_words_source=-1, 20 | n_words_target=-1, 21 | cache=5): 22 | 23 | self.source = fopen(source, 'r') 24 | self.target = fopen(target, 'r') 25 | 26 | print 'scan the dataset.' 27 | for si, _ in enumerate(self.source): 28 | pass 29 | for ti, _ in enumerate(self.target): 30 | pass 31 | 32 | self.source.close() 33 | self.target.close() 34 | 35 | assert si == ti, 'the number of the source and target document must the same' 36 | print 'scanned {} lines'.format(si) 37 | 38 | self.source = fopen(source, 'r') 39 | self.target = fopen(target, 'r') 40 | 41 | with open(source_dict, 'rb') as f: 42 | self.source_dict = pkl.load(f) 43 | with open(target_dict, 'rb') as f: 44 | self.target_dict = pkl.load(f) 45 | 46 | self.num = si 47 | self.batch_size = batch_size 48 | self.maxlen = maxlen 49 | 50 | self.n_words_source = n_words_source 51 | self.n_words_target = n_words_target 52 | 53 | self.source_buffer = [] 54 | self.target_buffer = [] 55 | self.k = batch_size * cache 56 | 57 | self.end_of_data = False 58 | 59 | 60 | 61 | 62 | def __iter__(self): 63 | return self 64 | 65 | def reset(self): 66 | self.source.seek(0) 67 | self.target.seek(0) 68 | 69 | def next(self): 70 | if self.end_of_data: 71 | self.end_of_data = False 72 | self.reset() 73 | raise StopIteration 74 | 75 | source = [] 76 | target = [] 77 | 78 | # fill buffer, if it's empty 79 | assert len(self.source_buffer) == len(self.target_buffer), 'Buffer size mismatch!' 
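        # Buffering strategy: when the buffer is empty, read up to
        # k = batch_size * cache sentence pairs from the two files, sort
        # the cache by target length so that each minibatch contains
        # sentences of similar length (minimising padding), then pop
        # pairs off the end of the buffer to assemble the next batch.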
80 | 81 | if len(self.source_buffer) == 0: 82 | for k_ in xrange(self.k): 83 | ss = self.source.readline() 84 | if ss == "": 85 | break 86 | tt = self.target.readline() 87 | if tt == "": 88 | break 89 | 90 | self.source_buffer.append(ss.strip().split()) 91 | self.target_buffer.append(tt.strip().split()) 92 | 93 | # sort by target buffer 94 | tlen = numpy.array([len(t) for t in self.target_buffer]) 95 | tidx = tlen.argsort() 96 | 97 | _sbuf = [self.source_buffer[i] for i in tidx] 98 | _tbuf = [self.target_buffer[i] for i in tidx] 99 | 100 | self.source_buffer = _sbuf 101 | self.target_buffer = _tbuf 102 | 103 | if len(self.source_buffer) == 0 or len(self.target_buffer) == 0: 104 | self.end_of_data = False 105 | self.reset() 106 | raise StopIteration 107 | 108 | try: 109 | 110 | # actual work here 111 | while True: 112 | 113 | # read from source file and map to word index 114 | try: 115 | ss = self.source_buffer.pop() 116 | except IndexError: 117 | break 118 | ss = [self.source_dict[w] if w in self.source_dict else 1 119 | for w in ss] 120 | if self.n_words_source > 0: 121 | ss = [w if w < self.n_words_source else 1 for w in ss] 122 | 123 | # read from source file and map to word index 124 | tt = self.target_buffer.pop() 125 | tt = [self.target_dict[w] if w in self.target_dict else 1 126 | for w in tt] 127 | if self.n_words_target > 0: 128 | tt = [w if w < self.n_words_target else 1 for w in tt] 129 | 130 | if len(ss) > self.maxlen and len(tt) > self.maxlen: 131 | continue 132 | 133 | source.append(ss) 134 | target.append(tt) 135 | 136 | if len(source) >= self.batch_size or \ 137 | len(target) >= self.batch_size: 138 | break 139 | except IOError: 140 | self.end_of_data = True 141 | 142 | if len(source) <= 0 or len(target) <= 0: 143 | self.end_of_data = False 144 | self.reset() 145 | raise StopIteration 146 | 147 | return source, target 148 | -------------------------------------------------------------------------------- /insepection.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import matplotlib 3 | # matplotlib.use('agg') 4 | import copy 5 | import numpy 6 | import os 7 | import seaborn as sns 8 | import pandas as pd 9 | sns.set(context="paper", font="monospace", style='whitegrid') 10 | from matplotlib import pyplot as plot 11 | from matplotlib import rc 12 | 13 | rc('font',**{'family':'Verdana', 'weight': 'normal'}) 14 | rc('font', size=8) 15 | rc('text', usetex=True) 16 | rc('text.latex',unicode=True) 17 | rc('text.latex',preamble='\usepackage[utf8]{inputenc}') 18 | rc('text.latex',preamble='\usepackage[russian]{babel}') 19 | rc('text.latex',preamble='\usepackage[german]{babel}') 20 | rc('text.latex',preamble='\usepackage[ngerman]{babel}') 21 | 22 | matplotlib.rcParams['ytick.labelsize'] = 11 23 | matplotlib.rcParams['xtick.labelsize'] = 11 24 | 25 | def heatmap(sources, refs, trans, actions, idx, atten=None, savefig=True, name='test', info=None, show=False): 26 | source = [s.strip() for s in sources[idx].decode('utf8').replace('@@', '--').split()] + ['||'] 27 | target = ['*'] + [s.strip() for s in trans[idx].decode('utf8').replace('@@', '--').split()] + ['||'] 28 | action = actions[idx] 29 | 30 | 31 | if atten: 32 | attention = numpy.array(atten[idx]) 33 | 34 | def track(acts, data, annote): 35 | x, y = 0, 0 36 | for a in acts: 37 | x += a 38 | y += 1 - a 39 | # print a, x, y, target[x].encode('utf8') 40 | data[y, x] = 1 41 | annote[y, x] = 'W' if a == 0 else 'C' 42 | 43 | return data, annote 44 | # print target 45 | 
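    # The grid below has one row per source token and one column per target
    # token.  `track` walks the action sequence: action 0 (annotated 'W')
    # moves down one source row (wait / read a source word), action 1
    # (annotated 'C') moves right one target column (commit a target word),
    # so the marked cells trace the READ/WRITE path of the simultaneous
    # decoder; the attention weights are overlaid afterwards when `atten`
    # is given.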
46 | data = numpy.zeros((len(source), len(target))) 47 | annote = numpy.chararray(data.shape, itemsize=8) 48 | annote[:] = '' 49 | data, annote = track(action, data, annote) 50 | data[0, 0] = 1 51 | annote[0, 0] = 'S' 52 | if atten: 53 | data[:-1, 1:] += attention.T 54 | 55 | d = pd.DataFrame(data=data, columns=target, index=source) 56 | # p = sns.diverging_palette(220, 10, as_cmap=True) 57 | f, ax = plot.subplots(figsize=(11, 11)) 58 | f.set_canvas(plot.gcf().canvas) 59 | g = sns.heatmap(d, ax=ax, annot=annote, fmt='s') 60 | g.xaxis.tick_top() 61 | 62 | plot.xticks(rotation=90) 63 | plot.yticks(rotation=0) 64 | # plot.show() 65 | if savefig: 66 | if not os.path.exists('.images/C_{}'.format(name)): 67 | os.mkdir('.images/C_{}'.format(name)) 68 | 69 | filename = 'Idx={}||'.format(info['index']) 70 | for w in info: 71 | if w is not 'index': 72 | filename += '.{}={:.2f}'.format(w, float(info[w])) 73 | 74 | print 'saving...' 75 | f.savefig('.images/C_{}'.format(name) + '/{}'.format(filename) + '.pdf', dpi=100) 76 | if show: 77 | plot.show() 78 | 79 | print 'plotting done.' 80 | plot.close() 81 | 82 | def heatmap2(sources, refs, trans, actions, idx, atten=None, full_atten=None, savefig=True, name='test', info=None, show=False): 83 | source = ['*'] + [s.strip() for s in sources[idx].decode('utf8').replace('@@', '--').split()] + ['||'] 84 | target = ['*'] + [s.strip() for s in trans[idx].decode('utf8').replace('@@', '--').split()] + ['||'] + ['*'] 85 | action = actions[idx] 86 | 87 | flag = 0 88 | if atten: 89 | attention = numpy.array(atten[idx]) 90 | else: 91 | attention = None 92 | 93 | if full_atten: 94 | fullatten = numpy.array(full_atten[idx]) 95 | else: 96 | fullatten = None 97 | 98 | def track(acts, data, annote): 99 | x, y, z = 0, 0, 0 100 | for a in acts: 101 | x += (a == 1) 102 | y += (a == 0) 103 | z += (a == 2) 104 | 105 | # data[y + 1, x] = 1 106 | # data[z, x + 1] = 1 107 | # annote[y, x] = 'W' if a == 0 else 'C' 108 | 109 | return data, annote 110 | # print target 111 | 112 | data = numpy.zeros((len(source), len(target))) 113 | annote = numpy.chararray(data.shape, itemsize=8) 114 | annote[:] = '' 115 | data, annote = track(action, data, annote) 116 | data[1, 0] = 1 117 | 118 | def draw(data_t, ax, attention=None): 119 | 120 | data = copy.copy(data_t) 121 | data[1:-1, 1:-1] += attention.T 122 | d = pd.DataFrame(data=data, columns=target, index=source) 123 | # p = sns.diverging_palette(220, 10, as_cmap=True) 124 | g = sns.heatmap(d, mask=(data==0), square=True, cbar=False, linewidths=0.1, ax=ax, annot=annote, fmt='s') 125 | g.xaxis.tick_top() 126 | 127 | for tick in ax.get_xticklabels(): 128 | tick.set_rotation(90) 129 | for tick in ax.get_yticklabels(): 130 | tick.set_rotation(0) 131 | 132 | ax.grid(True) 133 | f, [ax1, ax2] = plot.subplots(1, 2, figsize=(22, 11)) 134 | f.set_canvas(plot.gcf().canvas) 135 | 136 | draw(data, ax1, attention) 137 | # plot.xticks(rotation=90) 138 | # plot.yticks(rotation=0) 139 | # plot.grid() 140 | 141 | draw(data, ax2, fullatten) 142 | # plot.xticks(rotation=90) 143 | # plot.yticks(rotation=0) 144 | # plot.grid() 145 | 146 | 147 | if savefig: 148 | if not os.path.exists('.images/M_{}'.format(name)): 149 | os.mkdir('.images/M_{}'.format(name)) 150 | 151 | filename = 'Idx={}||'.format(info['index']) 152 | for w in info: 153 | if w is not 'index': 154 | filename += '.{}={:.2f}'.format(w, float(info[w])) 155 | 156 | print 'saving...' 
157 | plot.savefig('.images/M_{}'.format(name) + '/{}'.format(filename) + '.pdf', dpi=100) 158 | 159 | if show: 160 | plot.show() 161 | 162 | print 'plotting done.' 163 | plot.close() 164 | 165 | 166 | 167 | 168 | 169 | 170 | def visualize(sources, refs, trans, aligns, idx, savefig=True, name='test', info=None): 171 | 172 | colors = ['b', 'g'] 173 | 174 | fig = plot.figure(figsize=(20, 2)) 175 | ax = plot.gca() 176 | 177 | # plot.hold('on') 178 | 179 | plot.xlim([0., 10.]) 180 | 181 | scolors = [] 182 | caidx = 0 183 | coloridx = 0 184 | for sidx in xrange(len([s_.replace('@@', '--').strip() for s_ in sources[idx].split()] + [''])): 185 | if caidx >= len(numpy.unique(aligns[idx])) or sidx >= numpy.unique(aligns[idx])[caidx]: 186 | caidx = caidx + 1 187 | coloridx = 1 - coloridx 188 | scolors.append(colors[coloridx]) 189 | 190 | tcolors = [] 191 | lastidx = -1 192 | coloridx = 1 193 | for tt in aligns[idx]: 194 | if tt != lastidx: 195 | lastidx = tt 196 | coloridx = 1 - coloridx 197 | tcolors.append(colors[coloridx]) 198 | 199 | x, y = 0., 1. 200 | s_pos = [(x, y)] 201 | for ii, ss in enumerate([s_.replace('@@', '--').strip() for s_ in sources[idx].split()] + ['']): 202 | 203 | ss.replace('%', '\%') 204 | xx = plot.text(x, y, ss) 205 | xx.set_bbox(dict(color=scolors[ii], alpha=0.1, edgecolor=scolors[ii])) 206 | xx._renderer = fig.canvas.get_renderer() 207 | wext = xx.get_window_extent() 208 | bbox = ax.transData.inverted().transform(wext) 209 | x = bbox[1, 0] + 0. 210 | s_pos.append((x, y)) 211 | s_pos.append((bbox[1, 0], y)) 212 | 213 | x, y = 0., .95 214 | t_pos = [] 215 | for ii, ss in enumerate([s_.decode('utf8').replace('@@', '--') for s_ in trans[idx].split()]): 216 | 217 | ss.replace('%', '\%') 218 | xx = plot.text(x, y, ss) 219 | xx._renderer = fig.canvas.get_renderer() 220 | wext = xx.get_window_extent() 221 | bbox = ax.transData.inverted().transform(wext) 222 | t_pos.append((bbox[0, 0], bbox[0, 1] + 0.03)) 223 | x = bbox[1, 0] + 0. 
224 | t_pos.append((bbox[1, 0], bbox[0, 1] + 0.03)) 225 | 226 | lasttidx = 0 227 | lastidx = -1 228 | for tidx, sidx in enumerate(aligns[idx]): 229 | if lastidx != sidx: 230 | lastidx = sidx 231 | lasttidx = tidx 232 | sidx = numpy.minimum(sidx, len(s_pos) - 1) 233 | plot.arrow(s_pos[sidx][0], s_pos[sidx][1], 234 | t_pos[tidx][0] - s_pos[sidx][0], 235 | t_pos[tidx][1] - s_pos[sidx][1], 236 | head_width=0., head_length=0., 237 | fc=tcolors[tidx], ec=tcolors[tidx], 238 | linestyle='dotted', width=0.0001) 239 | for tt in xrange(tidx, len(aligns[idx])): 240 | if aligns[idx][tt] != sidx: 241 | plot.arrow(s_pos[sidx][0], s_pos[sidx][1], 242 | t_pos[tt][0] - s_pos[sidx][0], 243 | t_pos[tt][1] - s_pos[sidx][1], 244 | head_width=0., head_length=0., 245 | fc=tcolors[tidx], ec=tcolors[tidx], 246 | linestyle='dotted', width=0.0001) 247 | plot.fill_between([t_pos[tidx][0], s_pos[sidx][0], t_pos[tt][0]], 248 | [t_pos[tidx][1], s_pos[sidx][1], t_pos[tt][1]], 249 | facecolor=tcolors[tidx], alpha=0.1) 250 | break 251 | plot.arrow(s_pos[sidx][0], s_pos[sidx][1], 252 | t_pos[-1][0] - s_pos[sidx][0], 253 | t_pos[-1][1] - s_pos[sidx][1], 254 | head_width=0., head_length=0., 255 | fc=tcolors[-1], ec=tcolors[-1], 256 | linestyle='dotted', width=0.0001) 257 | plot.fill_between([t_pos[lasttidx][0], s_pos[sidx][0], t_pos[-1][0]], 258 | [t_pos[lasttidx][1], s_pos[sidx][1], t_pos[-1][1]], 259 | facecolor=tcolors[tidx], alpha=0.1) 260 | 261 | # plot.hold('off') 262 | 263 | plot.axis('off') 264 | plot.ylim([0.95, 1.01]) 265 | plot.tight_layout() 266 | 267 | if savefig: 268 | if not os.path.exists('.images/{}'.format(name)): 269 | os.mkdir('.images/{}'.format(name)) 270 | 271 | filename = 'Idx={}||'.format(info['index']) 272 | for w in info: 273 | if w is not 'index': 274 | filename += '.{}={:.2f}'.format(w, float(info[w])) 275 | 276 | plot.savefig('.images/{}'.format(name) + '/{}'.format(filename) + '.pdf', dpi=300) 277 | 278 | print 'plotting done.' 279 | plot.close() 280 | # plot.show() 281 | 282 | 283 | if __name__ == "__main__": 284 | 285 | sources = ['I cannot understand .'] 286 | targets = ['Ich verstehe nicht .'] 287 | actions = [[0, 0, 1, 1, 2, 0, 1, 2, 2, 0, 1]] 288 | heatmap2(sources, targets, targets, actions, 0, savefig=False, show=True) 289 | -------------------------------------------------------------------------------- /layers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Build the basic layers for neural machine translation 3 | """ 4 | import warnings 5 | import os 6 | import theano 7 | import theano.tensor as tensor 8 | import numpy 9 | 10 | from collections import OrderedDict 11 | 12 | profile = False 13 | TINY = 1e-7 14 | 15 | # -------------------------------------------------------------------------# 16 | # Basic utils: 17 | # push parameters to Theano shared variables 18 | def zipp(params, tparams): 19 | for kk, vv in params.iteritems(): 20 | tparams[kk].set_value(vv) 21 | 22 | 23 | # pull parameters from Theano shared variables 24 | def unzip(zipped, new_params=None): 25 | if new_params is None: 26 | new_params = OrderedDict() 27 | 28 | for kk, vv in zipped.iteritems(): 29 | new_params[kk] = vv.get_value() 30 | return new_params 31 | 32 | 33 | # flatten-grad 34 | def flatcat(arrays): 35 | ''' 36 | Flattens arrays and concatenates them in order. 
37 | ''' 38 | return tensor.concatenate([a.flatten() for a in arrays]) 39 | 40 | def flatgrad(loss, vars_): 41 | return flatcat(tensor.grad(loss, wrt=itemlist(vars_))) 42 | 43 | def zipsame(*seqs): 44 | L = len(seqs[0]) 45 | assert all(len(seq) == L for seq in seqs[1:]) 46 | return zip(*seqs) 47 | 48 | 49 | 50 | # ------------------------------------------------------------------------# 51 | # get the list of parameters: Note that tparams must be OrderedDict 52 | def itemlist(tparams, exception=None): 53 | if not exception: 54 | return [vv for kk, vv in tparams.iteritems()] 55 | 56 | return [vv for kk, vv in tparams.iteritems() if kk not in exception] 57 | 58 | # make prefix-appended name 59 | def _p(pp, name): 60 | return '%s_%s' % (pp, name) 61 | 62 | # initialize Theano shared variables according to the initial parameters 63 | def init_tparams(params): 64 | tparams = OrderedDict() 65 | for kk, pp in params.iteritems(): 66 | tparams[kk] = theano.shared(params[kk], name=kk) 67 | return tparams 68 | 69 | 70 | # load parameters 71 | def load_params(path, params): 72 | pp = numpy.load(path) 73 | for kk, vv in params.iteritems(): 74 | if kk not in pp: 75 | warnings.warn('%s is not in the archive' % kk) 76 | continue 77 | print 'loading {}: {}'.format(kk, pp[kk].shape) 78 | params[kk] = pp[kk] 79 | 80 | return params 81 | 82 | # lateral normalization 83 | def ln(x, b, s): 84 | _eps = 1e-5 85 | output = (x - x.mean(1)[:,None]) / tensor.sqrt((x.var(1)[:,None] + _eps)) 86 | output = s[None, :] * output + b[None,:] 87 | return output 88 | 89 | 90 | # -------------------------------------------------------------------------# 91 | # Layers: 92 | # 'layer-name': ('parameter initializer', 'computational graph') -- registeration 93 | layers = dict() 94 | layers['ff'] = ('param_init_fflayer', 'fflayer') 95 | layers['gru'] = ('param_init_gru', 'gru_layer') 96 | layers['gru_cond'] = ('param_init_gru_cond', 'gru_cond_layer', 'gru_cond_context', 'gru_cond_update') 97 | layers['lngru'] = ('param_init_lngru', 'lngru_layer') 98 | 99 | def get_layer(name): 100 | fns = layers[name] 101 | return (eval(fns[0]), eval(fns[1])) 102 | 103 | # some utilities 104 | def ortho_weight(ndim): 105 | W = numpy.random.randn(ndim, ndim) 106 | u, s, v = numpy.linalg.svd(W) 107 | return u.astype('float32') 108 | 109 | # norm initialization 110 | def norm_weight(nin, nout=None, scale=0.01, ortho=True): 111 | if nout is None: 112 | nout = nin 113 | if nout == nin and ortho: 114 | W = ortho_weight(nin) 115 | else: 116 | W = scale * numpy.random.randn(nin, nout) 117 | 118 | return W.astype('float32') 119 | 120 | 121 | def tanh(x): 122 | return tensor.tanh(x) 123 | 124 | def linear(x): 125 | return x 126 | 127 | def sigmoid(x): 128 | return tensor.nnet.sigmoid(x) 129 | 130 | def relu(x): 131 | return tensor.nnet.relu(x) 132 | 133 | def softmax(x): 134 | return tensor.nnet.softmax(x.reshape((-1, x.shape[-1]))).reshape(x.shape) 135 | 136 | def concatenate(tensor_list, axis=0): 137 | """ 138 | Alternative implementation of `theano.tensor.concatenate`. 139 | This function does exactly the same thing, but contrary to Theano's own 140 | implementation, the gradient is implemented on the GPU. 141 | Backpropagating through `theano.tensor.concatenate` yields slowdowns 142 | because the inverse operation (splitting) needs to be done on the CPU. 143 | This implementation does not have that problem. 
144 | :usage: 145 | >>> x, y = theano.tensor.matrices('x', 'y') 146 | >>> c = concatenate([x, y], axis=1) 147 | :parameters: 148 | - tensor_list : list 149 | list of Theano tensor expressions that should be concatenated. 150 | - axis : int 151 | the tensors will be joined along this axis. 152 | :returns: 153 | - out : tensor 154 | the concatenated tensor expression. 155 | """ 156 | concat_size = sum(tt.shape[axis] for tt in tensor_list) 157 | 158 | output_shape = () 159 | for k in range(axis): 160 | output_shape += (tensor_list[0].shape[k],) 161 | output_shape += (concat_size,) 162 | for k in range(axis + 1, tensor_list[0].ndim): 163 | output_shape += (tensor_list[0].shape[k],) 164 | 165 | out = tensor.zeros(output_shape) 166 | offset = 0 167 | for tt in tensor_list: 168 | indices = () 169 | for k in range(axis): 170 | indices += (slice(None),) 171 | indices += (slice(offset, offset + tt.shape[axis]),) 172 | for k in range(axis + 1, tensor_list[0].ndim): 173 | indices += (slice(None),) 174 | 175 | out = tensor.set_subtensor(out[indices], tt) 176 | offset += tt.shape[axis] 177 | 178 | return out 179 | 180 | #-------------------------------------------------------------------------# 181 | # Dropout: 182 | 183 | def dropout_layer(state_before, use_noise, trng): 184 | proj = tensor.switch( 185 | use_noise, 186 | state_before * trng.binomial(state_before.shape, p=0.5, n=1, 187 | dtype=state_before.dtype), 188 | state_before * 0.5) 189 | return proj 190 | 191 | 192 | # -------------------------------------------------------------------------# 193 | # Feedforward: 194 | # affine transformation + point-wise nonlinearity 195 | def param_init_fflayer(options, params, prefix='ff', nin=None, nout=None, 196 | ortho=True, negative=0, scale=0.01): 197 | if nin is None: 198 | nin = options['dim_proj'] 199 | if nout is None: 200 | nout = options['dim_proj'] 201 | params[_p(prefix, 'W')] = norm_weight(nin, nout, scale=scale, ortho=ortho) 202 | if negative == 0: 203 | params[_p(prefix, 'b')] = numpy.zeros((nout,)).astype('float32') 204 | else: 205 | params[_p(prefix, 'b')] = numpy.ones((nout,)).astype('float32') * negative 206 | 207 | return params 208 | 209 | 210 | def fflayer(tparams, state_below, options, prefix='rconv', 211 | activ='lambda x: tensor.tanh(x)', **kwargs): 212 | return eval(activ)( 213 | tensor.dot(state_below, tparams[_p(prefix, 'W')]) + 214 | tparams[_p(prefix, 'b')]) 215 | 216 | 217 | # -------------------------------------------------------------------------# 218 | # Gated Recurrent Unit: 219 | # 220 | def param_init_gru(options, params, prefix='gru', nin=None, dim=None, scale=0.01): 221 | if nin is None: 222 | nin = options['dim_proj'] 223 | if dim is None: 224 | dim = options['dim_proj'] 225 | 226 | # embedding to gates transformation weights, biases 227 | W = numpy.concatenate([norm_weight(nin, dim, scale=scale), 228 | norm_weight(nin, dim, scale=scale)], axis=1) 229 | params[_p(prefix, 'W')] = W 230 | params[_p(prefix, 'b')] = numpy.zeros((2 * dim,)).astype('float32') 231 | 232 | # recurrent transformation weights for gates 233 | U = numpy.concatenate([ortho_weight(dim), 234 | ortho_weight(dim)], axis=1) 235 | params[_p(prefix, 'U')] = U 236 | 237 | # embedding to hidden state proposal weights, biases 238 | Wx = norm_weight(nin, dim, scale=scale) 239 | params[_p(prefix, 'Wx')] = Wx 240 | params[_p(prefix, 'bx')] = numpy.zeros((dim,)).astype('float32') 241 | 242 | # recurrent transformation weights for hidden state proposal 243 | Ux = ortho_weight(dim) 244 | params[_p(prefix, 'Ux')] 
= Ux 245 | 246 | return params 247 | 248 | 249 | def gru_layer(tparams, state_below, options, prefix='gru', mask=None, 250 | one_step=False, _init_state=None, **kwargs): 251 | if one_step: 252 | assert _init_state, 'previous state must be provided' 253 | 254 | nsteps = state_below.shape[0] 255 | if state_below.ndim == 3: 256 | n_samples = state_below.shape[1] 257 | else: 258 | n_samples = 1 259 | 260 | dim = tparams[_p(prefix, 'Ux')].shape[1] 261 | 262 | if mask is None: 263 | mask = tensor.alloc(1., state_below.shape[0], 1) 264 | 265 | # utility function to slice a tensor 266 | def _slice(_x, n, dim): 267 | if _x.ndim == 3: 268 | return _x[:, :, n*dim:(n+1)*dim] 269 | return _x[:, n*dim:(n+1)*dim] 270 | 271 | # state_below is the input word embeddings 272 | # input to the gates, concatenated 273 | state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + \ 274 | tparams[_p(prefix, 'b')] 275 | # input to compute the hidden state proposal 276 | state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + \ 277 | tparams[_p(prefix, 'bx')] 278 | 279 | # step function to be used by scan 280 | # arguments | sequences |outputs-info| non-seqs 281 | def _step_slice(m_, x_, xx_, h_, U, Ux): 282 | preact = tensor.dot(h_, U) 283 | preact += x_ 284 | 285 | # reset and update gates 286 | r = tensor.nnet.sigmoid(_slice(preact, 0, dim)) 287 | u = tensor.nnet.sigmoid(_slice(preact, 1, dim)) 288 | 289 | # compute the hidden state proposal 290 | preactx = tensor.dot(h_, Ux) 291 | preactx = preactx * r 292 | preactx = preactx + xx_ 293 | 294 | # hidden state proposal 295 | h = tensor.tanh(preactx) 296 | 297 | # leaky integrate and obtain next hidden state 298 | h = u * h_ + (1. - u) * h 299 | h = m_[:, None] * h + (1. - m_)[:, None] * h_ 300 | 301 | return h 302 | 303 | # prepare scan arguments 304 | seqs = [mask, state_below_, state_belowx] 305 | init_states = [tensor.alloc(0., n_samples, dim)] 306 | _step = _step_slice 307 | shared_vars = [tparams[_p(prefix, 'U')], 308 | tparams[_p(prefix, 'Ux')]] 309 | 310 | if one_step: 311 | rval = _step(*(seqs + [_init_state] + shared_vars)) 312 | else: 313 | rval, updates = theano.scan(_step, 314 | sequences=seqs, 315 | outputs_info=init_states, 316 | non_sequences=shared_vars, 317 | name=_p(prefix, '_layers'), 318 | n_steps=nsteps, 319 | profile=profile, 320 | strict=True) 321 | rval = [rval] 322 | return rval 323 | 324 | # -------------------------------------------------------------------------# 325 | # Conditional Gated Recurrent Unit with Attention (GRU_cond) 326 | # 327 | def param_init_gru_cond(options, params, prefix='gru_cond', 328 | nin=None, dim=None, dimctx=None, 329 | nin_nonlin=None, dim_nonlin=None, scale=0.01): 330 | if nin is None: 331 | nin = options['dim'] 332 | if dim is None: 333 | dim = options['dim'] 334 | if dimctx is None: 335 | dimctx = options['dim'] 336 | if nin_nonlin is None: 337 | nin_nonlin = nin 338 | if dim_nonlin is None: 339 | dim_nonlin = dim 340 | 341 | W = numpy.concatenate([norm_weight(nin, dim, scale=scale), 342 | norm_weight(nin, dim, scale=scale)], axis=1) 343 | params[_p(prefix, 'W')] = W 344 | params[_p(prefix, 'b')] = numpy.zeros((2 * dim,)).astype('float32') 345 | U = numpy.concatenate([ortho_weight(dim_nonlin), 346 | ortho_weight(dim_nonlin)], axis=1) 347 | params[_p(prefix, 'U')] = U 348 | 349 | Wx = norm_weight(nin_nonlin, dim_nonlin, scale=scale) 350 | params[_p(prefix, 'Wx')] = Wx 351 | Ux = ortho_weight(dim_nonlin) 352 | params[_p(prefix, 'Ux')] = Ux 353 | params[_p(prefix, 'bx')] = 
numpy.zeros((dim_nonlin,)).astype('float32') 354 | 355 | U_nl = numpy.concatenate([ortho_weight(dim_nonlin), 356 | ortho_weight(dim_nonlin)], axis=1) 357 | params[_p(prefix, 'U_nl')] = U_nl 358 | params[_p(prefix, 'b_nl')] = numpy.zeros((2 * dim_nonlin,)).astype('float32') 359 | 360 | Ux_nl = ortho_weight(dim_nonlin) 361 | params[_p(prefix, 'Ux_nl')] = Ux_nl 362 | params[_p(prefix, 'bx_nl')] = numpy.zeros((dim_nonlin,)).astype('float32') 363 | 364 | # context to LSTM 365 | Wc = norm_weight(dimctx, dim*2, scale=scale) 366 | params[_p(prefix, 'Wc')] = Wc 367 | 368 | Wcx = norm_weight(dimctx, dim, scale=scale) 369 | params[_p(prefix, 'Wcx')] = Wcx 370 | 371 | # attention: combined -> hidden 372 | W_comb_att = norm_weight(dim, dimctx, scale=scale) 373 | params[_p(prefix, 'W_comb_att')] = W_comb_att 374 | 375 | # attention: context -> hidden 376 | Wc_att = norm_weight(dimctx, scale=scale) 377 | params[_p(prefix, 'Wc_att')] = Wc_att 378 | 379 | # attention: hidden bias 380 | b_att = numpy.zeros((dimctx,)).astype('float32') 381 | params[_p(prefix, 'b_att')] = b_att 382 | 383 | # attention: 384 | U_att = norm_weight(dimctx, 1, scale=scale) 385 | params[_p(prefix, 'U_att')] = U_att 386 | c_att = numpy.zeros((1,)).astype('float32') 387 | params[_p(prefix, 'c_tt')] = c_att 388 | 389 | return params 390 | 391 | 392 | def gru_cond_layer(tparams, state_below, options, prefix='gru', 393 | mask=None, context=None, one_step=False, 394 | init_memory=None, init_state=None, 395 | context_mask=None, 396 | **kwargs): 397 | 398 | assert context, 'Context must be provided' 399 | 400 | if one_step: 401 | assert init_state, 'previous state must be provided' 402 | 403 | nsteps = state_below.shape[0] 404 | if state_below.ndim == 3: 405 | n_samples = state_below.shape[1] 406 | else: 407 | n_samples = 1 408 | 409 | # mask 410 | if mask is None: 411 | mask = tensor.alloc(1., state_below.shape[0], 1) 412 | 413 | dim = tparams[_p(prefix, 'Wcx')].shape[1] 414 | 415 | # initial/previous state 416 | if init_state is None: 417 | init_state = tensor.alloc(0., n_samples, dim) 418 | 419 | # projected context 420 | assert context.ndim == 3, \ 421 | 'Context must be 3-d: #annotation x #sample x dim' 422 | pctx_ = tensor.dot(context, tparams[_p(prefix, 'Wc_att')]) +\ 423 | tparams[_p(prefix, 'b_att')] 424 | 425 | def _slice(_x, n, dim): 426 | if _x.ndim == 3: 427 | return _x[:, :, n*dim:(n+1)*dim] 428 | return _x[:, n*dim:(n+1)*dim] 429 | 430 | # projected x 431 | state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) +\ 432 | tparams[_p(prefix, 'bx')] 433 | state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) +\ 434 | tparams[_p(prefix, 'b')] 435 | 436 | def _step_slice(m_, x_, xx_, h_, ctx_, alpha_, pctx_, cc_, 437 | U, Wc, W_comb_att, U_att, c_tt, Ux, Wcx, 438 | U_nl, Ux_nl, b_nl, bx_nl): 439 | preact1 = tensor.dot(h_, U) 440 | preact1 += x_ 441 | preact1 = tensor.nnet.sigmoid(preact1) 442 | 443 | r1 = _slice(preact1, 0, dim) 444 | u1 = _slice(preact1, 1, dim) 445 | 446 | preactx1 = tensor.dot(h_, Ux) 447 | preactx1 *= r1 448 | preactx1 += xx_ 449 | 450 | h1 = tensor.tanh(preactx1) 451 | 452 | h1 = u1 * h_ + (1. - u1) * h1 453 | h1 = m_[:, None] * h1 + (1. 
- m_)[:, None] * h_ 454 | 455 | # attention 456 | pstate_ = tensor.dot(h1, W_comb_att) 457 | pctx__ = pctx_ + pstate_[None, :, :] 458 | #pctx__ += xc_ 459 | pctx__ = tensor.tanh(pctx__) 460 | alpha = tensor.dot(pctx__, U_att)+c_tt 461 | alpha = alpha.reshape([alpha.shape[0], alpha.shape[1]]) 462 | alpha = tensor.exp(alpha) 463 | 464 | if context_mask: 465 | alpha = alpha * context_mask 466 | alpha = alpha / (alpha.sum(0, keepdims=True) + TINY) 467 | 468 | ctx_ = (cc_ * alpha[:, :, None]).sum(0) # current context 469 | 470 | preact2 = tensor.dot(h1, U_nl)+b_nl 471 | preact2 += tensor.dot(ctx_, Wc) 472 | preact2 = tensor.nnet.sigmoid(preact2) 473 | 474 | r2 = _slice(preact2, 0, dim) 475 | u2 = _slice(preact2, 1, dim) 476 | 477 | preactx2 = tensor.dot(h1, Ux_nl)+bx_nl 478 | preactx2 *= r2 479 | preactx2 += tensor.dot(ctx_, Wcx) 480 | 481 | h2 = tensor.tanh(preactx2) 482 | 483 | h2 = u2 * h1 + (1. - u2) * h2 484 | h2 = m_[:, None] * h2 + (1. - m_)[:, None] * h1 485 | 486 | return h2, ctx_, alpha.T # pstate_, preact, preactx, r, u 487 | 488 | seqs = [mask, state_below_, state_belowx] 489 | #seqs = [mask, state_below_, state_belowx, state_belowc] 490 | _step = _step_slice 491 | 492 | shared_vars = [tparams[_p(prefix, 'U')], 493 | tparams[_p(prefix, 'Wc')], 494 | tparams[_p(prefix, 'W_comb_att')], 495 | tparams[_p(prefix, 'U_att')], 496 | tparams[_p(prefix, 'c_tt')], 497 | tparams[_p(prefix, 'Ux')], 498 | tparams[_p(prefix, 'Wcx')], 499 | tparams[_p(prefix, 'U_nl')], 500 | tparams[_p(prefix, 'Ux_nl')], 501 | tparams[_p(prefix, 'b_nl')], 502 | tparams[_p(prefix, 'bx_nl')]] 503 | 504 | if one_step: 505 | rval = _step(*(seqs + [init_state, None, None, pctx_, context] + 506 | shared_vars)) 507 | else: 508 | rval, updates = theano.scan(_step, 509 | sequences=seqs, 510 | outputs_info=[init_state, 511 | tensor.alloc(0., n_samples, 512 | context.shape[2]), 513 | tensor.alloc(0., n_samples, 514 | context.shape[0])], 515 | non_sequences=[pctx_, context]+shared_vars, 516 | name=_p(prefix, '_layers'), 517 | n_steps=nsteps, 518 | profile=profile, 519 | strict=True) 520 | return rval 521 | 522 | # ================================================================================== # 523 | # Conditional GRU: depart the network 524 | 525 | def gru_cond_context(tparams, state_below, options, prefix='gru', 526 | mask=None, context=None, 527 | init_memory=None, init_state=None, 528 | context_mask=None, 529 | **kwargs): 530 | 531 | assert context, 'Context must be provided' 532 | assert init_state, 'previous state must be provided' 533 | 534 | if state_below.ndim == 3: 535 | n_samples = state_below.shape[1] 536 | else: 537 | n_samples = 1 538 | 539 | # mask 540 | if mask is None: 541 | mask = tensor.alloc(1., state_below.shape[0], 1) 542 | 543 | dim = tparams[_p(prefix, 'Wcx')].shape[1] 544 | 545 | # initial/previous state 546 | if init_state is None: 547 | init_state = tensor.alloc(0., n_samples, dim) 548 | 549 | # projected context 550 | assert context.ndim == 3, \ 551 | 'Context must be 3-d: #annotation x #sample x dim' 552 | pctx_ = tensor.dot(context, tparams[_p(prefix, 'Wc_att')]) +\ 553 | tparams[_p(prefix, 'b_att')] 554 | 555 | def _slice(_x, n, dim): 556 | if _x.ndim == 3: 557 | return _x[:, :, n*dim:(n+1)*dim] 558 | return _x[:, n*dim:(n+1)*dim] 559 | 560 | # projected x 561 | state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) +\ 562 | tparams[_p(prefix, 'bx')] 563 | state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) +\ 564 | tparams[_p(prefix, 'b')] 565 | 566 | def 
_step_slice(m_, x_, xx_, h_, pctx_, cc_, 567 | U, W_comb_att, U_att, c_tt, Ux): 568 | preact1 = tensor.dot(h_, U) 569 | preact1 += x_ 570 | preact1 = tensor.nnet.sigmoid(preact1) 571 | 572 | r1 = _slice(preact1, 0, dim) 573 | u1 = _slice(preact1, 1, dim) 574 | 575 | preactx1 = tensor.dot(h_, Ux) 576 | preactx1 *= r1 577 | preactx1 += xx_ 578 | 579 | h1 = tensor.tanh(preactx1) 580 | 581 | h1 = u1 * h_ + (1. - u1) * h1 582 | h1 = m_[:, None] * h1 + (1. - m_)[:, None] * h_ 583 | 584 | # attention 585 | pstate_ = tensor.dot(h1, W_comb_att) 586 | 587 | pctx__ = pctx_ + pstate_[None, :, :] 588 | pctx__ = tensor.tanh(pctx__) 589 | 590 | alpha = tensor.dot(pctx__, U_att)+c_tt 591 | alpha = alpha.reshape([alpha.shape[0], alpha.shape[1]]) 592 | alpha = tensor.exp(alpha) 593 | 594 | if context_mask: 595 | alpha = alpha * context_mask 596 | alpha = alpha / (alpha.sum(0, keepdims=True) + TINY) 597 | 598 | ctx_ = (cc_ * alpha[:, :, None]).sum(0) # current context 599 | return h1, ctx_, alpha.T # pstate_, preact, preactx, r, u 600 | 601 | seqs = [mask, state_below_, state_belowx] 602 | _step = _step_slice 603 | 604 | shared_vars = [tparams[_p(prefix, 'U')], 605 | tparams[_p(prefix, 'W_comb_att')], 606 | tparams[_p(prefix, 'U_att')], 607 | tparams[_p(prefix, 'c_tt')], 608 | tparams[_p(prefix, 'Ux')]] 609 | 610 | rval = _step(*(seqs + [init_state, pctx_, context] + shared_vars)) 611 | return rval 612 | 613 | 614 | def gru_cond_update(tparams, options, prefix='gru', 615 | mask=None, cxt=None, h1=None, 616 | **kwargs): 617 | 618 | assert cxt, 'Context vector must be provided' 619 | assert h1, 'Temperal state vector must be provided' 620 | 621 | # mask 622 | if mask is None: 623 | mask = tensor.alloc(1., h1.shape[0], 1) 624 | 625 | dim = tparams[_p(prefix, 'Wcx')].shape[1] 626 | 627 | 628 | def _slice(_x, n, dim): 629 | if _x.ndim == 3: 630 | return _x[:, :, n*dim:(n+1)*dim] 631 | return _x[:, n*dim:(n+1)*dim] 632 | 633 | 634 | def _step_slice(m_, ctx_, h1, 635 | Wc, Wcx, 636 | U_nl, Ux_nl, 637 | b_nl, bx_nl): 638 | 639 | preact2 = tensor.dot(h1, U_nl)+b_nl 640 | preact2 += tensor.dot(ctx_, Wc) 641 | preact2 = tensor.nnet.sigmoid(preact2) 642 | 643 | r2 = _slice(preact2, 0, dim) 644 | u2 = _slice(preact2, 1, dim) 645 | 646 | preactx2 = tensor.dot(h1, Ux_nl)+bx_nl 647 | preactx2 *= r2 648 | preactx2 += tensor.dot(ctx_, Wcx) 649 | 650 | h2 = tensor.tanh(preactx2) 651 | h2 = u2 * h1 + (1. - u2) * h2 652 | h2 = m_[:, None] * h2 + (1. 
- m_)[:, None] * h1 653 | 654 | return h2 655 | 656 | seqs = [mask, cxt, h1] 657 | _step = _step_slice 658 | 659 | shared_vars = [tparams[_p(prefix, 'Wc')], 660 | tparams[_p(prefix, 'Wcx')], 661 | tparams[_p(prefix, 'U_nl')], 662 | tparams[_p(prefix, 'Ux_nl')], 663 | tparams[_p(prefix, 'b_nl')], 664 | tparams[_p(prefix, 'bx_nl')]] 665 | 666 | rval = _step(*(seqs + shared_vars)) 667 | return rval 668 | 669 | # ================================================================================== # 670 | 671 | 672 | 673 | 674 | # LN-GRU layer 675 | def param_init_lngru(options, params, prefix='lngru', nin=None, dim=None, scale=0.01): 676 | """ 677 | Gated Recurrent Unit (GRU) with LN 678 | """ 679 | if nin == None: 680 | nin = options['dim_proj'] 681 | if dim == None: 682 | dim = options['dim_proj'] 683 | W = numpy.concatenate([norm_weight(nin,dim, scale=scale), 684 | norm_weight(nin,dim, scale=scale)], axis=1) 685 | params[_p(prefix,'W')] = W 686 | params[_p(prefix,'b')] = numpy.zeros((2 * dim,)).astype('float32') 687 | U = numpy.concatenate([ortho_weight(dim), 688 | ortho_weight(dim)], axis=1) 689 | params[_p(prefix,'U')] = U 690 | 691 | Wx = norm_weight(nin, dim, scale=scale) 692 | params[_p(prefix,'Wx')] = Wx 693 | Ux = ortho_weight(dim) 694 | params[_p(prefix,'Ux')] = Ux 695 | params[_p(prefix,'bx')] = numpy.zeros((dim,)).astype('float32') 696 | 697 | # LN parameters 698 | scale_add = 0.0 699 | scale_mul = 1.0 700 | params[_p(prefix,'b1')] = scale_add * numpy.ones((2*dim)).astype('float32') 701 | params[_p(prefix,'b2')] = scale_add * numpy.ones((1*dim)).astype('float32') 702 | params[_p(prefix,'b3')] = scale_add * numpy.ones((2*dim)).astype('float32') 703 | params[_p(prefix,'b4')] = scale_add * numpy.ones((1*dim)).astype('float32') 704 | params[_p(prefix,'s1')] = scale_mul * numpy.ones((2*dim)).astype('float32') 705 | params[_p(prefix,'s2')] = scale_mul * numpy.ones((1*dim)).astype('float32') 706 | params[_p(prefix,'s3')] = scale_mul * numpy.ones((2*dim)).astype('float32') 707 | params[_p(prefix,'s4')] = scale_mul * numpy.ones((1*dim)).astype('float32') 708 | 709 | return params 710 | 711 | def lngru_layer(tparams, state_below, options, prefix='lngru', mask=None, one_step=False, _init_state=None, **kwargs): 712 | """ 713 | Feedforward pass through GRU with LN 714 | """ 715 | nsteps = state_below.shape[0] 716 | if state_below.ndim == 3: 717 | n_samples = state_below.shape[1] 718 | else: 719 | n_samples = 1 720 | 721 | dim = tparams[_p(prefix,'Ux')].shape[1] 722 | 723 | if _init_state == None: 724 | _init_state = tensor.alloc(0., n_samples, dim) 725 | 726 | if mask == None: 727 | mask = tensor.alloc(1., state_below.shape[0], 1) 728 | 729 | def _slice(_x, n, dim): 730 | if _x.ndim == 3: 731 | return _x[:, :, n*dim:(n+1)*dim] 732 | return _x[:, n*dim:(n+1)*dim] 733 | 734 | state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')] 735 | state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + tparams[_p(prefix, 'bx')] 736 | U = tparams[_p(prefix, 'U')] 737 | Ux = tparams[_p(prefix, 'Ux')] 738 | 739 | def _step_slice(m_, x_, xx_, h_, U, Ux, b1, b2, b3, b4, s1, s2, s3, s4): 740 | 741 | x_ = ln(x_, b1, s1) 742 | xx_ = ln(xx_, b2, s2) 743 | 744 | preact = tensor.dot(h_, U) 745 | preact = ln(preact, b3, s3) 746 | preact += x_ 747 | 748 | r = tensor.nnet.sigmoid(_slice(preact, 0, dim)) 749 | u = tensor.nnet.sigmoid(_slice(preact, 1, dim)) 750 | 751 | preactx = tensor.dot(h_, Ux) 752 | preactx = ln(preactx, b4, s4) 753 | preactx = preactx * r 754 | preactx = 
preactx + xx_ 755 | 756 | h = tensor.tanh(preactx) 757 | 758 | h = u * h_ + (1. - u) * h 759 | h = m_[:,None] * h + (1. - m_)[:,None] * h_ 760 | 761 | return h 762 | 763 | seqs = [mask, state_below_, state_belowx] 764 | _step = _step_slice 765 | 766 | non_seqs = [tparams[_p(prefix, 'U')], tparams[_p(prefix, 'Ux')]] 767 | non_seqs += [tparams[_p(prefix, 'b1')], tparams[_p(prefix, 'b2')], tparams[_p(prefix, 'b3')], tparams[_p(prefix, 'b4')]] 768 | non_seqs += [tparams[_p(prefix, 's1')], tparams[_p(prefix, 's2')], tparams[_p(prefix, 's3')], tparams[_p(prefix, 's4')]] 769 | 770 | if one_step: 771 | rval = _step(*(seqs+[_init_state]+non_seqs)) 772 | else: 773 | rval, updates = theano.scan(_step, 774 | sequences=seqs, 775 | outputs_info = [_init_state], 776 | non_sequences = non_seqs, 777 | name=_p(prefix, '_layers'), 778 | n_steps=nsteps, 779 | profile=False, 780 | strict=True) 781 | rval = [rval] 782 | return rval 783 | 784 | 785 | -------------------------------------------------------------------------------- /mteval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ref=" /misc/kcgscratch1/ChoGroup/junyoung_exp/wmt15/ruen/dev/newstest2013-ref.ru.tok" 4 | sed -i 's/@@ //g' $1 5 | ./data/multi-bleu.perl ref < $1 6 | -------------------------------------------------------------------------------- /nmt_uni.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Build a neural machine translation model with soft attention 3 | ''' 4 | import theano 5 | import theano.tensor as tensor 6 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 7 | 8 | import cPickle as pkl 9 | #import ipdb 10 | import numpy 11 | import copy 12 | 13 | import os 14 | 15 | import sys 16 | import time 17 | 18 | from collections import OrderedDict 19 | from data_iterator import TextIterator 20 | from layers import * 21 | from optimizer import * 22 | 23 | profile = False 24 | TINY = 1e-7 25 | 26 | # -----------------------------------------------------------------------------# 27 | # Build the Attention-based Neural Machine Translation 28 | 29 | # initialize all parameters 30 | def init_params(options): 31 | params = OrderedDict() 32 | 33 | # embedding 34 | params['Wemb'] = norm_weight(options['n_words_src'], options['dim_word']) 35 | params['Wemb_dec'] = norm_weight(options['n_words'], options['dim_word']) 36 | 37 | # encoder: uni-directional RNN 38 | params = get_layer(options['encoder'])[0](options, params, 39 | prefix='encoder', 40 | nin=options['dim_word'], 41 | dim=options['dim']) 42 | 43 | if options.get('birnn', False): 44 | params = get_layer(options['encoder'])[0](options, params, 45 | prefix='encoder_r', 46 | nin=options['dim_word'], 47 | dim=options['dim']) 48 | 49 | 50 | ctxdim = options['dim'] if not options.get('birnn', False) else 2 * options['dim'] 51 | 52 | # init_state, init_cell 53 | params = get_layer('ff')[0](options, params, prefix='ff_state', 54 | nin=ctxdim, nout=options['dim']) 55 | # decoder 56 | params = get_layer(options['decoder'])[0](options, params, 57 | prefix='decoder', 58 | nin=options['dim_word'], 59 | dim=options['dim'], 60 | dimctx=ctxdim) 61 | # readout 62 | params = get_layer('ff')[0](options, params, prefix='ff_logit_lstm', 63 | nin=options['dim'], nout=options['dim_word'], 64 | ortho=False) 65 | params = get_layer('ff')[0](options, params, prefix='ff_logit_prev', 66 | nin=options['dim_word'], 67 | nout=options['dim_word'], ortho=False) 68 | params = 
get_layer('ff')[0](options, params, prefix='ff_logit_ctx', 69 | nin=ctxdim, nout=options['dim_word'], 70 | ortho=False) 71 | params = get_layer('ff')[0](options, params, prefix='ff_logit', 72 | nin=options['dim_word'], 73 | nout=options['n_words']) 74 | return params 75 | 76 | 77 | def build_model(tparams, options): 78 | opt_ret = dict() 79 | 80 | trng = RandomStreams(1234) 81 | use_noise = theano.shared(numpy.float32(0.)) 82 | 83 | # description string: #words x #samples 84 | x = tensor.matrix('x', dtype='int64') 85 | x_mask = tensor.matrix('x_mask', dtype='float32') 86 | y = tensor.matrix('y', dtype='int64') 87 | y_mask = tensor.matrix('y_mask', dtype='float32') 88 | 89 | # time_steps 90 | n_timesteps = x_mask.shape[0] 91 | n_timesteps_trg = y_mask.shape[0] 92 | n_samples = x_mask.shape[1] 93 | 94 | # word embedding for forward rnn (source) 95 | emb = tparams['Wemb'][x.flatten()] 96 | emb = emb.reshape([n_timesteps, n_samples, options['dim_word']]) 97 | proj = get_layer(options['encoder'])[1](tparams, emb, options, 98 | prefix='encoder', 99 | mask=x_mask) 100 | 101 | # for reverse RNN: bi-directional RNN encoder 102 | if options.get('birnn', False): 103 | xr = x[::-1] 104 | xr_mask = x_mask[::-1] 105 | 106 | embr = tparams['Wemb'][xr.flatten()] 107 | embr = embr.reshape([n_timesteps, n_samples, options['dim_word']]) 108 | projr = get_layer(options['encoder'])[1](tparams, embr, options, 109 | prefix='encoder_r', 110 | mask=xr_mask) 111 | ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim-1) 112 | 113 | else: 114 | ctx = proj[0] # context vectors 115 | 116 | # mean of the context (across time) will be used to initialize decoder rnn 117 | ctx_mean = (ctx * x_mask[:, :, None]).sum(0) / x_mask.sum(0)[:, None] 118 | 119 | 120 | # initial decoder state 121 | init_state = get_layer('ff')[1](tparams, ctx_mean, options, 122 | prefix='ff_state', activ='tanh') 123 | 124 | # word embedding (target), we will shift the target sequence one time step 125 | # to the right. This is done because of the bi-gram connections in the 126 | # readout and decoder rnn. The first target will be all zeros and we will 127 | # not condition on the last output. 
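    # (a small NumPy sketch of the same shift, assuming emb has shape
    #  (n_timesteps_trg, n_samples, dim_word):
    #      emb_shifted = numpy.zeros_like(emb)
    #      emb_shifted[1:] = emb[:-1]     # step t reads the embedding of word t-1,
    #                                     # step 0 reads the all-zero <bos> vector
    #  the symbolic set_subtensor below does exactly this)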
128 | emb = tparams['Wemb_dec'][y.flatten()] 129 | emb = emb.reshape([n_timesteps_trg, n_samples, options['dim_word']]) 130 | emb_shifted = tensor.zeros_like(emb) 131 | emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1]) 132 | emb = emb_shifted 133 | 134 | # decoder - pass through the decoder conditional gru with attention 135 | proj = get_layer(options['decoder'])[1](tparams, emb, options, 136 | prefix='decoder', 137 | mask=y_mask, context=ctx, 138 | context_mask=x_mask, 139 | one_step=False, 140 | init_state=init_state) 141 | # hidden states of the decoder gru 142 | proj_h = proj[0] 143 | 144 | # weighted averages of context, generated by attention module 145 | ctxs = proj[1] 146 | 147 | # weights (alignment matrix) 148 | opt_ret['dec_alphas'] = proj[2] # --> to show the attenion weights 149 | 150 | # compute word probabilities 151 | logit_lstm = get_layer('ff')[1](tparams, proj_h, options, 152 | prefix='ff_logit_lstm', activ='linear') 153 | logit_prev = get_layer('ff')[1](tparams, emb, options, 154 | prefix='ff_logit_prev', activ='linear') 155 | logit_ctx = get_layer('ff')[1](tparams, ctxs, options, 156 | prefix='ff_logit_ctx', activ='linear') 157 | logit = tensor.tanh(logit_lstm+logit_prev+logit_ctx) 158 | 159 | # dropout (noise) 160 | if options['use_dropout']: 161 | logit = dropout_layer(logit, use_noise, trng) 162 | logit = get_layer('ff')[1](tparams, logit, options, 163 | prefix='ff_logit', activ='linear') 164 | logit_shp = logit.shape 165 | probs = tensor.nnet.softmax(logit.reshape([logit_shp[0]*logit_shp[1], logit_shp[2]])) 166 | 167 | # compute the cost (negative loglikelihood) 168 | y_flat = y.flatten() 169 | y_flat_idx = tensor.arange(y_flat.shape[0]) * options['n_words'] + y_flat 170 | 171 | cost = -tensor.log(probs.flatten()[y_flat_idx]) 172 | cost = cost.reshape([y.shape[0], y.shape[1]]) 173 | cost = (cost * y_mask).sum(0) 174 | 175 | # we will build an additional function for computing costs 176 | f_cost = theano.function([ctx, x_mask, y, y_mask], cost) 177 | return trng, use_noise, x, x_mask, y, y_mask, opt_ret, cost, f_cost 178 | 179 | 180 | # build a fine-tuner 181 | def build_fine(tparams, options, fullmodel=True): 182 | 183 | # ------------------- ENCODER ------------------------------------------ # 184 | 185 | opt_ret = dict() 186 | 187 | trng = RandomStreams(1234) 188 | use_noise = theano.shared(numpy.float32(0.)) 189 | 190 | # description string: #words x #samples 191 | x = tensor.matrix('x', dtype='int64') 192 | x_mask = tensor.matrix('x_mask', dtype='float32') 193 | y = tensor.matrix('y', dtype='int64') 194 | y_mask = tensor.matrix('y_mask', dtype='float32') 195 | 196 | # time_steps 197 | n_timesteps = x_mask.shape[0] 198 | n_timesteps_trg = y_mask.shape[0] 199 | n_samples = x_mask.shape[1] 200 | 201 | # word embedding for forward rnn (source) 202 | emb = tparams['Wemb'][x.flatten()] 203 | emb = emb.reshape([n_timesteps, n_samples, options['dim_word']]) 204 | proj = get_layer(options['encoder'])[1](tparams, emb, options, 205 | prefix='encoder', 206 | mask=x_mask) 207 | 208 | # for reverse RNN: bi-directional RNN encoder 209 | if options.get('birnn', False): 210 | xr = x[::-1] 211 | xr_mask = x_mask[::-1] 212 | 213 | embr = tparams['Wemb'][xr.flatten()] 214 | embr = embr.reshape([n_timesteps, n_samples, options['dim_word']]) 215 | projr = get_layer(options['encoder'])[1](tparams, embr, options, 216 | prefix='encoder_r', 217 | mask=xr_mask) 218 | ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim-1) 219 | 220 | else: 221 | ctx = proj[0] # 
context vectors 222 | 223 | # mean of the context (across time) will be used to initialize decoder rnn 224 | ctx_mean = (ctx * x_mask[:, :, None]).sum(0) / x_mask.sum(0)[:, None] 225 | 226 | # or you can use the last state of forward + backward encoder rnns 227 | # ctx_mean = concatenate([proj[0][-1], projr[0][-1]], axis=proj[0].ndim-2) 228 | 229 | # initial decoder state 230 | init_state = get_layer('ff')[1](tparams, ctx_mean, options, 231 | prefix='ff_state', activ='tanh') 232 | 233 | print 'compile the initializer' 234 | f_init = theano.function([x, x_mask], [ctx, init_state]) 235 | print 'encoder done.' 236 | # ------------------- ENCODER ------------------------------------------ # 237 | 238 | 239 | c_mask = tensor.tensor3('c_mask', dtype='float32') # seq_t x seq_s x batches 240 | 241 | emb = tparams['Wemb_dec'][y.flatten()] 242 | emb = emb.reshape([n_timesteps_trg, n_samples, options['dim_word']]) 243 | emb_shifted = tensor.zeros_like(emb) 244 | emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1]) 245 | emb = emb_shifted 246 | 247 | # decoder - pass through the decoder conditional gru with attention 248 | def _step(_emb, _y_mask, _c_mask, _init_state, _ctx): 249 | return get_layer(options['decoder'])[1](tparams, _emb, options, 250 | prefix='decoder', 251 | mask=_y_mask, context=_ctx, 252 | context_mask=_c_mask, 253 | one_step=True, 254 | init_state=_init_state) 255 | 256 | proj, _ = theano.scan(_step, 257 | sequences=[emb, y_mask, c_mask], 258 | outputs_info=[init_state, None, None], 259 | non_sequences=[ctx]) 260 | 261 | 262 | # hidden states of the decoder gru 263 | proj_h = proj[0] 264 | 265 | # weighted averages of context, generated by attention module 266 | ctxs = proj[1] 267 | 268 | # weights (alignment matrix) 269 | opt_ret['dec_alphas'] = proj[2] # --> to show the attenion weights 270 | 271 | # compute word probabilities 272 | logit_lstm = get_layer('ff')[1](tparams, proj_h, options, 273 | prefix='ff_logit_lstm', activ='linear') 274 | logit_prev = get_layer('ff')[1](tparams, emb, options, 275 | prefix='ff_logit_prev', activ='linear') 276 | logit_ctx = get_layer('ff')[1](tparams, ctxs, options, 277 | prefix='ff_logit_ctx', activ='linear') 278 | logit = tensor.tanh(logit_lstm+logit_prev+logit_ctx) 279 | 280 | # dropout (noise) 281 | if options['use_dropout']: 282 | logit = dropout_layer(logit, use_noise, trng) 283 | logit = get_layer('ff')[1](tparams, logit, options, 284 | prefix='ff_logit', activ='linear') 285 | logit_shp = logit.shape 286 | probs = tensor.nnet.softmax(logit.reshape([logit_shp[0]*logit_shp[1], logit_shp[2]])) 287 | 288 | # compute the cost (negative loglikelihood) 289 | y_flat = y.flatten() 290 | y_flat_idx = tensor.arange(y_flat.shape[0]) * options['n_words'] + y_flat 291 | 292 | cost = -tensor.log(probs.flatten()[y_flat_idx] + TINY) 293 | cost = cost.reshape([y.shape[0], y.shape[1]]) 294 | a_cost = tensor.mean((cost * y_mask).sum(0)) 295 | 296 | # gradient clipping 297 | def _clip(grad): 298 | clip_c = 1. 299 | if clip_c > 0.: 300 | g2 = 0. 
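                # g2 accumulates the squared global norm ||g||^2 over all gradient
                # tensors; whenever ||g|| > clip_c each gradient is rescaled by
                # clip_c / ||g||, keeping its direction but shrinking the global norm
                # to clip_c (e.g. clip_c = 1 and ||g|| = 5 scales every gradient by 0.2)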
301 | for g in grad: 302 | g2 += (g ** 2).sum() 303 | new_grads = [] 304 | for g in grad: 305 | new_grads.append(tensor.switch(g2 > (clip_c ** 2), g / tensor.sqrt(g2) * clip_c, g)) 306 | grad = new_grads 307 | return grad 308 | 309 | 310 | lr = tensor.scalar(name='lr') 311 | if fullmodel: 312 | print 'build MLE optimizer for the whole NMT model:' 313 | a_grad = _clip(theano.grad(a_cost, wrt=itemlist(tparams))) 314 | inps = [x, x_mask, y, y_mask, c_mask] 315 | outps = [a_cost, cost] 316 | f_cost, f_update = adam(lr, tparams, a_grad, inps, outps) 317 | else: 318 | print 'build MLE only for decoder' 319 | tparams_d = OrderedDict() 320 | for w in tparams: 321 | if ('ff_state' not in w) and ('encoder' not in w) and (w != 'Wemb'): 322 | print w, 'updated.' 323 | tparams_d[w] = tparams[w] 324 | 325 | a_grad = _clip(theano.grad(a_cost, wrt=itemlist(tparams_d))) 326 | inps = [x, x_mask, y, y_mask, c_mask] 327 | outps = [a_cost, cost] 328 | f_cost, f_update = adam(lr, tparams_d, a_grad, inps, outps) 329 | 330 | print 'done.' 331 | return f_init, f_cost, f_update 332 | 333 | 334 | # build a sampler for NMT 335 | def build_sampler(tparams, options, trng): 336 | 337 | x = tensor.matrix('x', dtype='int64') 338 | 339 | n_timesteps = x.shape[0] 340 | n_samples = x.shape[1] 341 | 342 | # word embedding (source), forward and backward 343 | emb = tparams['Wemb'][x.flatten()] 344 | emb = emb.reshape([n_timesteps, n_samples, options['dim_word']]) 345 | 346 | # encoder 347 | proj = get_layer(options['encoder'])[1](tparams, emb, options, 348 | prefix='encoder') 349 | 350 | # bi-rnn 351 | if options.get('birnn', False): 352 | xr = x[::-1] 353 | 354 | embr = tparams['Wemb'][xr.flatten()] 355 | embr = embr.reshape([n_timesteps, n_samples, options['dim_word']]) 356 | projr = get_layer(options['encoder'])[1](tparams, embr, options, 357 | prefix='encoder_r') 358 | 359 | ## concatenate forward and backward rnn hidden states 360 | ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim-1) 361 | 362 | else: 363 | ctx = proj[0] 364 | 365 | # get the input for decoder rnn initializer mlp 366 | ctx_mean = ctx.mean(0) 367 | # ctx_mean = concatenate([proj[0][-1],projr[0][-1]], axis=proj[0].ndim-2) 368 | init_state = get_layer('ff')[1](tparams, ctx_mean, options, 369 | prefix='ff_state', activ='tanh') 370 | 371 | print 'Building f_init...', 372 | outs = [init_state, ctx] 373 | f_init = theano.function([x], outs, name='f_init', profile=profile) 374 | print 'Done.' 375 | 376 | # .......................................................................... 
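    # f_init is called once per source sentence and returns [init_state, ctx];
    # f_next is then called once per target step with the previous word, the
    # context and the previous decoder state (cf. gen_sample below).  An
    # illustrative sketch for sampling one translation of a single sentence,
    # where x is an int64 array of shape (src_len, 1) and maxlen is a cut-off:
    #
    #     next_state, ctx = f_init(x)
    #     next_w = -1 * numpy.ones((1,), dtype='int64')      # -1 marks <bos>
    #     for _ in range(maxlen):
    #         probs, next_w, next_state = f_next(next_w, ctx, next_state)
    #         if next_w[0] == 0:                             # 0 is <eos>
    #             break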
377 | # x: 1 x 1 378 | y = tensor.vector('y_sampler', dtype='int64') 379 | init_state = tensor.matrix('init_state', dtype='float32') 380 | use_noise = theano.shared(numpy.float32(0.)) 381 | 382 | 383 | # if it's the first word, emb should be all zero and it is indicated by -1 384 | emb = tensor.switch(y[:, None] < 0, 385 | tensor.alloc(0., 1, tparams['Wemb_dec'].shape[1]), 386 | tparams['Wemb_dec'][y]) 387 | 388 | # apply one step of conditional gru with attention 389 | proj = get_layer(options['decoder'])[1](tparams, emb, options, 390 | prefix='decoder', 391 | mask=None, context=ctx, 392 | one_step=True, 393 | init_state=init_state) 394 | # get the next hidden state 395 | next_state = proj[0] 396 | 397 | # get the weighted averages of context for this target word y 398 | ctxs = proj[1] 399 | 400 | logit_lstm = get_layer('ff')[1](tparams, next_state, options, 401 | prefix='ff_logit_lstm', activ='linear') 402 | logit_prev = get_layer('ff')[1](tparams, emb, options, 403 | prefix='ff_logit_prev', activ='linear') 404 | logit_ctx = get_layer('ff')[1](tparams, ctxs, options, 405 | prefix='ff_logit_ctx', activ='linear') 406 | logit = tensor.tanh(logit_lstm+logit_prev+logit_ctx) 407 | 408 | if options['use_dropout']: 409 | logit = dropout_layer(logit, use_noise, trng) 410 | logit = get_layer('ff')[1](tparams, logit, options, 411 | prefix='ff_logit', activ='linear') 412 | 413 | # compute the softmax probability 414 | next_probs = tensor.nnet.softmax(logit) 415 | 416 | # sample from softmax distribution to get the sample 417 | next_sample = trng.multinomial(pvals=next_probs).argmax(1) 418 | 419 | # compile a function to do the whole thing above, next word probability, 420 | # sampled word for the next target, next hidden state to be used 421 | print 'Building f_next..', 422 | inps = [y, ctx, init_state] 423 | outs = [next_probs, next_sample, next_state] 424 | f_next = theano.function(inps, outs, name='f_next', profile=profile) 425 | print 'Done.' 
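    # Note: trng.multinomial(pvals=next_probs) draws a one-hot row per sample and
    # argmax(1) turns it back into a word index, i.e. next_sample holds one word
    # drawn from each softmax row.  A rough NumPy equivalent (illustration only,
    # where next_p is the probability matrix returned by f_next):
    #
    #     nw = numpy.array([numpy.random.choice(p.shape[0], p=p) for p in next_p])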
426 | 427 | return f_init, f_next 428 | 429 | def build_partial(tparams, options, trng): 430 | 431 | assert options.get('birnn', False), 'must used in uni-directional mode' 432 | 433 | x = tensor.matrix('x', dtype='int64') 434 | prev_state = tensor.matrix('prev_state', dtype='float32') 435 | n_timesteps = x.shape[0] 436 | n_samples = x.shape[1] 437 | 438 | # word embedding (source), forward and backward 439 | emb = tparams['Wemb'][x.flatten()] 440 | emb = emb.reshape([n_timesteps, n_samples, options['dim_word']]) 441 | 442 | # encoder 443 | proj = get_layer(options['encoder'])[1](tparams, emb, options, 444 | one_step=True, 445 | _init_state=prev_state, 446 | prefix='encoder') 447 | next_state = proj[0] 448 | 449 | 450 | print 'Building f_partial...', 451 | outs = [next_state] 452 | f_partial = theano.function([x, prev_state], outs, name='f_partial', profile=profile) 453 | print 'Done' 454 | 455 | return f_partial 456 | 457 | 458 | def build_simultaneous_sampler(tparams, options, trng): 459 | x = tensor.matrix('x', dtype='int64') 460 | 461 | n_timesteps = x.shape[0] 462 | n_samples = x.shape[1] 463 | 464 | # word embedding (source), forward and backward 465 | emb = tparams['Wemb'][x.flatten()] 466 | emb = emb.reshape([n_timesteps, n_samples, options['dim_word']]) 467 | 468 | # encoder 469 | proj = get_layer(options['encoder'])[1](tparams, emb, options, prefix='encoder') 470 | 471 | # bi-rnn 472 | if options.get('birnn', False): 473 | xr = x[::-1] 474 | 475 | embr = tparams['Wemb'][xr.flatten()] 476 | embr = embr.reshape([n_timesteps, n_samples, options['dim_word']]) 477 | projr = get_layer(options['encoder'])[1](tparams, embr, options, 478 | prefix='encoder_r') 479 | 480 | ## concatenate forward and backward rnn hidden states 481 | ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim-1) 482 | 483 | else: 484 | ctx = proj[0] 485 | 486 | # get the input for decoder rnn initializer mlp 487 | ctx_mean = ctx.mean(0) 488 | # ctx_mean = concatenate([proj[0][-1],projr[0][-1]], axis=proj[0].ndim-2) 489 | init_state = get_layer('ff')[1](tparams, ctx_mean, options, 490 | prefix='ff_state', activ='tanh') 491 | 492 | print 'Building f_ctx/init...', 493 | 494 | f_sim_ctx = theano.function([x], ctx, name = 'f_sim_ctx') 495 | f_sim_init = theano.function([ctx], init_state, name='f_sim_init', profile=profile) 496 | 497 | print 'Done.' 
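    # In the simultaneous setting the encoder is queried on a growing source
    # prefix: f_sim_ctx returns the full sequence of encoder states and
    # f_sim_init maps any prefix of them to an initial decoder state.  An
    # illustrative sketch, with sidx the number of source words read so far
    # (this is how noisy_translator.py uses the pair):
    #
    #     ctx_all = f_sim_ctx(x)                  # (src_len, n_samples, ctxdim)
    #     z0 = f_sim_init(ctx_all[:sidx, :])      # re-initialise on the prefix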
498 | 499 | # -------------------------------------------------------------------------------- # 500 | y = tensor.vector('y_sampler', dtype='int64') 501 | ctx = tensor.tensor3('context_vectors', dtype='float32') 502 | mask = tensor.matrix('context_mask', dtype='float32') 503 | init_state = tensor.matrix('init_state', dtype='float32') 504 | use_noise = theano.shared(numpy.float32(0.)) 505 | 506 | # if it's the first word, emb should be all zero and it is indicated by -1 507 | emb = tensor.switch(y[:, None] < 0, 508 | tensor.alloc(0., 1, tparams['Wemb_dec'].shape[1]), 509 | tparams['Wemb_dec'][y]) 510 | 511 | # apply one step of conditional gru with attention 512 | proj = get_layer(options['decoder'])[1](tparams, emb, options, 513 | prefix='decoder', 514 | mask=None, context=ctx, 515 | one_step=True, 516 | init_state=init_state, 517 | context_mask=mask) 518 | 519 | # get the next hidden state 520 | next_state = proj[0] 521 | 522 | # get the weighted averages of context for this target word y 523 | ctxs = proj[1] 524 | attention = proj[2] 525 | 526 | logit_lstm = get_layer('ff')[1](tparams, next_state, options, 527 | prefix='ff_logit_lstm', activ='linear') 528 | logit_prev = get_layer('ff')[1](tparams, emb, options, 529 | prefix='ff_logit_prev', activ='linear') 530 | logit_ctx = get_layer('ff')[1](tparams, ctxs, options, 531 | prefix='ff_logit_ctx', activ='linear') 532 | logit = tensor.tanh(logit_lstm + logit_prev + logit_ctx) 533 | 534 | if options['use_dropout']: 535 | logit = dropout_layer(logit, use_noise, trng) 536 | 537 | logit = get_layer('ff')[1](tparams, logit, options, 538 | prefix='ff_logit', activ='linear') 539 | 540 | # compute the softmax probability 541 | next_probs = tensor.nnet.softmax(logit) 542 | 543 | # sample from softmax distribution to get the sample 544 | next_sample = trng.multinomial(pvals=next_probs).argmax(1) 545 | 546 | # ***== special care: use additional inforamtion ====*** # 547 | # compile a function to do the whole thing above, next word probability, 548 | # sampled word for the next target, next hidden state to be used 549 | print 'Building f_sim_next..', 550 | inps = [y, ctx, mask, init_state] 551 | ctxdim = options['dim'] if not options.get('birnn', False) else 2 * options['dim'] 552 | 553 | if 'pre' in options and options['pre']: 554 | assert not options.get('birnn', False), 'should not use birnn for SimulTrans' 555 | 556 | read_head = tensor.ivector('read_head') 557 | forget_head = tensor.ivector('forget_head') 558 | inps += [read_head, forget_head] 559 | 560 | def _grab(contexts, index): 561 | assert contexts.ndim == 3 562 | 563 | batch_size = contexts.shape[1] 564 | return contexts[index, tensor.arange(batch_size), :] 565 | 566 | last_ctx = _grab(ctx, read_head) 567 | first_ctx = _grab(ctx, forget_head) 568 | next_max_w = tparams['Wemb_dec'][next_probs.argmax(1)] 569 | 570 | readout = tensor.concatenate([next_state, ctxs, last_ctx, first_ctx, next_max_w], axis=-1) 571 | options['readout_dim'] = options['dim_word'] + ctxdim * 3 + options['dim'] 572 | 573 | else: 574 | print 'with normal input' 575 | readout = tensor.concatenate([next_state, ctxs, emb], axis=-1) # the obersavtion for each step. 576 | options['readout_dim'] = options['dim_word'] + options['dim'] + ctxdim 577 | 578 | outs = [next_probs, next_sample, next_state, readout, attention] 579 | f_sim_next = theano.function(inps, outs, name='f_sim_next', profile=profile) 580 | print 'Done.' 
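    # The 'readout' output is the per-step observation handed to the policy /
    # controller network; options['readout_dim'] records its size.  In the default
    # branch it is the concatenation [next_state; ctxs; emb], so with the purely
    # illustrative setting dim = 1000, dim_word = 500 and a uni-directional
    # encoder (ctxdim = dim) one gets readout_dim = 1000 + 1000 + 500 = 2500.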
581 | 582 | return f_sim_ctx, f_sim_init, f_sim_next 583 | 584 | # ---------------------------------------------------------------------------- # 585 | # What we need are this part = v = # 586 | # # 587 | # ---------> for reinforcement noisy decoding # 588 | # ---------------------------------------------------------------------------- # 589 | 590 | def build_noisy_sampler(tparams, options, trng): 591 | x = tensor.matrix('x', dtype='int64') 592 | 593 | n_timesteps = x.shape[0] 594 | n_samples = x.shape[1] 595 | 596 | # word embedding (source), forward and backward 597 | emb = tparams['Wemb'][x.flatten()] 598 | emb = emb.reshape([n_timesteps, n_samples, options['dim_word']]) 599 | 600 | # encoder 601 | proj = get_layer(options['encoder'])[1](tparams, emb, options, prefix='encoder') 602 | if options.get('birnn', False): 603 | xr = x[::-1] 604 | embr = tparams['Wemb'][xr.flatten()] 605 | embr = embr.reshape([n_timesteps, n_samples, options['dim_word']]) 606 | projr = get_layer(options['encoder'])[1](tparams, embr, options, 607 | prefix='encoder_r') 608 | ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim-1) 609 | 610 | else: 611 | ctx = proj[0] 612 | 613 | # get the input for decoder rnn initializer mlp 614 | ctx_mean = ctx.mean(0) 615 | init_state = get_layer('ff')[1](tparams, ctx_mean, options, 616 | prefix='ff_state', activ='tanh') 617 | 618 | print 'Building Encoder: f_ctx/init...', 619 | 620 | f_sim_ctx = theano.function([x], ctx, name = 'f_sim_ctx') 621 | f_sim_init = theano.function([ctx], init_state, name='f_sim_init', profile=profile) 622 | 623 | print 'Done.' 624 | 625 | # -------------------------------------------------------------------------------- # 626 | y = tensor.vector('y_sampler', dtype='int64') 627 | ctx = tensor.tensor3('context_vectors', dtype='float32') 628 | mask = tensor.matrix('context_mask', dtype='float32') 629 | prev_state = tensor.matrix('prev_state', dtype='float32') 630 | use_noise = theano.shared(numpy.float32(0.)) 631 | 632 | injd_noise = tensor.matrix('injected_noise', dtype='float32') 633 | 634 | # if it's the first word, emb should be all zero and it is indicated by -1 635 | emb = tensor.switch(y[:, None] < 0, 636 | tensor.alloc(0., 1, tparams['Wemb_dec'].shape[1]), 637 | tparams['Wemb_dec'][y]) 638 | 639 | # inject noise 640 | init_state = prev_state + injd_noise # apply the injected noise 641 | 642 | # apply one step of conditional gru with attention 643 | proj = get_layer(options['decoder'])[1](tparams, emb, options, 644 | prefix='decoder', 645 | mask=None, context=ctx, 646 | one_step=True, 647 | init_state=init_state, 648 | context_mask=mask) 649 | 650 | # get the next hidden state 651 | next_state = proj[0] 652 | 653 | # get the weighted averages of context for this target word y 654 | ctxs = proj[1] 655 | attention = proj[2] 656 | 657 | logit_lstm = get_layer('ff')[1](tparams, next_state, options, 658 | prefix='ff_logit_lstm', activ='linear') 659 | logit_prev = get_layer('ff')[1](tparams, emb, options, 660 | prefix='ff_logit_prev', activ='linear') 661 | logit_ctx = get_layer('ff')[1](tparams, ctxs, options, 662 | prefix='ff_logit_ctx', activ='linear') 663 | logit = tensor.tanh(logit_lstm + logit_prev + logit_ctx) 664 | 665 | if options['use_dropout']: 666 | logit = dropout_layer(logit, use_noise, trng) 667 | 668 | logit = get_layer('ff')[1](tparams, logit, options, 669 | prefix='ff_logit', activ='linear') 670 | 671 | # compute the softmax probability 672 | next_probs = tensor.nnet.softmax(logit) 673 | 674 | # sample from softmax 
distribution to get the sample 675 | next_sample = trng.multinomial(pvals=next_probs).argmax(1) 676 | 677 | # compile the function read-out and samples 678 | print 'Building f_sim_next..', 679 | 680 | inps = [y, ctx, mask, prev_state, injd_noise] 681 | ctxdim = options['dim'] if not options.get('birnn', False) else 2 * options['dim'] 682 | readout = tensor.concatenate([next_state, ctxs, emb], axis=-1) # the obersavtion for each step. 683 | options['readout_dim'] = options['dim_word'] + options['dim'] + ctxdim 684 | 685 | outs = [next_probs, next_sample, next_state, readout, attention] 686 | f_sim_next = theano.function(inps, outs, name='f_sim_next', profile=profile) 687 | 688 | print 'Done.' 689 | return f_sim_ctx, f_sim_init, f_sim_next 690 | 691 | 692 | # generate sample, either with stochastic sampling or beam search. Note that, 693 | # this function iteratively calls f_init and f_next functions. 694 | def gen_sample(tparams, f_init, f_next, x, options, trng=None, k=1, maxlen=30, 695 | stochastic=True, argmax=False, sigma=-1.): 696 | 697 | # k is the beam size we have 698 | if k > 1: 699 | assert not stochastic, \ 700 | 'Beam search does not support stochastic sampling' 701 | 702 | sample = [] 703 | sample_score = [] 704 | if stochastic: 705 | sample_score = 0 706 | 707 | live_k = 1 708 | dead_k = 0 709 | 710 | hyp_samples = [[]] * live_k 711 | hyp_scores = numpy.zeros(live_k).astype('float32') 712 | hyp_states = [] 713 | 714 | # get initial state of decoder rnn and encoder context 715 | ret = f_init(x) 716 | next_state, ctx0 = ret[0], ret[1] 717 | next_w = -1 * numpy.ones((1,)).astype('int64') # bos indicator 718 | 719 | for ii in xrange(maxlen): 720 | ctx = numpy.tile(ctx0, [live_k, 1]) 721 | 722 | if sigma > 0.: 723 | next_state_inp = next_state + numpy.float32((sigma/(ii+1)) * numpy.random.randn(*next_state.shape)) 724 | else: 725 | next_state_inp = next_state 726 | 727 | inps = [next_w, ctx, next_state_inp] 728 | ret = f_next(*inps) 729 | next_p, next_w, next_state = ret[0], ret[1], ret[2] 730 | 731 | if stochastic: 732 | if argmax: 733 | nw = next_p[0].argmax() 734 | else: 735 | nw = next_w[0] 736 | sample.append(nw) 737 | sample_score += next_p[0, nw] 738 | if nw == 0: 739 | break 740 | else: 741 | cand_scores = hyp_scores[:, None] - numpy.log(next_p) 742 | cand_flat = cand_scores.flatten() 743 | ranks_flat = cand_flat.argsort()[:(k-dead_k)] 744 | 745 | voc_size = next_p.shape[1] 746 | trans_indices = ranks_flat / voc_size 747 | word_indices = ranks_flat % voc_size 748 | costs = cand_flat[ranks_flat] 749 | 750 | new_hyp_samples = [] 751 | new_hyp_scores = numpy.zeros(k-dead_k).astype('float32') 752 | new_hyp_states = [] 753 | 754 | for idx, [ti, wi] in enumerate(zip(trans_indices, word_indices)): 755 | new_hyp_samples.append(hyp_samples[ti]+[wi]) 756 | new_hyp_scores[idx] = copy.copy(costs[idx]) 757 | new_hyp_states.append(copy.copy(next_state[ti])) 758 | 759 | # check the finished samples 760 | new_live_k = 0 761 | hyp_samples = [] 762 | hyp_scores = [] 763 | hyp_states = [] 764 | 765 | for idx in xrange(len(new_hyp_samples)): 766 | if new_hyp_samples[idx][-1] == 0: 767 | sample.append(new_hyp_samples[idx]) 768 | sample_score.append(new_hyp_scores[idx]) 769 | dead_k += 1 770 | else: 771 | new_live_k += 1 772 | hyp_samples.append(new_hyp_samples[idx]) 773 | hyp_scores.append(new_hyp_scores[idx]) 774 | hyp_states.append(new_hyp_states[idx]) 775 | hyp_scores = numpy.array(hyp_scores) 776 | live_k = new_live_k 777 | 778 | if new_live_k < 1: 779 | break 780 | if dead_k >= k: 781 | 
break 782 | 783 | next_w = numpy.array([w[-1] for w in hyp_samples]) 784 | next_state = numpy.array(hyp_states) 785 | 786 | if not stochastic: 787 | # dump every remaining one 788 | if live_k > 0: 789 | for idx in xrange(live_k): 790 | sample.append(hyp_samples[idx]) 791 | sample_score.append(hyp_scores[idx]) 792 | 793 | return sample, sample_score 794 | 795 | 796 | # calculate the log probablities on a given corpus using translation model 797 | def pred_probs(f_log_probs, prepare_data, options, iterator, verbose=True): 798 | probs = [] 799 | 800 | n_done = 0 801 | 802 | for x, y in iterator: 803 | n_done += len(x) 804 | 805 | x, x_mask, y, y_mask = prepare_data(x, y, 806 | n_words_src=options['n_words_src'], 807 | n_words=options['n_words']) 808 | 809 | pprobs = f_log_probs(x, x_mask, y, y_mask) 810 | for pp in pprobs: 811 | probs.append(pp) 812 | 813 | #if numpy.isnan(numpy.mean(probs)): 814 | # ipdb.set_trace() 815 | 816 | if verbose: 817 | print >>sys.stderr, '%d samples computed' % (n_done) 818 | 819 | return numpy.array(probs) 820 | 821 | #-----------------------------------------------------------------------------# 822 | # Batch preparation 823 | 824 | def prepare_data(seqs_x, 825 | seqs_y, 826 | maxlen=None, 827 | n_words_src=30000, 828 | n_words=30000): 829 | 830 | # x: a list of sentences 831 | lengths_x = [len(s) for s in seqs_x] 832 | lengths_y = [len(s) for s in seqs_y] 833 | 834 | if maxlen is not None: 835 | new_seqs_x = [] 836 | new_seqs_y = [] 837 | new_lengths_x = [] 838 | new_lengths_y = [] 839 | for l_x, s_x, l_y, s_y in zip(lengths_x, seqs_x, lengths_y, seqs_y): 840 | if l_x < maxlen and l_y < maxlen: 841 | new_seqs_x.append(s_x) 842 | new_lengths_x.append(l_x) 843 | new_seqs_y.append(s_y) 844 | new_lengths_y.append(l_y) 845 | lengths_x = new_lengths_x 846 | seqs_x = new_seqs_x 847 | lengths_y = new_lengths_y 848 | seqs_y = new_seqs_y 849 | 850 | if len(lengths_x) < 1 or len(lengths_y) < 1: 851 | return None, None, None, None 852 | 853 | n_samples = len(seqs_x) 854 | maxlen_x = numpy.max(lengths_x) + 1 855 | maxlen_y = numpy.max(lengths_y) + 1 856 | 857 | x = numpy.zeros((maxlen_x, n_samples)).astype('int64') 858 | y = numpy.zeros((maxlen_y, n_samples)).astype('int64') 859 | x_mask = numpy.zeros((maxlen_x, n_samples)).astype('float32') 860 | y_mask = numpy.zeros((maxlen_y, n_samples)).astype('float32') 861 | for idx, [s_x, s_y] in enumerate(zip(seqs_x, seqs_y)): 862 | x[:lengths_x[idx], idx] = s_x 863 | x_mask[:lengths_x[idx]+1, idx] = 1. 864 | y[:lengths_y[idx], idx] = s_y 865 | y_mask[:lengths_y[idx]+1, idx] = 1. 
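    # Worked example (illustrative token ids): seqs_y = [[5, 6, 7], [3, 4, 5, 6, 7]]
    # gives lengths_y = [3, 5] and maxlen_y = 6 (one extra row for the 0 that marks
    # the end of sentence), hence, time-major:
    #     y = [[5, 3],          y_mask = [[1, 1],
    #          [6, 4],                    [1, 1],
    #          [7, 5],                    [1, 1],
    #          [0, 6],                    [1, 1],
    #          [0, 7],                    [0, 1],
    #          [0, 0]]                    [0, 1]]
    # i.e. the mask covers every real token plus the closing 0, and pure padding
    # is masked out.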
866 | 867 | return x, x_mask, y, y_mask 868 | 869 | 870 | #-----------------------------------------------------------------------------# 871 | # Training Function: 872 | 873 | def train(dim_word = 100, # word vector dimensionality 874 | dim = 1000, # the number of RNN units 875 | encoder = 'gru', 876 | decoder = 'gru_cond', 877 | patience = 10, # early stopping patience 878 | max_epochs = 5000, 879 | finish_after = 10000000, # finish after this many updates 880 | dispFreq = 100, 881 | decay_c = 0., # L2 regularization penalty 882 | alpha_c = 0., # alignment regularization 883 | clip_c = -1., # gradient clipping threshold 884 | lrate = 0.01, # learning rate 885 | n_words_src = 100000, # source vocabulary size 886 | n_words = 100000, # target vocabulary size 887 | maxlen = 100, # maximum length of the description 888 | optimizer = 'rmsprop', 889 | batch_size = 16, 890 | valid_batch_size = 16, 891 | saveto = 'model.npz', 892 | validFreq = 1000, 893 | saveFreq = 1000, # save the parameters after every saveFreq updates 894 | sampleFreq = 100, # generate some samples after every sampleFreq 895 | datasets =[ 896 | '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok', 897 | '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok'], 898 | 899 | valid_datasets=['../data/dev/newstest2011.en.tok', 900 | '../data/dev/newstest2011.fr.tok'], 901 | 902 | dictionaries=[ 903 | '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl', 904 | '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl'], 905 | 906 | use_dropout = False, 907 | reload_ = False, 908 | overwrite = False): 909 | 910 | # Model options 911 | model_options = locals().copy() 912 | 913 | # load dictionaries and invert them 914 | worddicts = [None] * len(dictionaries) 915 | worddicts_r = [None] * len(dictionaries) 916 | for ii, dd in enumerate(dictionaries): 917 | with open(dd, 'rb') as f: 918 | worddicts[ii] = pkl.load(f) 919 | worddicts_r[ii] = dict() 920 | for kk, vv in worddicts[ii].iteritems(): 921 | worddicts_r[ii][vv] = kk 922 | 923 | # reload options 924 | if reload_ and os.path.exists(saveto): 925 | print 'Reloading model options' 926 | with open('%s.pkl' % saveto, 'rb') as f: 927 | model_options = pkl.load(f) 928 | 929 | print 'Loading data' 930 | train = TextIterator(datasets[0], datasets[1], 931 | dictionaries[0], dictionaries[1], 932 | n_words_source=n_words_src, n_words_target=n_words, 933 | batch_size=batch_size, 934 | maxlen=maxlen) 935 | valid = TextIterator(valid_datasets[0], valid_datasets[1], 936 | dictionaries[0], dictionaries[1], 937 | n_words_source=n_words_src, n_words_target=n_words, 938 | batch_size=valid_batch_size, 939 | maxlen=maxlen) 940 | 941 | print 'Building model' 942 | params = init_params(model_options) 943 | # reload parameters 944 | if reload_ and os.path.exists(saveto): 945 | print 'Reloading model parameters' 946 | params = load_params(saveto, params) 947 | 948 | tparams = init_tparams(params) 949 | 950 | trng, use_noise, \ 951 | x, x_mask, y, y_mask, \ 952 | opt_ret, \ 953 | cost, f_cost = \ 954 | build_model(tparams, model_options) 955 | inps = [x, x_mask, y, y_mask] 956 | 957 | print 'Building sampler' 958 | f_init, f_next = build_sampler(tparams, model_options, trng) 959 | 960 | # before any regularizer 961 | print 'Building f_log_probs...', 962 | f_log_probs = theano.function(inps, cost, profile=profile) 963 | print 'Done' 964 | 965 | cost = cost.mean() 966 | 967 | # apply L2 regularization on weights 968 | if decay_c > 0.: 969 | decay_c = theano.shared(numpy.float32(decay_c), 
name='decay_c') 970 | weight_decay = 0. 971 | for kk, vv in tparams.iteritems(): 972 | weight_decay += (vv ** 2).sum() 973 | weight_decay *= decay_c 974 | cost += weight_decay 975 | 976 | # regularize the alpha weights 977 | if alpha_c > 0. and not model_options['decoder'].endswith('simple'): 978 | alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c') 979 | alpha_reg = alpha_c * ( 980 | (tensor.cast(y_mask.sum(0)//x_mask.sum(0), 'float32')[:, None] - 981 | opt_ret['dec_alphas'].sum(0))**2).sum(1).mean() 982 | cost += alpha_reg 983 | 984 | # after all regularizers - compile the computational graph for cost 985 | print 'Building f_cost...', 986 | f_cost = theano.function(inps, cost, profile=profile) 987 | print 'Done' 988 | 989 | print 'Computing gradient...', 990 | grads = tensor.grad(cost, wrt=itemlist(tparams)) 991 | print 'Done' 992 | 993 | # apply gradient clipping here 994 | if clip_c > 0.: 995 | g2 = 0. 996 | for g in grads: 997 | g2 += (g**2).sum() 998 | new_grads = [] 999 | for g in grads: 1000 | new_grads.append(tensor.switch(g2 > (clip_c**2), 1001 | g / tensor.sqrt(g2) * clip_c, 1002 | g)) 1003 | grads = new_grads 1004 | 1005 | # compile the optimizer, the actual computational graph is compiled here 1006 | lr = tensor.scalar(name='lr') 1007 | print 'Building optimizers...', 1008 | f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost) 1009 | print 'Done' 1010 | 1011 | print 'Optimization' 1012 | 1013 | best_p = None 1014 | bad_counter = 0 1015 | uidx = 0 1016 | estop = False 1017 | history_errs = [] 1018 | # reload history 1019 | if reload_ and os.path.exists(saveto): 1020 | rmodel = numpy.load(saveto) 1021 | history_errs = list(rmodel['history_errs']) 1022 | if 'uidx' in rmodel: 1023 | uidx = rmodel['uidx'] 1024 | 1025 | if validFreq == -1: 1026 | validFreq = len(train[0])/batch_size 1027 | if saveFreq == -1: 1028 | saveFreq = len(train[0])/batch_size 1029 | if sampleFreq == -1: 1030 | sampleFreq = len(train[0])/batch_size 1031 | 1032 | for eidx in xrange(max_epochs): 1033 | n_samples = 0 1034 | 1035 | for x, y in train: 1036 | n_samples += len(x) 1037 | uidx += 1 1038 | use_noise.set_value(1.) 1039 | 1040 | x, x_mask, y, y_mask = prepare_data(x, y, maxlen=maxlen, 1041 | n_words_src=n_words_src, 1042 | n_words=n_words) 1043 | 1044 | if x is None: 1045 | print 'Minibatch with zero sample under length ', maxlen 1046 | uidx -= 1 1047 | continue 1048 | 1049 | ud_start = time.time() 1050 | 1051 | # compute cost, grads and copy grads to shared variables 1052 | cost = f_grad_shared(x, x_mask, y, y_mask) 1053 | 1054 | # do the update on parameters 1055 | f_update(lrate) 1056 | 1057 | ud = time.time() - ud_start 1058 | 1059 | # check for bad numbers, usually we remove non-finite elements 1060 | # and continue training - but not done here 1061 | if numpy.isnan(cost) or numpy.isinf(cost): 1062 | print 'NaN detected' 1063 | return 1., 1., 1. 
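            # (the update above follows the two-function optimizer protocol:
            #  f_grad_shared evaluates the cost and stores the gradients in
            #  shared variables, f_update then applies the parameter update
            #  for the given learning rate lrate)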
1064 | 1065 | # verbose 1066 | if numpy.mod(uidx, dispFreq) == 0: 1067 | print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud 1068 | 1069 | # save the best model so far, in addition, save the latest model 1070 | # into a separate file with the iteration number for external eval 1071 | if numpy.mod(uidx, saveFreq) == 0: 1072 | print 'Saving the best model...', 1073 | if best_p is not None: 1074 | params = best_p 1075 | else: 1076 | params = unzip(tparams) 1077 | numpy.savez(saveto, history_errs=history_errs, uidx=uidx, **params) 1078 | pkl.dump(model_options, open('%s.pkl' % saveto, 'wb')) 1079 | params = unzip(tparams) 1080 | numpy.savez('%s.current'%(saveto), history_errs=history_errs, **params) 1081 | pkl.dump(model_options, open('%s.current.pkl' % saveto, 'wb')) 1082 | print 'Done' 1083 | 1084 | # save with uidx 1085 | if not overwrite: 1086 | print 'Saving the model at iteration {}...'.format(uidx), 1087 | saveto_uidx = '{}.iter{}.npz'.format( 1088 | os.path.splitext(saveto)[0], uidx) 1089 | numpy.savez(saveto_uidx, history_errs=history_errs, 1090 | uidx=uidx, **unzip(tparams)) 1091 | print 'Done' 1092 | 1093 | 1094 | # generate some samples with the model and display them 1095 | if numpy.mod(uidx, sampleFreq) == 0: 1096 | # FIXME: random selection? 1097 | for jj in xrange(numpy.minimum(5, x.shape[1])): 1098 | stochastic = False 1099 | sample, score = gen_sample(tparams, f_init, f_next, 1100 | x[:, jj][:, None], 1101 | model_options, trng=trng, k=1, 1102 | maxlen=30, 1103 | stochastic=stochastic, 1104 | argmax=True) 1105 | print 'Source ', jj, ': ', 1106 | ss = [] 1107 | for vv in x[:, jj]: 1108 | if vv == 0: 1109 | break 1110 | if vv in worddicts_r[0]: 1111 | ss.append(worddicts_r[0][vv]) 1112 | else: 1113 | ss.append('UNK') 1114 | print ' '.join(ss).replace('@@ ', '') 1115 | print 'Truth ', jj, ' : ', 1116 | ss = [] 1117 | for vv in y[:, jj]: 1118 | if vv == 0: 1119 | break 1120 | if vv in worddicts_r[1]: 1121 | ss.append(worddicts_r[1][vv]) 1122 | else: 1123 | ss.append('UNK') 1124 | print ' '.join(ss).replace('@@ ', '') 1125 | print 'Sample ', jj, ': ', 1126 | tt = [] 1127 | score = score / numpy.array([len(s) for s in sample]) 1128 | ss = sample[score.argmin()] 1129 | for vv in ss: 1130 | if vv == 0: 1131 | break 1132 | if vv in worddicts_r[1]: 1133 | tt.append(worddicts_r[1][vv]) 1134 | else: 1135 | tt.append('UNK') 1136 | print ' '.join(tt).replace('@@ ', '') 1137 | 1138 | # validate model on validation set and early stop if necessary 1139 | if numpy.mod(uidx, validFreq) == 0: 1140 | use_noise.set_value(0.) 1141 | valid_errs = pred_probs(f_log_probs, prepare_data, 1142 | model_options, valid) 1143 | valid_err = valid_errs.mean() 1144 | history_errs.append(valid_err) 1145 | 1146 | if uidx == 0 or valid_err <= numpy.array(history_errs).min(): 1147 | best_p = unzip(tparams) 1148 | bad_counter = 0 1149 | if len(history_errs) > patience and valid_err >= \ 1150 | numpy.array(history_errs)[:-patience].min(): 1151 | bad_counter += 1 1152 | if bad_counter > patience: 1153 | print 'Early Stop!' 1154 | estop = True 1155 | break 1156 | 1157 | #if numpy.isnan(valid_err): 1158 | # ipdb.set_trace() 1159 | 1160 | print 'Valid ', valid_err 1161 | 1162 | # finish after this many updates 1163 | if uidx >= finish_after: 1164 | print 'Finishing after %d iterations!' 
% uidx 1165 | estop = True 1166 | break 1167 | 1168 | print 'Seen %d samples' % n_samples 1169 | 1170 | if estop: 1171 | break 1172 | 1173 | if best_p is not None: 1174 | zipp(best_p, tparams) 1175 | 1176 | use_noise.set_value(0.) 1177 | valid_err = pred_probs(f_log_probs, prepare_data, 1178 | model_options, valid).mean() 1179 | 1180 | print 'Valid ', valid_err 1181 | 1182 | params = copy.copy(best_p) 1183 | numpy.savez(saveto, zipped_params=best_p, 1184 | history_errs=history_errs, 1185 | uidx=uidx, 1186 | **params) 1187 | 1188 | return valid_err 1189 | 1190 | def grad_clip(dJ, clip_c=1): 1191 | clip_c = clip_c. 1192 | if clip_c > 0.: 1193 | g2 = 0. 1194 | for g in dJ: 1195 | g2 += (g ** 2).sum() 1196 | new_grads = [] 1197 | for g in dJ: 1198 | new_grads.append(tensor.switch(g2 > (clip_c ** 2), g / tensor.sqrt(g2) * clip_c, g)) 1199 | dJ = new_grads 1200 | return dJ 1201 | 1202 | if __name__ == '__main__': 1203 | pass 1204 | -------------------------------------------------------------------------------- /noisy_translator.py: -------------------------------------------------------------------------------- 1 | """ 2 | Neural Machine Translation with Reinforcement Bias 3 | """ 4 | 5 | from nmt_uni import * 6 | from reward import translation_cost 7 | import time 8 | 9 | time = time.time 10 | 11 | # utility functions 12 | def _seqs2words(caps, idict): 13 | capsw = [] 14 | for cc in caps: 15 | ww = [] 16 | for w in cc: 17 | if w == 0: 18 | break 19 | ww.append(idict[w]) 20 | capsw.append(' '.join(ww)) 21 | return capsw 22 | 23 | def _bpe2words(capsw): 24 | capw = [] 25 | for cc in capsw: 26 | capw += [cc.replace('@@ ', '')] 27 | return capw 28 | 29 | def _action2delay(src, actions): 30 | delays = [] 31 | X = len(src) 32 | for act in actions: 33 | A = numpy.array(act, dtype='float32') 34 | Y = numpy.sum(act) 35 | S = numpy.sum(numpy.cumsum(1 - A) * A) 36 | 37 | assert (X > 0) and (Y > 0), 'avoid NAN {}, {}'.format(X, Y) 38 | 39 | tau = S / (Y * X) 40 | delays.append([tau, X, Y, S]) 41 | 42 | return delays 43 | 44 | 45 | # padding for computing policy gradient 46 | def _padding(arrays, shape, dtype='float32', return_mask=False, sidx=0): 47 | B = numpy.zeros(shape, dtype=dtype) 48 | 49 | if return_mask: 50 | M = numpy.zeros((shape[0], shape[1]), dtype='float32') 51 | 52 | for it, arr in enumerate(arrays): 53 | arr = numpy.asarray(arr, dtype=dtype) 54 | # print arr.shape 55 | 56 | steps = arr.shape[0] 57 | 58 | if arr.ndim < 2: 59 | B[sidx: steps + sidx, it] = arr 60 | else: 61 | steps2 = arr.shape[1] 62 | B[sidx: steps + sidx, it, : steps2] = arr 63 | 64 | if return_mask: 65 | M[sidx: steps + sidx, it] = 1. 
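            # Illustration (made-up values): _padding([[1, 0, 1], [1, 1]],
            # shape=(4, 2), return_mask=True) returns
            #     B = [[1, 1],        M = [[1, 1],
            #          [0, 1],             [1, 1],
            #          [1, 0],             [1, 0],
            #          [0, 0]]             [0, 0]]
            # and a positive sidx simply shifts every sequence down by sidx rows.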
66 | 67 | if return_mask: 68 | return B, M 69 | return B 70 | 71 | 72 | class PIPE(object): 73 | def __init__(self, keys=None): 74 | self.messages = OrderedDict() 75 | self.hyp_messages = OrderedDict() 76 | self.new_hyp_messages = OrderedDict() 77 | for key in keys: 78 | self.messages[key] = [] 79 | 80 | def reset(self): 81 | for key in self.messages: 82 | self.messages[key] = [] 83 | 84 | self.hyp_messages = OrderedDict() 85 | self.new_hyp_messages = OrderedDict() 86 | 87 | def clean_hyp(self): 88 | self.hyp_messages = OrderedDict() 89 | 90 | def clean_new_hyp(self): 91 | self.new_hyp_messages = OrderedDict() 92 | 93 | def init_hyp(self, key, live_k=None): 94 | if live_k is not None: 95 | self.hyp_messages[key] = [[] for _ in xrange(live_k)] 96 | else: 97 | self.hyp_messages[key] = [] 98 | 99 | def init_new_hyp(self, key, use_copy=False): 100 | if use_copy: 101 | self.new_hyp_messages[key] = copy.copy(self.hyp_messages[key]) 102 | else: 103 | self.new_hyp_messages[key] = [] 104 | 105 | def append(self, key, new, idx=None, use_hyp=False): 106 | if not use_hyp: 107 | self.new_hyp_messages[key].append(new) 108 | else: 109 | self.new_hyp_messages[key].append(self.hyp_messages[key][idx] + [new]) 110 | 111 | def append_new(self, key, idx, hyper=True): 112 | if hyper: 113 | self.hyp_messages[key].append(self.new_hyp_messages[key][idx]) 114 | else: 115 | # print self.messages['sample'] 116 | self.messages[key].append(self.new_hyp_messages[key][idx]) 117 | 118 | def add(self, key, new, idx): 119 | self.new_hyp_messages[key][idx] += new 120 | 121 | def asarray(self, key, replace=False): 122 | if replace: 123 | self.hyp_messages[key] = numpy.array(self.hyp_messages[key]) 124 | else: 125 | return numpy.array(self.hyp_messages[key], dtype='float32') 126 | 127 | def split(self): 128 | truth = OrderedDict() 129 | sample = OrderedDict() 130 | 131 | 132 | for key in self.messages: 133 | if key == 'source': 134 | continue 135 | 136 | truth[key] = [] 137 | sample[key] = [] 138 | 139 | if key == 'mask': 140 | for idx in xrange(len(self.messages['source'])): 141 | if self.messages['source'][idx] < 0: 142 | sample[key].append(self.messages[key][:, idx]) 143 | else: 144 | truth[key].append(self.messages[key][:, idx]) 145 | else: 146 | for idx in xrange(len(self.messages['source'])): 147 | if self.messages['source'][idx] < 0: 148 | sample[key].append(self.messages[key][idx]) 149 | else: 150 | truth[key].append(self.messages[key][idx]) 151 | 152 | self.messages = sample 153 | return truth 154 | 155 | 156 | 157 | # ============================================================================ # 158 | # Noisy Decoding in Batch-Mode 159 | # ============================================================================ # 160 | def noisy_decoding(f_sim_ctx, 161 | f_sim_init, 162 | f_sim_next, 163 | f_cost, 164 | srcs, # source sentences 165 | trgs, # taeget sentences 166 | t_idict=None, 167 | _policy=None, 168 | n_samples=10, 169 | maxlen=200, 170 | reward_config=None, 171 | train=False): 172 | """ 173 | :param f_init: initializer using the first "sidx" words. 
174 | :param f_sim_next: 175 | :param f_partial: 176 | :param src: the original input needed to be translated (just for the speed) 177 | :param step: step_size for each wait 178 | :param peek: 179 | hidden0 = _policy.init_hidden() 180 | :param sidx: pre-read sidx words from the source 181 | :return: 182 | """ 183 | Statistcs = OrderedDict() 184 | n_sentences = len(srcs) 185 | max_steps = -1 186 | 187 | # ======================================================================== # 188 | # Generating Trajectories based on Current Policy 189 | # ======================================================================== # 190 | 191 | live_k = n_samples * n_sentences 192 | live_all = live_k 193 | 194 | x, ctx0, z0, secs0 = [], [], [], [] 195 | # data initialization 196 | for id, (src, trg) in enumerate(zip(srcs, trgs)): 197 | 198 | _x = numpy.array(src, dtype='int64')[:, None] 199 | _ctx0 = f_sim_ctx(_x) 200 | _z0 = f_sim_init(_ctx0[:sidx, :]) 201 | 202 | x.append(_x[:, 0]) 203 | ctx0.append(_ctx0[:, 0, :]) 204 | z0.append(_z0.flatten()) 205 | secs0.append([id, len(src), 0]) # word id / source length / correctness 206 | 207 | # pad the results 208 | x, x_mask = _padding(x, (src_max, n_sentences), dtype='int64', return_mask=True) 209 | ctx = _padding(ctx0, (src_max, n_sentences, ctx0[0].shape[-1])) 210 | z0 = numpy.asarray(z0) 211 | mask = x_mask 212 | 213 | # initial actions and hidden states 214 | action0, _, _, hidden0 = _policy.init_action(n_samples=n_samples) 215 | 216 | x_mask = numpy.ones_like(x, dtype='float32') 217 | mask0 = x_mask 218 | 219 | # if we have multiple samples for one input sentence 220 | mask = numpy.tile(mask0, [1, n_samples]) 221 | z0 = numpy.tile(z0, [n_samples, 1]) 222 | ctx = numpy.tile(ctx, [1, n_samples, 1]) 223 | 224 | hidden0 = numpy.tile(hidden0, [live_k, 1]) 225 | action0 = numpy.tile(action0, [live_k, 1]) 226 | 227 | secs = [] 228 | for _ in xrange(live_k / n_sentences): 229 | secs += copy.deepcopy(secs0) 230 | 231 | # PIPE for message passing 232 | pipe = PIPE(['sample', 'score', 'action', 'obs', 'attentions','secs']) 233 | 234 | # Build for the temporal results: hyp-message 235 | for key in ['sample', 'obs', 'attentions', 'hidden', 'action']: 236 | pipe.init_hyp(key, live_k) 237 | 238 | # special care 239 | pipe.hyp_messages['score'] = numpy.zeros(live_k).astype('float32') 240 | pipe.hyp_messages['secs'] = secs 241 | pipe.hyp_messages['states'] = z0 242 | pipe.hyp_messages['mask'] = mask 243 | pipe.hyp_messages['ctx'] = ctx 244 | 245 | # these are inputs that needs to be updated 246 | prev_w = -1 * numpy.ones((live_k, )).astype('int64') 247 | prev_z = z0 248 | prev_hid = hidden0 249 | prev_noise = action0 250 | step = 0 251 | 252 | # ROLLOUT: Iteration until all the samples over. 253 | # Action space: 254 | # ======================================================================= 255 | while live_k > 0: 256 | 257 | step += 1 258 | 259 | # compute one step 260 | inps = [prev_w, ctx, mask, prev_z, prev_noise] 261 | next_p, _, next_z, next_o, next_a = f_sim_next(*inps) 262 | 263 | # obtain the candidate and the accumulated score. 
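# next_p holds one probability row per live hypothesis; the greedy candidate is
# the argmax of each row, and its probability is read back by indexing each row
# at the chosen word (next_p[range(live_k), _cand]) to extend the running score.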
264 | _cand = next_p.argmax(axis=-1) # live_k 265 | _score = next_p[range(live_k), _cand] 266 | 267 | # new place-holders for temporal results: new-hyp-message 268 | pipe.clean_new_hyp() 269 | 270 | for key in ['sample', 'score', 'attentions', 'secs', 'mask', 'ctx', 'states']: 271 | pipe.init_new_hyp(key, use_copy=True) 272 | 273 | for key in ['action', 'obs', 'hidden']: 274 | pipe.init_new_hyp(key, use_copy=False) 275 | 276 | 277 | # Rollout the action. 278 | _actions, _mean, _logstd, _hidden = _policy.action(next_o, prev_hid) # input the current observation 279 | 280 | 281 | # check each candidate 282 | for idx, wi in enumerate(_cand): 283 | 284 | # collect the action 285 | a = _actions[idx] # 1024-D Gaussian Vector 286 | 287 | # message appending 288 | pipe.append('obs', next_o[idx], idx=idx, use_hyp=True) 289 | pipe.append('action', a, idx=idx, use_hyp=True) # collect action. 290 | pipe.append('hidden', _hidden[idx]) 291 | 292 | # for commit: 293 | # update new_hyp_message 294 | pipe.add('sample', [wi], idx) 295 | pipe.add('score', _score[idx], idx) 296 | pipe.add('attentions', [next_a[idx]], idx) 297 | 298 | # *** special care 299 | pipe.new_hyp_messages['states'][idx] = next_z[idx] 300 | 301 | 302 | # kill the completed samples, so I need to build new hyp-messages 303 | pipe.clean_hyp() 304 | 305 | for key in ['sample', 'score', 'states', 306 | 'action', 'obs', 'attentions', 'hidden', 307 | 'ctx', 'secs', 'mask']: 308 | pipe.init_hyp(key) 309 | 310 | 311 | # print new_hyp_sample 312 | for idx in xrange(len(pipe.new_hyp_messages['sample'])): 313 | # check if reachs the end 314 | 315 | if (len(pipe.new_hyp_messages['sample'][idx]) >= maxlen) or \ 316 | (pipe.new_hyp_messages['sample'][idx][-1] == 0): 317 | 318 | for key in ['sample', 'score', 'action', 'obs', 'attentions']: 319 | pipe.append_new(key, idx, hyper=False) 320 | 321 | live_k -= 1 322 | 323 | else: 324 | 325 | for key in ['sample', 'score', 'states', 'action', 326 | 'obs', 'attentions', 'hidden']: 327 | pipe.append_new(key, idx, hyper=True) 328 | 329 | # *** special care *** 330 | pipe.hyp_messages['secs'].append(pipe.new_hyp_messages['secs'][idx]) 331 | pipe.hyp_messages['mask'].append(pipe.new_hyp_messages['mask'][:, idx]) 332 | pipe.hyp_messages['ctx'].append(pipe.new_hyp_messages['ctx'][:, idx]) 333 | 334 | 335 | 336 | # make it numpy array 337 | for key in ['score', 'mask', 'ctx', 'states', 'hidden']: 338 | pipe.asarray(key, True) 339 | 340 | pipe.hyp_messages['mask'] = pipe.hyp_messages['mask'].T 341 | if pipe.hyp_messages['ctx'].ndim == 3: 342 | pipe.hyp_messages['ctx'] = pipe.hyp_messages['ctx'].transpose(1, 0, 2) 343 | elif pipe.hyp_messages['ctx'].ndim == 2: 344 | pipe.hyp_messages['ctx'] = pipe.hyp_messages['ctx'][:, None, :] 345 | 346 | prev_z = pipe.hyp_messages['states'] 347 | prev_hid = pipe.hyp_messages['hidden'] 348 | mask = pipe.hyp_messages['mask'] 349 | ctx = pipe.hyp_messages['ctx'] 350 | 351 | prev_w = numpy.array([w[-1] if len(w) > 0 352 | else -1 for w in pipe.hyp_messages['sample']], 353 | dtype='int64') 354 | 355 | mask = numpy.tile(mask0, [1, live_k]) 356 | 357 | prev_noise = numpy.array([a[-1] for a in pipe.hyp_messages['action']], dtype='float32') 358 | # prev_noise = numpy.concatenate(pipe.hyp_messages['action'], axis=0) 359 | 360 | 361 | # ======================================================================= 362 | # Collecting Rewards. 
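# each finished hypothesis is converted back to words and scored against the
# reference via translation_cost (reward.py), which returns a per-step reward
# vector that is zero everywhere except the last step, where the smoothed
# sentence-BLEU is placed.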
363 | # ======================================================================= 364 | # print 'collect reward' 365 | R = [] 366 | track = [] 367 | reference = [_bpe2words(_seqs2words([trg], t_idict))[0].split()] 368 | for k in xrange(n_samples): 369 | sp, sc, act = [pipe.messages[key][k] for key in ['sample', 'score', 'action']] 370 | y = numpy.asarray(sp, dtype='int64')[:, None] 371 | y_mask = numpy.ones_like(y, dtype='float32') 372 | steps = len(act) 373 | 374 | # turn back to sentence level 375 | words = _seqs2words([sp], t_idict)[0] 376 | decoded = _bpe2words([words])[0].split() 377 | 378 | # -*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*- 379 | # reward configs 380 | keys = {"steps": steps, "y": y, 381 | "y_mask": y_mask, 382 | "x_mask": x_mask, 383 | "f_cost": f_cost, 384 | "sample": decoded, 385 | "reference": reference, 386 | "words": words} 387 | 388 | ret = translation_cost(**keys) 389 | Rk, bleu = ret 390 | 391 | R += [Rk] 392 | track += [bleu] 393 | 394 | pipe.messages['R'] = R 395 | pipe.messages['track'] = track 396 | 397 | # --------------------------------------------------- # 398 | # add to global lists. 399 | keywords = ['sample', 'action', 'obs', 'secs', 400 | 'attentions', 'score', 'track', 'R'] 401 | for k in keywords: 402 | if k not in Statistcs: 403 | Statistcs[k] = pipe.messages[k] 404 | else: 405 | Statistcs[k] += pipe.messages[k] 406 | 407 | 408 | # If not train, End here 409 | if not train: 410 | return Statistcs 411 | 412 | # ================================================================================================= # 413 | # Policy Gradient over Trajectories 414 | # ================================================================================================= # 415 | 416 | p_obs, p_mask \ 417 | = _padding(Observations, 418 | shape=(max_steps, n_samples * n_sentences, _policy.n_in), 419 | return_mask=True) 420 | p_r = _padding(Rewards, 421 | shape=(max_steps, n_samples * n_sentences)) 422 | p_act = _padding(Actions, 423 | shape=(max_steps, n_samples * n_sentences, _policy.n_out)) 424 | 425 | 426 | # print 'learning policy gradient' 427 | # learning 428 | info = _policy.get_learner()([p_obs, p_mask], p_act, p_r) 429 | 430 | # add the reward statistics 431 | q = Tracks 432 | info['Q'] = numpy.mean(q) 433 | info['A'] = numpy.mean(p_act) 434 | 435 | return Samples, Scores, Actions, Rewards, info 436 | 437 | 438 | -------------------------------------------------------------------------------- /noisytrans_training.py: -------------------------------------------------------------------------------- 1 | """ 2 | Neural Machine Translation with Greedy Decoding 3 | """ 4 | import argparse 5 | import os 6 | import cPickle as pkl 7 | 8 | from nmt_uni import * 9 | from policy import Controller as Policy 10 | from utils import Progbar, Monitor 11 | from noisy_translator import noisy_decoding 12 | from simultrans_infinite2 import _seqs2words, _bpe2words, _action2delay 13 | 14 | import time 15 | 16 | 17 | numpy.random.seed(19920206) 18 | timer = time.time 19 | 20 | # check hidden folders 21 | def check_env(): 22 | paths = ['.policy', '.pretrained', '.log', 23 | '.config', '.images', '.translate'] 24 | for p in paths: 25 | if not os.path.exists(p): 26 | os.mkdir(p) 27 | 28 | 29 | # run training function:: >>> 30 | def run_simultrans(model, 31 | options_file=None, 32 | config=None, 33 | policy=None, 34 | id=None, 35 | remote=False): 36 | # check envoriments 37 | check_env() 38 | if id is not None: 39 | fcon = '.config/{}.conf'.format(id) 40 | if 
os.path.exists(fcon): 41 | print 'load config files' 42 | policy, config = pkl.load(open(fcon, 'r')) 43 | 44 | # ======================================================================= # 45 | # load model model_options 46 | # ======================================================================= # 47 | _model = model 48 | model = '.pretrained/{}'.format(model) 49 | 50 | if options_file is not None: 51 | with open(options_file, 'rb') as f: 52 | options = pkl.load(f) 53 | else: 54 | with open('%s.pkl' % model, 'rb') as f: 55 | options = pkl.load(f) 56 | options['birnn'] = True 57 | 58 | print 'load options...' 59 | for w, p in sorted(options.items(), key=lambda x:x[0]): 60 | print '{}: {}'.format(w, p) 61 | 62 | # load detail settings from option file: 63 | dictionary, dictionary_target = options['dictionaries'] 64 | 65 | def _iter(fname): 66 | with open(fname, 'r') as f: 67 | for line in f: 68 | words = line.strip().split() 69 | x = map(lambda w: word_dict[w] if w in word_dict else 1, words) 70 | x = map(lambda ii: ii if ii < options['n_words'] else 1, x) 71 | x += [0] 72 | yield x 73 | 74 | def _check_length(fname): 75 | f = open(fname, 'r') 76 | count = 0 77 | for _ in f: 78 | count += 1 79 | f.close() 80 | 81 | return count 82 | 83 | # load source dictionary and invert 84 | with open(dictionary, 'rb') as f: 85 | word_dict = pkl.load(f) 86 | word_idict = dict() 87 | for kk, vv in word_dict.iteritems(): 88 | word_idict[vv] = kk 89 | word_idict[0] = '' 90 | word_idict[1] = 'UNK' 91 | 92 | # load target dictionary and invert 93 | with open(dictionary_target, 'rb') as f: 94 | word_dict_trg = pkl.load(f) 95 | word_idict_trg = dict() 96 | for kk, vv in word_dict_trg.iteritems(): 97 | word_idict_trg[vv] = kk 98 | word_idict_trg[0] = '' 99 | word_idict_trg[1] = 'UNK' 100 | 101 | # ======================================================================== # 102 | # Build a Translator 103 | # ======================================================================== # 104 | 105 | # allocate model parameters 106 | params = init_params(options) 107 | params = load_params(model, params) 108 | tparams = init_tparams(params) 109 | 110 | # print 'build the model for computing cost (full source sentence).' 111 | trng, use_noise, \ 112 | _x, _x_mask, _y, _y_mask, \ 113 | opt_ret, \ 114 | cost, f_cost = build_model(tparams, options) 115 | print 'done.' 116 | 117 | # functions for sampler 118 | # f_sim_ctx, f_sim_init, f_sim_next = build_simultaneous_sampler(tparams, options, trng) 119 | f_sim_ctx, f_sim_init, f_sim_next = build_noisy_sampler(tparams, options, trng) 120 | print 'build sampler done.' 
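# roughly, the three sampler functions split decoding into stages (see noisy_decoding):
#   f_sim_ctx(x)                        -> encoder annotations for the source sentence
#   f_sim_init(ctx)                     -> initial decoder state from (a prefix of) the annotations
#   f_sim_next(w, ctx, mask, z, noise)  -> next-word distribution, new state, readout and attention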
121 | 122 | # check the ID: 123 | policy['base'] = _model 124 | _policy = Policy(trng, options, policy, config, 125 | n_out=options['dim'], 126 | recurrent=True, id=id) 127 | 128 | 129 | # DATASET 130 | trainIter = TextIterator(options['datasets'][0], options['datasets'][1], 131 | options['dictionaries'][0], options['dictionaries'][1], 132 | n_words_source=options['n_words_src'], n_words_target=options['n_words'], 133 | batch_size=config['batchsize'], 134 | maxlen=options['maxlen']) 135 | 136 | train_num = trainIter.num 137 | 138 | validIter = TextIterator(options['valid_datasets'][0], options['valid_datasets'][1], 139 | options['dictionaries'][0], options['dictionaries'][1], 140 | n_words_source=options['n_words_src'], n_words_target=options['n_words'], 141 | batch_size=1, 142 | maxlen=options['maxlen']) 143 | 144 | valid_num = validIter.num 145 | 146 | valid_ = options['valid_datasets'][0] 147 | valid_num = _check_length(valid_) 148 | print 'training set {} lines / validation set {} lines'.format(train_num, valid_num) 149 | print 'use the reward function {}'.format(chr(config['Rtype'] + 65)) 150 | 151 | # Translator model 152 | def _translate(src, trg, train=False, samples=80): 153 | ret = noisy_decoding( 154 | f_sim_ctx, f_sim_init, 155 | f_sim_next, f_cost, 156 | src, trg, word_idict_trg, n_samples=samples, 157 | train=train, 158 | _policy=_policy) 159 | 160 | if not train: 161 | sample, score, actions, R, tracks, attentions = ret 162 | return sample, score, actions, R, tracks 163 | else: 164 | sample, score, actions, R, info = ret 165 | return sample, score, actions, R, info 166 | 167 | 168 | # ======================================================================== # 169 | # Main Loop: Run 170 | # ======================================================================== # 171 | print 'Start Simultaneous Translator...' 172 | probar = Progbar(train_num / config['batchsize'], with_history=False) 173 | 174 | # freqs 175 | save_freq = 2000 176 | sample_freq = 10 177 | valid_freq = 1000 178 | valid_size = 200 179 | display_freq = 50 180 | 181 | history, last_it = _policy.load() 182 | time0 = timer() 183 | 184 | for it, (srcs, trgs) in enumerate(trainIter): # only one sentence each iteration 185 | if it < last_it: # go over the scanned lines. 186 | continue 187 | 188 | samples, scores, actions, rewards, info = _translate(srcs, trgs, train=True) 189 | if it % sample_freq == 0: 190 | 191 | print '\nModel has been trained for {} seconds'.format(timer() - time0) 192 | print 'source: ', _bpe2words(_seqs2words([srcs[0]], word_idict))[0] 193 | print 'target: ', _bpe2words(_seqs2words([trgs[0]], word_idict_trg))[0] 194 | 195 | # obtain the translation results 196 | samples = _bpe2words(_seqs2words(samples, word_idict_trg)) 197 | 198 | print '---' 199 | print 'sample: ', samples[40] 200 | print 'sample: ', samples[60] 201 | 202 | values = [(w, info[w]) for w in info] 203 | probar.update(it + 1, values=values) 204 | 205 | # NaN detector 206 | for w in info: 207 | if numpy.isnan(info[w]) or numpy.isinf(info[w]): 208 | raise RuntimeError, 'NaN/INF is detected!! 
{} : ID={}'.format(w, id) 209 | 210 | 211 | 212 | if __name__ == "__main__": 213 | parser = argparse.ArgumentParser() 214 | parser.add_argument('-m', '--model', 215 | default='model_wmt15_bpe2k_basic_cs-en.npz') 216 | parser.add_argument('--id', type=str, default=None) 217 | parser.add_argument('-o', type=str, default=None) 218 | 219 | args = parser.parse_args() 220 | print args 221 | 222 | policy = OrderedDict() 223 | policy['layernorm'] = True 224 | policy['upper'] = False 225 | policy['updater'] = 'REINFORCE' 226 | policy['type'] = 'gaussian' 227 | 228 | config = OrderedDict() 229 | config['batchsize'] = 1 230 | config['Rtype'] = 8 231 | 232 | run_simultrans(args.model, 233 | options_file=args.o, 234 | config=config, 235 | policy=policy, 236 | id=args.id, 237 | remote=False) 238 | 239 | 240 | -------------------------------------------------------------------------------- /optimizer.py: -------------------------------------------------------------------------------- 1 | import theano 2 | import theano.tensor as tensor 3 | import numpy 4 | 5 | from layers import * 6 | profile = False 7 | 8 | # optimizers 9 | # name(hyperp, tparams, grads, inputs (list), cost) = f_grad_shared, f_update 10 | 11 | """ 12 | First order optimizer 13 | """ 14 | def adam(lr, tparams, grads, inp, cost): 15 | gshared = [theano.shared(p.get_value() * 0., 16 | name='%s_grad' % k) 17 | for k, p in tparams.iteritems()] 18 | gsup = [(gs, g) for gs, g in zip(gshared, grads)] 19 | 20 | f_grad_shared = theano.function(inp, cost, updates=gsup, profile=profile, on_unused_input='ignore') 21 | 22 | lr0 = lr # 0.0002 23 | b1 = 0.1 24 | b2 = 0.001 25 | e = 1e-8 26 | 27 | updates = [] 28 | 29 | i = theano.shared(numpy.float32(0.)) 30 | i_t = i + 1. 31 | fix1 = 1. - b1**(i_t) 32 | fix2 = 1. - b2**(i_t) 33 | lr_t = lr0 * (tensor.sqrt(fix2) / fix1) 34 | 35 | for p, g in zip(tparams.values(), gshared): 36 | m = theano.shared(p.get_value() * 0.) 37 | v = theano.shared(p.get_value() * 0.) 38 | m_t = (b1 * g) + ((1. - b1) * m) 39 | v_t = (b2 * tensor.sqr(g)) + ((1. 
- b2) * v) 40 | g_t = m_t / (tensor.sqrt(v_t) + e) 41 | p_t = p - (lr_t * g_t) 42 | updates.append((m, m_t)) 43 | updates.append((v, v_t)) 44 | updates.append((p, p_t)) 45 | updates.append((i, i_t)) 46 | 47 | print 'build optimizer with Adam' 48 | f_update = theano.function([lr], [], updates=updates, 49 | on_unused_input='ignore', profile=profile) 50 | 51 | return f_grad_shared, f_update 52 | 53 | 54 | def adadelta(lr, tparams, grads, inp, cost): 55 | zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), 56 | name='%s_grad' % k) 57 | for k, p in tparams.iteritems()] 58 | running_up2 = [theano.shared(p.get_value() * numpy.float32(0.), 59 | name='%s_rup2' % k) 60 | for k, p in tparams.iteritems()] 61 | running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.), 62 | name='%s_rgrad2' % k) 63 | for k, p in tparams.iteritems()] 64 | 65 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 66 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 67 | for rg2, g in zip(running_grads2, grads)] 68 | 69 | f_grad_shared = theano.function(inp, cost, updates=zgup+rg2up, 70 | profile=profile) 71 | 72 | updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg 73 | for zg, ru2, rg2 in zip(zipped_grads, running_up2, 74 | running_grads2)] 75 | ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2)) 76 | for ru2, ud in zip(running_up2, updir)] 77 | param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)] 78 | 79 | f_update = theano.function([lr], [], updates=ru2up+param_up, 80 | on_unused_input='ignore', profile=profile) 81 | 82 | print 'build optimizer with Adadelta' 83 | return f_grad_shared, f_update 84 | 85 | 86 | def rmsprop(lr, tparams, grads, inp, cost): 87 | zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), 88 | name='%s_grad' % k) 89 | for k, p in tparams.iteritems()] 90 | running_grads = [theano.shared(p.get_value() * numpy.float32(0.), 91 | name='%s_rgrad' % k) 92 | for k, p in tparams.iteritems()] 93 | running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.), 94 | name='%s_rgrad2' % k) 95 | for k, p in tparams.iteritems()] 96 | 97 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 98 | rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)] 99 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 100 | for rg2, g in zip(running_grads2, grads)] 101 | 102 | f_grad_shared = theano.function(inp, cost, updates=zgup+rgup+rg2up, 103 | on_unused_input='ignore', profile=profile) 104 | 105 | updir = [theano.shared(p.get_value() * numpy.float32(0.), 106 | name='%s_updir' % k) 107 | for k, p in tparams.iteritems()] 108 | updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4)) 109 | for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads, 110 | running_grads2)] 111 | param_up = [(p, p + udn[1]) 112 | for p, udn in zip(itemlist(tparams), updir_new)] 113 | f_update = theano.function([lr], [], updates=updir_new+param_up, 114 | on_unused_input='ignore', profile=profile) 115 | 116 | print 'build optimizer with Rmsprop' 117 | return f_grad_shared, f_update 118 | 119 | 120 | def sgd(lr, tparams, grads, x, mask, y, cost): 121 | gshared = [theano.shared(p.get_value() * 0., 122 | name='%s_grad' % k) 123 | for k, p in tparams.iteritems()] 124 | gsup = [(gs, g) for gs, g in zip(gshared, grads)] 125 | 126 | f_grad_shared = theano.function([x, mask, y], cost, updates=gsup, 127 | profile=profile) 128 | 129 | pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)] 130 | f_update = theano.function([lr], [], updates=pup, 
profile=profile) 131 | 132 | print 'build optimizer with SGD' 133 | return f_grad_shared, f_update 134 | 135 | 136 | """ 137 | Beyond first-order optimizer 138 | """ 139 | def conjugate(lr, tparams, grads, inps, cost): 140 | """ 141 | Performs constrained optimization via line search. 142 | The search direction is computed using a conjugate gradient algorithm, 143 | which gives x = A^{-1}g, where A is a second order approximation of the constraint and g is the gradient 144 | of the loss function. 145 | """ 146 | pass 147 | -------------------------------------------------------------------------------- /policy.py: -------------------------------------------------------------------------------- 1 | """ 2 | -- Policy Network for decision making [more general] 3 | """ 4 | from nmt_uni import * 5 | from layers import _p 6 | 7 | import os 8 | import time, datetime 9 | import cPickle as pkl 10 | 11 | # hyper params 12 | TINY = 1e-7 13 | PI = numpy.pi 14 | E = numpy.e 15 | A = 0.2 16 | B = 1 17 | 18 | class Controller(object): 19 | 20 | def __init__(self, trng, 21 | options, 22 | policy, 23 | config, 24 | n_in=None, n_out=None, 25 | recurrent=False, id=None): 26 | 27 | self.trng = trng 28 | self.options = options 29 | self.policy = policy 30 | self.recurrent = recurrent 31 | self.type = self.policy.get('type', 'categorical') 32 | 33 | self.n_hidden = 512 34 | self.n_in = n_in 35 | self.n_out = n_out 36 | 37 | if self.policy.get('layernorm', True): 38 | self.rec = 'lngru' 39 | else: 40 | self.rec = 'gru' 41 | 42 | if not n_in: 43 | self.n_in = options['readout_dim'] 44 | 45 | if not n_out: 46 | if self.type == 'categorical': 47 | self.n_out = 2 # initially it is a WAIT/COMMIT action. 48 | elif self.type == 'gaussian': 49 | self.n_out = 100 50 | else: 51 | raise NotImplementedError 52 | 53 | # build the policy network 54 | print 'parameter initialization' 55 | 56 | params = OrderedDict() 57 | 58 | if not self.recurrent: 59 | print 'building a feedforward controller' 60 | params = get_layer('ff')[0](options, params, prefix='policy_net_in', 61 | nin=self.n_in, nout=self.n_hidden, scale=0.001) 62 | else: 63 | print 'building a recurrent controller' 64 | params = get_layer(self.rec)[0](options, params, prefix='policy_net_in', 65 | nin=self.n_in, dim=self.n_hidden, scale=0.001) 66 | 67 | params = get_layer('ff')[0](options, params, prefix='policy_net_out', 68 | nin=self.n_hidden, 69 | nout=self.n_out if self.type == 'categorical' else self.n_out * 2, 70 | scale=0.001) 71 | 72 | # bias the forget probability 73 | # if self.n_out == 3: 74 | # params[_p('policy_net_out', 'b')][-1] = -2 75 | 76 | 77 | # for the baseline network. 
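# (a small MLP that predicts the expected return from each observation; it is only
#  used to form the advantage reward - baseline in build_advantages below, which
#  reduces the variance of the policy gradient.)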
78 | params_b = OrderedDict() 79 | 80 | # using a scalar baseline [**] 81 | # params_b['b0'] = numpy.array(numpy.random.rand() * 0.0, dtype='float32') 82 | 83 | # using a MLP as a baseline 84 | params_b = get_layer('ff')[0](options, params_b, prefix='baseline_net_in', 85 | nin=self.n_in, nout=128, scale=0.001) 86 | params_b = get_layer('ff')[0](options, params_b, prefix='baseline_net_out', 87 | nin=128, nout=1, scale=0.001) 88 | 89 | if id is not None: 90 | print 'reload the saved model: {}'.format(id) 91 | params = load_params('.policy/{}-{}.current.npz'.format(id, self.policy['base']), params) 92 | params_b = load_params('.policy/{}-{}.current.npz'.format(id, self.policy['base']), params_b) 93 | else: 94 | id = datetime.datetime.fromtimestamp(time.time()).strftime('%y%m%d-%H%M%S') 95 | print 'start from a new model: {}'.format(id) 96 | 97 | with open('.config/conf.{}.txt'.format(id), 'w') as f: 98 | f.write('[config]\n') 99 | 100 | for c in config: 101 | f.write('{}: {}\n'.format(c, config[c])) 102 | f.write('\n') 103 | 104 | f.write('[policy]\n') 105 | 106 | for c in policy: 107 | f.write('{}: {}\n'.format(c, policy[c])) 108 | 109 | # pkl.dump([policy, config], open('.config/{}.conf'.format(id), 'w')) 110 | print 'save the config file' 111 | 112 | self.id = id 113 | self.model = '.policy/{}-{}'.format(id, self.policy['base']) 114 | 115 | # theano shared params 116 | tparams = init_tparams(params) 117 | tparams_b = init_tparams(params_b) 118 | 119 | if ('bn' in policy) and policy['bn']: 120 | # params for input-batch normalization 121 | self.gamma = theano.shared(numpy.asarray(numpy.random.uniform( 122 | low=-1.0 / numpy.sqrt(self.n_in), 123 | high=1.0 / numpy.sqrt(self.n_in), 124 | size=(self.n_in)), dtype=theano.config.floatX), name='policy_gamma', borrow=True) 125 | self.beta = theano.shared(numpy.zeros( 126 | (self.n_in), dtype=theano.config.floatX), name='policy_beta', borrow=True) 127 | 128 | self.mean = theano.shared(numpy.zeros((self.n_in), dtype=theano.config.floatX), name='mean', borrow=True) 129 | self.var = theano.shared(numpy.ones((self.n_in), dtype=theano.config.floatX), name='var', borrow=True) 130 | tparams['gamma'] = self.gamma 131 | tparams['beta'] = self.beta 132 | 133 | self.tparams = tparams 134 | self.tparams_b = tparams_b 135 | 136 | # build the policy network 137 | self.build_sampler(options=options) 138 | self.build_discriminator(options=options) 139 | 140 | 141 | def build_batchnorm(self, observation, mask=None): 142 | raise NotImplementedError 143 | 144 | 145 | def build_sampler(self, options): 146 | 147 | # ==================================================================================== # 148 | # Build Action function: samplers 149 | # ==================================================================================== # 150 | 151 | observation = tensor.matrix('observation', dtype='float32') # batch_size x readout_dim (seq_steps=1) 152 | prev_hidden = tensor.matrix('p_hidden', dtype='float32') 153 | 154 | if not self.recurrent: 155 | hiddens = get_layer('ff')[1](self.tparams, observation, 156 | options, prefix='policy_net_in', 157 | activ='tanh') 158 | else: 159 | hiddens = get_layer(self.rec)[1](self.tparams, observation, 160 | options, prefix='policy_net_in', mask=None, 161 | one_step=True, _init_state=prev_hidden)[0] 162 | 163 | act_inps = [observation, prev_hidden] 164 | if self.type == 'categorical': 165 | act_prob = get_layer('ff')[1](self.tparams, hiddens, options, 166 | prefix='policy_net_out', 167 | activ='softmax' 168 | ) # batch_size x n_out 169 
| 170 | # add action mask 171 | if self.policy.get('act_mask', False): 172 | act_mask = tensor.matrix('act_mask', dtype='float32') 173 | act_inps += [act_mask] 174 | act_prob *= act_mask 175 | act_prob /= (act_prob.sum(axis=-1, keepdims=True) + TINY) 176 | act_prob *= act_mask 177 | 178 | act_prob2 = tensor.clip(act_prob, TINY, 1 - TINY) 179 | 180 | # testing upper bound 181 | # if self.policy['upper']: 182 | # act_prob *= 0.0 183 | 184 | # compiling the sampling function for action 185 | # action = self.trng.binomial(size=act_prop.shape, p=act_prop) 186 | action = self.trng.multinomial(pvals=act_prob).argmax(1) # 0, 1, ... 187 | 188 | print 'build action sampling function [Discrete]' 189 | self.f_action = theano.function(act_inps, [action, act_prob, hiddens, act_prob2], 190 | on_unused_input='ignore') # action/dist/hiddens 191 | 192 | elif self.type == 'gaussian': 193 | _temp = get_layer('ff')[1](self.tparams, hiddens, options, 194 | prefix='policy_net_out', 195 | activ='linear' 196 | ) # batch_size x n_out 197 | mean, log_std = _temp[:, :self.n_out], _temp[:, self.n_out:] 198 | mean, log_std = -A * tanh(mean), -B-relu(log_std) 199 | 200 | action0 = self.trng.normal(size=mean.shape, dtype='float32') 201 | action = action0 * tensor.exp(log_std) + mean 202 | 203 | 204 | print 'build action sampling function [Gaussian]' 205 | self.f_action = theano.function(act_inps, [action, mean, log_std, hiddens], 206 | on_unused_input='ignore') # action/dist/hiddens 207 | else: 208 | raise NotImplementedError 209 | 210 | 211 | def build_discriminator(self, options): 212 | # ==================================================================================== # 213 | # Build Action Discriminator 214 | # ==================================================================================== # 215 | 216 | observations = tensor.tensor3('observations', dtype='float32') 217 | mask = tensor.matrix('mask', dtype='float32') 218 | if self.type == 'categorical': 219 | actions = tensor.matrix('actions', dtype='int64') 220 | elif self.type == 'gaussian': 221 | actions = tensor.tensor3('actions', dtype='float32') 222 | else: 223 | raise NotImplementedError 224 | 225 | 226 | if not self.recurrent: 227 | hiddens = get_layer('ff')[1](self.tparams, observations, 228 | options, prefix='policy_net_in', 229 | activ='tanh') 230 | else: 231 | hiddens = get_layer(self.rec)[1](self.tparams, observations, 232 | options, prefix='policy_net_in', mask=mask)[0] 233 | 234 | act_inputs = [observations, mask] 235 | if self.type == 'categorical': 236 | act_probs = get_layer('ff')[1](self.tparams, hiddens, options, prefix='policy_net_out', 237 | activ='softmax') # seq_steps x batch_size x n_out 238 | 239 | if 'act_mask' in self.policy and self.policy['act_mask']: 240 | act_masks = tensor.tensor3('act_masks', dtype='float32') 241 | act_inputs += [act_masks] 242 | act_probs *= act_masks 243 | act_probs /= (act_probs.sum(axis=-1, keepdims=True) + TINY) 244 | act_probs *= act_masks 245 | 246 | act_probs = tensor.clip(act_probs, TINY, 1 - TINY) 247 | 248 | print 'build action distribiution' 249 | self.f_probs = theano.function(act_inputs, act_probs, 250 | on_unused_input='ignore') # get the action probabilities 251 | elif self.type == 'gaussian': 252 | _temps = get_layer('ff')[1](self.tparams, hiddens, options, 253 | prefix='policy_net_out', 254 | activ='linear' 255 | ) # batch_size x n_out 256 | means, log_stds = _temps[:, :, :self.n_out], _temps[:, :, self.n_out:] 257 | means, log_stds = -A * tanh(means), -B-relu(log_stds) 258 | 259 | act_probs 
= [means, log_stds] 260 | 261 | print 'build Gaussian PDF' 262 | self.f_pdf = theano.function(act_inputs, [means, log_stds], 263 | on_unused_input='ignore') # get the action probabilities 264 | else: 265 | raise NotImplementedError 266 | 267 | 268 | # ==================================================================================== # 269 | # Build Baseline Network (Input-dependent Value Function) & Advantages 270 | # ==================================================================================== # 271 | 272 | print 'setup the advantages & baseline network' 273 | reward = tensor.matrix('reward') # seq_steps x batch_size :: rewards for each steps 274 | 275 | # baseline is estimated with a 2-layer neural network. 276 | hiddens_b = get_layer('ff')[1](self.tparams_b, observations, options, 277 | prefix='baseline_net_in', 278 | activ='tanh') 279 | baseline = get_layer('ff')[1](self.tparams_b, hiddens_b, options, 280 | prefix='baseline_net_out', 281 | activ='linear')[:, :, 0] # seq_steps x batch_size or batch_size 282 | advantages = self.build_advantages(act_inputs, reward, baseline, normalize=True) 283 | 284 | 285 | # ==================================================================================== # 286 | # Build Policy Gradient (here we provide two options) 287 | # ==================================================================================== # 288 | if self.policy['updater'] == 'REINFORCE': 289 | print 'build RENIFROCE.' 290 | self.build_reinforce(act_inputs, act_probs, actions, advantages) 291 | 292 | elif self.policy['updater'] == 'TRPO': 293 | print 'build TRPO' 294 | self.build_trpo(act_inputs, act_probs, actions, advantages) 295 | else: 296 | raise NotImplementedError 297 | 298 | # ==================================================================================== # 299 | # Controller Actions 300 | # ==================================================================================== # 301 | def random(self, states, p=0.5): 302 | live_k = states.shape[0] 303 | return (numpy.random.random(live_k) > p).astype('int64'), \ 304 | numpy.ones(live_k) * p 305 | 306 | def action(self, states, prevhidden, act_mask=None): 307 | if act_mask is None: 308 | return self.f_action(states, prevhidden) 309 | else: 310 | return self.f_action(states, prevhidden, act_mask) 311 | 312 | 313 | def init_hidden(self, n_samples=1): 314 | return numpy.zeros((n_samples, self.n_hidden), dtype='float32') 315 | 316 | def init_action(self, n_samples=1): 317 | states0 = numpy.zeros((n_samples, self.n_in), dtype='float32') 318 | return self.f_action(states0, self.init_hidden(n_samples)) 319 | 320 | 321 | def get_learner(self): 322 | if self.policy['updater'] == 'REINFORCE': 323 | return self.run_reinforce 324 | elif self.policy['updater'] == 'TRPO': 325 | return self.run_trpo 326 | else: 327 | raise NotImplementedError 328 | 329 | @staticmethod 330 | def kl(prob0, prob1): 331 | p1 = (prob0 + TINY) / (prob1 + TINY) 332 | # p2 = (1 - prob0 + TINY) / (1 - prob1 + TINY) 333 | return tensor.sum(prob0 * tensor.log(p1), axis=-1) 334 | 335 | 336 | @staticmethod 337 | def _grab_prob(probs, X): 338 | assert probs.ndim == 3 339 | 340 | batch_size = probs.shape[1] 341 | max_len = probs.shape[0] 342 | vocab_size = probs.shape[2] 343 | 344 | probs = probs.reshape((batch_size * max_len, vocab_size)) 345 | return probs[tensor.arange(batch_size * max_len), X.flatten(1)].reshape(X.shape) # advanced indexing 346 | 347 | def cross(self, probs, actions): 348 | # return tensor.log(probs) * actions + tensor.log(1 - probs) * (1 
- actions) 349 | return self._grab_prob(tensor.log(probs), actions) 350 | 351 | def build_advantages(self, act_inputs, reward, baseline, normalize=True): 352 | # TODO: maybe we need a discount factor gamma for advantages. 353 | # TODO: we can also rewrite advantages with value functions (GAE) 354 | 355 | # Advantages and Normalization the return 356 | reward_adv = reward - baseline 357 | mask = act_inputs[1] 358 | 359 | if normalize: 360 | reward_mean = tensor.sum(mask * reward_adv) / (tensor.sum(mask) + TINY) 361 | reward_mean2 = tensor.sum(mask * (reward_adv ** 2)) / (tensor.sum(mask) + TINY) 362 | reward_std = tensor.sqrt(tensor.maximum(reward_mean2 - reward_mean ** 2, TINY)) 363 | # reward_std = tensor.maximum(reward_std, 1) 364 | reward_c = reward_adv - reward_mean # independent mean 365 | advantages = reward_c / (reward_std + TINY) 366 | else: 367 | advantages = reward_adv 368 | 369 | print 'build advantages and baseline gradient' 370 | L = tensor.sum(mask * (reward_adv ** 2)) / (tensor.sum(mask) + TINY) 371 | dL = tensor.grad(L, wrt=itemlist(self.tparams_b)) 372 | lr = tensor.scalar(name='lr') 373 | 374 | inps_b = act_inputs + [reward] 375 | oups_b = [L, advantages] 376 | f_adv, f_update_b = adam(lr, self.tparams_b, dL, inps_b, oups_b) 377 | # f_adv, f_update_b = rmsprop(lr, self.tparams_b, dL, inps_b, oups_b) 378 | 379 | self.f_adv = f_adv 380 | self.f_update_b = f_update_b 381 | 382 | return advantages 383 | 384 | 385 | # =================================================================== 386 | # Policy Grident: REINFORCE with Adam 387 | # =================================================================== 388 | def build_reinforce(self, act_inputs, act_probs, actions, advantages): 389 | 390 | mask = act_inputs[1] 391 | 392 | if self.type == 'categorical': 393 | if self.policy.get('act_mask', False): 394 | act_masks = act_inputs[2] 395 | negEntropy = tensor.sum(tensor.log(act_probs) * (act_probs * act_masks), axis=-1) 396 | else: 397 | negEntropy = tensor.sum(tensor.log(act_probs) * act_probs, axis=-1) 398 | 399 | logLikelihood = self.cross(act_probs, actions) 400 | 401 | elif self.type == 'gaussian': 402 | means, log_stds = act_probs 403 | negEntropy = -tensor.sum(log_stds + tensor.log(tensor.sqrt(2 * PI * E)), axis=-1) 404 | 405 | actions0 = (actions - means) / tensor.exp(log_stds) 406 | logLikelihood = -tensor.sum(log_stds, axis=-1) - \ 407 | 0.5 * tensor.sum(tensor.sqr(actions0), axis=-1) - \ 408 | 0.5 * means.shape[-1] * tensor.log(2 * PI) 409 | 410 | else: 411 | raise NotImplementedError 412 | 413 | # tensor.log(act_probs) * actions + tensor.log(1 - act_probs) * (1 - actions) 414 | 415 | H = tensor.sum(mask * negEntropy, axis=0).mean() * 0.01 # entropy penalty 416 | J = tensor.sum(mask * -logLikelihood * advantages, axis=0).mean() + H 417 | dJ = tensor.grad(J, wrt=itemlist(self.tparams)) 418 | 419 | # clip the policy gradient to 1 (to avoid gradient exploding) 420 | clip_c = 1. 421 | if clip_c > 0.: 422 | g2 = 0. 
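# global-norm clipping: g2 accumulates the squared L2 norm over all gradient
# tensors; if sqrt(g2) exceeds clip_c, every gradient is rescaled by
# clip_c / sqrt(g2), preserving the update direction while bounding its norm.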
423 | for g in dJ: 424 | g2 += (g ** 2).sum() 425 | new_grads = [] 426 | for g in dJ: 427 | new_grads.append(tensor.switch(g2 > (clip_c ** 2), g / tensor.sqrt(g2) * clip_c, g)) 428 | dJ = new_grads 429 | 430 | print 'build REINFORCE optimizer' 431 | lr = tensor.scalar(name='lr') 432 | 433 | inps = act_inputs + [actions, advantages] 434 | outps = [J, H] 435 | if self.type == 'gaussian': 436 | outps += [actions0.mean(), actions.mean()] 437 | 438 | f_cost, f_update = adam(lr, self.tparams, dJ, inps, outps) 439 | # f_cost, f_update = rmsprop(lr, self.tparams, dJ, inps, outps) 440 | 441 | self.f_cost = f_cost 442 | self.f_update = f_update 443 | print 'done' 444 | 445 | 446 | def run_reinforce(self, act_inputs, actions, reward, update=True, lr=0.0001): 447 | 448 | # sub baseline 449 | inps_adv = act_inputs + [reward] 450 | L, advantages = self.f_adv(*inps_adv) 451 | 452 | inps_reinfoce = act_inputs + [actions, advantages] 453 | if self.type == 'gaussian': 454 | J, H, m, s = self.f_cost(*inps_reinfoce) 455 | info = {'J': J, 'G_norm': H, 'B_loss': L, 'Adv': advantages.mean(), 'm': m, 's': s} 456 | else: 457 | J, H = self.f_cost(*inps_reinfoce) 458 | info = {'J': J, 'G_norm': H, 'B_loss': L, 'Adv': advantages.mean()} 459 | 460 | 461 | if update: # update the parameters 462 | self.f_update_b(lr) 463 | self.f_update(lr) 464 | 465 | return info 466 | 467 | 468 | # ==================================================================================== # 469 | # Trust Region Policy Optimization 470 | # ==================================================================================== # 471 | def build_trpo(self, act_inputs, act_probs, actions, advantages): 472 | 473 | assert self.type == 'categorical', 'in this stage not support TRPO' 474 | 475 | # probability distribution 476 | mask = act_inputs[1] 477 | probs = act_probs 478 | probs_old = tensor.matrix(dtype='float32') 479 | 480 | logp = self.cross(probs, actions) 481 | logp_old = self.cross(probs_old, actions) 482 | 483 | # policy gradient 484 | J = tensor.sum(mask * -tensor.exp(logp - logp_old) * advantages, axis=0).mean() 485 | dJ = flatgrad(J, self.tparams) 486 | probs_fix = theano.gradient.disconnected_grad(probs) 487 | 488 | kl_fix = tensor.sum(mask * self.kl(probs_fix, probs), axis=0).mean() 489 | kl_grads = tensor.grad(kl_fix, wrt=itemlist(self.tparams)) 490 | ftangents = tensor.fvector(name='flat_tan') 491 | shapes = [self.tparams[var].get_value(borrow=True).shape for var in self.tparams] 492 | start = 0 493 | tangents = [] 494 | for shape in shapes: 495 | size = numpy.prod(shape) 496 | tangents.append(tensor.reshape(ftangents[start:start + size], shape)) 497 | start += size 498 | gvp = tensor.add(*[tensor.sum(g * t) for (g, t) in zipsame(kl_grads, tangents)]) 499 | 500 | # Fisher-vectror product 501 | fvp = flatgrad(gvp, self.tparams) 502 | entropy = tensor.sum(mask * -self.cross(probs, probs), axis=0).mean() 503 | kl = tensor.sum(mask * self.kl(probs_old, probs), axis=0).mean() 504 | 505 | print 'compile the functions' 506 | inps = act_inputs + [actions, advantages, probs_old] 507 | loss = [J, kl, entropy] 508 | self.f_pg = theano.function(inps, dJ) 509 | self.f_loss = theano.function(inps, loss) 510 | self.f_fisher = theano.function([ftangents] + inps, fvp, on_unused_input='ignore') 511 | 512 | # get/set flatten params 513 | print 'compling flat updater' 514 | self.get_flat = theano.function([], tensor.concatenate([self.tparams[v].flatten() for v in self.tparams])) 515 | theta = tensor.vector() 516 | start = 0 517 | updates = [] 518 | for 
v in self.tparams: 519 | p = self.tparams[v] 520 | shape = p.shape 521 | size = tensor.prod(shape) 522 | updates.append((p, theta[start:start + size].reshape(shape))) 523 | start += size 524 | self.set_flat = theano.function([theta], [], updates=updates) 525 | 526 | 527 | def run_trpo(self, act_inputs, actions, reward, 528 | update=True, cg_damping=1e-3, max_kl=1e-2, lr=0.0002): 529 | 530 | # sub baseline 531 | inps_adv = act_inputs + [reward] 532 | L, advantages = self.f_adv(*inps_adv) 533 | self.f_update_b(lr) 534 | 535 | # get current action distributions 536 | probs = self.f_probs(*act_inputs) 537 | inps = act_inputs + [actions, advantages, probs] 538 | thprev = self.get_flat() 539 | 540 | def fisher_vector_product(p): 541 | return self.f_fisher(p, *inps) + cg_damping * p 542 | 543 | g = self.f_pg(*inps) 544 | losses_before = self.f_loss(*inps) 545 | 546 | if numpy.allclose(g, 0): 547 | print 'zero gradient, not updating' 548 | else: 549 | stepdir = self.cg(fisher_vector_product, -g) 550 | shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) 551 | lm = numpy.sqrt(shs / max_kl) 552 | 553 | print "\nlagrange multiplier:", lm, "gnorm:", numpy.linalg.norm(g) 554 | fullstep = stepdir / lm 555 | neggdotstepdir = -g.dot(stepdir) 556 | 557 | def loss(th): 558 | self.set_flat(th) 559 | return self.f_loss(*inps)[0] 560 | 561 | print 'do line search' 562 | success, theta = self.linesearch(loss, thprev, fullstep, neggdotstepdir / lm) 563 | 564 | print "success", success 565 | self.set_flat(theta) 566 | 567 | losses_after = self.f_loss(*inps) 568 | 569 | info = OrderedDict() 570 | for (lname, lbefore, lafter) in zipsame(['J', 'KL', 'entropy'], losses_before, losses_after): 571 | info[lname + "_before"] = lbefore 572 | info[lname + "_after"] = lafter 573 | 574 | # add the baseline loss into full information 575 | info['B_loss'] = L 576 | return info 577 | 578 | 579 | @staticmethod 580 | def linesearch(f, x, fullstep, expected_improve_rate, max_backtracks=10, accept_ratio=.1): 581 | """ 582 | Backtracking linesearch, where expected_improve_rate is the slope dy/dx at the initial point 583 | """ 584 | fval = f(x) 585 | print "fval before", fval 586 | for (_n_backtracks, stepfrac) in enumerate(.5 ** numpy.arange(max_backtracks)): 587 | xnew = x + stepfrac * fullstep 588 | newfval = f(xnew) 589 | actual_improve = fval - newfval 590 | expected_improve = expected_improve_rate * stepfrac 591 | ratio = actual_improve / expected_improve 592 | print "a/e/r", actual_improve, expected_improve, ratio 593 | if ratio > accept_ratio and actual_improve > 0: 594 | print "fval after", newfval 595 | return True, xnew 596 | return False, x 597 | 598 | @staticmethod 599 | def cg(f_Ax, b, cg_iters=10, callback=None, verbose=False, residual_tol=1e-10): 600 | """ 601 | Conjuctate Gradient 602 | """ 603 | p = b.copy() 604 | r = b.copy() 605 | x = numpy.zeros_like(b) 606 | rdotr = r.dot(r) 607 | 608 | fmtstr = "%10i %10.3g %10.3g" 609 | titlestr = "%10s %10s %10s" 610 | if verbose: print titlestr % ("iter", "residual norm", "soln norm") 611 | 612 | for i in xrange(cg_iters): 613 | if callback is not None: 614 | callback(x) 615 | if verbose: print fmtstr % (i, rdotr, numpy.linalg.norm(x)) 616 | z = f_Ax(p) 617 | v = rdotr / p.dot(z) 618 | x += v * p 619 | r -= v * z 620 | newrdotr = r.dot(r) 621 | mu = newrdotr / rdotr 622 | p = r + mu * p 623 | 624 | rdotr = newrdotr 625 | if rdotr < residual_tol: 626 | break 627 | 628 | if callback is not None: 629 | callback(x) 630 | if verbose: print fmtstr % (i + 1, rdotr, 
numpy.linalg.norm(x)) 631 | return x 632 | 633 | 634 | # ====================================================================== # 635 | # Save & Load 636 | # ====================================================================== # 637 | 638 | def save(self, history, it): 639 | _params = OrderedDict() 640 | _params = unzip(self.tparams, _params) 641 | _params = unzip(self.tparams_b, _params) 642 | 643 | print 'save the policy network >> {}'.format(self.model) 644 | numpy.savez('%s.current' % (self.model), 645 | history=history, 646 | it=it, 647 | **_params) 648 | 649 | def load(self): 650 | if os.path.exists(self.model): 651 | print 'loading from the existing model (current)' 652 | 653 | rmodel = numpy.load(self.model) 654 | history = rmodel['history'] 655 | it = rmodel['it'] 656 | 657 | self.params = load_params(rmodel, self.params) 658 | self.params_b = load_params(rmodel, self.params_b) 659 | self.tparams = init_tparams(self.params) 660 | self.tparams_b = init_tparams(self.params_b) 661 | 662 | print 'the dataset need to go over {} lines'.format(it) 663 | return history, it 664 | else: 665 | return [], -1 666 | 667 | 668 | 669 | 670 | -------------------------------------------------------------------------------- /pretrain_uni.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import os 3 | 4 | from nmt_uni import train 5 | 6 | def main(job_id, params): 7 | print params 8 | validerr = train(saveto=params['model'][0], 9 | reload_=params['reload'][0], 10 | dim_word=params['dim_word'][0], 11 | dim=params['dim'][0], 12 | n_words=params['n-words'][0], 13 | n_words_src=params['n-words'][0], 14 | decay_c=params['decay-c'][0], 15 | clip_c=params['clip-c'][0], 16 | lrate=params['learning-rate'][0], 17 | optimizer=params['optimizer'][0], 18 | patience=1000, 19 | maxlen=50, 20 | batch_size=64, 21 | valid_batch_size=64, 22 | validFreq=1000, 23 | dispFreq=50, 24 | saveFreq=1000, 25 | sampleFreq=99, 26 | datasets=['/misc/kcgscratch1/ChoGroup/junyoung_exp/wmt15/ruen/train/all_ru-en.en.tok.bpe', 27 | '/misc/kcgscratch1/ChoGroup/junyoung_exp/wmt15/ruen/train/all_ru-en.ru.tok.bpe'], 28 | valid_datasets=['/misc/kcgscratch1/ChoGroup/junyoung_exp/wmt15/ruen/dev/newstest2013-src.en.tok.bpe', 29 | '/misc/kcgscratch1/ChoGroup/junyoung_exp/wmt15/ruen/dev/newstest2013-ref.ru.tok.bpe'], 30 | dictionaries=['/misc/kcgscratch1/ChoGroup/junyoung_exp/wmt15/ruen/train/all_ru-en.en.tok.bpe.word.pkl', 31 | '/misc/kcgscratch1/ChoGroup/junyoung_exp/wmt15/ruen/train/all_ru-en.ru.tok.bpe.word.pkl'], 32 | use_dropout=params['use-dropout'][0]) 33 | return validerr 34 | 35 | if __name__ == '__main__': 36 | main(0, { 37 | 'model': ['models/model_wmt15_bpe2k_uni_en-ru.npz'], 38 | 'dim_word': [512], 39 | 'dim': [1028], 40 | 'n-words': [20000], 41 | 'optimizer': ['adadelta'], 42 | 'decay-c': [0.], 43 | 'clip-c': [1.], 44 | 'use-dropout': [False], 45 | 'learning-rate': [0.0001], 46 | 'reload': [False]}) 47 | 48 | 49 | -------------------------------------------------------------------------------- /reward.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Collection of reward functions for Simultaneous Machine Translation 5 | """ 6 | import numpy 7 | from bleu import * 8 | 9 | 10 | # computing the discounting matrix 11 | gamma = 0.9 12 | maxlen = 100 13 | 14 | 15 | def compute_discount(gamma, maxlen): 16 | c = numpy.ones((maxlen,)) * gamma 17 | c[0] = 1. 
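# after the cumulative product c = [1, gamma, gamma**2, ...]; the upper-triangular
# matrix then satisfies C[t, s] = gamma**(s - t) for s >= t, so that
# GAMMA[:L, :L].dot(q) maps instant rewards q to discounted returns
# R[t] = sum_{s >= t} gamma**(s - t) * q[s].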
18 | c = c.cumprod() 19 | 20 | C = numpy.triu(numpy.repeat(c[None, :], repeats=maxlen, axis=0)) 21 | C /= c[:, None] 22 | return C 23 | 24 | 25 | GAMMA = compute_discount(gamma, maxlen) # precomputed 26 | 27 | def translation_cost(**_k): 28 | 29 | def BLEU(): 30 | q = numpy.zeros((_k['steps'],)) 31 | s = _k['sample'] 32 | r = _k['reference'] 33 | chencherry = SmoothingFunction() 34 | b = sentence_bleu(r, s, smoothing_function=chencherry.method5) 35 | q[-1] = b[1] 36 | return q, b 37 | 38 | 39 | return BLEU() 40 | 41 | 42 | 43 | 44 | # The general function for rewards (for simultrans): 45 | def return_reward(**_k): 46 | 47 | # ----------------------------------------------------------------- # 48 | # reward for quality 49 | # use negative-loglikelihood as the reward (full sentence) 50 | # we can also use BLEU for quality, but let's try the simplest one' 51 | # 52 | @staticmethod 53 | def _bpe2words(capsw): 54 | capw = [] 55 | for cc in capsw: 56 | capw += [cc.replace('@@ ', '')] 57 | return capw 58 | 59 | 60 | def LogLikelihood(): 61 | q = numpy.zeros((_k['steps'],)) 62 | q[-1] = _k['f_cost']( 63 | _k['ctx0'], _k['x_mask'], _k['y'], _k['y_mask'] 64 | ) 65 | return q 66 | 67 | def StepLogLikelihood(): 68 | pass 69 | 70 | 71 | def NormLogLikelihood(): 72 | q = LogLikelihood() 73 | length = _k['y'].shape[0] 74 | return q / float(length) 75 | 76 | def BLEU(): 77 | q = numpy.zeros((_k['steps'],)) 78 | s = _k['sample'] 79 | r = _k['reference'] 80 | chencherry = SmoothingFunction() 81 | q[-1] = sentence_bleu(r, s, smoothing_function=chencherry.method5) 82 | return q 83 | 84 | def LatencyBLEUwithForget(beta=None, discount=1., return_quality=False): 85 | 86 | # init 87 | words = _k['words'].split() # end-of-sentence is treated as a word 88 | ref = _k['reference'] 89 | 90 | q0 = numpy.zeros((_k['steps'],)) 91 | 92 | # check 0, 1 93 | maps = [(it, a) for it, a in enumerate(_k['act']) if a < 2] 94 | kmap = len(maps) 95 | lb = numpy.zeros((kmap,)) 96 | ts = numpy.zeros((kmap,)) 97 | q = numpy.zeros((kmap,)) 98 | 99 | if not beta: 100 | beta = kmap 101 | 102 | beta = 1. 
/ float(beta) 103 | 104 | chencherry = SmoothingFunction() 105 | 106 | # compute BLEU for each Yt 107 | Y = [] 108 | bleus = [] 109 | truebleus = [] 110 | for t in xrange(len(words)): 111 | if len(Y) > 0: 112 | _temp = Y[-1] + ' ' + words[t] 113 | _temp = _temp.replace('@@ ', '') 114 | Y = Y[:-1] + _temp.split() 115 | else: 116 | Y = [words[t]] 117 | 118 | bb = sentence_bleu(ref, Y, smoothing_function=chencherry.method5) 119 | 120 | bleus.append(bb[0]) 121 | truebleus.append(bb[1]) 122 | 123 | bleus.reverse() 124 | truebleus.reverse() 125 | 126 | # compute the Latency-Bleu 127 | T = 0 128 | Prev = 0 129 | for i, (it, a) in enumerate(maps): 130 | # print 'Prev', Prev 131 | if a == 0: # WAIT 132 | T += 1 133 | if i == 0: 134 | lb[i] = 0 135 | else: 136 | lb[i] = lb[i - 1] + Prev 137 | elif a == 1: 138 | if i < kmap - 1: 139 | lb[i] = lb[i - 1] - Prev 140 | 141 | Prev = bleus.pop() 142 | lb[i] += Prev 143 | else: 144 | lb[i] = lb[i - 2] 145 | else: 146 | lb[i] = 0 147 | 148 | ts[i] = T 149 | 150 | # average the score 151 | # print 'Unnormalized BLEU', lb 152 | lbn = lb / ts 153 | 154 | # print 'Latency BLEU', lbn 155 | q[1:] = lbn[1:] - lbn[:-1] 156 | # print 'instant reward', q 157 | 158 | # add the whole sentence balance on it 159 | q[-1] = Prev # the last BLEU 160 | # print 'instant reward', q 161 | 162 | for i, (it, a) in enumerate(maps): 163 | q0[it] = q[i] 164 | 165 | return q0 166 | 167 | 168 | def LatencyBLEUex(beta=None, discount=1., return_quality=False): 169 | 170 | # init 171 | words = _k['words'].split() # end-of-sentence is treated as a word 172 | ref = _k['reference'] 173 | 174 | q = numpy.zeros((_k['steps'],)) 175 | lb = numpy.zeros((_k['steps'],)) 176 | ts = numpy.zeros((_k['steps'],)) 177 | 178 | if not beta: 179 | beta = _k['steps'] 180 | 181 | beta = 1. 
/ float(beta) 182 | 183 | chencherry = SmoothingFunction() 184 | 185 | # compute BLEU for each Yt 186 | Y = [] 187 | bleus = [] 188 | truebleus = [] 189 | for t in xrange(len(words)): 190 | if len(Y) > 0: 191 | _temp = Y[-1] + ' ' + words[t] 192 | _temp = _temp.replace('@@ ', '') 193 | Y = Y[:-1] + _temp.split() 194 | else: 195 | Y = [words[t]] 196 | 197 | bb = sentence_bleu(ref, Y, smoothing_function=chencherry.method5) 198 | 199 | bleus.append(bb[0]) 200 | truebleus.append(bb[1]) 201 | 202 | bleus.reverse() 203 | truebleus.reverse() 204 | # print bleus 205 | 206 | # compute the Latency-Bleu 207 | T = 0 208 | Prev = 0 209 | for i, a in enumerate(_k['act']): 210 | # print 'Prev', Prev 211 | if a == 0: # WAIT 212 | T += 1 213 | if i == 0: 214 | lb[i] = 0 215 | else: 216 | lb[i] = lb[i - 1] + Prev 217 | elif a == 1: 218 | if i < len(_k['act']) - 1: 219 | lb[i] = lb[i - 1] - Prev 220 | 221 | Prev = bleus.pop() 222 | lb[i] += Prev 223 | else: 224 | lb[i] = lb[i - 2] 225 | else: 226 | lb[i] = 0 227 | 228 | ts[i] = T 229 | 230 | # average the score 231 | # print 'Unnormalized BLEU', lb 232 | lbn = lb / ts 233 | 234 | # print 'Latency BLEU', lbn 235 | q[1:] = lbn[1:] - lbn[:-1] 236 | # print 'instant reward', q 237 | 238 | # add the whole sentence balance on it 239 | q[-1] = Prev # the last BLEU 240 | # print 'instant reward', q 241 | 242 | if return_quality: # instant reward sequence (Latency BLEU) 243 | return q 244 | 245 | 246 | # cumulitive futurereward (with discounting factor) 247 | 248 | if discount == 1: 249 | R = q[::-1].cumsum()[::-1] 250 | # print 'future reward', R 251 | 252 | else: 253 | L = _k['steps'] 254 | FLAG = False 255 | 256 | if not(gamma == discount): 257 | FLAG = True 258 | gamma = discount 259 | 260 | if L > maxlen: 261 | FLAG = True 262 | maxlen = L 263 | 264 | if FLAG: 265 | GAMMA = compute_discount(gamma, maxlen) 266 | FLAG = False 267 | 268 | R = numpy.dot(GAMMA[:L, :L], q[:, None]).flatten() 269 | # import sys 270 | # sys.exit(123) 271 | # print q # collect all instant reward 272 | 273 | d = NormalizedDelay() 274 | return R, q[-1], d[-1], lbn[-1] + q[-1] 275 | 276 | 277 | # ----------------------------------------------------------------- # 278 | # reward for delay 279 | # several options: 280 | # 1. the total delay, which is computed at the last step 281 | def NormalizedDelay(): 282 | d = numpy.zeros((_k['steps'],)) 283 | # print a 284 | _src = 0 285 | _trg = 0 286 | _sum = 0 287 | for it, a in enumerate(_k['act']): 288 | if a == 0: 289 | _src += 1 290 | elif a == 1: 291 | _trg += 1 292 | _sum += _src 293 | d[-1] = _sum / (_src * _trg + 1e-6) 294 | return d 295 | 296 | # do not use this 297 | def NormalizedDelaywithPenalty(): 298 | d = numpy.zeros((_k['steps'],)) 299 | a = numpy.array(_k['act'], dtype='float32') 300 | # print a 301 | d[-1] = numpy.sum(numpy.cumsum(1 - a) * a) / (_k['src_max'] * numpy.sum(a)) * numpy.exp(-3. / _k['src_max']) 302 | return d 303 | 304 | def ConsectiveWaiting(): 305 | d = numpy.zeros((_k['steps'],)) 306 | a = numpy.array(_k['act'], dtype='float32') 307 | 308 | 309 | def StepDeley(): 310 | d = numpy.array(_k['act'], dtype='float32') - 1. 311 | return d 312 | 313 | 314 | def SilceDelay(win=5): 315 | d0 = numpy.array(_k['act'], dtype='float32') - 1. 
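# d0 turns WAIT steps (action 0) into -1 and COMMIT steps (action 1) into 0;
# the shifted copies built below are meant to smear each waiting penalty over a
# window of `win` consecutive steps before averaging.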
316 | 317 | def slice(m): 318 | d = d0 319 | d[m:] = d0[:-m] 320 | return d 321 | 322 | dd = numpy.mean([d0] + [slice(w) for w in range(1, win)]) 323 | return dd 324 | 325 | # -reward of delay 326 | def MovingDelay(beta=0.1): 327 | d = numpy.zeros((_k['steps'],)) 328 | _max = 0 329 | _cur = 0 330 | 331 | for it, a in enumerate(_k['act']): 332 | if a == 0: 333 | _cur += 1 334 | if _cur > _max: 335 | _max += 1 336 | d[it] = -1 337 | else: 338 | _cur = 0 339 | 340 | return d * beta 341 | 342 | 343 | def MaximumDelay(_max=5, beta=0.1): 344 | d = numpy.zeros((_k['steps'],)) 345 | _cur = 0 346 | for it, a in enumerate(_k['act']): 347 | if a == 0: 348 | _cur += 1 349 | if _cur > _max: 350 | d[it] = -1 351 | pass 352 | elif a == 1: # only for new commit 353 | _cur = 0 354 | 355 | return d * beta 356 | 357 | # ----------------------------------------------------------------- # 358 | def MaximumSource(_max=7, beta=0.1): 359 | s = numpy.zeros((_k['steps'], )) 360 | _cur = 0 361 | _end = 0 362 | for it, a in enumerate(_k['act']): 363 | if a == 0: 364 | _cur += 1 365 | elif a == 2: 366 | _end += 1 367 | 368 | if (_cur - _end) > _max: 369 | s[it] = -1 370 | return s * beta 371 | 372 | def MovingSource(beta=0.1): 373 | s = numpy.zeros((_k['steps'],)) 374 | _max = 0 375 | _cur = 0 376 | _end = 0 377 | 378 | for it, a in enumerate(_k['act']): 379 | if a == 0: 380 | _cur += 1 381 | elif a == 2: 382 | _end += 1 383 | 384 | temp = _cur - _end 385 | if temp > _max: 386 | s[it] = -1 387 | _max = temp 388 | 389 | return s * beta 390 | 391 | def AwardForget(_max=5, beta=0.1): 392 | s = numpy.zeros((_k['steps'],)) 393 | _cur = 0 394 | _end = 0 395 | for it, a in enumerate(_k['act']): 396 | if a == 0: 397 | _cur += 1 398 | elif a == 2: 399 | _end += 1 400 | 401 | if ((_cur - _end) >= _max) and (a == 2): 402 | s[it] = 1 403 | return s * beta, _cur / float(_k['src_max']) 404 | 405 | def AwardForget2(_max=5, beta=0.001): 406 | s = numpy.zeros((_k['steps'],)) 407 | _cur = 0 408 | _end = 0 409 | for it, a in enumerate(_k['act']): 410 | if a == 0: 411 | _cur += 1 412 | elif a == 2: 413 | _end += 1 414 | 415 | if a == 2: 416 | s[it] = (_cur - _end - _max) * 2 417 | return s * beta 418 | 419 | 420 | 421 | # ----------------------------------------------------------------- # 422 | # reward for quality + delay 423 | def Q2D1(alpha=0.5): 424 | # q = LogLikelihood() 425 | q = NormLogLikelihood() 426 | d = NormalizedDelay() 427 | 428 | r = (q ** alpha) * ((1 - d) ** (1 - alpha)) 429 | R = r[::-1].cumsum()[::-1] 430 | return R, q[-1], d[-1], r[-1] 431 | 432 | def Q2D2(alpha=0.5): 433 | # q = LogLikelihood() 434 | q = BLEU() 435 | d = NormalizedDelaywithPenalty() 436 | 437 | r = (q * alpha) + ((1 - d) * (1 - alpha)) 438 | R = r[::-1].cumsum()[::-1] 439 | return R, q[-1], d[-1], r[-1] 440 | 441 | def Q2D3(alpha=0.5): 442 | # q = LogLikelihood() 443 | q = BLEU() 444 | d = NormalizedDelay() 445 | 446 | r = q # (q * alpha) + ((1 - d) * (1 - alpha)) 447 | R = r[::-1].cumsum()[::-1] 448 | return R, q[-1], d[-1], r[-1] 449 | 450 | def Q2D4(alpha=0.5): 451 | # q = LogLikelihood() 452 | q = BLEU() 453 | d = NormalizedDelay() 454 | d0 = d[-1] 455 | d[-1] = numpy.exp(-max(d0 - 0.7, 0)) 456 | r = q * d # (q * alpha) + ((1 - d) * (1 - alpha)) 457 | R = r[::-1].cumsum()[::-1] 458 | return R, q[-1], d0, r[-1] 459 | 460 | 461 | # ---------------------------------------------------------------- # 462 | # user defined target delay \tau* 463 | def QualityDelay(tau = 0.5, gamma=3): 464 | q = LatencyBLEUex(return_quality=True) 465 | d = 
NormalizedDelay() 466 | 467 | # just bleu 468 | bleu = q[-1] 469 | 470 | # just delay 471 | delay = d[-1] 472 | 473 | r = q - gamma * numpy.maximum(d - tau, 0) ** 2 # instant reward 474 | R = r[::-1].cumsum()[::-1] 475 | return R, bleu, delay, r 476 | 477 | def FullQualityDelay(tau = 0.5, gamma=10): 478 | q = LatencyBLEUex(return_quality=True) 479 | d = NormalizedDelay() 480 | d1 = SilceDelay() 481 | 482 | # just bleu 483 | bleu = q[-1] 484 | 485 | # just delay 486 | delay = d[-1] 487 | 488 | r = q + d1 - gamma * numpy.maximum(d - tau, 0) ** 2 # instant reward 489 | R = r[::-1].cumsum()[::-1] 490 | return R, bleu, delay, r 491 | 492 | # UPDATE: July 11, 2016: we have several varisions:: 493 | def ReturnA(): 494 | # params 495 | gamma = _k['gamma'] 496 | beta = 0.1 497 | 498 | q0 = LatencyBLEUex(return_quality=True) 499 | d0 = NormalizedDelay() 500 | 501 | # just bleu 502 | bleu = q0[-1] 503 | 504 | # just delay 505 | delay = d0[-1] 506 | 507 | # use moving-delay + latency bleu (without final BLEU) 508 | q = q0 509 | q[-1] = 0. 510 | d = MovingDelay(beta=beta) 511 | 512 | r = q + gamma * d 513 | R = r[::-1].cumsum()[::-1] 514 | return R, bleu, delay, r 515 | 516 | def ReturnB(): 517 | # params 518 | gamma = _k['gamma'] 519 | beta = 0.1 520 | 521 | q0 = LatencyBLEUex(return_quality=True) 522 | d0 = NormalizedDelay() 523 | 524 | # just bleu 525 | bleu = q0[-1] 526 | 527 | # just delay 528 | delay = d0[-1] 529 | 530 | # use maximum-delay + latency bleu (without final BLEU) 531 | q = q0 532 | q[-1] = 0. 533 | d = MaximumDelay(_max=4, beta=beta) 534 | 535 | r = q + gamma * d 536 | R = r[::-1].cumsum()[::-1] 537 | return R, bleu, delay, r 538 | 539 | def ReturnC(): 540 | # params 541 | gamma = _k['gamma'] 542 | beta = 0.1 543 | 544 | q0 = LatencyBLEUex(return_quality=True) 545 | d0 = NormalizedDelay() 546 | 547 | # just bleu 548 | bleu = q0[-1] 549 | 550 | # just delay 551 | delay = d0[-1] 552 | 553 | # use maximum-delay + latency bleu (with final BLEU) 554 | q = q0 555 | d = MaximumDelay(_max=5, beta=beta) 556 | 557 | r = q + gamma * d 558 | R = r[::-1].cumsum()[::-1] 559 | return R, bleu, delay, r 560 | 561 | def ReturnD(): 562 | # params 563 | gamma = _k['gamma'] 564 | beta = 0.1 565 | 566 | q0 = LatencyBLEUex(return_quality=True) 567 | d0 = NormalizedDelay() 568 | 569 | # just bleu 570 | bleu = q0[-1] 571 | 572 | # just delay 573 | delay = d0[-1] 574 | 575 | # use moving-delay + latency bleu (with final BLEU) 576 | q = q0 577 | d = MovingDelay(beta=beta) 578 | 579 | r = q + gamma * d 580 | R = r[::-1].cumsum()[::-1] 581 | return R, bleu, delay, r 582 | 583 | def ReturnE(): 584 | # params 585 | gamma = _k['gamma'] 586 | beta = 0.1 587 | tau = _k['target'] 588 | 589 | q0 = LatencyBLEUex(return_quality=True) 590 | d0 = NormalizedDelay() 591 | 592 | # just bleu 593 | bleu = q0[-1] 594 | 595 | # just delay 596 | delay = d0[-1] 597 | 598 | # use maximum-delay + latency bleu (without final BLEU) + global delay 599 | q = q0 600 | q[-1] = 0. 
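        # The terminal entry of q is zeroed, so quality enters only through the
        # incremental latency-BLEU gains earned at each WRITE.  The lines that follow
        # add the local waiting penalty (MaximumDelay charges -beta for every
        # consecutive READ beyond the 4th) and, at the very last step, subtract the
        # amount by which the normalized delay overshoots the target tau; both delay
        # terms are scaled by gamma before being added to q.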
601 | d = MaximumDelay(_max=4, beta=beta) 602 | d[-1]-= numpy.maximum(delay - tau, 0) 603 | 604 | r = q + gamma * d 605 | R = r[::-1].cumsum()[::-1] 606 | return R, bleu, delay, r 607 | 608 | def ReturnF(): 609 | # params 610 | gamma = _k['gamma'] 611 | beta = 0.1 612 | tau = _k['target'] 613 | 614 | q0 = LatencyBLEUex(return_quality=True) 615 | d0 = NormalizedDelay() 616 | 617 | # just bleu 618 | bleu = q0[-1] 619 | 620 | # just delay 621 | delay = d0[-1] 622 | 623 | # use maximum-delay + latency bleu (with final BLEU) + global delay 624 | q = q0 625 | d = MaximumDelay(_max=5, beta=beta) 626 | d[-1] -= numpy.maximum(delay - tau, 0) * gamma 627 | 628 | r = q + d 629 | R = r[::-1].cumsum()[::-1] 630 | return R, bleu, delay, r 631 | 632 | # ---------------------------------------------------------------- # 633 | def ReturnG(): 634 | # params 635 | discount = _k['discount'] ## 0.95 here gamma is the discounting factor 636 | beta = 0.1 637 | 638 | q0 = LatencyBLEUwithForget(return_quality=True) 639 | d0 = NormalizedDelay() 640 | 641 | # just bleu 642 | bleu = q0[-1] 643 | 644 | # just delay 645 | delay = d0[-1] 646 | 647 | # use maximum-delay + latency bleu (with final BLEU) 648 | q = q0 649 | d = MaximumDelay(_max=4, beta=beta) 650 | s = MaximumSource(_max=7, beta=0.01) 651 | 652 | if discount == 1: 653 | r = q + d + s 654 | R = r[::-1].cumsum()[::-1] 655 | else: 656 | raise NotImplementedError 657 | 658 | return R, bleu, delay, r 659 | 660 | def ReturnH(): 661 | # params 662 | discount = _k['discount'] ## 0.95 here gamma is the discounting factor 663 | beta = 0.1 664 | 665 | q0 = LatencyBLEUwithForget(return_quality=True) 666 | d0 = NormalizedDelay() 667 | 668 | # just bleu 669 | bleu = q0[-1] 670 | 671 | # just delay 672 | delay = d0[-1] 673 | 674 | # use maximum-delay + latency bleu (with final BLEU) 675 | q = q0 676 | d = MaximumDelay(_max=4, beta=beta) 677 | s = MovingSource(beta=0.02) 678 | 679 | if discount == 1: 680 | r = q + d + s 681 | R = r[::-1].cumsum()[::-1] 682 | else: 683 | raise NotImplementedError 684 | 685 | return R, bleu, delay, r 686 | 687 | def ReturnI(): 688 | # params 689 | 690 | discount = _k['gamma'] ## 0.95 here gamma is the discounting factor 691 | maxsrc = _k['maxsrc'] 692 | beta = 0.1 693 | 694 | q0 = LatencyBLEUwithForget(return_quality=True) 695 | d0 = NormalizedDelay() 696 | 697 | # global reward signal :::>>> 698 | # just bleu 699 | bleu = q0[-1] 700 | 701 | # just delay 702 | delay = d0[-1] 703 | 704 | # local reward signal :::>>>> 705 | # use maximum-delay + latency bleu (with final BLEU) 706 | q = q0 707 | q[-1] = 0 708 | d = MaximumDelay(_max=5, beta=beta) 709 | s, _ = AwardForget(_max=maxsrc, beta=0.01) 710 | # s = AwardForget2(_max=maxsrc, beta=0.001) 711 | 712 | r0 = q + d + s 713 | rg = bleu # it is a global reward, will not be discounted. 
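        # Return computation (see the block below): with discount == 1 the return R is
        # a plain reversed cumulative sum of the local rewards r0, with the
        # sentence-level BLEU rg added at the final step; otherwise R follows the
        # backward recursion R[t] = discount * R[t + 1] + r0[t], and rg is then added
        # to every step without discounting.  For example, discount = 0.9 and
        # r0 = [0, 0, 1] (with rg = 0) gives R = [0.81, 0.9, 1.0].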
714 | 
715 |         if discount == 1:
716 |             r = r0
717 |             r[-1] += rg
718 |             R = r[::-1].cumsum()[::-1]
719 |         else:
720 |             R = numpy.zeros_like(r0)
721 |             R[-1] = r0[-1]
722 |             for it in range(_k['steps'] - 2, -1, -1):
723 |                 R[it] = discount * R[it + 1] + r0[it]
724 |             R += rg # add a global signal (without a discount factor)
725 | 
726 |         return R, bleu, delay, r0
727 | 
728 |     def ReturnJ():
729 |         # params
730 | 
731 |         discount = _k['gamma']  # _k['gamma'] is used as the discount factor here (e.g. 0.95)
732 |         beta = 0.1
733 | 
734 |         q0 = LatencyBLEUwithForget(return_quality=True)
735 |         d0 = NormalizedDelay()
736 | 
737 |         # global reward signal :::>>>
738 |         # just bleu
739 |         bleu = q0[-1]
740 | 
741 |         # just delay
742 |         delay = d0[-1]
743 | 
744 |         # local reward signal :::>>>>
745 |         # use maximum-delay + latency bleu (with final BLEU)
746 |         q = q0
747 |         q[-1] = 0
748 |         d = MaximumDelay(_max=5, beta=beta)
749 |         # s, m = AwardForget(_max=5, beta=0.01)
750 | 
751 |         r0 = q + d # + s
752 |         rg = bleu # * m # it is a global reward, will not be discounted.
753 | 
754 |         if discount == 1:
755 |             r = r0
756 |             r[-1] += rg
757 |             R = r[::-1].cumsum()[::-1]
758 |         else:
759 |             R = numpy.zeros_like(r0)
760 |             R[-1] = r0[-1]
761 |             for it in range(_k['steps'] - 2, -1, -1):
762 |                 R[it] = discount * R[it + 1] + r0[it]
763 |             R += rg # add a global signal (without a discount factor)
764 | 
765 |         return R, bleu, delay, r0
766 | 
767 | 
768 |     # **------------------------------------------------ **#
769 |     # when the morning glories fall
770 | 
771 |     def Q2Ds():
772 |         q = NormLogLikelihood()
773 |         d = NormalizedDelay()
774 |         return q, d
775 | 
776 |     gamma = _k['gamma']
777 |     type = _k['Rtype']
778 | 
779 |     funcs = [ReturnA, ReturnB, ReturnC, ReturnD, ReturnE, ReturnF, ReturnG, ReturnH, ReturnI, ReturnJ]
780 |     return funcs[type]()
781 | 
782 |     # return FullQualityDelay(tau, gamma)
783 |     # return QualityDelay(tau=tau, gamma=gamma)
784 | 
785 |     # return LatencyBLEUex()
786 |     # return Q2D4(0.2)
787 |     # return Q2Ds()
788 | 
--------------------------------------------------------------------------------
/run_eval.sh:
--------------------------------------------------------------------------------
1 | THEANO_FLAGS=device=gpu$2 python simultrans_evaluation.py --sinit 1 --target 0.5 --sample 64 --batchsize 1 --Rtype $1 --gamma 1 --id $3 --recurrent True 2>&1 | tee .images/$4.log
2 | 
--------------------------------------------------------------------------------
/run_train.sh:
--------------------------------------------------------------------------------
1 | THEANO_FLAGS=device=gpu$2 python simultrans_training.py --sample 32 --batchsize 1 --target $1 --gamma $3 --recurrent True 2>&1 | tee .log/$4.log
2 | 
--------------------------------------------------------------------------------
/translate.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | model=".pretrained/model_wmt15_bpe2k_uni_en-ru.npz"
4 | dict="/misc/kcgscratch1/ChoGroup/junyoung_exp/wmt15/ruen/train/all_ru-en.en.tok.bpe.word.pkl"
5 | dict_rev="/misc/kcgscratch1/ChoGroup/junyoung_exp/wmt15/ruen/train/all_ru-en.ru.tok.bpe.word.pkl"
6 | source="/misc/kcgscratch1/ChoGroup/junyoung_exp/wmt15/ruen/dev/newstest2013-src.en.tok.bpe"
7 | saveto=".translate/standard.trans.1"
8 | 
9 | THEANO_FLAGS="floatX=float32, device=cpu" python translate_uni.py -k 1 $model $dict $dict_rev $source $saveto
10 | 
--------------------------------------------------------------------------------
/translate_uni.py:
--------------------------------------------------------------------------------
1 | 
''' 2 | Translates a source file using a translation model. 3 | ''' 4 | import theano 5 | import argparse 6 | 7 | import numpy 8 | import cPickle as pkl 9 | 10 | from nmt_uni import (build_model, build_sampler, gen_sample, load_params, 11 | init_params, init_tparams, prepare_data) 12 | 13 | from multiprocessing import Process, Queue 14 | 15 | 16 | def translate_model(queue, rqueue, pid, model, options, k, normalize, kp, sigma): 17 | 18 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 19 | trng = RandomStreams(1234) 20 | 21 | # allocate model parameters 22 | params = init_params(options) 23 | 24 | # load model parameters and set theano shared variables 25 | params = load_params(model, params) 26 | tparams = init_tparams(params) 27 | 28 | trng, use_noise, \ 29 | x, x_mask, y, y_mask, \ 30 | opt_ret, \ 31 | cost = \ 32 | build_model(tparams, options) 33 | inps = [x, x_mask, y, y_mask] 34 | 35 | f_log_probs = theano.function(inps, cost) 36 | 37 | # word index 38 | f_init, f_next = build_sampler(tparams, options, trng) 39 | 40 | def _translate(idx, seq): 41 | all_samples = [] 42 | all_scores = [] 43 | 44 | for kidx in xrange(kp): 45 | if kidx == 0: 46 | ss = -1. 47 | else: 48 | ss = sigma 49 | # sample given an input sequence and obtain scores 50 | sample, score = gen_sample(tparams, f_init, f_next, 51 | numpy.array(seq).reshape([len(seq), 1]), 52 | options, trng=trng, k=k, maxlen=200, 53 | stochastic=False, argmax=False, sigma=ss) 54 | 55 | # normalize scores according to sequence lengths 56 | if normalize: 57 | lengths = numpy.array([len(s) for s in sample]) 58 | score = score / lengths 59 | #print idx, score 60 | sidx = numpy.argmin(score) 61 | all_samples.append(sample[sidx]) 62 | all_scores.append(score[sidx]) 63 | 64 | source_list = [seq] * kp 65 | x, x_mask, y, y_mask = prepare_data(source_list, all_samples, maxlen=None) 66 | all_scores = f_log_probs(x, x_mask, y, y_mask) 67 | if normalize: 68 | lengths = numpy.array([len(s) for s in all_samples]) 69 | all_scores = all_scores / lengths 70 | 71 | print idx, all_scores 72 | sidx = numpy.argmin(all_scores) 73 | return all_samples[sidx] 74 | 75 | while True: 76 | req = queue.get() 77 | if req is None: 78 | break 79 | 80 | idx, x = req[0], req[1] 81 | print pid, '-', idx 82 | seq = _translate(idx, x) 83 | 84 | rqueue.put((idx, seq)) 85 | 86 | return 87 | 88 | 89 | def main(model, dictionary, dictionary_target, source_file, saveto, k=5, 90 | normalize=False, n_process=5, chr_level=False, 91 | options_file=None, sigma=-1., kp=1): 92 | 93 | # load model model_options 94 | if options_file is not None: 95 | with open(options_file, 'rb') as f: 96 | options = pkl.load(f) 97 | else: 98 | with open('%s.pkl' % model, 'rb') as f: 99 | options = pkl.load(f) 100 | 101 | # load source dictionary and invert 102 | with open(dictionary, 'rb') as f: 103 | word_dict = pkl.load(f) 104 | word_idict = dict() 105 | for kk, vv in word_dict.iteritems(): 106 | word_idict[vv] = kk 107 | word_idict[0] = '' 108 | word_idict[1] = 'UNK' 109 | 110 | # load target dictionary and invert 111 | with open(dictionary_target, 'rb') as f: 112 | word_dict_trg = pkl.load(f) 113 | word_idict_trg = dict() 114 | for kk, vv in word_dict_trg.iteritems(): 115 | word_idict_trg[vv] = kk 116 | word_idict_trg[0] = '' 117 | word_idict_trg[1] = 'UNK' 118 | 119 | # create input and output queues for processes 120 | queue = Queue() 121 | rqueue = Queue() 122 | processes = [None] * n_process 123 | for midx in xrange(n_process): 124 | processes[midx] = Process( 125 | 
target=translate_model, 126 | args=(queue, rqueue, midx, model, options, k, normalize, kp, sigma)) 127 | processes[midx].start() 128 | 129 | # utility function 130 | def _seqs2words(caps): 131 | capsw = [] 132 | for cc in caps: 133 | ww = [] 134 | for w in cc: 135 | if w == 0: 136 | break 137 | ww.append(word_idict_trg[w]) 138 | capsw.append(' '.join(ww)) 139 | return capsw 140 | 141 | def _send_jobs(fname): 142 | with open(fname, 'r') as f: 143 | for idx, line in enumerate(f): 144 | if chr_level: 145 | words = list(line.decode('utf-8').strip()) 146 | else: 147 | words = line.strip().split() 148 | x = map(lambda w: word_dict[w] if w in word_dict else 1, words) 149 | x = map(lambda ii: ii if ii < options['n_words'] else 1, x) 150 | x += [0] 151 | queue.put((idx, x)) 152 | return idx+1 153 | 154 | def _finish_processes(): 155 | for midx in xrange(n_process): 156 | queue.put(None) 157 | 158 | def _retrieve_jobs(n_samples): 159 | trans = [None] * n_samples 160 | for idx in xrange(n_samples): 161 | resp = rqueue.get() 162 | trans[resp[0]] = resp[1] 163 | if numpy.mod(idx, 10) == 0: 164 | print 'Sample ', (idx+1), '/', n_samples, ' Done' 165 | return trans 166 | 167 | print 'Translating ', source_file, '...' 168 | n_samples = _send_jobs(source_file) 169 | trans = _seqs2words(_retrieve_jobs(n_samples)) 170 | _finish_processes() 171 | with open(saveto, 'w') as f: 172 | print >>f, '\n'.join(trans) 173 | print 'Done' 174 | 175 | 176 | if __name__ == "__main__": 177 | parser = argparse.ArgumentParser() 178 | parser.add_argument('-k', type=int, default=5) 179 | parser.add_argument('-kp', type=int, default=1) 180 | parser.add_argument('-p', type=int, default=5) 181 | parser.add_argument('-n', action="store_true", default=False) 182 | parser.add_argument('-c', action="store_true", default=False) 183 | parser.add_argument('-o', type=str, default=None) 184 | parser.add_argument('-s', type=float, default=-1.) 
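    # Optional flags: -k beam width, -kp number of repeated (noisy) decoding passes,
    # -p number of worker processes, -n length-normalize scores, -c character-level
    # input, -o load model options from a separate pickle, -s noise level sigma passed
    # to gen_sample.  The positional arguments that follow are the model .npz file,
    # the pickled source and target dictionaries, the source text file, and the path
    # for the output translations.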
185 | parser.add_argument('model', type=str) 186 | parser.add_argument('dictionary', type=str) 187 | parser.add_argument('dictionary_target', type=str) 188 | parser.add_argument('source', type=str) 189 | parser.add_argument('saveto', type=str) 190 | 191 | args = parser.parse_args() 192 | 193 | main(args.model, args.dictionary, args.dictionary_target, args.source, 194 | args.saveto, k=args.k, normalize=args.n, n_process=args.p, 195 | chr_level=args.c, options_file=args.o, kp=args.kp, sigma=args.s) 196 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file is for functions to help the translation 3 | """ 4 | import numpy as np 5 | import time 6 | import sys 7 | import json 8 | 9 | class Monitor(object): 10 | def __init__(self, root='http://localhost:9000'): 11 | self.root = root 12 | 13 | def display(self, batch, logs={}): 14 | import requests 15 | send = {} 16 | send['epoch'] = batch 17 | for k, v in logs.items(): 18 | send[k] = v 19 | 20 | try: 21 | requests.post(self.root + '/publish/epoch/end/', 22 | {'data': json.dumps(send)}) 23 | except: 24 | print('Warning: could not reach RemoteMonitor ' 25 | 'root server at ' + str(self.root)) 26 | 27 | 28 | 29 | class Progbar(object): 30 | def __init__(self, target, width=30, verbose=1, with_history=True): 31 | ''' 32 | @param target: total number of steps expected 33 | ''' 34 | self.width = width 35 | self.target = target 36 | self.sum_values = {} 37 | self.unique_values = [] 38 | self.start = time.time() 39 | self.total_width = 0 40 | self.seen_so_far = 0 41 | self.verbose = verbose 42 | self.with_history = with_history 43 | 44 | def update(self, current, values=[]): 45 | ''' 46 | @param current: index of current step 47 | @param values: list of tuples (name, value_for_last_step). 48 | The progress bar will display averages for these values. 49 | ''' 50 | if not self.with_history: 51 | self.sum_values = {} 52 | self.unique_values = [] 53 | 54 | for k, v in values: 55 | if k not in self.sum_values: 56 | self.sum_values[k] = [v * (current - self.seen_so_far), current - self.seen_so_far] 57 | self.unique_values.append(k) 58 | else: 59 | self.sum_values[k][0] += v * (current - self.seen_so_far) 60 | self.sum_values[k][1] += (current - self.seen_so_far) 61 | self.seen_so_far = current 62 | 63 | now = time.time() 64 | if self.verbose == 1: 65 | prev_total_width = self.total_width 66 | sys.stdout.write("\b" * prev_total_width) 67 | sys.stdout.write("\r") 68 | 69 | numdigits = int(np.floor(np.log10(self.target))) + 1 70 | barstr = '%%%dd/%%%dd [' % (numdigits, numdigits) 71 | bar = barstr % (current, self.target) 72 | prog = float(current)/self.target 73 | prog_width = int(self.width*prog) 74 | if prog_width > 0: 75 | bar += ('.'*(prog_width-1)) 76 | if current < self.target: 77 | bar += '(-w-)' 78 | else: 79 | bar += '(-v-)!!' 
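        # '(-w-)' marks a bar that is still in progress and '(-v-)!!' a finished one;
        # the remaining width is padded with '~' before the bar is closed with ']'.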
80 | bar += ('~' * (self.width-prog_width)) 81 | bar += ']' 82 | sys.stdout.write(bar) 83 | self.total_width = len(bar) 84 | 85 | if current: 86 | time_per_unit = (now - self.start) / current 87 | else: 88 | time_per_unit = 0 89 | eta = time_per_unit*(self.target - current) 90 | info = '' 91 | if current < self.target: 92 | info += ' - ETA: %ds' % eta 93 | else: 94 | info += ' - %ds' % (now - self.start) 95 | for k in self.unique_values: 96 | if k == 'perplexity' or k == 'PPL': 97 | info += ' - %s: %.4f' % (k, np.exp(self.sum_values[k][0] / max(1, self.sum_values[k][1]))) 98 | else: 99 | info += ' - %s: %.4f' % (k, self.sum_values[k][0] / max(1, self.sum_values[k][1])) 100 | 101 | self.total_width += len(info) 102 | if prev_total_width > self.total_width: 103 | info += ((prev_total_width-self.total_width) * " ") 104 | 105 | sys.stdout.write(info) 106 | sys.stdout.flush() 107 | 108 | if current >= self.target: 109 | sys.stdout.write("\n") 110 | 111 | if self.verbose == 2: 112 | if current >= self.target: 113 | info = '%ds' % (now - self.start) 114 | for k in self.unique_values: 115 | info += ' - %s: %.4f' % (k, self.sum_values[k][0] / max(1, self.sum_values[k][1])) 116 | sys.stdout.write(info + "\n") 117 | 118 | def add(self, n, values=[]): 119 | self.update(self.seen_so_far + n, values) 120 | 121 | def clear(self): 122 | self.sum_values = {} 123 | self.unique_values = [] 124 | self.total_width = 0 125 | self.seen_so_far = 0 126 | --------------------------------------------------------------------------------
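A minimal usage sketch for the Progbar utility above, assuming a hypothetical training loop (the step count and the reported quantities below are made up for illustration):

from utils import Progbar
import time

progress = Progbar(target=100)        # expect 100 steps in total
for step in range(1, 101):
    time.sleep(0.01)                  # stand-in for one real training step
    # report running averages of named quantities; 'PPL' is displayed as exp(mean)
    progress.update(step, values=[('cost', 1.0 / step), ('PPL', 2.0)])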