├── .gitattributes
├── .gitignore
├── Att_CopyNet
    ├── AttCopy.py
    ├── Att_CopyNet_copy_supervision
    │   ├── Att_copy_s.py
    │   ├── build_dictionary.py
    │   ├── data_iterator.py
    │   ├── train.py
    │   ├── translate.py
    │   └── translate_Windows.py
    ├── README.md
    ├── build_dictionary.py
    ├── data_2
    │   ├── dict2.txt
    │   ├── p.txt
    │   ├── r.txt
    │   ├── ttt.txt
    │   └── word_dict.pkl
    ├── data_iterator.py
    ├── predict.py
    ├── predict_windows.py
    └── train.py
├── Att_POS_CopyNet
    ├── README.md
    ├── build_dictionary.py
    ├── data_2
    │   ├── dict2.txt
    │   ├── p.txt
    │   ├── p_pos.txt
    │   ├── pos_dict.pkl
    │   ├── r.txt
    │   ├── r_pos.txt
    │   └── word_dict.pkl
    ├── data_iterator_for_pos.py
    ├── nmt_new_pos_word.py
    └── train.py
├── Att_Seq2Seq
    ├── Pdt.py
    ├── Pdt_windows.py
    ├── README.md
    ├── Seq2SeqAtt.py
    ├── data
    │   ├── pp.txt
    │   ├── pp.txt.pkl
    │   ├── ppv.txt
    │   ├── rr.txt
    │   ├── rr.txt.pkl
    │   └── rrv.txt
    ├── data_iterator.py
    └── train.py
└── README.md


/.gitattributes:
--------------------------------------------------------------------------------
 1 | # Auto detect text files and perform LF normalization
 2 | * text=auto
 3 | 
 4 | # Custom for Visual Studio
 5 | *.cs     diff=csharp
 6 | 
 7 | # Standard to msysgit
 8 | *.doc	 diff=astextplain
 9 | *.DOC	 diff=astextplain
10 | *.docx diff=astextplain
11 | *.DOCX diff=astextplain
12 | *.dot  diff=astextplain
13 | *.DOT  diff=astextplain
14 | *.pdf  diff=astextplain
15 | *.PDF	 diff=astextplain
16 | *.rtf	 diff=astextplain
17 | *.RTF	 diff=astextplain
18 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Windows image file caches
 2 | Thumbs.db
 3 | ehthumbs.db
 4 | 
 5 | # Folder config file
 6 | Desktop.ini
 7 | 
 8 | # Recycle Bin used on file shares
 9 | $RECYCLE.BIN/
10 | 
11 | # Windows Installer files
12 | *.cab
13 | *.msi
14 | *.msm
15 | *.msp
16 | 
17 | # Windows shortcuts
18 | *.lnk
19 | 
20 | # =========================
21 | # Operating System Files
22 | # =========================
23 | 
24 | # OSX
25 | # =========================
26 | 
27 | .DS_Store
28 | .AppleDouble
29 | .LSOverride
30 | 
31 | # Thumbnails
32 | ._*
33 | 
34 | # Files that might appear in the root of a volume
35 | .DocumentRevisions-V100
36 | .fseventsd
37 | .Spotlight-V100
38 | .TemporaryItems
39 | .Trashes
40 | .VolumeIcon.icns
41 | 
42 | # Directories potentially created on remote AFP share
43 | .AppleDB
44 | .AppleDesktop
45 | Network Trash Folder
46 | Temporary Items
47 | .apdisk
48 | 


--------------------------------------------------------------------------------
/Att_CopyNet/AttCopy.py:
--------------------------------------------------------------------------------
   1 | #-*- coding:utf-8 -*-
   2 | ############################################
   3 | #
   4 | #    Author: Chuwei Luo
   5 | #    Email: luochuwei@gmail.com
   6 | #    Date: 18/08/2016
   7 | #    Usage: copy net
   8 | #
   9 | ############################################
  10 | import theano
  11 | import theano.tensor as tensor
  12 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
  13 | 
  14 | import cPickle as pkl
  15 | # import ipdb
  16 | import numpy
  17 | import copy
  18 | 
  19 | import os
  20 | import warnings
  21 | import sys
  22 | import time
  23 | 
  24 | from collections import OrderedDict
  25 | 
  26 | from data_iterator import TextIterator
  27 | 
  28 | profile = False
  29 | 
  30 | 
  31 | # push parameters to Theano shared variables
  32 | def zipp(params, tparams):
  33 |     for kk, vv in params.iteritems():
  34 |         tparams[kk].set_value(vv)
  35 | 
  36 | 
  37 | # pull parameters from Theano shared variables
  38 | def unzip(zipped):
  39 |     new_params = OrderedDict()
  40 |     for kk, vv in zipped.iteritems():
  41 |         new_params[kk] = vv.get_value()
  42 |     return new_params
  43 | 
  44 | 
  45 | # get the list of parameters: Note that tparams must be OrderedDict
  46 | def itemlist(tparams):
  47 |     return [vv for kk, vv in tparams.iteritems()]
  48 | 
  49 | 
  50 | # dropout
  51 | def dropout_layer(state_before, use_noise, trng):
  52 |     proj = tensor.switch(
  53 |         use_noise,
  54 |         state_before * trng.binomial(state_before.shape, p=0.5, n=1,
  55 |                                      dtype=state_before.dtype),
  56 |         state_before * 0.5)
  57 |     return proj
  58 | 
  59 | 
  60 | # make prefix-appended name
  61 | def _p(pp, name):
  62 |     return '%s_%s' % (pp, name)
  63 | 
  64 | 
  65 | # initialize Theano shared variables according to the initial parameters
  66 | def init_tparams(params):
  67 |     tparams = OrderedDict()
  68 |     for kk, pp in params.iteritems():
  69 |         tparams[kk] = theano.shared(params[kk], name=kk)
  70 |     return tparams
  71 | 
  72 | 
  73 | # load parameters
  74 | def load_params(path, params):
  75 |     pp = numpy.load(path)
  76 |     for kk, vv in params.iteritems():
  77 |         if kk not in pp:
  78 |             warnings.warn('%s is not in the archive' % kk)
  79 |             continue
  80 |         params[kk] = pp[kk]
  81 | 
  82 |     return params
  83 | 
  84 | # layers: 'name': ('parameter initializer', 'feedforward')
  85 | layers = {'ff': ('param_init_fflayer', 'fflayer'),
  86 |           'gru': ('param_init_gru', 'gru_layer'),
  87 |           'gru_cond': ('param_init_gru_cond', 'gru_cond_layer'),
  88 |           }
  89 | 
  90 | 
  91 | def get_layer(name):
  92 |     fns = layers[name]
  93 |     return (eval(fns[0]), eval(fns[1]))
  94 | 
  95 | 
  96 | # some utilities
  97 | def ortho_weight(ndim):
  98 |     W = numpy.random.randn(ndim, ndim)
  99 |     u, s, v = numpy.linalg.svd(W)
 100 |     return u.astype('float32')
 101 | 
 102 | 
 103 | def norm_weight(nin, nout=None, scale=0.01, ortho=True):
 104 |     if nout is None:
 105 |         nout = nin
 106 |     if nout == nin and ortho:
 107 |         W = ortho_weight(nin)
 108 |     else:
 109 |         W = scale * numpy.random.randn(nin, nout)
 110 |     return W.astype('float32')
 111 | 
 112 | 
 113 | def tanh(x):
 114 |     return tensor.tanh(x)
 115 | 
 116 | 
 117 | def linear(x):
 118 |     return x
 119 | 
 120 | 
 121 | def concatenate(tensor_list, axis=0):
 122 |     """
 123 |     Alternative implementation of `theano.tensor.concatenate`.
 124 |     This function does exactly the same thing, but contrary to Theano's own
 125 |     implementation, the gradient is implemented on the GPU.
 126 |     Backpropagating through `theano.tensor.concatenate` yields slowdowns
 127 |     because the inverse operation (splitting) needs to be done on the CPU.
 128 |     This implementation does not have that problem.
 129 |     :usage:
 130 |         >>> x, y = theano.tensor.matrices('x', 'y')
 131 |         >>> c = concatenate([x, y], axis=1)
 132 |     :parameters:
 133 |         - tensor_list : list
 134 |             list of Theano tensor expressions that should be concatenated.
 135 |         - axis : int
 136 |             the tensors will be joined along this axis.
 137 |     :returns:
 138 |         - out : tensor
 139 |             the concatenated tensor expression.
 140 |     """
 141 |     concat_size = sum(tt.shape[axis] for tt in tensor_list)
 142 | 
 143 |     output_shape = ()
 144 |     for k in range(axis):
 145 |         output_shape += (tensor_list[0].shape[k],)
 146 |     output_shape += (concat_size,)
 147 |     for k in range(axis + 1, tensor_list[0].ndim):
 148 |         output_shape += (tensor_list[0].shape[k],)
 149 | 
 150 |     out = tensor.zeros(output_shape)
 151 |     offset = 0
 152 |     for tt in tensor_list:
 153 |         indices = ()
 154 |         for k in range(axis):
 155 |             indices += (slice(None),)
 156 |         indices += (slice(offset, offset + tt.shape[axis]),)
 157 |         for k in range(axis + 1, tensor_list[0].ndim):
 158 |             indices += (slice(None),)
 159 | 
 160 |         out = tensor.set_subtensor(out[indices], tt)
 161 |         offset += tt.shape[axis]
 162 | 
 163 |     return out
 164 | 
 165 | 
 166 | # batch preparation
 167 | def prepare_data(seqs_x, seqs_y, maxlen=None, n_words_src=30000,
 168 |                  n_words=30000):
 169 |     # x: a list of sentences
 170 |     lengths_x = [len(s) for s in seqs_x]
 171 |     lengths_y = [len(s) for s in seqs_y]
 172 | 
 173 |     if maxlen is not None:
 174 |         new_seqs_x = []
 175 |         new_seqs_y = []
 176 |         new_lengths_x = []
 177 |         new_lengths_y = []
 178 |         for l_x, s_x, l_y, s_y in zip(lengths_x, seqs_x, lengths_y, seqs_y):
 179 |             if l_x < maxlen and l_y < maxlen:
 180 |                 new_seqs_x.append(s_x)
 181 |                 new_lengths_x.append(l_x)
 182 |                 new_seqs_y.append(s_y)
 183 |                 new_lengths_y.append(l_y)
 184 |         lengths_x = new_lengths_x
 185 |         seqs_x = new_seqs_x
 186 |         lengths_y = new_lengths_y
 187 |         seqs_y = new_seqs_y
 188 | 
 189 |         if len(lengths_x) < 1 or len(lengths_y) < 1:
 190 |             return None, None, None, None
 191 | 
 192 |     n_samples = len(seqs_x)
 193 |     maxlen_x = numpy.max(lengths_x) + 1
 194 |     maxlen_y = numpy.max(lengths_y) + 1
 195 | 
 196 |     x = numpy.zeros((maxlen_x, n_samples)).astype('int64')
 197 |     y = numpy.zeros((maxlen_y, n_samples)).astype('int64')
 198 |     x_mask = numpy.zeros((maxlen_x, n_samples)).astype('float32')
 199 |     y_mask = numpy.zeros((maxlen_y, n_samples)).astype('float32')
 200 |     for idx, [s_x, s_y] in enumerate(zip(seqs_x, seqs_y)):
 201 |         x[:lengths_x[idx], idx] = s_x
 202 |         x_mask[:lengths_x[idx]+1, idx] = 1.
 203 |         y[:lengths_y[idx], idx] = s_y
 204 |         y_mask[:lengths_y[idx]+1, idx] = 1.
 205 | 
 206 |     return x, x_mask, y, y_mask
 207 | 
 208 | 
 209 | # feedforward layer: affine transformation + point-wise nonlinearity
 210 | def param_init_fflayer(options, params, prefix='ff', nin=None, nout=None,
 211 |                        ortho=True):
 212 |     if nin is None:
 213 |         nin = options['dim_proj']
 214 |     if nout is None:
 215 |         nout = options['dim_proj']
 216 |     params[_p(prefix, 'W')] = norm_weight(nin, nout, scale=0.01, ortho=ortho)
 217 |     params[_p(prefix, 'b')] = numpy.zeros((nout,)).astype('float32')
 218 | 
 219 |     return params
 220 | 
 221 | 
 222 | def fflayer(tparams, state_below, options, prefix='rconv',
 223 |             activ='lambda x: tensor.tanh(x)', **kwargs):
 224 |     return eval(activ)(
 225 |         tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
 226 |         tparams[_p(prefix, 'b')])
 227 | 
 228 | 
 229 | # GRU layer
 230 | def param_init_gru(options, params, prefix='gru', nin=None, dim=None):
 231 |     if nin is None:
 232 |         nin = options['dim_proj']
 233 |     if dim is None:
 234 |         dim = options['dim_proj']
 235 | 
 236 |     # embedding to gates transformation weights, biases
 237 |     W = numpy.concatenate([norm_weight(nin, dim),
 238 |                            norm_weight(nin, dim)], axis=1)
 239 |     params[_p(prefix, 'W')] = W
 240 |     params[_p(prefix, 'b')] = numpy.zeros((2 * dim,)).astype('float32')
 241 | 
 242 |     # recurrent transformation weights for gates
 243 |     U = numpy.concatenate([ortho_weight(dim),
 244 |                            ortho_weight(dim)], axis=1)
 245 |     params[_p(prefix, 'U')] = U
 246 | 
 247 |     # embedding to hidden state proposal weights, biases
 248 |     Wx = norm_weight(nin, dim)
 249 |     params[_p(prefix, 'Wx')] = Wx
 250 |     params[_p(prefix, 'bx')] = numpy.zeros((dim,)).astype('float32')
 251 | 
 252 |     # recurrent transformation weights for hidden state proposal
 253 |     Ux = ortho_weight(dim)
 254 |     params[_p(prefix, 'Ux')] = Ux
 255 | 
 256 |     return params
 257 | 
 258 | 
 259 | def gru_layer(tparams, state_below, options, prefix='gru', mask=None,
 260 |               **kwargs):
 261 |     nsteps = state_below.shape[0]
 262 |     if state_below.ndim == 3:
 263 |         n_samples = state_below.shape[1]
 264 |     else:
 265 |         n_samples = 1
 266 | 
 267 |     dim = tparams[_p(prefix, 'Ux')].shape[1]
 268 | 
 269 |     if mask is None:
 270 |         mask = tensor.alloc(1., state_below.shape[0], 1)
 271 | 
 272 |     # utility function to slice a tensor
 273 |     def _slice(_x, n, dim):
 274 |         if _x.ndim == 3:
 275 |             return _x[:, :, n*dim:(n+1)*dim]
 276 |         return _x[:, n*dim:(n+1)*dim]
 277 | 
 278 |     # state_below is the input word embeddings
 279 |     # input to the gates, concatenated
 280 |     state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + \
 281 |         tparams[_p(prefix, 'b')]
 282 |     # input to compute the hidden state proposal
 283 |     state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + \
 284 |         tparams[_p(prefix, 'bx')]
 285 | 
 286 |     # step function to be used by scan
 287 |     # arguments    | sequences |outputs-info| non-seqs
 288 |     def _step_slice(m_, x_, xx_, h_, U, Ux):
 289 |         preact = tensor.dot(h_, U)
 290 |         preact += x_
 291 | 
 292 |         # reset and update gates
 293 |         r = tensor.nnet.sigmoid(_slice(preact, 0, dim))
 294 |         u = tensor.nnet.sigmoid(_slice(preact, 1, dim))
 295 | 
 296 |         # compute the hidden state proposal
 297 |         preactx = tensor.dot(h_, Ux)
 298 |         preactx = preactx * r
 299 |         preactx = preactx + xx_
 300 | 
 301 |         # hidden state proposal
 302 |         h = tensor.tanh(preactx)
 303 | 
 304 |         # leaky integrate and obtain next hidden state
 305 |         h = u * h_ + (1. - u) * h
 306 |         h = m_[:, None] * h + (1. - m_)[:, None] * h_
 307 | 
 308 |         return h
 309 | 
 310 |     # prepare scan arguments
 311 |     seqs = [mask, state_below_, state_belowx]
 312 |     init_states = [tensor.alloc(0., n_samples, dim)]
 313 |     _step = _step_slice
 314 |     shared_vars = [tparams[_p(prefix, 'U')],
 315 |                    tparams[_p(prefix, 'Ux')]]
 316 | 
 317 |     rval, updates = theano.scan(_step,
 318 |                                 sequences=seqs,
 319 |                                 outputs_info=init_states,
 320 |                                 non_sequences=shared_vars,
 321 |                                 name=_p(prefix, '_layers'),
 322 |                                 n_steps=nsteps,
 323 |                                 profile=profile,
 324 |                                 strict=True)
 325 |     rval = [rval]
 326 |     return rval
 327 | 
 328 | 
 329 | # Conditional GRU layer with Attention
 330 | def param_init_gru_cond(options, params, prefix='gru_cond',
 331 |                         nin=None, dim=None, dimctx=None,
 332 |                         nin_nonlin=None, dim_nonlin=None):
 333 |     if nin is None:
 334 |         nin = options['dim']
 335 |     if dim is None:
 336 |         dim = options['dim']
 337 |     if dimctx is None:
 338 |         dimctx = options['dim']
 339 |     if nin_nonlin is None:
 340 |         nin_nonlin = nin
 341 |     if dim_nonlin is None:
 342 |         dim_nonlin = dim
 343 | 
 344 |     W = numpy.concatenate([norm_weight(nin, dim),
 345 |                            norm_weight(nin, dim)], axis=1)
 346 |     params[_p(prefix, 'W')] = W
 347 |     params[_p(prefix, 'b')] = numpy.zeros((2 * dim,)).astype('float32')
 348 |     U = numpy.concatenate([ortho_weight(dim_nonlin),
 349 |                            ortho_weight(dim_nonlin)], axis=1)
 350 |     params[_p(prefix, 'U')] = U
 351 | 
 352 |     Wx = norm_weight(nin_nonlin, dim_nonlin)
 353 |     params[_p(prefix, 'Wx')] = Wx
 354 |     Ux = ortho_weight(dim_nonlin)
 355 |     params[_p(prefix, 'Ux')] = Ux
 356 |     params[_p(prefix, 'bx')] = numpy.zeros((dim_nonlin,)).astype('float32')
 357 | 
 358 |     U_nl = numpy.concatenate([ortho_weight(dim_nonlin),
 359 |                               ortho_weight(dim_nonlin)], axis=1)
 360 |     params[_p(prefix, 'U_nl')] = U_nl
 361 |     params[_p(prefix, 'b_nl')] = numpy.zeros((2 * dim_nonlin,)).astype('float32')
 362 | 
 363 |     Ux_nl = ortho_weight(dim_nonlin)
 364 |     params[_p(prefix, 'Ux_nl')] = Ux_nl
 365 |     params[_p(prefix, 'bx_nl')] = numpy.zeros((dim_nonlin,)).astype('float32')
 366 | 
 367 |     # context to LSTM
 368 |     Wc = norm_weight(dimctx, dim*2)
 369 |     params[_p(prefix, 'Wc')] = Wc
 370 | 
 371 |     Wcx = norm_weight(dimctx, dim)
 372 |     params[_p(prefix, 'Wcx')] = Wcx
 373 | 
 374 |     # attention: combined -> hidden
 375 |     W_comb_att = norm_weight(dim, dimctx)
 376 |     params[_p(prefix, 'W_comb_att')] = W_comb_att
 377 | 
 378 |     # attention: context -> hidden
 379 |     Wc_att = norm_weight(dimctx)
 380 |     params[_p(prefix, 'Wc_att')] = Wc_att
 381 | 
 382 |     # attention: hidden bias
 383 |     b_att = numpy.zeros((dimctx,)).astype('float32')
 384 |     params[_p(prefix, 'b_att')] = b_att
 385 | 
 386 |     # attention:
 387 |     U_att = norm_weight(dimctx, 1)
 388 |     params[_p(prefix, 'U_att')] = U_att
 389 |     c_att = numpy.zeros((1,)).astype('float32')
 390 |     params[_p(prefix, 'c_tt')] = c_att
 391 | 
 392 |     return params
 393 | 
 394 | 
 395 | def gru_cond_layer(tparams, state_below, options, prefix='gru',
 396 |                    mask=None, context=None, one_step=False,
 397 |                    init_memory=None, init_state=None,
 398 |                    context_mask=None,
 399 |                    **kwargs):
 400 | 
 401 |     assert context, 'Context must be provided'
 402 | 
 403 |     if one_step:
 404 |         assert init_state, 'previous state must be provided'
 405 | 
 406 |     nsteps = state_below.shape[0]
 407 |     if state_below.ndim == 3:
 408 |         n_samples = state_below.shape[1]
 409 |     else:
 410 |         n_samples = 1
 411 | 
 412 |     # mask
 413 |     if mask is None:
 414 |         mask = tensor.alloc(1., state_below.shape[0], 1)
 415 | 
 416 |     dim = tparams[_p(prefix, 'Wcx')].shape[1]
 417 | 
 418 |     # initial/previous state
 419 |     if init_state is None:
 420 |         init_state = tensor.alloc(0., n_samples, dim)
 421 | 
 422 |     # projected context
 423 |     assert context.ndim == 3, \
 424 |         'Context must be 3-d: #annotation x #sample x dim'
 425 |     pctx_ = tensor.dot(context, tparams[_p(prefix, 'Wc_att')]) +\
 426 |         tparams[_p(prefix, 'b_att')]
 427 | 
 428 |     def _slice(_x, n, dim):
 429 |         if _x.ndim == 3:
 430 |             return _x[:, :, n*dim:(n+1)*dim]
 431 |         return _x[:, n*dim:(n+1)*dim]
 432 | 
 433 |     # projected x
 434 |     state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) +\
 435 |         tparams[_p(prefix, 'bx')]
 436 |     state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) +\
 437 |         tparams[_p(prefix, 'b')]
 438 | 
 439 |     def _step_slice(m_, x_, xx_, h_, ctx_, alpha_, pctx_, cc_,
 440 |                     U, Wc, W_comb_att, U_att, c_tt, Ux, Wcx,
 441 |                     U_nl, Ux_nl, b_nl, bx_nl):
 442 |         preact1 = tensor.dot(h_, U)
 443 |         preact1 += x_
 444 |         preact1 = tensor.nnet.sigmoid(preact1)
 445 | 
 446 |         r1 = _slice(preact1, 0, dim)
 447 |         u1 = _slice(preact1, 1, dim)
 448 | 
 449 |         preactx1 = tensor.dot(h_, Ux)
 450 |         preactx1 *= r1
 451 |         preactx1 += xx_
 452 | 
 453 |         h1 = tensor.tanh(preactx1)
 454 | 
 455 |         h1 = u1 * h_ + (1. - u1) * h1
 456 |         h1 = m_[:, None] * h1 + (1. - m_)[:, None] * h_
 457 | 
 458 |         # attention
 459 |         pstate_ = tensor.dot(h1, W_comb_att)
 460 |         pctx__ = pctx_ + pstate_[None, :, :]
 461 |         #pctx__ += xc_
 462 |         pctx__ = tensor.tanh(pctx__)
 463 |         alpha = tensor.dot(pctx__, U_att)+c_tt
 464 |         alpha = alpha.reshape([alpha.shape[0], alpha.shape[1]])
 465 |         alpha = tensor.exp(alpha)
 466 |         if context_mask:
 467 |             alpha = alpha * context_mask
 468 |         alpha = alpha / alpha.sum(0, keepdims=True)
 469 |         ctx_ = (cc_ * alpha[:, :, None]).sum(0)  # current context
 470 | 
 471 |         preact2 = tensor.dot(h1, U_nl)+b_nl
 472 |         preact2 += tensor.dot(ctx_, Wc)
 473 |         preact2 = tensor.nnet.sigmoid(preact2)
 474 | 
 475 |         r2 = _slice(preact2, 0, dim)
 476 |         u2 = _slice(preact2, 1, dim)
 477 | 
 478 |         preactx2 = tensor.dot(h1, Ux_nl)+bx_nl
 479 |         preactx2 *= r2
 480 |         preactx2 += tensor.dot(ctx_, Wcx)
 481 | 
 482 |         h2 = tensor.tanh(preactx2)
 483 | 
 484 |         h2 = u2 * h1 + (1. - u2) * h2
 485 |         h2 = m_[:, None] * h2 + (1. - m_)[:, None] * h1
 486 | 
 487 |         return h2, ctx_, alpha.T  # pstate_, preact, preactx, r, u
 488 | 
 489 |     seqs = [mask, state_below_, state_belowx]
 490 |     #seqs = [mask, state_below_, state_belowx, state_belowc]
 491 |     _step = _step_slice
 492 | 
 493 |     shared_vars = [tparams[_p(prefix, 'U')],
 494 |                    tparams[_p(prefix, 'Wc')],
 495 |                    tparams[_p(prefix, 'W_comb_att')],
 496 |                    tparams[_p(prefix, 'U_att')],
 497 |                    tparams[_p(prefix, 'c_tt')],
 498 |                    tparams[_p(prefix, 'Ux')],
 499 |                    tparams[_p(prefix, 'Wcx')],
 500 |                    tparams[_p(prefix, 'U_nl')],
 501 |                    tparams[_p(prefix, 'Ux_nl')],
 502 |                    tparams[_p(prefix, 'b_nl')],
 503 |                    tparams[_p(prefix, 'bx_nl')]]
 504 | 
 505 |     if one_step:
 506 |         rval = _step(*(seqs + [init_state, None, None, pctx_, context] +
 507 |                        shared_vars))
 508 |     else:
 509 |         rval, updates = theano.scan(_step,
 510 |                                     sequences=seqs,
 511 |                                     outputs_info=[init_state,
 512 |                                                   tensor.alloc(0., n_samples,
 513 |                                                                context.shape[2]),
 514 |                                                   tensor.alloc(0., n_samples,
 515 |                                                                context.shape[0])],
 516 |                                     non_sequences=[pctx_, context]+shared_vars,
 517 |                                     name=_p(prefix, '_layers'),
 518 |                                     n_steps=nsteps,
 519 |                                     profile=profile,
 520 |                                     strict=True)
 521 |     return rval
 522 | 
 523 | 
 524 | # initialize all parameters
 525 | def init_params(options):
 526 |     params = OrderedDict()
 527 | 
 528 |     # embedding
 529 |     params['Wemb'] = norm_weight(options['n_words_src'], options['dim_word'])
 530 |     # params['Wemb_pos'] = norm_weight(options['n_pos'], options['dim_pos'])
 531 | 
 532 |     params = get_layer(options['encoder'])[0](options, params,
 533 |                                               prefix='encoder',
 534 |                                               nin=options['dim_word'],
 535 |                                               dim=options['dim'])
 536 |     params = get_layer(options['encoder'])[0](options, params,
 537 |                                               prefix='encoder_r',
 538 |                                               nin=options['dim_word'],
 539 |                                               dim=options['dim'])
 540 |     ctxdim = 2 * options['dim']
 541 | 
 542 |     # init_state, init_cell
 543 |     params = get_layer('ff')[0](options, params, prefix='ff_state',
 544 |                                 nin=ctxdim, nout=options['dim'])
 545 |     # decoder
 546 |     params = get_layer(options['decoder'])[0](options, params,
 547 |                                               prefix='decoder',
 548 |                                               nin=options['dim_word'],
 549 |                                               dim=options['dim'],
 550 |                                               dimctx=ctxdim)
 551 |     # readout
 552 |     # params = get_layer('ff')[0](options, params, prefix='ff_logit_lstm',
 553 |     #                             nin=options['dim'], nout=options['dim_word'],
 554 |     #                             ortho=False)
 555 |     # params = get_layer('ff')[0](options, params, prefix='ff_logit_prev',
 556 |     #                             nin=options['dim_word'],
 557 |     #                             nout=options['dim_word'], ortho=False)
 558 |     # params = get_layer('ff')[0](options, params, prefix='ff_logit_ctx',
 559 |     #                             nin=ctxdim, nout=options['dim_word'],
 560 |     #                             ortho=False)
 561 |     # params = get_layer('ff')[0](options, params, prefix='ff_logit',
 562 |     #                             nin=options['dim_word'],
 563 |     #                             nout=options['n_words'])
 564 |     params = get_layer('ff')[0](options, params, prefix='ff_logit_lstm',
 565 |                                 nin=options['dim'], nout=options['dim_word'],
 566 |                                 ortho=False)
 567 |     params = get_layer('ff')[0](options, params, prefix='ff_logit_prev',
 568 |                                 nin=options['dim_word'],
 569 |                                 nout=options['dim_word'], ortho=False)
 570 |     params = get_layer('ff')[0](options, params, prefix='ff_logit_ctx',
 571 |                                 nin=ctxdim, nout=options['dim_word'],
 572 |                                 ortho=False)
 573 |     params = get_layer('ff')[0](options, params, prefix='ff_logit',
 574 |                                 nin=options['dim_word'],
 575 |                                 nout=options['n_words'])
 576 | 
 577 |     params['W_out_lambda'] = 0.01 * numpy.random.randn(options['dim'],1).astype('float32')
 578 | 
 579 | 
 580 |     return params
 581 | 
 582 | 
 583 | # build a training model
 584 | def build_model(tparams, options):
 585 |     opt_ret = dict()
 586 | 
 587 |     trng = RandomStreams(1234)
 588 |     use_noise = theano.shared(numpy.float32(0.))
 589 | 
 590 |     # description string: #words x #samples
 591 |     x = tensor.matrix('x', dtype='int64')
 592 |     x_map1 = tensor.vector('x', dtype='int64')
 593 |     # x_map2 = tensor.vector('x', dtype='int64')
 594 |     x_mask = tensor.matrix('x_mask', dtype='float32')
 595 |     x_mask_for_attw = tensor.matrix('x_mask_for_attw', dtype='float32')
 596 |     y = tensor.matrix('y', dtype='int64')
 597 |     new_y = tensor.matrix('new_y', dtype='int64')
 598 |     y_mask = tensor.matrix('y_mask', dtype='float32')
 599 | 
 600 |     word_map = tensor.vector('wm', dtype='int64')
 601 |     # label_for_dim_expand = tensor.vector('lde', dtype='int64')
 602 |     # lambda_a = tensor.matrix('lambda_a', dype='int64')
 603 | 
 604 |     # for the backward rnn, we just need to invert x and x_mask
 605 |     xr = x[::-1]
 606 |     xr_mask = x_mask[::-1]
 607 | 
 608 |     n_timesteps = x.shape[0]
 609 |     n_timesteps_trg = y.shape[0]
 610 |     n_samples = x.shape[1]
 611 | 
 612 |     # word embedding for forward rnn (source)
 613 |     emb = tparams['Wemb'][x.flatten()]
 614 | 
 615 |     emb = emb.reshape([n_timesteps, n_samples, options['dim_word']])
 616 |     proj = get_layer(options['encoder'])[1](tparams, emb, options,
 617 |                                             prefix='encoder',
 618 |                                             mask=x_mask)
 619 |     # word embedding for backward rnn (source)
 620 |     embr = tparams['Wemb'][xr.flatten()]
 621 |     embr = embr.reshape([n_timesteps, n_samples, options['dim_word']])
 622 |     projr = get_layer(options['encoder'])[1](tparams, embr, options,
 623 |                                              prefix='encoder_r',
 624 |                                              mask=xr_mask)
 625 | 
 626 |     # context will be the concatenation of forward and backward rnns
 627 |     ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim-1)
 628 | 
 629 |     # mean of the context (across time) will be used to initialize decoder rnn
 630 |     ctx_mean = (ctx * x_mask[:, :, None]).sum(0) / x_mask.sum(0)[:, None]
 631 | 
 632 |     # or you can use the last state of forward + backward encoder rnns
 633 |     # ctx_mean = concatenate([proj[0][-1], projr[0][-1]], axis=proj[0].ndim-2)
 634 | 
 635 |     # initial decoder state
 636 |     init_state = get_layer('ff')[1](tparams, ctx_mean, options,
 637 |                                     prefix='ff_state', activ='tanh')
 638 | 
 639 |     # word embedding (target), we will shift the target sequence one time step
 640 |     # to the right. This is done because of the bi-gram connections in the
 641 |     # readout and decoder rnn. The first target will be all zeros and we will
 642 |     # not condition on the last output.
 643 |     # emb = tparams['Wemb_dec'][y.flatten()]
 644 |     emb = tparams['Wemb'][y.flatten()]
 645 |     emb = emb.reshape([n_timesteps_trg, n_samples, options['dim_word']])
 646 |     emb_shifted = tensor.zeros_like(emb)
 647 |     emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1])
 648 |     emb = emb_shifted
 649 | 
 650 |     # decoder - pass through the decoder conditional gru with attention
 651 |     proj = get_layer(options['decoder'])[1](tparams, emb, options,
 652 |                                             prefix='decoder',
 653 |                                             mask=y_mask, context=ctx,
 654 |                                             context_mask=x_mask,
 655 |                                             one_step=False,
 656 |                                             init_state=init_state)
 657 |     # hidden states of the decoder gru
 658 |     proj_h = proj[0]
 659 | 
 660 |     # weighted averages of context, generated by attention module
 661 |     ctxs = proj[1]
 662 | 
 663 |     # weights (alignment matrix)
 664 |     opt_ret['dec_alphas'] = proj[2]
 665 |     # print opt_ret['dec_alphas'].shape
 666 | 
 667 |     # compute word probabilities
 668 |     logit_lstm = get_layer('ff')[1](tparams, proj_h, options,
 669 |                                     prefix='ff_logit_lstm', activ='linear')
 670 |     logit_prev = get_layer('ff')[1](tparams, emb, options,
 671 |                                     prefix='ff_logit_prev', activ='linear')
 672 |     logit_ctx = get_layer('ff')[1](tparams, ctxs, options,
 673 |                                    prefix='ff_logit_ctx', activ='linear')
 674 | 
 675 |     logit = tensor.tanh(logit_lstm+logit_prev+logit_ctx)
 676 | 
 677 | 
 678 |     proj_h_shp = proj_h.shape
 679 | 
 680 |     attw_lambda = tensor.nnet.sigmoid(tensor.dot(proj_h.reshape([proj_h_shp[0] * proj_h_shp[1], proj_h_shp[2]]), tparams['W_out_lambda']))
 681 | 
 682 |     if options['use_dropout']:
 683 |         logit = dropout_layer(logit, use_noise, trng)
 684 | 
 685 |     logit = eval('linear')(tensor.dot(logit, tparams[_p('ff_logit', 'W')][:,word_map]) + tparams[_p('ff_logit', 'b')][word_map])
 686 | 
 687 |     #copy attention
 688 |     logit_shp = logit.shape
 689 |     r1,_ = theano.scan(lambda :tensor.constant(0), n_steps = logit_shp[2])
 690 |     logit_new = logit.reshape([logit_shp[0]*logit_shp[1], logit_shp[2]])
 691 | 
 692 |     alpha_shape = opt_ret['dec_alphas'].shape
 693 |     attw = opt_ret['dec_alphas'].reshape([alpha_shape[0], alpha_shape[1] * alpha_shape[2]])
 694 |     r2,_ = theano.scan(lambda :tensor.constant(0), n_steps = alpha_shape[2])
 695 |     attw = x_mask_for_attw.T.flatten() * attw
 696 |     lambda_plus_attw1 = attw_lambda[:,r2] * attw.reshape([alpha_shape[0] * alpha_shape[1], alpha_shape[2]])
 697 |     lambda_plus_attw = lambda_plus_attw1.reshape([alpha_shape[0], alpha_shape[1]*alpha_shape[2]])
 698 |     # logit_new2 = logit_new.reshape([logit_shp[0], logit_shp[1]*logit_shp[2]])
 699 | 
 700 |     # logit_new2 = tensor.set_subtensor(logit_new2[:,x_map1], logit_new2[:,x_map1] + lambda_plus_attw)
 701 | 
 702 |     probs_0 = (1-attw_lambda[:,r1]) * tensor.nnet.softmax(logit_new)
 703 | 
 704 |     probs_1 = probs_0.reshape([logit_shp[0], logit_shp[1]*logit_shp[2]])
 705 |     probs_1 = tensor.set_subtensor(probs_1[:,x_map1], probs_1[:,x_map1] + lambda_plus_attw)
 706 | 
 707 |     probs = probs_1.reshape([logit_shp[0]*logit_shp[1], logit_shp[2]])
 708 | 
 709 |     # cost
 710 |     # y_flat = y.flatten()
 711 |     y_flat = new_y.flatten()
 712 |     y_flat_idx = tensor.arange(y_flat.shape[0]) * word_map.shape[0] + y_flat
 713 |     cost = -tensor.log(probs.flatten()[y_flat_idx])
 714 |     cost = cost.reshape([y.shape[0], y.shape[1]])
 715 |     cost = (cost * y_mask).sum(0)
 716 | 
 717 |     fucktest = [attw, probs]
 718 | 
 719 | 
 720 |     return trng, use_noise, x, x_map1, x_mask, x_mask_for_attw, y, new_y, y_mask, opt_ret, cost, word_map, fucktest
 721 | 
 722 | 
 723 | # build a sampler
 724 | def build_sampler(tparams, options, trng, use_noise):
 725 |     x = tensor.matrix('x', dtype='int64')
 726 |     xr = x[::-1]
 727 |     n_timesteps = x.shape[0]
 728 |     n_samples = x.shape[1]
 729 | 
 730 |     # word embedding (source), forward and backward
 731 |     # emb = tparams['Wemb'][x.flatten()]
 732 |     # emb = emb.reshape([n_timesteps, n_samples, options['dim_word']])
 733 |     # embr = tparams['Wemb'][xr.flatten()]
 734 |     # embr = embr.reshape([n_timesteps, n_samples, options['dim_word']])
 735 |     emb = tparams['Wemb'][x.flatten()]
 736 |     emb = emb.reshape([n_timesteps, n_samples, options['dim_word']])
 737 | 
 738 |     embr = tparams['Wemb'][xr.flatten()]
 739 |     embr = embr.reshape([n_timesteps, n_samples, options['dim_word']])
 740 | 
 741 |     # encoder
 742 |     proj = get_layer(options['encoder'])[1](tparams, emb, options,
 743 |                                             prefix='encoder')
 744 |     projr = get_layer(options['encoder'])[1](tparams, embr, options,
 745 |                                              prefix='encoder_r')
 746 | 
 747 |     # concatenate forward and backward rnn hidden states
 748 |     ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim-1)
 749 | 
 750 |     # get the input for decoder rnn initializer mlp
 751 |     ctx_mean = ctx.mean(0)
 752 |     # ctx_mean = concatenate([proj[0][-1],projr[0][-1]], axis=proj[0].ndim-2)
 753 |     init_state = get_layer('ff')[1](tparams, ctx_mean, options,
 754 |                                     prefix='ff_state', activ='tanh')
 755 | 
 756 |     print 'Building f_init...',
 757 |     outs = [init_state, ctx]
 758 |     f_init = theano.function([x], outs, name='f_init', profile=profile)
 759 |     print 'Done'
 760 | 
 761 |     # x: 1 x 1
 762 |     y = tensor.vector('y_sampler', dtype='int64')
 763 |     x_map1 = tensor.vector('x_map1', dtype='int64')
 764 |     x_mask = tensor.vector('x_mask', dtype='int64')
 765 |     # x_map2 = tensor.vector('x', dtype='int64')
 766 |     word_map = tensor.vector('wm', dtype='int64')
 767 |     init_state = tensor.matrix('init_state', dtype='float32')
 768 | 
 769 |     # if it's the first word, emb should be all zero and it is indicated by -1
 770 |     # emb = tensor.switch(y[:, None] < 0,
 771 |     #                     tensor.alloc(0., 1, tparams['Wemb_dec'].shape[1]),
 772 |     #                     tparams['Wemb_dec'][y])
 773 |     emb = tensor.switch(y[:, None] < 0,
 774 |                         tensor.alloc(0., 1, tparams['Wemb'].shape[1]),
 775 |                         tparams['Wemb'][y])
 776 | 
 777 |     # apply one step of conditional gru with attention
 778 |     proj = get_layer(options['decoder'])[1](tparams, emb, options,
 779 |                                             prefix='decoder',
 780 |                                             mask=None, context=ctx,
 781 |                                             one_step=True,
 782 |                                             init_state=init_state)
 783 |     # get the next hidden state
 784 |     next_state = proj[0]
 785 | 
 786 |     # get the weighted averages of context for this target word y
 787 |     ctxs = proj[1]
 788 | 
 789 |     logit_lstm = get_layer('ff')[1](tparams, next_state, options,
 790 |                                     prefix='ff_logit_lstm', activ='linear')
 791 |     logit_prev = get_layer('ff')[1](tparams, emb, options,
 792 |                                     prefix='ff_logit_prev', activ='linear')
 793 |     logit_ctx = get_layer('ff')[1](tparams, ctxs, options,
 794 |                                    prefix='ff_logit_ctx', activ='linear')
 795 | 
 796 |     logit = tensor.tanh(logit_lstm+logit_prev+logit_ctx)
 797 | 
 798 | 
 799 |     if options['use_dropout']:
 800 |         logit = dropout_layer(logit, use_noise, trng)
 801 | 
 802 |     # logit = get_layer('ff')[1](tparams, logit, options,
 803 |     #                            prefix='ff_logit', activ='linear')
 804 |     # logit = eval('linear')(tensor.dot(logit, tparams[_p('ff_logit', 'W')][:,word_map]) + tparams[_p('ff_logit', 'b')][word_map])
 805 | 
 806 |     logit = eval('linear')(tensor.dot(logit, tparams[_p('ff_logit', 'W')][:,word_map]) + tparams[_p('ff_logit', 'b')][word_map])
 807 |     
 808 |     #do not copy 'eos'
 809 |     # tparams['att_lambda'] = tensor.set_subtensor(tparams['att_lambda'][0], 0.0)
 810 | 
 811 |     attw_lambda = tensor.nnet.sigmoid(tensor.dot(next_state, tparams['W_out_lambda']))
 812 | 
 813 |     f_lambda = theano.function([x_map1, y, ctx, init_state, word_map], attw_lambda, on_unused_input='ignore')
 814 | 
 815 |     # logit_new = (1-tparams['att_lambda'][word_map]) * logit
 816 | 
 817 |     r1,_ = theano.scan(lambda :tensor.constant(0), n_steps=logit.shape[-1])
 818 | 
 819 |     # logit_new =  logit
 820 | 
 821 |     # logit_new = tensor.set_subtensor(logit_new[:,x_map1], logit_new[:,x_map1] + tparams['att_lambda'][word_map][x_map2] * attw)
 822 |     # logit_new = tensor.set_subtensor(logit_new[:,x_map1], logit_new[:,x_map1] + attw_lambda[:, r2] * attw)
 823 | 
 824 |     prob_1 = (1-attw_lambda[:, r1]) * tensor.nnet.softmax(logit)
 825 | 
 826 |     attw = proj[2]
 827 |     attw = x_mask * attw
 828 | 
 829 |     r2,_ = theano.scan(lambda :tensor.constant(0), n_steps=attw.shape[-1])
 830 | 
 831 |     prob_1 = tensor.set_subtensor(prob_1[:,x_map1], prob_1[:,x_map1] + attw_lambda[:, r2] * attw)
 832 | 
 833 |     # compute the softmax probability
 834 |     # next_probs = tensor.nnet.softmax(logit_new)
 835 |     # next_probs = tensor.nnet.softmax(prob_1)
 836 |     # next_probs = next_probs[:,word_map]
 837 |     prob_sum = prob_1.sum(1).mean()
 838 |     next_probs = prob_1/prob_sum
 839 | 
 840 |     # sample from softmax distribution to get the sample
 841 |     next_sample = trng.multinomial(pvals=next_probs).argmax(1)
 842 | 
 843 |     # compile a function to do the whole thing above, next word probability,
 844 |     # sampled word for the next target, next hidden state to be used
 845 |     print 'Building f_next..',
 846 |     inps = [x_mask, x_map1, y, ctx, init_state, word_map]
 847 |     outs = [next_probs, next_sample, next_state]
 848 |     f_next = theano.function(inps, outs, name='f_next', profile=profile)
 849 |     print 'Done'
 850 | 
 851 |     return f_init, f_next, f_lambda
 852 | 
 853 | 
 854 | # generate sample, either with stochastic sampling or beam search. Note that,
 855 | # this function iteratively calls f_init and f_next functions.
 856 | def gen_sample(tparams, f_init, f_next, f_lambda, x, x_mask, x_map1, word_map, options, trng=None, k=1, maxlen=30,
 857 |                stochastic=True, argmax=False):
 858 | 
 859 |     # k is the beam size we have
 860 | 
 861 |     if k > 1:
 862 |         assert not stochastic, \
 863 |             'Beam search does not support stochastic sampling'
 864 | 
 865 |     sample = []
 866 |     sample_score = []
 867 |     sample_lambda = []
 868 |     if stochastic:
 869 |         sample_score = 0
 870 | 
 871 |     live_k = 1
 872 |     dead_k = 0
 873 | 
 874 |     hyp_samples = [[]] * live_k
 875 | 
 876 |     hyp_scores = numpy.zeros(live_k).astype('float32')
 877 |     hyp_scoresp = numpy.zeros(live_k).astype('float32')
 878 |     hyp_states = []
 879 | 
 880 |     # get initial state of decoder rnn and encoder context
 881 |     ret = f_init(x)
 882 |     next_state, ctx0 = ret[0], ret[1]
 883 |     next_w = -1 * numpy.ones((1,)).astype('int64')  # bos indicator
 884 | 
 885 |     for ii in xrange(maxlen):
 886 |         ctx = numpy.tile(ctx0, [live_k, 1])
 887 |         inps0 = [x_map1, next_w, ctx, next_state, word_map]
 888 |         inps = [x_mask, x_map1, next_w, ctx, next_state, word_map]
 889 |         # ttt = fftest(*inps)
 890 |         # ipdb.set_trace()
 891 |         lam = f_lambda(*inps0)
 892 |         ret = f_next(*inps)
 893 |         next_p, next_w0, next_state = ret[0], ret[1], ret[2]
 894 |         # ipdb.set_trace()
 895 |         next_w = numpy.array([word_map[next_w0[0]]])
 896 |         sample_lambda.append(lam)
 897 | 
 898 |         if stochastic:
 899 |             if argmax:
 900 |                 nw0 = next_p[0].argmax()
 901 |                 nw = word_map[next_p[0].argmax()]
 902 | 
 903 |             else:
 904 |                 nw0 = next_w0[0]
 905 |                 nw = next_w[0]
 906 | 
 907 |             sample.append(nw)
 908 | 
 909 |             sample_score -= numpy.log(next_p[0, nw0])
 910 | 
 911 |             if nw == 0:
 912 |                 break
 913 |         else:
 914 |             cand_scores = hyp_scores[:, None] - numpy.log(next_p)
 915 | 
 916 |             cand_flat = cand_scores.flatten()
 917 | 
 918 |             ranks_flat = cand_flat.argsort()[:(k-dead_k)]
 919 | 
 920 | 
 921 |             voc_size = next_p.shape[1]
 922 | 
 923 |             trans_indices = ranks_flat / voc_size
 924 | 
 925 |             word_indices = ranks_flat % voc_size
 926 | 
 927 |             costs = cand_flat[ranks_flat]
 928 | 
 929 | 
 930 |             new_hyp_samples = []
 931 | 
 932 |             new_hyp_scores = numpy.zeros(k-dead_k).astype('float32')
 933 | 
 934 |             new_hyp_states = []
 935 | 
 936 |             for idx, [ti, wi] in enumerate(zip(trans_indices, word_indices)):
 937 |                 new_hyp_samples.append(hyp_samples[ti]+[wi])
 938 |                 new_hyp_scores[idx] = copy.copy(costs[idx])
 939 |                 new_hyp_states.append(copy.copy(next_state[ti]))
 940 | 
 941 |             # check the finished samples
 942 |             new_live_k = 0
 943 |             hyp_samples = []
 944 |             hyp_scores = []
 945 |             hyp_states = []
 946 | 
 947 |             for idx in xrange(len(new_hyp_samples)):
 948 |                 if new_hyp_samples[idx][-1] == 0:
 949 |                     w_m = numpy.array(word_map)[new_hyp_samples[idx]]
 950 |                     sample.append(w_m)
 951 |                     # sample.append(new_hyp_samples[idx])
 952 |                     sample_score.append(new_hyp_scores[idx])
 953 |                     dead_k += 1
 954 |                 else:
 955 |                     new_live_k += 1
 956 |                     hyp_samples.append(new_hyp_samples[idx])
 957 |                     hyp_scores.append(new_hyp_scores[idx])
 958 |                     hyp_states.append(new_hyp_states[idx])
 959 |             hyp_scores = numpy.array(hyp_scores)
 960 |             live_k = new_live_k
 961 | 
 962 |             if new_live_k < 1:
 963 |                 break
 964 |             if dead_k >= k:
 965 |                 break
 966 | 
 967 |             # next_w = numpy.array([w[-1] for w in hyp_samples])
 968 |             next_w = numpy.array([word_map[w[-1]] for w in hyp_samples])
 969 |             next_state = numpy.array(hyp_states)
 970 | 
 971 |     if not stochastic:
 972 |         # dump every remaining one
 973 |         if live_k > 0:
 974 |             for idx in xrange(live_k):
 975 |                 w_m = numpy.array(word_map)[new_hyp_samples[idx]]
 976 |                 sample.append(w_m)
 977 |                 # sample.append(word_map[hyp_samples[idx]])
 978 |                 sample_score.append(hyp_scores[idx])
 979 | 
 980 |     return sample, sample_score, sample_lambda
 981 | 
 982 | 
 983 | # calculate the log probablities on a given corpus using translation model
 984 | def pred_probs(f_log_probs, prepare_data, options, iterator, word_map0, verbose=True):
 985 |     probs = []
 986 | 
 987 |     n_done = 0
 988 | 
 989 |     for x, y in iterator:
 990 |         n_done += len(x)
 991 | 
 992 |         x, x_mask, y, y_mask = prepare_data(x, y,
 993 |                                             n_words_src=options['n_words_src'],
 994 |                                             n_words=options['n_words'])
 995 | 
 996 |         word_map = list(set(list(x.reshape(x.shape[0]*x.shape[1]))+list(y.reshape(y.shape[0]*y.shape[1]))+word_map0))
 997 |         new_x = numpy.array([word_map.index(ii) for ii in x.reshape(x.shape[0]*x.shape[1])]).reshape(x.shape[0], x.shape[1])
 998 | 
 999 |         x_mask_for_attw = numpy.array([1 if jj !=0 else 0 for jj in x.flatten()], dtype='float32')
1000 |         x_mask_for_attw = x_mask_for_attw.reshape(x.shape)
1001 | 
1002 |         # x_map2 = new_x.T.flatten()
1003 |         x_map1 = new_x.T
1004 |         for iii in xrange(x_map1.shape[0]):
1005 |             x_map1[iii] += len(word_map)*iii
1006 |         x_map1 = x_map1.flatten()
1007 |         new_y = numpy.array([word_map.index(ii) for ii in y.reshape(y.shape[0]*y.shape[1])]).reshape(y.shape[0], y.shape[1])
1008 | 
1009 |         pprobs = f_log_probs(x, x_map1, x_mask, x_mask_for_attw, y, new_y, y_mask, numpy.array(word_map, dtype='int64'))
1010 |         for pp in pprobs:
1011 |             probs.append(pp)
1012 | 
1013 |         if numpy.isnan(numpy.mean(probs)):
1014 |             # ipdb.set_trace()
1015 |             print 1
1016 | 
1017 |         if verbose:
1018 |             print >>sys.stderr, '%d samples computed' % (n_done)
1019 | 
1020 |     return numpy.array(probs)
1021 | 
1022 | 
1023 | # optimizers
1024 | # name(hyperp, tparams, grads, inputs (list), cost) = f_grad_shared, f_update
1025 | def adam(lr, tparams, grads, inp, cost, beta1=0.9, beta2=0.999, e=1e-8):
1026 | 
1027 |     gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
1028 |                for k, p in tparams.iteritems()]
1029 |     gsup = [(gs, g) for gs, g in zip(gshared, grads)]
1030 | 
1031 |     f_grad_shared = theano.function(inp, cost, updates=gsup, profile=profile)
1032 | 
1033 |     updates = []
1034 | 
1035 |     t_prev = theano.shared(numpy.float32(0.))
1036 |     t = t_prev + 1.
1037 |     lr_t = lr * tensor.sqrt(1. - beta2**t) / (1. - beta1**t)
1038 | 
1039 |     for p, g in zip(tparams.values(), gshared):
1040 |         m = theano.shared(p.get_value() * 0., p.name + '_mean')
1041 |         v = theano.shared(p.get_value() * 0., p.name + '_variance')
1042 |         m_t = beta1 * m + (1. - beta1) * g
1043 |         v_t = beta2 * v + (1. - beta2) * g**2
1044 |         step = lr_t * m_t / (tensor.sqrt(v_t) + e)
1045 |         p_t = p - step
1046 |         updates.append((m, m_t))
1047 |         updates.append((v, v_t))
1048 |         updates.append((p, p_t))
1049 |     updates.append((t_prev, t))
1050 | 
1051 |     f_update = theano.function([lr], [], updates=updates,
1052 |                                on_unused_input='ignore', profile=profile)
1053 | 
1054 |     return f_grad_shared, f_update
1055 | 
1056 | 
1057 | def adadelta(lr, tparams, grads, inp, cost):
1058 |     zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
1059 |                                   name='%s_grad' % k)
1060 |                     for k, p in tparams.iteritems()]
1061 |     running_up2 = [theano.shared(p.get_value() * numpy.float32(0.),
1062 |                                  name='%s_rup2' % k)
1063 |                    for k, p in tparams.iteritems()]
1064 |     running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.),
1065 |                                     name='%s_rgrad2' % k)
1066 |                       for k, p in tparams.iteritems()]
1067 | 
1068 |     zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
1069 |     rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
1070 |              for rg2, g in zip(running_grads2, grads)]
1071 | 
1072 |     f_grad_shared = theano.function(inp, cost, updates=zgup+rg2up,
1073 |                                     profile=profile)
1074 | 
1075 |     updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
1076 |              for zg, ru2, rg2 in zip(zipped_grads, running_up2,
1077 |                                      running_grads2)]
1078 |     ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
1079 |              for ru2, ud in zip(running_up2, updir)]
1080 |     param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)]
1081 | 
1082 |     f_update = theano.function([lr], [], updates=ru2up+param_up,
1083 |                                on_unused_input='ignore', profile=profile)
1084 | 
1085 |     return f_grad_shared, f_update
1086 | 
1087 | 
1088 | def rmsprop(lr, tparams, grads, inp, cost):
1089 |     zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
1090 |                                   name='%s_grad' % k)
1091 |                     for k, p in tparams.iteritems()]
1092 |     running_grads = [theano.shared(p.get_value() * numpy.float32(0.),
1093 |                                    name='%s_rgrad' % k)
1094 |                      for k, p in tparams.iteritems()]
1095 |     running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.),
1096 |                                     name='%s_rgrad2' % k)
1097 |                       for k, p in tparams.iteritems()]
1098 | 
1099 |     zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
1100 |     rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
1101 |     rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
1102 |              for rg2, g in zip(running_grads2, grads)]
1103 | 
1104 |     f_grad_shared = theano.function(inp, cost, updates=zgup+rgup+rg2up,
1105 |                                     profile=profile)
1106 | 
1107 |     updir = [theano.shared(p.get_value() * numpy.float32(0.),
1108 |                            name='%s_updir' % k)
1109 |              for k, p in tparams.iteritems()]
1110 |     updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4))
1111 |                  for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads,
1112 |                                             running_grads2)]
1113 |     param_up = [(p, p + udn[1])
1114 |                 for p, udn in zip(itemlist(tparams), updir_new)]
1115 |     f_update = theano.function([lr], [], updates=updir_new+param_up,
1116 |                                on_unused_input='ignore', profile=profile)
1117 | 
1118 |     return f_grad_shared, f_update
1119 | 
1120 | 
1121 | def sgd(lr, tparams, grads, x, mask, y, cost):
1122 |     gshared = [theano.shared(p.get_value() * 0.,
1123 |                              name='%s_grad' % k)
1124 |                for k, p in tparams.iteritems()]
1125 |     gsup = [(gs, g) for gs, g in zip(gshared, grads)]
1126 | 
1127 |     f_grad_shared = theano.function([x, mask, y], cost, updates=gsup,
1128 |                                     profile=profile)
1129 | 
1130 |     pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)]
1131 |     f_update = theano.function([lr], [], updates=pup, profile=profile)
1132 | 
1133 |     return f_grad_shared, f_update
1134 | 
1135 | 
1136 | def train(dim_word=100,  # word vector dimensionality
1137 |           dim=1000,  # the number of LSTM units
1138 |           encoder='gru',
1139 |           decoder='gru_cond',
1140 |           patience=10,  # early stopping patience
1141 |           max_epochs=5000,
1142 |           finish_after=10000000,  # finish after this many updates
1143 |           dispFreq=100,
1144 |           decay_c=0.,  # L2 regularization penalty
1145 |           alpha_c=0.,  # alignment regularization
1146 |           clip_c=-1.,  # gradient clipping threshold
1147 |           lrate=0.01,  # learning rate
1148 |           n_words_src=100000,  # source vocabulary size
1149 |           n_words=100000,  # target vocabulary size
1150 |           maxlen=100,  # maximum length of the description
1151 |           optimizer='rmsprop',
1152 |           batch_size=16,
1153 |           valid_batch_size=16,
1154 |           saveto='model.npz',
1155 |           validFreq=1000,
1156 |           saveFreq=1000,   # save the parameters after every saveFreq updates
1157 |           sampleFreq=100,   # generate some samples after every sampleFreq
1158 |           datasets=[
1159 |               '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok',
1160 |               '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok'],
1161 |           valid_datasets=['../data/dev/newstest2011.en.tok',
1162 |                           '../data/dev/newstest2011.fr.tok'],
1163 |           dictionaries=[
1164 |               'data/worddicts.pkl',
1165 |               'data/dict2.txt'],
1166 |           use_dropout=False,
1167 |           reload_=False,
1168 |           overwrite=False,
1169 |           show_lambda = False):
1170 | 
1171 |     # Model options
1172 |     model_options = locals().copy()
1173 | 
1174 |     # load dictionaries and invert them
1175 | 
1176 |     with open(dictionaries[0], 'rb') as f:
1177 |         worddicts = pkl.load(f)
1178 |     worddicts_r = dict()
1179 |     for kk, vv in worddicts.iteritems():
1180 |         worddicts_r[vv] = kk
1181 | 
1182 |     word_map0 = []
1183 |     with open(dictionaries[-1]) as ff:
1184 |         for line in ff:
1185 |             line = line.strip()
1186 |             if line in worddicts:
1187 |                 if line not in word_map0 and worddicts[line]<n_words:
1188 |                     word_map0.append(worddicts[line])
1189 | 
1190 | 
1191 |     # reload options
1192 |     if reload_ and os.path.exists(saveto):
1193 |         print 'Reloading model options'
1194 |         with open('%s.pkl' % saveto, 'rb') as f:
1195 |             model_options = pkl.load(f)
1196 | 
1197 |     print 'Loading data'
1198 |     train = TextIterator(datasets[0], datasets[1],
1199 |                          dictionaries[0], dictionaries[0],
1200 |                          n_words_source=n_words, n_words_target=n_words,
1201 |                          batch_size=batch_size,
1202 |                          maxlen=maxlen)
1203 | 
1204 |     valid = TextIterator(valid_datasets[0], valid_datasets[1],
1205 |                          dictionaries[0], dictionaries[0],
1206 |                          n_words_source=n_words, n_words_target=n_words,
1207 |                          batch_size=valid_batch_size,
1208 |                          maxlen=maxlen)
1209 | 
1210 |     # ipdb.set_trace()
1211 | 
1212 |     print 'Building model'
1213 |     params = init_params(model_options)
1214 |     # reload parameters
1215 |     if reload_ and os.path.exists(saveto):
1216 |         print 'Reloading model parameters'
1217 |         params = load_params(saveto, params)
1218 | 
1219 |     tparams = init_tparams(params)
1220 | 
1221 |     trng, use_noise, \
1222 |         x, x_map1, x_mask, x_mask_for_attw, y, new_y, y_mask, \
1223 |         opt_ret, \
1224 |         cost, word_map, fucktest = \
1225 |         build_model(tparams, model_options)
1226 |     inps = [x, x_map1, x_mask, x_mask_for_attw, y, new_y, y_mask, word_map]
1227 | 
1228 |     # ftest = theano.function(inps, fucktest, on_unused_input='ignore')
1229 | 
1230 |     print 'Building sampler'
1231 |     f_init, f_next, f_lambda = build_sampler(tparams, model_options, trng, use_noise)
1232 | 
1233 |     # before any regularizer
1234 |     print 'Building f_log_probs...',
1235 |     f_log_probs = theano.function(inps, cost, profile=profile)
1236 |     print 'Done'
1237 | 
1238 |     cost = cost.mean()
1239 | 
1240 |     # apply L2 regularization on weights
1241 |     if decay_c > 0.:
1242 |         decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
1243 |         weight_decay = 0.
1244 |         for kk, vv in tparams.iteritems():
1245 |             weight_decay += (vv ** 2).sum()
1246 |         weight_decay *= decay_c
1247 |         cost += weight_decay
1248 | 
1249 |     # regularize the alpha weights
1250 |     if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
1251 |         alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
1252 |         alpha_reg = alpha_c * (
1253 |             (tensor.cast(y_mask.sum(0)//x_mask.sum(0), 'float32')[:, None] -
1254 |              opt_ret['dec_alphas'].sum(0))**2).sum(1).mean()
1255 |         cost += alpha_reg
1256 | 
1257 |     # after all regularizers - compile the computational graph for cost
1258 |     print 'Building f_cost...',
1259 |     f_cost = theano.function(inps, cost, profile=profile)
1260 |     print 'Done'
1261 | 
1262 |     print 'Computing gradient...',
1263 |     grads = tensor.grad(cost, wrt=itemlist(tparams))
1264 |     print 'Done'
1265 | 
1266 |     # apply gradient clipping here
1267 |     if clip_c > 0.:
1268 |         g2 = 0.
1269 |         for g in grads:
1270 |             g2 += (g**2).sum()
1271 |         new_grads = []
1272 |         for g in grads:
1273 |             new_grads.append(tensor.switch(g2 > (clip_c**2),
1274 |                                            g / tensor.sqrt(g2) * clip_c,
1275 |                                            g))
1276 |         grads = new_grads
1277 | 
1278 |     # compile the optimizer, the actual computational graph is compiled here
1279 |     lr = tensor.scalar(name='lr')
1280 |     print 'Building optimizers...',
1281 |     f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)
1282 |     print 'Done'
1283 | 
1284 |     print 'Optimization'
1285 | 
1286 |     best_p = None
1287 |     bad_counter = 0
1288 |     uidx = 0
1289 |     estop = False
1290 |     history_errs = []
1291 |     # reload history
1292 |     if reload_ and os.path.exists(saveto):
1293 |         rmodel = numpy.load(saveto)
1294 |         history_errs = list(rmodel['history_errs'])
1295 |         if 'uidx' in rmodel:
1296 |             uidx = rmodel['uidx']
1297 | 
1298 |     if validFreq == -1:
1299 |         validFreq = len(train[0])/batch_size
1300 |     if saveFreq == -1:
1301 |         saveFreq = len(train[0])/batch_size
1302 |     if sampleFreq == -1:
1303 |         sampleFreq = len(train[0])/batch_size
1304 | 
1305 |     for eidx in xrange(max_epochs):
1306 |         n_samples = 0
1307 | 
1308 |         for x, y in train:
1309 |             n_samples += len(x)
1310 |             if len(x) == 0:
1311 |                 continue
1312 |             uidx += 1
1313 |             use_noise.set_value(1.)
1314 | 
1315 |             x, x_mask, y, y_mask = prepare_data(x, y, maxlen=maxlen,
1316 |                                                 n_words_src=n_words_src,
1317 |                                                 n_words=n_words)
1318 | 
1319 |             if x is None:
1320 |                 print 'Minibatch with zero sample under length ', maxlen
1321 |                 uidx -= 1
1322 |                 continue
1323 |             word_map = list(set(list(x.reshape(x.shape[0]*x.shape[1]))+list(y.reshape(y.shape[0]*y.shape[1]))+word_map0))
1324 |             new_x = numpy.array([word_map.index(ii) for ii in x.reshape(x.shape[0]*x.shape[1])]).reshape(x.shape[0], x.shape[1])
1325 |             # x_map2 = new_x.T.flatten()
1326 |             x_map1 = new_x.T
1327 |             for iii in xrange(x_map1.shape[0]):
1328 |                 x_map1[iii] += len(word_map)*iii
1329 |             x_map1 = x_map1.flatten()
1330 | 
1331 |             new_y = numpy.array([word_map.index(ii) for ii in y.reshape(y.shape[0]*y.shape[1])]).reshape(y.shape[0], y.shape[1])
1332 |             # word_map3 = list(set(word_map+word_map0))
1333 | 
1334 |             x_mask_for_attw = numpy.array([1 if jij !=0 else 0 for jij in x.flatten()], dtype='float32')
1335 |             x_mask_for_attw = x_mask_for_attw.reshape(x.shape)
1336 | 
1337 |             ud_start = time.time()
1338 | 
1339 |             # ft1,ft2 = ftest(x, x_map1, x_mask, y, new_y, y_mask, numpy.array(word_map, dtype='int64'))
1340 |             # print ft1, ft2
1341 |             # ipdb.set_trace()
1342 | 
1343 |             # compute cost, grads and copy grads to shared variables
1344 |             # print 'fuck cost'
1345 |             cost = f_grad_shared(x, x_map1, x_mask, x_mask_for_attw, y, new_y, y_mask, numpy.array(word_map, dtype='int64'))
1346 | 
1347 |             # do the update on parameters
1348 |             f_update(lrate)
1349 | 
1350 |             ud = time.time() - ud_start
1351 | 
1352 |             # check for bad numbers, usually we remove non-finite elements
1353 |             # and continue training - but not done here
1354 |             if numpy.isnan(cost) or numpy.isinf(cost):
1355 |                 print 'NaN detected'
1356 |                 return 1., 1., 1.
1357 | 
1358 |             # verbose
1359 |             if numpy.mod(uidx, dispFreq) == 0:
1360 |                 print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud
1361 | 
1362 |             # save the best model so far, in addition, save the latest model
1363 |             # into a separate file with the iteration number for external eval
1364 |             if numpy.mod(uidx, saveFreq) == 0:
1365 |                 print 'Saving the best model...',
1366 |                 if best_p is not None:
1367 |                     params = best_p
1368 |                 else:
1369 |                     params = unzip(tparams)
1370 |                 numpy.savez(saveto, history_errs=history_errs, uidx=uidx, **params)
1371 |                 pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
1372 |                 print 'Done'
1373 | 
1374 |                 # save with uidx
1375 |                 if not overwrite:
1376 |                     print 'Saving the model at iteration {}...'.format(uidx),
1377 |                     saveto_uidx = '{}.iter{}.npz'.format(
1378 |                         os.path.splitext(saveto)[0], uidx)
1379 |                     numpy.savez(saveto_uidx, history_errs=history_errs,
1380 |                                 uidx=uidx, **unzip(tparams))
1381 |                     print 'Done'
1382 | 
1383 | 
1384 |             # generate some samples with the model and display them
1385 |             if numpy.mod(uidx, sampleFreq) == 0:
1386 |                 # FIXME: random selection?
1387 |                 for jj in xrange(numpy.minimum(5, x.shape[1])):
1388 |                     stochastic = True
1389 |                     input_x = x[:, jj][:, None]
1390 |                     word_map = list(set(list(x[:, jj][:, None].reshape(x[:, jj][:, None].shape[0]*x[:, jj][:, None].shape[1]))+word_map0+list(y[:, jj][:, None].reshape(y[:, jj][:, None].shape[0]*y[:, jj][:, None].shape[1]))))
1391 | 
1392 |                     new_x_input = numpy.array([word_map.index(ii) for ii in input_x.reshape(input_x.shape[0]*input_x.shape[1])]).reshape(input_x.shape[0], input_x.shape[1])
1393 |                     assert new_x_input.T.shape[0] == 1
1394 |                     sx_map= new_x_input.T.flatten()
1395 | 
1396 |                     gen_x_mask = numpy.array([1 if jjj[0] !=0 else 0 for jjj in input_x])
1397 | 
1398 | 
1399 |                     sample, score, lam = gen_sample(tparams, f_init, f_next, f_lambda,
1400 |                                                input_x, gen_x_mask, sx_map, word_map,
1401 |                                                model_options, trng=trng, k=1,
1402 |                                                maxlen=30,
1403 |                                                stochastic=stochastic,
1404 |                                                argmax=False)
1405 |                     print 'Source ', jj, ': ',
1406 |                     for vv in x[:, jj]:
1407 |                         if vv == 0:
1408 |                             break
1409 |                         if vv in worddicts_r:
1410 |                             print worddicts_r[vv],
1411 |                         else:
1412 |                             print 'UNK',
1413 |                     print
1414 |                     print 'Truth ', jj, ' : ',
1415 |                     for vv in y[:, jj]:
1416 |                         if vv == 0:
1417 |                             break
1418 |                         if vv in worddicts_r:
1419 |                             print worddicts_r[vv],
1420 |                         else:
1421 |                             print 'UNK',
1422 |                     print
1423 |                     print 'Sample ', jj, ': ',
1424 |                     if stochastic:
1425 |                         ss = sample
1426 |                     else:
1427 |                         score = score / numpy.array([len(s) for s in sample])
1428 |                         ss = sample[score.argmin()]
1429 |                     label = 0
1430 |                     for vv in ss:
1431 |                         if vv == 0:
1432 |                             print str(worddicts_r[vv])+ "#" + str("%.4f" %float(lam[label])),
1433 |                             break
1434 |                         if vv in worddicts_r:
1435 |                             if show_lambda:
1436 |                                 print str(worddicts_r[vv])+ "#" + str("%.4f" %float(lam[label])),
1437 |                                 label += 1
1438 |                             else:
1439 |                                 print worddicts_r[vv],
1440 |                         else:
1441 |                             print 'UNK',
1442 |                     print
1443 | 
1444 |             # validate model on validation set and early stop if necessary
1445 |             if numpy.mod(uidx, validFreq) == 0:
1446 |                 use_noise.set_value(0.)
1447 |                 valid_errs = pred_probs(f_log_probs, prepare_data,
1448 |                                         model_options, valid, word_map0)
1449 |                 valid_err = valid_errs.mean()
1450 |                 history_errs.append(valid_err)
1451 | 
1452 |                 if uidx == 0 or valid_err <= numpy.array(history_errs).min():
1453 |                     best_p = unzip(tparams)
1454 |                     bad_counter = 0
1455 |                 if len(history_errs) > patience and valid_err >= \
1456 |                         numpy.array(history_errs)[:-patience].min():
1457 |                     bad_counter += 1
1458 |                     if bad_counter > patience:
1459 |                         print 'Early Stop!'
1460 |                         estop = True
1461 |                         break
1462 | 
1463 |                 if numpy.isnan(valid_err):
1464 |                     # ipdb.set_trace()
1465 |                     print 1
1466 | 
1467 |                 print 'Valid ', valid_err
1468 | 
1469 |             # finish after this many updates
1470 |             if uidx >= finish_after:
1471 |                 print 'Finishing after %d iterations!' % uidx
1472 |                 estop = True
1473 |                 break
1474 |         # ipdb.set_trace()
1475 |         print 'Seen %d samples' % n_samples
1476 | 
1477 |         if estop:
1478 |             break
1479 | 
1480 |     if best_p is not None:
1481 |         zipp(best_p, tparams)
1482 | 
1483 |     use_noise.set_value(0.)
1484 |     valid_err = pred_probs(f_log_probs, prepare_data,
1485 |                            model_options, valid, word_map0).mean()
1486 | 
1487 |     print 'Valid ', valid_err
1488 | 
1489 |     params = copy.copy(best_p)
1490 |     numpy.savez(saveto, zipped_params=best_p,
1491 |                 history_errs=history_errs,
1492 |                 uidx=uidx,
1493 |                 **params)
1494 |     
1495 | 
1496 |     return valid_err
1497 | 
1498 | 
1499 | if __name__ == '__main__':
1500 |     pass
1501 | 


--------------------------------------------------------------------------------
/Att_CopyNet/Att_CopyNet_copy_supervision/build_dictionary.py:
--------------------------------------------------------------------------------
 1 | import numpy
 2 | import cPickle as pkl
 3 | 
 4 | import sys
 5 | import fileinput
 6 | 
 7 | from collections import OrderedDict
 8 | 
 9 | def main(f_list, dictname, is_pos_dict=False):
10 |     word_freqs = OrderedDict()
11 |     for filename in f_list:
12 |         print 'Processing', filename
13 |         with open(filename, 'r') as f:
14 |             for line in f:
15 |                 words_in = line.strip().split(' ')
16 |                 for w in words_in:
17 |                     if w not in word_freqs:
18 |                         word_freqs[w] = 0
19 |                     else:
20 |                         word_freqs[w] += 1
21 |     words = word_freqs.keys()
22 |     freqs = word_freqs.values()
23 | 
24 |     sorted_idx = numpy.argsort(freqs)
25 |     sorted_words = [words[ii] for ii in sorted_idx[::-1]]
26 | 
27 |     worddict = OrderedDict()
28 |     worddict['eos'] = 0
29 |     worddict['UNK'] = 1
30 |     kk = 2
31 |     if is_pos_dict:
32 |         worddict = OrderedDict()
33 |         worddict['eos'] = 0
34 |         kk=1
35 | 
36 |     for ii, ww in enumerate(sorted_words):
37 |         worddict[ww] = ii+kk
38 | 
39 |     pkl.dump(worddict, open('data_2/%s.pkl'%dictname, 'wb'), True)
40 |     print worddict
41 | 
42 |     print 'Done'
43 | 
44 | if __name__ == '__main__':
45 |     f_list2 = ['data_2/p.txt', 'data_2/r.txt']
46 |     main(f_list2, 'word_dict')
47 | 
48 | 


--------------------------------------------------------------------------------
/Att_CopyNet/Att_CopyNet_copy_supervision/data_iterator.py:
--------------------------------------------------------------------------------
 1 | import cPickle as pkl
 2 | import gzip
 3 | 
 4 | 
 5 | def fopen(filename, mode='r'):
 6 |     if filename.endswith('.gz'):
 7 |         return gzip.open(filename, mode)
 8 |     return open(filename, mode)
 9 | 
10 | 
11 | class TextIterator:
12 |     """Simple Bitext iterator."""
13 |     def __init__(self, source, target,
14 |                  source_dict, target_dict,
15 |                  batch_size=128,
16 |                  maxlen=100,
17 |                  n_words_source=-1,
18 |                  n_words_target=-1):
19 |         self.source = fopen(source, 'r')
20 |         self.target = fopen(target, 'r')
21 |         with open(source_dict, 'rb') as f:
22 |             self.source_dict = pkl.load(f)
23 |         with open(target_dict, 'rb') as f:
24 |             self.target_dict = pkl.load(f)
25 | 
26 |         self.batch_size = batch_size
27 |         self.maxlen = maxlen
28 | 
29 |         self.n_words_source = n_words_source
30 |         self.n_words_target = n_words_target
31 | 
32 |         self.end_of_data = False
33 | 
34 |     def __iter__(self):
35 |         return self
36 | 
37 |     def reset(self):
38 |         self.source.seek(0)
39 |         self.target.seek(0)
40 | 
41 |     def next(self):
42 |         if self.end_of_data:
43 |             self.end_of_data = False
44 |             self.reset()
45 |             raise StopIteration
46 | 
47 |         source = []
48 |         target = []
49 | 
50 |         try:
51 | 
52 |             # actual work here
53 |             while True:
54 | 
55 |                 # read from source file and map to word index
56 |                 ss = self.source.readline()
57 |                 if ss == "":
58 |                     raise IOError
59 |                 ss = ss.strip().split()
60 |                 ss = [self.source_dict[w] if w in self.source_dict else 1
61 |                       for w in ss]
62 |                 if self.n_words_source > 0:
63 |                     ss = [w if w < self.n_words_source else 1 for w in ss]
64 | 
65 |                 # read from source file and map to word index
66 |                 tt = self.target.readline()
67 |                 if tt == "":
68 |                     raise IOError
69 |                 tt = tt.strip().split()
70 |                 tt = [self.target_dict[w] if w in self.target_dict else 1
71 |                       for w in tt]
72 |                 if self.n_words_target > 0:
73 |                     tt = [w if w < self.n_words_target else 1 for w in tt]
74 | 
75 |                 if len(ss) > self.maxlen and len(tt) > self.maxlen:
76 |                     continue
77 | 
78 |                 source.append(ss)
79 |                 target.append(tt)
80 | 
81 |                 if len(source) >= self.batch_size or \
82 |                         len(target) >= self.batch_size:
83 |                     break
84 |         except IOError:
85 |             self.end_of_data = True
86 | 
87 |         if len(source) <= 0 or len(target) <= 0:
88 |             self.end_of_data = False
89 |             self.reset()
90 |             raise StopIteration
91 | 
92 |         return source, target
93 | 


--------------------------------------------------------------------------------
/Att_CopyNet/Att_CopyNet_copy_supervision/train.py:
--------------------------------------------------------------------------------
 1 | import numpy
 2 | import os
 3 | import cPickle
 4 | 
 5 | from Att_copy_s import train
 6 | 
 7 | def main(job_id, params):
 8 |     print params
 9 |     basedir = 'data_2'
10 |     validerr = train(saveto=params['model'][0],
11 |                                         reload_=params['reload'][0],
12 |                                         dim_word=params['dim_word'][0],
13 |                                         dim=params['dim'][0],
14 |                                         n_words=params['n-words'][0],
15 |                                         n_words_src=params['n-words'][0],
16 |                                         decay_c=params['decay-c'][0],
17 |                                         clip_c=params['clip-c'][0],
18 |                                         lrate=params['learning-rate'][0],
19 |                                         optimizer=params['optimizer'][0],
20 |                                         maxlen=100,
21 |                                         batch_size=32,
22 |                                         valid_batch_size=32,
23 |                     datasets=['%s/validation.s'%basedir,
24 |                     '%s/validation.t'%basedir],
25 |                     valid_datasets=['%s/validation.s'%basedir,
26 |                     '%s/validation.t'%basedir,],
27 |                     # dictionaries=['%s/p.txt.pkl'%basedir,
28 |                     # '%s/r.txt.pkl'%basedir],
29 |                     dictionaries=['%s/training.s.pkl'%basedir,'%s/commonwords.txt'%basedir],
30 |                                         validFreq=1000,
31 |                                         dispFreq=100,
32 |                                         saveFreq=1000,
33 |                                         sampleFreq=100,
34 |                                         use_dropout=params['use-dropout'][0],
35 |                                         overwrite=False,
36 |                                         show_lambda=True)
37 |     return validerr
38 | 
39 | if __name__ == '__main__':
40 |     # f = cPickle.load(open(r'data//p.txt.pkl'))
41 |     # print f
42 | 
43 |     """
44 |     datasets:
45 | 
46 |     dictionaries:
47 |     OrderedDict([('eos', 0), ('UNK', 1), ('b', 2), ('c', 3), ('a', 4)])
48 |     OrderedDict([('eos', 0), ('UNK', 1), ('B', 2), ('C', 3), ('A', 4)])
49 | 
50 |     """
51 |     basedir = 'data_2'
52 |     main(0, {
53 |         'model': ['%s/model/m.npz'%basedir],
54 |         'dim_word': [512],#word embedding dim
55 |         'dim': [512],     #hidden dim
56 |         'n-words': [10000],   #vocabulary size
57 |         'optimizer': ['rmsprop'],
58 |         'decay-c': [0.],
59 |         'clip-c': [1.],
60 |         'use-dropout': [False],
61 |         'learning-rate': [0.05],
62 |         'reload': [False]})
63 | 
64 | 
65 | 


--------------------------------------------------------------------------------
/Att_CopyNet/Att_CopyNet_copy_supervision/translate.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import argparse
  3 | import theano
  4 | import numpy
  5 | import cPickle as pkl
  6 | 
  7 | from nmt_word import (build_sampler, gen_sample, load_params, init_params, init_tparams)
  8 | 
  9 | from multiprocessing import Process, Queue
 10 | 
 11 | 
 12 | 
 13 | 
 14 | def translate_model(word_map0, queue, rqueue, pid, model, options, k, normalize, n_best):
 15 | 
 16 |     from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
 17 |     trng = RandomStreams(1234)
 18 |     use_noise = theano.shared(numpy.float32(0.))
 19 | 
 20 |     # allocate model parameters
 21 |     params = init_params(options)
 22 | 
 23 |     # load model parameters and set theano shared variables
 24 |     params = load_params(model, params)
 25 |     tparams = init_tparams(params)
 26 | 
 27 |     # word index
 28 |     f_init, f_next, f_lambda = build_sampler(tparams, options, trng, use_noise)
 29 | 
 30 |     def _translate(seq):
 31 |         xx = numpy.array(seq).reshape([len(seq), 1])
 32 |         word_map = list(set(list(xx.reshape(xx.shape[0]*xx.shape[1]))+word_map0))
 33 | 
 34 |         new_x_input = numpy.array([word_map.index(ii) for ii in xx.reshape(xx.shape[0]*xx.shape[1])]).reshape(xx.shape[0], xx.shape[1])
 35 |         sx_map = new_x_input.T.flatten()
 36 | 
 37 |         gen_x_mask = numpy.array([1 if jjj[0] !=0 else 0 for jjj in xx])
 38 |         # sample given an input sequence and obtain scores
 39 |         sample, score, _ = gen_sample(tparams, f_init, f_next, f_lambda,
 40 |                                    xx, gen_x_mask, sx_map, word_map,
 41 |                                    options, trng=trng, k=k, maxlen=200,
 42 |                                    stochastic=False, argmax=False)
 43 | 
 44 |         # normalize scores according to sequence lengths
 45 |         if normalize:
 46 |             lengths = numpy.array([len(s) for s in sample])
 47 |             score = score / lengths
 48 |         if n_best > 1:
 49 |             sidx = numpy.argsort(score)[:n_best]
 50 | 
 51 |         else:
 52 |             sidx = numpy.argmin(score)
 53 |         # return numpy.array(word_map)[sample[sidx]], numpy.array(score)[sidx]
 54 | 
 55 |         return numpy.array(sample)[sidx], numpy.array(score)[sidx]
 56 |         # return numpy.array(word_map)[sample[sidx]], numpy.array(score)[sidx]
 57 | 
 58 |     while True:
 59 |         req = queue.get()
 60 |         if req is None:
 61 |             break
 62 | 
 63 |         idx, x = req[0], req[1]
 64 |         print pid, '-', idx
 65 |         seq, scores = _translate(x)
 66 |         # print seq, scores
 67 | 
 68 |         rqueue.put((idx, seq, scores))
 69 | 
 70 |     # print tparams['att_lambda'].get_value()[0]
 71 | 
 72 |     return
 73 | 
 74 | 
 75 | def predict(model, dictionary, common_dictionary, source_file, saveto, k=5,
 76 |          normalize=False, n_process=5, chr_level=False, n_best=1):
 77 | 
 78 |     # load model model_options
 79 |     with open('%s.pkl' % model, 'rb') as f:
 80 |         options = pkl.load(f)
 81 | 
 82 |     # load source dictionary and invert
 83 |     with open(dictionary, 'rb') as f:
 84 |         word_dict = pkl.load(f)
 85 |     word_idict = dict()
 86 |     for kk, vv in word_dict.iteritems():
 87 |         word_idict[vv] = kk
 88 |     word_idict[0] = '<eos>'
 89 |     word_idict[1] = 'UNK'
 90 | 
 91 |     word_idict_trg = word_idict
 92 |     # load target dictionary and invert
 93 |     # with open(dictionary_target, 'rb') as f:
 94 |     #     word_dict_trg = pkl.load(f)
 95 |     # word_idict_trg = dict()
 96 |     # for kk, vv in word_dict_trg.iteritems():
 97 |     #     word_idict_trg[vv] = kk
 98 |     # word_idict_trg[0] = '<eos>'
 99 |     # word_idict_trg[1] = 'UNK'
100 | 
101 |     word_map0 = []
102 |     with open(common_dictionary) as ff:
103 |         for line in ff:
104 |             line = line.strip()
105 |             if line in word_dict:
106 |                 if line not in word_map0:
107 |                     word_map0.append(word_dict[line])
108 | 
109 | 
110 | 
111 |     # create input and output queues for processes
112 |     queue = Queue()
113 |     rqueue = Queue()
114 |     processes = [None] * n_process
115 |     for midx in xrange(n_process):
116 |         processes[midx] = Process(
117 |             target=translate_model,
118 |             args=(word_map0, queue, rqueue, midx, model, options, k, normalize, n_best))
119 |         processes[midx].start()
120 | 
121 |     # utility function
122 |     def _seqs2words(caps):
123 |         capsw = []
124 |         for cc in caps:
125 |             ww = []
126 |             for w in cc:
127 |                 if w == 0:
128 |                     break
129 |                 ww.append(word_idict_trg[w])
130 |             capsw.append(' '.join(ww))
131 |         return capsw
132 |     # def _seqs2words(caps):
133 |     #     capsw = []
134 |     #     attw = []
135 |     #     for cc in caps:
136 |     #         ww = []
137 |     #         www = []
138 |     #         label = 0
139 |     #         for w in cc:
140 |     #             if w == 0 and label != 0:
141 |     #                 break
142 |     #             elif w == 0:
143 |     #                 continue
144 |     #             label += 1
145 |     #             ww.append(word_idict_trg[w])
146 |     #             www.append(str(tparams['att_lambda'].get_value()[w]))
147 |     #         wwww = []
148 |     #         for aa, bb in zip(ww, www):
149 |     #             wwww.append(aa+'_'+bb)
150 |     #         # capsw.append(' '.join(ww))
151 |     #         capsw.append(' '.join(wwww))
152 |     #     return capsw
153 | 
154 |     def _send_jobs(fname):
155 |         with open(fname, 'r') as f:
156 |             for idx, line in enumerate(f):
157 |                 if chr_level:
158 |                     words = list(line.decode('utf-8').strip())
159 |                 else:
160 |                     words = line.strip().split()
161 |                 x = map(lambda w: word_dict[w] if w in word_dict else 1, words)
162 |                 x = map(lambda ii: ii if ii < options['n_words'] else 1, x)
163 |                 x += [0]
164 |                 queue.put((idx, x))
165 |         return idx+1
166 | 
167 |     def _finish_processes():
168 |         for midx in xrange(n_process):
169 |             queue.put(None)
170 | 
171 |     def _retrieve_jobs(n_samples):
172 |         trans = [None] * n_samples
173 |         scores = [None] * n_samples
174 |         for idx in xrange(n_samples):
175 |             resp = rqueue.get()
176 |             trans[resp[0]] = resp[1]
177 |             scores[resp[0]] = resp[2]
178 |             if numpy.mod(idx, 10) == 0:
179 |                 print 'Sample ', (idx+1), '/', n_samples, ' Done'
180 |         return trans, scores
181 | 
182 |     print 'Translating ', source_file, '...'
183 |     n_samples = _send_jobs(source_file)
184 |     trans, scores = _retrieve_jobs(n_samples)
185 |     _finish_processes()
186 | 
187 |     if n_best == 1:
188 |         trans = _seqs2words(trans)
189 |     else:
190 |         n_best_trans = []
191 |         for idx, (n_best_tr, score_) in enumerate(zip(trans, scores)):
192 |             sentences = _seqs2words(n_best_tr)
193 |             for ids, trans_ in enumerate(sentences):
194 |                 n_best_trans.append(
195 |                     '|||'.join(
196 |                         ['{}'.format(idx), trans_,
197 |                          '{}'.format(score_[ids])]))
198 |         trans = n_best_trans
199 | 
200 |     with open(saveto, 'w') as f:
201 |         print >>f, '\n'.join(trans)
202 |     print 'Done'
203 | 
204 | 
205 | if __name__ == "__main__":
206 |     parser = argparse.ArgumentParser()
207 |     parser.add_argument('-k', type=int, default=5, help="Beam size")
208 |     parser.add_argument('-p', type=int, default=5, help="Number of processes")
209 |     parser.add_argument('-n', action="store_true", default=False,
210 |                         help="Normalize wrt sequence length")
211 |     parser.add_argument('-c', action="store_true", default=False,
212 |                         help="Character level")
213 |     parser.add_argument('-b', type=int, default=1, help="Output n-best list")
214 |     parser.add_argument('model', type=str)
215 |     parser.add_argument('dictionary', type=str)
216 |     parser.add_argument('common_dictionary', type=str)
217 |     parser.add_argument('source', type=str)
218 |     parser.add_argument('saveto', type=str)
219 | 
220 |     args = parser.parse_args()
221 | 
222 |     main(args.model, args.dictionary, args.common_dictionary, args.source,
223 |          args.saveto, k=args.k, normalize=args.n, n_process=args.p,
224 |          chr_level=args.c, n_best=args.b)
225 | 


--------------------------------------------------------------------------------
/Att_CopyNet/Att_CopyNet_copy_supervision/translate_Windows.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Translates a source file using a translation model.
 3 | '''
 4 | import translate as TTT
 5 | 
 6 | if __name__ == '__main__':
 7 |     TTT.predict(r'data_2/model/m.npz', r'data_2/word_dict.pkl', r'data_2/dict2.txt', r'data_2/p.txt', r'data_2/ttt.txt', k=5, n_process=1)
 8 | 
 9 | 
10 | 


--------------------------------------------------------------------------------
/Att_CopyNet/README.md:
--------------------------------------------------------------------------------
1 | # Attention_CopyNet
2 | 
3 | 


--------------------------------------------------------------------------------
/Att_CopyNet/build_dictionary.py:
--------------------------------------------------------------------------------
 1 | #-*- coding:utf-8 -*-
 2 | ############################################
 3 | #
 4 | #    Author: Chuwei Luo
 5 | #    Email: luochuwei@gmail.com
 6 | #    Date: 15/08/2016
 7 | #    Usage: build dict
 8 | #
 9 | ############################################
10 | import numpy
11 | import cPickle as pkl
12 | 
13 | import sys
14 | import fileinput
15 | 
16 | from collections import OrderedDict
17 | 
18 | def main(f_list, dictname, is_pos_dict=False):
19 |     word_freqs = OrderedDict()
20 |     for filename in f_list:
21 |         print 'Processing', filename
22 |         with open(filename, 'r') as f:
23 |             for line in f:
24 |                 words_in = line.strip().split(' ')
25 |                 for w in words_in:
26 |                     if w not in word_freqs:
27 |                         word_freqs[w] = 0
28 |                     else:
29 |                         word_freqs[w] += 1
30 |     words = word_freqs.keys()
31 |     freqs = word_freqs.values()
32 | 
33 |     sorted_idx = numpy.argsort(freqs)
34 |     sorted_words = [words[ii] for ii in sorted_idx[::-1]]
35 | 
36 |     worddict = OrderedDict()
37 |     worddict['eos'] = 0
38 |     worddict['UNK'] = 1
39 |     kk = 2
40 |     if is_pos_dict:
41 |         worddict = OrderedDict()
42 |         worddict['eos'] = 0
43 |         kk=1
44 | 
45 |     for ii, ww in enumerate(sorted_words):
46 |         worddict[ww] = ii+kk
47 | 
48 |     pkl.dump(worddict, open('data_2/%s.pkl'%dictname, 'wb'), True)
49 |     print worddict
50 | 
51 |     print 'Done'
52 | 
53 | if __name__ == '__main__':
54 |     f_list2 = ['data_2/p.txt', 'data_2/r.txt']
55 |     main(f_list2, 'word_dict')
56 | 
57 | 


--------------------------------------------------------------------------------
/Att_CopyNet/data_2/dict2.txt:
--------------------------------------------------------------------------------
1 | a
2 | 


--------------------------------------------------------------------------------
/Att_CopyNet/data_2/p.txt:
--------------------------------------------------------------------------------
1 | a b
2 | c a d
3 | a
4 | b
5 | c
6 | d
7 | 


--------------------------------------------------------------------------------
/Att_CopyNet/data_2/r.txt:
--------------------------------------------------------------------------------
1 | c
2 | d c
3 | d b
4 | c a d
5 | b
6 | a
7 | 


--------------------------------------------------------------------------------
/Att_CopyNet/data_2/ttt.txt:
--------------------------------------------------------------------------------
1 | a b
2 | c a d
3 | a
4 | b
5 | c
6 | d
7 | 


--------------------------------------------------------------------------------
/Att_CopyNet/data_2/word_dict.pkl:
--------------------------------------------------------------------------------
1 | ccollections
2 | OrderedDict
3 | q(]q(]q(UeosqK e]q(UUNKqKe]q(UdKe]q(UcKe]q	(UaKe]q
4 | (UbKeetRq.


--------------------------------------------------------------------------------
/Att_CopyNet/data_iterator.py:
--------------------------------------------------------------------------------
 1 | import cPickle as pkl
 2 | import gzip
 3 | 
 4 | 
 5 | def fopen(filename, mode='r'):
 6 |     if filename.endswith('.gz'):
 7 |         return gzip.open(filename, mode)
 8 |     return open(filename, mode)
 9 | 
10 | 
11 | class TextIterator:
12 |     """Simple Bitext iterator."""
13 |     def __init__(self, source, target,
14 |                  source_dict, target_dict,
15 |                  batch_size=128,
16 |                  maxlen=100,
17 |                  n_words_source=-1,
18 |                  n_words_target=-1):
19 |         self.source = fopen(source, 'r')
20 |         self.target = fopen(target, 'r')
21 |         with open(source_dict, 'rb') as f:
22 |             self.source_dict = pkl.load(f)
23 |         with open(target_dict, 'rb') as f:
24 |             self.target_dict = pkl.load(f)
25 | 
26 |         self.batch_size = batch_size
27 |         self.maxlen = maxlen
28 | 
29 |         self.n_words_source = n_words_source
30 |         self.n_words_target = n_words_target
31 | 
32 |         self.end_of_data = False
33 | 
34 |     def __iter__(self):
35 |         return self
36 | 
37 |     def reset(self):
38 |         self.source.seek(0)
39 |         self.target.seek(0)
40 | 
41 |     def next(self):
42 |         if self.end_of_data:
43 |             self.end_of_data = False
44 |             self.reset()
45 |             raise StopIteration
46 | 
47 |         source = []
48 |         target = []
49 | 
50 |         try:
51 | 
52 |             # actual work here
53 |             while True:
54 | 
55 |                 # read from source file and map to word index
56 |                 ss = self.source.readline()
57 |                 if ss == "":
58 |                     raise IOError
59 |                 ss = ss.strip().split()
60 |                 ss = [self.source_dict[w] if w in self.source_dict else 1
61 |                       for w in ss]
62 |                 if self.n_words_source > 0:
63 |                     ss = [w if w < self.n_words_source else 1 for w in ss]
64 | 
65 |                 # read from source file and map to word index
66 |                 tt = self.target.readline()
67 |                 if tt == "":
68 |                     raise IOError
69 |                 tt = tt.strip().split()
70 |                 tt = [self.target_dict[w] if w in self.target_dict else 1
71 |                       for w in tt]
72 |                 if self.n_words_target > 0:
73 |                     tt = [w if w < self.n_words_target else 1 for w in tt]
74 | 
75 |                 if len(ss) > self.maxlen and len(tt) > self.maxlen:
76 |                     continue
77 | 
78 |                 source.append(ss)
79 |                 target.append(tt)
80 | 
81 |                 if len(source) >= self.batch_size or \
82 |                         len(target) >= self.batch_size:
83 |                     break
84 |         except IOError:
85 |             self.end_of_data = True
86 | 
87 |         if len(source) <= 0 or len(target) <= 0:
88 |             self.end_of_data = False
89 |             self.reset()
90 |             raise StopIteration
91 | 
92 |         return source, target
93 | 


--------------------------------------------------------------------------------
/Att_CopyNet/predict.py:
--------------------------------------------------------------------------------
  1 | #-*- coding:utf-8 -*-
  2 | ############################################
  3 | #
  4 | #    Author: Chuwei Luo
  5 | #    Email: luochuwei@gmail.com
  6 | #    Date: 15/08/2016
  7 | #    Usage: for testing
  8 | #
  9 | ############################################
 10 | import argparse
 11 | import theano
 12 | import numpy
 13 | import cPickle as pkl
 14 | 
 15 | from AttCopy import (build_sampler, gen_sample, load_params, init_params, init_tparams)
 16 | 
 17 | from multiprocessing import Process, Queue
 18 | 
 19 | 
 20 | 
 21 | 
 22 | def translate_model(word_map0, queue, rqueue, pid, model, options, k, normalize, n_best):
 23 | 
 24 |     from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
 25 |     trng = RandomStreams(1234)
 26 |     use_noise = theano.shared(numpy.float32(0.))
 27 | 
 28 |     # allocate model parameters
 29 |     params = init_params(options)
 30 | 
 31 |     # load model parameters and set theano shared variables
 32 |     params = load_params(model, params)
 33 |     tparams = init_tparams(params)
 34 | 
 35 |     # word index
 36 |     f_init, f_next, f_lambda = build_sampler(tparams, options, trng, use_noise)
 37 | 
 38 |     def _translate(seq):
 39 |         xx = numpy.array(seq).reshape([len(seq), 1])
 40 |         word_map = list(set(list(xx.reshape(xx.shape[0]*xx.shape[1]))+word_map0))
 41 | 
 42 |         new_x_input = numpy.array([word_map.index(ii) for ii in xx.reshape(xx.shape[0]*xx.shape[1])]).reshape(xx.shape[0], xx.shape[1])
 43 |         sx_map = new_x_input.T.flatten()
 44 |         # sample given an input sequence and obtain scores
 45 |         sample, score, _ = gen_sample(tparams, f_init, f_next, f_lambda,
 46 |                                    xx, sx_map, word_map,
 47 |                                    options, trng=trng, k=k, maxlen=200,
 48 |                                    stochastic=False, argmax=False)
 49 | 
 50 |         # normalize scores according to sequence lengths
 51 |         if normalize:
 52 |             lengths = numpy.array([len(s) for s in sample])
 53 |             score = score / lengths
 54 |         if n_best > 1:
 55 |             sidx = numpy.argsort(score)[:n_best]
 56 | 
 57 |         else:
 58 |             sidx = numpy.argmin(score)
 59 |         # return numpy.array(word_map)[sample[sidx]], numpy.array(score)[sidx]
 60 | 
 61 |         return numpy.array(sample)[sidx], numpy.array(score)[sidx]
 62 |         # return numpy.array(word_map)[sample[sidx]], numpy.array(score)[sidx]
 63 | 
 64 |     while True:
 65 |         req = queue.get()
 66 |         if req is None:
 67 |             break
 68 | 
 69 |         idx, x = req[0], req[1]
 70 |         print pid, '-', idx
 71 |         seq, scores = _translate(x)
 72 |         # print seq, scores
 73 | 
 74 |         rqueue.put((idx, seq, scores))
 75 | 
 76 |     # print tparams['att_lambda'].get_value()[0]
 77 | 
 78 |     return
 79 | 
 80 | 
 81 | def predict(model, dictionary, common_dictionary, source_file, saveto, k=5,
 82 |          normalize=False, n_process=5, chr_level=False, n_best=1):
 83 | 
 84 |     # load model model_options
 85 |     with open('%s.pkl' % model, 'rb') as f:
 86 |         options = pkl.load(f)
 87 | 
 88 |     # load source dictionary and invert
 89 |     with open(dictionary, 'rb') as f:
 90 |         word_dict = pkl.load(f)
 91 |     word_idict = dict()
 92 |     for kk, vv in word_dict.iteritems():
 93 |         word_idict[vv] = kk
 94 |     word_idict[0] = '<eos>'
 95 |     word_idict[1] = 'UNK'
 96 | 
 97 |     word_idict_trg = word_idict
 98 |     # load target dictionary and invert
 99 |     # with open(dictionary_target, 'rb') as f:
100 |     #     word_dict_trg = pkl.load(f)
101 |     # word_idict_trg = dict()
102 |     # for kk, vv in word_dict_trg.iteritems():
103 |     #     word_idict_trg[vv] = kk
104 |     # word_idict_trg[0] = '<eos>'
105 |     # word_idict_trg[1] = 'UNK'
106 | 
107 |     word_map0 = []
108 |     with open(common_dictionary) as ff:
109 |         for line in ff:
110 |             line = line.strip()
111 |             if line in word_dict:
112 |                 if line not in word_map0:
113 |                     word_map0.append(word_dict[line])
114 | 
115 | 
116 | 
117 |     # create input and output queues for processes
118 |     queue = Queue()
119 |     rqueue = Queue()
120 |     processes = [None] * n_process
121 |     for midx in xrange(n_process):
122 |         processes[midx] = Process(
123 |             target=translate_model,
124 |             args=(word_map0, queue, rqueue, midx, model, options, k, normalize, n_best))
125 |         processes[midx].start()
126 | 
127 |     # utility function
128 |     def _seqs2words(caps):
129 |         capsw = []
130 |         for cc in caps:
131 |             ww = []
132 |             for w in cc:
133 |                 if w == 0:
134 |                     break
135 |                 ww.append(word_idict_trg[w])
136 |             capsw.append(' '.join(ww))
137 |         return capsw
138 |     # def _seqs2words(caps):
139 |     #     capsw = []
140 |     #     attw = []
141 |     #     for cc in caps:
142 |     #         ww = []
143 |     #         www = []
144 |     #         label = 0
145 |     #         for w in cc:
146 |     #             if w == 0 and label != 0:
147 |     #                 break
148 |     #             elif w == 0:
149 |     #                 continue
150 |     #             label += 1
151 |     #             ww.append(word_idict_trg[w])
152 |     #             www.append(str(tparams['att_lambda'].get_value()[w]))
153 |     #         wwww = []
154 |     #         for aa, bb in zip(ww, www):
155 |     #             wwww.append(aa+'_'+bb)
156 |     #         # capsw.append(' '.join(ww))
157 |     #         capsw.append(' '.join(wwww))
158 |     #     return capsw
159 | 
160 |     def _send_jobs(fname):
161 |         with open(fname, 'r') as f:
162 |             for idx, line in enumerate(f):
163 |                 if chr_level:
164 |                     words = list(line.decode('utf-8').strip())
165 |                 else:
166 |                     words = line.strip().split()
167 |                 x = map(lambda w: word_dict[w] if w in word_dict else 1, words)
168 |                 x = map(lambda ii: ii if ii < options['n_words'] else 1, x)
169 |                 x += [0]
170 |                 queue.put((idx, x))
171 |         return idx+1
172 | 
173 |     def _finish_processes():
174 |         for midx in xrange(n_process):
175 |             queue.put(None)
176 | 
177 |     def _retrieve_jobs(n_samples):
178 |         trans = [None] * n_samples
179 |         scores = [None] * n_samples
180 |         for idx in xrange(n_samples):
181 |             resp = rqueue.get()
182 |             trans[resp[0]] = resp[1]
183 |             scores[resp[0]] = resp[2]
184 |             if numpy.mod(idx, 10) == 0:
185 |                 print 'Sample ', (idx+1), '/', n_samples, ' Done'
186 |         return trans, scores
187 | 
188 |     print 'Translating ', source_file, '...'
189 |     n_samples = _send_jobs(source_file)
190 |     trans, scores = _retrieve_jobs(n_samples)
191 |     _finish_processes()
192 | 
193 |     if n_best == 1:
194 |         trans = _seqs2words(trans)
195 |     else:
196 |         n_best_trans = []
197 |         for idx, (n_best_tr, score_) in enumerate(zip(trans, scores)):
198 |             sentences = _seqs2words(n_best_tr)
199 |             for ids, trans_ in enumerate(sentences):
200 |                 n_best_trans.append(
201 |                     '|||'.join(
202 |                         ['{}'.format(idx), trans_,
203 |                          '{}'.format(score_[ids])]))
204 |         trans = n_best_trans
205 | 
206 |     with open(saveto, 'w') as f:
207 |         print >>f, '\n'.join(trans)
208 |     print 'Done'
209 | 
210 | 
211 | if __name__ == "__main__":
212 |     parser = argparse.ArgumentParser()
213 |     parser.add_argument('-k', type=int, default=5, help="Beam size")
214 |     parser.add_argument('-p', type=int, default=5, help="Number of processes")
215 |     parser.add_argument('-n', action="store_true", default=False,
216 |                         help="Normalize wrt sequence length")
217 |     parser.add_argument('-c', action="store_true", default=False,
218 |                         help="Character level")
219 |     parser.add_argument('-b', type=int, default=1, help="Output n-best list")
220 |     parser.add_argument('model', type=str)
221 |     parser.add_argument('dictionary', type=str)
222 |     parser.add_argument('common_dictionary', type=str)
223 |     parser.add_argument('source', type=str)
224 |     parser.add_argument('saveto', type=str)
225 | 
226 |     args = parser.parse_args()
227 | 
228 |     main(args.model, args.dictionary, args.common_dictionary, args.source,
229 |          args.saveto, k=args.k, normalize=args.n, n_process=args.p,
230 |          chr_level=args.c, n_best=args.b)
231 | 


--------------------------------------------------------------------------------
/Att_CopyNet/predict_windows.py:
--------------------------------------------------------------------------------
 1 | #-*- coding:utf-8 -*-
 2 | ############################################
 3 | #
 4 | #    Author: Chuwei Luo
 5 | #    Email: luochuwei@gmail.com
 6 | #    Date: 15/08/2016
 7 | #    Usage: for testing in Windows
 8 | #
 9 | ############################################
10 | import translate as TTT
11 | 
12 | if __name__ == '__main__':
13 |     TTT.predict(r'data_2/model/m.npz', r'data_2/word_dict.pkl', r'data_2/dict2.txt', r'data_2/p.txt', r'data_2/ttt.txt', k=5, n_process=1)
14 | 
15 | 
16 | 


--------------------------------------------------------------------------------
/Att_CopyNet/train.py:
--------------------------------------------------------------------------------
 1 | #-*- coding:utf-8 -*-
 2 | ############################################
 3 | #
 4 | #    Author: Chuwei Luo
 5 | #    Email: luochuwei@gmail.com
 6 | #    Date: 15/08/2016
 7 | #    Usage: for training
 8 | #
 9 | ############################################
10 | 
11 | import numpy
12 | import os
13 | import cPickle
14 | 
15 | from AttCopy import train
16 | 
17 | def main(job_id, params):
18 |     print params
19 |     basedir = 'data_2'
20 |     validerr = train(saveto=params['model'][0],
21 |                                         reload_=params['reload'][0],
22 |                                         dim_word=params['dim_word'][0],
23 |                                         dim=params['dim'][0],
24 |                                         n_words=params['n-words'][0],
25 |                                         n_words_src=params['n-words'][0],
26 |                                         decay_c=params['decay-c'][0],
27 |                                         clip_c=params['clip-c'][0],
28 |                                         lrate=params['learning-rate'][0],
29 |                                         optimizer=params['optimizer'][0],
30 |                                         maxlen=100,
31 |                                         batch_size=32,
32 |                                         valid_batch_size=32,
33 |                     datasets=['%s/p.txt'%basedir,
34 |                     '%s/p.txt'%basedir],
35 |                     valid_datasets=['%s/p.txt'%basedir,
36 |                     '%s/p.txt'%basedir,],
37 |                     # dictionaries=['%s/p.txt.pkl'%basedir,
38 |                     # '%s/r.txt.pkl'%basedir],
39 |                     dictionaries=['%s/word_dict.pkl'%basedir,'%s/dict2.txt'%basedir],
40 |                                         validFreq=100,
41 |                                         dispFreq=1,
42 |                                         saveFreq=100,
43 |                                         sampleFreq=1,
44 |                                         use_dropout=params['use-dropout'][0],
45 |                                         overwrite=False,
46 |                                         show_lambda=False)
47 |     return validerr
48 | 
49 | if __name__ == '__main__':
50 |     # f = cPickle.load(open(r'data//p.txt.pkl'))
51 |     # print f
52 | 
53 |     """
54 |     datasets:
55 | 
56 |     dictionaries:
57 |     OrderedDict([('eos', 0), ('UNK', 1), ('b', 2), ('c', 3), ('a', 4)])
58 |     OrderedDict([('eos', 0), ('UNK', 1), ('B', 2), ('C', 3), ('A', 4)])
59 | 
60 |     """
61 |     basedir = 'data_2'
62 |     main(0, {
63 |         'model': ['%s/model/m.npz'%basedir],
64 |         'dim_word': [100],#word embedding dim
65 |         'dim': [100],     #hidden dim
66 |         'n-words': [6],   #vocabulary size
67 |         'optimizer': ['rmsprop'],
68 |         'decay-c': [0.],
69 |         'clip-c': [1.],
70 |         'use-dropout': [False],
71 |         'learning-rate': [0.1],
72 |         'reload': [False]})
73 | 
74 | 
75 | 


--------------------------------------------------------------------------------
/Att_POS_CopyNet/README.md:
--------------------------------------------------------------------------------
1 | # Attention_POS_CopyNet
2 | 
3 | 


--------------------------------------------------------------------------------
/Att_POS_CopyNet/build_dictionary.py:
--------------------------------------------------------------------------------
 1 | import numpy
 2 | import cPickle as pkl
 3 | 
 4 | import sys
 5 | import fileinput
 6 | 
 7 | from collections import OrderedDict
 8 | 
 9 | def main(f_list, dictname, is_pos_dict=False):
10 |     word_freqs = OrderedDict()
11 |     for filename in f_list:
12 |         print 'Processing', filename
13 |         with open(filename, 'r') as f:
14 |             for line in f:
15 |                 words_in = line.strip().split(' ')
16 |                 for w in words_in:
17 |                     if w not in word_freqs:
18 |                         word_freqs[w] = 0
19 |                     else:
20 |                         word_freqs[w] += 1
21 |     words = word_freqs.keys()
22 |     freqs = word_freqs.values()
23 | 
24 |     sorted_idx = numpy.argsort(freqs)
25 |     sorted_words = [words[ii] for ii in sorted_idx[::-1]]
26 | 
27 |     worddict = OrderedDict()
28 |     worddict['eos'] = 0
29 |     worddict['UNK'] = 1
30 |     kk = 2
31 |     if is_pos_dict:
32 |         worddict = OrderedDict()
33 |         worddict['eos'] = 0
34 |         kk=1
35 | 
36 |     for ii, ww in enumerate(sorted_words):
37 |         worddict[ww] = ii+kk
38 | 
39 |     pkl.dump(worddict, open('data_2/%s.pkl'%dictname, 'wb'), True)
40 |     print worddict
41 | 
42 |     print 'Done'
43 | 
44 | if __name__ == '__main__':
45 |     f_list1 = ['data_2/p_pos.txt', 'data_2/r_pos.txt']
46 |     main(f_list1, 'pos_dict', is_pos_dict=True)
47 | 
48 |     f_list2 = ['data_2/p.txt', 'data_2/r.txt']
49 |     main(f_list2, 'word_dict')
50 | 
51 | 


--------------------------------------------------------------------------------
/Att_POS_CopyNet/data_2/dict2.txt:
--------------------------------------------------------------------------------
1 | a
2 | b
3 | d


--------------------------------------------------------------------------------
/Att_POS_CopyNet/data_2/p.txt:
--------------------------------------------------------------------------------
1 | a b
2 | c a d
3 | a
4 | b
5 | c
6 | d
7 | 


--------------------------------------------------------------------------------
/Att_POS_CopyNet/data_2/p_pos.txt:
--------------------------------------------------------------------------------
1 | 1 2
2 | 3 1 4
3 | 1
4 | 2
5 | 3
6 | 4
7 | 


--------------------------------------------------------------------------------
/Att_POS_CopyNet/data_2/pos_dict.pkl:
--------------------------------------------------------------------------------
1 | ccollections
2 | OrderedDict
3 | q(]q(]q(UeosqK e]q(U4Ke]q(U3Ke]q(U1Ke]q(U2KeetRq	.


--------------------------------------------------------------------------------
/Att_POS_CopyNet/data_2/r.txt:
--------------------------------------------------------------------------------
1 | c d
2 | d c a


--------------------------------------------------------------------------------
/Att_POS_CopyNet/data_2/r_pos.txt:
--------------------------------------------------------------------------------
1 | 3 4
2 | 4 3 1


--------------------------------------------------------------------------------
/Att_POS_CopyNet/data_2/word_dict.pkl:
--------------------------------------------------------------------------------
1 | ccollections
2 | OrderedDict
3 | q(]q(]q(UeosqK e]q(UUNKqKe]q(UdKe]q(UcKe]q	(UaKe]q
4 | (UbKeetRq.


--------------------------------------------------------------------------------
/Att_POS_CopyNet/data_iterator_for_pos.py:
--------------------------------------------------------------------------------
  1 | #-*- coding:utf-8 -*-
  2 | #######################################################
  3 | #
  4 | #    Author: Chuwei Luo
  5 | #    Email: luochuwei@gmail.com
  6 | #    Date: 03/08/2016
  7 | #    Usage: text iterator for pos
  8 | #
  9 | #######################################################
 10 | 
 11 | import cPickle as pkl
 12 | import gzip
 13 | 
 14 | 
 15 | def fopen(filename, mode='r'):
 16 |     if filename.endswith('.gz'):
 17 |         return gzip.open(filename, mode)
 18 |     return open(filename, mode)
 19 | 
 20 | 
 21 | class TextIterator:
 22 |     """Simple Bitext iterator."""
 23 |     def __init__(self, source, target, source_pos, target_pos,
 24 |                  word_dic, pos_dic,
 25 |                  batch_size=128,
 26 |                  maxlen=100,
 27 |                  n_words=-1,
 28 |                  n_pos=-1):
 29 |         self.source = fopen(source, 'r')
 30 |         self.source_pos = fopen(source_pos, 'r')
 31 |         self.target = fopen(target, 'r')
 32 |         self.target_pos = fopen(target_pos, 'r')
 33 |         with open(word_dic, 'rb') as f:
 34 |             self.word_dic = pkl.load(f)
 35 |         with open(pos_dic, 'rb') as f:
 36 |             self.pos_dic = pkl.load(f)
 37 | 
 38 |         self.batch_size = batch_size
 39 |         self.maxlen = maxlen
 40 | 
 41 |         self.n_words = n_words
 42 |         self.n_pos = n_pos
 43 | 
 44 |         self.end_of_data = False
 45 | 
 46 |     def __iter__(self):
 47 |         return self
 48 | 
 49 |     def reset(self):
 50 |         self.source.seek(0)
 51 |         self.source_pos.seek(0)
 52 |         self.target.seek(0)
 53 |         self.target_pos.seek(0)
 54 | 
 55 |     def next(self):
 56 |         if self.end_of_data:
 57 |             self.end_of_data = False
 58 |             self.reset()
 59 |             raise StopIteration
 60 | 
 61 |         source = []
 62 |         source_pos = []
 63 |         target = []
 64 |         target_pos = []
 65 | 
 66 |         try:
 67 | 
 68 |             # actual work here
 69 |             while True:
 70 | 
 71 |                 # read from source file and map to word index
 72 |                 ss = self.source.readline()
 73 |                 if ss == "":
 74 |                     raise IOError
 75 |                 ss = ss.strip().split()
 76 |                 ss = [self.word_dic[w] if w in self.word_dic else 1
 77 |                       for w in ss]
 78 |                 if self.n_words > 0:
 79 |                     ss = [w if w < self.n_words else 1 for w in ss]
 80 | 
 81 |                 ssp = self.source_pos.readline()
 82 |                 if ssp == "":
 83 |                     raise IOError
 84 |                 ssp = ssp.strip().split()
 85 |                 ssp = [self.pos_dic[w] if w in self.pos_dic else 1
 86 |                       for w in ssp]
 87 |                 if self.n_pos > 0:
 88 |                     ssp = [w if w < self.n_pos else 1 for w in ssp]
 89 | 
 90 |                 # read from source file and map to word index
 91 |                 tt = self.target.readline()
 92 |                 if tt == "":
 93 |                     raise IOError
 94 |                 tt = tt.strip().split()
 95 |                 tt = [self.word_dic[w] if w in self.word_dic else 1
 96 |                       for w in tt]
 97 |                 if self.n_words > 0:
 98 |                     tt = [w if w < self.n_words else 1 for w in tt]
 99 | 
100 |                 ttp = self.target_pos.readline()
101 |                 if ttp == "":
102 |                     raise IOError
103 |                 ttp = ttp.strip().split()
104 |                 ttp = [self.pos_dic[w] if w in self.pos_dic else 1
105 |                       for w in ttp]
106 |                 if self.n_pos > 0:
107 |                     ttp = [w if w < self.n_pos else 1 for w in ttp]
108 | 
109 |                 if len(ss) > self.maxlen and len(tt) > self.maxlen and len(ssp) > self.maxlen and len(ttp) > self.maxlen:
110 |                     continue
111 | 
112 |                 source.append(ss)
113 |                 source_pos.append(ssp)
114 |                 target.append(tt)
115 |                 target_pos.append(ttp)
116 | 
117 |                 if len(source) >= self.batch_size or \
118 |                         len(target) >= self.batch_size or len(source_pos) >= self.batch_size or len(target_pos) >= self.batch_size:
119 |                     break
120 |         except IOError:
121 |             self.end_of_data = True
122 | 
123 |         if len(source) <= 0 or len(target) <= 0 or len(source_pos) <= 0 or len(target_pos) <= 0:
124 |             self.end_of_data = False
125 |             self.reset()
126 |             raise StopIteration
127 | 
128 |         return source, target, source_pos, target_pos
129 | 


--------------------------------------------------------------------------------
/Att_POS_CopyNet/nmt_new_pos_word.py:
--------------------------------------------------------------------------------
   1 | #-*- coding:utf-8 -*-
   2 | #######################################################
   3 | #
   4 | #    Author: Chuwei Luo
   5 | #    Email: luochuwei@gmail.com
   6 | #    Date: 03/08/2016
   7 | #    Usage: Seq2Seq Attention POS CopyNet(based on dl4mt)
   8 | #
   9 | #######################################################
  10 | import theano
  11 | import theano.tensor as tensor
  12 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
  13 | 
  14 | import cPickle as pkl
  15 | # import ipdb
  16 | import numpy
  17 | import copy
  18 | 
  19 | import os
  20 | import warnings
  21 | import sys
  22 | import time
  23 | 
  24 | from collections import OrderedDict
  25 | 
  26 | from data_iterator_for_pos import TextIterator
  27 | 
  28 | profile = False
  29 | 
  30 | 
  31 | # push parameters to Theano shared variables
  32 | def zipp(params, tparams):
  33 |     for kk, vv in params.iteritems():
  34 |         tparams[kk].set_value(vv)
  35 | 
  36 | 
  37 | # pull parameters from Theano shared variables
  38 | def unzip(zipped):
  39 |     new_params = OrderedDict()
  40 |     for kk, vv in zipped.iteritems():
  41 |         new_params[kk] = vv.get_value()
  42 |     return new_params
  43 | 
  44 | 
  45 | # get the list of parameters: Note that tparams must be OrderedDict
  46 | def itemlist(tparams):
  47 |     return [vv for kk, vv in tparams.iteritems()]
  48 | 
  49 | 
  50 | # dropout
  51 | def dropout_layer(state_before, use_noise, trng):
  52 |     proj = tensor.switch(
  53 |         use_noise,
  54 |         state_before * trng.binomial(state_before.shape, p=0.5, n=1,
  55 |                                      dtype=state_before.dtype),
  56 |         state_before * 0.5)
  57 |     return proj
  58 | 
  59 | 
  60 | # make prefix-appended name
  61 | def _p(pp, name):
  62 |     return '%s_%s' % (pp, name)
  63 | 
  64 | 
  65 | # initialize Theano shared variables according to the initial parameters
  66 | def init_tparams(params):
  67 |     tparams = OrderedDict()
  68 |     for kk, pp in params.iteritems():
  69 |         tparams[kk] = theano.shared(params[kk], name=kk)
  70 |     return tparams
  71 | 
  72 | 
  73 | # load parameters
  74 | def load_params(path, params):
  75 |     pp = numpy.load(path)
  76 |     for kk, vv in params.iteritems():
  77 |         if kk not in pp:
  78 |             warnings.warn('%s is not in the archive' % kk)
  79 |             continue
  80 |         params[kk] = pp[kk]
  81 | 
  82 |     return params
  83 | 
  84 | # layers: 'name': ('parameter initializer', 'feedforward')
  85 | layers = {'ff': ('param_init_fflayer', 'fflayer'),
  86 |           'gru': ('param_init_gru', 'gru_layer'),
  87 |           'gru_cond': ('param_init_gru_cond', 'gru_cond_layer'),
  88 |           }
  89 | 
  90 | 
  91 | def get_layer(name):
  92 |     fns = layers[name]
  93 |     return (eval(fns[0]), eval(fns[1]))
  94 | 
  95 | 
  96 | # some utilities
  97 | def ortho_weight(ndim):
  98 |     W = numpy.random.randn(ndim, ndim)
  99 |     u, s, v = numpy.linalg.svd(W)
 100 |     return u.astype('float32')
 101 | 
 102 | 
 103 | def norm_weight(nin, nout=None, scale=0.01, ortho=True):
 104 |     if nout is None:
 105 |         nout = nin
 106 |     if nout == nin and ortho:
 107 |         W = ortho_weight(nin)
 108 |     else:
 109 |         W = scale * numpy.random.randn(nin, nout)
 110 |     return W.astype('float32')
 111 | 
 112 | 
 113 | def tanh(x):
 114 |     return tensor.tanh(x)
 115 | 
 116 | 
 117 | def linear(x):
 118 |     return x
 119 | 
 120 | 
 121 | def concatenate(tensor_list, axis=0):
 122 |     """
 123 |     Alternative implementation of `theano.tensor.concatenate`.
 124 |     This function does exactly the same thing, but contrary to Theano's own
 125 |     implementation, the gradient is implemented on the GPU.
 126 |     Backpropagating through `theano.tensor.concatenate` yields slowdowns
 127 |     because the inverse operation (splitting) needs to be done on the CPU.
 128 |     This implementation does not have that problem.
 129 |     :usage:
 130 |         >>> x, y = theano.tensor.matrices('x', 'y')
 131 |         >>> c = concatenate([x, y], axis=1)
 132 |     :parameters:
 133 |         - tensor_list : list
 134 |             list of Theano tensor expressions that should be concatenated.
 135 |         - axis : int
 136 |             the tensors will be joined along this axis.
 137 |     :returns:
 138 |         - out : tensor
 139 |             the concatenated tensor expression.
 140 |     """
 141 |     concat_size = sum(tt.shape[axis] for tt in tensor_list)
 142 | 
 143 |     output_shape = ()
 144 |     for k in range(axis):
 145 |         output_shape += (tensor_list[0].shape[k],)
 146 |     output_shape += (concat_size,)
 147 |     for k in range(axis + 1, tensor_list[0].ndim):
 148 |         output_shape += (tensor_list[0].shape[k],)
 149 | 
 150 |     out = tensor.zeros(output_shape)
 151 |     offset = 0
 152 |     for tt in tensor_list:
 153 |         indices = ()
 154 |         for k in range(axis):
 155 |             indices += (slice(None),)
 156 |         indices += (slice(offset, offset + tt.shape[axis]),)
 157 |         for k in range(axis + 1, tensor_list[0].ndim):
 158 |             indices += (slice(None),)
 159 | 
 160 |         out = tensor.set_subtensor(out[indices], tt)
 161 |         offset += tt.shape[axis]
 162 | 
 163 |     return out
 164 | 
 165 | 
 166 | # batch preparation
 167 | def prepare_data(seqs_x, seqs_y, maxlen=None, n_words_src=30000,
 168 |                  n_words=30000):
 169 |     # x: a list of sentences
 170 |     lengths_x = [len(s) for s in seqs_x]
 171 |     lengths_y = [len(s) for s in seqs_y]
 172 | 
 173 |     if maxlen is not None:
 174 |         new_seqs_x = []
 175 |         new_seqs_y = []
 176 |         new_lengths_x = []
 177 |         new_lengths_y = []
 178 |         for l_x, s_x, l_y, s_y in zip(lengths_x, seqs_x, lengths_y, seqs_y):
 179 |             if l_x < maxlen and l_y < maxlen:
 180 |                 new_seqs_x.append(s_x)
 181 |                 new_lengths_x.append(l_x)
 182 |                 new_seqs_y.append(s_y)
 183 |                 new_lengths_y.append(l_y)
 184 |         lengths_x = new_lengths_x
 185 |         seqs_x = new_seqs_x
 186 |         lengths_y = new_lengths_y
 187 |         seqs_y = new_seqs_y
 188 | 
 189 |         if len(lengths_x) < 1 or len(lengths_y) < 1:
 190 |             return None, None, None, None
 191 | 
 192 |     n_samples = len(seqs_x)
 193 |     maxlen_x = numpy.max(lengths_x) + 1
 194 |     maxlen_y = numpy.max(lengths_y) + 1
 195 | 
 196 |     x = numpy.zeros((maxlen_x, n_samples)).astype('int64')
 197 |     y = numpy.zeros((maxlen_y, n_samples)).astype('int64')
 198 |     x_mask = numpy.zeros((maxlen_x, n_samples)).astype('float32')
 199 |     y_mask = numpy.zeros((maxlen_y, n_samples)).astype('float32')
 200 |     for idx, [s_x, s_y] in enumerate(zip(seqs_x, seqs_y)):
 201 |         x[:lengths_x[idx], idx] = s_x
 202 |         x_mask[:lengths_x[idx]+1, idx] = 1.
 203 |         y[:lengths_y[idx], idx] = s_y
 204 |         y_mask[:lengths_y[idx]+1, idx] = 1.
 205 | 
 206 |     return x, x_mask, y, y_mask
 207 | 
 208 | 
 209 | # feedforward layer: affine transformation + point-wise nonlinearity
 210 | def param_init_fflayer(options, params, prefix='ff', nin=None, nout=None,
 211 |                        ortho=True):
 212 |     if nin is None:
 213 |         nin = options['dim_proj']
 214 |     if nout is None:
 215 |         nout = options['dim_proj']
 216 |     params[_p(prefix, 'W')] = norm_weight(nin, nout, scale=0.01, ortho=ortho)
 217 |     params[_p(prefix, 'b')] = numpy.zeros((nout,)).astype('float32')
 218 | 
 219 |     return params
 220 | 
 221 | 
 222 | def fflayer(tparams, state_below, options, prefix='rconv',
 223 |             activ='lambda x: tensor.tanh(x)', **kwargs):
 224 |     return eval(activ)(
 225 |         tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
 226 |         tparams[_p(prefix, 'b')])
 227 | 
 228 | 
 229 | # GRU layer
 230 | def param_init_gru(options, params, prefix='gru', nin=None, dim=None):
 231 |     if nin is None:
 232 |         nin = options['dim_proj']
 233 |     if dim is None:
 234 |         dim = options['dim_proj']
 235 | 
 236 |     # embedding to gates transformation weights, biases
 237 |     W = numpy.concatenate([norm_weight(nin, dim),
 238 |                            norm_weight(nin, dim)], axis=1)
 239 |     params[_p(prefix, 'W')] = W
 240 |     params[_p(prefix, 'b')] = numpy.zeros((2 * dim,)).astype('float32')
 241 | 
 242 |     # recurrent transformation weights for gates
 243 |     U = numpy.concatenate([ortho_weight(dim),
 244 |                            ortho_weight(dim)], axis=1)
 245 |     params[_p(prefix, 'U')] = U
 246 | 
 247 |     # embedding to hidden state proposal weights, biases
 248 |     Wx = norm_weight(nin, dim)
 249 |     params[_p(prefix, 'Wx')] = Wx
 250 |     params[_p(prefix, 'bx')] = numpy.zeros((dim,)).astype('float32')
 251 | 
 252 |     # recurrent transformation weights for hidden state proposal
 253 |     Ux = ortho_weight(dim)
 254 |     params[_p(prefix, 'Ux')] = Ux
 255 | 
 256 |     return params
 257 | 
 258 | 
 259 | def gru_layer(tparams, state_below, options, prefix='gru', mask=None,
 260 |               **kwargs):
 261 |     nsteps = state_below.shape[0]
 262 |     if state_below.ndim == 3:
 263 |         n_samples = state_below.shape[1]
 264 |     else:
 265 |         n_samples = 1
 266 | 
 267 |     dim = tparams[_p(prefix, 'Ux')].shape[1]
 268 | 
 269 |     if mask is None:
 270 |         mask = tensor.alloc(1., state_below.shape[0], 1)
 271 | 
 272 |     # utility function to slice a tensor
 273 |     def _slice(_x, n, dim):
 274 |         if _x.ndim == 3:
 275 |             return _x[:, :, n*dim:(n+1)*dim]
 276 |         return _x[:, n*dim:(n+1)*dim]
 277 | 
 278 |     # state_below is the input word embeddings
 279 |     # input to the gates, concatenated
 280 |     state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + \
 281 |         tparams[_p(prefix, 'b')]
 282 |     # input to compute the hidden state proposal
 283 |     state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + \
 284 |         tparams[_p(prefix, 'bx')]
 285 | 
 286 |     # step function to be used by scan
 287 |     # arguments    | sequences |outputs-info| non-seqs
 288 |     def _step_slice(m_, x_, xx_, h_, U, Ux):
 289 |         preact = tensor.dot(h_, U)
 290 |         preact += x_
 291 | 
 292 |         # reset and update gates
 293 |         r = tensor.nnet.sigmoid(_slice(preact, 0, dim))
 294 |         u = tensor.nnet.sigmoid(_slice(preact, 1, dim))
 295 | 
 296 |         # compute the hidden state proposal
 297 |         preactx = tensor.dot(h_, Ux)
 298 |         preactx = preactx * r
 299 |         preactx = preactx + xx_
 300 | 
 301 |         # hidden state proposal
 302 |         h = tensor.tanh(preactx)
 303 | 
 304 |         # leaky integrate and obtain next hidden state
 305 |         h = u * h_ + (1. - u) * h
 306 |         h = m_[:, None] * h + (1. - m_)[:, None] * h_
 307 | 
 308 |         return h
 309 | 
 310 |     # prepare scan arguments
 311 |     seqs = [mask, state_below_, state_belowx]
 312 |     init_states = [tensor.alloc(0., n_samples, dim)]
 313 |     _step = _step_slice
 314 |     shared_vars = [tparams[_p(prefix, 'U')],
 315 |                    tparams[_p(prefix, 'Ux')]]
 316 | 
 317 |     rval, updates = theano.scan(_step,
 318 |                                 sequences=seqs,
 319 |                                 outputs_info=init_states,
 320 |                                 non_sequences=shared_vars,
 321 |                                 name=_p(prefix, '_layers'),
 322 |                                 n_steps=nsteps,
 323 |                                 profile=profile,
 324 |                                 strict=True)
 325 |     rval = [rval]
 326 |     return rval
 327 | 
 328 | 
 329 | # Conditional GRU layer with Attention
 330 | def param_init_gru_cond(options, params, prefix='gru_cond',
 331 |                         nin=None, dim=None, dimctx=None,
 332 |                         nin_nonlin=None, dim_nonlin=None):
 333 |     if nin is None:
 334 |         nin = options['dim']
 335 |     if dim is None:
 336 |         dim = options['dim']
 337 |     if dimctx is None:
 338 |         dimctx = options['dim']
 339 |     if nin_nonlin is None:
 340 |         nin_nonlin = nin
 341 |     if dim_nonlin is None:
 342 |         dim_nonlin = dim
 343 | 
 344 |     W = numpy.concatenate([norm_weight(nin, dim),
 345 |                            norm_weight(nin, dim)], axis=1)
 346 |     params[_p(prefix, 'W')] = W
 347 |     params[_p(prefix, 'b')] = numpy.zeros((2 * dim,)).astype('float32')
 348 |     U = numpy.concatenate([ortho_weight(dim_nonlin),
 349 |                            ortho_weight(dim_nonlin)], axis=1)
 350 |     params[_p(prefix, 'U')] = U
 351 | 
 352 |     Wx = norm_weight(nin_nonlin, dim_nonlin)
 353 |     params[_p(prefix, 'Wx')] = Wx
 354 |     Ux = ortho_weight(dim_nonlin)
 355 |     params[_p(prefix, 'Ux')] = Ux
 356 |     params[_p(prefix, 'bx')] = numpy.zeros((dim_nonlin,)).astype('float32')
 357 | 
 358 |     U_nl = numpy.concatenate([ortho_weight(dim_nonlin),
 359 |                               ortho_weight(dim_nonlin)], axis=1)
 360 |     params[_p(prefix, 'U_nl')] = U_nl
 361 |     params[_p(prefix, 'b_nl')] = numpy.zeros((2 * dim_nonlin,)).astype('float32')
 362 | 
 363 |     Ux_nl = ortho_weight(dim_nonlin)
 364 |     params[_p(prefix, 'Ux_nl')] = Ux_nl
 365 |     params[_p(prefix, 'bx_nl')] = numpy.zeros((dim_nonlin,)).astype('float32')
 366 | 
 367 |     # context to LSTM
 368 |     Wc = norm_weight(dimctx, dim*2)
 369 |     params[_p(prefix, 'Wc')] = Wc
 370 | 
 371 |     Wcx = norm_weight(dimctx, dim)
 372 |     params[_p(prefix, 'Wcx')] = Wcx
 373 | 
 374 |     # attention: combined -> hidden
 375 |     W_comb_att = norm_weight(dim, dimctx)
 376 |     params[_p(prefix, 'W_comb_att')] = W_comb_att
 377 | 
 378 |     # attention: context -> hidden
 379 |     Wc_att = norm_weight(dimctx)
 380 |     params[_p(prefix, 'Wc_att')] = Wc_att
 381 | 
 382 |     # attention: hidden bias
 383 |     b_att = numpy.zeros((dimctx,)).astype('float32')
 384 |     params[_p(prefix, 'b_att')] = b_att
 385 | 
 386 |     # attention:
 387 |     U_att = norm_weight(dimctx, 1)
 388 |     params[_p(prefix, 'U_att')] = U_att
 389 |     c_att = numpy.zeros((1,)).astype('float32')
 390 |     params[_p(prefix, 'c_tt')] = c_att
 391 | 
 392 |     return params
 393 | 
 394 | 
 395 | def gru_cond_layer(tparams, state_below, options, prefix='gru',
 396 |                    mask=None, context=None, one_step=False,
 397 |                    init_memory=None, init_state=None,
 398 |                    context_mask=None,
 399 |                    **kwargs):
 400 | 
 401 |     assert context, 'Context must be provided'
 402 | 
 403 |     if one_step:
 404 |         assert init_state, 'previous state must be provided'
 405 | 
 406 |     nsteps = state_below.shape[0]
 407 |     if state_below.ndim == 3:
 408 |         n_samples = state_below.shape[1]
 409 |     else:
 410 |         n_samples = 1
 411 | 
 412 |     # mask
 413 |     if mask is None:
 414 |         mask = tensor.alloc(1., state_below.shape[0], 1)
 415 | 
 416 |     dim = tparams[_p(prefix, 'Wcx')].shape[1]
 417 | 
 418 |     # initial/previous state
 419 |     if init_state is None:
 420 |         init_state = tensor.alloc(0., n_samples, dim)
 421 | 
 422 |     # projected context
 423 |     assert context.ndim == 3, \
 424 |         'Context must be 3-d: #annotation x #sample x dim'
 425 |     pctx_ = tensor.dot(context, tparams[_p(prefix, 'Wc_att')]) +\
 426 |         tparams[_p(prefix, 'b_att')]
 427 | 
 428 |     def _slice(_x, n, dim):
 429 |         if _x.ndim == 3:
 430 |             return _x[:, :, n*dim:(n+1)*dim]
 431 |         return _x[:, n*dim:(n+1)*dim]
 432 | 
 433 |     # projected x
 434 |     state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) +\
 435 |         tparams[_p(prefix, 'bx')]
 436 |     state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) +\
 437 |         tparams[_p(prefix, 'b')]
 438 | 
 439 |     def _step_slice(m_, x_, xx_, h_, ctx_, alpha_, pctx_, cc_,
 440 |                     U, Wc, W_comb_att, U_att, c_tt, Ux, Wcx,
 441 |                     U_nl, Ux_nl, b_nl, bx_nl):
 442 |         preact1 = tensor.dot(h_, U)
 443 |         preact1 += x_
 444 |         preact1 = tensor.nnet.sigmoid(preact1)
 445 | 
 446 |         r1 = _slice(preact1, 0, dim)
 447 |         u1 = _slice(preact1, 1, dim)
 448 | 
 449 |         preactx1 = tensor.dot(h_, Ux)
 450 |         preactx1 *= r1
 451 |         preactx1 += xx_
 452 | 
 453 |         h1 = tensor.tanh(preactx1)
 454 | 
 455 |         h1 = u1 * h_ + (1. - u1) * h1
 456 |         h1 = m_[:, None] * h1 + (1. - m_)[:, None] * h_
 457 | 
 458 |         # attention
 459 |         pstate_ = tensor.dot(h1, W_comb_att)
 460 |         pctx__ = pctx_ + pstate_[None, :, :]
 461 |         #pctx__ += xc_
 462 |         pctx__ = tensor.tanh(pctx__)
 463 |         alpha = tensor.dot(pctx__, U_att)+c_tt
 464 |         alpha = alpha.reshape([alpha.shape[0], alpha.shape[1]])
 465 |         alpha = tensor.exp(alpha)
 466 |         if context_mask:
 467 |             alpha = alpha * context_mask
 468 |         alpha = alpha / alpha.sum(0, keepdims=True)
 469 |         ctx_ = (cc_ * alpha[:, :, None]).sum(0)  # current context
 470 | 
 471 |         preact2 = tensor.dot(h1, U_nl)+b_nl
 472 |         preact2 += tensor.dot(ctx_, Wc)
 473 |         preact2 = tensor.nnet.sigmoid(preact2)
 474 | 
 475 |         r2 = _slice(preact2, 0, dim)
 476 |         u2 = _slice(preact2, 1, dim)
 477 | 
 478 |         preactx2 = tensor.dot(h1, Ux_nl)+bx_nl
 479 |         preactx2 *= r2
 480 |         preactx2 += tensor.dot(ctx_, Wcx)
 481 | 
 482 |         h2 = tensor.tanh(preactx2)
 483 | 
 484 |         h2 = u2 * h1 + (1. - u2) * h2
 485 |         h2 = m_[:, None] * h2 + (1. - m_)[:, None] * h1
 486 | 
 487 |         return h2, ctx_, alpha.T  # pstate_, preact, preactx, r, u
 488 | 
 489 |     seqs = [mask, state_below_, state_belowx]
 490 |     #seqs = [mask, state_below_, state_belowx, state_belowc]
 491 |     _step = _step_slice
 492 | 
 493 |     shared_vars = [tparams[_p(prefix, 'U')],
 494 |                    tparams[_p(prefix, 'Wc')],
 495 |                    tparams[_p(prefix, 'W_comb_att')],
 496 |                    tparams[_p(prefix, 'U_att')],
 497 |                    tparams[_p(prefix, 'c_tt')],
 498 |                    tparams[_p(prefix, 'Ux')],
 499 |                    tparams[_p(prefix, 'Wcx')],
 500 |                    tparams[_p(prefix, 'U_nl')],
 501 |                    tparams[_p(prefix, 'Ux_nl')],
 502 |                    tparams[_p(prefix, 'b_nl')],
 503 |                    tparams[_p(prefix, 'bx_nl')]]
 504 | 
 505 |     if one_step:
 506 |         rval = _step(*(seqs + [init_state, None, None, pctx_, context] +
 507 |                        shared_vars))
 508 |     else:
 509 |         rval, updates = theano.scan(_step,
 510 |                                     sequences=seqs,
 511 |                                     outputs_info=[init_state,
 512 |                                                   tensor.alloc(0., n_samples,
 513 |                                                                context.shape[2]),
 514 |                                                   tensor.alloc(0., n_samples,
 515 |                                                                context.shape[0])],
 516 |                                     non_sequences=[pctx_, context]+shared_vars,
 517 |                                     name=_p(prefix, '_layers'),
 518 |                                     n_steps=nsteps,
 519 |                                     profile=profile,
 520 |                                     strict=True)
 521 |     return rval
 522 | 
 523 | 
 524 | # initialize all parameters
 525 | def init_params(options):
 526 |     params = OrderedDict()
 527 | 
 528 |     # embedding
 529 |     params['Wemb'] = norm_weight(options['n_words_src'], options['dim_word'])
 530 |     params['Wemb_pos'] = norm_weight(options['n_pos'], options['dim_pos'])
 531 |     # params['Wemb_dec'] = norm_weight(options['n_words'], options['dim_word'])
 532 | 
 533 |     # encoder: bidirectional RNN
 534 |     # params = get_layer(options['encoder'])[0](options, params,
 535 |     #                                           prefix='encoder',
 536 |     #                                           nin=options['dim_word'],
 537 |     #                                           dim=options['dim'])
 538 |     # params = get_layer(options['encoder'])[0](options, params,
 539 |     #                                           prefix='encoder_r',
 540 |     #                                           nin=options['dim_word'],
 541 |     #                                           dim=options['dim'])
 542 |     params = get_layer(options['encoder'])[0](options, params,
 543 |                                               prefix='encoder',
 544 |                                               nin=options['dim_word']+options['dim_pos'],
 545 |                                               dim=options['dim'])
 546 |     params = get_layer(options['encoder'])[0](options, params,
 547 |                                               prefix='encoder_r',
 548 |                                               nin=options['dim_word']+options['dim_pos'],
 549 |                                               dim=options['dim'])
 550 |     ctxdim = 2 * options['dim']
 551 | 
 552 |     # init_state, init_cell
 553 |     params = get_layer('ff')[0](options, params, prefix='ff_state',
 554 |                                 nin=ctxdim, nout=options['dim'])
 555 |     # decoder
 556 |     params = get_layer(options['decoder'])[0](options, params,
 557 |                                               prefix='decoder',
 558 |                                               nin=options['dim_word']+options['dim_pos'],
 559 |                                               dim=options['dim'],
 560 |                                               dimctx=ctxdim)
 561 |     # readout
 562 |     # params = get_layer('ff')[0](options, params, prefix='ff_logit_lstm',
 563 |     #                             nin=options['dim'], nout=options['dim_word'],
 564 |     #                             ortho=False)
 565 |     # params = get_layer('ff')[0](options, params, prefix='ff_logit_prev',
 566 |     #                             nin=options['dim_word'],
 567 |     #                             nout=options['dim_word'], ortho=False)
 568 |     # params = get_layer('ff')[0](options, params, prefix='ff_logit_ctx',
 569 |     #                             nin=ctxdim, nout=options['dim_word'],
 570 |     #                             ortho=False)
 571 |     # params = get_layer('ff')[0](options, params, prefix='ff_logit',
 572 |     #                             nin=options['dim_word'],
 573 |     #                             nout=options['n_words'])
 574 |     params = get_layer('ff')[0](options, params, prefix='ff_logit_lstm',
 575 |                                 nin=options['dim'], nout=options['dim_word']+options['dim_pos'],
 576 |                                 ortho=False)
 577 |     params = get_layer('ff')[0](options, params, prefix='ff_logit_prev',
 578 |                                 nin=options['dim_word']+options['dim_pos'],
 579 |                                 nout=options['dim_word'], ortho=False)
 580 |     params = get_layer('ff')[0](options, params, prefix='ff_logit_ctx',
 581 |                                 nin=ctxdim, nout=options['dim_word']+options['dim_pos'],
 582 |                                 ortho=False)
 583 |     params = get_layer('ff')[0](options, params, prefix='ff_logit',
 584 |                                 nin=options['dim_word'],
 585 |                                 nout=options['n_words'])
 586 |     params = get_layer('ff')[0](options, params, prefix='ff_logit_pos',
 587 |                                 nin=options['dim_pos'],
 588 |                                 nout=options['n_pos'])
 589 |     # params['att_lambda'] = norm_weight(nin=1, nout=options['n_words'])
 590 |     params['att_lambda'] = 0.01 * numpy.random.randn(options['n_words']).astype('float32')
 591 |     params['pos_to_word'] = 0.01 * numpy.random.randn(options['n_pos'],options['n_words']).astype('float32')
 592 | 
 593 |     return params
 594 | 
 595 | 
 596 | # build a training model
 597 | def build_model(tparams, options):
 598 |     opt_ret = dict()
 599 | 
 600 |     trng = RandomStreams(1234)
 601 |     use_noise = theano.shared(numpy.float32(0.))
 602 | 
 603 |     # description string: #words x #samples
 604 |     x = tensor.matrix('x', dtype='int64')
 605 |     xp = tensor.matrix('xp', dtype='int64')
 606 |     x_mask = tensor.matrix('x_mask', dtype='float32')
 607 |     y = tensor.matrix('y', dtype='int64')
 608 |     yp = tensor.matrix('yp', dtype='int64')
 609 |     y_mask = tensor.matrix('y_mask', dtype='float32')
 610 | 
 611 |     # for the backward rnn, we just need to invert x and x_mask
 612 |     xr = x[::-1]
 613 |     xpr = xp[::-1]
 614 |     xr_mask = x_mask[::-1]
 615 | 
 616 |     n_timesteps = x.shape[0]
 617 |     n_timesteps_trg = y.shape[0]
 618 |     n_samples = x.shape[1]
 619 | 
 620 |     # word embedding for forward rnn (source)
 621 |     emb_w = tparams['Wemb'][x.flatten()]
 622 |     emb_pos = tparams['Wemb_pos'][xp.flatten()]
 623 |     emb = concatenate([emb_w, emb_pos], axis=1)
 624 |     emb = emb.reshape([n_timesteps, n_samples, options['dim_word']+options['dim_pos']])
 625 |     proj = get_layer(options['encoder'])[1](tparams, emb, options,
 626 |                                             prefix='encoder',
 627 |                                             mask=x_mask)
 628 |     # word embedding for backward rnn (source)
 629 |     emb_wr = tparams['Wemb'][xr.flatten()]
 630 |     emb_posr = tparams['Wemb_pos'][xpr.flatten()]
 631 |     embr = concatenate([emb_wr, emb_posr], axis=1)
 632 |     embr = embr.reshape([n_timesteps, n_samples, options['dim_word']+options['dim_pos']])
 633 |     projr = get_layer(options['encoder'])[1](tparams, embr, options,
 634 |                                              prefix='encoder_r',
 635 |                                              mask=xr_mask)
 636 | 
 637 |     # context will be the concatenation of forward and backward rnns
 638 |     ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim-1)
 639 | 
 640 |     # mean of the context (across time) will be used to initialize decoder rnn
 641 |     ctx_mean = (ctx * x_mask[:, :, None]).sum(0) / x_mask.sum(0)[:, None]
 642 | 
 643 |     # or you can use the last state of forward + backward encoder rnns
 644 |     # ctx_mean = concatenate([proj[0][-1], projr[0][-1]], axis=proj[0].ndim-2)
 645 | 
 646 |     # initial decoder state
 647 |     init_state = get_layer('ff')[1](tparams, ctx_mean, options,
 648 |                                     prefix='ff_state', activ='tanh')
 649 | 
 650 |     # word embedding (target), we will shift the target sequence one time step
 651 |     # to the right. This is done because of the bi-gram connections in the
 652 |     # readout and decoder rnn. The first target will be all zeros and we will
 653 |     # not condition on the last output.
 654 |     # emb = tparams['Wemb_dec'][y.flatten()]
 655 |     emb_w = tparams['Wemb'][y.flatten()]
 656 |     emb_pos = tparams['Wemb_pos'][yp.flatten()]
 657 |     emb = concatenate([emb_w, emb_pos], axis=1)
 658 |     emb = emb.reshape([n_timesteps_trg, n_samples, options['dim_word']+options['dim_pos']])
 659 |     emb_shifted = tensor.zeros_like(emb)
 660 |     emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1])
 661 |     emb = emb_shifted
 662 | 
 663 |     # decoder - pass through the decoder conditional gru with attention
 664 |     proj = get_layer(options['decoder'])[1](tparams, emb, options,
 665 |                                             prefix='decoder',
 666 |                                             mask=y_mask, context=ctx,
 667 |                                             context_mask=x_mask,
 668 |                                             one_step=False,
 669 |                                             init_state=init_state)
 670 |     # hidden states of the decoder gru
 671 |     proj_h = proj[0]
 672 | 
 673 |     # weighted averages of context, generated by attention module
 674 |     ctxs = proj[1]
 675 | 
 676 |     # weights (alignment matrix)
 677 |     opt_ret['dec_alphas'] = proj[2]
 678 |     # print opt_ret['dec_alphas'].shape
 679 | 
 680 |     # compute word probabilities
 681 |     logit_lstm = get_layer('ff')[1](tparams, proj_h, options,
 682 |                                     prefix='ff_logit_lstm', activ='linear')
 683 |     logit_prev = get_layer('ff')[1](tparams, emb, options,
 684 |                                     prefix='ff_logit_prev', activ='linear')
 685 |     logit_ctx = get_layer('ff')[1](tparams, ctxs, options,
 686 |                                    prefix='ff_logit_ctx', activ='linear')
 687 |     logit_lstm_w, logit_lstm_p = logit_lstm[:,:,:options['dim_word']], logit_lstm[:,:,-options['dim_pos']:]
 688 |     logit_prev_w, logit_prev_p = logit_prev[:,:,:options['dim_word']], logit_prev[:,:,-options['dim_pos']:]
 689 |     logit_ctx_w, logit_ctx_p = logit_ctx[:,:,:options['dim_word']], logit_ctx[:,:,-options['dim_pos']:]
 690 | 
 691 |     logit = tensor.tanh(logit_lstm_w+logit_prev_w+logit_ctx_w)
 692 |     logit_p = tensor.tanh(logit_lstm_p+logit_prev_p+logit_ctx_p)
 693 | 
 694 |     if options['use_dropout']:
 695 |         logit = dropout_layer(logit, use_noise, trng)
 696 |         logit_p = dropout_layer(logit_p, use_noise, trng)
 697 | 
 698 |     logit = get_layer('ff')[1](tparams, logit, options,
 699 |                                prefix='ff_logit', activ='linear')
 700 |     logit_p = get_layer('ff')[1](tparams, logit_p, options,
 701 |                                prefix='ff_logit_pos', activ='linear')
 702 | 
 703 |     #copy attention
 704 |     logit_shp = logit.shape
 705 |     sflogit = logit.reshape([logit_shp[0]*logit_shp[1], logit_shp[2]])
 706 |     logit_new = (1-tparams['att_lambda']) * sflogit
 707 |     alpha_shape = opt_ret['dec_alphas'].shape
 708 |     attw = opt_ret['dec_alphas'].reshape([alpha_shape[0]*alpha_shape[1],alpha_shape[2]])
 709 | 
 710 |     def _step_for_copy(label, lg, _x, atw):
 711 |         lg = tensor.set_subtensor(lg[label, _x[label%_x.shape[0]]], lg[label, _x[label%_x.shape[0]]] + tparams['att_lambda'][_x[label%_x.shape[0]]]*atw[label])
 712 |         return lg
 713 |     # ls = T.vector('ls', dtype='int64')
 714 |     result, _ = theano.scan(_step_for_copy, sequences=tensor.arange(logit_new.shape[0]), outputs_info=[logit_new], non_sequences = [x.T, attw])
 715 | 
 716 |     logit_shp_pos = logit_p.shape
 717 |     sflogit_p = logit_p.reshape([logit_shp_pos[0]*logit_shp_pos[1], logit_shp_pos[2]])
 718 |     probs_p = tensor.nnet.softmax(sflogit_p)
 719 | 
 720 |     probs = tensor.nnet.softmax(result[-1]+tensor.dot(sflogit_p, tparams['pos_to_word']))
 721 | 
 722 |     # cost
 723 |     y_flat = y.flatten()
 724 |     y_flat_idx = tensor.arange(y_flat.shape[0]) * options['n_words'] + y_flat
 725 |     cost = -tensor.log(probs.flatten()[y_flat_idx])
 726 |     cost = cost.reshape([y.shape[0], y.shape[1]])
 727 |     cost = (cost * y_mask).sum(0)
 728 | 
 729 |     # pos cost
 730 |     yp_flat = yp.flatten()
 731 |     yp_flat_idx = tensor.arange(yp_flat.shape[0]) * options['n_pos'] + yp_flat
 732 |     costp = -tensor.log(probs_p.flatten()[yp_flat_idx])
 733 |     costp = costp.reshape([yp.shape[0], yp.shape[1]])
 734 |     costp = (costp * y_mask).sum(0)
 735 | 
 736 |     final_cost = cost+costp
 737 | 
 738 |     return trng, use_noise, x, xp, x_mask, y, yp, y_mask, opt_ret, final_cost
 739 | 
 740 | 
 741 | # build a sampler
 742 | def build_sampler(tparams, options, trng, use_noise):
 743 |     x = tensor.matrix('x', dtype='int64')
 744 |     xp = tensor.matrix('xp', dtype='int64')
 745 |     xr = x[::-1]
 746 |     xpr = xp[::-1]
 747 |     n_timesteps = x.shape[0]
 748 |     n_samples = x.shape[1]
 749 | 
 750 |     # word embedding (source), forward and backward
 751 |     # emb = tparams['Wemb'][x.flatten()]
 752 |     # emb = emb.reshape([n_timesteps, n_samples, options['dim_word']])
 753 |     # embr = tparams['Wemb'][xr.flatten()]
 754 |     # embr = embr.reshape([n_timesteps, n_samples, options['dim_word']])
 755 |     emb_w = tparams['Wemb'][x.flatten()]
 756 |     emb_pos = tparams['Wemb_pos'][xp.flatten()]
 757 |     emb = concatenate([emb_w, emb_pos], axis=1)
 758 |     emb = emb.reshape([n_timesteps, n_samples, options['dim_word']+options['dim_pos']])
 759 | 
 760 |     emb_wr = tparams['Wemb'][xr.flatten()]
 761 |     emb_posr = tparams['Wemb_pos'][xpr.flatten()]
 762 |     embr = concatenate([emb_wr, emb_posr], axis=1)
 763 |     embr = embr.reshape([n_timesteps, n_samples, options['dim_word']+options['dim_pos']])
 764 | 
 765 |     # encoder
 766 |     proj = get_layer(options['encoder'])[1](tparams, emb, options,
 767 |                                             prefix='encoder')
 768 |     projr = get_layer(options['encoder'])[1](tparams, embr, options,
 769 |                                              prefix='encoder_r')
 770 | 
 771 |     # concatenate forward and backward rnn hidden states
 772 |     ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim-1)
 773 | 
 774 |     # get the input for decoder rnn initializer mlp
 775 |     ctx_mean = ctx.mean(0)
 776 |     # ctx_mean = concatenate([proj[0][-1],projr[0][-1]], axis=proj[0].ndim-2)
 777 |     init_state = get_layer('ff')[1](tparams, ctx_mean, options,
 778 |                                     prefix='ff_state', activ='tanh')
 779 | 
 780 |     print 'Building f_init...',
 781 |     outs = [init_state, ctx]
 782 |     f_init = theano.function([x,xp], outs, name='f_init', profile=profile)
 783 |     print 'Done'
 784 | 
 785 |     # x: 1 x 1
 786 |     y = tensor.vector('y_sampler', dtype='int64')
 787 |     yp = tensor.vector('yp_sampler', dtype='int64')
 788 |     word_map = tensor.vector('wm', dtype='int64')
 789 |     init_state = tensor.matrix('init_state', dtype='float32')
 790 | 
 791 |     # if it's the first word, emb should be all zero and it is indicated by -1
 792 |     # emb = tensor.switch(y[:, None] < 0,
 793 |     #                     tensor.alloc(0., 1, tparams['Wemb_dec'].shape[1]),
 794 |     #                     tparams['Wemb_dec'][y])
 795 |     emb = tensor.switch(y[:, None] < 0,
 796 |                         tensor.alloc(0., 1, tparams['Wemb'].shape[1]),
 797 |                         tparams['Wemb'][y])
 798 |     emb_pos = tensor.switch(yp[:, None] < 0,
 799 |                         tensor.alloc(0., 1, tparams['Wemb_pos'].shape[1]),
 800 |                         tparams['Wemb_pos'][yp])
 801 |     emb = concatenate([emb, emb_pos], axis=1)
 802 | 
 803 |     # apply one step of conditional gru with attention
 804 |     proj = get_layer(options['decoder'])[1](tparams, emb, options,
 805 |                                             prefix='decoder',
 806 |                                             mask=None, context=ctx,
 807 |                                             one_step=True,
 808 |                                             init_state=init_state)
 809 |     # get the next hidden state
 810 |     next_state = proj[0]
 811 | 
 812 |     # get the weighted averages of context for this target word y
 813 |     ctxs = proj[1]
 814 | 
 815 |     logit_lstm = get_layer('ff')[1](tparams, next_state, options,
 816 |                                     prefix='ff_logit_lstm', activ='linear')
 817 |     logit_prev = get_layer('ff')[1](tparams, emb, options,
 818 |                                     prefix='ff_logit_prev', activ='linear')
 819 |     logit_ctx = get_layer('ff')[1](tparams, ctxs, options,
 820 |                                    prefix='ff_logit_ctx', activ='linear')
 821 |     logit_lstm_w, logit_lstm_p = logit_lstm[:,:options['dim_word']], logit_lstm[:,-options['dim_pos']:]
 822 |     logit_prev_w, logit_prev_p = logit_prev[:,:options['dim_word']], logit_prev[:,-options['dim_pos']:]
 823 |     logit_ctx_w, logit_ctx_p = logit_ctx[:,:options['dim_word']], logit_ctx[:,-options['dim_pos']:]
 824 | 
 825 | 
 826 |     logit = tensor.tanh(logit_lstm_w+logit_prev_w+logit_ctx_w)
 827 |     logit_p = tensor.tanh(logit_lstm_p+logit_prev_p+logit_ctx_p)
 828 | 
 829 |     if options['use_dropout']:
 830 |         logit = dropout_layer(logit, use_noise, trng)
 831 |         logit_p = dropout_layer(logit_p, use_noise, trng)
 832 | 
 833 |     logit = get_layer('ff')[1](tparams, logit, options,
 834 |                                prefix='ff_logit', activ='linear')
 835 |     # logit = eval('linear')(tensor.dot(logit, tparams[_p('ff_logit', 'W')][:,word_map]) + tparams[_p('ff_logit', 'b')][word_map])
 836 |     logit_p = get_layer('ff')[1](tparams, logit_p, options,
 837 |                                prefix='ff_logit_pos', activ='linear')
 838 | 
 839 |     logit_new = (1-tparams['att_lambda']) * logit
 840 | 
 841 |     attw = proj[2]
 842 | 
 843 |     def _step_for_copy(label, lg, _x, atw):
 844 |         lg = tensor.set_subtensor(lg[label, _x[label%_x.shape[0]]], lg[label, _x[label%_x.shape[0]]] + tparams['att_lambda'][_x[label%_x.shape[0]]]*atw[label])
 845 |         return lg
 846 |     # ls = T.vector('ls', dtype='int64')
 847 |     result, _ = theano.scan(_step_for_copy, sequences=tensor.arange(logit_new.shape[0]), outputs_info=[logit_new], non_sequences = [x.T, attw])
 848 | 
 849 |     next_probs_p = tensor.nnet.softmax(logit_p)
 850 | 
 851 |     # compute the softmax probability
 852 |     next_probs = tensor.nnet.softmax(result[-1]+tensor.dot(logit_p, tparams['pos_to_word']))
 853 |     next_probs = next_probs[:,word_map]
 854 | 
 855 |     # sample from softmax distribution to get the sample
 856 |     next_sample = trng.multinomial(pvals=next_probs).argmax(1)
 857 |     next_sample_p = trng.multinomial(pvals=next_probs_p).argmax(1)
 858 | 
 859 |     # compile a function to do the whole thing above, next word probability,
 860 |     # sampled word for the next target, next hidden state to be used
 861 |     print 'Building f_next..',
 862 |     inps = [x, y, yp, ctx, init_state, word_map]
 863 |     outs = [next_probs, next_probs_p, next_sample, next_sample_p, next_state]
 864 |     f_next = theano.function(inps, outs, name='f_next', profile=profile)
 865 |     print 'Done'
 866 | 
 867 |     return f_init, f_next
 868 | 
 869 | 
 870 | # generate sample, either with stochastic sampling or beam search. Note that,
 871 | # this function iteratively calls f_init and f_next functions.
 872 | def gen_sample(tparams, f_init, f_next, x, xp, word_map, options, trng=None, k=1, maxlen=30,
 873 |                stochastic=True, argmax=False):
 874 | 
 875 |     # k is the beam size we have
 876 |     if stochastic is not True:
 877 |         stochastic = True
 878 |     if k > 1:
 879 |         assert not stochastic, \
 880 |             'Beam search does not support stochastic sampling'
 881 | 
 882 |     sample = []
 883 |     samplep = []
 884 |     sample_score = []
 885 |     sample_scorep = []
 886 |     if stochastic:
 887 |         sample_score = 0
 888 | 
 889 |     live_k = 1
 890 |     dead_k = 0
 891 | 
 892 |     hyp_samples = [[]] * live_k
 893 |     hyp_samplesp = [[]] * live_k
 894 |     hyp_scores = numpy.zeros(live_k).astype('float32')
 895 |     hyp_scoresp = numpy.zeros(live_k).astype('float32')
 896 |     hyp_states = []
 897 | 
 898 |     # get initial state of decoder rnn and encoder context
 899 |     ret = f_init(x,xp)
 900 |     next_state, ctx0 = ret[0], ret[1]
 901 |     next_w = -1 * numpy.ones((1,)).astype('int64')  # bos indicator
 902 |     next_wp = -1 * numpy.ones((1,)).astype('int64')
 903 | 
 904 |     for ii in xrange(maxlen):
 905 |         ctx = numpy.tile(ctx0, [live_k, 1])
 906 |         inps = [x, next_w, next_wp, ctx, next_state, word_map]
 907 |         ret = f_next(*inps)
 908 |         next_p, next_pp, next_w, next_wp, next_state = ret[0], ret[1], ret[2], ret[3], ret[4]
 909 |         true_next_w = numpy.array([word_map[next_w[0]]])
 910 | 
 911 |         if stochastic:
 912 |             if argmax:
 913 |                 nw = next_p[0].argmax()
 914 |                 nw1 = word_map[next_p[0].argmax()]
 915 |                 nwp = next_pp[0].argmax()
 916 |             else:
 917 |                 nw = next_w[0]
 918 |                 nw1 = true_next_w[0]
 919 |                 nwp = next_wp[0]
 920 |             sample.append(nw1)
 921 |             samplep.append(nwp)
 922 |             sample_score -= numpy.log(next_p[0, nw])
 923 |             sample_scorep -= numpy.log(next_pp[0, nwp])
 924 |             if nw == 0:
 925 |                 break
 926 |         else:
 927 |             cand_scores = hyp_scores[:, None] - numpy.log(next_p)
 928 |             cand_scoresp = hyp_scoresp[:, None] - numpy.log(next_pp)
 929 |             cand_flat = cand_scores.flatten()
 930 |             cand_flatp = cand_scoresp.flatten()
 931 |             ranks_flat = cand_flat.argsort()[:(k-dead_k)]
 932 |             ranks_flatp = cand_flatp.argsort()[:(k-dead_k)]
 933 | 
 934 |             voc_size = next_p.shape[1]
 935 |             voc_sizep = next_pp.shape[1]
 936 |             trans_indices = ranks_flat / voc_size
 937 |             trans_indicesp = ranks_flatp/ voc_sizep
 938 |             word_indices = ranks_flat % voc_size
 939 |             word_indicesp = ranks_flatp % voc_sizep
 940 |             costs = cand_flat[ranks_flat]
 941 |             costsp = cand_flatp[ranks_flatp]
 942 | 
 943 |             new_hyp_samples = []
 944 |             new_hyp_samplesp = []
 945 |             new_hyp_scores = numpy.zeros(k-dead_k).astype('float32')
 946 |             new_hyp_scoresp = numpy.zeros(k-dead_k).astype('float32')
 947 |             new_hyp_states = []
 948 | 
 949 |             for idx, [ti, wi] in enumerate(zip(trans_indices, word_indices)):
 950 |                 new_hyp_samples.append(hyp_samples[ti]+[wi])
 951 |                 new_hyp_scores[idx] = copy.copy(costs[idx])
 952 |                 new_hyp_states.append(copy.copy(next_state[ti]))
 953 | 
 954 |             # check the finished samples
 955 |             new_live_k = 0
 956 |             hyp_samples = []
 957 |             hyp_scores = []
 958 |             hyp_states = []
 959 | 
 960 |             for idx in xrange(len(new_hyp_samples)):
 961 |                 if new_hyp_samples[idx][-1] == 0:
 962 |                     sample.append(new_hyp_samples[idx])
 963 |                     sample_score.append(new_hyp_scores[idx])
 964 |                     dead_k += 1
 965 |                 else:
 966 |                     new_live_k += 1
 967 |                     hyp_samples.append(new_hyp_samples[idx])
 968 |                     hyp_scores.append(new_hyp_scores[idx])
 969 |                     hyp_states.append(new_hyp_states[idx])
 970 |             hyp_scores = numpy.array(hyp_scores)
 971 |             live_k = new_live_k
 972 | 
 973 |             if new_live_k < 1:
 974 |                 break
 975 |             if dead_k >= k:
 976 |                 break
 977 | 
 978 |             next_w = numpy.array([w[-1] for w in hyp_samples])
 979 |             next_state = numpy.array(hyp_states)
 980 | 
 981 |     if not stochastic:
 982 |         # dump every remaining one
 983 |         if live_k > 0:
 984 |             for idx in xrange(live_k):
 985 |                 sample.append(hyp_samples[idx])
 986 |                 sample_score.append(hyp_scores[idx])
 987 | 
 988 |     return sample, samplep, sample_score, sample_scorep
 989 | 
 990 | 
 991 | # calculate the log probablities on a given corpus using translation model
 992 | def pred_probs(f_log_probs, prepare_data, options, iterator, verbose=True):
 993 |     probs = []
 994 | 
 995 |     n_done = 0
 996 | 
 997 |     for x, y, xp, yp in iterator:
 998 |         n_done += len(x)
 999 | 
1000 |         x, x_mask, y, y_mask = prepare_data(x, y,
1001 |                                             n_words_src=options['n_words_src'],
1002 |                                             n_words=options['n_words'])
1003 |         xp, _, yp, _ = prepare_data(xp, yp,
1004 |                                             n_words_src=options['n_words_src'],
1005 |                                             n_words=options['n_words'])
1006 | 
1007 |         pprobs = f_log_probs(x, xp, x_mask, y, yp, y_mask)
1008 |         for pp in pprobs:
1009 |             probs.append(pp)
1010 | 
1011 |         if numpy.isnan(numpy.mean(probs)):
1012 |             # ipdb.set_trace()
1013 |             print 1
1014 | 
1015 |         if verbose:
1016 |             print >>sys.stderr, '%d samples computed' % (n_done)
1017 | 
1018 |     return numpy.array(probs)
1019 | 
1020 | 
1021 | # optimizers
1022 | # name(hyperp, tparams, grads, inputs (list), cost) = f_grad_shared, f_update
1023 | def adam(lr, tparams, grads, inp, cost, beta1=0.9, beta2=0.999, e=1e-8):
1024 | 
1025 |     gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
1026 |                for k, p in tparams.iteritems()]
1027 |     gsup = [(gs, g) for gs, g in zip(gshared, grads)]
1028 | 
1029 |     f_grad_shared = theano.function(inp, cost, updates=gsup, profile=profile)
1030 | 
1031 |     updates = []
1032 | 
1033 |     t_prev = theano.shared(numpy.float32(0.))
1034 |     t = t_prev + 1.
1035 |     lr_t = lr * tensor.sqrt(1. - beta2**t) / (1. - beta1**t)
1036 | 
1037 |     for p, g in zip(tparams.values(), gshared):
1038 |         m = theano.shared(p.get_value() * 0., p.name + '_mean')
1039 |         v = theano.shared(p.get_value() * 0., p.name + '_variance')
1040 |         m_t = beta1 * m + (1. - beta1) * g
1041 |         v_t = beta2 * v + (1. - beta2) * g**2
1042 |         step = lr_t * m_t / (tensor.sqrt(v_t) + e)
1043 |         p_t = p - step
1044 |         updates.append((m, m_t))
1045 |         updates.append((v, v_t))
1046 |         updates.append((p, p_t))
1047 |     updates.append((t_prev, t))
1048 | 
1049 |     f_update = theano.function([lr], [], updates=updates,
1050 |                                on_unused_input='ignore', profile=profile)
1051 | 
1052 |     return f_grad_shared, f_update
1053 | 
1054 | 
1055 | def adadelta(lr, tparams, grads, inp, cost):
1056 |     zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
1057 |                                   name='%s_grad' % k)
1058 |                     for k, p in tparams.iteritems()]
1059 |     running_up2 = [theano.shared(p.get_value() * numpy.float32(0.),
1060 |                                  name='%s_rup2' % k)
1061 |                    for k, p in tparams.iteritems()]
1062 |     running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.),
1063 |                                     name='%s_rgrad2' % k)
1064 |                       for k, p in tparams.iteritems()]
1065 | 
1066 |     zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
1067 |     rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
1068 |              for rg2, g in zip(running_grads2, grads)]
1069 | 
1070 |     f_grad_shared = theano.function(inp, cost, updates=zgup+rg2up,
1071 |                                     profile=profile)
1072 | 
1073 |     updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
1074 |              for zg, ru2, rg2 in zip(zipped_grads, running_up2,
1075 |                                      running_grads2)]
1076 |     ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
1077 |              for ru2, ud in zip(running_up2, updir)]
1078 |     param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)]
1079 | 
1080 |     f_update = theano.function([lr], [], updates=ru2up+param_up,
1081 |                                on_unused_input='ignore', profile=profile)
1082 | 
1083 |     return f_grad_shared, f_update
1084 | 
1085 | 
1086 | def rmsprop(lr, tparams, grads, inp, cost):
1087 |     zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
1088 |                                   name='%s_grad' % k)
1089 |                     for k, p in tparams.iteritems()]
1090 |     running_grads = [theano.shared(p.get_value() * numpy.float32(0.),
1091 |                                    name='%s_rgrad' % k)
1092 |                      for k, p in tparams.iteritems()]
1093 |     running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.),
1094 |                                     name='%s_rgrad2' % k)
1095 |                       for k, p in tparams.iteritems()]
1096 | 
1097 |     zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
1098 |     rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
1099 |     rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
1100 |              for rg2, g in zip(running_grads2, grads)]
1101 | 
1102 |     f_grad_shared = theano.function(inp, cost, updates=zgup+rgup+rg2up,
1103 |                                     profile=profile)
1104 | 
1105 |     updir = [theano.shared(p.get_value() * numpy.float32(0.),
1106 |                            name='%s_updir' % k)
1107 |              for k, p in tparams.iteritems()]
1108 |     updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4))
1109 |                  for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads,
1110 |                                             running_grads2)]
1111 |     param_up = [(p, p + udn[1])
1112 |                 for p, udn in zip(itemlist(tparams), updir_new)]
1113 |     f_update = theano.function([lr], [], updates=updir_new+param_up,
1114 |                                on_unused_input='ignore', profile=profile)
1115 | 
1116 |     return f_grad_shared, f_update
1117 | 
1118 | 
1119 | def sgd(lr, tparams, grads, x, mask, y, cost):
1120 |     gshared = [theano.shared(p.get_value() * 0.,
1121 |                              name='%s_grad' % k)
1122 |                for k, p in tparams.iteritems()]
1123 |     gsup = [(gs, g) for gs, g in zip(gshared, grads)]
1124 | 
1125 |     f_grad_shared = theano.function([x, mask, y], cost, updates=gsup,
1126 |                                     profile=profile)
1127 | 
1128 |     pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)]
1129 |     f_update = theano.function([lr], [], updates=pup, profile=profile)
1130 | 
1131 |     return f_grad_shared, f_update
1132 | 
1133 | 
1134 | def train(dim_word=100,  # word vector dimensionality
1135 |           dim_pos = 100,
1136 |           dim=1000,  # the number of LSTM units
1137 |           encoder='gru',
1138 |           decoder='gru_cond',
1139 |           patience=10,  # early stopping patience
1140 |           max_epochs=5000,
1141 |           finish_after=10000000,  # finish after this many updates
1142 |           dispFreq=100,
1143 |           decay_c=0.,  # L2 regularization penalty
1144 |           alpha_c=0.,  # alignment regularization
1145 |           clip_c=-1.,  # gradient clipping threshold
1146 |           lrate=0.01,  # learning rate
1147 |           n_words_src=100000,  # source vocabulary size
1148 |           n_words=100000,  # target vocabulary size
1149 |           n_pos = 36, # pos vocabulary size
1150 |           maxlen=100,  # maximum length of the description
1151 |           optimizer='rmsprop',
1152 |           batch_size=16,
1153 |           valid_batch_size=16,
1154 |           saveto='model.npz',
1155 |           validFreq=1000,
1156 |           saveFreq=1000,   # save the parameters after every saveFreq updates
1157 |           sampleFreq=100,   # generate some samples after every sampleFreq
1158 |           datasets=[
1159 |               '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok',
1160 |               '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok',
1161 |               'data/post_pos.txt',
1162 |               'data/response_pos.txt'],
1163 |           valid_datasets=['../data/dev/newstest2011.en.tok',
1164 |                           '../data/dev/newstest2011.fr.tok',
1165 |                           'data/v_post_pos.txt',
1166 |                           'data/v_response_pos.txt'],
1167 |           dictionaries=[
1168 |               'data/worddicts.pkl',
1169 |               'data/posdicts.pkl',
1170 |               'data/dict2.txt'],
1171 |           use_dropout=False,
1172 |           reload_=False,
1173 |           overwrite=False):
1174 | 
1175 |     # Model options
1176 |     model_options = locals().copy()
1177 | 
1178 |     # load dictionaries and invert them
1179 |     worddicts = [None] * 2
1180 |     worddicts_r = [None] * 2
1181 |     for ii, dd in enumerate(dictionaries[:2]):
1182 |         with open(dd, 'rb') as f:
1183 |             worddicts[ii] = pkl.load(f)
1184 |         worddicts_r[ii] = dict()
1185 |         for kk, vv in worddicts[ii].iteritems():
1186 |             worddicts_r[ii][vv] = kk
1187 |     pos_dicts_r = worddicts_r[1]
1188 |     worddicts_r = [worddicts_r[0],worddicts_r[0]]
1189 | 
1190 |     word_map0 = []
1191 |     with open(dictionaries[-1]) as ff:
1192 |         for line in ff:
1193 |             line = line.strip()
1194 |             if line in worddicts[0]:
1195 |                 if line not in word_map0 and worddicts[0][line]<n_words:
1196 |                     word_map0.append(worddicts[0][line])
1197 | 
1198 | 
1199 |     # reload options
1200 |     if reload_ and os.path.exists(saveto):
1201 |         print 'Reloading model options'
1202 |         with open('%s.pkl' % saveto, 'rb') as f:
1203 |             model_options = pkl.load(f)
1204 | 
1205 |     print 'Loading data'
1206 |     train = TextIterator(datasets[0], datasets[1], datasets[2], datasets[3],
1207 |                          dictionaries[0], dictionaries[1],
1208 |                          n_words=n_words, n_pos=n_pos,
1209 |                          batch_size=batch_size,
1210 |                          maxlen=maxlen)
1211 | 
1212 |     valid = TextIterator(valid_datasets[0], valid_datasets[1], valid_datasets[2], valid_datasets[3],
1213 |                          dictionaries[0], dictionaries[1],
1214 |                          n_words=n_words, n_pos=n_pos,
1215 |                          batch_size=valid_batch_size,
1216 |                          maxlen=maxlen)
1217 | 
1218 |     # ipdb.set_trace()
1219 | 
1220 |     print 'Building model'
1221 |     params = init_params(model_options)
1222 |     # reload parameters
1223 |     if reload_ and os.path.exists(saveto):
1224 |         print 'Reloading model parameters'
1225 |         params = load_params(saveto, params)
1226 | 
1227 |     tparams = init_tparams(params)
1228 | 
1229 |     trng, use_noise, \
1230 |         x, xp, x_mask, y, yp, y_mask, \
1231 |         opt_ret, \
1232 |         cost = \
1233 |         build_model(tparams, model_options)
1234 |     inps = [x, xp, x_mask, y, yp, y_mask]
1235 | 
1236 |     print 'Building sampler'
1237 |     f_init, f_next = build_sampler(tparams, model_options, trng, use_noise)
1238 | 
1239 |     # before any regularizer
1240 |     print 'Building f_log_probs...',
1241 |     f_log_probs = theano.function(inps, cost, profile=profile)
1242 |     print 'Done'
1243 | 
1244 |     cost = cost.mean()
1245 | 
1246 |     # apply L2 regularization on weights
1247 |     if decay_c > 0.:
1248 |         decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
1249 |         weight_decay = 0.
1250 |         for kk, vv in tparams.iteritems():
1251 |             weight_decay += (vv ** 2).sum()
1252 |         weight_decay *= decay_c
1253 |         cost += weight_decay
1254 | 
1255 |     # regularize the alpha weights
1256 |     if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
1257 |         alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
1258 |         alpha_reg = alpha_c * (
1259 |             (tensor.cast(y_mask.sum(0)//x_mask.sum(0), 'float32')[:, None] -
1260 |              opt_ret['dec_alphas'].sum(0))**2).sum(1).mean()
1261 |         cost += alpha_reg
1262 | 
1263 |     # after all regularizers - compile the computational graph for cost
1264 |     print 'Building f_cost...',
1265 |     f_cost = theano.function(inps, cost, profile=profile)
1266 |     print 'Done'
1267 | 
1268 |     print 'Computing gradient...',
1269 |     grads = tensor.grad(cost, wrt=itemlist(tparams))
1270 |     print 'Done'
1271 | 
1272 |     # apply gradient clipping here
1273 |     if clip_c > 0.:
1274 |         g2 = 0.
1275 |         for g in grads:
1276 |             g2 += (g**2).sum()
1277 |         new_grads = []
1278 |         for g in grads:
1279 |             new_grads.append(tensor.switch(g2 > (clip_c**2),
1280 |                                            g / tensor.sqrt(g2) * clip_c,
1281 |                                            g))
1282 |         grads = new_grads
1283 | 
1284 |     # compile the optimizer, the actual computational graph is compiled here
1285 |     lr = tensor.scalar(name='lr')
1286 |     print 'Building optimizers...',
1287 |     f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)
1288 |     print 'Done'
1289 | 
1290 |     print 'Optimization'
1291 | 
1292 |     best_p = None
1293 |     bad_counter = 0
1294 |     uidx = 0
1295 |     estop = False
1296 |     history_errs = []
1297 |     # reload history
1298 |     if reload_ and os.path.exists(saveto):
1299 |         rmodel = numpy.load(saveto)
1300 |         history_errs = list(rmodel['history_errs'])
1301 |         if 'uidx' in rmodel:
1302 |             uidx = rmodel['uidx']
1303 | 
1304 |     if validFreq == -1:
1305 |         validFreq = len(train[0])/batch_size
1306 |     if saveFreq == -1:
1307 |         saveFreq = len(train[0])/batch_size
1308 |     if sampleFreq == -1:
1309 |         sampleFreq = len(train[0])/batch_size
1310 | 
1311 |     for eidx in xrange(max_epochs):
1312 |         n_samples = 0
1313 | 
1314 |         for x, y, xp, yp in train:
1315 |             n_samples += len(x)
1316 |             if len(x) == 0:
1317 |                 continue
1318 |             uidx += 1
1319 |             # use_noise.set_value(1.)
1320 | 
1321 |             x, x_mask, y, y_mask = prepare_data(x, y, maxlen=maxlen,
1322 |                                                 n_words_src=n_words_src,
1323 |                                                 n_words=n_words)
1324 |             xp, _, yp, _ = prepare_data(xp, yp, maxlen=maxlen,
1325 |                                                 n_words_src=n_pos,
1326 |                                                 n_words=n_pos)
1327 | 
1328 |             if x is None:
1329 |                 print 'Minibatch with zero sample under length ', maxlen
1330 |                 uidx -= 1
1331 |                 continue
1332 |             # word_map = list(set(list(x.reshape(x.shape[0]*x.shape[1]))))
1333 |             # word_map3 = list(set(word_map+word_map0))
1334 |             # ipdb.set_trace()
1335 | 
1336 |             ud_start = time.time()
1337 | 
1338 |             # compute cost, grads and copy grads to shared variables
1339 |             print 'fuck cost'
1340 |             cost = f_grad_shared(x, xp, x_mask, y, yp, y_mask)
1341 | 
1342 |             # do the update on parameters
1343 |             f_update(lrate)
1344 | 
1345 |             ud = time.time() - ud_start
1346 | 
1347 |             # check for bad numbers, usually we remove non-finite elements
1348 |             # and continue training - but not done here
1349 |             if numpy.isnan(cost) or numpy.isinf(cost):
1350 |                 print 'NaN detected'
1351 |                 return 1., 1., 1.
1352 | 
1353 |             # verbose
1354 |             if numpy.mod(uidx, dispFreq) == 0:
1355 |                 print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud
1356 | 
1357 |             # save the best model so far, in addition, save the latest model
1358 |             # into a separate file with the iteration number for external eval
1359 |             if numpy.mod(uidx, saveFreq) == 0:
1360 |                 print 'Saving the best model...',
1361 |                 if best_p is not None:
1362 |                     params = best_p
1363 |                 else:
1364 |                     params = unzip(tparams)
1365 |                 numpy.savez(saveto, history_errs=history_errs, uidx=uidx, **params)
1366 |                 pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
1367 |                 print 'Done'
1368 | 
1369 |                 # save with uidx
1370 |                 if not overwrite:
1371 |                     print 'Saving the model at iteration {}...'.format(uidx),
1372 |                     saveto_uidx = '{}.iter{}.npz'.format(
1373 |                         os.path.splitext(saveto)[0], uidx)
1374 |                     numpy.savez(saveto_uidx, history_errs=history_errs,
1375 |                                 uidx=uidx, **unzip(tparams))
1376 |                     print 'Done'
1377 | 
1378 | 
1379 |             # generate some samples with the model and display them
1380 |             if numpy.mod(uidx, sampleFreq) == 0:
1381 |                 # FIXME: random selection?
1382 |                 for jj in xrange(numpy.minimum(5, x.shape[1])):
1383 |                     stochastic = True
1384 |                     word_map = list(set(list(x[:, jj][:, None].reshape(x[:, jj][:, None].shape[0]*x[:, jj][:, None].shape[1]))+word_map0))
1385 |                     sample, samplep, score, scorep = gen_sample(tparams, f_init, f_next,
1386 |                                                x[:, jj][:, None], xp[:, jj][:, None], word_map,
1387 |                                                model_options, trng=trng, k=1,
1388 |                                                maxlen=30,
1389 |                                                stochastic=stochastic,
1390 |                                                argmax=False)
1391 |                     print 'Source ', jj, ': ',
1392 |                     assert x.shape == xp.shape
1393 |                     for vv, vvp in zip(x[:, jj], xp[:,jj]):
1394 |                         if vv == 0:
1395 |                             break
1396 |                         if vv in worddicts_r[0]:
1397 |                             print worddicts_r[0][vv],
1398 |                             if vvp in pos_dicts_r:
1399 |                                 print '#'+str(pos_dicts_r[vvp]),
1400 |                         else:
1401 |                             print 'UNK',
1402 |                     print
1403 |                     print 'Truth ', jj, ' : ',
1404 |                     for vv, vvp in zip(y[:, jj], yp[:, jj]):
1405 |                         if vv == 0:
1406 |                             break
1407 |                         if vv in worddicts_r[1]:
1408 |                             print worddicts_r[1][vv],
1409 |                             if vvp in pos_dicts_r:
1410 |                                 print '#'+str(pos_dicts_r[vvp]),
1411 |                         else:
1412 |                             print 'UNK',
1413 |                     print
1414 |                     print 'Sample ', jj, ': ',
1415 |                     if stochastic:
1416 |                         ss = sample
1417 |                         ssp = samplep
1418 |                     else:
1419 |                         score = score / numpy.array([len(s) for s in sample])
1420 |                         scorep = scorep / numpy.array([len(s) for s in samplep])
1421 |                         ss = sample[score.argmin()]
1422 |                         ssp = samplep[score.argmin()]
1423 |                     for vv,vvp in zip(ss,ssp):
1424 |                         if vv == 0:
1425 |                             break
1426 |                         if vv in worddicts_r[1]:
1427 |                             print worddicts_r[1][vv],
1428 |                             if vvp in pos_dicts_r:
1429 |                                 print '#'+str(pos_dicts_r[vvp]),
1430 |                         else:
1431 |                             print 'UNK',
1432 |                     print
1433 | 
1434 |             # validate model on validation set and early stop if necessary
1435 |             if numpy.mod(uidx, validFreq) == 0:
1436 |                 use_noise.set_value(0.)
1437 |                 valid_errs = pred_probs(f_log_probs, prepare_data,
1438 |                                         model_options, valid)
1439 |                 valid_err = valid_errs.mean()
1440 |                 history_errs.append(valid_err)
1441 | 
1442 |                 if uidx == 0 or valid_err <= numpy.array(history_errs).min():
1443 |                     best_p = unzip(tparams)
1444 |                     bad_counter = 0
1445 |                 if len(history_errs) > patience and valid_err >= \
1446 |                         numpy.array(history_errs)[:-patience].min():
1447 |                     bad_counter += 1
1448 |                     if bad_counter > patience:
1449 |                         print 'Early Stop!'
1450 |                         estop = True
1451 |                         break
1452 | 
1453 |                 if numpy.isnan(valid_err):
1454 |                     # ipdb.set_trace()
1455 |                     print 1
1456 | 
1457 |                 print 'Valid ', valid_err
1458 | 
1459 |             # finish after this many updates
1460 |             if uidx >= finish_after:
1461 |                 print 'Finishing after %d iterations!' % uidx
1462 |                 estop = True
1463 |                 break
1464 |         # ipdb.set_trace()
1465 |         print 'Seen %d samples' % n_samples
1466 | 
1467 |         if estop:
1468 |             break
1469 | 
1470 |     if best_p is not None:
1471 |         zipp(best_p, tparams)
1472 | 
1473 |     use_noise.set_value(0.)
1474 |     valid_err = pred_probs(f_log_probs, prepare_data,
1475 |                            model_options, valid).mean()
1476 | 
1477 |     print 'Valid ', valid_err
1478 | 
1479 |     params = copy.copy(best_p)
1480 |     numpy.savez(saveto, zipped_params=best_p,
1481 |                 history_errs=history_errs,
1482 |                 uidx=uidx,
1483 |                 **params)
1484 |     
1485 | 
1486 |     return valid_err
1487 | 
1488 | 
1489 | if __name__ == '__main__':
1490 |     pass
1491 | 


--------------------------------------------------------------------------------
/Att_POS_CopyNet/train.py:
--------------------------------------------------------------------------------
 1 | import numpy
 2 | import os
 3 | import cPickle
 4 | 
 5 | from nmt_new_pos_word import train
 6 | 
 7 | def main(job_id, params):
 8 |     print params
 9 |     basedir = 'data_2'
10 |     validerr = train(saveto=params['model'][0],
11 |                                         reload_=params['reload'][0],
12 |                                         dim_word=params['dim_word'][0],
13 |                                         dim_pos=params['dim_pos'][0],
14 |                                         dim=params['dim'][0],
15 |                                         n_words=params['n-words'][0],
16 |                                         n_pos=params['n-pos'][0]+1,
17 |                                         n_words_src=params['n-words'][0],
18 |                                         decay_c=params['decay-c'][0],
19 |                                         clip_c=params['clip-c'][0],
20 |                                         lrate=params['learning-rate'][0],
21 |                                         optimizer=params['optimizer'][0],
22 |                                         maxlen=15,
23 |                                         batch_size=4,
24 |                                         valid_batch_size=1,
25 |                     datasets=['%s/p.txt'%basedir,
26 |                     '%s/p.txt'%basedir,
27 |                     '%s/p_pos.txt'%basedir,
28 |                     '%s/p_pos.txt'%basedir],
29 |                     valid_datasets=['%s/p.txt'%basedir,
30 |                     '%s/p.txt'%basedir,
31 |                     '%s/p_pos.txt'%basedir,
32 |                     '%s/p_pos.txt'%basedir],
33 |                     # dictionaries=['%s/p.txt.pkl'%basedir,
34 |                     # '%s/r.txt.pkl'%basedir],
35 |                     dictionaries=['%s/word_dict.pkl'%basedir,'%s/pos_dict.pkl'%basedir,'%s/dict2.txt'%basedir],
36 |                                         validFreq=50000,
37 |                                         dispFreq=1,
38 |                                         saveFreq=100,
39 |                                         sampleFreq=1,
40 |                                         use_dropout=params['use-dropout'][0],
41 |                                         overwrite=False)
42 |     return validerr
43 | 
44 | if __name__ == '__main__':
45 |     # f = cPickle.load(open(r'data//p.txt.pkl'))
46 |     # print f
47 | 
48 |     """
49 |     datasets:
50 | 
51 |     dictionaries:
52 |     OrderedDict([('eos', 0), ('UNK', 1), ('b', 2), ('c', 3), ('a', 4)])
53 |     OrderedDict([('eos', 0), ('UNK', 1), ('B', 2), ('C', 3), ('A', 4)])
54 | 
55 |     """
56 |     basedir = 'data_2'
57 |     main(0, {
58 |         'model': ['%s/model/m.npz'%basedir],
59 |         'dim_word': [100],#word embedding dim
60 |         'dim_pos': [100], #pos embedding dim
61 |         'dim': [100],     #hidden dim
62 |         'n-words': [6],   #vocabulary size
63 |         'n-pos':[6],      #pos tag set size
64 |         'optimizer': ['rmsprop'],
65 |         'decay-c': [0.],
66 |         'clip-c': [1.],
67 |         'use-dropout': [False],
68 |         'learning-rate': [0.01],
69 |         'reload': [False]})
70 | 
71 | 
72 | 


--------------------------------------------------------------------------------
/Att_Seq2Seq/Pdt.py:
--------------------------------------------------------------------------------
  1 | #-*- coding:utf-8 -*-
  2 | ##################################################################
  3 | #
  4 | #    Author: Chuwei Luo
  5 | #    Email: luochuwei@gmail.com
  6 | #    Date: 08/08/2016(Bug fixed)
  7 | #    Usage: For testing.
  8 | #
  9 | #
 10 | ##################################################################
 11 | import argparse
 12 | import theano
 13 | import numpy
 14 | import cPickle as pkl
 15 | 
 16 | from nmt_word_without_copy import (build_sampler, gen_sample, load_params,
 17 |                  init_params, init_tparams)
 18 | 
 19 | from multiprocessing import Process, Queue
 20 | 
 21 | 
 22 | 
 23 | 
 24 | def translate_model(word_map0, queue, rqueue, pid, model, options, k, normalize, n_best):
 25 | 
 26 |     from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
 27 |     trng = RandomStreams(1234)
 28 |     use_noise = theano.shared(numpy.float32(0.))
 29 | 
 30 |     # allocate model parameters
 31 |     params = init_params(options)
 32 | 
 33 |     # load model parameters and set theano shared variables
 34 |     params = load_params(model, params)
 35 |     tparams = init_tparams(params)
 36 | 
 37 |     # word index
 38 |     f_init, f_next = build_sampler(tparams, options, trng, use_noise)
 39 | 
 40 |     def _translate(seq):
 41 |         xx = numpy.array(seq).reshape([len(seq), 1])
 42 |         word_map = list(set(list(xx.reshape(xx.shape[0]*xx.shape[1]))+word_map0))
 43 |         # sample given an input sequence and obtain scores
 44 |         sample, score = gen_sample(tparams, f_init, f_next,
 45 |                                    xx, word_map,
 46 |                                    options, trng=trng, k=k, maxlen=200,
 47 |                                    stochastic=False, argmax=False)
 48 | 
 49 |         # normalize scores according to sequence lengths
 50 |         if normalize:
 51 |             lengths = numpy.array([len(s) for s in sample])
 52 |             score = score / lengths
 53 |         if n_best > 1:
 54 |             sidx = numpy.argsort(score)[:n_best]
 55 | 
 56 |         else:
 57 |             sidx = numpy.argmin(score)
 58 |         # return numpy.array(word_map)[sample[sidx]], numpy.array(score)[sidx]
 59 | 
 60 |         return numpy.array(sample)[sidx], numpy.array(score)[sidx]
 61 |         # return numpy.array(word_map)[sample[sidx]], numpy.array(score)[sidx]
 62 | 
 63 |     while True:
 64 |         req = queue.get()
 65 |         if req is None:
 66 |             break
 67 | 
 68 |         idx, x = req[0], req[1]
 69 |         print pid, '-', idx
 70 |         seq, scores = _translate(x)
 71 | 
 72 |         rqueue.put((idx, seq, scores))
 73 | 
 74 |     return
 75 | 
 76 | 
 77 | def predict(model, dictionary, common_dictionary, source_file, saveto, k=5,
 78 |          normalize=False, n_process=5, chr_level=False, n_best=1):
 79 | 
 80 |     # load model model_options
 81 |     with open('%s.pkl' % model, 'rb') as f:
 82 |         options = pkl.load(f)
 83 | 
 84 |     # load source dictionary and invert
 85 |     with open(dictionary, 'rb') as f:
 86 |         word_dict = pkl.load(f)
 87 |     word_idict = dict()
 88 |     for kk, vv in word_dict.iteritems():
 89 |         word_idict[vv] = kk
 90 |     word_idict[0] = '<eos>'
 91 |     word_idict[1] = 'UNK'
 92 | 
 93 |     word_idict_trg = word_idict
 94 |     # load target dictionary and invert
 95 |     # with open(dictionary_target, 'rb') as f:
 96 |     #     word_dict_trg = pkl.load(f)
 97 |     # word_idict_trg = dict()
 98 |     # for kk, vv in word_dict_trg.iteritems():
 99 |     #     word_idict_trg[vv] = kk
100 |     # word_idict_trg[0] = '<eos>'
101 |     # word_idict_trg[1] = 'UNK'
102 | 
103 |     word_map0 = []
104 |     with open(common_dictionary) as ff:
105 |         for line in ff:
106 |             line = line.strip()
107 |             if line in word_dict:
108 |                 if line not in word_map0:
109 |                     word_map0.append(word_dict[line])
110 | 
111 |     # create input and output queues for processes
112 |     queue = Queue()
113 |     rqueue = Queue()
114 |     processes = [None] * n_process
115 |     for midx in xrange(n_process):
116 |         processes[midx] = Process(
117 |             target=translate_model,
118 |             args=(word_map0, queue, rqueue, midx, model, options, k, normalize, n_best))
119 |         processes[midx].start()
120 | 
121 |     # utility function
122 |     def _seqs2words(caps):
123 |         capsw = []
124 |         for cc in caps:
125 |             ww = []
126 |             for w in cc:
127 |                 if w == 0:
128 |                     break
129 |                 ww.append(word_idict_trg[w])
130 |             capsw.append(' '.join(ww))
131 |         return capsw
132 | 
133 |     def _send_jobs(fname):
134 |         with open(fname, 'r') as f:
135 |             for idx, line in enumerate(f):
136 |                 if chr_level:
137 |                     words = list(line.decode('utf-8').strip())
138 |                 else:
139 |                     words = line.strip().split()
140 |                 x = map(lambda w: word_dict[w] if w in word_dict else 1, words)
141 |                 x = map(lambda ii: ii if ii < options['n_words'] else 1, x)
142 |                 x += [0]
143 |                 queue.put((idx, x))
144 |         return idx+1
145 | 
146 |     def _finish_processes():
147 |         for midx in xrange(n_process):
148 |             queue.put(None)
149 | 
150 |     def _retrieve_jobs(n_samples):
151 |         trans = [None] * n_samples
152 |         scores = [None] * n_samples
153 |         for idx in xrange(n_samples):
154 |             resp = rqueue.get()
155 |             trans[resp[0]] = resp[1]
156 |             scores[resp[0]] = resp[2]
157 |             if numpy.mod(idx, 10) == 0:
158 |                 print 'Sample ', (idx+1), '/', n_samples, ' Done'
159 |         return trans, scores
160 | 
161 |     print 'Translating ', source_file, '...'
162 |     n_samples = _send_jobs(source_file)
163 |     trans, scores = _retrieve_jobs(n_samples)
164 |     _finish_processes()
165 | 
166 |     if n_best == 1:
167 |         trans = _seqs2words(trans)
168 |     else:
169 |         n_best_trans = []
170 |         for idx, (n_best_tr, score_) in enumerate(zip(trans, scores)):
171 |             sentences = _seqs2words(n_best_tr)
172 |             for ids, trans_ in enumerate(sentences):
173 |                 n_best_trans.append(
174 |                     '|||'.join(
175 |                         ['{}'.format(idx), trans_,
176 |                          '{}'.format(score_[ids])]))
177 |         trans = n_best_trans
178 | 
179 |     with open(saveto, 'w') as f:
180 |         print >>f, '\n'.join(trans)
181 |     print 'Done'
182 | 
183 | 
184 | if __name__ == "__main__":
185 |     parser = argparse.ArgumentParser()
186 |     parser.add_argument('-k', type=int, default=5, help="Beam size")
187 |     parser.add_argument('-p', type=int, default=5, help="Number of processes")
188 |     parser.add_argument('-n', action="store_true", default=False,
189 |                         help="Normalize wrt sequence length")
190 |     parser.add_argument('-c', action="store_true", default=False,
191 |                         help="Character level")
192 |     parser.add_argument('-b', type=int, default=1, help="Output n-best list")
193 |     parser.add_argument('model', type=str)
194 |     parser.add_argument('dictionary', type=str)
195 |     parser.add_argument('common_dictionary', type=str)
196 |     parser.add_argument('source', type=str)
197 |     parser.add_argument('saveto', type=str)
198 | 
199 |     args = parser.parse_args()
200 | 
201 |     main(args.model, args.dictionary, args.common_dictionary, args.source,
202 |          args.saveto, k=args.k, normalize=args.n, n_process=args.p,
203 |          chr_level=args.c, n_best=args.b)
204 | 


--------------------------------------------------------------------------------
/Att_Seq2Seq/Pdt_windows.py:
--------------------------------------------------------------------------------
 1 | #-*- coding:utf-8 -*-
 2 | ##################################################################
 3 | #
 4 | #    Author: Chuwei Luo
 5 | #    Email: luochuwei@gmail.com
 6 | #    Date: 08/08/2016(Bug fixed)
 7 | #    Usage: For windows predict
 8 | #
 9 | ##################################################################
10 | import Pdt as TTT
11 | 
12 | if __name__ == '__main__':
13 |     TTT.predict(r'data_2/model/m.npz', r'data_2/word_dict.pkl', r'data_2/dict2.txt', r'data_2/p.txt', r'data_2/ttt.txt', k=5, n_process=1)
14 | 
15 | 
16 | 


--------------------------------------------------------------------------------
/Att_Seq2Seq/README.md:
--------------------------------------------------------------------------------
1 | # Standard Model
2 | 
3 | Standard Seq2Seq Attention Model
4 | 
5 | (For training faster, the output vocabulary is built according to the input and a extra dictionary)
6 | 


--------------------------------------------------------------------------------
/Att_Seq2Seq/Seq2SeqAtt.py:
--------------------------------------------------------------------------------
 1 | #-*- coding:utf-8 -*-
 2 | ##################################################################
 3 | #
 4 | #    Author: Chuwei Luo
 5 | #    Email: luochuwei@gmail.com
 6 | #    Date: 08/08/2016(Bug fixed)
 7 | #    Usage: Seq2Seq Att, the output vocabulary is built
 8 | #           according to the input and a extra dictionary
 9 | #
10 | ##################################################################
11 | 
12 | import numpy
13 | import os
14 | import cPickle
15 | 
16 | from nmt_word_without_copy import train
17 | 
18 | def main(job_id, params):
19 |     print params
20 |     basedir = 'data_2'
21 |     validerr = train(saveto=params['model'][0],
22 |                                         reload_=params['reload'][0],
23 |                                         dim_word=params['dim_word'][0],
24 |                                         dim=params['dim'][0],
25 |                                         n_words=params['n-words'][0],
26 |                                         n_words_src=params['n-words'][0],
27 |                                         decay_c=params['decay-c'][0],
28 |                                         clip_c=params['clip-c'][0],
29 |                                         lrate=params['learning-rate'][0],
30 |                                         optimizer=params['optimizer'][0],
31 |                                         maxlen=100,
32 |                                         batch_size=32,
33 |                                         valid_batch_size=32,
34 |                     datasets=['%s/p.txt'%basedir,
35 |                     '%s/r.txt'%basedir],
36 |                     valid_datasets=['%s/p.txt'%basedir,
37 |                     '%s/r.txt'%basedir,],
38 |                     # dictionaries=['%s/p.txt.pkl'%basedir,
39 |                     # '%s/r.txt.pkl'%basedir],
40 |                     dictionaries=['%s/word_dict.pkl'%basedir,'%s/dict2.txt'%basedir],
41 |                                         validFreq=50000,
42 |                                         dispFreq=1,
43 |                                         saveFreq=100,
44 |                                         sampleFreq=1,
45 |                                         use_dropout=params['use-dropout'][0],
46 |                                         overwrite=False)
47 |     return validerr
48 | 
49 | if __name__ == '__main__':
50 |     # f = cPickle.load(open(r'data//p.txt.pkl'))
51 |     # print f
52 | 
53 |     """
54 |     datasets:
55 | 
56 |     dictionaries:
57 |     OrderedDict([('eos', 0), ('UNK', 1), ('b', 2), ('c', 3), ('a', 4)])
58 |     OrderedDict([('eos', 0), ('UNK', 1), ('B', 2), ('C', 3), ('A', 4)])
59 | 
60 |     """
61 |     basedir = 'data_2'
62 |     main(0, {
63 |         'model': ['%s/model/m.npz'%basedir],
64 |         'dim_word': [512],#word embedding dim
65 |         'dim': [512],     #hidden dim
66 |         'n-words': [50000],   #vocabulary size
67 |         'optimizer': ['rmsprop'],
68 |         'decay-c': [0.],
69 |         'clip-c': [1.],
70 |         'use-dropout': [False],
71 |         'learning-rate': [0.01],
72 |         'reload': [False]})
73 | 
74 | 
75 | 


--------------------------------------------------------------------------------
/Att_Seq2Seq/data/pp.txt:
--------------------------------------------------------------------------------
1 | let us play dota2
2 | i like playing basketball
3 | go go go pokemon go


--------------------------------------------------------------------------------
/Att_Seq2Seq/data/pp.txt.pkl:
--------------------------------------------------------------------------------
 1 | ccollections
 2 | OrderedDict
 3 | p1
 4 | ((lp2
 5 | (lp3
 6 | S'eos'
 7 | p4
 8 | aI0
 9 | aa(lp5
10 | S'UNK'
11 | p6
12 | aI1
13 | aa(lp7
14 | S'go'
15 | p8
16 | aI2
17 | aa(lp9
18 | S'pokemon'
19 | p10
20 | aI3
21 | aa(lp11
22 | S'basketball'
23 | p12
24 | aI4
25 | aa(lp13
26 | S'playing'
27 | p14
28 | aI5
29 | aa(lp15
30 | S'like'
31 | p16
32 | aI6
33 | aa(lp17
34 | S'i'
35 | aI7
36 | aa(lp18
37 | S'dota2'
38 | p19
39 | aI8
40 | aa(lp20
41 | S'play'
42 | p21
43 | aI9
44 | aa(lp22
45 | S'us'
46 | p23
47 | aI10
48 | aa(lp24
49 | S'let'
50 | p25
51 | aI11
52 | aatRp26
53 | .


--------------------------------------------------------------------------------
/Att_Seq2Seq/data/ppv.txt:
--------------------------------------------------------------------------------
1 | dota2


--------------------------------------------------------------------------------
/Att_Seq2Seq/data/rr.txt:
--------------------------------------------------------------------------------
1 | play dota2
2 | basketball
3 | pokemon go


--------------------------------------------------------------------------------
/Att_Seq2Seq/data/rr.txt.pkl:
--------------------------------------------------------------------------------
 1 | ccollections
 2 | OrderedDict
 3 | p1
 4 | ((lp2
 5 | (lp3
 6 | S'eos'
 7 | p4
 8 | aI0
 9 | aa(lp5
10 | S'UNK'
11 | p6
12 | aI1
13 | aa(lp7
14 | S'go'
15 | p8
16 | aI2
17 | aa(lp9
18 | S'pokemon'
19 | p10
20 | aI3
21 | aa(lp11
22 | S'basketball'
23 | p12
24 | aI4
25 | aa(lp13
26 | S'dota2'
27 | p14
28 | aI5
29 | aa(lp15
30 | S'play'
31 | p16
32 | aI6
33 | aatRp17
34 | .


--------------------------------------------------------------------------------
/Att_Seq2Seq/data/rrv.txt:
--------------------------------------------------------------------------------
1 | dota2


--------------------------------------------------------------------------------
/Att_Seq2Seq/data_iterator.py:
--------------------------------------------------------------------------------
  1 | #-*- coding:utf-8 -*-
  2 | ##################################################################
  3 | #
  4 | #    Author: Chuwei Luo
  5 | #    Email: luochuwei@gmail.com
  6 | #    Date: 26/07/2016
  7 | #    Usage: data iterator
  8 | #
  9 | ##################################################################
 10 | import cPickle as pkl
 11 | import gzip
 12 | 
 13 | 
 14 | def fopen(filename, mode='r'):
 15 |     if filename.endswith('.gz'):
 16 |         return gzip.open(filename, mode)
 17 |     return open(filename, mode)
 18 | 
 19 | 
 20 | class TextIterator:
 21 |     """Simple Bitext iterator."""
 22 |     def __init__(self, source, target,
 23 |                  source_dict, target_dict,
 24 |                  batch_size=128,
 25 |                  maxlen=100,
 26 |                  n_words_source=-1,
 27 |                  n_words_target=-1):
 28 |         self.source = fopen(source, 'r')
 29 |         self.target = fopen(target, 'r')
 30 |         with open(source_dict, 'rb') as f:
 31 |             self.source_dict = pkl.load(f)
 32 |         with open(target_dict, 'rb') as f:
 33 |             self.target_dict = pkl.load(f)
 34 | 
 35 |         self.batch_size = batch_size
 36 |         self.maxlen = maxlen
 37 | 
 38 |         self.n_words_source = n_words_source
 39 |         self.n_words_target = n_words_target
 40 | 
 41 |         self.end_of_data = False
 42 | 
 43 |     def __iter__(self):
 44 |         return self
 45 | 
 46 |     def reset(self):
 47 |         self.source.seek(0)
 48 |         self.target.seek(0)
 49 | 
 50 |     def next(self):
 51 |         if self.end_of_data:
 52 |             self.end_of_data = False
 53 |             self.reset()
 54 |             raise StopIteration
 55 | 
 56 |         source = []
 57 |         target = []
 58 | 
 59 |         try:
 60 | 
 61 |             # actual work here
 62 |             while True:
 63 | 
 64 |                 # read from source file and map to word index
 65 |                 ss = self.source.readline()
 66 |                 if ss == "":
 67 |                     raise IOError
 68 |                 ss = ss.strip().split()
 69 |                 ss = [self.source_dict[w] if w in self.source_dict else 1
 70 |                       for w in ss]
 71 |                 if self.n_words_source > 0:
 72 |                     ss = [w if w < self.n_words_source else 1 for w in ss]
 73 | 
 74 |                 # read from source file and map to word index
 75 |                 tt = self.target.readline()
 76 |                 if tt == "":
 77 |                     raise IOError
 78 |                 tt = tt.strip().split()
 79 |                 tt = [self.target_dict[w] if w in self.target_dict else 1
 80 |                       for w in tt]
 81 |                 if self.n_words_target > 0:
 82 |                     tt = [w if w < self.n_words_target else 1 for w in tt]
 83 | 
 84 |                 if len(ss) > self.maxlen and len(tt) > self.maxlen:
 85 |                     continue
 86 | 
 87 |                 source.append(ss)
 88 |                 target.append(tt)
 89 | 
 90 |                 if len(source) >= self.batch_size or \
 91 |                         len(target) >= self.batch_size:
 92 |                     break
 93 |         except IOError:
 94 |             self.end_of_data = True
 95 | 
 96 |         if len(source) <= 0 or len(target) <= 0:
 97 |             self.end_of_data = False
 98 |             self.reset()
 99 |             raise StopIteration
100 | 
101 |         return source, target
102 | 


--------------------------------------------------------------------------------
/Att_Seq2Seq/train.py:
--------------------------------------------------------------------------------
 1 | #-*- coding:utf-8 -*-
 2 | ##################################################################
 3 | #
 4 | #    Author: Chuwei Luo
 5 | #    Email: luochuwei@gmail.com
 6 | #    Date: 26/07/2016
 7 | #    Usage: Training
 8 | #
 9 | ##################################################################
10 | 
11 | import numpy
12 | import os
13 | import cPickle
14 | 
15 | from Seq2SeqAtt import train
16 | 
17 | def main(job_id, params):
18 |     print params
19 |     basedir = 'data'
20 |     validerr = train(saveto=params['model'][0], reload_=params['reload'][0], dim_word=params['dim_word'][0], dim=params['dim'][0], n_words=params['n-words'][0], n_words_src=params['n-words'][0], decay_c=params['decay-c'][0], clip_c=params['clip-c'][0], lrate=params['learning-rate'][0], optimizer=params['optimizer'][0], maxlen=15, batch_size=1, valid_batch_size=1, datasets=['%s/ppp.txt'%basedir, '%s/ppp.txt'%basedir], valid_datasets=['%s/pv.txt'%basedir,
21 |                     '%s/pv.txt'%basedir],  dictionaries=['%s/p.txt.pkl'%basedir], validFreq=500000, dispFreq=1, saveFreq=100, sampleFreq=1, use_dropout=params['use-dropout'][0], overwrite=False)
22 |     return validerr
23 | 
24 | if __name__ == '__main__':
25 |     # f = cPickle.load(open(r'data//p.txt.pkl'))
26 |     # print f
27 |     """
28 |     datasets:
29 | 
30 |     dictionaries:
31 |     OrderedDict([('eos', 0), ('UNK', 1), ('b', 2), ('c', 3), ('a', 4)])
32 |     OrderedDict([('eos', 0), ('UNK', 1), ('B', 2), ('C', 3), ('A', 4)])
33 | 
34 |     """
35 |     basedir = 'data'
36 |     main(0, {
37 |         'model': ['%s/model/m.model'%basedir],
38 |         'dim_word': [15],
39 |         'dim': [24],
40 |         'n-words': [6],
41 |         'optimizer': ['rmsprop'],
42 |         'decay-c': [0.],
43 |         'clip-c': [1.],
44 |         'use-dropout': [False],
45 |         'learning-rate': [0.01],
46 |         'reload': [False]})
47 | 
48 | 
49 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Attention_CopyNet
 2 | 
 3 | Attention_CopyNet for summarization and response generation
 4 | 
 5 | Att_Seq2Seq(Finished)
 6 | 
 7 | Att_CopyNet(Finished)
 8 | 
 9 | Att_POS_CopyNet(Finished)
10 | 


--------------------------------------------------------------------------------