├── .gitattributes ├── .gitignore ├── Att_CopyNet ├── AttCopy.py ├── Att_CopyNet_copy_supervision │ ├── Att_copy_s.py │ ├── build_dictionary.py │ ├── data_iterator.py │ ├── train.py │ ├── translate.py │ └── translate_Windows.py ├── README.md ├── build_dictionary.py ├── data_2 │ ├── dict2.txt │ ├── p.txt │ ├── r.txt │ ├── ttt.txt │ └── word_dict.pkl ├── data_iterator.py ├── predict.py ├── predict_windows.py └── train.py ├── Att_POS_CopyNet ├── README.md ├── build_dictionary.py ├── data_2 │ ├── dict2.txt │ ├── p.txt │ ├── p_pos.txt │ ├── pos_dict.pkl │ ├── r.txt │ ├── r_pos.txt │ └── word_dict.pkl ├── data_iterator_for_pos.py ├── nmt_new_pos_word.py └── train.py ├── Att_Seq2Seq ├── Pdt.py ├── Pdt_windows.py ├── README.md ├── Seq2SeqAtt.py ├── data │ ├── pp.txt │ ├── pp.txt.pkl │ ├── ppv.txt │ ├── rr.txt │ ├── rr.txt.pkl │ └── rrv.txt ├── data_iterator.py └── train.py └── README.md /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 19 | 20 | # ========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear in the root of a volume 35 | .DocumentRevisions-V100 36 | .fseventsd 37 | .Spotlight-V100 38 | .TemporaryItems 39 | .Trashes 40 | .VolumeIcon.icns 41 | 42 | # Directories potentially created on remote AFP share 43 | .AppleDB 44 | .AppleDesktop 45 | Network Trash Folder 46 | Temporary Items 47 | .apdisk 48 | -------------------------------------------------------------------------------- /Att_CopyNet/AttCopy.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | ############################################ 3 | # 4 | # Author: Chuwei Luo 5 | # Email: luochuwei@gmail.com 6 | # Date: 18/08/2016 7 | # Usage: copy net 8 | # 9 | ############################################ 10 | import theano 11 | import theano.tensor as tensor 12 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 13 | 14 | import cPickle as pkl 15 | # import ipdb 16 | import numpy 17 | import copy 18 | 19 | import os 20 | import warnings 21 | import sys 22 | import time 23 | 24 | from collections import OrderedDict 25 | 26 | from data_iterator import TextIterator 27 | 28 | profile = False 29 | 30 | 31 | # push parameters to Theano shared variables 32 | def zipp(params, tparams): 33 | for kk, vv in params.iteritems(): 34 | tparams[kk].set_value(vv) 35 | 36 | 37 | # pull parameters from Theano shared variables 38 | def 
unzip(zipped): 39 | new_params = OrderedDict() 40 | for kk, vv in zipped.iteritems(): 41 | new_params[kk] = vv.get_value() 42 | return new_params 43 | 44 | 45 | # get the list of parameters: Note that tparams must be OrderedDict 46 | def itemlist(tparams): 47 | return [vv for kk, vv in tparams.iteritems()] 48 | 49 | 50 | # dropout 51 | def dropout_layer(state_before, use_noise, trng): 52 | proj = tensor.switch( 53 | use_noise, 54 | state_before * trng.binomial(state_before.shape, p=0.5, n=1, 55 | dtype=state_before.dtype), 56 | state_before * 0.5) 57 | return proj 58 | 59 | 60 | # make prefix-appended name 61 | def _p(pp, name): 62 | return '%s_%s' % (pp, name) 63 | 64 | 65 | # initialize Theano shared variables according to the initial parameters 66 | def init_tparams(params): 67 | tparams = OrderedDict() 68 | for kk, pp in params.iteritems(): 69 | tparams[kk] = theano.shared(params[kk], name=kk) 70 | return tparams 71 | 72 | 73 | # load parameters 74 | def load_params(path, params): 75 | pp = numpy.load(path) 76 | for kk, vv in params.iteritems(): 77 | if kk not in pp: 78 | warnings.warn('%s is not in the archive' % kk) 79 | continue 80 | params[kk] = pp[kk] 81 | 82 | return params 83 | 84 | # layers: 'name': ('parameter initializer', 'feedforward') 85 | layers = {'ff': ('param_init_fflayer', 'fflayer'), 86 | 'gru': ('param_init_gru', 'gru_layer'), 87 | 'gru_cond': ('param_init_gru_cond', 'gru_cond_layer'), 88 | } 89 | 90 | 91 | def get_layer(name): 92 | fns = layers[name] 93 | return (eval(fns[0]), eval(fns[1])) 94 | 95 | 96 | # some utilities 97 | def ortho_weight(ndim): 98 | W = numpy.random.randn(ndim, ndim) 99 | u, s, v = numpy.linalg.svd(W) 100 | return u.astype('float32') 101 | 102 | 103 | def norm_weight(nin, nout=None, scale=0.01, ortho=True): 104 | if nout is None: 105 | nout = nin 106 | if nout == nin and ortho: 107 | W = ortho_weight(nin) 108 | else: 109 | W = scale * numpy.random.randn(nin, nout) 110 | return W.astype('float32') 111 | 112 | 113 | def tanh(x): 114 | return tensor.tanh(x) 115 | 116 | 117 | def linear(x): 118 | return x 119 | 120 | 121 | def concatenate(tensor_list, axis=0): 122 | """ 123 | Alternative implementation of `theano.tensor.concatenate`. 124 | This function does exactly the same thing, but contrary to Theano's own 125 | implementation, the gradient is implemented on the GPU. 126 | Backpropagating through `theano.tensor.concatenate` yields slowdowns 127 | because the inverse operation (splitting) needs to be done on the CPU. 128 | This implementation does not have that problem. 129 | :usage: 130 | >>> x, y = theano.tensor.matrices('x', 'y') 131 | >>> c = concatenate([x, y], axis=1) 132 | :parameters: 133 | - tensor_list : list 134 | list of Theano tensor expressions that should be concatenated. 135 | - axis : int 136 | the tensors will be joined along this axis. 137 | :returns: 138 | - out : tensor 139 | the concatenated tensor expression. 
140 | """ 141 | concat_size = sum(tt.shape[axis] for tt in tensor_list) 142 | 143 | output_shape = () 144 | for k in range(axis): 145 | output_shape += (tensor_list[0].shape[k],) 146 | output_shape += (concat_size,) 147 | for k in range(axis + 1, tensor_list[0].ndim): 148 | output_shape += (tensor_list[0].shape[k],) 149 | 150 | out = tensor.zeros(output_shape) 151 | offset = 0 152 | for tt in tensor_list: 153 | indices = () 154 | for k in range(axis): 155 | indices += (slice(None),) 156 | indices += (slice(offset, offset + tt.shape[axis]),) 157 | for k in range(axis + 1, tensor_list[0].ndim): 158 | indices += (slice(None),) 159 | 160 | out = tensor.set_subtensor(out[indices], tt) 161 | offset += tt.shape[axis] 162 | 163 | return out 164 | 165 | 166 | # batch preparation 167 | def prepare_data(seqs_x, seqs_y, maxlen=None, n_words_src=30000, 168 | n_words=30000): 169 | # x: a list of sentences 170 | lengths_x = [len(s) for s in seqs_x] 171 | lengths_y = [len(s) for s in seqs_y] 172 | 173 | if maxlen is not None: 174 | new_seqs_x = [] 175 | new_seqs_y = [] 176 | new_lengths_x = [] 177 | new_lengths_y = [] 178 | for l_x, s_x, l_y, s_y in zip(lengths_x, seqs_x, lengths_y, seqs_y): 179 | if l_x < maxlen and l_y < maxlen: 180 | new_seqs_x.append(s_x) 181 | new_lengths_x.append(l_x) 182 | new_seqs_y.append(s_y) 183 | new_lengths_y.append(l_y) 184 | lengths_x = new_lengths_x 185 | seqs_x = new_seqs_x 186 | lengths_y = new_lengths_y 187 | seqs_y = new_seqs_y 188 | 189 | if len(lengths_x) < 1 or len(lengths_y) < 1: 190 | return None, None, None, None 191 | 192 | n_samples = len(seqs_x) 193 | maxlen_x = numpy.max(lengths_x) + 1 194 | maxlen_y = numpy.max(lengths_y) + 1 195 | 196 | x = numpy.zeros((maxlen_x, n_samples)).astype('int64') 197 | y = numpy.zeros((maxlen_y, n_samples)).astype('int64') 198 | x_mask = numpy.zeros((maxlen_x, n_samples)).astype('float32') 199 | y_mask = numpy.zeros((maxlen_y, n_samples)).astype('float32') 200 | for idx, [s_x, s_y] in enumerate(zip(seqs_x, seqs_y)): 201 | x[:lengths_x[idx], idx] = s_x 202 | x_mask[:lengths_x[idx]+1, idx] = 1. 203 | y[:lengths_y[idx], idx] = s_y 204 | y_mask[:lengths_y[idx]+1, idx] = 1. 
205 | 206 | return x, x_mask, y, y_mask 207 | 208 | 209 | # feedforward layer: affine transformation + point-wise nonlinearity 210 | def param_init_fflayer(options, params, prefix='ff', nin=None, nout=None, 211 | ortho=True): 212 | if nin is None: 213 | nin = options['dim_proj'] 214 | if nout is None: 215 | nout = options['dim_proj'] 216 | params[_p(prefix, 'W')] = norm_weight(nin, nout, scale=0.01, ortho=ortho) 217 | params[_p(prefix, 'b')] = numpy.zeros((nout,)).astype('float32') 218 | 219 | return params 220 | 221 | 222 | def fflayer(tparams, state_below, options, prefix='rconv', 223 | activ='lambda x: tensor.tanh(x)', **kwargs): 224 | return eval(activ)( 225 | tensor.dot(state_below, tparams[_p(prefix, 'W')]) + 226 | tparams[_p(prefix, 'b')]) 227 | 228 | 229 | # GRU layer 230 | def param_init_gru(options, params, prefix='gru', nin=None, dim=None): 231 | if nin is None: 232 | nin = options['dim_proj'] 233 | if dim is None: 234 | dim = options['dim_proj'] 235 | 236 | # embedding to gates transformation weights, biases 237 | W = numpy.concatenate([norm_weight(nin, dim), 238 | norm_weight(nin, dim)], axis=1) 239 | params[_p(prefix, 'W')] = W 240 | params[_p(prefix, 'b')] = numpy.zeros((2 * dim,)).astype('float32') 241 | 242 | # recurrent transformation weights for gates 243 | U = numpy.concatenate([ortho_weight(dim), 244 | ortho_weight(dim)], axis=1) 245 | params[_p(prefix, 'U')] = U 246 | 247 | # embedding to hidden state proposal weights, biases 248 | Wx = norm_weight(nin, dim) 249 | params[_p(prefix, 'Wx')] = Wx 250 | params[_p(prefix, 'bx')] = numpy.zeros((dim,)).astype('float32') 251 | 252 | # recurrent transformation weights for hidden state proposal 253 | Ux = ortho_weight(dim) 254 | params[_p(prefix, 'Ux')] = Ux 255 | 256 | return params 257 | 258 | 259 | def gru_layer(tparams, state_below, options, prefix='gru', mask=None, 260 | **kwargs): 261 | nsteps = state_below.shape[0] 262 | if state_below.ndim == 3: 263 | n_samples = state_below.shape[1] 264 | else: 265 | n_samples = 1 266 | 267 | dim = tparams[_p(prefix, 'Ux')].shape[1] 268 | 269 | if mask is None: 270 | mask = tensor.alloc(1., state_below.shape[0], 1) 271 | 272 | # utility function to slice a tensor 273 | def _slice(_x, n, dim): 274 | if _x.ndim == 3: 275 | return _x[:, :, n*dim:(n+1)*dim] 276 | return _x[:, n*dim:(n+1)*dim] 277 | 278 | # state_below is the input word embeddings 279 | # input to the gates, concatenated 280 | state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + \ 281 | tparams[_p(prefix, 'b')] 282 | # input to compute the hidden state proposal 283 | state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + \ 284 | tparams[_p(prefix, 'bx')] 285 | 286 | # step function to be used by scan 287 | # arguments | sequences |outputs-info| non-seqs 288 | def _step_slice(m_, x_, xx_, h_, U, Ux): 289 | preact = tensor.dot(h_, U) 290 | preact += x_ 291 | 292 | # reset and update gates 293 | r = tensor.nnet.sigmoid(_slice(preact, 0, dim)) 294 | u = tensor.nnet.sigmoid(_slice(preact, 1, dim)) 295 | 296 | # compute the hidden state proposal 297 | preactx = tensor.dot(h_, Ux) 298 | preactx = preactx * r 299 | preactx = preactx + xx_ 300 | 301 | # hidden state proposal 302 | h = tensor.tanh(preactx) 303 | 304 | # leaky integrate and obtain next hidden state 305 | h = u * h_ + (1. - u) * h 306 | h = m_[:, None] * h + (1. 
- m_)[:, None] * h_ 307 | 308 | return h 309 | 310 | # prepare scan arguments 311 | seqs = [mask, state_below_, state_belowx] 312 | init_states = [tensor.alloc(0., n_samples, dim)] 313 | _step = _step_slice 314 | shared_vars = [tparams[_p(prefix, 'U')], 315 | tparams[_p(prefix, 'Ux')]] 316 | 317 | rval, updates = theano.scan(_step, 318 | sequences=seqs, 319 | outputs_info=init_states, 320 | non_sequences=shared_vars, 321 | name=_p(prefix, '_layers'), 322 | n_steps=nsteps, 323 | profile=profile, 324 | strict=True) 325 | rval = [rval] 326 | return rval 327 | 328 | 329 | # Conditional GRU layer with Attention 330 | def param_init_gru_cond(options, params, prefix='gru_cond', 331 | nin=None, dim=None, dimctx=None, 332 | nin_nonlin=None, dim_nonlin=None): 333 | if nin is None: 334 | nin = options['dim'] 335 | if dim is None: 336 | dim = options['dim'] 337 | if dimctx is None: 338 | dimctx = options['dim'] 339 | if nin_nonlin is None: 340 | nin_nonlin = nin 341 | if dim_nonlin is None: 342 | dim_nonlin = dim 343 | 344 | W = numpy.concatenate([norm_weight(nin, dim), 345 | norm_weight(nin, dim)], axis=1) 346 | params[_p(prefix, 'W')] = W 347 | params[_p(prefix, 'b')] = numpy.zeros((2 * dim,)).astype('float32') 348 | U = numpy.concatenate([ortho_weight(dim_nonlin), 349 | ortho_weight(dim_nonlin)], axis=1) 350 | params[_p(prefix, 'U')] = U 351 | 352 | Wx = norm_weight(nin_nonlin, dim_nonlin) 353 | params[_p(prefix, 'Wx')] = Wx 354 | Ux = ortho_weight(dim_nonlin) 355 | params[_p(prefix, 'Ux')] = Ux 356 | params[_p(prefix, 'bx')] = numpy.zeros((dim_nonlin,)).astype('float32') 357 | 358 | U_nl = numpy.concatenate([ortho_weight(dim_nonlin), 359 | ortho_weight(dim_nonlin)], axis=1) 360 | params[_p(prefix, 'U_nl')] = U_nl 361 | params[_p(prefix, 'b_nl')] = numpy.zeros((2 * dim_nonlin,)).astype('float32') 362 | 363 | Ux_nl = ortho_weight(dim_nonlin) 364 | params[_p(prefix, 'Ux_nl')] = Ux_nl 365 | params[_p(prefix, 'bx_nl')] = numpy.zeros((dim_nonlin,)).astype('float32') 366 | 367 | # context to LSTM 368 | Wc = norm_weight(dimctx, dim*2) 369 | params[_p(prefix, 'Wc')] = Wc 370 | 371 | Wcx = norm_weight(dimctx, dim) 372 | params[_p(prefix, 'Wcx')] = Wcx 373 | 374 | # attention: combined -> hidden 375 | W_comb_att = norm_weight(dim, dimctx) 376 | params[_p(prefix, 'W_comb_att')] = W_comb_att 377 | 378 | # attention: context -> hidden 379 | Wc_att = norm_weight(dimctx) 380 | params[_p(prefix, 'Wc_att')] = Wc_att 381 | 382 | # attention: hidden bias 383 | b_att = numpy.zeros((dimctx,)).astype('float32') 384 | params[_p(prefix, 'b_att')] = b_att 385 | 386 | # attention: 387 | U_att = norm_weight(dimctx, 1) 388 | params[_p(prefix, 'U_att')] = U_att 389 | c_att = numpy.zeros((1,)).astype('float32') 390 | params[_p(prefix, 'c_tt')] = c_att 391 | 392 | return params 393 | 394 | 395 | def gru_cond_layer(tparams, state_below, options, prefix='gru', 396 | mask=None, context=None, one_step=False, 397 | init_memory=None, init_state=None, 398 | context_mask=None, 399 | **kwargs): 400 | 401 | assert context, 'Context must be provided' 402 | 403 | if one_step: 404 | assert init_state, 'previous state must be provided' 405 | 406 | nsteps = state_below.shape[0] 407 | if state_below.ndim == 3: 408 | n_samples = state_below.shape[1] 409 | else: 410 | n_samples = 1 411 | 412 | # mask 413 | if mask is None: 414 | mask = tensor.alloc(1., state_below.shape[0], 1) 415 | 416 | dim = tparams[_p(prefix, 'Wcx')].shape[1] 417 | 418 | # initial/previous state 419 | if init_state is None: 420 | init_state = tensor.alloc(0., n_samples, dim) 
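# What follows is the conditional GRU with attention used as the decoder:
# the source annotations are projected once (pctx_), and each call to
# _step_slice runs a plain GRU transition (h1), computes attention weights
# alpha over the annotations, forms the context vector ctx_, and then runs a
# second GRU transition (h2) conditioned on that context.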
421 | 422 | # projected context 423 | assert context.ndim == 3, \ 424 | 'Context must be 3-d: #annotation x #sample x dim' 425 | pctx_ = tensor.dot(context, tparams[_p(prefix, 'Wc_att')]) +\ 426 | tparams[_p(prefix, 'b_att')] 427 | 428 | def _slice(_x, n, dim): 429 | if _x.ndim == 3: 430 | return _x[:, :, n*dim:(n+1)*dim] 431 | return _x[:, n*dim:(n+1)*dim] 432 | 433 | # projected x 434 | state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) +\ 435 | tparams[_p(prefix, 'bx')] 436 | state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) +\ 437 | tparams[_p(prefix, 'b')] 438 | 439 | def _step_slice(m_, x_, xx_, h_, ctx_, alpha_, pctx_, cc_, 440 | U, Wc, W_comb_att, U_att, c_tt, Ux, Wcx, 441 | U_nl, Ux_nl, b_nl, bx_nl): 442 | preact1 = tensor.dot(h_, U) 443 | preact1 += x_ 444 | preact1 = tensor.nnet.sigmoid(preact1) 445 | 446 | r1 = _slice(preact1, 0, dim) 447 | u1 = _slice(preact1, 1, dim) 448 | 449 | preactx1 = tensor.dot(h_, Ux) 450 | preactx1 *= r1 451 | preactx1 += xx_ 452 | 453 | h1 = tensor.tanh(preactx1) 454 | 455 | h1 = u1 * h_ + (1. - u1) * h1 456 | h1 = m_[:, None] * h1 + (1. - m_)[:, None] * h_ 457 | 458 | # attention 459 | pstate_ = tensor.dot(h1, W_comb_att) 460 | pctx__ = pctx_ + pstate_[None, :, :] 461 | #pctx__ += xc_ 462 | pctx__ = tensor.tanh(pctx__) 463 | alpha = tensor.dot(pctx__, U_att)+c_tt 464 | alpha = alpha.reshape([alpha.shape[0], alpha.shape[1]]) 465 | alpha = tensor.exp(alpha) 466 | if context_mask: 467 | alpha = alpha * context_mask 468 | alpha = alpha / alpha.sum(0, keepdims=True) 469 | ctx_ = (cc_ * alpha[:, :, None]).sum(0) # current context 470 | 471 | preact2 = tensor.dot(h1, U_nl)+b_nl 472 | preact2 += tensor.dot(ctx_, Wc) 473 | preact2 = tensor.nnet.sigmoid(preact2) 474 | 475 | r2 = _slice(preact2, 0, dim) 476 | u2 = _slice(preact2, 1, dim) 477 | 478 | preactx2 = tensor.dot(h1, Ux_nl)+bx_nl 479 | preactx2 *= r2 480 | preactx2 += tensor.dot(ctx_, Wcx) 481 | 482 | h2 = tensor.tanh(preactx2) 483 | 484 | h2 = u2 * h1 + (1. - u2) * h2 485 | h2 = m_[:, None] * h2 + (1. 
- m_)[:, None] * h1 486 | 487 | return h2, ctx_, alpha.T # pstate_, preact, preactx, r, u 488 | 489 | seqs = [mask, state_below_, state_belowx] 490 | #seqs = [mask, state_below_, state_belowx, state_belowc] 491 | _step = _step_slice 492 | 493 | shared_vars = [tparams[_p(prefix, 'U')], 494 | tparams[_p(prefix, 'Wc')], 495 | tparams[_p(prefix, 'W_comb_att')], 496 | tparams[_p(prefix, 'U_att')], 497 | tparams[_p(prefix, 'c_tt')], 498 | tparams[_p(prefix, 'Ux')], 499 | tparams[_p(prefix, 'Wcx')], 500 | tparams[_p(prefix, 'U_nl')], 501 | tparams[_p(prefix, 'Ux_nl')], 502 | tparams[_p(prefix, 'b_nl')], 503 | tparams[_p(prefix, 'bx_nl')]] 504 | 505 | if one_step: 506 | rval = _step(*(seqs + [init_state, None, None, pctx_, context] + 507 | shared_vars)) 508 | else: 509 | rval, updates = theano.scan(_step, 510 | sequences=seqs, 511 | outputs_info=[init_state, 512 | tensor.alloc(0., n_samples, 513 | context.shape[2]), 514 | tensor.alloc(0., n_samples, 515 | context.shape[0])], 516 | non_sequences=[pctx_, context]+shared_vars, 517 | name=_p(prefix, '_layers'), 518 | n_steps=nsteps, 519 | profile=profile, 520 | strict=True) 521 | return rval 522 | 523 | 524 | # initialize all parameters 525 | def init_params(options): 526 | params = OrderedDict() 527 | 528 | # embedding 529 | params['Wemb'] = norm_weight(options['n_words_src'], options['dim_word']) 530 | # params['Wemb_pos'] = norm_weight(options['n_pos'], options['dim_pos']) 531 | 532 | params = get_layer(options['encoder'])[0](options, params, 533 | prefix='encoder', 534 | nin=options['dim_word'], 535 | dim=options['dim']) 536 | params = get_layer(options['encoder'])[0](options, params, 537 | prefix='encoder_r', 538 | nin=options['dim_word'], 539 | dim=options['dim']) 540 | ctxdim = 2 * options['dim'] 541 | 542 | # init_state, init_cell 543 | params = get_layer('ff')[0](options, params, prefix='ff_state', 544 | nin=ctxdim, nout=options['dim']) 545 | # decoder 546 | params = get_layer(options['decoder'])[0](options, params, 547 | prefix='decoder', 548 | nin=options['dim_word'], 549 | dim=options['dim'], 550 | dimctx=ctxdim) 551 | # readout 552 | # params = get_layer('ff')[0](options, params, prefix='ff_logit_lstm', 553 | # nin=options['dim'], nout=options['dim_word'], 554 | # ortho=False) 555 | # params = get_layer('ff')[0](options, params, prefix='ff_logit_prev', 556 | # nin=options['dim_word'], 557 | # nout=options['dim_word'], ortho=False) 558 | # params = get_layer('ff')[0](options, params, prefix='ff_logit_ctx', 559 | # nin=ctxdim, nout=options['dim_word'], 560 | # ortho=False) 561 | # params = get_layer('ff')[0](options, params, prefix='ff_logit', 562 | # nin=options['dim_word'], 563 | # nout=options['n_words']) 564 | params = get_layer('ff')[0](options, params, prefix='ff_logit_lstm', 565 | nin=options['dim'], nout=options['dim_word'], 566 | ortho=False) 567 | params = get_layer('ff')[0](options, params, prefix='ff_logit_prev', 568 | nin=options['dim_word'], 569 | nout=options['dim_word'], ortho=False) 570 | params = get_layer('ff')[0](options, params, prefix='ff_logit_ctx', 571 | nin=ctxdim, nout=options['dim_word'], 572 | ortho=False) 573 | params = get_layer('ff')[0](options, params, prefix='ff_logit', 574 | nin=options['dim_word'], 575 | nout=options['n_words']) 576 | 577 | params['W_out_lambda'] = 0.01 * numpy.random.randn(options['dim'],1).astype('float32') 578 | 579 | 580 | return params 581 | 582 | 583 | # build a training model 584 | def build_model(tparams, options): 585 | opt_ret = dict() 586 | 587 | trng = RandomStreams(1234) 588 | 
use_noise = theano.shared(numpy.float32(0.)) 589 | 590 | # description string: #words x #samples 591 | x = tensor.matrix('x', dtype='int64') 592 | x_map1 = tensor.vector('x', dtype='int64') 593 | # x_map2 = tensor.vector('x', dtype='int64') 594 | x_mask = tensor.matrix('x_mask', dtype='float32') 595 | x_mask_for_attw = tensor.matrix('x_mask_for_attw', dtype='float32') 596 | y = tensor.matrix('y', dtype='int64') 597 | new_y = tensor.matrix('new_y', dtype='int64') 598 | y_mask = tensor.matrix('y_mask', dtype='float32') 599 | 600 | word_map = tensor.vector('wm', dtype='int64') 601 | # label_for_dim_expand = tensor.vector('lde', dtype='int64') 602 | # lambda_a = tensor.matrix('lambda_a', dype='int64') 603 | 604 | # for the backward rnn, we just need to invert x and x_mask 605 | xr = x[::-1] 606 | xr_mask = x_mask[::-1] 607 | 608 | n_timesteps = x.shape[0] 609 | n_timesteps_trg = y.shape[0] 610 | n_samples = x.shape[1] 611 | 612 | # word embedding for forward rnn (source) 613 | emb = tparams['Wemb'][x.flatten()] 614 | 615 | emb = emb.reshape([n_timesteps, n_samples, options['dim_word']]) 616 | proj = get_layer(options['encoder'])[1](tparams, emb, options, 617 | prefix='encoder', 618 | mask=x_mask) 619 | # word embedding for backward rnn (source) 620 | embr = tparams['Wemb'][xr.flatten()] 621 | embr = embr.reshape([n_timesteps, n_samples, options['dim_word']]) 622 | projr = get_layer(options['encoder'])[1](tparams, embr, options, 623 | prefix='encoder_r', 624 | mask=xr_mask) 625 | 626 | # context will be the concatenation of forward and backward rnns 627 | ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim-1) 628 | 629 | # mean of the context (across time) will be used to initialize decoder rnn 630 | ctx_mean = (ctx * x_mask[:, :, None]).sum(0) / x_mask.sum(0)[:, None] 631 | 632 | # or you can use the last state of forward + backward encoder rnns 633 | # ctx_mean = concatenate([proj[0][-1], projr[0][-1]], axis=proj[0].ndim-2) 634 | 635 | # initial decoder state 636 | init_state = get_layer('ff')[1](tparams, ctx_mean, options, 637 | prefix='ff_state', activ='tanh') 638 | 639 | # word embedding (target), we will shift the target sequence one time step 640 | # to the right. This is done because of the bi-gram connections in the 641 | # readout and decoder rnn. The first target will be all zeros and we will 642 | # not condition on the last output. 
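# For example, a target sequence [y1, y2, y3, eos] is fed to the decoder as
# [0, y1, y2, y3]: a zero embedding at the first step and the embedding of the
# previous reference word at every later step (teacher forcing).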
643 | # emb = tparams['Wemb_dec'][y.flatten()] 644 | emb = tparams['Wemb'][y.flatten()] 645 | emb = emb.reshape([n_timesteps_trg, n_samples, options['dim_word']]) 646 | emb_shifted = tensor.zeros_like(emb) 647 | emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1]) 648 | emb = emb_shifted 649 | 650 | # decoder - pass through the decoder conditional gru with attention 651 | proj = get_layer(options['decoder'])[1](tparams, emb, options, 652 | prefix='decoder', 653 | mask=y_mask, context=ctx, 654 | context_mask=x_mask, 655 | one_step=False, 656 | init_state=init_state) 657 | # hidden states of the decoder gru 658 | proj_h = proj[0] 659 | 660 | # weighted averages of context, generated by attention module 661 | ctxs = proj[1] 662 | 663 | # weights (alignment matrix) 664 | opt_ret['dec_alphas'] = proj[2] 665 | # print opt_ret['dec_alphas'].shape 666 | 667 | # compute word probabilities 668 | logit_lstm = get_layer('ff')[1](tparams, proj_h, options, 669 | prefix='ff_logit_lstm', activ='linear') 670 | logit_prev = get_layer('ff')[1](tparams, emb, options, 671 | prefix='ff_logit_prev', activ='linear') 672 | logit_ctx = get_layer('ff')[1](tparams, ctxs, options, 673 | prefix='ff_logit_ctx', activ='linear') 674 | 675 | logit = tensor.tanh(logit_lstm+logit_prev+logit_ctx) 676 | 677 | 678 | proj_h_shp = proj_h.shape 679 | 680 | attw_lambda = tensor.nnet.sigmoid(tensor.dot(proj_h.reshape([proj_h_shp[0] * proj_h_shp[1], proj_h_shp[2]]), tparams['W_out_lambda'])) 681 | 682 | if options['use_dropout']: 683 | logit = dropout_layer(logit, use_noise, trng) 684 | 685 | logit = eval('linear')(tensor.dot(logit, tparams[_p('ff_logit', 'W')][:,word_map]) + tparams[_p('ff_logit', 'b')][word_map]) 686 | 687 | #copy attention 688 | logit_shp = logit.shape 689 | r1,_ = theano.scan(lambda :tensor.constant(0), n_steps = logit_shp[2]) 690 | logit_new = logit.reshape([logit_shp[0]*logit_shp[1], logit_shp[2]]) 691 | 692 | alpha_shape = opt_ret['dec_alphas'].shape 693 | attw = opt_ret['dec_alphas'].reshape([alpha_shape[0], alpha_shape[1] * alpha_shape[2]]) 694 | r2,_ = theano.scan(lambda :tensor.constant(0), n_steps = alpha_shape[2]) 695 | attw = x_mask_for_attw.T.flatten() * attw 696 | lambda_plus_attw1 = attw_lambda[:,r2] * attw.reshape([alpha_shape[0] * alpha_shape[1], alpha_shape[2]]) 697 | lambda_plus_attw = lambda_plus_attw1.reshape([alpha_shape[0], alpha_shape[1]*alpha_shape[2]]) 698 | # logit_new2 = logit_new.reshape([logit_shp[0], logit_shp[1]*logit_shp[2]]) 699 | 700 | # logit_new2 = tensor.set_subtensor(logit_new2[:,x_map1], logit_new2[:,x_map1] + lambda_plus_attw) 701 | 702 | probs_0 = (1-attw_lambda[:,r1]) * tensor.nnet.softmax(logit_new) 703 | 704 | probs_1 = probs_0.reshape([logit_shp[0], logit_shp[1]*logit_shp[2]]) 705 | probs_1 = tensor.set_subtensor(probs_1[:,x_map1], probs_1[:,x_map1] + lambda_plus_attw) 706 | 707 | probs = probs_1.reshape([logit_shp[0]*logit_shp[1], logit_shp[2]]) 708 | 709 | # cost 710 | # y_flat = y.flatten() 711 | y_flat = new_y.flatten() 712 | y_flat_idx = tensor.arange(y_flat.shape[0]) * word_map.shape[0] + y_flat 713 | cost = -tensor.log(probs.flatten()[y_flat_idx]) 714 | cost = cost.reshape([y.shape[0], y.shape[1]]) 715 | cost = (cost * y_mask).sum(0) 716 | 717 | fucktest = [attw, probs] 718 | 719 | 720 | return trng, use_noise, x, x_map1, x_mask, x_mask_for_attw, y, new_y, y_mask, opt_ret, cost, word_map, fucktest 721 | 722 | 723 | # build a sampler 724 | def build_sampler(tparams, options, trng, use_noise): 725 | x = tensor.matrix('x', dtype='int64') 726 | xr = x[::-1] 
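# build_sampler compiles the functions used at decoding time: f_init encodes
# the source once and returns the initial decoder state plus the annotations,
# f_next advances the decoder by one target word, and f_lambda exposes the
# copy-gate value. gen_sample below calls them step by step.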
727 | n_timesteps = x.shape[0] 728 | n_samples = x.shape[1] 729 | 730 | # word embedding (source), forward and backward 731 | # emb = tparams['Wemb'][x.flatten()] 732 | # emb = emb.reshape([n_timesteps, n_samples, options['dim_word']]) 733 | # embr = tparams['Wemb'][xr.flatten()] 734 | # embr = embr.reshape([n_timesteps, n_samples, options['dim_word']]) 735 | emb = tparams['Wemb'][x.flatten()] 736 | emb = emb.reshape([n_timesteps, n_samples, options['dim_word']]) 737 | 738 | embr = tparams['Wemb'][xr.flatten()] 739 | embr = embr.reshape([n_timesteps, n_samples, options['dim_word']]) 740 | 741 | # encoder 742 | proj = get_layer(options['encoder'])[1](tparams, emb, options, 743 | prefix='encoder') 744 | projr = get_layer(options['encoder'])[1](tparams, embr, options, 745 | prefix='encoder_r') 746 | 747 | # concatenate forward and backward rnn hidden states 748 | ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim-1) 749 | 750 | # get the input for decoder rnn initializer mlp 751 | ctx_mean = ctx.mean(0) 752 | # ctx_mean = concatenate([proj[0][-1],projr[0][-1]], axis=proj[0].ndim-2) 753 | init_state = get_layer('ff')[1](tparams, ctx_mean, options, 754 | prefix='ff_state', activ='tanh') 755 | 756 | print 'Building f_init...', 757 | outs = [init_state, ctx] 758 | f_init = theano.function([x], outs, name='f_init', profile=profile) 759 | print 'Done' 760 | 761 | # x: 1 x 1 762 | y = tensor.vector('y_sampler', dtype='int64') 763 | x_map1 = tensor.vector('x_map1', dtype='int64') 764 | x_mask = tensor.vector('x_mask', dtype='int64') 765 | # x_map2 = tensor.vector('x', dtype='int64') 766 | word_map = tensor.vector('wm', dtype='int64') 767 | init_state = tensor.matrix('init_state', dtype='float32') 768 | 769 | # if it's the first word, emb should be all zero and it is indicated by -1 770 | # emb = tensor.switch(y[:, None] < 0, 771 | # tensor.alloc(0., 1, tparams['Wemb_dec'].shape[1]), 772 | # tparams['Wemb_dec'][y]) 773 | emb = tensor.switch(y[:, None] < 0, 774 | tensor.alloc(0., 1, tparams['Wemb'].shape[1]), 775 | tparams['Wemb'][y]) 776 | 777 | # apply one step of conditional gru with attention 778 | proj = get_layer(options['decoder'])[1](tparams, emb, options, 779 | prefix='decoder', 780 | mask=None, context=ctx, 781 | one_step=True, 782 | init_state=init_state) 783 | # get the next hidden state 784 | next_state = proj[0] 785 | 786 | # get the weighted averages of context for this target word y 787 | ctxs = proj[1] 788 | 789 | logit_lstm = get_layer('ff')[1](tparams, next_state, options, 790 | prefix='ff_logit_lstm', activ='linear') 791 | logit_prev = get_layer('ff')[1](tparams, emb, options, 792 | prefix='ff_logit_prev', activ='linear') 793 | logit_ctx = get_layer('ff')[1](tparams, ctxs, options, 794 | prefix='ff_logit_ctx', activ='linear') 795 | 796 | logit = tensor.tanh(logit_lstm+logit_prev+logit_ctx) 797 | 798 | 799 | if options['use_dropout']: 800 | logit = dropout_layer(logit, use_noise, trng) 801 | 802 | # logit = get_layer('ff')[1](tparams, logit, options, 803 | # prefix='ff_logit', activ='linear') 804 | # logit = eval('linear')(tensor.dot(logit, tparams[_p('ff_logit', 'W')][:,word_map]) + tparams[_p('ff_logit', 'b')][word_map]) 805 | 806 | logit = eval('linear')(tensor.dot(logit, tparams[_p('ff_logit', 'W')][:,word_map]) + tparams[_p('ff_logit', 'b')][word_map]) 807 | 808 | #do not copy 'eos' 809 | # tparams['att_lambda'] = tensor.set_subtensor(tparams['att_lambda'][0], 0.0) 810 | 811 | attw_lambda = tensor.nnet.sigmoid(tensor.dot(next_state, tparams['W_out_lambda'])) 812 | 
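# attw_lambda is the per-step copy gate in (0, 1). The output distribution
# built below is
#   p(w) = (1 - lambda) * softmax(logit)[w] + lambda * sum_{i: x_i = w} alpha_i
# i.e. generation probability over the current shortlist (word_map) plus
# attention mass scattered onto the source positions (x_map1), renormalised.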
813 | f_lambda = theano.function([x_map1, y, ctx, init_state, word_map], attw_lambda, on_unused_input='ignore') 814 | 815 | # logit_new = (1-tparams['att_lambda'][word_map]) * logit 816 | 817 | r1,_ = theano.scan(lambda :tensor.constant(0), n_steps=logit.shape[-1]) 818 | 819 | # logit_new = logit 820 | 821 | # logit_new = tensor.set_subtensor(logit_new[:,x_map1], logit_new[:,x_map1] + tparams['att_lambda'][word_map][x_map2] * attw) 822 | # logit_new = tensor.set_subtensor(logit_new[:,x_map1], logit_new[:,x_map1] + attw_lambda[:, r2] * attw) 823 | 824 | prob_1 = (1-attw_lambda[:, r1]) * tensor.nnet.softmax(logit) 825 | 826 | attw = proj[2] 827 | attw = x_mask * attw 828 | 829 | r2,_ = theano.scan(lambda :tensor.constant(0), n_steps=attw.shape[-1]) 830 | 831 | prob_1 = tensor.set_subtensor(prob_1[:,x_map1], prob_1[:,x_map1] + attw_lambda[:, r2] * attw) 832 | 833 | # compute the softmax probability 834 | # next_probs = tensor.nnet.softmax(logit_new) 835 | # next_probs = tensor.nnet.softmax(prob_1) 836 | # next_probs = next_probs[:,word_map] 837 | prob_sum = prob_1.sum(1).mean() 838 | next_probs = prob_1/prob_sum 839 | 840 | # sample from softmax distribution to get the sample 841 | next_sample = trng.multinomial(pvals=next_probs).argmax(1) 842 | 843 | # compile a function to do the whole thing above, next word probability, 844 | # sampled word for the next target, next hidden state to be used 845 | print 'Building f_next..', 846 | inps = [x_mask, x_map1, y, ctx, init_state, word_map] 847 | outs = [next_probs, next_sample, next_state] 848 | f_next = theano.function(inps, outs, name='f_next', profile=profile) 849 | print 'Done' 850 | 851 | return f_init, f_next, f_lambda 852 | 853 | 854 | # generate sample, either with stochastic sampling or beam search. Note that, 855 | # this function iteratively calls f_init and f_next functions. 
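# With stochastic=True a word is drawn from the mixture distribution (or its
# argmax taken) at each step until eos (index 0) is produced; with
# stochastic=False a beam of size k is kept, finished hypotheses are moved to
# the sample list, and scores are accumulated negative log-probabilities.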
856 | def gen_sample(tparams, f_init, f_next, f_lambda, x, x_mask, x_map1, word_map, options, trng=None, k=1, maxlen=30, 857 | stochastic=True, argmax=False): 858 | 859 | # k is the beam size we have 860 | 861 | if k > 1: 862 | assert not stochastic, \ 863 | 'Beam search does not support stochastic sampling' 864 | 865 | sample = [] 866 | sample_score = [] 867 | sample_lambda = [] 868 | if stochastic: 869 | sample_score = 0 870 | 871 | live_k = 1 872 | dead_k = 0 873 | 874 | hyp_samples = [[]] * live_k 875 | 876 | hyp_scores = numpy.zeros(live_k).astype('float32') 877 | hyp_scoresp = numpy.zeros(live_k).astype('float32') 878 | hyp_states = [] 879 | 880 | # get initial state of decoder rnn and encoder context 881 | ret = f_init(x) 882 | next_state, ctx0 = ret[0], ret[1] 883 | next_w = -1 * numpy.ones((1,)).astype('int64') # bos indicator 884 | 885 | for ii in xrange(maxlen): 886 | ctx = numpy.tile(ctx0, [live_k, 1]) 887 | inps0 = [x_map1, next_w, ctx, next_state, word_map] 888 | inps = [x_mask, x_map1, next_w, ctx, next_state, word_map] 889 | # ttt = fftest(*inps) 890 | # ipdb.set_trace() 891 | lam = f_lambda(*inps0) 892 | ret = f_next(*inps) 893 | next_p, next_w0, next_state = ret[0], ret[1], ret[2] 894 | # ipdb.set_trace() 895 | next_w = numpy.array([word_map[next_w0[0]]]) 896 | sample_lambda.append(lam) 897 | 898 | if stochastic: 899 | if argmax: 900 | nw0 = next_p[0].argmax() 901 | nw = word_map[next_p[0].argmax()] 902 | 903 | else: 904 | nw0 = next_w0[0] 905 | nw = next_w[0] 906 | 907 | sample.append(nw) 908 | 909 | sample_score -= numpy.log(next_p[0, nw0]) 910 | 911 | if nw == 0: 912 | break 913 | else: 914 | cand_scores = hyp_scores[:, None] - numpy.log(next_p) 915 | 916 | cand_flat = cand_scores.flatten() 917 | 918 | ranks_flat = cand_flat.argsort()[:(k-dead_k)] 919 | 920 | 921 | voc_size = next_p.shape[1] 922 | 923 | trans_indices = ranks_flat / voc_size 924 | 925 | word_indices = ranks_flat % voc_size 926 | 927 | costs = cand_flat[ranks_flat] 928 | 929 | 930 | new_hyp_samples = [] 931 | 932 | new_hyp_scores = numpy.zeros(k-dead_k).astype('float32') 933 | 934 | new_hyp_states = [] 935 | 936 | for idx, [ti, wi] in enumerate(zip(trans_indices, word_indices)): 937 | new_hyp_samples.append(hyp_samples[ti]+[wi]) 938 | new_hyp_scores[idx] = copy.copy(costs[idx]) 939 | new_hyp_states.append(copy.copy(next_state[ti])) 940 | 941 | # check the finished samples 942 | new_live_k = 0 943 | hyp_samples = [] 944 | hyp_scores = [] 945 | hyp_states = [] 946 | 947 | for idx in xrange(len(new_hyp_samples)): 948 | if new_hyp_samples[idx][-1] == 0: 949 | w_m = numpy.array(word_map)[new_hyp_samples[idx]] 950 | sample.append(w_m) 951 | # sample.append(new_hyp_samples[idx]) 952 | sample_score.append(new_hyp_scores[idx]) 953 | dead_k += 1 954 | else: 955 | new_live_k += 1 956 | hyp_samples.append(new_hyp_samples[idx]) 957 | hyp_scores.append(new_hyp_scores[idx]) 958 | hyp_states.append(new_hyp_states[idx]) 959 | hyp_scores = numpy.array(hyp_scores) 960 | live_k = new_live_k 961 | 962 | if new_live_k < 1: 963 | break 964 | if dead_k >= k: 965 | break 966 | 967 | # next_w = numpy.array([w[-1] for w in hyp_samples]) 968 | next_w = numpy.array([word_map[w[-1]] for w in hyp_samples]) 969 | next_state = numpy.array(hyp_states) 970 | 971 | if not stochastic: 972 | # dump every remaining one 973 | if live_k > 0: 974 | for idx in xrange(live_k): 975 | w_m = numpy.array(word_map)[new_hyp_samples[idx]] 976 | sample.append(w_m) 977 | # sample.append(word_map[hyp_samples[idx]]) 978 | sample_score.append(hyp_scores[idx]) 
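# sample_lambda holds the copy-gate value returned by f_lambda at every step,
# so callers (e.g. the sampling display in train) can show how strongly each
# generated word relied on copying rather than generation.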
979 | 980 | return sample, sample_score, sample_lambda 981 | 982 | 983 | # calculate the log probablities on a given corpus using translation model 984 | def pred_probs(f_log_probs, prepare_data, options, iterator, word_map0, verbose=True): 985 | probs = [] 986 | 987 | n_done = 0 988 | 989 | for x, y in iterator: 990 | n_done += len(x) 991 | 992 | x, x_mask, y, y_mask = prepare_data(x, y, 993 | n_words_src=options['n_words_src'], 994 | n_words=options['n_words']) 995 | 996 | word_map = list(set(list(x.reshape(x.shape[0]*x.shape[1]))+list(y.reshape(y.shape[0]*y.shape[1]))+word_map0)) 997 | new_x = numpy.array([word_map.index(ii) for ii in x.reshape(x.shape[0]*x.shape[1])]).reshape(x.shape[0], x.shape[1]) 998 | 999 | x_mask_for_attw = numpy.array([1 if jj !=0 else 0 for jj in x.flatten()], dtype='float32') 1000 | x_mask_for_attw = x_mask_for_attw.reshape(x.shape) 1001 | 1002 | # x_map2 = new_x.T.flatten() 1003 | x_map1 = new_x.T 1004 | for iii in xrange(x_map1.shape[0]): 1005 | x_map1[iii] += len(word_map)*iii 1006 | x_map1 = x_map1.flatten() 1007 | new_y = numpy.array([word_map.index(ii) for ii in y.reshape(y.shape[0]*y.shape[1])]).reshape(y.shape[0], y.shape[1]) 1008 | 1009 | pprobs = f_log_probs(x, x_map1, x_mask, x_mask_for_attw, y, new_y, y_mask, numpy.array(word_map, dtype='int64')) 1010 | for pp in pprobs: 1011 | probs.append(pp) 1012 | 1013 | if numpy.isnan(numpy.mean(probs)): 1014 | # ipdb.set_trace() 1015 | print 1 1016 | 1017 | if verbose: 1018 | print >>sys.stderr, '%d samples computed' % (n_done) 1019 | 1020 | return numpy.array(probs) 1021 | 1022 | 1023 | # optimizers 1024 | # name(hyperp, tparams, grads, inputs (list), cost) = f_grad_shared, f_update 1025 | def adam(lr, tparams, grads, inp, cost, beta1=0.9, beta2=0.999, e=1e-8): 1026 | 1027 | gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k) 1028 | for k, p in tparams.iteritems()] 1029 | gsup = [(gs, g) for gs, g in zip(gshared, grads)] 1030 | 1031 | f_grad_shared = theano.function(inp, cost, updates=gsup, profile=profile) 1032 | 1033 | updates = [] 1034 | 1035 | t_prev = theano.shared(numpy.float32(0.)) 1036 | t = t_prev + 1. 1037 | lr_t = lr * tensor.sqrt(1. - beta2**t) / (1. - beta1**t) 1038 | 1039 | for p, g in zip(tparams.values(), gshared): 1040 | m = theano.shared(p.get_value() * 0., p.name + '_mean') 1041 | v = theano.shared(p.get_value() * 0., p.name + '_variance') 1042 | m_t = beta1 * m + (1. - beta1) * g 1043 | v_t = beta2 * v + (1. 
- beta2) * g**2 1044 | step = lr_t * m_t / (tensor.sqrt(v_t) + e) 1045 | p_t = p - step 1046 | updates.append((m, m_t)) 1047 | updates.append((v, v_t)) 1048 | updates.append((p, p_t)) 1049 | updates.append((t_prev, t)) 1050 | 1051 | f_update = theano.function([lr], [], updates=updates, 1052 | on_unused_input='ignore', profile=profile) 1053 | 1054 | return f_grad_shared, f_update 1055 | 1056 | 1057 | def adadelta(lr, tparams, grads, inp, cost): 1058 | zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), 1059 | name='%s_grad' % k) 1060 | for k, p in tparams.iteritems()] 1061 | running_up2 = [theano.shared(p.get_value() * numpy.float32(0.), 1062 | name='%s_rup2' % k) 1063 | for k, p in tparams.iteritems()] 1064 | running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.), 1065 | name='%s_rgrad2' % k) 1066 | for k, p in tparams.iteritems()] 1067 | 1068 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 1069 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 1070 | for rg2, g in zip(running_grads2, grads)] 1071 | 1072 | f_grad_shared = theano.function(inp, cost, updates=zgup+rg2up, 1073 | profile=profile) 1074 | 1075 | updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg 1076 | for zg, ru2, rg2 in zip(zipped_grads, running_up2, 1077 | running_grads2)] 1078 | ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2)) 1079 | for ru2, ud in zip(running_up2, updir)] 1080 | param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)] 1081 | 1082 | f_update = theano.function([lr], [], updates=ru2up+param_up, 1083 | on_unused_input='ignore', profile=profile) 1084 | 1085 | return f_grad_shared, f_update 1086 | 1087 | 1088 | def rmsprop(lr, tparams, grads, inp, cost): 1089 | zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), 1090 | name='%s_grad' % k) 1091 | for k, p in tparams.iteritems()] 1092 | running_grads = [theano.shared(p.get_value() * numpy.float32(0.), 1093 | name='%s_rgrad' % k) 1094 | for k, p in tparams.iteritems()] 1095 | running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.), 1096 | name='%s_rgrad2' % k) 1097 | for k, p in tparams.iteritems()] 1098 | 1099 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 1100 | rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)] 1101 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 1102 | for rg2, g in zip(running_grads2, grads)] 1103 | 1104 | f_grad_shared = theano.function(inp, cost, updates=zgup+rgup+rg2up, 1105 | profile=profile) 1106 | 1107 | updir = [theano.shared(p.get_value() * numpy.float32(0.), 1108 | name='%s_updir' % k) 1109 | for k, p in tparams.iteritems()] 1110 | updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4)) 1111 | for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads, 1112 | running_grads2)] 1113 | param_up = [(p, p + udn[1]) 1114 | for p, udn in zip(itemlist(tparams), updir_new)] 1115 | f_update = theano.function([lr], [], updates=updir_new+param_up, 1116 | on_unused_input='ignore', profile=profile) 1117 | 1118 | return f_grad_shared, f_update 1119 | 1120 | 1121 | def sgd(lr, tparams, grads, x, mask, y, cost): 1122 | gshared = [theano.shared(p.get_value() * 0., 1123 | name='%s_grad' % k) 1124 | for k, p in tparams.iteritems()] 1125 | gsup = [(gs, g) for gs, g in zip(gshared, grads)] 1126 | 1127 | f_grad_shared = theano.function([x, mask, y], cost, updates=gsup, 1128 | profile=profile) 1129 | 1130 | pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)] 1131 | f_update = theano.function([lr], [], updates=pup, 
profile=profile) 1132 | 1133 | return f_grad_shared, f_update 1134 | 1135 | 1136 | def train(dim_word=100, # word vector dimensionality 1137 | dim=1000, # the number of LSTM units 1138 | encoder='gru', 1139 | decoder='gru_cond', 1140 | patience=10, # early stopping patience 1141 | max_epochs=5000, 1142 | finish_after=10000000, # finish after this many updates 1143 | dispFreq=100, 1144 | decay_c=0., # L2 regularization penalty 1145 | alpha_c=0., # alignment regularization 1146 | clip_c=-1., # gradient clipping threshold 1147 | lrate=0.01, # learning rate 1148 | n_words_src=100000, # source vocabulary size 1149 | n_words=100000, # target vocabulary size 1150 | maxlen=100, # maximum length of the description 1151 | optimizer='rmsprop', 1152 | batch_size=16, 1153 | valid_batch_size=16, 1154 | saveto='model.npz', 1155 | validFreq=1000, 1156 | saveFreq=1000, # save the parameters after every saveFreq updates 1157 | sampleFreq=100, # generate some samples after every sampleFreq 1158 | datasets=[ 1159 | '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok', 1160 | '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok'], 1161 | valid_datasets=['../data/dev/newstest2011.en.tok', 1162 | '../data/dev/newstest2011.fr.tok'], 1163 | dictionaries=[ 1164 | 'data/worddicts.pkl', 1165 | 'data/dict2.txt'], 1166 | use_dropout=False, 1167 | reload_=False, 1168 | overwrite=False, 1169 | show_lambda = False): 1170 | 1171 | # Model options 1172 | model_options = locals().copy() 1173 | 1174 | # load dictionaries and invert them 1175 | 1176 | with open(dictionaries[0], 'rb') as f: 1177 | worddicts = pkl.load(f) 1178 | worddicts_r = dict() 1179 | for kk, vv in worddicts.iteritems(): 1180 | worddicts_r[vv] = kk 1181 | 1182 | word_map0 = [] 1183 | with open(dictionaries[-1]) as ff: 1184 | for line in ff: 1185 | line = line.strip() 1186 | if line in worddicts: 1187 | if line not in word_map0 and worddicts[line] 0.: 1242 | decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') 1243 | weight_decay = 0. 1244 | for kk, vv in tparams.iteritems(): 1245 | weight_decay += (vv ** 2).sum() 1246 | weight_decay *= decay_c 1247 | cost += weight_decay 1248 | 1249 | # regularize the alpha weights 1250 | if alpha_c > 0. and not model_options['decoder'].endswith('simple'): 1251 | alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c') 1252 | alpha_reg = alpha_c * ( 1253 | (tensor.cast(y_mask.sum(0)//x_mask.sum(0), 'float32')[:, None] - 1254 | opt_ret['dec_alphas'].sum(0))**2).sum(1).mean() 1255 | cost += alpha_reg 1256 | 1257 | # after all regularizers - compile the computational graph for cost 1258 | print 'Building f_cost...', 1259 | f_cost = theano.function(inps, cost, profile=profile) 1260 | print 'Done' 1261 | 1262 | print 'Computing gradient...', 1263 | grads = tensor.grad(cost, wrt=itemlist(tparams)) 1264 | print 'Done' 1265 | 1266 | # apply gradient clipping here 1267 | if clip_c > 0.: 1268 | g2 = 0. 
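# global-norm clipping: accumulate the squared L2 norm of all gradients and,
# if it exceeds clip_c**2, rescale every gradient by clip_c / ||g||.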
1269 | for g in grads: 1270 | g2 += (g**2).sum() 1271 | new_grads = [] 1272 | for g in grads: 1273 | new_grads.append(tensor.switch(g2 > (clip_c**2), 1274 | g / tensor.sqrt(g2) * clip_c, 1275 | g)) 1276 | grads = new_grads 1277 | 1278 | # compile the optimizer, the actual computational graph is compiled here 1279 | lr = tensor.scalar(name='lr') 1280 | print 'Building optimizers...', 1281 | f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost) 1282 | print 'Done' 1283 | 1284 | print 'Optimization' 1285 | 1286 | best_p = None 1287 | bad_counter = 0 1288 | uidx = 0 1289 | estop = False 1290 | history_errs = [] 1291 | # reload history 1292 | if reload_ and os.path.exists(saveto): 1293 | rmodel = numpy.load(saveto) 1294 | history_errs = list(rmodel['history_errs']) 1295 | if 'uidx' in rmodel: 1296 | uidx = rmodel['uidx'] 1297 | 1298 | if validFreq == -1: 1299 | validFreq = len(train[0])/batch_size 1300 | if saveFreq == -1: 1301 | saveFreq = len(train[0])/batch_size 1302 | if sampleFreq == -1: 1303 | sampleFreq = len(train[0])/batch_size 1304 | 1305 | for eidx in xrange(max_epochs): 1306 | n_samples = 0 1307 | 1308 | for x, y in train: 1309 | n_samples += len(x) 1310 | if len(x) == 0: 1311 | continue 1312 | uidx += 1 1313 | use_noise.set_value(1.) 1314 | 1315 | x, x_mask, y, y_mask = prepare_data(x, y, maxlen=maxlen, 1316 | n_words_src=n_words_src, 1317 | n_words=n_words) 1318 | 1319 | if x is None: 1320 | print 'Minibatch with zero sample under length ', maxlen 1321 | uidx -= 1 1322 | continue 1323 | word_map = list(set(list(x.reshape(x.shape[0]*x.shape[1]))+list(y.reshape(y.shape[0]*y.shape[1]))+word_map0)) 1324 | new_x = numpy.array([word_map.index(ii) for ii in x.reshape(x.shape[0]*x.shape[1])]).reshape(x.shape[0], x.shape[1]) 1325 | # x_map2 = new_x.T.flatten() 1326 | x_map1 = new_x.T 1327 | for iii in xrange(x_map1.shape[0]): 1328 | x_map1[iii] += len(word_map)*iii 1329 | x_map1 = x_map1.flatten() 1330 | 1331 | new_y = numpy.array([word_map.index(ii) for ii in y.reshape(y.shape[0]*y.shape[1])]).reshape(y.shape[0], y.shape[1]) 1332 | # word_map3 = list(set(word_map+word_map0)) 1333 | 1334 | x_mask_for_attw = numpy.array([1 if jij !=0 else 0 for jij in x.flatten()], dtype='float32') 1335 | x_mask_for_attw = x_mask_for_attw.reshape(x.shape) 1336 | 1337 | ud_start = time.time() 1338 | 1339 | # ft1,ft2 = ftest(x, x_map1, x_mask, y, new_y, y_mask, numpy.array(word_map, dtype='int64')) 1340 | # print ft1, ft2 1341 | # ipdb.set_trace() 1342 | 1343 | # compute cost, grads and copy grads to shared variables 1344 | # print 'fuck cost' 1345 | cost = f_grad_shared(x, x_map1, x_mask, x_mask_for_attw, y, new_y, y_mask, numpy.array(word_map, dtype='int64')) 1346 | 1347 | # do the update on parameters 1348 | f_update(lrate) 1349 | 1350 | ud = time.time() - ud_start 1351 | 1352 | # check for bad numbers, usually we remove non-finite elements 1353 | # and continue training - but not done here 1354 | if numpy.isnan(cost) or numpy.isinf(cost): 1355 | print 'NaN detected' 1356 | return 1., 1., 1. 
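# Periodic bookkeeping, each branch gated on the update counter uidx:
# display the cost every dispFreq updates, checkpoint the parameters every
# saveFreq updates, print a few stochastic samples every sampleFreq updates,
# and validate every validFreq updates.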
1357 | 1358 | # verbose 1359 | if numpy.mod(uidx, dispFreq) == 0: 1360 | print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud 1361 | 1362 | # save the best model so far, in addition, save the latest model 1363 | # into a separate file with the iteration number for external eval 1364 | if numpy.mod(uidx, saveFreq) == 0: 1365 | print 'Saving the best model...', 1366 | if best_p is not None: 1367 | params = best_p 1368 | else: 1369 | params = unzip(tparams) 1370 | numpy.savez(saveto, history_errs=history_errs, uidx=uidx, **params) 1371 | pkl.dump(model_options, open('%s.pkl' % saveto, 'wb')) 1372 | print 'Done' 1373 | 1374 | # save with uidx 1375 | if not overwrite: 1376 | print 'Saving the model at iteration {}...'.format(uidx), 1377 | saveto_uidx = '{}.iter{}.npz'.format( 1378 | os.path.splitext(saveto)[0], uidx) 1379 | numpy.savez(saveto_uidx, history_errs=history_errs, 1380 | uidx=uidx, **unzip(tparams)) 1381 | print 'Done' 1382 | 1383 | 1384 | # generate some samples with the model and display them 1385 | if numpy.mod(uidx, sampleFreq) == 0: 1386 | # FIXME: random selection? 1387 | for jj in xrange(numpy.minimum(5, x.shape[1])): 1388 | stochastic = True 1389 | input_x = x[:, jj][:, None] 1390 | word_map = list(set(list(x[:, jj][:, None].reshape(x[:, jj][:, None].shape[0]*x[:, jj][:, None].shape[1]))+word_map0+list(y[:, jj][:, None].reshape(y[:, jj][:, None].shape[0]*y[:, jj][:, None].shape[1])))) 1391 | 1392 | new_x_input = numpy.array([word_map.index(ii) for ii in input_x.reshape(input_x.shape[0]*input_x.shape[1])]).reshape(input_x.shape[0], input_x.shape[1]) 1393 | assert new_x_input.T.shape[0] == 1 1394 | sx_map= new_x_input.T.flatten() 1395 | 1396 | gen_x_mask = numpy.array([1 if jjj[0] !=0 else 0 for jjj in input_x]) 1397 | 1398 | 1399 | sample, score, lam = gen_sample(tparams, f_init, f_next, f_lambda, 1400 | input_x, gen_x_mask, sx_map, word_map, 1401 | model_options, trng=trng, k=1, 1402 | maxlen=30, 1403 | stochastic=stochastic, 1404 | argmax=False) 1405 | print 'Source ', jj, ': ', 1406 | for vv in x[:, jj]: 1407 | if vv == 0: 1408 | break 1409 | if vv in worddicts_r: 1410 | print worddicts_r[vv], 1411 | else: 1412 | print 'UNK', 1413 | print 1414 | print 'Truth ', jj, ' : ', 1415 | for vv in y[:, jj]: 1416 | if vv == 0: 1417 | break 1418 | if vv in worddicts_r: 1419 | print worddicts_r[vv], 1420 | else: 1421 | print 'UNK', 1422 | print 1423 | print 'Sample ', jj, ': ', 1424 | if stochastic: 1425 | ss = sample 1426 | else: 1427 | score = score / numpy.array([len(s) for s in sample]) 1428 | ss = sample[score.argmin()] 1429 | label = 0 1430 | for vv in ss: 1431 | if vv == 0: 1432 | print str(worddicts_r[vv])+ "#" + str("%.4f" %float(lam[label])), 1433 | break 1434 | if vv in worddicts_r: 1435 | if show_lambda: 1436 | print str(worddicts_r[vv])+ "#" + str("%.4f" %float(lam[label])), 1437 | label += 1 1438 | else: 1439 | print worddicts_r[vv], 1440 | else: 1441 | print 'UNK', 1442 | print 1443 | 1444 | # validate model on validation set and early stop if necessary 1445 | if numpy.mod(uidx, validFreq) == 0: 1446 | use_noise.set_value(0.) 
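# early stopping: keep the parameters with the lowest validation cost seen so
# far, and stop once the cost has failed to improve on the pre-patience
# minimum for more than `patience` validation rounds.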
1447 | valid_errs = pred_probs(f_log_probs, prepare_data, 1448 | model_options, valid, word_map0) 1449 | valid_err = valid_errs.mean() 1450 | history_errs.append(valid_err) 1451 | 1452 | if uidx == 0 or valid_err <= numpy.array(history_errs).min(): 1453 | best_p = unzip(tparams) 1454 | bad_counter = 0 1455 | if len(history_errs) > patience and valid_err >= \ 1456 | numpy.array(history_errs)[:-patience].min(): 1457 | bad_counter += 1 1458 | if bad_counter > patience: 1459 | print 'Early Stop!' 1460 | estop = True 1461 | break 1462 | 1463 | if numpy.isnan(valid_err): 1464 | # ipdb.set_trace() 1465 | print 1 1466 | 1467 | print 'Valid ', valid_err 1468 | 1469 | # finish after this many updates 1470 | if uidx >= finish_after: 1471 | print 'Finishing after %d iterations!' % uidx 1472 | estop = True 1473 | break 1474 | # ipdb.set_trace() 1475 | print 'Seen %d samples' % n_samples 1476 | 1477 | if estop: 1478 | break 1479 | 1480 | if best_p is not None: 1481 | zipp(best_p, tparams) 1482 | 1483 | use_noise.set_value(0.) 1484 | valid_err = pred_probs(f_log_probs, prepare_data, 1485 | model_options, valid, word_map0).mean() 1486 | 1487 | print 'Valid ', valid_err 1488 | 1489 | params = copy.copy(best_p) 1490 | numpy.savez(saveto, zipped_params=best_p, 1491 | history_errs=history_errs, 1492 | uidx=uidx, 1493 | **params) 1494 | 1495 | 1496 | return valid_err 1497 | 1498 | 1499 | if __name__ == '__main__': 1500 | pass 1501 | -------------------------------------------------------------------------------- /Att_CopyNet/Att_CopyNet_copy_supervision/build_dictionary.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import cPickle as pkl 3 | 4 | import sys 5 | import fileinput 6 | 7 | from collections import OrderedDict 8 | 9 | def main(f_list, dictname, is_pos_dict=False): 10 | word_freqs = OrderedDict() 11 | for filename in f_list: 12 | print 'Processing', filename 13 | with open(filename, 'r') as f: 14 | for line in f: 15 | words_in = line.strip().split(' ') 16 | for w in words_in: 17 | if w not in word_freqs: 18 | word_freqs[w] = 0 19 | else: 20 | word_freqs[w] += 1 21 | words = word_freqs.keys() 22 | freqs = word_freqs.values() 23 | 24 | sorted_idx = numpy.argsort(freqs) 25 | sorted_words = [words[ii] for ii in sorted_idx[::-1]] 26 | 27 | worddict = OrderedDict() 28 | worddict['eos'] = 0 29 | worddict['UNK'] = 1 30 | kk = 2 31 | if is_pos_dict: 32 | worddict = OrderedDict() 33 | worddict['eos'] = 0 34 | kk=1 35 | 36 | for ii, ww in enumerate(sorted_words): 37 | worddict[ww] = ii+kk 38 | 39 | pkl.dump(worddict, open('data_2/%s.pkl'%dictname, 'wb'), True) 40 | print worddict 41 | 42 | print 'Done' 43 | 44 | if __name__ == '__main__': 45 | f_list2 = ['data_2/p.txt', 'data_2/r.txt'] 46 | main(f_list2, 'word_dict') 47 | 48 | -------------------------------------------------------------------------------- /Att_CopyNet/Att_CopyNet_copy_supervision/data_iterator.py: -------------------------------------------------------------------------------- 1 | import cPickle as pkl 2 | import gzip 3 | 4 | 5 | def fopen(filename, mode='r'): 6 | if filename.endswith('.gz'): 7 | return gzip.open(filename, mode) 8 | return open(filename, mode) 9 | 10 | 11 | class TextIterator: 12 | """Simple Bitext iterator.""" 13 | def __init__(self, source, target, 14 | source_dict, target_dict, 15 | batch_size=128, 16 | maxlen=100, 17 | n_words_source=-1, 18 | n_words_target=-1): 19 | self.source = fopen(source, 'r') 20 | self.target = fopen(target, 'r') 21 | with open(source_dict, 
'rb') as f: 22 | self.source_dict = pkl.load(f) 23 | with open(target_dict, 'rb') as f: 24 | self.target_dict = pkl.load(f) 25 | 26 | self.batch_size = batch_size 27 | self.maxlen = maxlen 28 | 29 | self.n_words_source = n_words_source 30 | self.n_words_target = n_words_target 31 | 32 | self.end_of_data = False 33 | 34 | def __iter__(self): 35 | return self 36 | 37 | def reset(self): 38 | self.source.seek(0) 39 | self.target.seek(0) 40 | 41 | def next(self): 42 | if self.end_of_data: 43 | self.end_of_data = False 44 | self.reset() 45 | raise StopIteration 46 | 47 | source = [] 48 | target = [] 49 | 50 | try: 51 | 52 | # actual work here 53 | while True: 54 | 55 | # read from source file and map to word index 56 | ss = self.source.readline() 57 | if ss == "": 58 | raise IOError 59 | ss = ss.strip().split() 60 | ss = [self.source_dict[w] if w in self.source_dict else 1 61 | for w in ss] 62 | if self.n_words_source > 0: 63 | ss = [w if w < self.n_words_source else 1 for w in ss] 64 | 65 | # read from source file and map to word index 66 | tt = self.target.readline() 67 | if tt == "": 68 | raise IOError 69 | tt = tt.strip().split() 70 | tt = [self.target_dict[w] if w in self.target_dict else 1 71 | for w in tt] 72 | if self.n_words_target > 0: 73 | tt = [w if w < self.n_words_target else 1 for w in tt] 74 | 75 | if len(ss) > self.maxlen and len(tt) > self.maxlen: 76 | continue 77 | 78 | source.append(ss) 79 | target.append(tt) 80 | 81 | if len(source) >= self.batch_size or \ 82 | len(target) >= self.batch_size: 83 | break 84 | except IOError: 85 | self.end_of_data = True 86 | 87 | if len(source) <= 0 or len(target) <= 0: 88 | self.end_of_data = False 89 | self.reset() 90 | raise StopIteration 91 | 92 | return source, target 93 | -------------------------------------------------------------------------------- /Att_CopyNet/Att_CopyNet_copy_supervision/train.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import os 3 | import cPickle 4 | 5 | from Att_copy_s import train 6 | 7 | def main(job_id, params): 8 | print params 9 | basedir = 'data_2' 10 | validerr = train(saveto=params['model'][0], 11 | reload_=params['reload'][0], 12 | dim_word=params['dim_word'][0], 13 | dim=params['dim'][0], 14 | n_words=params['n-words'][0], 15 | n_words_src=params['n-words'][0], 16 | decay_c=params['decay-c'][0], 17 | clip_c=params['clip-c'][0], 18 | lrate=params['learning-rate'][0], 19 | optimizer=params['optimizer'][0], 20 | maxlen=100, 21 | batch_size=32, 22 | valid_batch_size=32, 23 | datasets=['%s/validation.s'%basedir, 24 | '%s/validation.t'%basedir], 25 | valid_datasets=['%s/validation.s'%basedir, 26 | '%s/validation.t'%basedir,], 27 | # dictionaries=['%s/p.txt.pkl'%basedir, 28 | # '%s/r.txt.pkl'%basedir], 29 | dictionaries=['%s/training.s.pkl'%basedir,'%s/commonwords.txt'%basedir], 30 | validFreq=1000, 31 | dispFreq=100, 32 | saveFreq=1000, 33 | sampleFreq=100, 34 | use_dropout=params['use-dropout'][0], 35 | overwrite=False, 36 | show_lambda=True) 37 | return validerr 38 | 39 | if __name__ == '__main__': 40 | # f = cPickle.load(open(r'data//p.txt.pkl')) 41 | # print f 42 | 43 | """ 44 | datasets: 45 | 46 | dictionaries: 47 | OrderedDict([('eos', 0), ('UNK', 1), ('b', 2), ('c', 3), ('a', 4)]) 48 | OrderedDict([('eos', 0), ('UNK', 1), ('B', 2), ('C', 3), ('A', 4)]) 49 | 50 | """ 51 | basedir = 'data_2' 52 | main(0, { 53 | 'model': ['%s/model/m.npz'%basedir], 54 | 'dim_word': [512],#word embedding dim 55 | 'dim': [512], #hidden dim 56 | 'n-words': 
[10000], #vocabulary size 57 | 'optimizer': ['rmsprop'], 58 | 'decay-c': [0.], 59 | 'clip-c': [1.], 60 | 'use-dropout': [False], 61 | 'learning-rate': [0.05], 62 | 'reload': [False]}) 63 | 64 | 65 | -------------------------------------------------------------------------------- /Att_CopyNet/Att_CopyNet_copy_supervision/translate.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import theano 4 | import numpy 5 | import cPickle as pkl 6 | 7 | from nmt_word import (build_sampler, gen_sample, load_params, init_params, init_tparams) 8 | 9 | from multiprocessing import Process, Queue 10 | 11 | 12 | 13 | 14 | def translate_model(word_map0, queue, rqueue, pid, model, options, k, normalize, n_best): 15 | 16 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 17 | trng = RandomStreams(1234) 18 | use_noise = theano.shared(numpy.float32(0.)) 19 | 20 | # allocate model parameters 21 | params = init_params(options) 22 | 23 | # load model parameters and set theano shared variables 24 | params = load_params(model, params) 25 | tparams = init_tparams(params) 26 | 27 | # word index 28 | f_init, f_next, f_lambda = build_sampler(tparams, options, trng, use_noise) 29 | 30 | def _translate(seq): 31 | xx = numpy.array(seq).reshape([len(seq), 1]) 32 | word_map = list(set(list(xx.reshape(xx.shape[0]*xx.shape[1]))+word_map0)) 33 | 34 | new_x_input = numpy.array([word_map.index(ii) for ii in xx.reshape(xx.shape[0]*xx.shape[1])]).reshape(xx.shape[0], xx.shape[1]) 35 | sx_map = new_x_input.T.flatten() 36 | 37 | gen_x_mask = numpy.array([1 if jjj[0] !=0 else 0 for jjj in xx]) 38 | # sample given an input sequence and obtain scores 39 | sample, score, _ = gen_sample(tparams, f_init, f_next, f_lambda, 40 | xx, gen_x_mask, sx_map, word_map, 41 | options, trng=trng, k=k, maxlen=200, 42 | stochastic=False, argmax=False) 43 | 44 | # normalize scores according to sequence lengths 45 | if normalize: 46 | lengths = numpy.array([len(s) for s in sample]) 47 | score = score / lengths 48 | if n_best > 1: 49 | sidx = numpy.argsort(score)[:n_best] 50 | 51 | else: 52 | sidx = numpy.argmin(score) 53 | # return numpy.array(word_map)[sample[sidx]], numpy.array(score)[sidx] 54 | 55 | return numpy.array(sample)[sidx], numpy.array(score)[sidx] 56 | # return numpy.array(word_map)[sample[sidx]], numpy.array(score)[sidx] 57 | 58 | while True: 59 | req = queue.get() 60 | if req is None: 61 | break 62 | 63 | idx, x = req[0], req[1] 64 | print pid, '-', idx 65 | seq, scores = _translate(x) 66 | # print seq, scores 67 | 68 | rqueue.put((idx, seq, scores)) 69 | 70 | # print tparams['att_lambda'].get_value()[0] 71 | 72 | return 73 | 74 | 75 | def predict(model, dictionary, common_dictionary, source_file, saveto, k=5, 76 | normalize=False, n_process=5, chr_level=False, n_best=1): 77 | 78 | # load model model_options 79 | with open('%s.pkl' % model, 'rb') as f: 80 | options = pkl.load(f) 81 | 82 | # load source dictionary and invert 83 | with open(dictionary, 'rb') as f: 84 | word_dict = pkl.load(f) 85 | word_idict = dict() 86 | for kk, vv in word_dict.iteritems(): 87 | word_idict[vv] = kk 88 | word_idict[0] = '' 89 | word_idict[1] = 'UNK' 90 | 91 | word_idict_trg = word_idict 92 | # load target dictionary and invert 93 | # with open(dictionary_target, 'rb') as f: 94 | # word_dict_trg = pkl.load(f) 95 | # word_idict_trg = dict() 96 | # for kk, vv in word_dict_trg.iteritems(): 97 | # word_idict_trg[vv] = kk 98 | # word_idict_trg[0] = '' 99 | # word_idict_trg[1] = 'UNK' 
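    # Annotation (not part of the original source): word_map0 collects the
    # word ids of every entry in common_dictionary (e.g. dict2.txt) that is
    # also present in word_dict.  In translate_model() these ids are unioned
    # with the ids of the current input sentence to form word_map, the reduced
    # output vocabulary the sampler scores over, so decoding can only emit
    # common words or words that can be copied from the source.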
100 | 101 | word_map0 = [] 102 | with open(common_dictionary) as ff: 103 | for line in ff: 104 | line = line.strip() 105 | if line in word_dict: 106 | if line not in word_map0: 107 | word_map0.append(word_dict[line]) 108 | 109 | 110 | 111 | # create input and output queues for processes 112 | queue = Queue() 113 | rqueue = Queue() 114 | processes = [None] * n_process 115 | for midx in xrange(n_process): 116 | processes[midx] = Process( 117 | target=translate_model, 118 | args=(word_map0, queue, rqueue, midx, model, options, k, normalize, n_best)) 119 | processes[midx].start() 120 | 121 | # utility function 122 | def _seqs2words(caps): 123 | capsw = [] 124 | for cc in caps: 125 | ww = [] 126 | for w in cc: 127 | if w == 0: 128 | break 129 | ww.append(word_idict_trg[w]) 130 | capsw.append(' '.join(ww)) 131 | return capsw 132 | # def _seqs2words(caps): 133 | # capsw = [] 134 | # attw = [] 135 | # for cc in caps: 136 | # ww = [] 137 | # www = [] 138 | # label = 0 139 | # for w in cc: 140 | # if w == 0 and label != 0: 141 | # break 142 | # elif w == 0: 143 | # continue 144 | # label += 1 145 | # ww.append(word_idict_trg[w]) 146 | # www.append(str(tparams['att_lambda'].get_value()[w])) 147 | # wwww = [] 148 | # for aa, bb in zip(ww, www): 149 | # wwww.append(aa+'_'+bb) 150 | # # capsw.append(' '.join(ww)) 151 | # capsw.append(' '.join(wwww)) 152 | # return capsw 153 | 154 | def _send_jobs(fname): 155 | with open(fname, 'r') as f: 156 | for idx, line in enumerate(f): 157 | if chr_level: 158 | words = list(line.decode('utf-8').strip()) 159 | else: 160 | words = line.strip().split() 161 | x = map(lambda w: word_dict[w] if w in word_dict else 1, words) 162 | x = map(lambda ii: ii if ii < options['n_words'] else 1, x) 163 | x += [0] 164 | queue.put((idx, x)) 165 | return idx+1 166 | 167 | def _finish_processes(): 168 | for midx in xrange(n_process): 169 | queue.put(None) 170 | 171 | def _retrieve_jobs(n_samples): 172 | trans = [None] * n_samples 173 | scores = [None] * n_samples 174 | for idx in xrange(n_samples): 175 | resp = rqueue.get() 176 | trans[resp[0]] = resp[1] 177 | scores[resp[0]] = resp[2] 178 | if numpy.mod(idx, 10) == 0: 179 | print 'Sample ', (idx+1), '/', n_samples, ' Done' 180 | return trans, scores 181 | 182 | print 'Translating ', source_file, '...' 
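    # Annotation (not part of the original source) - flow of the code below:
    #   _send_jobs        pushes (index, token-id list) jobs onto `queue`
    #   translate_model   worker processes pop jobs, run beam search and put
    #                     (index, sample, score) onto `rqueue`
    #   _retrieve_jobs    collects the results back into source order
    #   _finish_processes sends one None per worker as a stop signal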
183 | n_samples = _send_jobs(source_file) 184 | trans, scores = _retrieve_jobs(n_samples) 185 | _finish_processes() 186 | 187 | if n_best == 1: 188 | trans = _seqs2words(trans) 189 | else: 190 | n_best_trans = [] 191 | for idx, (n_best_tr, score_) in enumerate(zip(trans, scores)): 192 | sentences = _seqs2words(n_best_tr) 193 | for ids, trans_ in enumerate(sentences): 194 | n_best_trans.append( 195 | '|||'.join( 196 | ['{}'.format(idx), trans_, 197 | '{}'.format(score_[ids])])) 198 | trans = n_best_trans 199 | 200 | with open(saveto, 'w') as f: 201 | print >>f, '\n'.join(trans) 202 | print 'Done' 203 | 204 | 205 | if __name__ == "__main__": 206 | parser = argparse.ArgumentParser() 207 | parser.add_argument('-k', type=int, default=5, help="Beam size") 208 | parser.add_argument('-p', type=int, default=5, help="Number of processes") 209 | parser.add_argument('-n', action="store_true", default=False, 210 | help="Normalize wrt sequence length") 211 | parser.add_argument('-c', action="store_true", default=False, 212 | help="Character level") 213 | parser.add_argument('-b', type=int, default=1, help="Output n-best list") 214 | parser.add_argument('model', type=str) 215 | parser.add_argument('dictionary', type=str) 216 | parser.add_argument('common_dictionary', type=str) 217 | parser.add_argument('source', type=str) 218 | parser.add_argument('saveto', type=str) 219 | 220 | args = parser.parse_args() 221 | 222 | main(args.model, args.dictionary, args.common_dictionary, args.source, 223 | args.saveto, k=args.k, normalize=args.n, n_process=args.p, 224 | chr_level=args.c, n_best=args.b) 225 | -------------------------------------------------------------------------------- /Att_CopyNet/Att_CopyNet_copy_supervision/translate_Windows.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Translates a source file using a translation model. 
3 | ''' 4 | import translate as TTT 5 | 6 | if __name__ == '__main__': 7 | TTT.predict(r'data_2/model/m.npz', r'data_2/word_dict.pkl', r'data_2/dict2.txt', r'data_2/p.txt', r'data_2/ttt.txt', k=5, n_process=1) 8 | 9 | 10 | -------------------------------------------------------------------------------- /Att_CopyNet/README.md: -------------------------------------------------------------------------------- 1 | # Attention_CopyNet 2 | 3 | -------------------------------------------------------------------------------- /Att_CopyNet/build_dictionary.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | ############################################ 3 | # 4 | # Author: Chuwei Luo 5 | # Email: luochuwei@gmail.com 6 | # Date: 15/08/2016 7 | # Usage: build dict 8 | # 9 | ############################################ 10 | import numpy 11 | import cPickle as pkl 12 | 13 | import sys 14 | import fileinput 15 | 16 | from collections import OrderedDict 17 | 18 | def main(f_list, dictname, is_pos_dict=False): 19 | word_freqs = OrderedDict() 20 | for filename in f_list: 21 | print 'Processing', filename 22 | with open(filename, 'r') as f: 23 | for line in f: 24 | words_in = line.strip().split(' ') 25 | for w in words_in: 26 | if w not in word_freqs: 27 | word_freqs[w] = 0 28 | else: 29 | word_freqs[w] += 1 30 | words = word_freqs.keys() 31 | freqs = word_freqs.values() 32 | 33 | sorted_idx = numpy.argsort(freqs) 34 | sorted_words = [words[ii] for ii in sorted_idx[::-1]] 35 | 36 | worddict = OrderedDict() 37 | worddict['eos'] = 0 38 | worddict['UNK'] = 1 39 | kk = 2 40 | if is_pos_dict: 41 | worddict = OrderedDict() 42 | worddict['eos'] = 0 43 | kk=1 44 | 45 | for ii, ww in enumerate(sorted_words): 46 | worddict[ww] = ii+kk 47 | 48 | pkl.dump(worddict, open('data_2/%s.pkl'%dictname, 'wb'), True) 49 | print worddict 50 | 51 | print 'Done' 52 | 53 | if __name__ == '__main__': 54 | f_list2 = ['data_2/p.txt', 'data_2/r.txt'] 55 | main(f_list2, 'word_dict') 56 | 57 | -------------------------------------------------------------------------------- /Att_CopyNet/data_2/dict2.txt: -------------------------------------------------------------------------------- 1 | a 2 | -------------------------------------------------------------------------------- /Att_CopyNet/data_2/p.txt: -------------------------------------------------------------------------------- 1 | a b 2 | c a d 3 | a 4 | b 5 | c 6 | d 7 | -------------------------------------------------------------------------------- /Att_CopyNet/data_2/r.txt: -------------------------------------------------------------------------------- 1 | c 2 | d c 3 | d b 4 | c a d 5 | b 6 | a 7 | -------------------------------------------------------------------------------- /Att_CopyNet/data_2/ttt.txt: -------------------------------------------------------------------------------- 1 | a b 2 | c a d 3 | a 4 | b 5 | c 6 | d 7 | -------------------------------------------------------------------------------- /Att_CopyNet/data_2/word_dict.pkl: -------------------------------------------------------------------------------- 1 | ccollections 2 | OrderedDict 3 | q(]q(]q(UeosqKe]q(UUNKqKe]q(UdKe]q(UcKe]q (UaKe]q 4 | (UbKeetRq . 
-------------------------------------------------------------------------------- /Att_CopyNet/data_iterator.py: -------------------------------------------------------------------------------- 1 | import cPickle as pkl 2 | import gzip 3 | 4 | 5 | def fopen(filename, mode='r'): 6 | if filename.endswith('.gz'): 7 | return gzip.open(filename, mode) 8 | return open(filename, mode) 9 | 10 | 11 | class TextIterator: 12 | """Simple Bitext iterator.""" 13 | def __init__(self, source, target, 14 | source_dict, target_dict, 15 | batch_size=128, 16 | maxlen=100, 17 | n_words_source=-1, 18 | n_words_target=-1): 19 | self.source = fopen(source, 'r') 20 | self.target = fopen(target, 'r') 21 | with open(source_dict, 'rb') as f: 22 | self.source_dict = pkl.load(f) 23 | with open(target_dict, 'rb') as f: 24 | self.target_dict = pkl.load(f) 25 | 26 | self.batch_size = batch_size 27 | self.maxlen = maxlen 28 | 29 | self.n_words_source = n_words_source 30 | self.n_words_target = n_words_target 31 | 32 | self.end_of_data = False 33 | 34 | def __iter__(self): 35 | return self 36 | 37 | def reset(self): 38 | self.source.seek(0) 39 | self.target.seek(0) 40 | 41 | def next(self): 42 | if self.end_of_data: 43 | self.end_of_data = False 44 | self.reset() 45 | raise StopIteration 46 | 47 | source = [] 48 | target = [] 49 | 50 | try: 51 | 52 | # actual work here 53 | while True: 54 | 55 | # read from source file and map to word index 56 | ss = self.source.readline() 57 | if ss == "": 58 | raise IOError 59 | ss = ss.strip().split() 60 | ss = [self.source_dict[w] if w in self.source_dict else 1 61 | for w in ss] 62 | if self.n_words_source > 0: 63 | ss = [w if w < self.n_words_source else 1 for w in ss] 64 | 65 | # read from source file and map to word index 66 | tt = self.target.readline() 67 | if tt == "": 68 | raise IOError 69 | tt = tt.strip().split() 70 | tt = [self.target_dict[w] if w in self.target_dict else 1 71 | for w in tt] 72 | if self.n_words_target > 0: 73 | tt = [w if w < self.n_words_target else 1 for w in tt] 74 | 75 | if len(ss) > self.maxlen and len(tt) > self.maxlen: 76 | continue 77 | 78 | source.append(ss) 79 | target.append(tt) 80 | 81 | if len(source) >= self.batch_size or \ 82 | len(target) >= self.batch_size: 83 | break 84 | except IOError: 85 | self.end_of_data = True 86 | 87 | if len(source) <= 0 or len(target) <= 0: 88 | self.end_of_data = False 89 | self.reset() 90 | raise StopIteration 91 | 92 | return source, target 93 | -------------------------------------------------------------------------------- /Att_CopyNet/predict.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | ############################################ 3 | # 4 | # Author: Chuwei Luo 5 | # Email: luochuwei@gmail.com 6 | # Date: 15/08/2016 7 | # Usage: for testing 8 | # 9 | ############################################ 10 | import argparse 11 | import theano 12 | import numpy 13 | import cPickle as pkl 14 | 15 | from AttCopy import (build_sampler, gen_sample, load_params, init_params, init_tparams) 16 | 17 | from multiprocessing import Process, Queue 18 | 19 | 20 | 21 | 22 | def translate_model(word_map0, queue, rqueue, pid, model, options, k, normalize, n_best): 23 | 24 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 25 | trng = RandomStreams(1234) 26 | use_noise = theano.shared(numpy.float32(0.)) 27 | 28 | # allocate model parameters 29 | params = init_params(options) 30 | 31 | # load model parameters and set theano shared variables 32 | 
params = load_params(model, params) 33 | tparams = init_tparams(params) 34 | 35 | # word index 36 | f_init, f_next, f_lambda = build_sampler(tparams, options, trng, use_noise) 37 | 38 | def _translate(seq): 39 | xx = numpy.array(seq).reshape([len(seq), 1]) 40 | word_map = list(set(list(xx.reshape(xx.shape[0]*xx.shape[1]))+word_map0)) 41 | 42 | new_x_input = numpy.array([word_map.index(ii) for ii in xx.reshape(xx.shape[0]*xx.shape[1])]).reshape(xx.shape[0], xx.shape[1]) 43 | sx_map = new_x_input.T.flatten() 44 | # sample given an input sequence and obtain scores 45 | sample, score, _ = gen_sample(tparams, f_init, f_next, f_lambda, 46 | xx, sx_map, word_map, 47 | options, trng=trng, k=k, maxlen=200, 48 | stochastic=False, argmax=False) 49 | 50 | # normalize scores according to sequence lengths 51 | if normalize: 52 | lengths = numpy.array([len(s) for s in sample]) 53 | score = score / lengths 54 | if n_best > 1: 55 | sidx = numpy.argsort(score)[:n_best] 56 | 57 | else: 58 | sidx = numpy.argmin(score) 59 | # return numpy.array(word_map)[sample[sidx]], numpy.array(score)[sidx] 60 | 61 | return numpy.array(sample)[sidx], numpy.array(score)[sidx] 62 | # return numpy.array(word_map)[sample[sidx]], numpy.array(score)[sidx] 63 | 64 | while True: 65 | req = queue.get() 66 | if req is None: 67 | break 68 | 69 | idx, x = req[0], req[1] 70 | print pid, '-', idx 71 | seq, scores = _translate(x) 72 | # print seq, scores 73 | 74 | rqueue.put((idx, seq, scores)) 75 | 76 | # print tparams['att_lambda'].get_value()[0] 77 | 78 | return 79 | 80 | 81 | def predict(model, dictionary, common_dictionary, source_file, saveto, k=5, 82 | normalize=False, n_process=5, chr_level=False, n_best=1): 83 | 84 | # load model model_options 85 | with open('%s.pkl' % model, 'rb') as f: 86 | options = pkl.load(f) 87 | 88 | # load source dictionary and invert 89 | with open(dictionary, 'rb') as f: 90 | word_dict = pkl.load(f) 91 | word_idict = dict() 92 | for kk, vv in word_dict.iteritems(): 93 | word_idict[vv] = kk 94 | word_idict[0] = '' 95 | word_idict[1] = 'UNK' 96 | 97 | word_idict_trg = word_idict 98 | # load target dictionary and invert 99 | # with open(dictionary_target, 'rb') as f: 100 | # word_dict_trg = pkl.load(f) 101 | # word_idict_trg = dict() 102 | # for kk, vv in word_dict_trg.iteritems(): 103 | # word_idict_trg[vv] = kk 104 | # word_idict_trg[0] = '' 105 | # word_idict_trg[1] = 'UNK' 106 | 107 | word_map0 = [] 108 | with open(common_dictionary) as ff: 109 | for line in ff: 110 | line = line.strip() 111 | if line in word_dict: 112 | if line not in word_map0: 113 | word_map0.append(word_dict[line]) 114 | 115 | 116 | 117 | # create input and output queues for processes 118 | queue = Queue() 119 | rqueue = Queue() 120 | processes = [None] * n_process 121 | for midx in xrange(n_process): 122 | processes[midx] = Process( 123 | target=translate_model, 124 | args=(word_map0, queue, rqueue, midx, model, options, k, normalize, n_best)) 125 | processes[midx].start() 126 | 127 | # utility function 128 | def _seqs2words(caps): 129 | capsw = [] 130 | for cc in caps: 131 | ww = [] 132 | for w in cc: 133 | if w == 0: 134 | break 135 | ww.append(word_idict_trg[w]) 136 | capsw.append(' '.join(ww)) 137 | return capsw 138 | # def _seqs2words(caps): 139 | # capsw = [] 140 | # attw = [] 141 | # for cc in caps: 142 | # ww = [] 143 | # www = [] 144 | # label = 0 145 | # for w in cc: 146 | # if w == 0 and label != 0: 147 | # break 148 | # elif w == 0: 149 | # continue 150 | # label += 1 151 | # ww.append(word_idict_trg[w]) 152 | # 
www.append(str(tparams['att_lambda'].get_value()[w])) 153 | # wwww = [] 154 | # for aa, bb in zip(ww, www): 155 | # wwww.append(aa+'_'+bb) 156 | # # capsw.append(' '.join(ww)) 157 | # capsw.append(' '.join(wwww)) 158 | # return capsw 159 | 160 | def _send_jobs(fname): 161 | with open(fname, 'r') as f: 162 | for idx, line in enumerate(f): 163 | if chr_level: 164 | words = list(line.decode('utf-8').strip()) 165 | else: 166 | words = line.strip().split() 167 | x = map(lambda w: word_dict[w] if w in word_dict else 1, words) 168 | x = map(lambda ii: ii if ii < options['n_words'] else 1, x) 169 | x += [0] 170 | queue.put((idx, x)) 171 | return idx+1 172 | 173 | def _finish_processes(): 174 | for midx in xrange(n_process): 175 | queue.put(None) 176 | 177 | def _retrieve_jobs(n_samples): 178 | trans = [None] * n_samples 179 | scores = [None] * n_samples 180 | for idx in xrange(n_samples): 181 | resp = rqueue.get() 182 | trans[resp[0]] = resp[1] 183 | scores[resp[0]] = resp[2] 184 | if numpy.mod(idx, 10) == 0: 185 | print 'Sample ', (idx+1), '/', n_samples, ' Done' 186 | return trans, scores 187 | 188 | print 'Translating ', source_file, '...' 189 | n_samples = _send_jobs(source_file) 190 | trans, scores = _retrieve_jobs(n_samples) 191 | _finish_processes() 192 | 193 | if n_best == 1: 194 | trans = _seqs2words(trans) 195 | else: 196 | n_best_trans = [] 197 | for idx, (n_best_tr, score_) in enumerate(zip(trans, scores)): 198 | sentences = _seqs2words(n_best_tr) 199 | for ids, trans_ in enumerate(sentences): 200 | n_best_trans.append( 201 | '|||'.join( 202 | ['{}'.format(idx), trans_, 203 | '{}'.format(score_[ids])])) 204 | trans = n_best_trans 205 | 206 | with open(saveto, 'w') as f: 207 | print >>f, '\n'.join(trans) 208 | print 'Done' 209 | 210 | 211 | if __name__ == "__main__": 212 | parser = argparse.ArgumentParser() 213 | parser.add_argument('-k', type=int, default=5, help="Beam size") 214 | parser.add_argument('-p', type=int, default=5, help="Number of processes") 215 | parser.add_argument('-n', action="store_true", default=False, 216 | help="Normalize wrt sequence length") 217 | parser.add_argument('-c', action="store_true", default=False, 218 | help="Character level") 219 | parser.add_argument('-b', type=int, default=1, help="Output n-best list") 220 | parser.add_argument('model', type=str) 221 | parser.add_argument('dictionary', type=str) 222 | parser.add_argument('common_dictionary', type=str) 223 | parser.add_argument('source', type=str) 224 | parser.add_argument('saveto', type=str) 225 | 226 | args = parser.parse_args() 227 | 228 | main(args.model, args.dictionary, args.common_dictionary, args.source, 229 | args.saveto, k=args.k, normalize=args.n, n_process=args.p, 230 | chr_level=args.c, n_best=args.b) 231 | -------------------------------------------------------------------------------- /Att_CopyNet/predict_windows.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | ############################################ 3 | # 4 | # Author: Chuwei Luo 5 | # Email: luochuwei@gmail.com 6 | # Date: 15/08/2016 7 | # Usage: for testing in Windows 8 | # 9 | ############################################ 10 | import translate as TTT 11 | 12 | if __name__ == '__main__': 13 | TTT.predict(r'data_2/model/m.npz', r'data_2/word_dict.pkl', r'data_2/dict2.txt', r'data_2/p.txt', r'data_2/ttt.txt', k=5, n_process=1) 14 | 15 | 16 | -------------------------------------------------------------------------------- /Att_CopyNet/train.py: 
-------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | ############################################ 3 | # 4 | # Author: Chuwei Luo 5 | # Email: luochuwei@gmail.com 6 | # Date: 15/08/2016 7 | # Usage: for training 8 | # 9 | ############################################ 10 | 11 | import numpy 12 | import os 13 | import cPickle 14 | 15 | from AttCopy import train 16 | 17 | def main(job_id, params): 18 | print params 19 | basedir = 'data_2' 20 | validerr = train(saveto=params['model'][0], 21 | reload_=params['reload'][0], 22 | dim_word=params['dim_word'][0], 23 | dim=params['dim'][0], 24 | n_words=params['n-words'][0], 25 | n_words_src=params['n-words'][0], 26 | decay_c=params['decay-c'][0], 27 | clip_c=params['clip-c'][0], 28 | lrate=params['learning-rate'][0], 29 | optimizer=params['optimizer'][0], 30 | maxlen=100, 31 | batch_size=32, 32 | valid_batch_size=32, 33 | datasets=['%s/p.txt'%basedir, 34 | '%s/p.txt'%basedir], 35 | valid_datasets=['%s/p.txt'%basedir, 36 | '%s/p.txt'%basedir,], 37 | # dictionaries=['%s/p.txt.pkl'%basedir, 38 | # '%s/r.txt.pkl'%basedir], 39 | dictionaries=['%s/word_dict.pkl'%basedir,'%s/dict2.txt'%basedir], 40 | validFreq=100, 41 | dispFreq=1, 42 | saveFreq=100, 43 | sampleFreq=1, 44 | use_dropout=params['use-dropout'][0], 45 | overwrite=False, 46 | show_lambda=False) 47 | return validerr 48 | 49 | if __name__ == '__main__': 50 | # f = cPickle.load(open(r'data//p.txt.pkl')) 51 | # print f 52 | 53 | """ 54 | datasets: 55 | 56 | dictionaries: 57 | OrderedDict([('eos', 0), ('UNK', 1), ('b', 2), ('c', 3), ('a', 4)]) 58 | OrderedDict([('eos', 0), ('UNK', 1), ('B', 2), ('C', 3), ('A', 4)]) 59 | 60 | """ 61 | basedir = 'data_2' 62 | main(0, { 63 | 'model': ['%s/model/m.npz'%basedir], 64 | 'dim_word': [100],#word embedding dim 65 | 'dim': [100], #hidden dim 66 | 'n-words': [6], #vocabulary size 67 | 'optimizer': ['rmsprop'], 68 | 'decay-c': [0.], 69 | 'clip-c': [1.], 70 | 'use-dropout': [False], 71 | 'learning-rate': [0.1], 72 | 'reload': [False]}) 73 | 74 | 75 | -------------------------------------------------------------------------------- /Att_POS_CopyNet/README.md: -------------------------------------------------------------------------------- 1 | # Attention_POS_CopyNet 2 | 3 | -------------------------------------------------------------------------------- /Att_POS_CopyNet/build_dictionary.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import cPickle as pkl 3 | 4 | import sys 5 | import fileinput 6 | 7 | from collections import OrderedDict 8 | 9 | def main(f_list, dictname, is_pos_dict=False): 10 | word_freqs = OrderedDict() 11 | for filename in f_list: 12 | print 'Processing', filename 13 | with open(filename, 'r') as f: 14 | for line in f: 15 | words_in = line.strip().split(' ') 16 | for w in words_in: 17 | if w not in word_freqs: 18 | word_freqs[w] = 0 19 | else: 20 | word_freqs[w] += 1 21 | words = word_freqs.keys() 22 | freqs = word_freqs.values() 23 | 24 | sorted_idx = numpy.argsort(freqs) 25 | sorted_words = [words[ii] for ii in sorted_idx[::-1]] 26 | 27 | worddict = OrderedDict() 28 | worddict['eos'] = 0 29 | worddict['UNK'] = 1 30 | kk = 2 31 | if is_pos_dict: 32 | worddict = OrderedDict() 33 | worddict['eos'] = 0 34 | kk=1 35 | 36 | for ii, ww in enumerate(sorted_words): 37 | worddict[ww] = ii+kk 38 | 39 | pkl.dump(worddict, open('data_2/%s.pkl'%dictname, 'wb'), True) 40 | print worddict 41 | 42 | print 'Done' 43 | 44 | if __name__ == 
'__main__': 45 | f_list1 = ['data_2/p_pos.txt', 'data_2/r_pos.txt'] 46 | main(f_list1, 'pos_dict', is_pos_dict=True) 47 | 48 | f_list2 = ['data_2/p.txt', 'data_2/r.txt'] 49 | main(f_list2, 'word_dict') 50 | 51 | -------------------------------------------------------------------------------- /Att_POS_CopyNet/data_2/dict2.txt: -------------------------------------------------------------------------------- 1 | a 2 | b 3 | d -------------------------------------------------------------------------------- /Att_POS_CopyNet/data_2/p.txt: -------------------------------------------------------------------------------- 1 | a b 2 | c a d 3 | a 4 | b 5 | c 6 | d 7 | -------------------------------------------------------------------------------- /Att_POS_CopyNet/data_2/p_pos.txt: -------------------------------------------------------------------------------- 1 | 1 2 2 | 3 1 4 3 | 1 4 | 2 5 | 3 6 | 4 7 | -------------------------------------------------------------------------------- /Att_POS_CopyNet/data_2/pos_dict.pkl: -------------------------------------------------------------------------------- 1 | ccollections 2 | OrderedDict 3 | q(]q(]q(UeosqKe]q(U4Ke]q(U3Ke]q(U1Ke]q(U2KeetRq . -------------------------------------------------------------------------------- /Att_POS_CopyNet/data_2/r.txt: -------------------------------------------------------------------------------- 1 | c d 2 | d c a -------------------------------------------------------------------------------- /Att_POS_CopyNet/data_2/r_pos.txt: -------------------------------------------------------------------------------- 1 | 3 4 2 | 4 3 1 -------------------------------------------------------------------------------- /Att_POS_CopyNet/data_2/word_dict.pkl: -------------------------------------------------------------------------------- 1 | ccollections 2 | OrderedDict 3 | q(]q(]q(UeosqKe]q(UUNKqKe]q(UdKe]q(UcKe]q (UaKe]q 4 | (UbKeetRq . 
-------------------------------------------------------------------------------- /Att_POS_CopyNet/data_iterator_for_pos.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | ####################################################### 3 | # 4 | # Author: Chuwei Luo 5 | # Email: luochuwei@gmail.com 6 | # Date: 03/08/2016 7 | # Usage: text iterator for pos 8 | # 9 | ####################################################### 10 | 11 | import cPickle as pkl 12 | import gzip 13 | 14 | 15 | def fopen(filename, mode='r'): 16 | if filename.endswith('.gz'): 17 | return gzip.open(filename, mode) 18 | return open(filename, mode) 19 | 20 | 21 | class TextIterator: 22 | """Simple Bitext iterator.""" 23 | def __init__(self, source, target, source_pos, target_pos, 24 | word_dic, pos_dic, 25 | batch_size=128, 26 | maxlen=100, 27 | n_words=-1, 28 | n_pos=-1): 29 | self.source = fopen(source, 'r') 30 | self.source_pos = fopen(source_pos, 'r') 31 | self.target = fopen(target, 'r') 32 | self.target_pos = fopen(target_pos, 'r') 33 | with open(word_dic, 'rb') as f: 34 | self.word_dic = pkl.load(f) 35 | with open(pos_dic, 'rb') as f: 36 | self.pos_dic = pkl.load(f) 37 | 38 | self.batch_size = batch_size 39 | self.maxlen = maxlen 40 | 41 | self.n_words = n_words 42 | self.n_pos = n_pos 43 | 44 | self.end_of_data = False 45 | 46 | def __iter__(self): 47 | return self 48 | 49 | def reset(self): 50 | self.source.seek(0) 51 | self.source_pos.seek(0) 52 | self.target.seek(0) 53 | self.target_pos.seek(0) 54 | 55 | def next(self): 56 | if self.end_of_data: 57 | self.end_of_data = False 58 | self.reset() 59 | raise StopIteration 60 | 61 | source = [] 62 | source_pos = [] 63 | target = [] 64 | target_pos = [] 65 | 66 | try: 67 | 68 | # actual work here 69 | while True: 70 | 71 | # read from source file and map to word index 72 | ss = self.source.readline() 73 | if ss == "": 74 | raise IOError 75 | ss = ss.strip().split() 76 | ss = [self.word_dic[w] if w in self.word_dic else 1 77 | for w in ss] 78 | if self.n_words > 0: 79 | ss = [w if w < self.n_words else 1 for w in ss] 80 | 81 | ssp = self.source_pos.readline() 82 | if ssp == "": 83 | raise IOError 84 | ssp = ssp.strip().split() 85 | ssp = [self.pos_dic[w] if w in self.pos_dic else 1 86 | for w in ssp] 87 | if self.n_pos > 0: 88 | ssp = [w if w < self.n_pos else 1 for w in ssp] 89 | 90 | # read from source file and map to word index 91 | tt = self.target.readline() 92 | if tt == "": 93 | raise IOError 94 | tt = tt.strip().split() 95 | tt = [self.word_dic[w] if w in self.word_dic else 1 96 | for w in tt] 97 | if self.n_words > 0: 98 | tt = [w if w < self.n_words else 1 for w in tt] 99 | 100 | ttp = self.target_pos.readline() 101 | if ttp == "": 102 | raise IOError 103 | ttp = ttp.strip().split() 104 | ttp = [self.pos_dic[w] if w in self.pos_dic else 1 105 | for w in ttp] 106 | if self.n_pos > 0: 107 | ttp = [w if w < self.n_pos else 1 for w in ttp] 108 | 109 | if len(ss) > self.maxlen and len(tt) > self.maxlen and len(ssp) > self.maxlen and len(ttp) > self.maxlen: 110 | continue 111 | 112 | source.append(ss) 113 | source_pos.append(ssp) 114 | target.append(tt) 115 | target_pos.append(ttp) 116 | 117 | if len(source) >= self.batch_size or \ 118 | len(target) >= self.batch_size or len(source_pos) >= self.batch_size or len(target_pos) >= self.batch_size: 119 | break 120 | except IOError: 121 | self.end_of_data = True 122 | 123 | if len(source) <= 0 or len(target) <= 0 or len(source_pos) <= 0 or len(target_pos) <= 0: 124 | 
self.end_of_data = False 125 | self.reset() 126 | raise StopIteration 127 | 128 | return source, target, source_pos, target_pos 129 | -------------------------------------------------------------------------------- /Att_POS_CopyNet/nmt_new_pos_word.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | ####################################################### 3 | # 4 | # Author: Chuwei Luo 5 | # Email: luochuwei@gmail.com 6 | # Date: 03/08/2016 7 | # Usage: Seq2Seq Attention POS CopyNet(based on dl4mt) 8 | # 9 | ####################################################### 10 | import theano 11 | import theano.tensor as tensor 12 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 13 | 14 | import cPickle as pkl 15 | # import ipdb 16 | import numpy 17 | import copy 18 | 19 | import os 20 | import warnings 21 | import sys 22 | import time 23 | 24 | from collections import OrderedDict 25 | 26 | from data_iterator_for_pos import TextIterator 27 | 28 | profile = False 29 | 30 | 31 | # push parameters to Theano shared variables 32 | def zipp(params, tparams): 33 | for kk, vv in params.iteritems(): 34 | tparams[kk].set_value(vv) 35 | 36 | 37 | # pull parameters from Theano shared variables 38 | def unzip(zipped): 39 | new_params = OrderedDict() 40 | for kk, vv in zipped.iteritems(): 41 | new_params[kk] = vv.get_value() 42 | return new_params 43 | 44 | 45 | # get the list of parameters: Note that tparams must be OrderedDict 46 | def itemlist(tparams): 47 | return [vv for kk, vv in tparams.iteritems()] 48 | 49 | 50 | # dropout 51 | def dropout_layer(state_before, use_noise, trng): 52 | proj = tensor.switch( 53 | use_noise, 54 | state_before * trng.binomial(state_before.shape, p=0.5, n=1, 55 | dtype=state_before.dtype), 56 | state_before * 0.5) 57 | return proj 58 | 59 | 60 | # make prefix-appended name 61 | def _p(pp, name): 62 | return '%s_%s' % (pp, name) 63 | 64 | 65 | # initialize Theano shared variables according to the initial parameters 66 | def init_tparams(params): 67 | tparams = OrderedDict() 68 | for kk, pp in params.iteritems(): 69 | tparams[kk] = theano.shared(params[kk], name=kk) 70 | return tparams 71 | 72 | 73 | # load parameters 74 | def load_params(path, params): 75 | pp = numpy.load(path) 76 | for kk, vv in params.iteritems(): 77 | if kk not in pp: 78 | warnings.warn('%s is not in the archive' % kk) 79 | continue 80 | params[kk] = pp[kk] 81 | 82 | return params 83 | 84 | # layers: 'name': ('parameter initializer', 'feedforward') 85 | layers = {'ff': ('param_init_fflayer', 'fflayer'), 86 | 'gru': ('param_init_gru', 'gru_layer'), 87 | 'gru_cond': ('param_init_gru_cond', 'gru_cond_layer'), 88 | } 89 | 90 | 91 | def get_layer(name): 92 | fns = layers[name] 93 | return (eval(fns[0]), eval(fns[1])) 94 | 95 | 96 | # some utilities 97 | def ortho_weight(ndim): 98 | W = numpy.random.randn(ndim, ndim) 99 | u, s, v = numpy.linalg.svd(W) 100 | return u.astype('float32') 101 | 102 | 103 | def norm_weight(nin, nout=None, scale=0.01, ortho=True): 104 | if nout is None: 105 | nout = nin 106 | if nout == nin and ortho: 107 | W = ortho_weight(nin) 108 | else: 109 | W = scale * numpy.random.randn(nin, nout) 110 | return W.astype('float32') 111 | 112 | 113 | def tanh(x): 114 | return tensor.tanh(x) 115 | 116 | 117 | def linear(x): 118 | return x 119 | 120 | 121 | def concatenate(tensor_list, axis=0): 122 | """ 123 | Alternative implementation of `theano.tensor.concatenate`. 
124 | This function does exactly the same thing, but contrary to Theano's own 125 | implementation, the gradient is implemented on the GPU. 126 | Backpropagating through `theano.tensor.concatenate` yields slowdowns 127 | because the inverse operation (splitting) needs to be done on the CPU. 128 | This implementation does not have that problem. 129 | :usage: 130 | >>> x, y = theano.tensor.matrices('x', 'y') 131 | >>> c = concatenate([x, y], axis=1) 132 | :parameters: 133 | - tensor_list : list 134 | list of Theano tensor expressions that should be concatenated. 135 | - axis : int 136 | the tensors will be joined along this axis. 137 | :returns: 138 | - out : tensor 139 | the concatenated tensor expression. 140 | """ 141 | concat_size = sum(tt.shape[axis] for tt in tensor_list) 142 | 143 | output_shape = () 144 | for k in range(axis): 145 | output_shape += (tensor_list[0].shape[k],) 146 | output_shape += (concat_size,) 147 | for k in range(axis + 1, tensor_list[0].ndim): 148 | output_shape += (tensor_list[0].shape[k],) 149 | 150 | out = tensor.zeros(output_shape) 151 | offset = 0 152 | for tt in tensor_list: 153 | indices = () 154 | for k in range(axis): 155 | indices += (slice(None),) 156 | indices += (slice(offset, offset + tt.shape[axis]),) 157 | for k in range(axis + 1, tensor_list[0].ndim): 158 | indices += (slice(None),) 159 | 160 | out = tensor.set_subtensor(out[indices], tt) 161 | offset += tt.shape[axis] 162 | 163 | return out 164 | 165 | 166 | # batch preparation 167 | def prepare_data(seqs_x, seqs_y, maxlen=None, n_words_src=30000, 168 | n_words=30000): 169 | # x: a list of sentences 170 | lengths_x = [len(s) for s in seqs_x] 171 | lengths_y = [len(s) for s in seqs_y] 172 | 173 | if maxlen is not None: 174 | new_seqs_x = [] 175 | new_seqs_y = [] 176 | new_lengths_x = [] 177 | new_lengths_y = [] 178 | for l_x, s_x, l_y, s_y in zip(lengths_x, seqs_x, lengths_y, seqs_y): 179 | if l_x < maxlen and l_y < maxlen: 180 | new_seqs_x.append(s_x) 181 | new_lengths_x.append(l_x) 182 | new_seqs_y.append(s_y) 183 | new_lengths_y.append(l_y) 184 | lengths_x = new_lengths_x 185 | seqs_x = new_seqs_x 186 | lengths_y = new_lengths_y 187 | seqs_y = new_seqs_y 188 | 189 | if len(lengths_x) < 1 or len(lengths_y) < 1: 190 | return None, None, None, None 191 | 192 | n_samples = len(seqs_x) 193 | maxlen_x = numpy.max(lengths_x) + 1 194 | maxlen_y = numpy.max(lengths_y) + 1 195 | 196 | x = numpy.zeros((maxlen_x, n_samples)).astype('int64') 197 | y = numpy.zeros((maxlen_y, n_samples)).astype('int64') 198 | x_mask = numpy.zeros((maxlen_x, n_samples)).astype('float32') 199 | y_mask = numpy.zeros((maxlen_y, n_samples)).astype('float32') 200 | for idx, [s_x, s_y] in enumerate(zip(seqs_x, seqs_y)): 201 | x[:lengths_x[idx], idx] = s_x 202 | x_mask[:lengths_x[idx]+1, idx] = 1. 203 | y[:lengths_y[idx], idx] = s_y 204 | y_mask[:lengths_y[idx]+1, idx] = 1. 
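    # Worked example (annotation, not part of the original source): for
    # seqs_x = [[4, 2], [3]] the loop above gives maxlen_x = 3 and
    #   x = [[4, 3],        x_mask = [[1., 1.],
    #        [2, 0],                  [1., 1.],
    #        [0, 0]]                  [1., 0.]]
    # Each column is one sample, index 0 is the implicit eos padding, and the
    # mask extends one step past the sentence so the eos position is scored.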
205 | 206 | return x, x_mask, y, y_mask 207 | 208 | 209 | # feedforward layer: affine transformation + point-wise nonlinearity 210 | def param_init_fflayer(options, params, prefix='ff', nin=None, nout=None, 211 | ortho=True): 212 | if nin is None: 213 | nin = options['dim_proj'] 214 | if nout is None: 215 | nout = options['dim_proj'] 216 | params[_p(prefix, 'W')] = norm_weight(nin, nout, scale=0.01, ortho=ortho) 217 | params[_p(prefix, 'b')] = numpy.zeros((nout,)).astype('float32') 218 | 219 | return params 220 | 221 | 222 | def fflayer(tparams, state_below, options, prefix='rconv', 223 | activ='lambda x: tensor.tanh(x)', **kwargs): 224 | return eval(activ)( 225 | tensor.dot(state_below, tparams[_p(prefix, 'W')]) + 226 | tparams[_p(prefix, 'b')]) 227 | 228 | 229 | # GRU layer 230 | def param_init_gru(options, params, prefix='gru', nin=None, dim=None): 231 | if nin is None: 232 | nin = options['dim_proj'] 233 | if dim is None: 234 | dim = options['dim_proj'] 235 | 236 | # embedding to gates transformation weights, biases 237 | W = numpy.concatenate([norm_weight(nin, dim), 238 | norm_weight(nin, dim)], axis=1) 239 | params[_p(prefix, 'W')] = W 240 | params[_p(prefix, 'b')] = numpy.zeros((2 * dim,)).astype('float32') 241 | 242 | # recurrent transformation weights for gates 243 | U = numpy.concatenate([ortho_weight(dim), 244 | ortho_weight(dim)], axis=1) 245 | params[_p(prefix, 'U')] = U 246 | 247 | # embedding to hidden state proposal weights, biases 248 | Wx = norm_weight(nin, dim) 249 | params[_p(prefix, 'Wx')] = Wx 250 | params[_p(prefix, 'bx')] = numpy.zeros((dim,)).astype('float32') 251 | 252 | # recurrent transformation weights for hidden state proposal 253 | Ux = ortho_weight(dim) 254 | params[_p(prefix, 'Ux')] = Ux 255 | 256 | return params 257 | 258 | 259 | def gru_layer(tparams, state_below, options, prefix='gru', mask=None, 260 | **kwargs): 261 | nsteps = state_below.shape[0] 262 | if state_below.ndim == 3: 263 | n_samples = state_below.shape[1] 264 | else: 265 | n_samples = 1 266 | 267 | dim = tparams[_p(prefix, 'Ux')].shape[1] 268 | 269 | if mask is None: 270 | mask = tensor.alloc(1., state_below.shape[0], 1) 271 | 272 | # utility function to slice a tensor 273 | def _slice(_x, n, dim): 274 | if _x.ndim == 3: 275 | return _x[:, :, n*dim:(n+1)*dim] 276 | return _x[:, n*dim:(n+1)*dim] 277 | 278 | # state_below is the input word embeddings 279 | # input to the gates, concatenated 280 | state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + \ 281 | tparams[_p(prefix, 'b')] 282 | # input to compute the hidden state proposal 283 | state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + \ 284 | tparams[_p(prefix, 'bx')] 285 | 286 | # step function to be used by scan 287 | # arguments | sequences |outputs-info| non-seqs 288 | def _step_slice(m_, x_, xx_, h_, U, Ux): 289 | preact = tensor.dot(h_, U) 290 | preact += x_ 291 | 292 | # reset and update gates 293 | r = tensor.nnet.sigmoid(_slice(preact, 0, dim)) 294 | u = tensor.nnet.sigmoid(_slice(preact, 1, dim)) 295 | 296 | # compute the hidden state proposal 297 | preactx = tensor.dot(h_, Ux) 298 | preactx = preactx * r 299 | preactx = preactx + xx_ 300 | 301 | # hidden state proposal 302 | h = tensor.tanh(preactx) 303 | 304 | # leaky integrate and obtain next hidden state 305 | h = u * h_ + (1. - u) * h 306 | h = m_[:, None] * h + (1. 
- m_)[:, None] * h_ 307 | 308 | return h 309 | 310 | # prepare scan arguments 311 | seqs = [mask, state_below_, state_belowx] 312 | init_states = [tensor.alloc(0., n_samples, dim)] 313 | _step = _step_slice 314 | shared_vars = [tparams[_p(prefix, 'U')], 315 | tparams[_p(prefix, 'Ux')]] 316 | 317 | rval, updates = theano.scan(_step, 318 | sequences=seqs, 319 | outputs_info=init_states, 320 | non_sequences=shared_vars, 321 | name=_p(prefix, '_layers'), 322 | n_steps=nsteps, 323 | profile=profile, 324 | strict=True) 325 | rval = [rval] 326 | return rval 327 | 328 | 329 | # Conditional GRU layer with Attention 330 | def param_init_gru_cond(options, params, prefix='gru_cond', 331 | nin=None, dim=None, dimctx=None, 332 | nin_nonlin=None, dim_nonlin=None): 333 | if nin is None: 334 | nin = options['dim'] 335 | if dim is None: 336 | dim = options['dim'] 337 | if dimctx is None: 338 | dimctx = options['dim'] 339 | if nin_nonlin is None: 340 | nin_nonlin = nin 341 | if dim_nonlin is None: 342 | dim_nonlin = dim 343 | 344 | W = numpy.concatenate([norm_weight(nin, dim), 345 | norm_weight(nin, dim)], axis=1) 346 | params[_p(prefix, 'W')] = W 347 | params[_p(prefix, 'b')] = numpy.zeros((2 * dim,)).astype('float32') 348 | U = numpy.concatenate([ortho_weight(dim_nonlin), 349 | ortho_weight(dim_nonlin)], axis=1) 350 | params[_p(prefix, 'U')] = U 351 | 352 | Wx = norm_weight(nin_nonlin, dim_nonlin) 353 | params[_p(prefix, 'Wx')] = Wx 354 | Ux = ortho_weight(dim_nonlin) 355 | params[_p(prefix, 'Ux')] = Ux 356 | params[_p(prefix, 'bx')] = numpy.zeros((dim_nonlin,)).astype('float32') 357 | 358 | U_nl = numpy.concatenate([ortho_weight(dim_nonlin), 359 | ortho_weight(dim_nonlin)], axis=1) 360 | params[_p(prefix, 'U_nl')] = U_nl 361 | params[_p(prefix, 'b_nl')] = numpy.zeros((2 * dim_nonlin,)).astype('float32') 362 | 363 | Ux_nl = ortho_weight(dim_nonlin) 364 | params[_p(prefix, 'Ux_nl')] = Ux_nl 365 | params[_p(prefix, 'bx_nl')] = numpy.zeros((dim_nonlin,)).astype('float32') 366 | 367 | # context to LSTM 368 | Wc = norm_weight(dimctx, dim*2) 369 | params[_p(prefix, 'Wc')] = Wc 370 | 371 | Wcx = norm_weight(dimctx, dim) 372 | params[_p(prefix, 'Wcx')] = Wcx 373 | 374 | # attention: combined -> hidden 375 | W_comb_att = norm_weight(dim, dimctx) 376 | params[_p(prefix, 'W_comb_att')] = W_comb_att 377 | 378 | # attention: context -> hidden 379 | Wc_att = norm_weight(dimctx) 380 | params[_p(prefix, 'Wc_att')] = Wc_att 381 | 382 | # attention: hidden bias 383 | b_att = numpy.zeros((dimctx,)).astype('float32') 384 | params[_p(prefix, 'b_att')] = b_att 385 | 386 | # attention: 387 | U_att = norm_weight(dimctx, 1) 388 | params[_p(prefix, 'U_att')] = U_att 389 | c_att = numpy.zeros((1,)).astype('float32') 390 | params[_p(prefix, 'c_tt')] = c_att 391 | 392 | return params 393 | 394 | 395 | def gru_cond_layer(tparams, state_below, options, prefix='gru', 396 | mask=None, context=None, one_step=False, 397 | init_memory=None, init_state=None, 398 | context_mask=None, 399 | **kwargs): 400 | 401 | assert context, 'Context must be provided' 402 | 403 | if one_step: 404 | assert init_state, 'previous state must be provided' 405 | 406 | nsteps = state_below.shape[0] 407 | if state_below.ndim == 3: 408 | n_samples = state_below.shape[1] 409 | else: 410 | n_samples = 1 411 | 412 | # mask 413 | if mask is None: 414 | mask = tensor.alloc(1., state_below.shape[0], 1) 415 | 416 | dim = tparams[_p(prefix, 'Wcx')].shape[1] 417 | 418 | # initial/previous state 419 | if init_state is None: 420 | init_state = tensor.alloc(0., n_samples, dim) 
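    # Annotation (not part of the original source): each decoder step in
    # _step_slice below runs two stacked GRU transitions: h1 is computed from
    # the previous target embedding, attention weights alpha are computed from
    # h1 and the projected context, ctx_ is the attention-weighted context,
    # and h2 is computed from h1 together with ctx_ (via Wc/Wcx).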
421 | 422 | # projected context 423 | assert context.ndim == 3, \ 424 | 'Context must be 3-d: #annotation x #sample x dim' 425 | pctx_ = tensor.dot(context, tparams[_p(prefix, 'Wc_att')]) +\ 426 | tparams[_p(prefix, 'b_att')] 427 | 428 | def _slice(_x, n, dim): 429 | if _x.ndim == 3: 430 | return _x[:, :, n*dim:(n+1)*dim] 431 | return _x[:, n*dim:(n+1)*dim] 432 | 433 | # projected x 434 | state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) +\ 435 | tparams[_p(prefix, 'bx')] 436 | state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) +\ 437 | tparams[_p(prefix, 'b')] 438 | 439 | def _step_slice(m_, x_, xx_, h_, ctx_, alpha_, pctx_, cc_, 440 | U, Wc, W_comb_att, U_att, c_tt, Ux, Wcx, 441 | U_nl, Ux_nl, b_nl, bx_nl): 442 | preact1 = tensor.dot(h_, U) 443 | preact1 += x_ 444 | preact1 = tensor.nnet.sigmoid(preact1) 445 | 446 | r1 = _slice(preact1, 0, dim) 447 | u1 = _slice(preact1, 1, dim) 448 | 449 | preactx1 = tensor.dot(h_, Ux) 450 | preactx1 *= r1 451 | preactx1 += xx_ 452 | 453 | h1 = tensor.tanh(preactx1) 454 | 455 | h1 = u1 * h_ + (1. - u1) * h1 456 | h1 = m_[:, None] * h1 + (1. - m_)[:, None] * h_ 457 | 458 | # attention 459 | pstate_ = tensor.dot(h1, W_comb_att) 460 | pctx__ = pctx_ + pstate_[None, :, :] 461 | #pctx__ += xc_ 462 | pctx__ = tensor.tanh(pctx__) 463 | alpha = tensor.dot(pctx__, U_att)+c_tt 464 | alpha = alpha.reshape([alpha.shape[0], alpha.shape[1]]) 465 | alpha = tensor.exp(alpha) 466 | if context_mask: 467 | alpha = alpha * context_mask 468 | alpha = alpha / alpha.sum(0, keepdims=True) 469 | ctx_ = (cc_ * alpha[:, :, None]).sum(0) # current context 470 | 471 | preact2 = tensor.dot(h1, U_nl)+b_nl 472 | preact2 += tensor.dot(ctx_, Wc) 473 | preact2 = tensor.nnet.sigmoid(preact2) 474 | 475 | r2 = _slice(preact2, 0, dim) 476 | u2 = _slice(preact2, 1, dim) 477 | 478 | preactx2 = tensor.dot(h1, Ux_nl)+bx_nl 479 | preactx2 *= r2 480 | preactx2 += tensor.dot(ctx_, Wcx) 481 | 482 | h2 = tensor.tanh(preactx2) 483 | 484 | h2 = u2 * h1 + (1. - u2) * h2 485 | h2 = m_[:, None] * h2 + (1. 
- m_)[:, None] * h1 486 | 487 | return h2, ctx_, alpha.T # pstate_, preact, preactx, r, u 488 | 489 | seqs = [mask, state_below_, state_belowx] 490 | #seqs = [mask, state_below_, state_belowx, state_belowc] 491 | _step = _step_slice 492 | 493 | shared_vars = [tparams[_p(prefix, 'U')], 494 | tparams[_p(prefix, 'Wc')], 495 | tparams[_p(prefix, 'W_comb_att')], 496 | tparams[_p(prefix, 'U_att')], 497 | tparams[_p(prefix, 'c_tt')], 498 | tparams[_p(prefix, 'Ux')], 499 | tparams[_p(prefix, 'Wcx')], 500 | tparams[_p(prefix, 'U_nl')], 501 | tparams[_p(prefix, 'Ux_nl')], 502 | tparams[_p(prefix, 'b_nl')], 503 | tparams[_p(prefix, 'bx_nl')]] 504 | 505 | if one_step: 506 | rval = _step(*(seqs + [init_state, None, None, pctx_, context] + 507 | shared_vars)) 508 | else: 509 | rval, updates = theano.scan(_step, 510 | sequences=seqs, 511 | outputs_info=[init_state, 512 | tensor.alloc(0., n_samples, 513 | context.shape[2]), 514 | tensor.alloc(0., n_samples, 515 | context.shape[0])], 516 | non_sequences=[pctx_, context]+shared_vars, 517 | name=_p(prefix, '_layers'), 518 | n_steps=nsteps, 519 | profile=profile, 520 | strict=True) 521 | return rval 522 | 523 | 524 | # initialize all parameters 525 | def init_params(options): 526 | params = OrderedDict() 527 | 528 | # embedding 529 | params['Wemb'] = norm_weight(options['n_words_src'], options['dim_word']) 530 | params['Wemb_pos'] = norm_weight(options['n_pos'], options['dim_pos']) 531 | # params['Wemb_dec'] = norm_weight(options['n_words'], options['dim_word']) 532 | 533 | # encoder: bidirectional RNN 534 | # params = get_layer(options['encoder'])[0](options, params, 535 | # prefix='encoder', 536 | # nin=options['dim_word'], 537 | # dim=options['dim']) 538 | # params = get_layer(options['encoder'])[0](options, params, 539 | # prefix='encoder_r', 540 | # nin=options['dim_word'], 541 | # dim=options['dim']) 542 | params = get_layer(options['encoder'])[0](options, params, 543 | prefix='encoder', 544 | nin=options['dim_word']+options['dim_pos'], 545 | dim=options['dim']) 546 | params = get_layer(options['encoder'])[0](options, params, 547 | prefix='encoder_r', 548 | nin=options['dim_word']+options['dim_pos'], 549 | dim=options['dim']) 550 | ctxdim = 2 * options['dim'] 551 | 552 | # init_state, init_cell 553 | params = get_layer('ff')[0](options, params, prefix='ff_state', 554 | nin=ctxdim, nout=options['dim']) 555 | # decoder 556 | params = get_layer(options['decoder'])[0](options, params, 557 | prefix='decoder', 558 | nin=options['dim_word']+options['dim_pos'], 559 | dim=options['dim'], 560 | dimctx=ctxdim) 561 | # readout 562 | # params = get_layer('ff')[0](options, params, prefix='ff_logit_lstm', 563 | # nin=options['dim'], nout=options['dim_word'], 564 | # ortho=False) 565 | # params = get_layer('ff')[0](options, params, prefix='ff_logit_prev', 566 | # nin=options['dim_word'], 567 | # nout=options['dim_word'], ortho=False) 568 | # params = get_layer('ff')[0](options, params, prefix='ff_logit_ctx', 569 | # nin=ctxdim, nout=options['dim_word'], 570 | # ortho=False) 571 | # params = get_layer('ff')[0](options, params, prefix='ff_logit', 572 | # nin=options['dim_word'], 573 | # nout=options['n_words']) 574 | params = get_layer('ff')[0](options, params, prefix='ff_logit_lstm', 575 | nin=options['dim'], nout=options['dim_word']+options['dim_pos'], 576 | ortho=False) 577 | params = get_layer('ff')[0](options, params, prefix='ff_logit_prev', 578 | nin=options['dim_word']+options['dim_pos'], 579 | nout=options['dim_word'], ortho=False) 580 | params = 
get_layer('ff')[0](options, params, prefix='ff_logit_ctx', 581 | nin=ctxdim, nout=options['dim_word']+options['dim_pos'], 582 | ortho=False) 583 | params = get_layer('ff')[0](options, params, prefix='ff_logit', 584 | nin=options['dim_word'], 585 | nout=options['n_words']) 586 | params = get_layer('ff')[0](options, params, prefix='ff_logit_pos', 587 | nin=options['dim_pos'], 588 | nout=options['n_pos']) 589 | # params['att_lambda'] = norm_weight(nin=1, nout=options['n_words']) 590 | params['att_lambda'] = 0.01 * numpy.random.randn(options['n_words']).astype('float32') 591 | params['pos_to_word'] = 0.01 * numpy.random.randn(options['n_pos'],options['n_words']).astype('float32') 592 | 593 | return params 594 | 595 | 596 | # build a training model 597 | def build_model(tparams, options): 598 | opt_ret = dict() 599 | 600 | trng = RandomStreams(1234) 601 | use_noise = theano.shared(numpy.float32(0.)) 602 | 603 | # description string: #words x #samples 604 | x = tensor.matrix('x', dtype='int64') 605 | xp = tensor.matrix('xp', dtype='int64') 606 | x_mask = tensor.matrix('x_mask', dtype='float32') 607 | y = tensor.matrix('y', dtype='int64') 608 | yp = tensor.matrix('yp', dtype='int64') 609 | y_mask = tensor.matrix('y_mask', dtype='float32') 610 | 611 | # for the backward rnn, we just need to invert x and x_mask 612 | xr = x[::-1] 613 | xpr = xp[::-1] 614 | xr_mask = x_mask[::-1] 615 | 616 | n_timesteps = x.shape[0] 617 | n_timesteps_trg = y.shape[0] 618 | n_samples = x.shape[1] 619 | 620 | # word embedding for forward rnn (source) 621 | emb_w = tparams['Wemb'][x.flatten()] 622 | emb_pos = tparams['Wemb_pos'][xp.flatten()] 623 | emb = concatenate([emb_w, emb_pos], axis=1) 624 | emb = emb.reshape([n_timesteps, n_samples, options['dim_word']+options['dim_pos']]) 625 | proj = get_layer(options['encoder'])[1](tparams, emb, options, 626 | prefix='encoder', 627 | mask=x_mask) 628 | # word embedding for backward rnn (source) 629 | emb_wr = tparams['Wemb'][xr.flatten()] 630 | emb_posr = tparams['Wemb_pos'][xpr.flatten()] 631 | embr = concatenate([emb_wr, emb_posr], axis=1) 632 | embr = embr.reshape([n_timesteps, n_samples, options['dim_word']+options['dim_pos']]) 633 | projr = get_layer(options['encoder'])[1](tparams, embr, options, 634 | prefix='encoder_r', 635 | mask=xr_mask) 636 | 637 | # context will be the concatenation of forward and backward rnns 638 | ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim-1) 639 | 640 | # mean of the context (across time) will be used to initialize decoder rnn 641 | ctx_mean = (ctx * x_mask[:, :, None]).sum(0) / x_mask.sum(0)[:, None] 642 | 643 | # or you can use the last state of forward + backward encoder rnns 644 | # ctx_mean = concatenate([proj[0][-1], projr[0][-1]], axis=proj[0].ndim-2) 645 | 646 | # initial decoder state 647 | init_state = get_layer('ff')[1](tparams, ctx_mean, options, 648 | prefix='ff_state', activ='tanh') 649 | 650 | # word embedding (target), we will shift the target sequence one time step 651 | # to the right. This is done because of the bi-gram connections in the 652 | # readout and decoder rnn. The first target will be all zeros and we will 653 | # not condition on the last output. 
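    # Illustration (annotation, not part of the original source): for a target
    # y = [w1, w2, w3, eos] the decoder is fed the shifted embeddings of
    # [0, w1, w2, w3], so step t predicts y[t] conditioned on y[<t] and ctx.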
654 | # emb = tparams['Wemb_dec'][y.flatten()] 655 | emb_w = tparams['Wemb'][y.flatten()] 656 | emb_pos = tparams['Wemb_pos'][yp.flatten()] 657 | emb = concatenate([emb_w, emb_pos], axis=1) 658 | emb = emb.reshape([n_timesteps_trg, n_samples, options['dim_word']+options['dim_pos']]) 659 | emb_shifted = tensor.zeros_like(emb) 660 | emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1]) 661 | emb = emb_shifted 662 | 663 | # decoder - pass through the decoder conditional gru with attention 664 | proj = get_layer(options['decoder'])[1](tparams, emb, options, 665 | prefix='decoder', 666 | mask=y_mask, context=ctx, 667 | context_mask=x_mask, 668 | one_step=False, 669 | init_state=init_state) 670 | # hidden states of the decoder gru 671 | proj_h = proj[0] 672 | 673 | # weighted averages of context, generated by attention module 674 | ctxs = proj[1] 675 | 676 | # weights (alignment matrix) 677 | opt_ret['dec_alphas'] = proj[2] 678 | # print opt_ret['dec_alphas'].shape 679 | 680 | # compute word probabilities 681 | logit_lstm = get_layer('ff')[1](tparams, proj_h, options, 682 | prefix='ff_logit_lstm', activ='linear') 683 | logit_prev = get_layer('ff')[1](tparams, emb, options, 684 | prefix='ff_logit_prev', activ='linear') 685 | logit_ctx = get_layer('ff')[1](tparams, ctxs, options, 686 | prefix='ff_logit_ctx', activ='linear') 687 | logit_lstm_w, logit_lstm_p = logit_lstm[:,:,:options['dim_word']], logit_lstm[:,:,-options['dim_pos']:] 688 | logit_prev_w, logit_prev_p = logit_prev[:,:,:options['dim_word']], logit_prev[:,:,-options['dim_pos']:] 689 | logit_ctx_w, logit_ctx_p = logit_ctx[:,:,:options['dim_word']], logit_ctx[:,:,-options['dim_pos']:] 690 | 691 | logit = tensor.tanh(logit_lstm_w+logit_prev_w+logit_ctx_w) 692 | logit_p = tensor.tanh(logit_lstm_p+logit_prev_p+logit_ctx_p) 693 | 694 | if options['use_dropout']: 695 | logit = dropout_layer(logit, use_noise, trng) 696 | logit_p = dropout_layer(logit_p, use_noise, trng) 697 | 698 | logit = get_layer('ff')[1](tparams, logit, options, 699 | prefix='ff_logit', activ='linear') 700 | logit_p = get_layer('ff')[1](tparams, logit_p, options, 701 | prefix='ff_logit_pos', activ='linear') 702 | 703 | #copy attention 704 | logit_shp = logit.shape 705 | sflogit = logit.reshape([logit_shp[0]*logit_shp[1], logit_shp[2]]) 706 | logit_new = (1-tparams['att_lambda']) * sflogit 707 | alpha_shape = opt_ret['dec_alphas'].shape 708 | attw = opt_ret['dec_alphas'].reshape([alpha_shape[0]*alpha_shape[1],alpha_shape[2]]) 709 | 710 | def _step_for_copy(label, lg, _x, atw): 711 | lg = tensor.set_subtensor(lg[label, _x[label%_x.shape[0]]], lg[label, _x[label%_x.shape[0]]] + tparams['att_lambda'][_x[label%_x.shape[0]]]*atw[label]) 712 | return lg 713 | # ls = T.vector('ls', dtype='int64') 714 | result, _ = theano.scan(_step_for_copy, sequences=tensor.arange(logit_new.shape[0]), outputs_info=[logit_new], non_sequences = [x.T, attw]) 715 | 716 | logit_shp_pos = logit_p.shape 717 | sflogit_p = logit_p.reshape([logit_shp_pos[0]*logit_shp_pos[1], logit_shp_pos[2]]) 718 | probs_p = tensor.nnet.softmax(sflogit_p) 719 | 720 | probs = tensor.nnet.softmax(result[-1]+tensor.dot(sflogit_p, tparams['pos_to_word'])) 721 | 722 | # cost 723 | y_flat = y.flatten() 724 | y_flat_idx = tensor.arange(y_flat.shape[0]) * options['n_words'] + y_flat 725 | cost = -tensor.log(probs.flatten()[y_flat_idx]) 726 | cost = cost.reshape([y.shape[0], y.shape[1]]) 727 | cost = (cost * y_mask).sum(0) 728 | 729 | # pos cost 730 | yp_flat = yp.flatten() 731 | yp_flat_idx = 
tensor.arange(yp_flat.shape[0]) * options['n_pos'] + yp_flat 732 | costp = -tensor.log(probs_p.flatten()[yp_flat_idx]) 733 | costp = costp.reshape([yp.shape[0], yp.shape[1]]) 734 | costp = (costp * y_mask).sum(0) 735 | 736 | final_cost = cost+costp 737 | 738 | return trng, use_noise, x, xp, x_mask, y, yp, y_mask, opt_ret, final_cost 739 | 740 | 741 | # build a sampler 742 | def build_sampler(tparams, options, trng, use_noise): 743 | x = tensor.matrix('x', dtype='int64') 744 | xp = tensor.matrix('xp', dtype='int64') 745 | xr = x[::-1] 746 | xpr = xp[::-1] 747 | n_timesteps = x.shape[0] 748 | n_samples = x.shape[1] 749 | 750 | # word embedding (source), forward and backward 751 | # emb = tparams['Wemb'][x.flatten()] 752 | # emb = emb.reshape([n_timesteps, n_samples, options['dim_word']]) 753 | # embr = tparams['Wemb'][xr.flatten()] 754 | # embr = embr.reshape([n_timesteps, n_samples, options['dim_word']]) 755 | emb_w = tparams['Wemb'][x.flatten()] 756 | emb_pos = tparams['Wemb_pos'][xp.flatten()] 757 | emb = concatenate([emb_w, emb_pos], axis=1) 758 | emb = emb.reshape([n_timesteps, n_samples, options['dim_word']+options['dim_pos']]) 759 | 760 | emb_wr = tparams['Wemb'][xr.flatten()] 761 | emb_posr = tparams['Wemb_pos'][xpr.flatten()] 762 | embr = concatenate([emb_wr, emb_posr], axis=1) 763 | embr = embr.reshape([n_timesteps, n_samples, options['dim_word']+options['dim_pos']]) 764 | 765 | # encoder 766 | proj = get_layer(options['encoder'])[1](tparams, emb, options, 767 | prefix='encoder') 768 | projr = get_layer(options['encoder'])[1](tparams, embr, options, 769 | prefix='encoder_r') 770 | 771 | # concatenate forward and backward rnn hidden states 772 | ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim-1) 773 | 774 | # get the input for decoder rnn initializer mlp 775 | ctx_mean = ctx.mean(0) 776 | # ctx_mean = concatenate([proj[0][-1],projr[0][-1]], axis=proj[0].ndim-2) 777 | init_state = get_layer('ff')[1](tparams, ctx_mean, options, 778 | prefix='ff_state', activ='tanh') 779 | 780 | print 'Building f_init...', 781 | outs = [init_state, ctx] 782 | f_init = theano.function([x,xp], outs, name='f_init', profile=profile) 783 | print 'Done' 784 | 785 | # x: 1 x 1 786 | y = tensor.vector('y_sampler', dtype='int64') 787 | yp = tensor.vector('yp_sampler', dtype='int64') 788 | word_map = tensor.vector('wm', dtype='int64') 789 | init_state = tensor.matrix('init_state', dtype='float32') 790 | 791 | # if it's the first word, emb should be all zero and it is indicated by -1 792 | # emb = tensor.switch(y[:, None] < 0, 793 | # tensor.alloc(0., 1, tparams['Wemb_dec'].shape[1]), 794 | # tparams['Wemb_dec'][y]) 795 | emb = tensor.switch(y[:, None] < 0, 796 | tensor.alloc(0., 1, tparams['Wemb'].shape[1]), 797 | tparams['Wemb'][y]) 798 | emb_pos = tensor.switch(yp[:, None] < 0, 799 | tensor.alloc(0., 1, tparams['Wemb_pos'].shape[1]), 800 | tparams['Wemb_pos'][yp]) 801 | emb = concatenate([emb, emb_pos], axis=1) 802 | 803 | # apply one step of conditional gru with attention 804 | proj = get_layer(options['decoder'])[1](tparams, emb, options, 805 | prefix='decoder', 806 | mask=None, context=ctx, 807 | one_step=True, 808 | init_state=init_state) 809 | # get the next hidden state 810 | next_state = proj[0] 811 | 812 | # get the weighted averages of context for this target word y 813 | ctxs = proj[1] 814 | 815 | logit_lstm = get_layer('ff')[1](tparams, next_state, options, 816 | prefix='ff_logit_lstm', activ='linear') 817 | logit_prev = get_layer('ff')[1](tparams, emb, options, 818 | 
prefix='ff_logit_prev', activ='linear') 819 | logit_ctx = get_layer('ff')[1](tparams, ctxs, options, 820 | prefix='ff_logit_ctx', activ='linear') 821 | logit_lstm_w, logit_lstm_p = logit_lstm[:,:options['dim_word']], logit_lstm[:,-options['dim_pos']:] 822 | logit_prev_w, logit_prev_p = logit_prev[:,:options['dim_word']], logit_prev[:,-options['dim_pos']:] 823 | logit_ctx_w, logit_ctx_p = logit_ctx[:,:options['dim_word']], logit_ctx[:,-options['dim_pos']:] 824 | 825 | 826 | logit = tensor.tanh(logit_lstm_w+logit_prev_w+logit_ctx_w) 827 | logit_p = tensor.tanh(logit_lstm_p+logit_prev_p+logit_ctx_p) 828 | 829 | if options['use_dropout']: 830 | logit = dropout_layer(logit, use_noise, trng) 831 | logit_p = dropout_layer(logit_p, use_noise, trng) 832 | 833 | logit = get_layer('ff')[1](tparams, logit, options, 834 | prefix='ff_logit', activ='linear') 835 | # logit = eval('linear')(tensor.dot(logit, tparams[_p('ff_logit', 'W')][:,word_map]) + tparams[_p('ff_logit', 'b')][word_map]) 836 | logit_p = get_layer('ff')[1](tparams, logit_p, options, 837 | prefix='ff_logit_pos', activ='linear') 838 | 839 | logit_new = (1-tparams['att_lambda']) * logit 840 | 841 | attw = proj[2] 842 | 843 | def _step_for_copy(label, lg, _x, atw): 844 | lg = tensor.set_subtensor(lg[label, _x[label%_x.shape[0]]], lg[label, _x[label%_x.shape[0]]] + tparams['att_lambda'][_x[label%_x.shape[0]]]*atw[label]) 845 | return lg 846 | # ls = T.vector('ls', dtype='int64') 847 | result, _ = theano.scan(_step_for_copy, sequences=tensor.arange(logit_new.shape[0]), outputs_info=[logit_new], non_sequences = [x.T, attw]) 848 | 849 | next_probs_p = tensor.nnet.softmax(logit_p) 850 | 851 | # compute the softmax probability 852 | next_probs = tensor.nnet.softmax(result[-1]+tensor.dot(logit_p, tparams['pos_to_word'])) 853 | next_probs = next_probs[:,word_map] 854 | 855 | # sample from softmax distribution to get the sample 856 | next_sample = trng.multinomial(pvals=next_probs).argmax(1) 857 | next_sample_p = trng.multinomial(pvals=next_probs_p).argmax(1) 858 | 859 | # compile a function to do the whole thing above, next word probability, 860 | # sampled word for the next target, next hidden state to be used 861 | print 'Building f_next..', 862 | inps = [x, y, yp, ctx, init_state, word_map] 863 | outs = [next_probs, next_probs_p, next_sample, next_sample_p, next_state] 864 | f_next = theano.function(inps, outs, name='f_next', profile=profile) 865 | print 'Done' 866 | 867 | return f_init, f_next 868 | 869 | 870 | # generate sample, either with stochastic sampling or beam search. Note that, 871 | # this function iteratively calls f_init and f_next functions. 
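# x and xp hold the source word and POS index matrices, word_map is the
# restricted output vocabulary (source-word ids plus entries from the common
# dictionary), and the function returns the sampled word and POS sequences
# together with their accumulated negative log-probability scores.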
872 | def gen_sample(tparams, f_init, f_next, x, xp, word_map, options, trng=None, k=1, maxlen=30, 873 | stochastic=True, argmax=False): 874 | 875 | # k is the beam size we have 876 | if stochastic is not True: 877 | stochastic = True 878 | if k > 1: 879 | assert not stochastic, \ 880 | 'Beam search does not support stochastic sampling' 881 | 882 | sample = [] 883 | samplep = [] 884 | sample_score = [] 885 | sample_scorep = [] 886 | if stochastic: 887 | sample_score = 0 888 | 889 | live_k = 1 890 | dead_k = 0 891 | 892 | hyp_samples = [[]] * live_k 893 | hyp_samplesp = [[]] * live_k 894 | hyp_scores = numpy.zeros(live_k).astype('float32') 895 | hyp_scoresp = numpy.zeros(live_k).astype('float32') 896 | hyp_states = [] 897 | 898 | # get initial state of decoder rnn and encoder context 899 | ret = f_init(x,xp) 900 | next_state, ctx0 = ret[0], ret[1] 901 | next_w = -1 * numpy.ones((1,)).astype('int64') # bos indicator 902 | next_wp = -1 * numpy.ones((1,)).astype('int64') 903 | 904 | for ii in xrange(maxlen): 905 | ctx = numpy.tile(ctx0, [live_k, 1]) 906 | inps = [x, next_w, next_wp, ctx, next_state, word_map] 907 | ret = f_next(*inps) 908 | next_p, next_pp, next_w, next_wp, next_state = ret[0], ret[1], ret[2], ret[3], ret[4] 909 | true_next_w = numpy.array([word_map[next_w[0]]]) 910 | 911 | if stochastic: 912 | if argmax: 913 | nw = next_p[0].argmax() 914 | nw1 = word_map[next_p[0].argmax()] 915 | nwp = next_pp[0].argmax() 916 | else: 917 | nw = next_w[0] 918 | nw1 = true_next_w[0] 919 | nwp = next_wp[0] 920 | sample.append(nw1) 921 | samplep.append(nwp) 922 | sample_score -= numpy.log(next_p[0, nw]) 923 | sample_scorep -= numpy.log(next_pp[0, nwp]) 924 | if nw == 0: 925 | break 926 | else: 927 | cand_scores = hyp_scores[:, None] - numpy.log(next_p) 928 | cand_scoresp = hyp_scoresp[:, None] - numpy.log(next_pp) 929 | cand_flat = cand_scores.flatten() 930 | cand_flatp = cand_scoresp.flatten() 931 | ranks_flat = cand_flat.argsort()[:(k-dead_k)] 932 | ranks_flatp = cand_flatp.argsort()[:(k-dead_k)] 933 | 934 | voc_size = next_p.shape[1] 935 | voc_sizep = next_pp.shape[1] 936 | trans_indices = ranks_flat / voc_size 937 | trans_indicesp = ranks_flatp/ voc_sizep 938 | word_indices = ranks_flat % voc_size 939 | word_indicesp = ranks_flatp % voc_sizep 940 | costs = cand_flat[ranks_flat] 941 | costsp = cand_flatp[ranks_flatp] 942 | 943 | new_hyp_samples = [] 944 | new_hyp_samplesp = [] 945 | new_hyp_scores = numpy.zeros(k-dead_k).astype('float32') 946 | new_hyp_scoresp = numpy.zeros(k-dead_k).astype('float32') 947 | new_hyp_states = [] 948 | 949 | for idx, [ti, wi] in enumerate(zip(trans_indices, word_indices)): 950 | new_hyp_samples.append(hyp_samples[ti]+[wi]) 951 | new_hyp_scores[idx] = copy.copy(costs[idx]) 952 | new_hyp_states.append(copy.copy(next_state[ti])) 953 | 954 | # check the finished samples 955 | new_live_k = 0 956 | hyp_samples = [] 957 | hyp_scores = [] 958 | hyp_states = [] 959 | 960 | for idx in xrange(len(new_hyp_samples)): 961 | if new_hyp_samples[idx][-1] == 0: 962 | sample.append(new_hyp_samples[idx]) 963 | sample_score.append(new_hyp_scores[idx]) 964 | dead_k += 1 965 | else: 966 | new_live_k += 1 967 | hyp_samples.append(new_hyp_samples[idx]) 968 | hyp_scores.append(new_hyp_scores[idx]) 969 | hyp_states.append(new_hyp_states[idx]) 970 | hyp_scores = numpy.array(hyp_scores) 971 | live_k = new_live_k 972 | 973 | if new_live_k < 1: 974 | break 975 | if dead_k >= k: 976 | break 977 | 978 | next_w = numpy.array([w[-1] for w in hyp_samples]) 979 | next_state = numpy.array(hyp_states) 
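            # beam bookkeeping for this step is now complete: hypotheses that
            # produced eos (index 0) have been moved into `sample`, the rest
            # stay live, and the last word and decoder state of each live
            # hypothesis become next_w / next_state for the next f_next call.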
980 | 981 | if not stochastic: 982 | # dump every remaining one 983 | if live_k > 0: 984 | for idx in xrange(live_k): 985 | sample.append(hyp_samples[idx]) 986 | sample_score.append(hyp_scores[idx]) 987 | 988 | return sample, samplep, sample_score, sample_scorep 989 | 990 | 991 | # calculate the log probablities on a given corpus using translation model 992 | def pred_probs(f_log_probs, prepare_data, options, iterator, verbose=True): 993 | probs = [] 994 | 995 | n_done = 0 996 | 997 | for x, y, xp, yp in iterator: 998 | n_done += len(x) 999 | 1000 | x, x_mask, y, y_mask = prepare_data(x, y, 1001 | n_words_src=options['n_words_src'], 1002 | n_words=options['n_words']) 1003 | xp, _, yp, _ = prepare_data(xp, yp, 1004 | n_words_src=options['n_words_src'], 1005 | n_words=options['n_words']) 1006 | 1007 | pprobs = f_log_probs(x, xp, x_mask, y, yp, y_mask) 1008 | for pp in pprobs: 1009 | probs.append(pp) 1010 | 1011 | if numpy.isnan(numpy.mean(probs)): 1012 | # ipdb.set_trace() 1013 | print 1 1014 | 1015 | if verbose: 1016 | print >>sys.stderr, '%d samples computed' % (n_done) 1017 | 1018 | return numpy.array(probs) 1019 | 1020 | 1021 | # optimizers 1022 | # name(hyperp, tparams, grads, inputs (list), cost) = f_grad_shared, f_update 1023 | def adam(lr, tparams, grads, inp, cost, beta1=0.9, beta2=0.999, e=1e-8): 1024 | 1025 | gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k) 1026 | for k, p in tparams.iteritems()] 1027 | gsup = [(gs, g) for gs, g in zip(gshared, grads)] 1028 | 1029 | f_grad_shared = theano.function(inp, cost, updates=gsup, profile=profile) 1030 | 1031 | updates = [] 1032 | 1033 | t_prev = theano.shared(numpy.float32(0.)) 1034 | t = t_prev + 1. 1035 | lr_t = lr * tensor.sqrt(1. - beta2**t) / (1. - beta1**t) 1036 | 1037 | for p, g in zip(tparams.values(), gshared): 1038 | m = theano.shared(p.get_value() * 0., p.name + '_mean') 1039 | v = theano.shared(p.get_value() * 0., p.name + '_variance') 1040 | m_t = beta1 * m + (1. - beta1) * g 1041 | v_t = beta2 * v + (1. 
- beta2) * g**2 1042 | step = lr_t * m_t / (tensor.sqrt(v_t) + e) 1043 | p_t = p - step 1044 | updates.append((m, m_t)) 1045 | updates.append((v, v_t)) 1046 | updates.append((p, p_t)) 1047 | updates.append((t_prev, t)) 1048 | 1049 | f_update = theano.function([lr], [], updates=updates, 1050 | on_unused_input='ignore', profile=profile) 1051 | 1052 | return f_grad_shared, f_update 1053 | 1054 | 1055 | def adadelta(lr, tparams, grads, inp, cost): 1056 | zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), 1057 | name='%s_grad' % k) 1058 | for k, p in tparams.iteritems()] 1059 | running_up2 = [theano.shared(p.get_value() * numpy.float32(0.), 1060 | name='%s_rup2' % k) 1061 | for k, p in tparams.iteritems()] 1062 | running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.), 1063 | name='%s_rgrad2' % k) 1064 | for k, p in tparams.iteritems()] 1065 | 1066 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 1067 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 1068 | for rg2, g in zip(running_grads2, grads)] 1069 | 1070 | f_grad_shared = theano.function(inp, cost, updates=zgup+rg2up, 1071 | profile=profile) 1072 | 1073 | updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg 1074 | for zg, ru2, rg2 in zip(zipped_grads, running_up2, 1075 | running_grads2)] 1076 | ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2)) 1077 | for ru2, ud in zip(running_up2, updir)] 1078 | param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)] 1079 | 1080 | f_update = theano.function([lr], [], updates=ru2up+param_up, 1081 | on_unused_input='ignore', profile=profile) 1082 | 1083 | return f_grad_shared, f_update 1084 | 1085 | 1086 | def rmsprop(lr, tparams, grads, inp, cost): 1087 | zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), 1088 | name='%s_grad' % k) 1089 | for k, p in tparams.iteritems()] 1090 | running_grads = [theano.shared(p.get_value() * numpy.float32(0.), 1091 | name='%s_rgrad' % k) 1092 | for k, p in tparams.iteritems()] 1093 | running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.), 1094 | name='%s_rgrad2' % k) 1095 | for k, p in tparams.iteritems()] 1096 | 1097 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 1098 | rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)] 1099 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 1100 | for rg2, g in zip(running_grads2, grads)] 1101 | 1102 | f_grad_shared = theano.function(inp, cost, updates=zgup+rgup+rg2up, 1103 | profile=profile) 1104 | 1105 | updir = [theano.shared(p.get_value() * numpy.float32(0.), 1106 | name='%s_updir' % k) 1107 | for k, p in tparams.iteritems()] 1108 | updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4)) 1109 | for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads, 1110 | running_grads2)] 1111 | param_up = [(p, p + udn[1]) 1112 | for p, udn in zip(itemlist(tparams), updir_new)] 1113 | f_update = theano.function([lr], [], updates=updir_new+param_up, 1114 | on_unused_input='ignore', profile=profile) 1115 | 1116 | return f_grad_shared, f_update 1117 | 1118 | 1119 | def sgd(lr, tparams, grads, x, mask, y, cost): 1120 | gshared = [theano.shared(p.get_value() * 0., 1121 | name='%s_grad' % k) 1122 | for k, p in tparams.iteritems()] 1123 | gsup = [(gs, g) for gs, g in zip(gshared, grads)] 1124 | 1125 | f_grad_shared = theano.function([x, mask, y], cost, updates=gsup, 1126 | profile=profile) 1127 | 1128 | pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)] 1129 | f_update = theano.function([lr], [], updates=pup, 
profile=profile) 1130 | 1131 | return f_grad_shared, f_update 1132 | 1133 | 1134 | def train(dim_word=100, # word vector dimensionality 1135 | dim_pos = 100, 1136 | dim=1000, # the number of LSTM units 1137 | encoder='gru', 1138 | decoder='gru_cond', 1139 | patience=10, # early stopping patience 1140 | max_epochs=5000, 1141 | finish_after=10000000, # finish after this many updates 1142 | dispFreq=100, 1143 | decay_c=0., # L2 regularization penalty 1144 | alpha_c=0., # alignment regularization 1145 | clip_c=-1., # gradient clipping threshold 1146 | lrate=0.01, # learning rate 1147 | n_words_src=100000, # source vocabulary size 1148 | n_words=100000, # target vocabulary size 1149 | n_pos = 36, # pos vocabulary size 1150 | maxlen=100, # maximum length of the description 1151 | optimizer='rmsprop', 1152 | batch_size=16, 1153 | valid_batch_size=16, 1154 | saveto='model.npz', 1155 | validFreq=1000, 1156 | saveFreq=1000, # save the parameters after every saveFreq updates 1157 | sampleFreq=100, # generate some samples after every sampleFreq 1158 | datasets=[ 1159 | '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok', 1160 | '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok', 1161 | 'data/post_pos.txt', 1162 | 'data/response_pos.txt'], 1163 | valid_datasets=['../data/dev/newstest2011.en.tok', 1164 | '../data/dev/newstest2011.fr.tok', 1165 | 'data/v_post_pos.txt', 1166 | 'data/v_response_pos.txt'], 1167 | dictionaries=[ 1168 | 'data/worddicts.pkl', 1169 | 'data/posdicts.pkl', 1170 | 'data/dict2.txt'], 1171 | use_dropout=False, 1172 | reload_=False, 1173 | overwrite=False): 1174 | 1175 | # Model options 1176 | model_options = locals().copy() 1177 | 1178 | # load dictionaries and invert them 1179 | worddicts = [None] * 2 1180 | worddicts_r = [None] * 2 1181 | for ii, dd in enumerate(dictionaries[:2]): 1182 | with open(dd, 'rb') as f: 1183 | worddicts[ii] = pkl.load(f) 1184 | worddicts_r[ii] = dict() 1185 | for kk, vv in worddicts[ii].iteritems(): 1186 | worddicts_r[ii][vv] = kk 1187 | pos_dicts_r = worddicts_r[1] 1188 | worddicts_r = [worddicts_r[0],worddicts_r[0]] 1189 | 1190 | word_map0 = [] 1191 | with open(dictionaries[-1]) as ff: 1192 | for line in ff: 1193 | line = line.strip() 1194 | if line in worddicts[0]: 1195 | if line not in word_map0 and worddicts[0][line] 0.: 1248 | decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') 1249 | weight_decay = 0. 1250 | for kk, vv in tparams.iteritems(): 1251 | weight_decay += (vv ** 2).sum() 1252 | weight_decay *= decay_c 1253 | cost += weight_decay 1254 | 1255 | # regularize the alpha weights 1256 | if alpha_c > 0. and not model_options['decoder'].endswith('simple'): 1257 | alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c') 1258 | alpha_reg = alpha_c * ( 1259 | (tensor.cast(y_mask.sum(0)//x_mask.sum(0), 'float32')[:, None] - 1260 | opt_ret['dec_alphas'].sum(0))**2).sum(1).mean() 1261 | cost += alpha_reg 1262 | 1263 | # after all regularizers - compile the computational graph for cost 1264 | print 'Building f_cost...', 1265 | f_cost = theano.function(inps, cost, profile=profile) 1266 | print 'Done' 1267 | 1268 | print 'Computing gradient...', 1269 | grads = tensor.grad(cost, wrt=itemlist(tparams)) 1270 | print 'Done' 1271 | 1272 | # apply gradient clipping here 1273 | if clip_c > 0.: 1274 | g2 = 0. 
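        # gradient clipping: g2 accumulates the squared global norm of all
        # gradients below; whenever it exceeds clip_c**2, every gradient is
        # rescaled by clip_c / sqrt(g2) so the global norm is capped at clip_c.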
1275 | for g in grads: 1276 | g2 += (g**2).sum() 1277 | new_grads = [] 1278 | for g in grads: 1279 | new_grads.append(tensor.switch(g2 > (clip_c**2), 1280 | g / tensor.sqrt(g2) * clip_c, 1281 | g)) 1282 | grads = new_grads 1283 | 1284 | # compile the optimizer, the actual computational graph is compiled here 1285 | lr = tensor.scalar(name='lr') 1286 | print 'Building optimizers...', 1287 | f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost) 1288 | print 'Done' 1289 | 1290 | print 'Optimization' 1291 | 1292 | best_p = None 1293 | bad_counter = 0 1294 | uidx = 0 1295 | estop = False 1296 | history_errs = [] 1297 | # reload history 1298 | if reload_ and os.path.exists(saveto): 1299 | rmodel = numpy.load(saveto) 1300 | history_errs = list(rmodel['history_errs']) 1301 | if 'uidx' in rmodel: 1302 | uidx = rmodel['uidx'] 1303 | 1304 | if validFreq == -1: 1305 | validFreq = len(train[0])/batch_size 1306 | if saveFreq == -1: 1307 | saveFreq = len(train[0])/batch_size 1308 | if sampleFreq == -1: 1309 | sampleFreq = len(train[0])/batch_size 1310 | 1311 | for eidx in xrange(max_epochs): 1312 | n_samples = 0 1313 | 1314 | for x, y, xp, yp in train: 1315 | n_samples += len(x) 1316 | if len(x) == 0: 1317 | continue 1318 | uidx += 1 1319 | # use_noise.set_value(1.) 1320 | 1321 | x, x_mask, y, y_mask = prepare_data(x, y, maxlen=maxlen, 1322 | n_words_src=n_words_src, 1323 | n_words=n_words) 1324 | xp, _, yp, _ = prepare_data(xp, yp, maxlen=maxlen, 1325 | n_words_src=n_pos, 1326 | n_words=n_pos) 1327 | 1328 | if x is None: 1329 | print 'Minibatch with zero sample under length ', maxlen 1330 | uidx -= 1 1331 | continue 1332 | # word_map = list(set(list(x.reshape(x.shape[0]*x.shape[1])))) 1333 | # word_map3 = list(set(word_map+word_map0)) 1334 | # ipdb.set_trace() 1335 | 1336 | ud_start = time.time() 1337 | 1338 | # compute cost, grads and copy grads to shared variables 1339 | print 'fuck cost' 1340 | cost = f_grad_shared(x, xp, x_mask, y, yp, y_mask) 1341 | 1342 | # do the update on parameters 1343 | f_update(lrate) 1344 | 1345 | ud = time.time() - ud_start 1346 | 1347 | # check for bad numbers, usually we remove non-finite elements 1348 | # and continue training - but not done here 1349 | if numpy.isnan(cost) or numpy.isinf(cost): 1350 | print 'NaN detected' 1351 | return 1., 1., 1. 1352 | 1353 | # verbose 1354 | if numpy.mod(uidx, dispFreq) == 0: 1355 | print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud 1356 | 1357 | # save the best model so far, in addition, save the latest model 1358 | # into a separate file with the iteration number for external eval 1359 | if numpy.mod(uidx, saveFreq) == 0: 1360 | print 'Saving the best model...', 1361 | if best_p is not None: 1362 | params = best_p 1363 | else: 1364 | params = unzip(tparams) 1365 | numpy.savez(saveto, history_errs=history_errs, uidx=uidx, **params) 1366 | pkl.dump(model_options, open('%s.pkl' % saveto, 'wb')) 1367 | print 'Done' 1368 | 1369 | # save with uidx 1370 | if not overwrite: 1371 | print 'Saving the model at iteration {}...'.format(uidx), 1372 | saveto_uidx = '{}.iter{}.npz'.format( 1373 | os.path.splitext(saveto)[0], uidx) 1374 | numpy.savez(saveto_uidx, history_errs=history_errs, 1375 | uidx=uidx, **unzip(tparams)) 1376 | print 'Done' 1377 | 1378 | 1379 | # generate some samples with the model and display them 1380 | if numpy.mod(uidx, sampleFreq) == 0: 1381 | # FIXME: random selection? 
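                # sample display: for up to five sentences of the current
                # minibatch, build the restricted output vocabulary (source-word
                # ids plus word_map0), decode with gen_sample, and print the
                # source, the reference and the generated sample with their
                # POS tags attached.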
1382 | for jj in xrange(numpy.minimum(5, x.shape[1])): 1383 | stochastic = True 1384 | word_map = list(set(list(x[:, jj][:, None].reshape(x[:, jj][:, None].shape[0]*x[:, jj][:, None].shape[1]))+word_map0)) 1385 | sample, samplep, score, scorep = gen_sample(tparams, f_init, f_next, 1386 | x[:, jj][:, None], xp[:, jj][:, None], word_map, 1387 | model_options, trng=trng, k=1, 1388 | maxlen=30, 1389 | stochastic=stochastic, 1390 | argmax=False) 1391 | print 'Source ', jj, ': ', 1392 | assert x.shape == xp.shape 1393 | for vv, vvp in zip(x[:, jj], xp[:,jj]): 1394 | if vv == 0: 1395 | break 1396 | if vv in worddicts_r[0]: 1397 | print worddicts_r[0][vv], 1398 | if vvp in pos_dicts_r: 1399 | print '#'+str(pos_dicts_r[vvp]), 1400 | else: 1401 | print 'UNK', 1402 | print 1403 | print 'Truth ', jj, ' : ', 1404 | for vv, vvp in zip(y[:, jj], yp[:, jj]): 1405 | if vv == 0: 1406 | break 1407 | if vv in worddicts_r[1]: 1408 | print worddicts_r[1][vv], 1409 | if vvp in pos_dicts_r: 1410 | print '#'+str(pos_dicts_r[vvp]), 1411 | else: 1412 | print 'UNK', 1413 | print 1414 | print 'Sample ', jj, ': ', 1415 | if stochastic: 1416 | ss = sample 1417 | ssp = samplep 1418 | else: 1419 | score = score / numpy.array([len(s) for s in sample]) 1420 | scorep = scorep / numpy.array([len(s) for s in samplep]) 1421 | ss = sample[score.argmin()] 1422 | ssp = samplep[score.argmin()] 1423 | for vv,vvp in zip(ss,ssp): 1424 | if vv == 0: 1425 | break 1426 | if vv in worddicts_r[1]: 1427 | print worddicts_r[1][vv], 1428 | if vvp in pos_dicts_r: 1429 | print '#'+str(pos_dicts_r[vvp]), 1430 | else: 1431 | print 'UNK', 1432 | print 1433 | 1434 | # validate model on validation set and early stop if necessary 1435 | if numpy.mod(uidx, validFreq) == 0: 1436 | use_noise.set_value(0.) 1437 | valid_errs = pred_probs(f_log_probs, prepare_data, 1438 | model_options, valid) 1439 | valid_err = valid_errs.mean() 1440 | history_errs.append(valid_err) 1441 | 1442 | if uidx == 0 or valid_err <= numpy.array(history_errs).min(): 1443 | best_p = unzip(tparams) 1444 | bad_counter = 0 1445 | if len(history_errs) > patience and valid_err >= \ 1446 | numpy.array(history_errs)[:-patience].min(): 1447 | bad_counter += 1 1448 | if bad_counter > patience: 1449 | print 'Early Stop!' 1450 | estop = True 1451 | break 1452 | 1453 | if numpy.isnan(valid_err): 1454 | # ipdb.set_trace() 1455 | print 1 1456 | 1457 | print 'Valid ', valid_err 1458 | 1459 | # finish after this many updates 1460 | if uidx >= finish_after: 1461 | print 'Finishing after %d iterations!' % uidx 1462 | estop = True 1463 | break 1464 | # ipdb.set_trace() 1465 | print 'Seen %d samples' % n_samples 1466 | 1467 | if estop: 1468 | break 1469 | 1470 | if best_p is not None: 1471 | zipp(best_p, tparams) 1472 | 1473 | use_noise.set_value(0.) 
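    # final evaluation: with the best parameters restored into tparams and
    # dropout noise switched off, compute the mean validation cost once more
    # and save the best parameters together with the error history.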
1474 | valid_err = pred_probs(f_log_probs, prepare_data, 1475 | model_options, valid).mean() 1476 | 1477 | print 'Valid ', valid_err 1478 | 1479 | params = copy.copy(best_p) 1480 | numpy.savez(saveto, zipped_params=best_p, 1481 | history_errs=history_errs, 1482 | uidx=uidx, 1483 | **params) 1484 | 1485 | 1486 | return valid_err 1487 | 1488 | 1489 | if __name__ == '__main__': 1490 | pass 1491 | -------------------------------------------------------------------------------- /Att_POS_CopyNet/train.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import os 3 | import cPickle 4 | 5 | from nmt_new_pos_word import train 6 | 7 | def main(job_id, params): 8 | print params 9 | basedir = 'data_2' 10 | validerr = train(saveto=params['model'][0], 11 | reload_=params['reload'][0], 12 | dim_word=params['dim_word'][0], 13 | dim_pos=params['dim_pos'][0], 14 | dim=params['dim'][0], 15 | n_words=params['n-words'][0], 16 | n_pos=params['n-pos'][0]+1, 17 | n_words_src=params['n-words'][0], 18 | decay_c=params['decay-c'][0], 19 | clip_c=params['clip-c'][0], 20 | lrate=params['learning-rate'][0], 21 | optimizer=params['optimizer'][0], 22 | maxlen=15, 23 | batch_size=4, 24 | valid_batch_size=1, 25 | datasets=['%s/p.txt'%basedir, 26 | '%s/p.txt'%basedir, 27 | '%s/p_pos.txt'%basedir, 28 | '%s/p_pos.txt'%basedir], 29 | valid_datasets=['%s/p.txt'%basedir, 30 | '%s/p.txt'%basedir, 31 | '%s/p_pos.txt'%basedir, 32 | '%s/p_pos.txt'%basedir], 33 | # dictionaries=['%s/p.txt.pkl'%basedir, 34 | # '%s/r.txt.pkl'%basedir], 35 | dictionaries=['%s/word_dict.pkl'%basedir,'%s/pos_dict.pkl'%basedir,'%s/dict2.txt'%basedir], 36 | validFreq=50000, 37 | dispFreq=1, 38 | saveFreq=100, 39 | sampleFreq=1, 40 | use_dropout=params['use-dropout'][0], 41 | overwrite=False) 42 | return validerr 43 | 44 | if __name__ == '__main__': 45 | # f = cPickle.load(open(r'data//p.txt.pkl')) 46 | # print f 47 | 48 | """ 49 | datasets: 50 | 51 | dictionaries: 52 | OrderedDict([('eos', 0), ('UNK', 1), ('b', 2), ('c', 3), ('a', 4)]) 53 | OrderedDict([('eos', 0), ('UNK', 1), ('B', 2), ('C', 3), ('A', 4)]) 54 | 55 | """ 56 | basedir = 'data_2' 57 | main(0, { 58 | 'model': ['%s/model/m.npz'%basedir], 59 | 'dim_word': [100],#word embedding dim 60 | 'dim_pos': [100], #pos embedding dim 61 | 'dim': [100], #hidden dim 62 | 'n-words': [6], #vocabulary size 63 | 'n-pos':[6], #pos tag set size 64 | 'optimizer': ['rmsprop'], 65 | 'decay-c': [0.], 66 | 'clip-c': [1.], 67 | 'use-dropout': [False], 68 | 'learning-rate': [0.01], 69 | 'reload': [False]}) 70 | 71 | 72 | -------------------------------------------------------------------------------- /Att_Seq2Seq/Pdt.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | ################################################################## 3 | # 4 | # Author: Chuwei Luo 5 | # Email: luochuwei@gmail.com 6 | # Date: 08/08/2016(Bug fixed) 7 | # Usage: For testing. 
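#        (loads a trained Att_Seq2Seq model, builds a restricted output
#         vocabulary from the source words plus a common dictionary, and
#         decodes a source file with beam search)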
8 | # 9 | # 10 | ################################################################## 11 | import argparse 12 | import theano 13 | import numpy 14 | import cPickle as pkl 15 | 16 | from nmt_word_without_copy import (build_sampler, gen_sample, load_params, 17 | init_params, init_tparams) 18 | 19 | from multiprocessing import Process, Queue 20 | 21 | 22 | 23 | 24 | def translate_model(word_map0, queue, rqueue, pid, model, options, k, normalize, n_best): 25 | 26 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 27 | trng = RandomStreams(1234) 28 | use_noise = theano.shared(numpy.float32(0.)) 29 | 30 | # allocate model parameters 31 | params = init_params(options) 32 | 33 | # load model parameters and set theano shared variables 34 | params = load_params(model, params) 35 | tparams = init_tparams(params) 36 | 37 | # word index 38 | f_init, f_next = build_sampler(tparams, options, trng, use_noise) 39 | 40 | def _translate(seq): 41 | xx = numpy.array(seq).reshape([len(seq), 1]) 42 | word_map = list(set(list(xx.reshape(xx.shape[0]*xx.shape[1]))+word_map0)) 43 | # sample given an input sequence and obtain scores 44 | sample, score = gen_sample(tparams, f_init, f_next, 45 | xx, word_map, 46 | options, trng=trng, k=k, maxlen=200, 47 | stochastic=False, argmax=False) 48 | 49 | # normalize scores according to sequence lengths 50 | if normalize: 51 | lengths = numpy.array([len(s) for s in sample]) 52 | score = score / lengths 53 | if n_best > 1: 54 | sidx = numpy.argsort(score)[:n_best] 55 | 56 | else: 57 | sidx = numpy.argmin(score) 58 | # return numpy.array(word_map)[sample[sidx]], numpy.array(score)[sidx] 59 | 60 | return numpy.array(sample)[sidx], numpy.array(score)[sidx] 61 | # return numpy.array(word_map)[sample[sidx]], numpy.array(score)[sidx] 62 | 63 | while True: 64 | req = queue.get() 65 | if req is None: 66 | break 67 | 68 | idx, x = req[0], req[1] 69 | print pid, '-', idx 70 | seq, scores = _translate(x) 71 | 72 | rqueue.put((idx, seq, scores)) 73 | 74 | return 75 | 76 | 77 | def predict(model, dictionary, common_dictionary, source_file, saveto, k=5, 78 | normalize=False, n_process=5, chr_level=False, n_best=1): 79 | 80 | # load model model_options 81 | with open('%s.pkl' % model, 'rb') as f: 82 | options = pkl.load(f) 83 | 84 | # load source dictionary and invert 85 | with open(dictionary, 'rb') as f: 86 | word_dict = pkl.load(f) 87 | word_idict = dict() 88 | for kk, vv in word_dict.iteritems(): 89 | word_idict[vv] = kk 90 | word_idict[0] = '' 91 | word_idict[1] = 'UNK' 92 | 93 | word_idict_trg = word_idict 94 | # load target dictionary and invert 95 | # with open(dictionary_target, 'rb') as f: 96 | # word_dict_trg = pkl.load(f) 97 | # word_idict_trg = dict() 98 | # for kk, vv in word_dict_trg.iteritems(): 99 | # word_idict_trg[vv] = kk 100 | # word_idict_trg[0] = '' 101 | # word_idict_trg[1] = 'UNK' 102 | 103 | word_map0 = [] 104 | with open(common_dictionary) as ff: 105 | for line in ff: 106 | line = line.strip() 107 | if line in word_dict: 108 | if line not in word_map0: 109 | word_map0.append(word_dict[line]) 110 | 111 | # create input and output queues for processes 112 | queue = Queue() 113 | rqueue = Queue() 114 | processes = [None] * n_process 115 | for midx in xrange(n_process): 116 | processes[midx] = Process( 117 | target=translate_model, 118 | args=(word_map0, queue, rqueue, midx, model, options, k, normalize, n_best)) 119 | processes[midx].start() 120 | 121 | # utility function 122 | def _seqs2words(caps): 123 | capsw = [] 124 | for cc in caps: 125 | ww = 
[] 126 | for w in cc: 127 | if w == 0: 128 | break 129 | ww.append(word_idict_trg[w]) 130 | capsw.append(' '.join(ww)) 131 | return capsw 132 | 133 | def _send_jobs(fname): 134 | with open(fname, 'r') as f: 135 | for idx, line in enumerate(f): 136 | if chr_level: 137 | words = list(line.decode('utf-8').strip()) 138 | else: 139 | words = line.strip().split() 140 | x = map(lambda w: word_dict[w] if w in word_dict else 1, words) 141 | x = map(lambda ii: ii if ii < options['n_words'] else 1, x) 142 | x += [0] 143 | queue.put((idx, x)) 144 | return idx+1 145 | 146 | def _finish_processes(): 147 | for midx in xrange(n_process): 148 | queue.put(None) 149 | 150 | def _retrieve_jobs(n_samples): 151 | trans = [None] * n_samples 152 | scores = [None] * n_samples 153 | for idx in xrange(n_samples): 154 | resp = rqueue.get() 155 | trans[resp[0]] = resp[1] 156 | scores[resp[0]] = resp[2] 157 | if numpy.mod(idx, 10) == 0: 158 | print 'Sample ', (idx+1), '/', n_samples, ' Done' 159 | return trans, scores 160 | 161 | print 'Translating ', source_file, '...' 162 | n_samples = _send_jobs(source_file) 163 | trans, scores = _retrieve_jobs(n_samples) 164 | _finish_processes() 165 | 166 | if n_best == 1: 167 | trans = _seqs2words(trans) 168 | else: 169 | n_best_trans = [] 170 | for idx, (n_best_tr, score_) in enumerate(zip(trans, scores)): 171 | sentences = _seqs2words(n_best_tr) 172 | for ids, trans_ in enumerate(sentences): 173 | n_best_trans.append( 174 | '|||'.join( 175 | ['{}'.format(idx), trans_, 176 | '{}'.format(score_[ids])])) 177 | trans = n_best_trans 178 | 179 | with open(saveto, 'w') as f: 180 | print >>f, '\n'.join(trans) 181 | print 'Done' 182 | 183 | 184 | if __name__ == "__main__": 185 | parser = argparse.ArgumentParser() 186 | parser.add_argument('-k', type=int, default=5, help="Beam size") 187 | parser.add_argument('-p', type=int, default=5, help="Number of processes") 188 | parser.add_argument('-n', action="store_true", default=False, 189 | help="Normalize wrt sequence length") 190 | parser.add_argument('-c', action="store_true", default=False, 191 | help="Character level") 192 | parser.add_argument('-b', type=int, default=1, help="Output n-best list") 193 | parser.add_argument('model', type=str) 194 | parser.add_argument('dictionary', type=str) 195 | parser.add_argument('common_dictionary', type=str) 196 | parser.add_argument('source', type=str) 197 | parser.add_argument('saveto', type=str) 198 | 199 | args = parser.parse_args() 200 | 201 | main(args.model, args.dictionary, args.common_dictionary, args.source, 202 | args.saveto, k=args.k, normalize=args.n, n_process=args.p, 203 | chr_level=args.c, n_best=args.b) 204 | -------------------------------------------------------------------------------- /Att_Seq2Seq/Pdt_windows.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | ################################################################## 3 | # 4 | # Author: Chuwei Luo 5 | # Email: luochuwei@gmail.com 6 | # Date: 08/08/2016(Bug fixed) 7 | # Usage: For windows predict 8 | # 9 | ################################################################## 10 | import Pdt as TTT 11 | 12 | if __name__ == '__main__': 13 | TTT.predict(r'data_2/model/m.npz', r'data_2/word_dict.pkl', r'data_2/dict2.txt', r'data_2/p.txt', r'data_2/ttt.txt', k=5, n_process=1) 14 | 15 | 16 | -------------------------------------------------------------------------------- /Att_Seq2Seq/README.md: 
-------------------------------------------------------------------------------- 1 | # Standard Model 2 | 3 | Standard Seq2Seq Attention Model 4 | 5 | (For training faster, the output vocabulary is built according to the input and a extra dictionary) 6 | -------------------------------------------------------------------------------- /Att_Seq2Seq/Seq2SeqAtt.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | ################################################################## 3 | # 4 | # Author: Chuwei Luo 5 | # Email: luochuwei@gmail.com 6 | # Date: 08/08/2016(Bug fixed) 7 | # Usage: Seq2Seq Att, the output vocabulary is built 8 | # according to the input and a extra dictionary 9 | # 10 | ################################################################## 11 | 12 | import numpy 13 | import os 14 | import cPickle 15 | 16 | from nmt_word_without_copy import train 17 | 18 | def main(job_id, params): 19 | print params 20 | basedir = 'data_2' 21 | validerr = train(saveto=params['model'][0], 22 | reload_=params['reload'][0], 23 | dim_word=params['dim_word'][0], 24 | dim=params['dim'][0], 25 | n_words=params['n-words'][0], 26 | n_words_src=params['n-words'][0], 27 | decay_c=params['decay-c'][0], 28 | clip_c=params['clip-c'][0], 29 | lrate=params['learning-rate'][0], 30 | optimizer=params['optimizer'][0], 31 | maxlen=100, 32 | batch_size=32, 33 | valid_batch_size=32, 34 | datasets=['%s/p.txt'%basedir, 35 | '%s/r.txt'%basedir], 36 | valid_datasets=['%s/p.txt'%basedir, 37 | '%s/r.txt'%basedir,], 38 | # dictionaries=['%s/p.txt.pkl'%basedir, 39 | # '%s/r.txt.pkl'%basedir], 40 | dictionaries=['%s/word_dict.pkl'%basedir,'%s/dict2.txt'%basedir], 41 | validFreq=50000, 42 | dispFreq=1, 43 | saveFreq=100, 44 | sampleFreq=1, 45 | use_dropout=params['use-dropout'][0], 46 | overwrite=False) 47 | return validerr 48 | 49 | if __name__ == '__main__': 50 | # f = cPickle.load(open(r'data//p.txt.pkl')) 51 | # print f 52 | 53 | """ 54 | datasets: 55 | 56 | dictionaries: 57 | OrderedDict([('eos', 0), ('UNK', 1), ('b', 2), ('c', 3), ('a', 4)]) 58 | OrderedDict([('eos', 0), ('UNK', 1), ('B', 2), ('C', 3), ('A', 4)]) 59 | 60 | """ 61 | basedir = 'data_2' 62 | main(0, { 63 | 'model': ['%s/model/m.npz'%basedir], 64 | 'dim_word': [512],#word embedding dim 65 | 'dim': [512], #hidden dim 66 | 'n-words': [50000], #vocabulary size 67 | 'optimizer': ['rmsprop'], 68 | 'decay-c': [0.], 69 | 'clip-c': [1.], 70 | 'use-dropout': [False], 71 | 'learning-rate': [0.01], 72 | 'reload': [False]}) 73 | 74 | 75 | -------------------------------------------------------------------------------- /Att_Seq2Seq/data/pp.txt: -------------------------------------------------------------------------------- 1 | let us play dota2 2 | i like playing basketball 3 | go go go pokemon go -------------------------------------------------------------------------------- /Att_Seq2Seq/data/pp.txt.pkl: -------------------------------------------------------------------------------- 1 | ccollections 2 | OrderedDict 3 | p1 4 | ((lp2 5 | (lp3 6 | S'eos' 7 | p4 8 | aI0 9 | aa(lp5 10 | S'UNK' 11 | p6 12 | aI1 13 | aa(lp7 14 | S'go' 15 | p8 16 | aI2 17 | aa(lp9 18 | S'pokemon' 19 | p10 20 | aI3 21 | aa(lp11 22 | S'basketball' 23 | p12 24 | aI4 25 | aa(lp13 26 | S'playing' 27 | p14 28 | aI5 29 | aa(lp15 30 | S'like' 31 | p16 32 | aI6 33 | aa(lp17 34 | S'i' 35 | aI7 36 | aa(lp18 37 | S'dota2' 38 | p19 39 | aI8 40 | aa(lp20 41 | S'play' 42 | p21 43 | aI9 44 | aa(lp22 45 | S'us' 46 | p23 47 | aI10 48 | aa(lp24 
49 | S'let' 50 | p25 51 | aI11 52 | aatRp26 53 | . -------------------------------------------------------------------------------- /Att_Seq2Seq/data/ppv.txt: -------------------------------------------------------------------------------- 1 | dota2 -------------------------------------------------------------------------------- /Att_Seq2Seq/data/rr.txt: -------------------------------------------------------------------------------- 1 | play dota2 2 | basketball 3 | pokemon go -------------------------------------------------------------------------------- /Att_Seq2Seq/data/rr.txt.pkl: -------------------------------------------------------------------------------- 1 | ccollections 2 | OrderedDict 3 | p1 4 | ((lp2 5 | (lp3 6 | S'eos' 7 | p4 8 | aI0 9 | aa(lp5 10 | S'UNK' 11 | p6 12 | aI1 13 | aa(lp7 14 | S'go' 15 | p8 16 | aI2 17 | aa(lp9 18 | S'pokemon' 19 | p10 20 | aI3 21 | aa(lp11 22 | S'basketball' 23 | p12 24 | aI4 25 | aa(lp13 26 | S'dota2' 27 | p14 28 | aI5 29 | aa(lp15 30 | S'play' 31 | p16 32 | aI6 33 | aatRp17 34 | . -------------------------------------------------------------------------------- /Att_Seq2Seq/data/rrv.txt: -------------------------------------------------------------------------------- 1 | dota2 -------------------------------------------------------------------------------- /Att_Seq2Seq/data_iterator.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | ################################################################## 3 | # 4 | # Author: Chuwei Luo 5 | # Email: luochuwei@gmail.com 6 | # Date: 26/07/2016 7 | # Usage: data iterator 8 | # 9 | ################################################################## 10 | import cPickle as pkl 11 | import gzip 12 | 13 | 14 | def fopen(filename, mode='r'): 15 | if filename.endswith('.gz'): 16 | return gzip.open(filename, mode) 17 | return open(filename, mode) 18 | 19 | 20 | class TextIterator: 21 | """Simple Bitext iterator.""" 22 | def __init__(self, source, target, 23 | source_dict, target_dict, 24 | batch_size=128, 25 | maxlen=100, 26 | n_words_source=-1, 27 | n_words_target=-1): 28 | self.source = fopen(source, 'r') 29 | self.target = fopen(target, 'r') 30 | with open(source_dict, 'rb') as f: 31 | self.source_dict = pkl.load(f) 32 | with open(target_dict, 'rb') as f: 33 | self.target_dict = pkl.load(f) 34 | 35 | self.batch_size = batch_size 36 | self.maxlen = maxlen 37 | 38 | self.n_words_source = n_words_source 39 | self.n_words_target = n_words_target 40 | 41 | self.end_of_data = False 42 | 43 | def __iter__(self): 44 | return self 45 | 46 | def reset(self): 47 | self.source.seek(0) 48 | self.target.seek(0) 49 | 50 | def next(self): 51 | if self.end_of_data: 52 | self.end_of_data = False 53 | self.reset() 54 | raise StopIteration 55 | 56 | source = [] 57 | target = [] 58 | 59 | try: 60 | 61 | # actual work here 62 | while True: 63 | 64 | # read from source file and map to word index 65 | ss = self.source.readline() 66 | if ss == "": 67 | raise IOError 68 | ss = ss.strip().split() 69 | ss = [self.source_dict[w] if w in self.source_dict else 1 70 | for w in ss] 71 | if self.n_words_source > 0: 72 | ss = [w if w < self.n_words_source else 1 for w in ss] 73 | 74 | # read from source file and map to word index 75 | tt = self.target.readline() 76 | if tt == "": 77 | raise IOError 78 | tt = tt.strip().split() 79 | tt = [self.target_dict[w] if w in self.target_dict else 1 80 | for w in tt] 81 | if self.n_words_target > 0: 82 | tt = [w if w < self.n_words_target 
else 1 for w in tt] 83 | 84 | if len(ss) > self.maxlen and len(tt) > self.maxlen: 85 | continue 86 | 87 | source.append(ss) 88 | target.append(tt) 89 | 90 | if len(source) >= self.batch_size or \ 91 | len(target) >= self.batch_size: 92 | break 93 | except IOError: 94 | self.end_of_data = True 95 | 96 | if len(source) <= 0 or len(target) <= 0: 97 | self.end_of_data = False 98 | self.reset() 99 | raise StopIteration 100 | 101 | return source, target 102 | -------------------------------------------------------------------------------- /Att_Seq2Seq/train.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | ################################################################## 3 | # 4 | # Author: Chuwei Luo 5 | # Email: luochuwei@gmail.com 6 | # Date: 26/07/2016 7 | # Usage: Training 8 | # 9 | ################################################################## 10 | 11 | import numpy 12 | import os 13 | import cPickle 14 | 15 | from Seq2SeqAtt import train 16 | 17 | def main(job_id, params): 18 | print params 19 | basedir = 'data' 20 | validerr = train(saveto=params['model'][0], reload_=params['reload'][0], dim_word=params['dim_word'][0], dim=params['dim'][0], n_words=params['n-words'][0], n_words_src=params['n-words'][0], decay_c=params['decay-c'][0], clip_c=params['clip-c'][0], lrate=params['learning-rate'][0], optimizer=params['optimizer'][0], maxlen=15, batch_size=1, valid_batch_size=1, datasets=['%s/ppp.txt'%basedir, '%s/ppp.txt'%basedir], valid_datasets=['%s/pv.txt'%basedir, 21 | '%s/pv.txt'%basedir], dictionaries=['%s/p.txt.pkl'%basedir], validFreq=500000, dispFreq=1, saveFreq=100, sampleFreq=1, use_dropout=params['use-dropout'][0], overwrite=False) 22 | return validerr 23 | 24 | if __name__ == '__main__': 25 | # f = cPickle.load(open(r'data//p.txt.pkl')) 26 | # print f 27 | """ 28 | datasets: 29 | 30 | dictionaries: 31 | OrderedDict([('eos', 0), ('UNK', 1), ('b', 2), ('c', 3), ('a', 4)]) 32 | OrderedDict([('eos', 0), ('UNK', 1), ('B', 2), ('C', 3), ('A', 4)]) 33 | 34 | """ 35 | basedir = 'data' 36 | main(0, { 37 | 'model': ['%s/model/m.model'%basedir], 38 | 'dim_word': [15], 39 | 'dim': [24], 40 | 'n-words': [6], 41 | 'optimizer': ['rmsprop'], 42 | 'decay-c': [0.], 43 | 'clip-c': [1.], 44 | 'use-dropout': [False], 45 | 'learning-rate': [0.01], 46 | 'reload': [False]}) 47 | 48 | 49 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Attention_CopyNet 2 | 3 | Attention_CopyNet for summarization and response generation 4 | 5 | Att_Seq2Seq(Finished) 6 | 7 | Att_CopyNet(Finished) 8 | 9 | Att_POS_CopyNet(Finished) 10 | --------------------------------------------------------------------------------
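The two CopyNet variants (Att_CopyNet and Att_POS_CopyNet) share the same copy-style output layer: the generator's logits are scaled by `(1 - att_lambda)` and the attention weights over the source positions are added onto the logits of the corresponding source words, weighted by the per-word `att_lambda` gate (see `_step_for_copy` in the CopyNet code). The snippet below is a minimal NumPy sketch of that mixing for a single decoding step; it is illustrative only, not part of the repository, and the names `logits`, `alphas`, `src_ids` and `att_lambda` are chosen here for clarity.

```python
import numpy as np

def copy_mix(logits, alphas, src_ids, att_lambda):
    """Blend generation logits with copy scores for one decoding step.

    logits     : (vocab,)   generator scores over the output vocabulary
    alphas     : (src_len,) attention weights over the source positions
    src_ids    : (src_len,) vocabulary ids of the source words
    att_lambda : (vocab,)   per-word copy gate (cf. tparams['att_lambda'])
    """
    mixed = (1.0 - att_lambda) * logits      # scale down generation scores
    for pos, wid in enumerate(src_ids):      # add copy scores onto source words
        mixed[wid] += att_lambda[wid] * alphas[pos]
    e = np.exp(mixed - mixed.max())          # softmax over the blended scores
    return e / e.sum()

# toy example: vocabulary of 6 words, a 3-word source sentence
probs = copy_mix(np.zeros(6),
                 np.array([0.7, 0.2, 0.1]),
                 np.array([2, 3, 5]),
                 np.full(6, 0.5))
print(probs.sum())   # 1.0
```

In the POS-aware variant the POS logits are additionally projected through `pos_to_word` and added to the blended word logits before the softmax.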