├── BLEUbyLength.py ├── README.md ├── baseline.py ├── beam_decode.sh ├── chunk_nmt_train.sh ├── clean.sh ├── codetest.py ├── computeCost.py ├── cpu_train_chunk_nmt.sh ├── data_iterator.py ├── haos.bib ├── lookup.py ├── multi-len-bleu.sh ├── nmt.py ├── output_align.py ├── rmmodel.sh ├── scp.sh ├── scp240.sh ├── scp5.sh ├── test.398000.sh ├── test.align.sh ├── test.batch.sh ├── test.scratch.sh ├── test.sh ├── test_zh2en.pc.sh ├── test_zh2en.sh ├── train.sh ├── train_all.sh ├── train_nmt.py ├── train_nmt_all.py ├── train_nmt_zh2en.py ├── train_nmt_zh2en_pc.py ├── training_data_iterator.py ├── translate.py ├── translate_gpu.py ├── valid.sh └── validate.py /BLEUbyLength.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Translates a source file using a translation model. 3 | ''' 4 | import argparse 5 | 6 | import subprocess 7 | import re 8 | import os 9 | 10 | 11 | def getBLEU(): 12 | 13 | return 0 14 | 15 | 16 | 17 | def main(bleu_scrip, 18 | valid_datasets=['../data/dev/newstest2011.en.tok', 19 | '../data/dev/newstest2011.fr.tok', 20 | '../data/dev/newstest2011.fr.tok'], 21 | length=10): 22 | 23 | len_chose = [10, 20, 30, 40, 50] 24 | 25 | 26 | source_file = open(valid_datasets[0], 'r') 27 | 28 | target_file = open(valid_datasets[1], 'r') 29 | 30 | reference0_file = open(valid_datasets[2]+'0', 'r') 31 | reference1_file = open(valid_datasets[2]+'1', 'r') 32 | reference2_file = open(valid_datasets[2]+'2', 'r') 33 | reference3_file = open(valid_datasets[2]+'3', 'r') 34 | 35 | source_sents = source_file.readlines() 36 | target_sents = target_file.readlines() 37 | reference0_sents = reference0_file.readlines() 38 | reference1_sents = reference1_file.readlines() 39 | reference2_sents = reference2_file.readlines() 40 | reference3_sents = reference3_file.readlines() 41 | 42 | idx_set=[ [], [], [], [], [], [] ] 43 | 44 | target_set = [ [] ] * len(idx_set) 45 | r0_set = [ [] ] * len(idx_set) 46 | r1_set = [ [] ] * len(idx_set) 47 | r2_set = [ [] ] * len(idx_set) 48 | r3_set = [ [] ] * len(idx_set) 49 | 50 | 51 | 52 | for idx, sent in enumerate(source_sents): 53 | tokens = sent.strip().split() 54 | l = len(tokens) 55 | # print l 56 | 57 | if l <= len_chose[0]: 58 | idx_set[0].append(idx) 59 | # print '< 10', l, idx 60 | elif l <= len_chose[1]: 61 | idx_set[1].append(idx) 62 | elif l <= len_chose[2]: 63 | idx_set[2].append(idx) 64 | elif l <= len_chose[3]: 65 | idx_set[3].append(idx) 66 | elif l <= len_chose[4]: 67 | idx_set[4].append(idx) 68 | else: 69 | idx_set[5].append(idx) 70 | 71 | 72 | 73 | 74 | # get the filter of the sentences 75 | for i in range(6): 76 | 77 | # print idx_set[i] 78 | target_set[i] = [target_sents[k].strip() for k in idx_set[i]] 79 | r0_set[i] = [reference0_sents[k].strip() for k in idx_set[i]] 80 | r1_set[i] = [reference1_sents[k].strip() for k in idx_set[i]] 81 | r2_set[i] = [reference2_sents[k].strip() for k in idx_set[i]] 82 | r3_set[i] = [reference3_sents[k].strip() for k in idx_set[i]] 83 | 84 | with open('./translate.tmp', 'w') as f: 85 | print >> f , '\n'.join(target_set[i]) 86 | 87 | with open('./tmp.reference0', 'w') as f: 88 | print >> f, '\n'.join(r0_set[i]) 89 | 90 | with open('./tmp.reference1', 'w') as f: 91 | print >> f, '\n'.join(r1_set[i]) 92 | 93 | with open('./tmp.reference2', 'w') as f: 94 | print >> f, '\n'.join(r2_set[i]) 95 | 96 | with open('./tmp.reference3', 'w') as f: 97 | print >> f, '\n'.join(r3_set[i]) 98 | 99 | 100 | cmd_bleu_cmd = ['perl', bleu_scrip, \ 101 | './tmp.reference', \ 102 | '<', \ 103 | 
'./translate.tmp', \ 104 | '>', \ 105 | './output.eva'] 106 | 107 | subprocess.check_call(" ".join(cmd_bleu_cmd), shell=True) 108 | 109 | fin = open('./output.eva', 'rU') 110 | out = re.search('BLEU = [-.0-9]+', fin.readlines()[0]) 111 | fin.close() 112 | 113 | bleu_score = float(out.group()[7:]) 114 | 115 | if i < len(len_chose): 116 | print '### Len <= ', len_chose[i] 117 | else: 118 | print '### Len > ', len_chose[i-1] 119 | print '### BLEU:', bleu_score, 'total: ', len(target_set[i]) 120 | 121 | 122 | 123 | if __name__ == "__main__": 124 | parser = argparse.ArgumentParser() 125 | parser.add_argument('-length', type=int, default=10) 126 | parser.add_argument('bleu_scrip', type=str) 127 | parser.add_argument('valid_source', type=str) 128 | parser.add_argument('valid_target', type=str) 129 | parser.add_argument('valid_reference', type=str) 130 | 131 | 132 | args = parser.parse_args() 133 | 134 | main(args.bleu_scrip, 135 | valid_datasets=[args.valid_source, args.valid_target, args.valid_reference], 136 | length=args.length) 137 | 138 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Chunk-Based Bi-Scaled Decoder for Neural Machine Translation 2 | 3 | ------ 4 | 5 | This is the code for the paper "Chunk-Based Bi-Scaled Decoder for Neural Machine Translation". 6 | 7 | The chunk-based neural machine translation system is built on session 2 of [dl4mt-tutorial](https://github.com/nyu-dl/dl4mt-tutorial), which is an attention-based encoder-decoder machine translation model. 8 | 9 | The main difference between our proposed model and dl4mt is that we use a bi-scaled decoder to leverage target-side phrase information for better translation, and we propose a phrase attention mechanism for phrase-level soft alignments. 10 | 11 | ## Required Software 12 | * Python 2.7 13 | * [Theano](http://deeplearning.net/software/theano/) 14 | 15 | ## Training 16 | 17 | export THEANO_FLAGS=device=gpu2,floatX=float32 18 | python ./train_nmt_zh2en.py 19 | 20 | ## Evaluation 21 | 22 | export THEANO_FLAGS=device=gpu2,floatX=float32 23 | datadir=/home/zhouh/Data/nmt 24 | modeldir=./ 25 | 26 | python ./translate_gpu.py -n -jointProb \ 27 | $modeldir/model_hal.iter.npz \ 28 | $modeldir/model_hal.npz.pkl \ 29 | $datadir/hms.ch.filter.pkl \ 30 | $datadir/hms.en.filter.chunked.pkl \ 31 | $datadir/devntest/MT0${i}/MT0${i}.src \ 32 | ./test.result.chunk.${i} 33 | 34 | 35 | 36 | ------ 37 | 38 | 39 | [1]: Hao Zhou, Zhaopeng Tu, Shujian Huang, Xiaohua Liu, Hang Li and Jiajun Chen. Chunk-based Bi-Scale Decoder for Neural Machine Translation. In Proceedings of ACL 2017, short paper.
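A note on length-bucketed scoring (added commentary; `BLEUbyLength.py` above implements it): the decoded output and its references are first split into source-length buckets, and each bucket is scored separately with the BLEU script. A minimal sketch of the bucketing step, assuming one tokenized sentence per line (the function name is illustrative, not part of this repository):

    def bucket_by_source_length(source_lines, bounds=(10, 20, 30, 40, 50)):
        # one bucket of sentence indices per upper bound, plus an overflow bucket
        buckets = [[] for _ in range(len(bounds) + 1)]
        for idx, line in enumerate(source_lines):
            length = len(line.strip().split())
            for b, bound in enumerate(bounds):
                if length <= bound:
                    buckets[b].append(idx)
                    break
            else:
                buckets[-1].append(idx)
        return buckets

Each index bucket then selects the matching hypothesis and reference lines, which are written to temporary files and passed through the same `perl` BLEU-script call that `BLEUbyLength.py` builds above.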
-------------------------------------------------------------------------------- /baseline.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Build a neural machine translation model with soft attention 3 | ''' 4 | import theano 5 | import theano.tensor as tensor 6 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 7 | 8 | import cPickle as pkl 9 | import ipdb 10 | import numpy 11 | import copy 12 | 13 | import os 14 | import warnings 15 | import sys 16 | import time 17 | 18 | from collections import OrderedDict 19 | 20 | from training_data_iterator import TrainingTextIterator 21 | from data_iterator import TextIterator 22 | 23 | 24 | profile = False 25 | 26 | 27 | # push parameters to Theano shared variables 28 | def zipp(params, tparams): 29 | for kk, vv in params.iteritems(): 30 | tparams[kk].set_value(vv) 31 | 32 | 33 | # pull parameters from Theano shared variables 34 | def unzip(zipped): 35 | new_params = OrderedDict() 36 | for kk, vv in zipped.iteritems(): 37 | new_params[kk] = vv.get_value() 38 | return new_params 39 | 40 | 41 | # get the list of parameters: Note that tparams must be OrderedDict 42 | def itemlist(tparams): 43 | return [vv for kk, vv in tparams.iteritems()] 44 | 45 | 46 | # dropout 47 | def dropout_layer(state_before, use_noise, trng): 48 | proj = tensor.switch( 49 | use_noise, 50 | state_before * trng.binomial(state_before.shape, p=0.5, n=1, 51 | dtype=state_before.dtype), 52 | state_before * 0.5) 53 | return proj 54 | 55 | 56 | # make prefix-appended name 57 | def _p(pp, name): 58 | return '%s_%s' % (pp, name) 59 | 60 | 61 | # initialize Theano shared variables according to the initial parameters 62 | def init_tparams(params): 63 | tparams = OrderedDict() 64 | for kk, pp in params.iteritems(): 65 | tparams[kk] = theano.shared(params[kk], name=kk) 66 | return tparams 67 | 68 | 69 | # load parameters 70 | def load_params(path, params): 71 | pp = numpy.load(path) 72 | for kk, vv in params.iteritems(): 73 | if kk not in pp: 74 | warnings.warn('%s is not in the archive' % kk) 75 | continue 76 | params[kk] = pp[kk] 77 | 78 | return params 79 | 80 | # layers: 'name': ('parameter initializer', 'feedforward') 81 | layers = {'ff': ('param_init_fflayer', 'fflayer'), 82 | 'gru': ('param_init_gru', 'gru_layer'), 83 | 'gru_cond': ('param_init_gru_cond', 'gru_cond_layer'), 84 | } 85 | 86 | 87 | def get_layer(name): 88 | fns = layers[name] 89 | return (eval(fns[0]), eval(fns[1])) 90 | 91 | 92 | # some utilities 93 | def ortho_weight(ndim): 94 | W = numpy.random.randn(ndim, ndim) 95 | u, s, v = numpy.linalg.svd(W) 96 | return u.astype('float32') 97 | 98 | 99 | def norm_weight(nin, nout=None, scale=0.01, ortho=True): 100 | if nout is None: 101 | nout = nin 102 | if nout == nin and ortho: 103 | W = ortho_weight(nin) 104 | else: 105 | W = scale * numpy.random.randn(nin, nout) 106 | return W.astype('float32') 107 | 108 | 109 | def get_tensor_weight(n, nin, nout, scale=0.01): 110 | 111 | W = scale * numpy.random.randn(n, nin, nout) 112 | return W.astype('float32') 113 | 114 | 115 | def tanh(x): 116 | return tensor.tanh(x) 117 | 118 | 119 | def linear(x): 120 | return x 121 | 122 | 123 | def concatenate(tensor_list, axis=0): 124 | """ 125 | Alternative implementation of `theano.tensor.concatenate`. 126 | This function does exactly the same thing, but contrary to Theano's own 127 | implementation, the gradient is implemented on the GPU. 
128 | Backpropagating through `theano.tensor.concatenate` yields slowdowns 129 | because the inverse operation (splitting) needs to be done on the CPU. 130 | This implementation does not have that problem. 131 | :usage: 132 | >>> x, y = theano.tensor.matrices('x', 'y') 133 | >>> c = concatenate([x, y], axis=1) 134 | :parameters: 135 | - tensor_list : list 136 | list of Theano tensor expressions that should be concatenated. 137 | - axis : int 138 | the tensors will be joined along this axis. 139 | :returns: 140 | - out : tensor 141 | the concatenated tensor expression. 142 | """ 143 | concat_size = sum(tt.shape[axis] for tt in tensor_list) 144 | 145 | output_shape = () 146 | for k in range(axis): 147 | output_shape += (tensor_list[0].shape[k],) 148 | output_shape += (concat_size,) 149 | for k in range(axis + 1, tensor_list[0].ndim): 150 | output_shape += (tensor_list[0].shape[k],) 151 | 152 | out = tensor.zeros(output_shape) 153 | offset = 0 154 | for tt in tensor_list: 155 | indices = () 156 | for k in range(axis): 157 | indices += (slice(None),) 158 | indices += (slice(offset, offset + tt.shape[axis]),) 159 | for k in range(axis + 1, tensor_list[0].ndim): 160 | indices += (slice(None),) 161 | 162 | out = tensor.set_subtensor(out[indices], tt) 163 | offset += tt.shape[axis] 164 | 165 | return out 166 | 167 | 168 | 169 | # batch preparation 170 | def prepare_training_data(seqs_x, seqs_y_c, seqs_y_cw, maxlen_chunk=None, maxlen_cw=None, n_words_src=30000, 171 | n_words=30000): 172 | # x: a list of sentences 173 | lengths_x = [len(s) for s in seqs_x] 174 | lengths_y = [ len(s) for s in seqs_y_cw] 175 | 176 | n_samples = len(seqs_x) 177 | maxlen_x = numpy.max(lengths_x) + 1 178 | maxlen_y = numpy.max(lengths_y) + 1 179 | 180 | x = numpy.zeros((maxlen_x, n_samples)).astype('int64') 181 | y_c = numpy.zeros((maxlen_y, n_samples)).astype('int64') 182 | y_cw = numpy.zeros((maxlen_y, n_samples)).astype('int64') 183 | x_mask = numpy.zeros((maxlen_x, n_samples)).astype('float32') 184 | y_mask = numpy.zeros((maxlen_y, n_samples)).astype('float32') 185 | chunk_indicator = numpy.zeros((maxlen_y, n_samples)).astype('float32') 186 | 187 | for idx, [s_x, s_y_c, s_y_cw] in enumerate(zip(seqs_x, seqs_y_c, seqs_y_cw)): 188 | x[:lengths_x[idx], idx] = s_x 189 | x_mask[:lengths_x[idx]+1, idx] = 1. 190 | # print 'yc', y_c 191 | # print 'shape yc', y_c.shape 192 | # print 'idx', idx 193 | # print 'max', maxlen_y[idx] 194 | # print 'syc', s_y_c 195 | # print 'shape syc', s_y_c.shape 196 | y_c[:lengths_y[idx], idx] = s_y_c 197 | y_cw[:lengths_y[idx], idx] = s_y_cw 198 | y_mask[:lengths_y[idx]+1, idx] = 1. 
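# Added commentary (not part of the original file): for a toy batch whose two
# target chunk-tag sequences are [2, 1, 3] and [2, 1] (with word sequences of
# the same lengths), maxlen_y is 3 + 1 = 4, so column 0 of y_c is padded to
# [2, 1, 3, 0] and column 1 to [2, 1, 0, 0], while y_mask marks the real
# positions plus one trailing slot. The list comprehension below then sets
# chunk_indicator to 1 exactly where a new chunk starts (column 0 becomes
# [1, 0, 1, 0]), assuming, as the comprehension implies, that tag 1 marks a
# word that continues the current chunk.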
199 | 200 | 201 | indicator_mask = [1 if cc != 1 else 0 for cc in s_y_c] 202 | 203 | # indicator here is a chunk begin or not (1 if True ) 204 | chunk_indicator[:lengths_y[idx], idx] = indicator_mask 205 | 206 | 207 | 208 | # print y_cw 209 | 210 | return x, x_mask, y_c, y_cw, chunk_indicator, y_mask 211 | 212 | # batch preparation 213 | def prepare_data(seqs_x, seqs_y, maxlen=None, n_words_src=30000, 214 | n_words=30000): 215 | # x: a list of sentences 216 | lengths_x = [len(s) for s in seqs_x] 217 | lengths_y = [len(s) for s in seqs_y] 218 | 219 | if maxlen is not None: 220 | new_seqs_x = [] 221 | new_seqs_y = [] 222 | new_lengths_x = [] 223 | new_lengths_y = [] 224 | for l_x, s_x, l_y, s_y in zip(lengths_x, seqs_x, lengths_y, seqs_y): 225 | if l_x < maxlen and l_y < maxlen: 226 | new_seqs_x.append(s_x) 227 | new_lengths_x.append(l_x) 228 | new_seqs_y.append(s_y) 229 | new_lengths_y.append(l_y) 230 | lengths_x = new_lengths_x 231 | seqs_x = new_seqs_x 232 | lengths_y = new_lengths_y 233 | seqs_y = new_seqs_y 234 | 235 | if len(lengths_x) < 1 or len(lengths_y) < 1: 236 | return None, None, None, None 237 | 238 | n_samples = len(seqs_x) 239 | maxlen_x = numpy.max(lengths_x) + 1 240 | maxlen_y = numpy.max(lengths_y) + 1 241 | 242 | x = numpy.zeros((maxlen_x, n_samples)).astype('int64') 243 | y = numpy.zeros((maxlen_y, n_samples)).astype('int64') 244 | x_mask = numpy.zeros((maxlen_x, n_samples)).astype('float32') 245 | y_mask = numpy.zeros((maxlen_y, n_samples)).astype('float32') 246 | for idx, [s_x, s_y] in enumerate(zip(seqs_x, seqs_y)): 247 | x[:lengths_x[idx], idx] = s_x 248 | x_mask[:lengths_x[idx]+1, idx] = 1. 249 | y[:lengths_y[idx], idx] = s_y 250 | y_mask[:lengths_y[idx]+1, idx] = 1. 251 | 252 | return x, x_mask, y, y_mask 253 | 254 | 255 | # feedforward layer: affine transformation + point-wise nonlinearity 256 | def param_init_fflayer(options, params, prefix='ff', nin=None, nout=None, 257 | ortho=True): 258 | if nin is None: 259 | nin = options['dim_proj'] 260 | if nout is None: 261 | nout = options['dim_proj'] 262 | params[_p(prefix, 'W')] = norm_weight(nin, nout, scale=0.01, ortho=ortho) 263 | params[_p(prefix, 'b')] = numpy.zeros((nout,)).astype('float32') 264 | 265 | return params 266 | 267 | 268 | def fflayer(tparams, state_below, options, prefix='rconv', 269 | activ='lambda x: tensor.tanh(x)', **kwargs): 270 | return eval(activ)( 271 | tensor.dot(state_below, tparams[_p(prefix, 'W')]) + 272 | tparams[_p(prefix, 'b')]) 273 | 274 | 275 | # GRU layer 276 | def param_init_gru(options, params, prefix='gru', nin=None, dim=None): 277 | if nin is None: 278 | nin = options['dim_proj'] 279 | if dim is None: 280 | dim = options['dim_proj'] 281 | 282 | # embedding to gates transformation weights, biases 283 | W = numpy.concatenate([norm_weight(nin, dim), 284 | norm_weight(nin, dim)], axis=1) 285 | params[_p(prefix, 'W')] = W 286 | params[_p(prefix, 'b')] = numpy.zeros((2 * dim,)).astype('float32') 287 | 288 | # recurrent transformation weights for gates 289 | U = numpy.concatenate([ortho_weight(dim), 290 | ortho_weight(dim)], axis=1) 291 | params[_p(prefix, 'U')] = U 292 | 293 | # embedding to hidden state proposal weights, biases 294 | Wx = norm_weight(nin, dim) 295 | params[_p(prefix, 'Wx')] = Wx 296 | params[_p(prefix, 'bx')] = numpy.zeros((dim,)).astype('float32') 297 | 298 | # recurrent transformation weights for hidden state proposal 299 | Ux = ortho_weight(dim) 300 | params[_p(prefix, 'Ux')] = Ux 301 | 302 | return params 303 | 304 | 305 | def gru_layer(tparams, state_below, 
options, prefix='gru', mask=None, 306 | **kwargs): 307 | nsteps = state_below.shape[0] 308 | if state_below.ndim == 3: 309 | n_samples = state_below.shape[1] 310 | else: 311 | n_samples = 1 312 | 313 | dim = tparams[_p(prefix, 'Ux')].shape[1] 314 | 315 | if mask is None: 316 | mask = tensor.alloc(1., state_below.shape[0], 1) 317 | 318 | # utility function to slice a tensor 319 | def _slice(_x, n, dim): 320 | if _x.ndim == 3: 321 | return _x[:, :, n*dim:(n+1)*dim] 322 | return _x[:, n*dim:(n+1)*dim] 323 | 324 | # state_below is the input word embeddings 325 | # input to the gates, concatenated 326 | state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + \ 327 | tparams[_p(prefix, 'b')] 328 | # input to compute the hidden state proposal 329 | state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + \ 330 | tparams[_p(prefix, 'bx')] 331 | 332 | # step function to be used by scan 333 | # arguments | sequences |outputs-info| non-seqs 334 | def _step_slice(m_, x_, xx_, h_, U, Ux): 335 | preact = tensor.dot(h_, U) 336 | preact += x_ 337 | 338 | # reset and update gates 339 | r = tensor.nnet.sigmoid(_slice(preact, 0, dim)) 340 | u = tensor.nnet.sigmoid(_slice(preact, 1, dim)) 341 | 342 | # compute the hidden state proposal 343 | preactx = tensor.dot(h_, Ux) 344 | preactx = preactx * r 345 | preactx = preactx + xx_ 346 | 347 | # hidden state proposal 348 | h = tensor.tanh(preactx) 349 | 350 | # leaky integrate and obtain next hidden state 351 | h = u * h_ + (1. - u) * h 352 | h = m_[:, None] * h + (1. - m_)[:, None] * h_ 353 | 354 | return h 355 | 356 | # prepare scan arguments 357 | seqs = [mask, state_below_, state_belowx] 358 | init_states = [tensor.alloc(0., n_samples, dim)] 359 | _step = _step_slice 360 | shared_vars = [tparams[_p(prefix, 'U')], 361 | tparams[_p(prefix, 'Ux')]] 362 | 363 | rval, updates = theano.scan(_step, 364 | sequences=seqs, 365 | outputs_info=init_states, 366 | non_sequences=shared_vars, 367 | name=_p(prefix, '_layers'), 368 | n_steps=nsteps, 369 | profile=profile, 370 | strict=True) 371 | rval = [rval] 372 | return rval 373 | 374 | 375 | # Conditional GRU layer with Attention 376 | def param_init_gru_cond(options, params, prefix='gru_cond', 377 | nin=None, dim=None, dimctx=None, 378 | nin_nonlin=None, dim_nonlin=None, nin_chunk=None, dim_chunk_hidden=None, nin_nonlin_chunk=None): 379 | 380 | 381 | if nin is None: 382 | nin = options['dim'] 383 | if dim is None: 384 | dim = options['dim'] 385 | if dimctx is None: 386 | dimctx = options['dim'] 387 | if nin_nonlin is None: 388 | nin_nonlin = nin 389 | if dim_nonlin is None: 390 | dim_nonlin = dim 391 | if nin_chunk is None: 392 | nin_chunk = nin 393 | if nin_nonlin_chunk is None: 394 | nin_nonlin_chunk = nin_chunk 395 | if dim_chunk_hidden is None: 396 | dim_chunk_hidden = dim 397 | 398 | 399 | chunk_label_num = options['n_chunks'] 400 | 401 | W = numpy.concatenate([norm_weight(nin, dim), 402 | norm_weight(nin, dim)], axis=1) 403 | params[_p(prefix, 'W')] = W 404 | params[_p(prefix, 'b')] = numpy.zeros((2 * dim,)).astype('float32') 405 | U = numpy.concatenate([ortho_weight(dim_nonlin), 406 | ortho_weight(dim_nonlin)], axis=1) 407 | params[_p(prefix, 'U')] = U 408 | 409 | Wx = norm_weight(nin_nonlin, dim_nonlin) 410 | params[_p(prefix, 'Wx')] = Wx 411 | 412 | W_use_current_chunk = norm_weight(dim_chunk_hidden, dim) # TODO the dimention here need to be careful 413 | params[_p(prefix, 'W_use_current_chunk')] = W_use_current_chunk 414 | 415 | 416 | W_current_chunk_c = norm_weight(dim_chunk_hidden, dim * 
2) 417 | params[_p(prefix, 'W_current_chunk_c')] = W_current_chunk_c 418 | 419 | 420 | Ux = ortho_weight(dim_nonlin) 421 | params[_p(prefix, 'Ux')] = Ux 422 | params[_p(prefix, 'bx')] = numpy.zeros((dim_nonlin,)).astype('float32') 423 | 424 | U_nl = numpy.concatenate([ortho_weight(dim_nonlin), 425 | ortho_weight(dim_nonlin)], axis=1) 426 | params[_p(prefix, 'U_nl')] = U_nl 427 | params[_p(prefix, 'b_nl')] = numpy.zeros((2 * dim_nonlin,)).astype('float32') 428 | 429 | Ux_nl = ortho_weight(dim_nonlin) 430 | params[_p(prefix, 'Ux_nl')] = Ux_nl 431 | params[_p(prefix, 'bx_nl')] = numpy.zeros((dim_nonlin,)).astype('float32') 432 | 433 | # context to LSTM 434 | Wc = norm_weight(dimctx, dim*2) 435 | params[_p(prefix, 'Wc')] = Wc 436 | 437 | Wcx = norm_weight(dimctx, dim) 438 | params[_p(prefix, 'Wcx')] = Wcx 439 | 440 | # attention: combined -> hidden 441 | W_comb_att = norm_weight(dim, dimctx) 442 | params[_p(prefix, 'W_comb_att')] = W_comb_att 443 | 444 | # attention: context -> hidden 445 | Wc_att = norm_weight(dimctx) 446 | params[_p(prefix, 'Wc_att')] = Wc_att 447 | 448 | 449 | # attention: combined -> hidden 450 | W_cu_chunk_att = norm_weight(dim_chunk_hidden, dimctx) 451 | params[_p(prefix, 'W_cu_chunk_att')] = W_cu_chunk_att 452 | 453 | 454 | # attention: hidden bias 455 | b_att = numpy.zeros((dimctx,)).astype('float32') 456 | params[_p(prefix, 'b_att')] = b_att 457 | 458 | # attention: 459 | U_att = norm_weight(dimctx, 1) 460 | params[_p(prefix, 'U_att')] = U_att 461 | c_att = numpy.zeros((1,)).astype('float32') 462 | params[_p(prefix, 'c_tt')] = c_att 463 | 464 | 465 | # new the chunking parameters 466 | 467 | 468 | params[_p(prefix, 'chunk_transform_matrix')] = get_tensor_weight(chunk_label_num, dim_nonlin, nin_chunk) 469 | 470 | 471 | W_chunk = numpy.concatenate([norm_weight(nin_chunk, dim_chunk_hidden), 472 | norm_weight(nin_chunk, dim_chunk_hidden)], axis=1) # nin * 2 dim 473 | params[_p(prefix, 'W_chunk')] = W_chunk 474 | params[_p(prefix, 'b_chunk')] = numpy.zeros((2 * dim_chunk_hidden,)).astype('float32') 475 | 476 | U_chunk = numpy.concatenate([ortho_weight(dim_chunk_hidden), 477 | ortho_weight(dim_chunk_hidden)], axis=1) 478 | params[_p(prefix, 'U_chunk')] = U_chunk 479 | 480 | Wx_chunk = norm_weight(nin_nonlin_chunk, dim_chunk_hidden) 481 | params[_p(prefix, 'Wx_chunk')] = Wx_chunk 482 | Ux_chunk = ortho_weight(dim_chunk_hidden) 483 | params[_p(prefix, 'Ux_chunk')] = Ux_chunk 484 | params[_p(prefix, 'bx_chunk')] = numpy.zeros((dim_chunk_hidden,)).astype('float32') 485 | 486 | U_nl_chunk = numpy.concatenate([ortho_weight(dim_chunk_hidden), 487 | ortho_weight(dim_chunk_hidden)], axis=1) 488 | params[_p(prefix, 'U_nl_chunk')] = U_nl_chunk 489 | params[_p(prefix, 'b_nl_chunk')] = numpy.zeros((2 * dim_chunk_hidden,)).astype('float32') 490 | 491 | Ux_nl_chunk = ortho_weight(dim_chunk_hidden) 492 | params[_p(prefix, 'Ux_nl_chunk')] = Ux_nl_chunk 493 | params[_p(prefix, 'bx_nl_chunk')] = numpy.zeros((dim_chunk_hidden,)).astype('float32') 494 | 495 | # context to LSTM 496 | Wc_chunk = norm_weight(dimctx, dim_chunk_hidden*2) 497 | params[_p(prefix, 'Wc_chunk')] = Wc_chunk 498 | 499 | Wcx_chunk = norm_weight(dimctx, dim_chunk_hidden) 500 | params[_p(prefix, 'Wcx_chunk')] = Wcx_chunk 501 | 502 | # attention: combined -> hidden 503 | W_comb_att_chunk = norm_weight(dim_chunk_hidden, dimctx) 504 | params[_p(prefix, 'W_comb_att_chunk')] = W_comb_att_chunk 505 | 506 | # attention: context -> hidden 507 | Wc_att_chunk = norm_weight(dimctx) 508 | params[_p(prefix, 'Wc_att_chunk')] = Wc_att_chunk 
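# Added aside (not part of the original file): the parameters created in this
# function come in two parallel sets, the word-scale weights (W, U, Wx, Ux, the
# *_nl second-stage weights and the attention weights) and the *_chunk copies
# sized with dim_chunk_hidden, which is what makes the decoder bi-scale. Both
# sets drive the same GRU update that gru_layer above implements; a minimal
# NumPy sketch of one such step, with illustrative names, would be:
#
#     def gru_step(h_prev, x_gates, x_cand, U, Ux):
#         # x_gates = x.W + b and x_cand = x.Wx + bx, precomputed outside scan
#         dim = h_prev.shape[-1]
#         preact = 1.0 / (1.0 + numpy.exp(-(numpy.dot(h_prev, U) + x_gates)))
#         r, u = preact[:, :dim], preact[:, dim:]  # reset and update gates
#         h_tilde = numpy.tanh(r * numpy.dot(h_prev, Ux) + x_cand)  # proposal
#         return u * h_prev + (1.0 - u) * h_tilde  # leaky integration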
509 | 510 | # attention: hidden bias 511 | b_att_chunk = numpy.zeros((dimctx,)).astype('float32') 512 | params[_p(prefix, 'b_att_chunk')] = b_att_chunk 513 | 514 | # attention: 515 | U_att_chunk = norm_weight(dimctx, 1) 516 | params[_p(prefix, 'U_att_chunk')] = U_att_chunk 517 | c_att_chunk = numpy.zeros((1,)).astype('float32') 518 | params[_p(prefix, 'c_tt_chunk')] = c_att_chunk 519 | 520 | 521 | 522 | return params 523 | 524 | 525 | def gru_cond_layer(tparams, emb, chunk_index, options, prefix='gru', 526 | mask=None, chunk_boundary_indicator=None, context=None, 527 | one_step=False, one_step_chunk=False, one_step_word=False, 528 | init_state_chunk=None,init_state_chunk_words=None, 529 | current_chunk_hidden=None,last_chunk_end_word_hidden1=None, current_word_hidden1=None, 530 | context_mask=None, **kwargs): 531 | 532 | 533 | assert context, 'Context must be provided' 534 | 535 | 536 | # nsteps = chunk_index.shape[0] 537 | 538 | if chunk_index is not None: 539 | nsteps = chunk_index.shape[0] 540 | 541 | if one_step_chunk: 542 | assert init_state_chunk, 'previous state must be provided' 543 | assert init_state_chunk_words, 'previous state must be provided' 544 | 545 | # if this is a sample or decode process, we may use a sample = 1 predict 546 | if emb is not None: 547 | if emb.ndim == 3: 548 | n_samples = emb.shape[1] 549 | else: 550 | n_samples = emb.shape[0] 551 | else: 552 | n_samples = current_word_hidden1.shape[0] 553 | 554 | 555 | # the hidden dim 556 | dim = tparams[_p(prefix, 'Wcx')].shape[1] 557 | 558 | # chunk hidden dim 559 | chunk_hidden_dim = tparams[_p(prefix, 'Wcx_chunk')].shape[1] 560 | 561 | # if mask is None, it is the sample process 562 | if mask is None: 563 | mask = tensor.alloc(1., n_samples, 1) 564 | 565 | # initial/previous state 566 | if init_state_chunk is None: 567 | init_state_chunk = tensor.alloc(0., n_samples, chunk_hidden_dim) 568 | if init_state_chunk_words is None: 569 | init_state_chunk_words = tensor.alloc(0., n_samples, dim) 570 | 571 | # projected context 572 | assert context.ndim == 3, \ 573 | 'Context must be 3-d: #annotation x #sample x dim' 574 | 575 | def _slice(_x, n, dim): 576 | if _x.ndim == 3: 577 | return _x[:, :, n*dim:(n+1)*dim] 578 | return _x[:, n*dim:(n+1)*dim] 579 | 580 | # 581 | # chunking prediction 582 | # 583 | # We have to firstly compute the word hidden 1 and then compute the 584 | # chunk hidden given the word hidden 1 585 | def predict_chunk( m_, cw_x_, cw_xx_, h_chunk, 586 | h_cw, h1_last_chunk_end_word, 587 | # non sequences 588 | pctx_chunk, cc, 589 | chunk_transform_matrix, U_chunk, Wc_chunk, W_comb_att_chunk, U_att_chunk, c_tt_chunk, 590 | Ux_chunk, Wcx_chunk, U_nl_chunk, Ux_nl_chunk, b_nl_chunk, bx_nl_chunk, Wx_chunk, bx_chunk, 591 | W_chunk, b_chunk, 592 | W_current_chunk_hidden, W_current_chunk_c, U, Wc, W_comb_att, W_cu_chunk_att, U_att, c_tt, Ux, Wcx, U_nl, Ux_nl, 593 | b_nl, bx_nl): 594 | 595 | 596 | # 597 | # incorporate the last words into word hidden 1 : h1 598 | # 599 | preact1 = tensor.dot(h_cw, U) 600 | preact1 += cw_x_ 601 | preact1 = tensor.nnet.sigmoid(preact1) 602 | 603 | r1 = _slice(preact1, 0, dim) 604 | u1 = _slice(preact1, 1, dim) 605 | 606 | preactx1 = tensor.dot(h_cw, Ux) 607 | preactx1 *= r1 608 | preactx1 += cw_xx_ 609 | 610 | h1 = tensor.tanh(preactx1) 611 | 612 | h1 = u1 * h_cw + (1. - u1) * h1 613 | h1 = m_[:, None] * h1 + (1. 
- m_)[:, None] * h_cw 614 | 615 | ret_word_hidden1 = h1 616 | 617 | ########### end compute word h1 618 | 619 | # 620 | # compute the chunk embedding ###################### 621 | # 622 | last_chunk_emb = h1 - h1_last_chunk_end_word 623 | 624 | 625 | transform = chunk_transform_matrix[0] 626 | last_chunk_emb = tensor.dot(last_chunk_emb, transform) # TODO, make sure that here the chunkindex is last chunk index 627 | 628 | 629 | # 630 | # compute the current chunk hidden 631 | # 632 | chunk_xx_ = tensor.dot(last_chunk_emb, Wx_chunk) + \ 633 | bx_chunk 634 | chunk_x_ = tensor.dot(last_chunk_emb, W_chunk) + \ 635 | b_chunk 636 | 637 | 638 | preact1 = tensor.dot(h_chunk, U_chunk) 639 | preact1 += chunk_x_ 640 | preact1 = tensor.nnet.sigmoid(preact1) 641 | 642 | r1 = _slice(preact1, 0, chunk_hidden_dim) 643 | u1 = _slice(preact1, 1, chunk_hidden_dim) 644 | 645 | preactx1 = tensor.dot(h_chunk, Ux_chunk) 646 | preactx1 *= r1 647 | preactx1 += chunk_xx_ 648 | 649 | h1 = tensor.tanh(preactx1) 650 | 651 | 652 | h1 = u1 * h_chunk + (1. - u1) * h1 653 | 654 | 655 | h1 = m_[:, None] * h1 + (1. - m_)[:, None] * h_chunk 656 | 657 | # 658 | # attention 659 | # 660 | pstate_ = tensor.dot(h1, W_comb_att_chunk) 661 | pctx__ = pctx_chunk + pstate_[None, :, :] 662 | #pctx__ += xc_ 663 | pctx__ = tensor.tanh(pctx__) 664 | alpha = tensor.dot(pctx__, U_att_chunk)+c_tt_chunk 665 | alpha = alpha.reshape([alpha.shape[0], alpha.shape[1]]) 666 | alpha = tensor.exp(alpha) 667 | if context_mask: 668 | alpha = alpha * context_mask 669 | alpha = alpha / alpha.sum(0, keepdims=True) 670 | ctx_ = (cc * alpha[:, :, None]).sum(0) # current context 671 | 672 | 673 | preact2 = tensor.dot(h1, U_nl_chunk)+b_nl_chunk 674 | preact2 += tensor.dot(ctx_, Wc_chunk) 675 | preact2 = tensor.nnet.sigmoid(preact2) 676 | 677 | r2 = _slice(preact2, 0, chunk_hidden_dim) 678 | u2 = _slice(preact2, 1, chunk_hidden_dim) 679 | 680 | preactx2 = tensor.dot(h1, Ux_nl_chunk)+bx_nl_chunk 681 | preactx2 *= r2 682 | preactx2 += tensor.dot(ctx_, Wcx_chunk) 683 | 684 | h2 = tensor.tanh(preactx2) 685 | 686 | h2 = u2 * h1 + (1. - u2) * h2 687 | h2 = m_[:, None] * h2 + (1. 
- m_)[:, None] * h1 688 | 689 | chunk_hidden2 = h2 690 | chunk_ctx = ctx_ 691 | chunk_alpha = alpha.T 692 | 693 | 694 | 695 | return ret_word_hidden1, last_chunk_emb, chunk_hidden2, chunk_ctx, chunk_alpha 696 | 697 | 698 | # 699 | # given word hidden1, chunk hidden, compute the 700 | # word hidden2 701 | # 702 | def predict_word_hidden2(m_, word_hidden1, chunk_hidden, 703 | pctx_, cc_, 704 | W_current_chunk_hidden, W_current_chunk_c, U, Wc, W_comb_att, 705 | W_cu_chunk_att, U_att, c_tt, Ux, Wcx, U_nl, Ux_nl, b_nl, bx_nl): 706 | 707 | 708 | m = tensor.alloc(0., chunk_hidden.shape[0], chunk_hidden.shape[1]) 709 | 710 | chunk_hidden = m * chunk_hidden 711 | # 712 | # given the word hidden1 and chunk hidden , compute 713 | # word attention 714 | pstate_ = tensor.dot(word_hidden1, W_comb_att) 715 | pstate_chunk = tensor.dot(chunk_hidden, W_cu_chunk_att) 716 | 717 | 718 | 719 | ################ 720 | # revise 721 | ################ 722 | pctx__ = pctx_ + pstate_[None, :, :] + pstate_chunk[None, :, :] 723 | # pctx__ = pctx_ + pstate_[None, :, :] 724 | pctx__ = tensor.tanh(pctx__) 725 | alpha = tensor.dot(pctx__, U_att)+c_tt 726 | alpha = alpha.reshape([alpha.shape[0], alpha.shape[1]]) 727 | alpha = tensor.exp(alpha) 728 | if context_mask: 729 | alpha = alpha * context_mask 730 | alpha = alpha / alpha.sum(0, keepdims=True) 731 | ctx_ = (cc_ * alpha[:, :, None]).sum(0) # current context 732 | 733 | 734 | 735 | preact2 = tensor.dot(word_hidden1, U_nl)+b_nl 736 | 737 | 738 | ################ 739 | # revise 740 | ################ 741 | preact2 += tensor.dot(ctx_, Wc) + tensor.dot(chunk_hidden, W_current_chunk_c) 742 | # preact2 += tensor.dot(ctx_, Wc) 743 | preact2 = tensor.nnet.sigmoid(preact2) 744 | 745 | r2 = _slice(preact2, 0, dim) 746 | u2 = _slice(preact2, 1, dim) 747 | 748 | preactx2 = tensor.dot(word_hidden1, Ux_nl)+bx_nl 749 | preactx2 *= r2 750 | 751 | 752 | ################ 753 | # revise 754 | ################ 755 | preactx2 += tensor.dot(ctx_, Wcx) + tensor.dot(chunk_hidden, W_current_chunk_hidden) # here we add current chunk representation 756 | # preactx2 += tensor.dot(ctx_, Wcx) # here we add current chunk representation 757 | 758 | 759 | h2 = tensor.tanh(preactx2) 760 | 761 | h2 = u2 * word_hidden1 + (1. - u2) * h2 762 | h2 = m_[:, None] * h2 + (1. 
- m_)[:, None] * word_hidden1 763 | 764 | return h2, ctx_, alpha.T # pstate_, preact, preactx, r, u 765 | 766 | 767 | 768 | def scan_step( # seq 769 | m_, chunk_boundary, cw_x_, cw_xx_, 770 | # outputs info 771 | h_chunk, position_chunk_hidden2, ctx_chunk, alpha_chunk, chunk_true, 772 | h_cw, position_h1, h1_last_chunk_end_word, ctx_cw, alpha_cw, 773 | # non sequences 774 | pctx_chunk, pctx_cw, cc, 775 | chunk_transform_matrix, U_chunk, Wc_chunk, W_comb_att_chunk, U_att_chunk, c_tt_chunk, 776 | Ux_chunk, Wcx_chunk, U_nl_chunk, Ux_nl_chunk, b_nl_chunk, bx_nl_chunk, Wx_chunk, bx_chunk, 777 | W_chunk, b_chunk, 778 | W_current_chunk_hidden, W_current_chunk_c, U, Wc, W_comb_att, W_cu_chunk_att, U_att, c_tt, Ux, Wcx, 779 | U_nl, Ux_nl, b_nl, bx_nl): 780 | 781 | 782 | word_hidden1, \ 783 | last_chunk_emb, \ 784 | current_position_hypo_chunk_hidden, \ 785 | chunk_ctx, chunk_alpha = \ 786 | predict_chunk(m_, cw_x_, cw_xx_, h_chunk, 787 | h_cw, h1_last_chunk_end_word, 788 | pctx_chunk, cc, 789 | chunk_transform_matrix, U_chunk, Wc_chunk, W_comb_att_chunk, U_att_chunk, c_tt_chunk, 790 | Ux_chunk, Wcx_chunk, U_nl_chunk, Ux_nl_chunk, b_nl_chunk, bx_nl_chunk, Wx_chunk, bx_chunk, 791 | W_chunk, b_chunk, 792 | W_current_chunk_hidden, W_current_chunk_c, U, Wc, W_comb_att, W_cu_chunk_att, U_att, c_tt, Ux, 793 | Wcx, U_nl, Ux_nl, b_nl, bx_nl) 794 | 795 | 796 | # 797 | # if current chunk indocator is 1, then this is a begin of a new chunk, 798 | # the chunk hidden is cuttent chunk hidde, otherwise, this word is still in the old chunk 799 | # the last chunk hidden will still be used. 800 | # 801 | chunk_hidden = chunk_boundary[:, None] * current_position_hypo_chunk_hidden \ 802 | + (1. - chunk_boundary)[:, None] * h_chunk 803 | 804 | h1_last_chunk_end_word = chunk_boundary[:, None] * word_hidden1 \ 805 | + (1. 
- chunk_boundary)[:, None] * h1_last_chunk_end_word 806 | 807 | 808 | word_hidden2, \ 809 | word_ctx_, \ 810 | word_alpha = predict_word_hidden2(m_, word_hidden1, chunk_hidden, 811 | pctx_cw, cc, 812 | W_current_chunk_hidden, W_current_chunk_c, U, Wc, W_comb_att, 813 | W_cu_chunk_att, U_att, c_tt, Ux, Wcx, U_nl, Ux_nl, b_nl, bx_nl) 814 | 815 | 816 | 817 | return chunk_hidden, current_position_hypo_chunk_hidden, chunk_ctx, chunk_alpha, last_chunk_emb, \ 818 | word_hidden2, word_hidden1, h1_last_chunk_end_word, word_ctx_, word_alpha 819 | 820 | 821 | 822 | _step = scan_step 823 | 824 | word_shared_vars = [tparams[_p(prefix, 'W_use_current_chunk')], 825 | tparams[_p(prefix, 'W_current_chunk_c')], 826 | tparams[_p(prefix, 'U')], 827 | tparams[_p(prefix, 'Wc')], 828 | tparams[_p(prefix, 'W_comb_att')], 829 | tparams[_p(prefix, 'W_cu_chunk_att')], 830 | tparams[_p(prefix, 'U_att')], 831 | tparams[_p(prefix, 'c_tt')], 832 | tparams[_p(prefix, 'Ux')], 833 | tparams[_p(prefix, 'Wcx')], 834 | tparams[_p(prefix, 'U_nl')], 835 | tparams[_p(prefix, 'Ux_nl')], 836 | tparams[_p(prefix, 'b_nl')], 837 | tparams[_p(prefix, 'bx_nl')]] 838 | 839 | chunk_shared_vars = [tparams[_p(prefix, 'chunk_transform_matrix')], 840 | tparams[_p(prefix, 'U_chunk')], 841 | tparams[_p(prefix, 'Wc_chunk')], 842 | tparams[_p(prefix, 'W_comb_att_chunk')], 843 | tparams[_p(prefix, 'U_att_chunk')], 844 | tparams[_p(prefix, 'c_tt_chunk')], 845 | tparams[_p(prefix, 'Ux_chunk')], 846 | tparams[_p(prefix, 'Wcx_chunk')], 847 | tparams[_p(prefix, 'U_nl_chunk')], 848 | tparams[_p(prefix, 'Ux_nl_chunk')], 849 | tparams[_p(prefix, 'b_nl_chunk')], 850 | tparams[_p(prefix, 'bx_nl_chunk')], 851 | tparams[_p(prefix, 'Wx_chunk')], 852 | tparams[_p(prefix, 'bx_chunk')], 853 | tparams[_p(prefix, 'W_chunk')], 854 | tparams[_p(prefix, 'b_chunk')]] 855 | 856 | # compute the word hidden1 and chunk hidden during sample 857 | if one_step_chunk: 858 | 859 | chunk_pctx_ = tensor.dot(context, tparams[_p(prefix, 'Wc_att_chunk')]) + \ 860 | tparams[_p(prefix, 'b_att_chunk')] 861 | 862 | 863 | # projected x 864 | state_belowx = tensor.dot(emb, tparams[_p(prefix, 'Wx')]) + \ 865 | tparams[_p(prefix, 'bx')] 866 | state_below_ = tensor.dot(emb, tparams[_p(prefix, 'W')]) + \ 867 | tparams[_p(prefix, 'b')] 868 | 869 | 870 | 871 | seqs = [mask, state_below_, state_belowx, init_state_chunk, 872 | init_state_chunk_words, last_chunk_end_word_hidden1] 873 | rval = predict_chunk(*(seqs + [chunk_pctx_, context] + 874 | chunk_shared_vars + word_shared_vars)) 875 | return rval[0], rval[1], rval[2], rval[3], rval[4], None, None, None, None, None 876 | # ret_word_hidden1, last_chunk_emb, chunk_hidden2, chunk_ctx, chunk_alpha 877 | 878 | # given the word hidden1 and chunk hidden, compute the word hidden 2 879 | elif one_step_word: 880 | 881 | # word pctx 882 | pctx_ = tensor.dot(context, tparams[_p(prefix, 'Wc_att')]) + \ 883 | tparams[_p(prefix, 'b_att')] 884 | 885 | seqs = [mask, current_word_hidden1, current_chunk_hidden, 886 | pctx_, context] 887 | 888 | rval = predict_word_hidden2(*(seqs + word_shared_vars )) 889 | return rval[0], rval[1], rval[2], None, None, None, None, None, None, None 890 | # word hidden2, word ctx, word attention alpha 891 | 892 | 893 | # word pctx 894 | pctx_ = tensor.dot(context, tparams[_p(prefix, 'Wc_att')]) + \ 895 | tparams[_p(prefix, 'b_att')] 896 | 897 | # chunk pctx 898 | chunk_pctx_ = tensor.dot(context, tparams[_p(prefix, 'Wc_att_chunk')]) + \ 899 | tparams[_p(prefix, 'b_att_chunk')] 900 | 901 | 902 | 903 | # projected x 904 | 
state_belowx = tensor.dot(emb, tparams[_p(prefix, 'Wx')]) + \ 905 | tparams[_p(prefix, 'bx')] 906 | state_below_ = tensor.dot(emb, tparams[_p(prefix, 'W')]) + \ 907 | tparams[_p(prefix, 'b')] 908 | 909 | # the sequence is 910 | # @mask the word mask for batch training 911 | # @chunk_boundary_indicator 1: this is a begin of a chunk, 0: this is a inter part of a chunk 912 | # @state_below_ W*y_emb 913 | # @state_belowx W*y_emb, for different usage 914 | seqs = [mask, chunk_boundary_indicator, state_below_, state_belowx] 915 | 916 | 917 | 918 | # outputs_info of the training scan process 919 | init_chunk_ctx = tensor.alloc(0., n_samples, context.shape[2]) 920 | init_chunk_alpha = tensor.alloc(0., n_samples, context.shape[0]) 921 | h1_last_chunk_end_word = tensor.alloc(0., n_samples, dim) # set last chunk hidden 0 922 | position_h1 = tensor.alloc(0., n_samples, dim) 923 | last_chunk_emb = tensor.alloc(0., n_samples, options['dim_chunk']) 924 | 925 | init_word_ctx = tensor.alloc(0., n_samples, context.shape[2]) 926 | init_word_alpha = tensor.alloc(0., n_samples, context.shape[0]) 927 | 928 | 929 | # chunk_hidden, current_position_hypo_chunk_hidden, chunk_ctx, chunk_alpha, last_chunk_emb, \ 930 | # word_hidden1, word_hidden2, h1_last_chunk_end_word, word_ctx_, word_alpha 931 | 932 | outputs = [init_state_chunk, 933 | init_state_chunk, # only for output 934 | init_chunk_ctx, 935 | init_chunk_alpha, 936 | last_chunk_emb, 937 | init_state_chunk_words, 938 | position_h1, # current position computed word hidden1 939 | h1_last_chunk_end_word, 940 | init_word_ctx, 941 | init_word_alpha] 942 | 943 | 944 | rval, updates = theano.scan(_step, 945 | sequences=seqs, 946 | outputs_info=outputs, 947 | # here pctx is the tranformation of the source context 948 | non_sequences=[chunk_pctx_, pctx_, context]+chunk_shared_vars+word_shared_vars, 949 | name=_p(prefix, '_layers'), 950 | #n_steps=n_chunk_step, 951 | n_steps=nsteps, 952 | profile=profile, 953 | strict=True) 954 | 955 | return rval 956 | # chunk_hidden, chunk_ctx, chunk_alpha, word_hidden2, h1_last_chunk_end_word, word_ctx_, word_alpha 957 | 958 | 959 | 960 | 961 | # initialize all parameters 962 | def init_params(options): 963 | params = OrderedDict() 964 | 965 | # embedding 966 | params['Wemb'] = norm_weight(options['n_words_src'], options['dim_word']) 967 | 968 | params['Wemb_dec'] = norm_weight(options['n_words'], options['dim_word']) 969 | 970 | # encoder: bidirectional RNN 971 | params = get_layer(options['encoder'])[0](options, params, 972 | prefix='encoder', 973 | nin=options['dim_word'], 974 | dim=options['dim']) 975 | params = get_layer(options['encoder'])[0](options, params, 976 | prefix='encoder_r', 977 | nin=options['dim_word'], 978 | dim=options['dim']) 979 | 980 | 981 | ctxdim = 2 * options['dim'] 982 | 983 | 984 | # 985 | # generate the initial hidden representation for word and chunk 986 | # 987 | 988 | # init_state, init_cell 989 | params = get_layer('ff')[0](options, params, prefix='ff_state_chunk', 990 | nin=ctxdim, nout=options['dim_chunk_hidden']) 991 | 992 | 993 | # init_state, init_cell 994 | params = get_layer('ff')[0](options, params, prefix='ff_state_chunk_words', 995 | nin=ctxdim, nout=options['dim']) 996 | 997 | 998 | 999 | # decoder 1000 | params = get_layer(options['decoder'])[0](options, params, 1001 | prefix='decoder', 1002 | nin=options['dim_word'], 1003 | dim=options['dim'], 1004 | dimctx=ctxdim, 1005 | nin_chunk=options['dim_chunk'], 1006 | dim_chunk_hidden=options['dim_chunk_hidden']) 1007 | 1008 | 1009 | # readout 
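# (Added note, not part of the original file) These feed-forward blocks form
# the word-level readout used in build_model below: the word distribution is
# softmax(ff_logit(tanh(ff_logit_lstm(word_hidden2) + ff_logit_prev(previous
# word embedding) + ff_logit_ctx(word attention context) +
# ff_logit_using_chunk_hidden(chunk hidden)))), with the chunk-hidden term
# currently zeroed out by a mask in build_model. The *_chunk blocks defined
# next build the analogous readout over the n_chunks chunk tags.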
1010 | params = get_layer('ff')[0](options, params, prefix='ff_logit_lstm', 1011 | nin=options['dim'], nout=options['dim_word'], 1012 | ortho=False) 1013 | params = get_layer('ff')[0](options, params, prefix='ff_logit_prev', 1014 | nin=options['dim_word'], 1015 | nout=options['dim_word'], ortho=False) 1016 | params = get_layer('ff')[0](options, params, prefix='ff_logit_ctx', 1017 | nin=ctxdim, nout=options['dim_word'], 1018 | ortho=False) 1019 | params = get_layer('ff')[0](options, params, prefix='ff_logit_using_chunk_hidden', 1020 | nin=options['dim_chunk_hidden'], nout=options['dim_word'], 1021 | ortho=False) 1022 | # params = get_layer('ff')[0](options, params, prefix='ff_logit_chunk_hidden', 1023 | # nin=ctxdim, nout=options['dim_word'], 1024 | # ortho=False) 1025 | params = get_layer('ff')[0](options, params, prefix='ff_logit', 1026 | nin=options['dim_word'], 1027 | nout=options['n_words']) 1028 | 1029 | # readout 1030 | 1031 | params = get_layer('ff')[0](options, params, prefix='ff_logit_lstm_chunk', 1032 | nin=options['dim_chunk_hidden'], nout=options['dim_chunk'], 1033 | ortho=False) 1034 | 1035 | # we should note here, we use word dim 1036 | params = get_layer('ff')[0](options, params, prefix='ff_logit_prev_chunk', 1037 | nin=options['dim_chunk'], 1038 | nout=options['dim_chunk'], ortho=False) 1039 | params = get_layer('ff')[0](options, params, prefix='ff_logit_ctx_chunk', 1040 | nin=ctxdim, nout=options['dim_chunk'], 1041 | ortho=False) 1042 | 1043 | params = get_layer('ff')[0](options, params, prefix='logit_ctx_last_word', 1044 | nin=options['dim_word'], 1045 | nout=options['dim_chunk'], 1046 | ortho=False) 1047 | 1048 | params = get_layer('ff')[0](options, params, prefix='logit_ctx_current_word_hidden1', 1049 | nin=options['dim'], 1050 | nout=options['dim_chunk'], 1051 | ortho=False) 1052 | 1053 | 1054 | params = get_layer('ff')[0](options, params, prefix='ff_logit_chunk', 1055 | nin=options['dim_chunk'], 1056 | nout=options['dim_chunk'], 1057 | ortho=False) 1058 | 1059 | 1060 | return params 1061 | 1062 | 1063 | # build a training model 1064 | def build_model(tparams, options): 1065 | opt_ret = dict() 1066 | 1067 | trng = RandomStreams(1234) 1068 | use_noise = theano.shared(numpy.float32(0.)) 1069 | 1070 | # description string: #words x #samples 1071 | x = tensor.matrix('x', dtype='int64') 1072 | x_mask = tensor.matrix('x_mask', dtype='float32') 1073 | 1074 | y_chunk = tensor.matrix('y_chunk', dtype='int64') 1075 | y_chunk_words = tensor.matrix('y_chunk_words', dtype='int64') 1076 | y_mask = tensor.matrix('y_mask', dtype='float32') 1077 | chunk_indicator = tensor.matrix('chunk_indicator', dtype='float32') 1078 | 1079 | # for the backward rnn, we just need to invert x and x_mask 1080 | xr = x[::-1] 1081 | xr_mask = x_mask[::-1] 1082 | 1083 | n_timesteps = x.shape[0] 1084 | n_timesteps_y = y_chunk.shape[0] 1085 | n_samples = x.shape[1] 1086 | 1087 | # word embedding for forward rnn (source) 1088 | emb = tparams['Wemb'][x.flatten()] 1089 | emb = emb.reshape([n_timesteps, n_samples, options['dim_word']]) 1090 | proj = get_layer(options['encoder'])[1](tparams, emb, options, 1091 | prefix='encoder', 1092 | mask=x_mask) 1093 | # word embedding for backward rnn (source) 1094 | embr = tparams['Wemb'][xr.flatten()] 1095 | embr = embr.reshape([n_timesteps, n_samples, options['dim_word']]) 1096 | projr = get_layer(options['encoder'])[1](tparams, embr, options, 1097 | prefix='encoder_r', 1098 | mask=xr_mask) 1099 | 1100 | # context will be the concatenation of forward and backward rnns 
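# (Added note, not part of the original file) proj[0] and projr[0] each have
# shape (n_timesteps, n_samples, dim); projr[0] is reversed in time so the
# backward states line up with the forward ones, and concatenating along the
# last axis gives ctx of shape (n_timesteps, n_samples, 2 * dim). Its masked
# time-average ctx_mean then initializes both decoder states below.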
1101 | ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim-1) 1102 | 1103 | # mean of the context (across time) will be used to initialize decoder rnn 1104 | ctx_mean = (ctx * x_mask[:, :, None]).sum(0) / x_mask.sum(0)[:, None] 1105 | 1106 | # or you can use the last state of forward + backward encoder rnns 1107 | # ctx_mean = concatenate([proj[0][-1], projr[0][-1]], axis=proj[0].ndim-2) 1108 | 1109 | # initial decoder state for both 1110 | init_state_chunk = get_layer('ff')[1](tparams, ctx_mean, options, 1111 | prefix='ff_state_chunk', activ='tanh') 1112 | init_state_chunk_words = get_layer('ff')[1](tparams, ctx_mean, options, 1113 | prefix='ff_state_chunk_words', activ='tanh') 1114 | 1115 | # word embedding (target), we will shift the target sequence one time step 1116 | # to the right. This is done because of the bi-gram connections in the 1117 | # readout and decoder rnn. The first target will be all zeros and we will 1118 | # not condition on the last output. 1119 | 1120 | 1121 | # shift the word embeddings in the chunk 1122 | emb = tparams['Wemb_dec'][y_chunk_words.flatten()] 1123 | emb = emb.reshape([n_timesteps_y, n_samples, options['dim_word']]) 1124 | 1125 | emb_shifted = tensor.zeros_like(emb) 1126 | emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1]) 1127 | emb = emb_shifted 1128 | 1129 | y_chunk_shift = tensor.zeros_like(y_chunk) 1130 | y_chunk_shift = tensor.set_subtensor(y_chunk_shift[1:], y_chunk[:-1]) 1131 | 1132 | # 1133 | # decoder 1134 | chunk_hidden, \ 1135 | current_position_hypo_chunk_hidden, \ 1136 | chunk_ctx, \ 1137 | chunk_alpha, \ 1138 | last_chunk_emb, \ 1139 | word_hidden2, \ 1140 | word_hidden1, \ 1141 | h1_last_chunk_end_word, \ 1142 | word_ctx_, \ 1143 | word_alpha = get_layer(options['decoder'])[1](tparams, emb, y_chunk_shift, 1144 | options, 1145 | prefix='decoder', 1146 | mask=y_mask, 1147 | chunk_boundary_indicator=chunk_indicator, 1148 | context=ctx, 1149 | context_mask=x_mask, 1150 | init_state_chunk=init_state_chunk, 1151 | init_state_chunk_words=init_state_chunk_words) 1152 | # 1153 | opt_ret['dec_alphas_chunk'] = chunk_alpha 1154 | 1155 | 1156 | logit_lstm_chunk = get_layer('ff')[1](tparams, current_position_hypo_chunk_hidden, options, 1157 | prefix='ff_logit_lstm_chunk', activ='linear') 1158 | logit_prev_chunk = get_layer('ff')[1](tparams, last_chunk_emb, options, 1159 | prefix='ff_logit_prev_chunk', activ='linear') 1160 | logit_ctx_chunk = get_layer('ff')[1](tparams, chunk_ctx, options, 1161 | prefix='ff_logit_ctx_chunk', activ='linear') 1162 | 1163 | logit_ctx_last_word = get_layer('ff')[1](tparams, emb, options, 1164 | prefix='logit_ctx_last_word', activ='linear') 1165 | logit_ctx_current_word_hidden1 = get_layer('ff')[1](tparams, word_hidden1, options, 1166 | prefix='logit_ctx_current_word_hidden1', activ='linear') 1167 | 1168 | 1169 | 1170 | logit_chunk = tensor.tanh(logit_lstm_chunk+logit_prev_chunk+logit_ctx_chunk+logit_ctx_last_word+logit_ctx_current_word_hidden1) 1171 | 1172 | if options['use_dropout']: 1173 | logit_chunk = dropout_layer(logit_chunk, use_noise, trng) 1174 | logit_chunk = get_layer('ff')[1](tparams, logit_chunk, options, 1175 | prefix='ff_logit_chunk', activ='linear') 1176 | logit_shp_chunk = logit_chunk.shape 1177 | probs_chunk = tensor.nnet.softmax(logit_chunk.reshape([logit_shp_chunk[0]*logit_shp_chunk[1], 1178 | logit_shp_chunk[2]])) 1179 | 1180 | # cost 1181 | y_flat_chunk = y_chunk.flatten() 1182 | y_flat_idx_chunk = tensor.arange(y_flat_chunk.shape[0]) * options['n_chunks'] + y_flat_chunk 1183 | 
cost = -tensor.log(probs_chunk.flatten()[y_flat_idx_chunk]) 1184 | cost = cost.reshape([y_chunk.shape[0], y_chunk.shape[1]]) 1185 | 1186 | 1187 | m = tensor.alloc(0., y_mask.shape[0], y_mask.shape[1]) 1188 | cost = m * cost 1189 | 1190 | 1191 | # weights (alignment matrix) 1192 | opt_ret['dec_alphas_cw'] = word_ctx_ 1193 | 1194 | # compute word probabilities 1195 | logit_lstm_cw = get_layer('ff')[1](tparams, word_hidden2, options, 1196 | prefix='ff_logit_lstm', activ='linear') 1197 | logit_prev_cw = get_layer('ff')[1](tparams, emb, options, 1198 | prefix='ff_logit_prev', activ='linear') 1199 | logit_ctx_cw = get_layer('ff')[1](tparams, word_ctx_, options, 1200 | prefix='ff_logit_ctx', activ='linear') 1201 | 1202 | 1203 | logit_ctx_using_current_chunk_hidden = get_layer('ff')[1](tparams, chunk_hidden, options, 1204 | prefix='ff_logit_using_chunk_hidden', activ='linear') 1205 | 1206 | m = tensor.alloc(0., logit_ctx_using_current_chunk_hidden.shape[0], logit_ctx_using_current_chunk_hidden.shape[1], logit_ctx_using_current_chunk_hidden.shape[2]) 1207 | 1208 | logit_ctx_using_current_chunk_hidden = m * logit_ctx_using_current_chunk_hidden 1209 | 1210 | 1211 | logit_cw = tensor.tanh(logit_lstm_cw+logit_prev_cw+logit_ctx_cw+logit_ctx_using_current_chunk_hidden) 1212 | # logit_cw = tensor.tanh(logit_lstm_cw+logit_prev_cw+logit_ctx_cw) 1213 | 1214 | if options['use_dropout']: 1215 | logit_cw = dropout_layer(logit_cw, use_noise, trng) 1216 | logit_cw = get_layer('ff')[1](tparams, logit_cw, options, 1217 | prefix='ff_logit', activ='linear') 1218 | logit_shp_cw = logit_cw.shape 1219 | probs_cw = tensor.nnet.softmax(logit_cw.reshape([logit_shp_cw[0]*logit_shp_cw[1], 1220 | logit_shp_cw[2]])) 1221 | 1222 | # cost 1223 | y_flat_cw = y_chunk_words.flatten() 1224 | y_flat_idx_cw = tensor.arange(y_flat_cw.shape[0]) * options['n_words'] + y_flat_cw 1225 | 1226 | cost_cw = -tensor.log(probs_cw.flatten()[y_flat_idx_cw]) 1227 | cost_cw = cost_cw.reshape([y_chunk_words.shape[0], y_chunk_words.shape[1]]) 1228 | 1229 | 1230 | cost = cost + cost_cw 1231 | # cost = cost_cw 1232 | cost = (cost * y_mask).sum(0) 1233 | 1234 | return trng, use_noise, x, x_mask, y_chunk, y_mask, y_chunk_words, chunk_indicator,\ 1235 | opt_ret, cost, cost_cw 1236 | 1237 | # build a sampler 1238 | def build_sampler(tparams, options, trng, use_noise): 1239 | 1240 | 1241 | x = tensor.matrix('x', dtype='int64') 1242 | 1243 | xr = x[::-1] 1244 | 1245 | n_timesteps = x.shape[0] 1246 | n_samples = x.shape[1] 1247 | 1248 | # word embedding (source), forward and backward 1249 | emb = tparams['Wemb'][x.flatten()] 1250 | emb = emb.reshape([n_timesteps, n_samples, options['dim_word']]) 1251 | embr = tparams['Wemb'][xr.flatten()] 1252 | embr = embr.reshape([n_timesteps, n_samples, options['dim_word']]) 1253 | 1254 | # encoder 1255 | proj = get_layer(options['encoder'])[1](tparams, emb, options, 1256 | prefix='encoder') 1257 | projr = get_layer(options['encoder'])[1](tparams, embr, options, 1258 | prefix='encoder_r') 1259 | 1260 | # concatenate forward and backward rnn hidden states 1261 | ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim-1) 1262 | 1263 | # get the input for decoder rnn initializer mlp 1264 | ctx_mean = ctx.mean(0) 1265 | # ctx_mean = concatenate([proj[0][-1],projr[0][-1]], axis=proj[0].ndim-2) 1266 | 1267 | # initial decoder state for both 1268 | init_state_chunk = get_layer('ff')[1](tparams, ctx_mean, options, 1269 | prefix='ff_state_chunk', activ='tanh') 1270 | init_state_chunk_words = get_layer('ff')[1](tparams, 
ctx_mean, options, 1271 | prefix='ff_state_chunk_words', activ='tanh') 1272 | 1273 | 1274 | print 'Building f_init...', 1275 | outs = [init_state_chunk, init_state_chunk_words, ctx] 1276 | f_init = theano.function([x], outs, name='f_init', profile=profile) 1277 | print 'Done' 1278 | 1279 | 1280 | 1281 | 1282 | # 1283 | # build predict word hidden 1 and chunk hidden2 1284 | # 1285 | 1286 | # TODO note that here the y_chunk and y_chunk_words are both vector, because it only conduct one steps! 1287 | # y_chunk = tensor.vector('y_sample_chunk', dtype='int64') 1288 | y_chunk_words = tensor.vector('y_sample_chunk_words', dtype='int64') 1289 | 1290 | chunk_boundary = tensor.vector('chunk_boundary', dtype='float32') 1291 | 1292 | init_state_chunk = tensor.matrix('init_state_chunk', dtype='float32') 1293 | init_state_chunk_words = tensor.matrix('init_state_chunk_words', dtype='float32') 1294 | 1295 | last_chunk_end_word_hidden1 = tensor.matrix('last_chunk_end_word_hidden1', dtype='float32') 1296 | 1297 | 1298 | current_chunk_hidden = tensor.matrix('current_chunk_hidden', dtype='float32') 1299 | 1300 | # if it's the first word, emb should be all zero and it is indicated by -1 1301 | emb_chunk_word = tensor.switch(y_chunk_words[:, None] < 0, 1302 | tensor.alloc(0., 1, tparams['Wemb_dec'].shape[1]), 1303 | tparams['Wemb_dec'][y_chunk_words]) 1304 | 1305 | 1306 | 1307 | # 1308 | # decoder 1309 | # 1310 | retval_predict_chunk = get_layer(options['decoder'])[1](tparams, 1311 | emb_chunk_word, 1312 | None, 1313 | options, 1314 | prefix='decoder', 1315 | context=ctx, 1316 | one_step=True, 1317 | one_step_word=False, 1318 | one_step_chunk=True, 1319 | init_state_chunk=init_state_chunk, 1320 | init_state_chunk_words=init_state_chunk_words, 1321 | last_chunk_end_word_hidden1=last_chunk_end_word_hidden1) 1322 | word_hidden1 = retval_predict_chunk[0] 1323 | last_chunk_emb = retval_predict_chunk[1] 1324 | current_position_hypo_chunk_hidden = retval_predict_chunk[2] 1325 | chunk_ctx = retval_predict_chunk[3] 1326 | chunk_alpha = retval_predict_chunk[4] 1327 | 1328 | 1329 | # 1330 | # get the chunk prediction 1331 | # 1332 | logit_lstm_chunk = get_layer('ff')[1](tparams, current_position_hypo_chunk_hidden, options, 1333 | prefix='ff_logit_lstm_chunk', activ='linear') 1334 | logit_prev_chunk = get_layer('ff')[1](tparams, last_chunk_emb, options, 1335 | prefix='ff_logit_prev_chunk', activ='linear') 1336 | logit_ctx_chunk = get_layer('ff')[1](tparams, chunk_ctx, options, 1337 | prefix='ff_logit_ctx_chunk', activ='linear') 1338 | 1339 | logit_ctx_last_word = get_layer('ff')[1](tparams, emb_chunk_word, options, 1340 | prefix='logit_ctx_last_word', activ='linear') 1341 | logit_ctx_current_word_hidden1 = get_layer('ff')[1](tparams, word_hidden1, options, 1342 | prefix='logit_ctx_current_word_hidden1', activ='linear') 1343 | 1344 | 1345 | 1346 | logit_chunk = tensor.tanh(logit_lstm_chunk+logit_prev_chunk+logit_ctx_chunk+logit_ctx_last_word+logit_ctx_current_word_hidden1) 1347 | 1348 | if options['use_dropout']: 1349 | logit_chunk = dropout_layer(logit_chunk, use_noise, trng) 1350 | logit_chunk = get_layer('ff')[1](tparams, logit_chunk, options, 1351 | prefix='ff_logit_chunk', activ='linear') 1352 | probs_chunk = tensor.nnet.softmax(logit_chunk) 1353 | 1354 | next_sample_chunk = trng.multinomial(pvals=probs_chunk).argmax(1) 1355 | 1356 | 1357 | print 'Building f_next_chunk..' 
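# Added aside (mirrors how gen_sample below drives these functions; names are
# illustrative): for a single live hypothesis, the first decoding step looks
# like
#
#     next_state_chunk, next_state_word, ctx0 = f_init(x)
#     next_w = -1 * numpy.ones((1,)).astype('int64')  # -1 marks <bos>
#     h1_prev_chunk_end = numpy.zeros((1, options['dim'])).astype('float32')
#     probs_c, next_c, h1, hypo_chunk_h = f_next_chunk(
#         next_w, ctx0, next_state_chunk, next_state_word, h1_prev_chunk_end)
#
# where probs_c is the distribution over the n_chunks chunk tags and
# hypo_chunk_h is the chunk hidden state that would apply if a new chunk
# started at this position.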
1358 | inps = [y_chunk_words, ctx, init_state_chunk, init_state_chunk_words, last_chunk_end_word_hidden1] 1359 | outs = [probs_chunk, next_sample_chunk, word_hidden1, current_position_hypo_chunk_hidden] 1360 | f_next_chunk = theano.function(inps, outs, name='f_next_chunk', profile=profile) 1361 | print 'End Building f_next_chunk..' 1362 | 1363 | 1364 | 1365 | 1366 | 1367 | 1368 | 1369 | 1370 | # 1371 | # begin to predict the word hidden2 1372 | # 1373 | 1374 | 1375 | chunk_boundary = tensor.vector('chunk_boundary', dtype='float32') 1376 | current_chunk_hidden = tensor.matrix('current_chunk_hidden', dtype='float32') 1377 | current_position_hypo_chunk_hidden = tensor.matrix('current_position_hypo_chunk_hidden', dtype='float32') 1378 | word_hidden1 = tensor.matrix('word_hidden1', dtype='float32') 1379 | last_chunk_end_word_hidden1 = tensor.matrix('last_chunk_end_word_hidden1', dtype='float32') 1380 | 1381 | 1382 | 1383 | # given the chunk indicator, compute the word hidden2 1384 | chunk_hidden = chunk_boundary[:, None] * current_position_hypo_chunk_hidden \ 1385 | + (1. - chunk_boundary)[:, None] * current_chunk_hidden 1386 | 1387 | h1_last_chunk_end_word = chunk_boundary[:, None] * word_hidden1 \ 1388 | + (1. - chunk_boundary)[:, None] * last_chunk_end_word_hidden1 1389 | 1390 | 1391 | # 1392 | # decoder for word hidden2 1393 | # 1394 | retval_predict_chunk = get_layer(options['decoder'])[1](tparams, 1395 | None, 1396 | None, 1397 | options, 1398 | prefix='decoder', 1399 | context=ctx, 1400 | one_step=True, 1401 | one_step_word=True, 1402 | one_step_chunk=False, 1403 | current_chunk_hidden=chunk_hidden, 1404 | current_word_hidden1=word_hidden1) 1405 | 1406 | 1407 | word_hidden2 = retval_predict_chunk[0] 1408 | word_ctx = retval_predict_chunk[1] 1409 | word_alpha = retval_predict_chunk[2] 1410 | 1411 | 1412 | # compute word probabilities 1413 | logit_lstm_cw = get_layer('ff')[1](tparams, word_hidden2, options, 1414 | prefix='ff_logit_lstm', activ='linear') 1415 | logit_prev_cw = get_layer('ff')[1](tparams, emb_chunk_word, options, 1416 | prefix='ff_logit_prev', activ='linear') 1417 | logit_ctx_cw = get_layer('ff')[1](tparams, word_ctx, options, 1418 | prefix='ff_logit_ctx', activ='linear') 1419 | 1420 | 1421 | logit_ctx_using_current_chunk_hidden = get_layer('ff')[1](tparams, chunk_hidden, options, 1422 | prefix='ff_logit_using_chunk_hidden', activ='linear') 1423 | 1424 | 1425 | m = tensor.alloc(0., logit_ctx_using_current_chunk_hidden.shape[0], logit_ctx_using_current_chunk_hidden.shape[1]) 1426 | 1427 | logit_ctx_using_current_chunk_hidden = m * logit_ctx_using_current_chunk_hidden 1428 | 1429 | 1430 | 1431 | logit_cw = tensor.tanh(logit_lstm_cw+logit_prev_cw+logit_ctx_cw+logit_ctx_using_current_chunk_hidden) 1432 | 1433 | if options['use_dropout']: 1434 | logit_cw = dropout_layer(logit_cw, use_noise, trng) 1435 | logit_cw = get_layer('ff')[1](tparams, logit_cw, options, 1436 | prefix='ff_logit', activ='linear') 1437 | probs_cw = tensor.nnet.softmax(logit_cw) 1438 | next_sample_cw = trng.multinomial(pvals=probs_cw).argmax(1) 1439 | 1440 | 1441 | 1442 | 1443 | # sample from softmax distribution to get the sample 1444 | # compile a function to do the whole thing above, next word probability, 1445 | # sampled word for the next target, next hidden state to be used 1446 | print 'Building f_next_word..' 
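# Added aside (mirrors gen_sample below; names are illustrative): given probs_c
# from f_next_chunk, the caller derives the boundary indicator and then asks
# for the word distribution:
#
#     chunk_boundary = (probs_c.argmax(1) != 1).astype('float32')  # tag 1 continues the chunk
#     probs_w, next_w, next_state_word, h1_prev_chunk_end, next_state_chunk = \
#         f_next_chunk_word(next_w, ctx0, chunk_boundary, next_state_chunk,
#                           hypo_chunk_h, h1, h1_prev_chunk_end)
#
# so the returned chunk state switches to hypo_chunk_h when chunk_boundary is 1
# and keeps the previous chunk state otherwise, exactly as the gating just
# above computes.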
1447 | inps = [y_chunk_words, 1448 | ctx, 1449 | chunk_boundary, 1450 | current_chunk_hidden, 1451 | current_position_hypo_chunk_hidden, 1452 | word_hidden1, 1453 | last_chunk_end_word_hidden1] 1454 | outs = [probs_cw, next_sample_cw, word_hidden2, h1_last_chunk_end_word, chunk_hidden] 1455 | f_next_chunk_word = theano.function(inps, outs, name='f_next_chunk_word', profile=profile) 1456 | print 'Done' 1457 | 1458 | return f_init, f_next_chunk, f_next_chunk_word 1459 | 1460 | 1461 | 1462 | 1463 | # generate sample, either with stochastic sampling or beam search. Note that, 1464 | # this function iteratively calls f_init and f_next functions. 1465 | def gen_sample(tparams, f_init, f_next_chunk, f_next_word, x, 1466 | options, trng=None, k_chunk=1, k_word=1, k=5, maxlen=50, 1467 | stochastic=True, argmax=False, jointProb=True): 1468 | 1469 | # k is the beam size we have 1470 | if k > 1: 1471 | assert not stochastic, \ 1472 | 'Beam search does not support stochastic sampling' 1473 | 1474 | sample = [] 1475 | sample_score = [] 1476 | if stochastic: 1477 | sample_score = 0 1478 | 1479 | live_k = 1 1480 | dead_k = 0 1481 | 1482 | hyp_samples = [[]] * live_k 1483 | hyp_scores = numpy.zeros(live_k).astype('float32') 1484 | hyp_states = [] 1485 | hyp_chunk_states = [] 1486 | hyp_last_chunk_last_word_hidden1 = [] 1487 | 1488 | # get initial state of decoder rnn and encoder context 1489 | ret = f_init(x) 1490 | next_state_chunk, next_state_word, ctx0 = ret[0], ret[1], ret[2] 1491 | last_chunk_last_word_hidden1 = numpy.zeros((1, options['dim'])).astype('float32') 1492 | 1493 | 1494 | next_w = -1 * numpy.ones((1,)).astype('int64') # bos indicator 1495 | 1496 | # next_chunk = -1 * numpy.ones((1,)).astype('int64') # bos indicator 1497 | # 1498 | # word_hidden1 = None 1499 | 1500 | 1501 | for ii in xrange(maxlen): 1502 | ctx = numpy.tile(ctx0, [live_k, 1]) 1503 | inps = [next_w, 1504 | ctx, 1505 | next_state_chunk, 1506 | next_state_word, 1507 | last_chunk_last_word_hidden1] 1508 | ret = f_next_chunk(*inps) 1509 | next_chunk_p, next_chunk, word_hidden1, hypo_chunk_hidden = ret[0], ret[1], ret[2], ret[3] 1510 | 1511 | 1512 | # get the chunk boundrary indocator 1513 | next_chunk = next_chunk_p.argmax(1) 1514 | chunk_boundary = numpy.zeros((next_chunk.shape[0],)).astype('float32') 1515 | 1516 | for i in xrange(next_chunk.shape[0]): 1517 | if next_chunk[i] != 1: 1518 | chunk_boundary[i] = 1.0 1519 | 1520 | inps = [next_w, 1521 | ctx, 1522 | chunk_boundary, 1523 | next_state_chunk, 1524 | hypo_chunk_hidden, 1525 | word_hidden1, 1526 | last_chunk_last_word_hidden1] 1527 | 1528 | ret = f_next_word(*inps) 1529 | 1530 | next_word_p, \ 1531 | next_w, \ 1532 | next_state_word, \ 1533 | last_chunk_last_word_hidden1, \ 1534 | next_state_chunk \ 1535 | = ret[0], ret[1], ret[2], ret[3], ret[4] 1536 | 1537 | 1538 | if jointProb: 1539 | indicator_score = next_chunk_p.max(1) 1540 | indicator_score = indicator_score.reshape(indicator_score.shape[0], 1) 1541 | next_word_p = indicator_score * next_word_p 1542 | 1543 | 1544 | if stochastic: 1545 | if argmax: 1546 | nw = next_word_p[0].argmax() 1547 | else: 1548 | nw = next_w[0] 1549 | sample.append(nw) 1550 | sample_score -= numpy.log(next_word_p[0, nw]) 1551 | if nw == 0: 1552 | break 1553 | else: 1554 | cand_scores = hyp_scores[:, None] - numpy.log(next_word_p) 1555 | cand_flat = cand_scores.flatten() 1556 | ranks_flat = cand_flat.argsort()[:(k-dead_k)] 1557 | 1558 | voc_size = next_word_p.shape[1] 1559 | trans_indices = ranks_flat / voc_size 1560 | word_indices = ranks_flat 
% voc_size 1561 | costs = cand_flat[ranks_flat] 1562 | 1563 | new_hyp_samples = [] 1564 | new_hyp_scores = numpy.zeros(k-dead_k).astype('float32') 1565 | new_hyp_states = [] 1566 | new_hyp_chunk_states = [] 1567 | new_hyp_last_chunk_last_word_hidden1 = [] 1568 | 1569 | for idx, [ti, wi] in enumerate(zip(trans_indices, word_indices)): 1570 | new_hyp_samples.append(hyp_samples[ti]+[wi]) 1571 | new_hyp_scores[idx] = copy.copy(costs[idx]) 1572 | new_hyp_states.append(copy.copy(next_state_word[ti])) 1573 | new_hyp_chunk_states.append(copy.copy(next_state_chunk[ti])) 1574 | new_hyp_last_chunk_last_word_hidden1.append(copy.copy(last_chunk_last_word_hidden1[ti])) 1575 | 1576 | # check the finished samples 1577 | new_live_k = 0 1578 | hyp_samples = [] 1579 | hyp_scores = [] 1580 | hyp_states = [] 1581 | hyp_chunk_states = [] 1582 | hyp_last_chunk_last_word_hidden1 = [] 1583 | 1584 | for idx in xrange(len(new_hyp_samples)): 1585 | if new_hyp_samples[idx][-1] == 0: 1586 | sample.append(new_hyp_samples[idx]) 1587 | sample_score.append(new_hyp_scores[idx]) 1588 | dead_k += 1 1589 | else: 1590 | new_live_k += 1 1591 | hyp_samples.append(new_hyp_samples[idx]) 1592 | hyp_scores.append(new_hyp_scores[idx]) 1593 | hyp_states.append(new_hyp_states[idx]) 1594 | hyp_chunk_states.append(new_hyp_chunk_states[idx]) 1595 | hyp_last_chunk_last_word_hidden1.append(new_hyp_last_chunk_last_word_hidden1[idx]) 1596 | 1597 | hyp_scores = numpy.array(hyp_scores) 1598 | live_k = new_live_k 1599 | 1600 | if new_live_k < 1: 1601 | break 1602 | if dead_k >= k: 1603 | break 1604 | 1605 | next_w = numpy.array([w[-1] for w in hyp_samples]) 1606 | next_state_word = numpy.array(hyp_states) 1607 | next_state_chunk = numpy.array(hyp_chunk_states) 1608 | last_chunk_last_word_hidden1 = numpy.array(hyp_last_chunk_last_word_hidden1) 1609 | 1610 | if not stochastic: 1611 | # dump every remaining one 1612 | if live_k > 0: 1613 | for idx in xrange(live_k): 1614 | sample.append(hyp_samples[idx]) 1615 | sample_score.append(hyp_scores[idx]) 1616 | 1617 | return sample, sample_score 1618 | 1619 | 1620 | # calculate the log probablities on a given corpus using translation model 1621 | def pred_probs(f_log_probs, prepare_data, options, iterator, verbose=True): 1622 | probs = [] 1623 | 1624 | n_done = 0 1625 | 1626 | for x, y_chunk, y_cw in iterator: 1627 | n_done += len(x) 1628 | 1629 | x, x_mask, y_c, y_cw, chunk_indicator, y_mask = prepare_data(x, y_chunk, y_cw, 1630 | n_words_src=options['n_words_src'], 1631 | n_words=options['n_words']) 1632 | 1633 | pprobs = f_log_probs(x, x_mask, y_c, y_mask, y_cw, chunk_indicator) 1634 | for pp in pprobs: 1635 | probs.append(pp) 1636 | 1637 | if numpy.isnan(numpy.mean(probs)): 1638 | ipdb.set_trace() 1639 | 1640 | if verbose: 1641 | print >>sys.stderr, '%d samples computed' % (n_done) 1642 | 1643 | return numpy.array(probs) 1644 | 1645 | 1646 | # optimizers 1647 | # name(hyperp, tparams, grads, inputs (list), cost) = f_grad_shared, f_update 1648 | def adam(lr, tparams, grads, inp, cost, beta1=0.9, beta2=0.999, e=1e-8): 1649 | 1650 | gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k) 1651 | for k, p in tparams.iteritems()] 1652 | gsup = [(gs, g) for gs, g in zip(gshared, grads)] 1653 | 1654 | f_grad_shared = theano.function(inp, cost, updates=gsup, profile=profile) 1655 | 1656 | updates = [] 1657 | 1658 | t_prev = theano.shared(numpy.float32(0.)) 1659 | t = t_prev + 1. 1660 | lr_t = lr * tensor.sqrt(1. - beta2**t) / (1. 
- beta1**t) 1661 | 1662 | for p, g in zip(tparams.values(), gshared): 1663 | m = theano.shared(p.get_value() * 0., p.name + '_mean') 1664 | v = theano.shared(p.get_value() * 0., p.name + '_variance') 1665 | m_t = beta1 * m + (1. - beta1) * g 1666 | v_t = beta2 * v + (1. - beta2) * g**2 1667 | step = lr_t * m_t / (tensor.sqrt(v_t) + e) 1668 | p_t = p - step 1669 | updates.append((m, m_t)) 1670 | updates.append((v, v_t)) 1671 | updates.append((p, p_t)) 1672 | updates.append((t_prev, t)) 1673 | 1674 | f_update = theano.function([lr], [], updates=updates, 1675 | on_unused_input='ignore', profile=profile) 1676 | 1677 | return f_grad_shared, f_update 1678 | 1679 | 1680 | def adadelta(lr, tparams, grads, inp, cost): 1681 | zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), 1682 | name='%s_grad' % k) 1683 | for k, p in tparams.iteritems()] 1684 | running_up2 = [theano.shared(p.get_value() * numpy.float32(0.), 1685 | name='%s_rup2' % k) 1686 | for k, p in tparams.iteritems()] 1687 | running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.), 1688 | name='%s_rgrad2' % k) 1689 | for k, p in tparams.iteritems()] 1690 | 1691 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 1692 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 1693 | for rg2, g in zip(running_grads2, grads)] 1694 | 1695 | f_grad_shared = theano.function(inp, cost, updates=zgup+rg2up, 1696 | profile=profile) 1697 | 1698 | updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg 1699 | for zg, ru2, rg2 in zip(zipped_grads, running_up2, 1700 | running_grads2)] 1701 | ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2)) 1702 | for ru2, ud in zip(running_up2, updir)] 1703 | param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)] 1704 | 1705 | f_update = theano.function([lr], [], updates=ru2up+param_up, 1706 | on_unused_input='ignore', profile=profile) 1707 | 1708 | return f_grad_shared, f_update 1709 | 1710 | 1711 | def rmsprop(lr, tparams, grads, inp, cost): 1712 | zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), 1713 | name='%s_grad' % k) 1714 | for k, p in tparams.iteritems()] 1715 | running_grads = [theano.shared(p.get_value() * numpy.float32(0.), 1716 | name='%s_rgrad' % k) 1717 | for k, p in tparams.iteritems()] 1718 | running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.), 1719 | name='%s_rgrad2' % k) 1720 | for k, p in tparams.iteritems()] 1721 | 1722 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 1723 | rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)] 1724 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 1725 | for rg2, g in zip(running_grads2, grads)] 1726 | 1727 | f_grad_shared = theano.function(inp, cost, updates=zgup+rgup+rg2up, 1728 | profile=profile) 1729 | 1730 | updir = [theano.shared(p.get_value() * numpy.float32(0.), 1731 | name='%s_updir' % k) 1732 | for k, p in tparams.iteritems()] 1733 | updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4)) 1734 | for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads, 1735 | running_grads2)] 1736 | param_up = [(p, p + udn[1]) 1737 | for p, udn in zip(itemlist(tparams), updir_new)] 1738 | f_update = theano.function([lr], [], updates=updir_new+param_up, 1739 | on_unused_input='ignore', profile=profile) 1740 | 1741 | return f_grad_shared, f_update 1742 | 1743 | 1744 | def train(dim_word=100, # word vector dimensionality 1745 | dim_chunk=50, 1746 | dim=1000, # the number of LSTM units 1747 | dim_chunk_hidden=2000, 1748 | encoder='gru', 1749 | decoder='gru_cond', 
1750 | patience=10, # early stopping patience 1751 | max_epochs=5000, 1752 | finish_after=10000000, # finish after this many updates 1753 | dispFreq=100, 1754 | decay_c=0., # L2 regularization penalty 1755 | alpha_c=0., # alignment regularization 1756 | clip_c=-1., # gradient clipping threshold 1757 | lrate=0.01, # learning rate 1758 | n_words_src=100000, # source vocabulary size 1759 | n_words=100000, # target vocabulary size 1760 | n_chunks=1000, # target vocabulary size 1761 | maxlen_chunk=10, # maximum length of the description 1762 | maxlen_chunk_words=50, # maximum length of the description 1763 | optimizer='rmsprop', 1764 | batch_size=16, 1765 | valid_batch_size=16, 1766 | saveto='model.npz', 1767 | validFreq=1000, 1768 | saveFreq=1000, # save the parameters after every saveFreq updates 1769 | sampleFreq=100, # generate some samples after every sampleFreq 1770 | datasets=[ 1771 | '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok', 1772 | '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok'], 1773 | valid_datasets=['../data/dev/newstest2011.en.tok', 1774 | '../data/dev/newstest2011.fr.tok'], 1775 | dictionaries=[ 1776 | '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl', 1777 | '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl'], 1778 | dictionary_chunk='/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl', 1779 | use_dropout=False, 1780 | reload_=False, 1781 | overwrite=False): 1782 | 1783 | # Model options 1784 | model_options = locals().copy() 1785 | 1786 | # load the dictionaries of both source and target 1787 | # load dictionaries and invert them 1788 | worddicts = [None] * len(dictionaries) 1789 | worddicts_r = [None] * len(dictionaries) 1790 | for ii, dd in enumerate(dictionaries): 1791 | with open(dd, 'rb') as f: 1792 | worddicts[ii] = pkl.load(f) 1793 | worddicts_r[ii] = dict() 1794 | for kk, vv in worddicts[ii].iteritems(): 1795 | worddicts_r[ii][vv] = kk 1796 | 1797 | # dict for chunk label 1798 | worddict_chunk = [None] 1799 | worddict_r_chunk = [None] 1800 | with open(dictionary_chunk, 'rb') as f: 1801 | worddict_chunk = pkl.load(f) 1802 | worddict_r_chunk = dict() 1803 | for kk, vv in worddict_chunk.iteritems(): 1804 | worddict_r_chunk[vv] = kk 1805 | model_options['n_chunks'] = len(worddict_chunk) 1806 | print 'chunk_dict size: ', model_options['n_chunks'] 1807 | print worddict_chunk 1808 | 1809 | # reload options 1810 | if reload_ and os.path.exists(saveto) and os.path.exists(saveto + '.pkl'): 1811 | print 'Reloading model options' 1812 | with open('%s.pkl' % saveto, 'rb') as f: 1813 | model_options = pkl.load(f) 1814 | 1815 | print 'Loading data' 1816 | 1817 | # begin to read by iterators 1818 | train = TrainingTextIterator(datasets[0], datasets[1], 1819 | dictionaries[0], dictionaries[1], dictionary_chunk, 1820 | n_words_source=n_words_src, n_words_target=n_words, 1821 | batch_size=batch_size, 1822 | max_chunk_len=maxlen_chunk, max_word_len=maxlen_chunk_words) 1823 | valid = TrainingTextIterator(valid_datasets[0], valid_datasets[1], 1824 | dictionaries[0], dictionaries[1], dictionary_chunk, 1825 | n_words_source=n_words_src, n_words_target=n_words, 1826 | batch_size=valid_batch_size, 1827 | max_chunk_len=maxlen_chunk, max_word_len=maxlen_chunk_words) 1828 | 1829 | print 'Building model' 1830 | 1831 | 1832 | # init all the parameters for model 1833 | params = init_params(model_options) 1834 | 1835 | 1836 | # reload parameters 1837 | if reload_ and os.path.exists(saveto): 1838 | print 'Reloading model parameters' 1839 | 
params = load_params(saveto, params) 1840 | 1841 | 1842 | tparams = init_tparams(params) 1843 | # modify the module of build model! 1844 | # especially the inputs and outputs 1845 | trng, use_noise, \ 1846 | x, x_mask, y_chunk, y_mask, y_cw, y_chunk_indicator, \ 1847 | opt_ret, \ 1848 | cost, cost_cw= \ 1849 | build_model(tparams, model_options) 1850 | 1851 | inps = [x, x_mask, y_chunk, y_mask, y_cw, y_chunk_indicator] 1852 | 1853 | print 'Building sampler' 1854 | f_init, f_next_chunk, f_next_word = build_sampler(tparams, model_options, trng, use_noise) 1855 | 1856 | # before any regularizer 1857 | print 'Building f_log_probs...', 1858 | f_log_probs = theano.function(inps, cost, profile=profile) 1859 | print 'Done' 1860 | 1861 | cost = cost.mean() 1862 | 1863 | # apply L2 regularization on weights 1864 | if decay_c > 0.: 1865 | decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') 1866 | weight_decay = 0. 1867 | for kk, vv in tparams.iteritems(): 1868 | weight_decay += (vv ** 2).sum() 1869 | weight_decay *= decay_c 1870 | cost += weight_decay 1871 | 1872 | # regularize the alpha weights 1873 | if alpha_c > 0. and not model_options['decoder'].endswith('simple'): 1874 | alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c') 1875 | alpha_reg = alpha_c * ( 1876 | (tensor.cast(y_mask.sum(0)//x_mask.sum(0), 'float32')[:, None] - 1877 | opt_ret['dec_alphas_chunk'].sum(0))**2).sum(1).mean() 1878 | alpha_reg += alpha_c * ( 1879 | (tensor.cast(y_mask.sum(0).sum(0)//x_mask.sum(0), 'float32')[:, None] - 1880 | opt_ret['dec_alphas_cw'].sum(0).sum(0))**2).sum(1).mean() 1881 | cost += alpha_reg 1882 | 1883 | # after all regularizers - compile the computational graph for cost 1884 | print 'Building f_cost...', 1885 | f_cost = theano.function(inps, cost, profile=profile) 1886 | print 'Done' 1887 | 1888 | print 'Computing gradient...', 1889 | grads = tensor.grad(cost, wrt=itemlist(tparams)) 1890 | print 'Done' 1891 | 1892 | # apply gradient clipping here 1893 | if clip_c > 0.: 1894 | g2 = 0. 1895 | for g in grads: 1896 | g2 += (g**2).sum() 1897 | new_grads = [] 1898 | for g in grads: 1899 | new_grads.append(tensor.switch(g2 > (clip_c**2), 1900 | g / tensor.sqrt(g2) * clip_c, 1901 | g)) 1902 | grads = new_grads 1903 | 1904 | # compile the optimizer, the actual computational graph is compiled here 1905 | lr = tensor.scalar(name='lr') 1906 | print 'Building optimizers...', 1907 | f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost) 1908 | print 'Done' 1909 | 1910 | print 'Optimization' 1911 | 1912 | best_p = None 1913 | bad_counter = 0 1914 | uidx = 0 1915 | estop = False 1916 | history_errs = [] 1917 | # reload history 1918 | if reload_ and os.path.exists(saveto): 1919 | rmodel = numpy.load(saveto) 1920 | history_errs = list(rmodel['history_errs']) 1921 | if 'uidx' in rmodel: 1922 | uidx = rmodel['uidx'] 1923 | 1924 | if validFreq == -1: 1925 | validFreq = len(train[0])/batch_size 1926 | if saveFreq == -1: 1927 | saveFreq = len(train[0])/batch_size 1928 | if sampleFreq == -1: 1929 | sampleFreq = len(train[0])/batch_size 1930 | 1931 | # print 'train length', len(train) 1932 | 1933 | for eidx in xrange(max_epochs): 1934 | n_samples = 0 1935 | 1936 | for x, y_chunk, y_cw in train: 1937 | n_samples += len(x) 1938 | uidx += 1 1939 | use_noise.set_value(1.) 
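            # Added note (hedged, not part of the original source): each minibatch is
            # padded below into x / x_mask (source word ids and mask), y_c (chunk-tag
            # sequence), y_cw (target words grouped into chunks), chunk_indicator
            # (0/1 chunk-boundary markers) and y_mask, the same six arrays exercised by
            # the small driver in codetest.py. A returned x of None signals a minibatch
            # that was entirely filtered out by the chunk / word length limits, in which
            # case the update counter is rolled back.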
1940 | 1941 | x, x_mask, y_c, y_cw, chunk_indicator, y_mask = prepare_training_data(x, y_chunk, y_cw, maxlen_chunk=maxlen_chunk, maxlen_cw=maxlen_chunk_words, 1942 | n_words_src=n_words_src, 1943 | n_words=n_words) 1944 | 1945 | if x is None: 1946 | print 'Minibatch with zero sample under chunk length ', maxlen_chunk, 'word length: ', maxlen_chunk_words 1947 | uidx -= 1 1948 | continue 1949 | 1950 | ud_start = time.time() 1951 | 1952 | 1953 | 1954 | # compute cost, grads and copy grads to shared variables 1955 | cost = f_grad_shared(x, x_mask, y_c, y_mask, y_cw, chunk_indicator) 1956 | 1957 | # print 'Epoch ', eidx, 'processed one batch' 1958 | 1959 | # do the update on parameters 1960 | f_update(lrate) 1961 | 1962 | ud = time.time() - ud_start 1963 | 1964 | # check for bad numbers, usually we remove non-finite elements 1965 | # and continue training - but not done here 1966 | if numpy.isnan(cost) or numpy.isinf(cost): 1967 | print 'NaN detected' 1968 | return 1., 1., 1. 1969 | 1970 | # verbose 1971 | if numpy.mod(uidx, dispFreq) == 0: 1972 | print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud 1973 | 1974 | # save the best model so far, in addition, save the latest model 1975 | # into a separate file with the iteration number for external eval 1976 | if numpy.mod(uidx, saveFreq) == 0: 1977 | print 'Saving the best model...', 1978 | if best_p is not None: 1979 | params = best_p 1980 | else: 1981 | params = unzip(tparams) 1982 | numpy.savez(saveto, history_errs=history_errs, uidx=uidx, **params) 1983 | pkl.dump(model_options, open('%s.pkl' % saveto, 'wb')) 1984 | print 'Done' 1985 | 1986 | # save with uidx 1987 | if not overwrite: 1988 | print 'Saving the model at iteration {}...'.format(uidx), 1989 | saveto_uidx = '{}.iter{}.npz'.format( 1990 | os.path.splitext(saveto)[0], uidx) 1991 | numpy.savez(saveto_uidx, history_errs=history_errs, 1992 | uidx=uidx, **unzip(tparams)) 1993 | print 'Done' 1994 | 1995 | 1996 | # generate some samples with the model and display them 1997 | if numpy.mod(uidx, sampleFreq) == 0: 1998 | # FIXME: random selection?
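                # Added note (hedged, not part of the original source): every sampleFreq
                # updates, up to five sentences from the current minibatch are re-decoded
                # with gen_sample in stochastic mode (k=1) and printed next to the source
                # and the chunked reference, with '|' plus the chunk tag marking each
                # predicted chunk boundary. Beam search would be invoked the same way,
                # e.g. (a sketch; arguments assumed from the gen_sample signature):
                #
                #   samples, scores = gen_sample(tparams, f_init, f_next_chunk, f_next_word,
                #                                x[:, jj][:, None], model_options, trng=trng,
                #                                k=5, maxlen=50, stochastic=False)
                #   lengths = numpy.array([len(s) for s in samples])
                #   best = samples[numpy.argmin(numpy.array(scores) / lengths)]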
1999 | for jj in xrange(numpy.minimum(5, x.shape[1])): 2000 | stochastic = True 2001 | sample, score = gen_sample(tparams, f_init, f_next_chunk, f_next_word, 2002 | x[:, jj][:, None], 2003 | model_options, trng=trng, k=1, 2004 | stochastic=stochastic, 2005 | argmax=False) 2006 | print 'Source ', jj, ': ', 2007 | for vv in x[:, jj]: 2008 | if vv == 0: 2009 | break 2010 | if vv in worddicts_r[0]: 2011 | print worddicts_r[0][vv], 2012 | else: 2013 | print 'UNK', 2014 | print 2015 | print 'Truth ', jj, ' : ', 2016 | ci = 0 2017 | # print y_chunk[: , jj] 2018 | for chunk_index, word_index in zip(y_c[:, jj], y_cw[:, jj]): 2019 | 2020 | if word_index == 0: 2021 | break 2022 | if chunk_index in worddict_r_chunk and chunk_index != 1: # not NULL 2023 | print '|', worddict_r_chunk[chunk_index], 2024 | if word_index in worddicts_r[1]: 2025 | print worddicts_r[1][word_index], 2026 | else: 2027 | print 'UNK', 2028 | ci += 1 2029 | print 2030 | print 'Sample ', jj, ': ', 2031 | if stochastic: 2032 | ss = sample 2033 | else: 2034 | score = score / numpy.array([len(s) for s in sample]) 2035 | ss = sample[score.argmin()] 2036 | for vv in ss: 2037 | if vv == 0: 2038 | continue 2039 | if vv < 0: 2040 | vv = vv * -1 2041 | # print vv, 2042 | print '|', worddict_r_chunk[vv], 2043 | continue 2044 | if vv in worddicts_r[1]: 2045 | print worddicts_r[1][vv], 2046 | else: 2047 | print 'UNK', 2048 | print 2049 | 2050 | # validate model on validation set and early stop if necessary 2051 | if numpy.mod(uidx, validFreq) == 0: 2052 | use_noise.set_value(0.) 2053 | valid_errs = pred_probs(f_log_probs, prepare_training_data, 2054 | model_options, valid) 2055 | valid_err = valid_errs.mean() 2056 | history_errs.append(valid_err) 2057 | 2058 | if uidx == 0 or valid_err <= numpy.array(history_errs).min(): 2059 | best_p = unzip(tparams) 2060 | bad_counter = 0 2061 | if len(history_errs) > patience and valid_err >= \ 2062 | numpy.array(history_errs)[:-patience].min(): 2063 | bad_counter += 1 2064 | if bad_counter > patience: 2065 | print 'Early Stop!' 2066 | estop = True 2067 | break 2068 | 2069 | if numpy.isnan(valid_err): 2070 | ipdb.set_trace() 2071 | 2072 | print 'Valid ', valid_err 2073 | 2074 | # finish after this many updates 2075 | if uidx >= finish_after: 2076 | print 'Finishing after %d iterations!' % uidx 2077 | estop = True 2078 | break 2079 | 2080 | print 'Seen %d samples' % n_samples 2081 | 2082 | if estop: 2083 | break 2084 | 2085 | if best_p is not None: 2086 | zipp(best_p, tparams) 2087 | 2088 | use_noise.set_value(0.) 
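    # Added note (hedged, not part of the original source): once training stops, either
    # through early stopping on the validation cost or the finish_after limit, the best
    # parameters seen so far are zipped back into the Theano shared variables, dropout
    # noise is switched off, and a final mean validation cost is reported before the
    # model is saved one last time together with history_errs and uidx.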
2089 | valid_err = pred_probs(f_log_probs, prepare_training_data, 2090 | model_options, valid).mean() 2091 | 2092 | print 'Valid ', valid_err 2093 | 2094 | params = copy.copy(best_p) 2095 | numpy.savez(saveto, zipped_params=best_p, 2096 | history_errs=history_errs, 2097 | uidx=uidx, 2098 | **params) 2099 | 2100 | return valid_err 2101 | 2102 | 2103 | def sgd(lr, tparams, grads, x, mask, y, cost): 2104 | gshared = [theano.shared(p.get_value() * 0., 2105 | name='%s_grad' % k) 2106 | for k, p in tparams.iteritems()] 2107 | gsup = [(gs, g) for gs, g in zip(gshared, grads)] 2108 | 2109 | f_grad_shared = theano.function([x, mask, y], cost, updates=gsup, 2110 | profile=profile) 2111 | 2112 | pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)] 2113 | f_update = theano.function([lr], [], updates=pup, profile=profile) 2114 | 2115 | return f_grad_shared, f_update 2116 | 2117 | 2118 | if __name__ == '__main__': 2119 | pass 2120 | -------------------------------------------------------------------------------- /beam_decode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -l nodes=1:ppn=24 3 | #PBS -l walltime=24:00:00 4 | #PBS -N session2/models/memory-set_default 5 | #PBS -A course 6 | #PBS -q ShortQ 7 | 8 | export THEANO_FLAGS=device=cpu,floatX=float32 9 | 10 | #cd $PBS_O_WORKDIR 11 | python ./translate_gpu.py -n \ 12 | ./model_hal.npz \ 13 | ./model_hal.npz.pkl \ 14 | ././../../nmtdata/small.ch.pkl \ 15 | ././../../nmtdata/small.en.chunked.chunktag.pkl \ 16 | ././../../nmtdata/small.test \ 17 | ./small.result 18 | -------------------------------------------------------------------------------- /chunk_nmt_train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -l nodes=1:ppn=20 3 | #PBS -l walltime=168:00:00 4 | #PBS -N session2_default 5 | #PBS -A course 6 | #PBS -q GpuQ 7 | 8 | #export THEANO_FLAGS=device=gpu0,optimizer=None,floatX=float32,exception_verbosity=high 9 | export THEANO_FLAGS=device=gpu2,floatX=float32 10 | 11 | python ./train_nmt_zh2en.py 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /clean.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo rm model* 4 | sudo rm nohup.out 5 | -------------------------------------------------------------------------------- /codetest.py: -------------------------------------------------------------------------------- 1 | __author__ = 'zhouh' 2 | 3 | 4 | 5 | from training_data_iterator import TrainingTextIterator 6 | from nmt import prepare_training_data 7 | 8 | 9 | # train = TrainingTextIterator('/home/zhouh/workspace/python/nmtdata/small.ch', 10 | # '/home/zhouh/workspace/python/nmtdata/small.en.chunked', 11 | # '/home/zhouh/workspace/python/nmtdata/small.ch.pkl', 12 | # '/home/zhouh/workspace/python/nmtdata/small.en.chunked.pkl', 13 | # '/home/zhouh/workspace/python/nmtdata/small.en.chunked.chunktag.pkl', 14 | # n_words_source=10000, n_words_target=10000, 15 | # batch_size=2, max_chunk_len=50, max_word_len=50) 16 | 17 | train = TrainingTextIterator('/home/zhouh/workspace/python/nmtdata/hms.ch.filter', 18 | '/home/zhouh/workspace/python/nmtdata/hms.en.filter.chunked', 19 | '/home/zhouh/workspace/python/nmtdata/hms.ch.filter.pkl', 20 | '/home/zhouh/workspace/python/nmtdata/hms.en.filter.chunked.pkl', 21 | '/home/zhouh/workspace/python/nmtdata/hms.en.filter.chunked.chunktag.pkl', 22 | 
n_words_source=10000, n_words_target=10000, 23 | batch_size=1, max_chunk_len=30, max_word_len=50) 24 | 25 | 26 | 27 | n = 0 28 | batch = 0 29 | for i in train: 30 | print batch 31 | batch += 1 32 | 33 | s = i[0] 34 | tc = i[1] 35 | tcw = i[2] 36 | 37 | print 's', s 38 | print 'tc', tc 39 | print 'tcw', tcw 40 | 41 | 42 | x, x_mask, y_c, y_cw, chunk_indicator, y_mask = prepare_training_data(s, tc, tcw, maxlen_chunk=10, maxlen_cw=50, 43 | n_words_src=1000, 44 | n_words=1000) 45 | print 'x', x 46 | print 'x_mask', x_mask 47 | print 'y_c', y_c 48 | print 'chunk_indicator', chunk_indicator 49 | print 'y_cw', y_cw 50 | print 'y_mask', y_mask 51 | 52 | 53 | 54 | 55 | print batch 56 | 57 | -------------------------------------------------------------------------------- /computeCost.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Translates a source file using a translation model. 3 | ''' 4 | import argparse 5 | 6 | import numpy 7 | import theano 8 | import cPickle as pkl 9 | 10 | from nmt import (build_model, pred_probs, load_params, 11 | init_params, init_tparams, prepare_training_data) 12 | 13 | from training_data_iterator import TrainingTextIterator 14 | 15 | 16 | 17 | def main(model, 18 | pklmodel, 19 | valid_datasets=['../data/dev/newstest2011.en.tok', 20 | '../data/dev/newstest2011.fr.tok'], 21 | dictionaries=[ 22 | '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl', 23 | '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl'], 24 | dictionary_chunk='/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl', 25 | result_file='./cost.result'): 26 | 27 | 28 | 29 | 30 | 31 | # load the dictionaries of both source and target 32 | # load dictionaries and invert them 33 | worddicts = [None] * len(dictionaries) 34 | worddicts_r = [None] * len(dictionaries) 35 | for ii, dd in enumerate(dictionaries): 36 | with open(dd, 'rb') as f: 37 | worddicts[ii] = pkl.load(f) 38 | worddicts_r[ii] = dict() 39 | for kk, vv in worddicts[ii].iteritems(): 40 | worddicts_r[ii][vv] = kk 41 | 42 | # dict for chunk label 43 | worddict_chunk = [None] 44 | worddict_r_chunk = [None] 45 | with open(dictionary_chunk, 'rb') as f: 46 | worddict_chunk = pkl.load(f) 47 | worddict_r_chunk = dict() 48 | for kk, vv in worddict_chunk.iteritems(): 49 | worddict_r_chunk[vv] = kk 50 | print worddict_chunk 51 | 52 | print 'load model model_options' 53 | with open('%s' % pklmodel, 'rb') as f: 54 | options = pkl.load(f) 55 | 56 | 57 | # build valid set 58 | valid = TrainingTextIterator(valid_datasets[0], valid_datasets[1], 59 | dictionaries[0], dictionaries[1], dictionary_chunk, 60 | n_words_source=options['n_words_src'], n_words_target=options['n_words'], 61 | batch_size=options['batch_size'], 62 | max_chunk_len=options['maxlen_chunk'], max_word_len=options['maxlen_chunk_words']) 63 | 64 | 65 | # allocate model parameters 66 | params = init_params(options) 67 | 68 | # load model parameters and set theano shared variables 69 | params = load_params(model, params) 70 | tparams = init_tparams(params) 71 | 72 | trng, use_noise, \ 73 | x, x_mask, y_chunk, y_mask, y_cw, y_chunk_indicator, \ 74 | opt_ret, \ 75 | cost, cost_cw= \ 76 | build_model(tparams, options) 77 | 78 | 79 | inps = [x, x_mask, y_chunk, y_mask, y_cw, y_chunk_indicator] 80 | 81 | 82 | 83 | # before any regularizer 84 | print 'Building f_log_probs...', 85 | f_log_probs = theano.function(inps, cost, profile=False) 86 | f_log_probs_cw = theano.function(inps, cost_cw, profile=False) 87 | print 'Done' 88 | 89 | 
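    # Added note (hedged, not part of the original source): this script reloads a trained
    # model and reports the mean per-sentence costs on a held-out set, split into the
    # chunk-level cost and the word-level cost, written to result_file. Note that the
    # call below passes two probability functions, whereas the pred_probs defined in
    # nmt.py above accepts a single f_log_probs; it presumably targets a variant of
    # pred_probs that evaluates both costs.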
valid_errs, valid_errs_cw = pred_probs(f_log_probs, f_log_probs_cw, prepare_training_data, 90 | options, valid) 91 | 92 | valid_err = valid_errs.mean() 93 | valid_err_cw = valid_errs_cw.mean() 94 | 95 | with open(result_file, 'w') as result_file: 96 | print >> result_file, valid_err, valid_err_cw 97 | 98 | 99 | 100 | if __name__ == "__main__": 101 | parser = argparse.ArgumentParser() 102 | parser.add_argument('model', type=str) 103 | parser.add_argument('pklmodel', type=str) 104 | parser.add_argument('dictionary', type=str) 105 | parser.add_argument('dictionary_target', type=str) 106 | parser.add_argument('dictionary_chunk', type=str) 107 | parser.add_argument('valid_source', type=str) 108 | parser.add_argument('valid_target', type=str) 109 | parser.add_argument('result_file', type=str) 110 | 111 | args = parser.parse_args() 112 | 113 | main(args.model, 114 | args.pklmodel, 115 | valid_datasets=[args.valid_source, args.valid_target], 116 | dictionaries=[args.dictionary, args.dictionary_target], 117 | dictionary_chunk=args.dictionary_chunk, 118 | result_file=args.result_file) 119 | -------------------------------------------------------------------------------- /cpu_train_chunk_nmt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -l nodes=1:ppn=20 3 | #PBS -l walltime=168:00:00 4 | #PBS -N session2_default 5 | #PBS -A course 6 | #PBS -q GpuQ 7 | 8 | export THEANO_FLAGS=device=cpu,optimizer=None,floatX=float32,exception_verbosity=high 9 | 10 | python ./train_nmt_zh2en_pc.py 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /data_iterator.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | import cPickle as pkl 4 | import gzip 5 | 6 | 7 | def fopen(filename, mode='r'): 8 | if filename.endswith('.gz'): 9 | return gzip.open(filename, mode) 10 | return open(filename, mode) 11 | 12 | 13 | class TextIterator: 14 | """Simple Bitext iterator.""" 15 | def __init__(self, source, target, 16 | source_dict, target_dict, 17 | batch_size=128, 18 | maxlen=100, 19 | n_words_source=-1, 20 | n_words_target=-1): 21 | self.source = fopen(source, 'r') 22 | self.target = fopen(target, 'r') 23 | with open(source_dict, 'rb') as f: 24 | self.source_dict = pkl.load(f) 25 | with open(target_dict, 'rb') as f: 26 | self.target_dict = pkl.load(f) 27 | 28 | self.batch_size = batch_size 29 | self.maxlen = maxlen 30 | 31 | self.n_words_source = n_words_source 32 | self.n_words_target = n_words_target 33 | 34 | self.source_buffer = [] 35 | self.target_buffer = [] 36 | self.k = batch_size * 20 37 | 38 | self.end_of_data = False 39 | 40 | def __iter__(self): 41 | return self 42 | 43 | def reset(self): 44 | self.source.seek(0) 45 | self.target.seek(0) 46 | 47 | def next(self): 48 | if self.end_of_data: 49 | self.end_of_data = False 50 | self.reset() 51 | raise StopIteration 52 | 53 | source = [] 54 | target = [] 55 | 56 | # fill buffer, if it's empty 57 | assert len(self.source_buffer) == len(self.target_buffer), 'Buffer size mismatch!' 
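        # Added note (hedged, not part of the original source): the iterator keeps a
        # buffer of k = batch_size * 20 sentence pairs, sorts it by target length so
        # that minibatches contain similarly long sentences (less padding), and then
        # pops batch_size pairs per call, mapping words to ids and replacing
        # out-of-vocabulary items with index 1. A minimal usage sketch (file names
        # are placeholders):
        #
        #   it = TextIterator('train.src', 'train.tgt', 'src.vocab.pkl',
        #                     'tgt.vocab.pkl', batch_size=128, maxlen=100)
        #   for source_batch, target_batch in it:   # lists of word-id lists
        #       pass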
58 | 59 | if len(self.source_buffer) == 0: 60 | for k_ in xrange(self.k): 61 | ss = self.source.readline() 62 | if ss == "": 63 | break 64 | tt = self.target.readline() 65 | if tt == "": 66 | break 67 | 68 | self.source_buffer.append(ss.strip().split()) 69 | self.target_buffer.append(tt.strip().split()) 70 | 71 | # sort by target buffer 72 | tlen = numpy.array([len(t) for t in self.target_buffer]) 73 | tidx = tlen.argsort() 74 | 75 | _sbuf = [self.source_buffer[i] for i in tidx] 76 | _tbuf = [self.target_buffer[i] for i in tidx] 77 | 78 | self.source_buffer = _sbuf 79 | self.target_buffer = _tbuf 80 | 81 | if len(self.source_buffer) == 0 or len(self.target_buffer) == 0: 82 | self.end_of_data = False 83 | self.reset() 84 | raise StopIteration 85 | 86 | try: 87 | 88 | # actual work here 89 | while True: 90 | 91 | # read from source file and map to word index 92 | try: 93 | ss = self.source_buffer.pop() 94 | except IndexError: 95 | break 96 | ss = [self.source_dict[w] if w in self.source_dict else 1 97 | for w in ss] 98 | if self.n_words_source > 0: 99 | ss = [w if w < self.n_words_source else 1 for w in ss] 100 | 101 | # read from source file and map to word index 102 | tt = self.target_buffer.pop() 103 | tt = [self.target_dict[w] if w in self.target_dict else 1 104 | for w in tt] 105 | if self.n_words_target > 0: 106 | tt = [w if w < self.n_words_target else 1 for w in tt] 107 | 108 | if len(ss) > self.maxlen and len(tt) > self.maxlen: 109 | continue 110 | 111 | source.append(ss) 112 | target.append(tt) 113 | 114 | if len(source) >= self.batch_size or \ 115 | len(target) >= self.batch_size: 116 | break 117 | except IOError: 118 | self.end_of_data = True 119 | 120 | if len(source) <= 0 or len(target) <= 0: 121 | self.end_of_data = False 122 | self.reset() 123 | raise StopIteration 124 | 125 | return source, target 126 | -------------------------------------------------------------------------------- /haos.bib: -------------------------------------------------------------------------------- 1 | @InProceedings{chen2022mtg, 2 | author = {Chen, Yiran and Song, Zhenqiao and Wu, Xianze and Wang, Danqing and Xu, Jingjing and Chen, Jiaze and Zhou, Hao and Li, Lei}, 3 | booktitle = {Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT Findings)}, 4 | title = {{MTG}: A Benchmark Suite for Multilingual Text Generation}, 5 | year = {2022}, 6 | month = jul, 7 | abstract = {We introduce MTG, a new benchmark suite for training and evaluating multilingual text generation. It is the first and largest multilingual multiway text generation benchmark with 400k human-annotated data for four generation tasks (story generation, question generation, title generation and text summarization) across five languages (English, German, French, Spanish and Chinese). Its multiway characteristic makes it possible to achieve direct cross-lingual generation between any two languages, thus facilitating knowledge transfer. Based on MTG, we set various evaluation scenarios and conduct deep analyses of several popular multilingual generation models from different aspects. 
Our benchmark suite can foster model performance enhancement with more human-annotated parallel data and encourage model evaluation with more diverse generation scenarios.}, 8 | eprint = {https://arxiv.org/abs/2108.07140}, 9 | author+an = {7=highlight} 10 | } 11 | @InProceedings{bao2022latent, 12 | author = {Yu Bao and Hao Zhou and Shujian Huang and Dongqi Wang and Lihua Qian and Xinyu Dai and Jiajun Chen and Lei Li}, 13 | booktitle = {the 60th Annual Meeting of the Association for Computational Linguistics (ACL)}, 14 | title = {latent-{GLAT}: Glancing at Latent Variables for Parallel Text Generation}, 15 | year = {2022}, 16 | month = may, 17 | abstract = {Recently, parallel text generation has received widespread attention due to its success in generation efficiency. Although many advanced techniques are proposed to improve its generation quality, they still need the help of an autoregressive model for training to overcome the one-to-many multi-modal phenomenon in the dataset, limiting their applications. In this paper, we propose latent-GLAT, which employs the discrete latent variables to capture word categorical information and invoke an advanced curriculum learning technique, alleviating the multi-modality problem. Experiment results show that our method outperforms strong baselines without the help of an autoregressive model, which further broadens the application scenarios of the parallel decoding paradigm.}, 18 | code = {https://github.com/baoy-nlp/Latent-GLAT}, 19 | eprint = {https://openreview.net/forum?id=y4xCe0MSoWx}, 20 | author+an = {1=student; 2=highlight} 21 | } 22 | @InProceedings{fu2022contextual, 23 | author = {Zhiyi Fu and Wangchunshu Zhou and Jingjing Xu and Hao Zhou and Lei Li}, 24 | booktitle = {the 60th Annual Meeting of the Association for Computational Linguistics (ACL)}, 25 | title = {Contextual Representation Learning beyond Masked Language Modeling}, 26 | year = {2022}, 27 | month = may, 28 | abstract = {How do masked language models (MLMs) such as BERT learn contextual representations? In this work, we analyze the learning dynamics of MLMs. We find that MLMs adopt sampled embeddings as anchors to estimate and inject contextual semantics to representations, which limits the efficiency and effectiveness of MLMs. To address these issues, we propose TACO, a simple yet effective representation learning approach to directly model global semantics. TACO extracts and aligns contextual semantics hidden in contextualized representations to encourage models to attend global semantics when generating contextualized representations. Experiments on the GLUE benchmark show that TACO achieves up to 5x speedup and up to 1.2 points average improvement over existing MLMs.}, 29 | code = {https:// github.com/FUZHIYI/TACO}, 30 | eprint = {https://openreview.net/forum?id=KWL_ElhUejN}, 31 | author+an = {4=highlight} 32 | } 33 | @InProceedings{chen2022e, 34 | author = {Jiangjie Chen and Rui Xu and Ziquan Fu and Wei Shi and Zhongqiao Li and Xinbo Zhang and Changzhi Sun and Lei Li and Yanghua Xiao and Hao Zhou}, 35 | booktitle = {the 60th Annual Meeting of the Association for Computational Linguistics (ACL) - Findings}, 36 | title = {{E-KAR}: A Benchmark for Rationalizing Natural Language Analogical Reasoning}, 37 | year = {2022}, 38 | month = may, 39 | abstract = {The ability to recognize analogies is fundamental to human cognition. Existing benchmarks to test word analogy do not reveal the underneath process of analogical reasoning of neural models. 
Holding the belief that models capable of reasoning should be right for the right reasons, we propose a first-of-its- kind Explainable Knowledge-intensive Analogical Reasoning benchmark (E-KAR). Our benchmark consists of 1,655 (in Chinese) and 1,251 (in English) problems sourced from the Civil Service Exams, which require intensive background knowledge to solve. More importantly, we design a free-text explanation scheme to explain whether an analogy should be drawn, and manually annotate them for each and every question and candidate answer. Empirical results suggest that this benchmark is very challenging for some state-of-the-art models for both explanation generation and analogical question answering tasks, which invites further research in this area. Project page of E-KAR can be found at https:// ekar-leaderboard.github.io.}, 40 | eprint = {https://openreview.net/forum?id=9kXOFRtrEj}, 41 | url = {https://ekar-leaderboard.github.io}, 42 | author+an = {1=student; 10=highlight;} 43 | } 44 | @InProceedings{sun2022rethinking, 45 | author = {Zewei Sun and Mingxuan Wang and Hao Zhou and Chengqi Zhao and Shujian Huang and Jiajun Chen and Lei Li}, 46 | booktitle = {the 60th Annual Meeting of the Association for Computational Linguistics (ACL) - Findings}, 47 | title = {Rethinking Document-level Neural Machine Translation}, 48 | year = {2022}, 49 | month = may, 50 | abstract = {This paper does not aim at introducing a novel model for document-level neural machine translation. Instead, we head back to the original Transformer model and hope to answer the following question: Is the capacity of current models strong enough for document-level translation? Interestingly, we observe that the original Transformer with appropriate training techniques can achieve strong results for document translation, even with a length of 2000 words. We evaluate this model and several recent approaches on nine document-level datasets and two sentence-level datasets across six languages. Experiments show that document-level Transformer models outperforms sentence-level ones and many previous methods in a comprehensive set of metrics, including BLEU, four lexical indices, three newly proposed assistant linguistic indicators, and human evaluation. Our new datasets and evaluation scripts are in https://github. com/sunzewei2715/Doc2Doc_NMT.}, 51 | code = {https://github. 
com/sunzewei2715/Doc2Doc_NMT}, 52 | eprint = {https://openreview.net/forum?id=sU9fYzNZ3xX}, 53 | author+an = {3=highlight} 54 | } 55 | @InProceedings{song2022switch, 56 | author = {Zhenqiao Song and Hao Zhou and Lihua Qian and Jingjing Xu and Shanbo Cheng and Mingxuan Wang and Lei Li}, 57 | booktitle = {International Conference on Learning Representations (ICLR)}, 58 | title = {{switch-GLAT}: Multilingual Parallel Machine Translation via Code-switch Decoder}, 59 | year = {2022}, 60 | month = apr, 61 | eprint = {https://openreview.net/forum?id=5HvpvYd68b}, 62 | author+an = {1=student; 2=highlight; 3=student} 63 | } 64 | @InProceedings{yang2022enhancing, 65 | author = {Huiyun Yang and Huadong Chen and Hao Zhou and Lei Li}, 66 | booktitle = {International Conference on Learning Representations (ICLR)}, 67 | title = {Enhancing Cross-lingual Transfer by Manifold Mixup}, 68 | year = {2022}, 69 | month = apr, 70 | eprint = {https://openreview.net/forum?id=OjPmfr9GkVv}, 71 | author+an = {1=student; 3=highlight} 72 | } 73 | @InProceedings{huang2022non, 74 | author = {Chenyang Huang and Hao Zhou and Osmar Zaiane and Lili Mou and Lei Li}, 75 | booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)}, 76 | title = {Non-Autoregressive Translation with Layer-Wise Prediction and Deep Supervision}, 77 | year = {2022}, 78 | month = feb, 79 | abstract = {How do we perform efficient inference while retaining high translation quality? Existing neural machine translation models, such as Transformer, achieve high performance, but they decode words one by one, which is inefficient. Recent non-autoregressive translation models speed up the inference, but their quality is still inferior. In this work, we propose DSLP, a highly efficient and high-performance model for machine translation. The key insight is to train a non-autoregressive Transformer with Deep Supervision and feed additional Layer-wise Predictions. We conducted extensive experiments on four translation tasks (both directions of WMT'14 EN-DE and WMT'16 EN-RO). Results show that our approach consistently improves the BLEU scores compared with respective base models. 
Specifically, our best variant outperforms the autoregressive model on three translation tasks, while being 14.8 times more efficient in inference.}, 80 | eprint = {https://arxiv.org/abs/2110.07515}, 81 | author+an = {1=student; 2=highlight} 82 | } 83 | @InProceedings{chen-gan2022aaai, 84 | title={Unsupervised Editing for Counterfactual Stories}, 85 | author={Chen, Jiangjie and Gan, Chun and Chen, Sijie and Zhou, Hao and Xiao, Yanghua and Li, Lei}, 86 | year={2022}, 87 | booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)}, 88 | author+an = {1=student; 2=student; 4=highlight} 89 | } 90 | 91 | @InProceedings{chen2022loren, 92 | title={LOREN: Logic-Regularized Reasoning for Interpretable Fact Verification}, 93 | author={Chen, Jiangjie and Bao, Qiaoben and Sun, Changzhi and Zhang, Xinbo and Chen, Jiaze and Zhou, Hao and Xiao, Yanghua and Li, Lei}, 94 | year={2022}, 95 | booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)}, 96 | author+an = {1=student; 6=highlight} 97 | } 98 | @InProceedings{zheng2021duplex, 99 | author = {Zaixiang Zheng and Hao Zhou and Shujian Huang and Jiajun Chen and Jingjing Xu and Lei Li}, 100 | booktitle = {the 35th Conference on Neural Information Processing Systems (NeurIPS)}, 101 | title = {Duplex Sequence-to-Sequence Learning for Reversible Machine Translation}, 102 | year = {2021}, 103 | month = dec, 104 | abstract = {In this work, we design a simple, direct, and fast framework for instance segmentation with strong performance. To this end, we propose a novel and effective approach, termed SOLOv2, following the principle of the SOLO method. First, our new framework is empowered by an efficient and holistic instance mask representation scheme, which dynamically segments each instance in the image, without resorting to bounding box detection. Specifically, the object mask generation is decoupled into a mask kernel prediction and mask feature learning, which are responsible for generating convolution kernels and the feature maps to be convolved with, respectively. Second, SOLOv2 significantly reduces inference overhead with our novel matrix non-maximum suppression (NMS) technique. Our Matrix NMS performs NMS with parallel matrix operations in one shot, and yields better results. We demonstrate that the proposed SOLOv2 achieves the state-of-the- art performance with high efficiency, making it suitable for both mobile and cloud applications. A light-weight version of SOLOv2 executes at 31.3 FPS and yields 37.1\% AP on COCO test-dev. Moreover, our state-of-the-art results in object detection (from our mask byproduct) and panoptic segmentation show the potential of SOLOv2 to serve as a new strong baseline for many instance-level recognition tasks.}, 105 | eprint = {https://arxiv.org/abs/2105.03458}, 106 | author+an = {1=student; 2=highlight} 107 | } 108 | @InProceedings{qian2021volctrans, 109 | author = {Lihua Qian and Yi Zhou and Zaixiang Zheng and Yaoming Zhu and Zehui Lin and Jiangtao Feng and Shanbo Cheng and Lei Li and Mingxuan Wang and Hao Zhou}, 110 | booktitle = {Sixth Conference on Machine Translation (WMT21)}, 111 | title = {The {Volctrans} {GLAT} System: Non-autoregressive Translation Meets {WMT21}}, 112 | year = {2021}, 113 | month = nov, 114 | abstract = {This paper describes the Volctrans' submission to the WMT21 news translation shared task for German->English translation. 
We build a parallel (i.e., non-autoregressive) translation system using the Glancing Transformer, which enables fast and accurate parallel decoding in contrast to the currently prevailing autoregressive models. To the best of our knowledge, this is the first parallel translation system that can be scaled to such a practical scenario like WMT competition. More importantly, our parallel translation system achieves the best BLEU score (35.0) on German->English translation task, outperforming all strong autoregressive counterparts.}, 115 | entrysubtype = {workshop}, 116 | eprint = {https://arxiv.org/abs/2109.11247}, 117 | author+an = {1=student; 2=student; 3=student; 10=highlight} 118 | } 119 | @InProceedings{ru2021learning, 120 | author = {Dongyu Ru and Changzhi Sun and Jiangtao Feng and Lin Qiu and Hao Zhou and Weinan Zhang and Yong Yu and Lei Li}, 121 | booktitle = {the Conference on Empirical Methods in Natural Language Processing (EMNLP)}, 122 | title = {Learning Logic Rules for Document-level Relation Extraction}, 123 | year = {2021}, 124 | month = nov, 125 | abstract = {Document-level relation extraction aims to identify relations between entities in a whole document. Prior efforts to capture long-range dependencies have relied heavily on implicitly powerful representations learned through (graph) neural networks, which makes the model less transparent. To tackle this challenge, in this paper, we propose LogiRE, a novel probabilistic model for document-level relation extraction by learning logic rules. LogiRE treats logic rules as latent variables and consists of two modules: a rule generator and a relation extractor. The rule generator is to generate logic rules potentially contributing to final predictions, and the relation extractor outputs final predictions based on the generated logic rules. Those two modules can be efficiently optimized with the expectation--maximization (EM) algorithm. By introducing logic rules into neural networks, LogiRE can explicitly capture long-range dependencies as well as enjoy better interpretation. Empirical results show that LogiRE significantly outperforms several strong baselines in terms of relation performance (∼1.8 F1 score) and logical consistency (over 3.3 logic score). Our code is available at https://github. com/rudongyu/LogiRE.}, 126 | code = {https://github.com/rudongyu/LogiRE}, 127 | video = {https://underline.io/lecture/38055-learning-logic-rules-for-document-level-relation-extraction}, 128 | author+an = {5=highlight} 129 | } 130 | @InProceedings{wang2021cnewsum, 131 | author = {Danqing Wang and Jiaze Chen and Xianze Wu and Hao Zhou and Lei Li}, 132 | booktitle = {The 10th CCF International Conference on Natural Language Processing and Chinese Computing (NLPCC)}, 133 | title = {{CNewSum}: A Large-scale Chinese News Summarization Dataset with Human-annotated Adequacy and Deducibility Level}, 134 | year = {2021}, 135 | address = {Qingdao, China}, 136 | month = oct, 137 | abstract = {Automatic text summarization aims to produce a brief but crucial summary for the input documents. Both extractive and abstractive methods have witnessed great success in English datasets in recent years. However, there has been a minimal exploration of text summarization in Chinese, limited by the lack of large-scale datasets. In this paper, we present a large-scale Chinese news summarization dataset CNewSum, which consists of 304,307 documents and human-written summaries for the news feed. 
It has long documents with high-abstractive summaries, which can encourage document-level understanding and generation for current summarization models. An additional distinguishing feature of CNewSum is that its test set contains adequacy and deducibility annotations for the summaries. The adequacy level measures the degree of summary information covered by the document, and the deducibility indicates the reasoning ability the model needs to generate the summary. These annotations can help researchers analyze and target their model performance bottleneck. We examine recent methods on CNewSum and release our dataset to provide a solid testbed for automatic Chinese summarization research.}, 138 | eprint = {https://arxiv.org/abs/2110.10874}, 139 | url = {https://dqwang122.github.io/projects/CNewSum/}, 140 | author+an = {4=highlight} 141 | } 142 | @InProceedings{shi-song2021ecml, 143 | author = {Wenxian Shi and Yuxuan Song and Bohan Li and Hao Zhou and Lei Li}, 144 | booktitle = {the European Conference on Machine Learning and Principles and Practice of Knowledge Discovery in Databases (ECML-PKDD)}, 145 | title = {Follow Your Path: a Progressive Method for Knowledge Distillation}, 146 | year = {2021}, 147 | month = jul, 148 | author+an = {1=student; 2=student; 3=student; 4=highlight} 149 | } 150 | @InProceedings{qian2021acl, 151 | author = {Lihua Qian and Hao Zhou and Yu Bao and Mingxuan Wang and Lin Qiu and Weinan Zhang and Yong Yu and Lei Li}, 152 | booktitle = {the 59th Annual Meeting of the Association for Computational Linguistics (ACL)}, 153 | title = {Glancing Transformer for Non-Autoregressive Neural Machine Translation}, 154 | year = {2021}, 155 | month = jul, 156 | author+an = {1=student; 2=highlight; 3=student} 157 | } 158 | @InProceedings{xu2021acl, 159 | author = {Jingjing Xu and Hao Zhou and Chun Gan and Zaixiang Zheng and Lei Li}, 160 | booktitle = {the 59th Annual Meeting of the Association for Computational Linguistics (ACL) - Best Paper Award}, 161 | title = {Vocabularization via Optimal Transport for Neural Machine Translation}, 162 | year = {2021}, 163 | month = jul, 164 | author+an = {1=student; 2=highlight; 3=student; 4=student} 165 | } 166 | @InProceedings{wang2021acl, 167 | author = {Yijun Wang and Changzhi Sun and Yuanbin Wu and Hao Zhou and Lei Li and Junchi Yan}, 168 | booktitle = {the 59th Annual Meeting of the Association for Computational Linguistics (ACL)}, 169 | title = {A Unified Label Space for Entity Relation Extraction}, 170 | year = {2021}, 171 | month = jul, 172 | author+an = {4=highlight} 173 | } 174 | @InProceedings{sunzhang2021acl, 175 | author = {Changzhi Sun and Xinbo Zhang and Jiangjie Chen and Chun Gan and Yuanbin Wu and Jiaze Chen and Hao Zhou and Lei Li}, 176 | booktitle = {the 59th Annual Meeting of the Association for Computational Linguistics (ACL) - Finding}, 177 | title = {Probabilistic Graph Reasoning for Natural Proof Generation}, 178 | year = {2021}, 179 | month = jul, 180 | author+an = {7=highlight} 181 | } 182 | @InProceedings{wangdq2021acl, 183 | author = {Danqing Wang and Jiaze Chen and Hao Zhou and Xipeng Qiu and Lei Li}, 184 | booktitle = {the 59th Annual Meeting of the Association for Computational Linguistics (ACL) - Finding}, 185 | title = {Contrastive Aligned Joint Learning for Multilingual Summarization}, 186 | year = {2021}, 187 | month = jul, 188 | author+an = {3=highlight} 189 | } 190 | @InProceedings{wang2021enpar, 191 | author = {Yijun Wang and Changzhi Sun and Yuanbin Wu and Hao Zhou and Lei Li and Junchi Yan}, 192 | 
booktitle = {Proceedings of European Chapter of the Association for Computational Linguistics (EACL)}, 193 | title = {{ENPAR}: Enhancing Entity and Entity Pair Representations for Joint Entity Relation Extraction}, 194 | year = {2021}, 195 | month = apr, 196 | author+an = {4=highlight} 197 | } 198 | @InProceedings{yutongICLR, 199 | author = {Yutong Xie and Chence Shi and Hao Zhou and Yuwei Yang and Weinan Zhang and Yong Yu and Lei Li}, 200 | booktitle = {International Conference on Learning Representations (ICLR) - Spotlight}, 201 | title = {MARS: Markov Molecular Sampling for Multi-objective Drug Discovery}, 202 | year = {2021}, 203 | month = mar, 204 | author+an = {1=student; 2=student; 3=highlight} 205 | } 206 | @InProceedings{huang2021acmo, 207 | author = {Xunpeng Huang and Runxin Xu and Hao Zhou and Zhe Wang and Zhengyang Liu and Lei Li}, 208 | booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)}, 209 | title = {ACMo: Angle-Calibrated Moment Methods for Stochastic Optimization}, 210 | year = {2021}, 211 | month = feb, 212 | author+an = {1=student; 2=student; 3=highlight} 213 | } 214 | @InProceedings{dong2021listen, 215 | author = {Qianqian Dong and Rong Ye and Mingxuan Wang and Hao Zhou and Shuang Xu and Bo Xu and Lei Li}, 216 | booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)}, 217 | title = {Listen, Understand and Translate: Triple Supervision Decouples End-to-end Speech-to-text Translation}, 218 | year = {2021}, 219 | month = feb, 220 | author+an = {4=highlight} 221 | } 222 | @InProceedings{dong2021consecutive, 223 | author = {Qianqian Dong and Mingxuan Wang and Hao Zhou and Shuang Xu and Bo Xu and Lei Li}, 224 | booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)}, 225 | title = {Consecutive Decoding for Speech-to-text Translation}, 226 | year = {2021}, 227 | month = feb, 228 | author+an = {3=highlight} 229 | } 230 | @InProceedings{song2021triangular, 231 | author = {Zhenqiao Song and Jiaze Chen and Hao Zhou and Lei Li}, 232 | booktitle = {Proceedings of the 14th International Conference on Web Search and Data Mining (WSDM)}, 233 | title = {Triangular Bidword Generation for Sponsored Search Auction}, 234 | year = {2021}, 235 | author+an = {1=student; 3=highlight} 236 | } 237 | @InProceedings{li2020sentence, 238 | author = {Bohan Li and Hao Zhou and Junxian He and Mingxuan Wang and Yiming Yang and Lei Li}, 239 | booktitle = {the Conference on Empirical Methods in Natural Language Processing (EMNLP)}, 240 | title = {On the Sentence Embeddings from Pre-trained Language Models}, 241 | year = {2020}, 242 | month = nov, 243 | author+an = {1=student; 2=highlight} 244 | } 245 | @InProceedings{lin2020pre, 246 | author = {Zehui Lin and Xiao Pan and Mingxuan Wang and Xipeng Qiu and Jiangtao Feng and Hao Zhou and Lei Li}, 247 | booktitle = {the Conference on Empirical Methods in Natural Language Processing (EMNLP)}, 248 | title = {Pre-training Multilingual Neural Machine Translation by Leveraging Alignment Information}, 249 | year = {2020}, 250 | month = nov, 251 | author+an = {6=highlight} 252 | } 253 | @InProceedings{ru2020active, 254 | author = {Dongyu Ru and Jiangtao Feng and Lin Qiu and Hao Zhou and Mngxuan Wang and Weinan Zhang and Yong Yu and Lei Li}, 255 | booktitle = {the Conference on Empirical Methods in Natural Language Processing (EMNLP) - Findings}, 256 | title = {Active Sentence Learning by Adversarial Uncertainty Sampling in Discrete Space}, 257 | year = {2020}, 258 | month = nov, 259 
| author+an = {1=student; 1=student; 3=student; 4=highlight} 260 | } 261 | @InProceedings{shi2020dispersed, 262 | author = {Wenxian Shi and Hao Zhou and Ning Miao and Lei Li}, 263 | booktitle = {Proceedings of the 37th International Conference on Machine learning (ICML)}, 264 | title = {Dispersing Exponential Family Mixture {VAE}s for Interpretable Text Generation}, 265 | year = {2020}, 266 | month = jul, 267 | author+an = {1=student; 2=highlight}, 268 | } 269 | @InProceedings{ru2020quachie, 270 | author = {Dongyu Ru and Zhenghui Wang and Lin Qiu and Hao Zhou and Lei Li and Weinan Zhang and Yong Yu}, 271 | booktitle = {the 43rd International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR) - System Demonstrations}, 272 | title = {{QuAChIE}: Question Answering based {Chinese} Information Extraction System}, 273 | year = {2020}, 274 | month = jul, 275 | entrysubtype = {demo}, 276 | author+an = {1=student; 2=student; 3=student; 4=highlight} 277 | } 278 | @InProceedings{miao2020do, 279 | author = {Ning Miao and Yuxuan Song and Hao Zhou and Lei Li}, 280 | booktitle = {the 58th Annual Meeting of the Association for Computational Linguistics (ACL) - short papers}, 281 | title = {Do you have the right scissors? Tailoring Pre-trained Language Models via {Monte}-{Carlo} Methods}, 282 | year = {2020}, 283 | month = jul, 284 | author+an = {1=student; 2=student; 3=highlight} 285 | } 286 | @inproceedings{liu-etal-2020-unsupervised, 287 | title = "Unsupervised Paraphrasing by Simulated Annealing", 288 | author = "Liu, Xianggen and 289 | Mou, Lili and 290 | Meng, Fandong and 291 | Zhou, Hao and 292 | Zhou, Jie and 293 | Song, Sen", 294 | booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics (ACL)", 295 | month = jul, 296 | year = "2020", 297 | author+an = {1=student; 4=highlight} 298 | } 299 | @InProceedings{xu2020xiaomingbot, 300 | author = {Runxin Xu and Jun Cao and Mingxuan Wang and Jiaze Chen and Hao Zhou and Ying Zeng and Yuping Wang and Li Chen and Xiang Yin and Xijin Zhang and Songcheng Jiang and Yuxuan Wang and Lei Li}, 301 | booktitle = {the 58th Annual Meeting of the Association for Computational Linguistics (ACL) - System Demonstrations}, 302 | title = {Xiaomingbot: A Multilingual Robot News Reporter}, 303 | year = {2020}, 304 | month = jul, 305 | author+an = {1=student; 5=highlight} 306 | } 307 | @InProceedings{song2020improving, 308 | author = {Yuxuan Song and Ning Miao and Hao Zhou and Lantao Yu and Mingxuan Wang and Lei Li}, 309 | booktitle = {The 23rd International Conference on Artificial Intelligence and Statistics (AISTATS)}, 310 | title = {Improving Maximum Likelihood Training for Text Generation with Density Ratio Estimation}, 311 | year = {2020}, 312 | month = aug, 313 | author+an = {1=student; 2=student; 3=highlight} 314 | } 315 | @InProceedings{ye2020variational, 316 | author = {Rong Ye and Wenxian Shi and Hao Zhou and Zhongyu Wei and Lei Li}, 317 | booktitle = {International Conference on Learning Representations (ICLR)}, 318 | title = {Variational Template Machine for Data-to-Text Generation}, 319 | year = {2020}, 320 | month = apr, 321 | author+an = {1=student; 2=student; 3=highlight} 322 | } 323 | @InProceedings{zheng2020mirror, 324 | author = {Zaixiang Zheng and Hao Zhou and Shujian Huang and Lei Li and Xinyu Dai and Jiajun Chen}, 325 | booktitle = {International Conference on Learning Representations (ICLR) - Oral}, 326 | title = {Mirror Generative Models for Neural Machine Translation}, 327 | 
year = {2020}, 328 | month = apr, 329 | author+an = {1=student; 2=highlight} 330 | } 331 | @InProceedings{song2020infomax, 332 | title={Infomax Neural Joint Source-Channel Coding via Adversarial Bit Flip}, 333 | author={Song, Yuxuan and Xu, Minkai and Yu, Lantao and Zhou, Hao and Shao, Shuo and Yu, Yong}, 334 | booktitle = {the 34th {AAAI} Conference on Artificial Intelligence ({AAAI})}, 335 | year={2020}, 336 | month = feb, 337 | author+an = {1=student; 2=student; 4=highlight} 338 | } 339 | @InProceedings{yang2020towards, 340 | author = {Jiacheng Yang and Mingxuan Wang and Hao Zhou and Chengqi Zhao and Weinan Zhang and Yong Yu and Lei Li}, 341 | booktitle = {the 34th {AAAI} Conference on Artificial Intelligence ({AAAI})}, 342 | title = {Towards Making the Most of {BERT} in Neural Machine Translation}, 343 | year = {2020}, 344 | month = feb, 345 | author+an = {3=highlight} 346 | 347 | } 348 | @InProceedings{wu2020importance, 349 | author = {Qingyang Wu and Lei Li and Hao Zhou and Ying Zeng and Zhou Yu}, 350 | booktitle = {the 34th {AAAI} Conference on Artificial Intelligence (AAAI)}, 351 | title = {Importance-Aware Learning for Neural Headline Editing}, 352 | year = {2020}, 353 | month = feb, 354 | author+an = {1=student; 3=highlight} 355 | } 356 | @InProceedings{fu2019rethinking, 357 | author = {Fu, Yao and Zhou, Hao and Chen, Jiaze and Li, Lei}, 358 | booktitle = {the 12th International Conference on Natural Language Generation (INLG)}, 359 | title = {Rethinking Text Attribute Transfer: A Lexical Analysis}, 360 | year = {2019}, 361 | month = oct, 362 | author+an = {1=student; 2=highlight} 363 | } 364 | @InProceedings{miao2019kernelized, 365 | author = {Miao, Ning and Zhou, Hao and Zhao, Chengqi and Shi, Wenxian and Li, Lei}, 366 | booktitle = {the 33rd Conference on Neural Information Processing Systems (NeurIPS)}, 367 | title = {Kernelized {Bayesian} Softmax for Text Generation}, 368 | year = {2019}, 369 | month = dec, 370 | author+an = {1=student; 2=highlight} 371 | } 372 | @inproceedings{qiu-etal-2019-dynamically, 373 | title = "Dynamically Fused Graph Network for Multi-hop Reasoning", 374 | author = "Qiu, Lin and 375 | Xiao, Yunxuan and 376 | Qu, Yanru and 377 | Zhou, Hao and 378 | Li, Lei and 379 | Zhang, Weinan and 380 | Yu, Yong", 381 | booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics (ACL)", 382 | month = jul, 383 | year = "2019", 384 | author+an = {1=student; 2=student; 3=student; 4=highlight} 385 | } 386 | @inproceedings{zhang-etal-2019-generating-fluent, 387 | title = "Generating Fluent Adversarial Examples for Natural Languages", 388 | author = "Zhang, Huangzhao and 389 | Zhou, Hao and 390 | Miao, Ning and 391 | Li, Lei", 392 | booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics (ACL) - short papers", 393 | month = jul, 394 | year = "2019", 395 | author+an = {1=student; 3=student; 2=highlight} 396 | } 397 | @inproceedings{bao-etal-2019-generating, 398 | title = "Generating Sentences from Disentangled Syntactic and Semantic Spaces", 399 | author = "Bao, Yu and 400 | Zhou, Hao and 401 | Huang, Shujian and 402 | Li, Lei and 403 | Mou, Lili and 404 | Vechtomova, Olga and 405 | Dai, Xin-yu and 406 | Chen, Jiajun", 407 | booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics (ACL)", 408 | month = jul, 409 | year = "2019", 410 | author+an = {1=student; 2=highlight} 411 | } 412 | @inproceedings{wei-etal-2019-imitation, 413 | title = 
"Imitation Learning for Non-Autoregressive Neural Machine Translation", 414 | author = "Wei, Bingzhen and 415 | Wang, Mingxuan and 416 | Zhou, Hao and 417 | Lin, Junyang and 418 | Sun, Xu", 419 | booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics (ACL)", 420 | month = jul, 421 | year = "2019", 422 | author+an = {3=highlight} 423 | } 424 | @inproceedings{ijcai2019-0730, 425 | title = {Correct-and-Memorize: Learning to Translate from Interactive Revisions}, 426 | author = {Weng, Rongxiang and Zhou, Hao and Huang, Shujian and Li, Lei and Xia, Yifan and Chen, Jiajun}, 427 | booktitle = {Proceedings of the Twenty-Eighth International Joint Conference on 428 | Artificial Intelligence (IJCAI)}, 429 | year = {2019}, 430 | month = {jul}, 431 | author+an = {1=student; 2=highlight} 432 | } 433 | 434 | @InProceedings{sun2019graspsnooker, 435 | author = {Sun, Zhaoyue and Chen, Jiaze and Zhou, Hao and Zhou, Deyu and Li, Lei and Jiang, Mingmin}, 436 | booktitle = {the 28th International Joint Conference on Artificial Intelligence (IJCAI) - System Demonstrations}, 437 | title = {{GraspSnooker}: Automatic {Chinese} Commentary Generation for Snooker Videos}, 438 | year = {2019}, 439 | month = aug, 440 | author+an = {1=student; 3=highlight} 441 | } 442 | @inproceedings{bahuleyan-etal-2019-stochastic, 443 | title = "Stochastic {W}asserstein Autoencoder for Probabilistic Sentence Generation", 444 | author = "Bahuleyan, Hareesh and 445 | Mou, Lili and 446 | Zhou, Hao and 447 | Vechtomova, Olga", 448 | booktitle = "Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics (NAACL)", 449 | month = jun, 450 | year = "2019", 451 | author+an = {3=highlight} 452 | } 453 | @inproceedings{wei2019neural, 454 | title={Why do neural dialog systems generate short and meaningless replies? 
a comparison between dialog and translation}, 455 | author={Wei, Bolin and Lu, Shuai and Mou, Lili and Zhou, Hao and Poupart, Pascal and Li, Ge and Jin, Zhi}, 456 | booktitle={International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, 457 | year={2019}, 458 | author+an = {4=highlight} 459 | } 460 | @inproceedings{miao2019cgmh, 461 | title={Cgmh: Constrained sentence generation by metropolis-hastings sampling}, 462 | author={Miao, Ning and Zhou, Hao and Mou, Lili and Yan, Rui and Li, Lei}, 463 | booktitle={Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)}, 464 | year={2019}, 465 | author+an = {1=student; 2=highlight} 466 | } 467 | @inproceedings{NEURIPS2018_734e6bfc, 468 | author = {Cao, Wei and Wang, Dong and Li, Jian and Zhou, Hao and Li, Lei and Li, Yitan}, 469 | booktitle = {Advances in Neural Information Processing Systems (NIPS)}, 470 | title = {BRITS: Bidirectional Recurrent Imputation for Time Series}, 471 | year = {2018}, 472 | author+an = {1=student; 4=highlight} 473 | } 474 | @inproceedings{shi2018tree, 475 | title={On Tree-Based Neural Sentence Modeling}, 476 | author={Shi, Haoyue and Zhou, Hao and Chen, Jiaze and Li, Lei}, 477 | booktitle={Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP)}, 478 | year={2018}, 479 | author+an = {1=student; 2=highlight} 480 | } 481 | @article{zheng-etal-2018-modeling, 482 | title = "Modeling Past and Future for Neural Machine Translation", 483 | author = "Zheng, Zaixiang and 484 | Zhou, Hao and 485 | Huang, Shujian and 486 | Mou, Lili and 487 | Dai, Xinyu and 488 | Chen, Jiajun and 489 | Tu, Zhaopeng", 490 | journal = "Transactions of the Association for Computational Linguistics (TACL)", 491 | year = "2018", 492 | author+an = {1=student; 2=highlight} 493 | } 494 | @inproceedings{zhou-etal-2017-word, 495 | title = "Word-Context Character Embeddings for {C}hinese Word Segmentation", 496 | author = "Zhou, Hao and 497 | Yu, Zhenting and 498 | Zhang, Yue and 499 | Huang, Shujian and 500 | Dai, Xinyu and 501 | Chen, Jiajun", 502 | booktitle = "Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing (EMNLP)", 503 | month = sep, 504 | year = "2017", 505 | author+an = {1=highlight} 506 | } 507 | @inproceedings{zhou-etal-2017-chunk, 508 | title = "Chunk-Based Bi-Scale Decoder for Neural Machine Translation", 509 | author = "Zhou, Hao and 510 | Tu, Zhaopeng and 511 | Huang, Shujian and 512 | Liu, Xiaohua and 513 | Li, Hang and 514 | Chen, Jiajun", 515 | booktitle = "Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (ACL) - short papers", 516 | month = jul, 517 | year = "2017", 518 | author+an = {1=highlight} 519 | } 520 | @inproceedings{zhou-etal-2016-search, 521 | title = "A Search-Based Dynamic Reranking Model for Dependency Parsing", 522 | author = "Zhou, Hao and 523 | Zhang, Yue and 524 | Huang, Shujian and 525 | Zhou, Junsheng and 526 | Dai, Xin-Yu and 527 | Chen, Jiajun", 528 | booktitle = "Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL)", 529 | month = aug, 530 | year = "2016", 531 | author+an = {1=highlight} 532 | } 533 | @InProceedings{ZHOU16.150, 534 | author = {Hao Zhou and Yue Zhang and Shujian Huang and Xin-Yu Dai and Jiajun Chen}, 535 | title = {Evaluating a Deterministic Shift-Reduce Neural Parser for Constituent Parsing}, 536 | booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC)}, 537 | year = 
{2016}, 538 | month = {may}, 539 | author+an = {1=highlight} 540 | } 541 | @article{zhou15-jair, 542 | title = "A Neural Probabilistic Structured-Prediction Method for Transition-Based Natural Language Processing", 543 | author = "Zhou, Hao and 544 | Zhang, Yue and 545 | Chen, Chuan and 546 | Huang, Shujian and 547 | Dai, Xinyu and 548 | Chen, Jiajun", 549 | journal = "Journal of Artificial Intelligence Research (JAIR)", 550 | year = "2016", 551 | author+an = {1=highlight} 552 | } 553 | 554 | @article{zhou15-talip, 555 | title = "Enhancing Shift-Reduce Constituent Parsing with Action N-Gram Model", 556 | author = "Zhou, Hao and 557 | Huang, Shujian and 558 | Zhou, Junsheng and 559 | Zhang, Yue and 560 | Chen, Huadong and 561 | Dai, Xinyu and 562 | Chen, Chuan and 563 | Chen, Jiajun", 564 | journal = "ACM Transactions on Asian and Low-Resource Language Information Processing (TALLIP)", 565 | year = "2015", 566 | author+an = {1=highlight} 567 | } 568 | @inproceedings{zhou-etal-2015-neural, 569 | title = "A Neural Probabilistic Structured-Prediction Model for Transition-Based Dependency Parsing", 570 | author = "Zhou, Hao and 571 | Zhang, Yue and 572 | Huang, Shujian and 573 | Chen, Jiajun", 574 | booktitle = "Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics (ACL)", 575 | year = "2015", 576 | author+an = {1=highlight} 577 | } 578 | -------------------------------------------------------------------------------- /lookup.py: -------------------------------------------------------------------------------- 1 | __author__ = 'zhouh' 2 | -------------------------------------------------------------------------------- /multi-len-bleu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | datadir=/home/zhouh/Data/nmt/ 4 | 5 | for i in $(seq 2 1 6) 6 | do 7 | echo '====== 0'$i'=======' 8 | python ./BLEUbyLength.py ../BLEU/multi-bleu.perl $datadir/devntest/MT0${i}/MT0${i}.src ./test.result.chunk.$i $datadir/devntest/MT0${i}/reference 9 | done 10 | -------------------------------------------------------------------------------- /output_align.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Computes alignments and chunk boundary/tag predictions for a parallel source/target file with a trained model, and reports boundary and tag precision.
3 | ''' 4 | import argparse 5 | 6 | import numpy 7 | import theano 8 | import cPickle as pkl 9 | from training_data_iterator import TrainingTextIterator 10 | 11 | from nmt import (build_sampler, gen_sample, load_params, 12 | init_params, init_tparams, build_alignment, prepare_training_data) 13 | 14 | 15 | from multiprocessing import Process, Queue 16 | 17 | 18 | def main(model, pklmodel, dictionary, dictionary_target,dictionary_chunk, source_file,target_file, saveto, ck=5, wk=5, k=20, 19 | normalize=False, n_process=5, chr_level=False, jointProb=False, show_boundary=False): 20 | print 'load model model_options' 21 | with open('%s' % pklmodel, 'rb') as f: 22 | options = pkl.load(f) 23 | 24 | print 'load source dictionary and invert' 25 | with open(dictionary, 'rb') as f: 26 | word_dict = pkl.load(f) 27 | word_idict = dict() 28 | for kk, vv in word_dict.iteritems(): 29 | word_idict[vv] = kk 30 | word_idict[0] = '' 31 | word_idict[1] = 'UNK' 32 | 33 | print 'load target dictionary and invert' 34 | with open(dictionary_target, 'rb') as f: 35 | word_dict_trg = pkl.load(f) 36 | word_idict_trg = dict() 37 | for kk, vv in word_dict_trg.iteritems(): 38 | word_idict_trg[vv] = kk 39 | word_idict_trg[0] = '' 40 | word_idict_trg[1] = 'UNK' 41 | 42 | 43 | 44 | # dict for chunk label 45 | worddict_chunk = [None] 46 | worddict_r_chunk = [None] 47 | with open(dictionary_chunk, 'rb') as f: 48 | worddict_chunk = pkl.load(f) 49 | worddict_r_chunk = dict() 50 | for kk, vv in worddict_chunk.iteritems(): 51 | worddict_r_chunk[vv] = kk 52 | 53 | 54 | def _seqs2wordsByChunk(caps, boundary, chunk, dictionary): 55 | capsw = [] 56 | for cc, bb, ch in zip(caps, boundary, chunk): 57 | if cc == 0: 58 | continue 59 | # if w == -10000: 60 | # ww.append('| NOTEND') 61 | # continue 62 | if cc < 0: 63 | # ww.append('|' + str(w)) 64 | continue 65 | 66 | 67 | if bb == 0: 68 | 69 | capsw[-1] = capsw[-1] + "_" + (dictionary[cc]) 70 | 71 | else: 72 | capsw.append(dictionary[cc]) 73 | 74 | 75 | return capsw 76 | 77 | 78 | # output in the chunk format: 79 | # w1, POS, chunk_boundary-chunk_tag 80 | def _seqs2wordsByChunkFormat(caps, boundary, chunk, dictionary, chunk_dic): 81 | capsw = [] 82 | current_tag = '' 83 | 84 | for cc, bb, ch in zip(caps, boundary, chunk): 85 | if cc == 0: 86 | continue 87 | # if w == -10000: 88 | # ww.append('| NOTEND') 89 | # continue 90 | if cc < 0: 91 | # ww.append('|' + str(w)) 92 | continue 93 | 94 | 95 | if bb == 0: 96 | 97 | capsw.append(dictionary[cc] + ' ' + 'I-'+chunk_dic[ch]) 98 | 99 | else: 100 | capsw.append(dictionary[cc] + ' ' + 'B-'+chunk_dic[ch]) 101 | 102 | 103 | return capsw 104 | 105 | 106 | # utility function 107 | def _seqs2words(caps, dictionary): 108 | capsw = [] 109 | ww = [] 110 | for w in caps: 111 | if w == 0: 112 | continue 113 | ww.append(dictionary[w]) 114 | return ww 115 | 116 | 117 | 118 | 119 | # allocate model parameters 120 | params = init_params(options) 121 | 122 | # load model parameters and set theano shared variables 123 | params = load_params(model, params) 124 | tparams = init_tparams(params) 125 | 126 | 127 | f_align = build_alignment(tparams, options) 128 | 129 | 130 | # begin to read by iterators 131 | train = TrainingTextIterator(source_file, target_file, 132 | dictionary, dictionary_target, dictionary_chunk, 133 | n_words_source=30000, n_words_target=30000, 134 | batch_size=1, 135 | max_chunk_len=50, max_word_len=10000) 136 | 137 | 138 | boundary_right = 0.0 139 | tag_right = 0.0 140 | 141 | boundary_total = 0.0 142 | tag_total = 0.0 143 | 144 | for x, 
y_chunk, y_cw in train: 145 | 146 | x, x_mask, y_c, y_cw, chunk_indicator, y_mask = \ 147 | prepare_training_data(x, 148 | y_chunk, 149 | y_cw, 150 | maxlen_chunk=100000, 151 | maxlen_cw=100000, 152 | n_words_src=30000, 153 | n_words=30000) 154 | 155 | 156 | 157 | 158 | align, chunk_tag, chunk_boundary = f_align(x, x_mask, y_c, y_cw, y_mask, chunk_indicator) 159 | 160 | 161 | x = x.reshape((x.shape[0],) ) 162 | y_cw = y_cw.reshape((y_cw.shape[0],) ) 163 | y_c = y_c.reshape((y_c.shape[0],) ) 164 | chunk_indicator = chunk_indicator.reshape((chunk_indicator.shape[0],)) 165 | 166 | 167 | print '\n'.join(_seqs2wordsByChunkFormat(numpy.ndarray.tolist(y_cw), 168 | numpy.ndarray.tolist(chunk_boundary), 169 | numpy.ndarray.tolist(chunk_tag), 170 | word_idict_trg, worddict_r_chunk)) 171 | 172 | for gold_boundary, gold_chunk_tag, predict_boundary, predict_chunk_tag in zip(numpy.ndarray.tolist(chunk_indicator), 173 | numpy.ndarray.tolist(y_c), 174 | numpy.ndarray.tolist(chunk_boundary), 175 | numpy.ndarray.tolist(chunk_tag)): 176 | boundary_total += 1 177 | tag_total += 1 178 | 179 | if gold_boundary == predict_boundary: 180 | boundary_right += 1 181 | 182 | if gold_chunk_tag == predict_chunk_tag: 183 | tag_right += 1 184 | 185 | 186 | # for tag, boundary in zip(numpy.ndarray.tolist(chunk_tag), numpy.ndarray.tolist(chunk_boundary)): 187 | # print 188 | # 189 | # # filter alignment 190 | # filter_align = [] 191 | # for b, align in zip(numpy.ndarray.tolist(chunk_indicator), numpy.ndarray.tolist(align[0])): 192 | # if b == 1.0: 193 | # filter_align.append(align) 194 | # 195 | # 196 | # print 'align =', 197 | # # a = numpy.ndarray.tolist(filter_align) 198 | # a = numpy.array(filter_align) 199 | # a = numpy.transpose(a) 200 | # a = numpy.ndarray.tolist(a) 201 | # 202 | # print a 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | print 'boundary prec: ', boundary_right / boundary_total 211 | print 'tag prec: ', tag_right / tag_total 212 | print 'Done' 213 | 214 | 215 | if __name__ == "__main__": 216 | parser = argparse.ArgumentParser() 217 | parser.add_argument('-ck', type=int, default=3) 218 | parser.add_argument('-wk', type=int, default=5) 219 | parser.add_argument('-k', type=int, default=8) 220 | parser.add_argument('-p', type=int, default=5) 221 | parser.add_argument('-n', action="store_true", default=False) 222 | parser.add_argument('-jointProb', action="store_true", default=False) 223 | parser.add_argument('-c', action="store_true", default=False) 224 | parser.add_argument('-show_boundary', action="store_true", default=False) 225 | parser.add_argument('model', type=str) 226 | parser.add_argument('pklmodel', type=str) 227 | parser.add_argument('dictionary', type=str) 228 | parser.add_argument('dictionary_target', type=str) 229 | parser.add_argument('dictionary_chunk', type=str) 230 | parser.add_argument('source', type=str) 231 | parser.add_argument('target', type=str) 232 | parser.add_argument('saveto', type=str) 233 | 234 | args = parser.parse_args() 235 | 236 | main(args.model, args.pklmodel, args.dictionary, args.dictionary_target,args.dictionary_chunk, args.source,args.target, 237 | args.saveto, ck=args.ck, wk=args.wk, normalize=args.n, n_process=args.p, 238 | chr_level=args.c, jointProb=args.jointProb, show_boundary=args.show_boundary) 239 | -------------------------------------------------------------------------------- /rmmodel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -l nodes=1:ppn=24 3 | #PBS -l walltime=24:00:00 4 | #PBS -N 
session2_default 5 | #PBS -A course 6 | #PBS -q ShortQ 7 | 8 | export THEANO_FLAGS=device=gpu0,floatX=float32 9 | 10 | modeldir=./ 11 | 12 | for i in $(seq 1000 1000 200000) 13 | do 14 | modelfile=$modeldir/model_hal.iter${i}.npz 15 | rm $modelfile 16 | done 17 | 18 | -------------------------------------------------------------------------------- /scp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | scp ./* zhouhao@192.168.88.125:/home/zhouhao/workspace/python/exp-nmt/HMS-chunk-nmt/$1 4 | -------------------------------------------------------------------------------- /scp240.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | scp ./* zhouh@114.212.189.240:/home/zhouh/workspace/python/chunk-nmt/$1 4 | -------------------------------------------------------------------------------- /scp5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | scp ./* zptu@192.168.88.129:/home/zptu/zhouhao/workspace/python/exp-nmt/HMS-chunk-nmt/$1 4 | -------------------------------------------------------------------------------- /test.398000.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -l nodes=1:ppn=24 3 | #PBS -l walltime=24:00:00 4 | #PBS -N session2_default 5 | #PBS -A course 6 | #PBS -q ShortQ 7 | 8 | 9 | export THEANO_FLAGS=device=gpu2,floatX=float32 10 | datadir=/home/zhouh/Data/nmt 11 | modeldir=../16 12 | iter=398000 13 | 14 | echo 'joint prob beam = 20' >> 3to6.log 15 | #cd $PBS_O_WORKDIR 16 | for i in $(seq 2 1 6) 17 | do 18 | { 19 | python ./translate_gpu.py -n -jointProb \ 20 | $modeldir/model_hal.iter${iter}.npz \ 21 | $modeldir/model_hal.npz.pkl \ 22 | $datadir/hms.ch.filter.pkl \ 23 | $datadir/hms.en.filter.chunked.pkl \ 24 | $datadir/devntest/MT0${i}/MT0${i}.src \ 25 | ./test.result.chunk.${i} 26 | echo $i >> 3to6.log 27 | perl ../BLEU/multi-bleu.perl /home/zhouh/Data/nmt/devntest/MT0${i}/reference < test.result.chunk.${i} >> 3to6.log 28 | }& 29 | done 30 | 31 | 32 | -------------------------------------------------------------------------------- /test.align.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -l nodes=1:ppn=24 3 | #PBS -l walltime=24:00:00 4 | #PBS -N session2_default 5 | #PBS -A course 6 | #PBS -q ShortQ 7 | 8 | export THEANO_FLAGS=device=gpu3,floatX=float32 9 | #export THEANO_FLAGS=device=gpu0,optimizer=None,floatX=float32,exception_verbosity=high 10 | datadir=/home/zhouh/Data/nmt 11 | 12 | i=3 13 | for i in $(seq 2 1 5) 14 | do 15 | { python ./output_align.py \ 16 | ./model_hal.iter398000.npz \ 17 | ./model_hal.npz.pkl \ 18 | $datadir/hms.ch.filter.pkl \ 19 | $datadir/hms.en.filter.chunked.pkl \ 20 | $datadir/hms.en.filter.chunked.chunktag.pkl \ 21 | $datadir/devntest/MT0${i}/MT0${i}.src\ 22 | $datadir/devntest/MT0${i}/reference0.tag.chunked.chunked\ 23 | ./align.output >> boundary.log.$i 24 | 25 | }& 26 | done 27 | -------------------------------------------------------------------------------- /test.batch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -l nodes=1:ppn=24 3 | #PBS -l walltime=24:00:00 4 | #PBS -N session2_default 5 | #PBS -A course 6 | #PBS -q ShortQ 7 | 8 | 9 | export THEANO_FLAGS=device=gpu1,floatX=float32 10 | 11 | #cd $PBS_O_WORKDIR 12 | 13 | modeldir=./ 14 | gap=`expr $1 \* 1000` 15 | echo $gap 16 | 
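# Evaluation loop: each window covers $gap (= $1 * 1000) checkpoints starting from iteration
# 250000; every job waits for its model_hal.iter*.npz file to appear, translates MT02 with
# translate_gpu.py in the background, and appends the iteration number and its multi-bleu.perl
# score to test.log. `wait` holds the next window until the current batch of jobs finishes.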
17 | mkdir outputs 18 | rm test.log 19 | 20 | for i in $(seq 250000 $gap 500000) 21 | do 22 | for j in $(seq 1000 1000 $gap) 23 | do 24 | { iter=`expr $i + $j` 25 | 26 | modelfile=$modeldir/model_hal.iter${iter}.npz 27 | while [ ! -f $modelfile ];do 28 | sleep 1m; 29 | done; 30 | 31 | outputfile=./outputs/MT02.trans${iter}.en 32 | 33 | python ./translate_gpu.py -n $modelfile $modeldir/model_hal.npz.pkl /home/zhouh/Data/nmt/hms.ch.filter.pkl /home/zhouh/Data/nmt/hms.en.filter.chunked.pkl /home/zhouh/Data/nmt/devntest/MT02/MT02.src $outputfile 34 | echo ${iter} >> test.log 35 | perl ../BLEU/multi-bleu.perl ~/Data/nmt/devntest/MT02/reference < $outputfile >>test.log 36 | }& 37 | 38 | 39 | done 40 | wait 41 | done 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /test.scratch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -l nodes=1:ppn=24 3 | #PBS -l walltime=24:00:00 4 | #PBS -N session2_default 5 | #PBS -A course 6 | #PBS -q ShortQ 7 | 8 | export THEANO_FLAGS=device=gpu0,floatX=float32 9 | 10 | modeldir=./ 11 | output=outputs 12 | 13 | #cd $PBS_O_WORKDIR 14 | #python ./translate_gpu.py -n -p 4 -ck 8 -wk 3 $modeldir/model_hal.npz $modeldir/model_hal.npz.pkl /home/zhouh/Data/nmt/corpus.ch.pkl /home/zhouh/Data/nmt/corpus.en.pkl /home/zhouh/Data/nmt/devntest/MT02/MT02.src ./outputs/$output 15 | if [ ! -d $output ]; then 16 | mkdir $output 17 | fi 18 | 19 | 20 | for i in $(seq 20000 10000 200000) 21 | do 22 | modelfile=$modeldir/model_hal.iter${i}.npz 23 | while [ ! -f $modelfile ];do 24 | sleep 1m; 25 | done; 26 | sleep 1m 27 | python ./translate_gpu.py -n -p 4 -ck $1 -wk $2 $modeldir/model_hal.iter${i}.npz $modeldir/model_hal.npz.pkl /home/zhouh/Data/nmt/corpus.ch.filter.pkl /home/zhouh/Data/nmt/corpus.en.filter.pkl /home/zhouh/Data/nmt/devntest/MT02/MT02.src ./outputs/MT02.trans${i}.scratch.en.$1.$2 28 | done 29 | 30 | for i in $(seq 201000 1000 500000) 31 | do 32 | modelfile=$modeldir/model_hal.iter${i}.npz 33 | while [ !
-f $modelfile ];do 34 | sleep 1m; 35 | done; 36 | sleep 1m 37 | python ./translate_gpu.py -n -p 4 -ck $1 -wk $2 $modeldir/model_hal.iter${i}.npz $modeldir/model_hal.npz.pkl /home/zhouh/Data/nmt/corpus.ch.filter.pkl /home/zhouh/Data/nmt/corpus.en.filter.pkl /home/zhouh/Data/nmt/devntest/MT02/MT02.src ./outputs/MT02.trans${i}.scratch.en.$1.$2 38 | done 39 | 40 | -------------------------------------------------------------------------------- /test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -l nodes=1:ppn=24 3 | #PBS -l walltime=24:00:00 4 | #PBS -N session2_default 5 | #PBS -A course 6 | #PBS -q ShortQ 7 | 8 | 9 | export THEANO_FLAGS=device=gpu2,floatX=float32 10 | datadir=/home/zhouh/Data/nmt 11 | 12 | iter=398000 13 | 14 | #cd $PBS_O_WORKDIR 15 | for i in $(seq 2 1 7) 16 | do 17 | python ./translate_gpu.py -n \ 18 | ./model_hal.npz \ 19 | ./model_hal.npz.pkl \ 20 | $datadir/hms.ch.filter.pkl \ 21 | $datadir/hms.en.filter.chunked.pkl \ 22 | $datadir/devntest/MT0${i}/MT0${i}.src \ 23 | ./test.result.chunk.${i} 24 | echo $i >> 3to6.log 25 | perl ../BLEU/multi-bleu.perl /home/zhouh/Data/nmt/devntest/MT0${i}/reference < test.result.chunk.${i} >> 3to6.log 26 | done 27 | 28 | 29 | -------------------------------------------------------------------------------- /test_zh2en.pc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -l nodes=1:ppn=24 3 | #PBS -l walltime=24:00:00 4 | #PBS -N session2/models/memory-set_default 5 | #PBS -A course 6 | #PBS -q ShortQ 7 | 8 | export THEANO_FLAGS=device=cpu,floatX=float32 9 | 10 | #cd $PBS_O_WORKDIR 11 | python ./translate_gpu.py -n -p 3 \ 12 | ./model_hal.npz \ 13 | ./model_hal.npz.pkl \ 14 | ././../../nmtdata/small.ch.pkl \ 15 | ././../../nmtdata/small.en.chunked.chunktag.pkl \ 16 | ././../../nmtdata/small.test \ 17 | ./small.result 18 | -------------------------------------------------------------------------------- /test_zh2en.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -l nodes=1:ppn=24 3 | #PBS -l walltime=24:00:00 4 | #PBS -N session2/models/memory-set_default 5 | #PBS -A course 6 | #PBS -q ShortQ 7 | 8 | export THEANO_FLAGS=device=cpu,floatX=float32 9 | 10 | #cd $PBS_O_WORKDIR 11 | python ./translate.py -n -p 8 \ 12 | ./models/model_hal.npz \ 13 | $HOME/Data/nmt/corpus.ch.pkl \ 14 | $HOME/Data/nmt/corpus.en.pkl \ 15 | $HOME/Data/nmt/devntest/MT02/MT02.src\ 16 | ./result/MT02.trans.en 17 | 18 | python ./translate.py -n -p 8 \ 19 | ./models/model_hal.npz \ 20 | $HOME/Data/nmt/corpus.ch.pkl \ 21 | $HOME/Data/nmt/corpus.en.pkl \ 22 | $HOME/Data/nmt/devntest/MT03/MT03.src\ 23 | ./result/MT03.trans.en 24 | 25 | python ./translate.py -n -p 8 \ 26 | ./models/model_hal.npz \ 27 | $HOME/Data/nmt/corpus.ch.pkl \ 28 | $HOME/Data/nmt/corpus.en.pkl \ 29 | $HOME/Data/nmt/devntest/MT04/MT04.src\ 30 | ./result/MT04.trans.en 31 | 32 | python ./translate.py -n -p 8 \ 33 | ./models/model_hal.npz \ 34 | $HOME/Data/nmt/corpus.ch.pkl \ 35 | $HOME/Data/nmt/corpus.en.pkl \ 36 | $HOME/Data/nmt/devntest/MT05/MT05.src\ 37 | ./result/MT05.trans.en 38 | 39 | python ./translate.py -n -p 8 \ 40 | ./models/model_hal.npz \ 41 | $HOME/Data/nmt/corpus.ch.pkl \ 42 | $HOME/Data/nmt/corpus.en.pkl \ 43 | $HOME/Data/nmt/devntest/MT06/MT06.src\ 44 | ./result/MT06.trans.en 45 | -------------------------------------------------------------------------------- /train.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -l nodes=1:ppn=20 3 | #PBS -l walltime=168:00:00 4 | #PBS -N session2_default 5 | #PBS -A course 6 | #PBS -q GpuQ 7 | 8 | export THEANO_FLAGS=device=gpu,floatX=float32 9 | 10 | cd $PBS_O_WORKDIR 11 | python ./train_nmt.py 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /train_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -l nodes=1:ppn=20 3 | #PBS -l walltime=168:00:00 4 | #PBS -N session2_default 5 | #PBS -A course 6 | #PBS -q GpuQ 7 | 8 | export THEANO_FLAGS=device=gpu,floatX=float32 9 | 10 | cd $PBS_O_WORKDIR 11 | python ./train_nmt_all.py 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /train_nmt.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import os 3 | 4 | import numpy 5 | import os 6 | 7 | from nmt import train 8 | 9 | def main(job_id, params): 10 | print params 11 | validerr = train(saveto=params['model'][0], 12 | reload_=params['reload'][0], 13 | dim_word=params['dim_word'][0], 14 | dim=params['dim'][0], 15 | n_words=params['n-words'][0], 16 | n_words_src=params['n-words'][0], 17 | decay_c=params['decay-c'][0], 18 | clip_c=params['clip-c'][0], 19 | lrate=params['learning-rate'][0], 20 | optimizer=params['optimizer'][0], 21 | patience=1000, 22 | maxlen=50, 23 | batch_size=32, 24 | valid_batch_size=32, 25 | validFreq=100, 26 | dispFreq=10, 27 | saveFreq=100, 28 | sampleFreq=100, 29 | datasets=['../data/hal/train/tok/en', 30 | '../data/hal/train/tok/fr'], 31 | valid_datasets=['../data/hal/dev/tok/en', 32 | '../data/hal/dev/tok/fr'], 33 | dictionaries=['../data/hal/train/tok/en.pkl', 34 | '../data/hal/train/tok/fr.pkl'], 35 | use_dropout=params['use-dropout'][0], 36 | overwrite=False) 37 | return validerr 38 | 39 | if __name__ == '__main__': 40 | main(0, { 41 | 'model': ['model_hal.npz'], 42 | 'dim_word': [512], 43 | 'dim': [1024], 44 | 'n-words': [30000], 45 | 'optimizer': ['adadelta'], 46 | 'decay-c': [0.], 47 | 'clip-c': [1.], 48 | 'use-dropout': [False], 49 | 'learning-rate': [0.0001], 50 | 'reload': [True]}) 51 | -------------------------------------------------------------------------------- /train_nmt_all.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import os 3 | 4 | from nmt import train 5 | 6 | def main(job_id, params): 7 | print params 8 | validerr = train(saveto=params['model'][0], 9 | reload_=params['reload'][0], 10 | dim_word=params['dim_word'][0], 11 | dim=params['dim'][0], 12 | n_words=params['n-words'][0], 13 | n_words_src=params['n-words'][0], 14 | decay_c=params['decay-c'][0], 15 | clip_c=params['clip-c'][0], 16 | lrate=params['learning-rate'][0], 17 | optimizer=params['optimizer'][0], 18 | maxlen=50, 19 | batch_size=32, 20 | valid_batch_size=32, 21 | datasets=['/ichec/home/users/%s/data/all.en.concat.shuf.gz'%os.environ['USER'], 22 | '/ichec/home/users/%s/data/all.fr.concat.shuf.gz'%os.environ['USER']], 23 | valid_datasets=['/ichec/home/users/%s/data/newstest2011.en.tok'%os.environ['USER'], 24 | '/ichec/home/users/%s/data/newstest2011.fr.tok'%os.environ['USER']], 25 | dictionaries=['/ichec/home/users/%s/data/all.en.concat.gz.pkl'%os.environ['USER'], 26 | '/ichec/home/users/%s/data/all.fr.concat.gz.pkl'%os.environ['USER']], 27 | validFreq=5000, 28 | dispFreq=10, 29 | 
saveFreq=5000, 30 | sampleFreq=1000, 31 | use_dropout=params['use-dropout'][0], 32 | overwrite=False) 33 | return validerr 34 | 35 | if __name__ == '__main__': 36 | main(0, { 37 | 'model': ['/ichec/home/users/%s/models/model_session2_all.npz'%os.environ['USER']], 38 | 'dim_word': [500], 39 | 'dim': [1024], 40 | 'n-words': [30000], 41 | 'optimizer': ['adadelta'], 42 | 'decay-c': [0.], 43 | 'clip-c': [1.], 44 | 'use-dropout': [False], 45 | 'learning-rate': [0.0001], 46 | 'reload': [False]}) 47 | 48 | 49 | -------------------------------------------------------------------------------- /train_nmt_zh2en.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import os 3 | 4 | import numpy 5 | import os 6 | 7 | from nmt import train 8 | 9 | def main(job_id, params): 10 | print params 11 | validerr = train(saveto=params['model'][0], 12 | reload_=params['reload'][0], 13 | dim_word=params['dim_word'][0], 14 | dim_chunk=params['dim_chunk'][0], 15 | dim_chunk_hidden=params['dim_chunk_hidden'][0], 16 | dim=params['dim'][0], 17 | n_words=params['n-words'][0], 18 | n_words_src=params['n-words'][0], 19 | decay_c=params['decay-c'][0], 20 | clip_c=params['clip-c'][0], 21 | lrate=params['learning-rate'][0], 22 | optimizer=params['optimizer'][0], 23 | patience=10000, 24 | batch_size=32, 25 | valid_batch_size=32, 26 | validFreq=100, 27 | dispFreq=10, 28 | saveFreq=1000, 29 | sampleFreq=100, 30 | maxlen_chunk_words=50, # maximum length of the description 31 | datasets=['/home/zhouh/Data/nmt/hms.ch.filter', 32 | '/home/zhouh/Data/nmt/hms.en.filter.chunked'], 33 | valid_datasets=['/home/zhouh/Data/nmt/devntest/MT02/MT02.src', 34 | '/home/zhouh/Data/nmt/devntest/MT02/reference0.tag.chunked.chunked'], 35 | dictionaries=['/home/zhouh/Data/nmt/hms.ch.filter.pkl', 36 | '/home/zhouh/Data/nmt/hms.en.filter.chunked.pkl'], 37 | dictionary_chunk='/home/zhouh/Data/nmt/hms.en.filter.chunked.chunktag.pkl', 38 | use_dropout=params['use-dropout'][0], 39 | overwrite=False) 40 | return validerr 41 | 42 | if __name__ == '__main__': 43 | main(0, { 44 | 'model': ['model_hal.npz'], 45 | 'dim_word': [600], 46 | 'dim_chunk': [1000], 47 | 'dim': [1000], 48 | 'dim_chunk_hidden': [1000], 49 | 'n-words': [30000], 50 | 'optimizer': ['adadelta'], 51 | 'decay-c': [0.], 52 | 'clip-c': [1.], 53 | 'use-dropout': [False], 54 | 'learning-rate': [0.0001], 55 | 'reload': [True]}) 56 | -------------------------------------------------------------------------------- /train_nmt_zh2en_pc.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import os 3 | 4 | import numpy 5 | import os 6 | 7 | from nmt import train 8 | 9 | def main(job_id, params): 10 | print params 11 | validerr = train(saveto=params['model'][0], 12 | reload_=params['reload'][0], 13 | dim_word=params['dim_word'][0], 14 | dim_chunk=params['dim_chunk'][0], 15 | dim_chunk_hidden=params['dim_chunk_hidden'][0], 16 | dim=params['dim'][0], 17 | n_words=params['n-words'][0], 18 | n_words_src=params['n-words'][0], 19 | decay_c=params['decay-c'][0], 20 | clip_c=params['clip-c'][0], 21 | lrate=params['learning-rate'][0], 22 | optimizer=params['optimizer'][0], 23 | patience=1000, 24 | batch_size=2, 25 | valid_batch_size=2, 26 | validFreq=3, 27 | dispFreq=10, 28 | saveFreq=10, 29 | sampleFreq=10, 30 | maxlen_chunk=30, # maximum length of the description 31 | maxlen_chunk_words=50, # maximum length of the description 32 | datasets=['/home/zhouh/workspace/python/nmtdata/small.ch', 33 | 
'/home/zhouh/workspace/python/nmtdata/small.en.chunked'], 34 | valid_datasets=['/home/zhouh/workspace/python/nmtdata/small.ch', 35 | '/home/zhouh/workspace/python/nmtdata/small.en.chunked'], 36 | dictionaries=['/home/zhouh/workspace/python/nmtdata/small.ch.pkl', 37 | '/home/zhouh/workspace/python/nmtdata/small.en.chunked.pkl'], 38 | dictionary_chunk='/home/zhouh/workspace/python/nmtdata/small.en.chunked.chunktag.pkl', 39 | use_dropout=params['use-dropout'][0], 40 | overwrite=False) 41 | return validerr 42 | 43 | if __name__ == '__main__': 44 | main(0, { 45 | 'model': ['model_hal.npz'], 46 | 'dim_word': [30], 47 | 'dim_chunk': [50], 48 | 'dim_chunk_hidden' : [60], 49 | 'dim': [40], 50 | 'n-words': [100], 51 | 'optimizer': ['adadelta'], 52 | 'decay-c': [0.], 53 | 'clip-c': [1.], 54 | 'use-dropout': [False], 55 | 'learning-rate': [0.0001], 56 | 'reload': [True]}) 57 | -------------------------------------------------------------------------------- /training_data_iterator.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | import cPickle as pkl 4 | import gzip 5 | import sys 6 | import codecs 7 | 8 | 9 | def fopen(filename, mode='r'): 10 | if filename.endswith('.gz'): 11 | return gzip.open(filename, mode) 12 | return open(filename, mode) 13 | 14 | 15 | unk_idx = 1 16 | sentence_end_idx = 0 17 | 18 | 19 | class TrainingTextIterator: 20 | """Simple Bitext iterator.""" 21 | # 22 | # max_chunk_len, # max size of chunks in a sentence 23 | # max_word_len, # max size of words in a chunk 24 | # 25 | def __init__(self, source, target, 26 | source_dict, target_dict, target_chunk_dict, 27 | batch_size=128, 28 | max_chunk_len=20, # max size of chunks in a sentence 29 | max_word_len=5, # max size of words in a chunk 30 | n_words_source=-1, 31 | n_words_target=-1): 32 | self.source = fopen(source, 'r') 33 | self.target = fopen(target, 'r') 34 | with open(source_dict, 'rb') as f: 35 | self.source_dict = pkl.load(f) 36 | with open(target_dict, 'rb') as f: 37 | self.target_dict = pkl.load(f) 38 | 39 | with open(target_chunk_dict, 'rb') as f: 40 | self.target_chunk_dict = pkl.load(f) 41 | 42 | self.batch_size = batch_size 43 | self.max_chunk_len = max_chunk_len 44 | self.max_word_len = max_word_len 45 | 46 | self.n_words_source = n_words_source 47 | self.n_words_target = n_words_target 48 | 49 | self.source_buffer = [] 50 | self.target_chunk_buffer = [] 51 | self.target_chunk_words_buffer = [] 52 | self.k = batch_size * 50 53 | 54 | self.end_of_data = False 55 | 56 | 57 | 58 | def __iter__(self): 59 | return self 60 | 61 | def reset(self): 62 | self.source.seek(0) 63 | self.target.seek(0) 64 | 65 | def readNextChunkSent(self): 66 | chunk_words = [] 67 | chunk_tag = [] 68 | 69 | while(True): 70 | chunk_line = self.target.readline() 71 | 72 | if(chunk_line == '' and len(chunk_tag) == 0): 73 | return None, None 74 | 75 | # read until meeting empty line 76 | if(len(chunk_line.strip()) == 0): 77 | break 78 | 79 | # the chunk and the is seperated by \t, and words are sperated by space 80 | tokens = chunk_line.strip().split('\t') 81 | 82 | words = tokens[1].strip().split() 83 | ctags = ['NULL'] * len(words) # index of 'NULL' in chunk dictionary is 1 84 | ctags[0] = tokens[0] 85 | chunk_tag.extend( ctags ) 86 | chunk_words.extend( words ) 87 | 88 | assert len(chunk_tag) == len(chunk_words) 89 | 90 | return chunk_tag, chunk_words 91 | 92 | def readBuffer(self): 93 | 94 | # print 'read the buffer' 95 | 96 | # read k items into the buffer 97 | for k_ in 
xrange(self.k): 98 | ss = self.source.readline() 99 | 100 | if ss == "": 101 | break 102 | 103 | # print ss 104 | chunk_tags, chunk_words = self.readNextChunkSent() 105 | if chunk_tags is None and chunk_words is None: 106 | break 107 | 108 | # print chunk_words 109 | 110 | self.source_buffer.append(ss.strip().split()) 111 | self.target_chunk_buffer.append(chunk_tags) 112 | self.target_chunk_words_buffer.append(chunk_words) 113 | 114 | # sort by target buffer 115 | tlen = numpy.array([len(t) for t in self.target_chunk_buffer]) 116 | tidx = tlen.argsort() 117 | 118 | _sbuf = [self.source_buffer[i] for i in tidx] 119 | _tcbuf = [self.target_chunk_buffer[i] for i in tidx] 120 | _tcwbuf = [self.target_chunk_words_buffer[i] for i in tidx] 121 | 122 | self.source_buffer = _sbuf 123 | self.target_chunk_buffer = _tcbuf 124 | self.target_chunk_words_buffer = _tcwbuf 125 | 126 | if len(self.source_buffer) == 0 or len(self.target_chunk_buffer) == 0: 127 | 128 | # print len(self.source_buffer), len(self.target_chunk_buffer) 129 | self.end_of_data = False 130 | self.reset() 131 | raise StopIteration 132 | 133 | 134 | def next(self): 135 | if self.end_of_data: 136 | self.end_of_data = False 137 | self.reset() 138 | raise StopIteration 139 | 140 | source = [] 141 | target_chunk = [] 142 | target_chunk_words = [] 143 | 144 | get_none_items = False 145 | 146 | # fill buffer, if it's empty 147 | assert len(self.source_buffer) == len(self.target_chunk_buffer), 'Buffer size mismatch!' 148 | 149 | if len(self.source_buffer) == 0: 150 | 151 | self.readBuffer() 152 | 153 | 154 | # retrieval index for each string token 155 | try: 156 | 157 | # print 'get next' 158 | 159 | # actual work here 160 | while True: 161 | 162 | if len(self.source_buffer) == 0 and len(source) == 0: 163 | self.readBuffer() 164 | 165 | # read from source file and map to word index 166 | try: 167 | ss = self.source_buffer.pop() 168 | except IndexError: 169 | break 170 | 171 | # print 'source before', ' '.join(ss) 172 | ss = [self.source_dict[w] if w in self.source_dict else 1 173 | for w in ss] 174 | if self.n_words_source > 0: 175 | ss = [w if w < self.n_words_source else 1 for w in ss] 176 | 177 | # read from target file and map to word index 178 | tt = self.target_chunk_buffer.pop() 179 | 180 | # print 'target chunk before', tt 181 | 182 | 183 | tt = [self.target_chunk_dict[w] for w in tt] 184 | 185 | # 186 | # mark all the chunk tag in the dictionary as 0 and 1, 187 | # we only want to predict the boundary 188 | # 189 | # tt = [1 if w == 1 else 0 for w in tt] 190 | 191 | # print 'target chunk after', tt 192 | # tt = [w if w < self.n_words_target else 1 for w in tt] 193 | 194 | # read from target file and map to word index 195 | tcw = self.target_chunk_words_buffer.pop() 196 | 197 | # print 'target before', tcw 198 | tcw = [self.target_dict[w] if w in self.target_dict else 1 for w in tcw] 199 | if self.n_words_target > 0: 200 | tcw = [w if w < self.n_words_target else 1 for w in tcw] 201 | 202 | # print 'target after', tcw 203 | 204 | 205 | # if the source or target chunk or words in target chunk exceed max len, just skip 206 | # if len(ss) > self.max_word_len and len(tt) > self.max_chunk_len: 207 | # continue 208 | if len(ss) > self.max_word_len or len(tt) > self.max_word_len: 209 | 210 | # print 'skip', len(ss), len(tt) 211 | continue 212 | # else: 213 | # print 'not skip', len(ss), len(tt) 214 | 215 | source.append(ss) 216 | target_chunk.append(tt) 217 | target_chunk_words.append(tcw) 218 | 219 | if len(source) >= self.batch_size or \ 
220 | len(target_chunk) >= self.batch_size: 221 | break 222 | 223 | 224 | except IOError: 225 | print 'IOError' 226 | self.end_of_data = True 227 | 228 | if len(source) <= 0 or len(target_chunk) <= 0 or len(target_chunk_words) <= 0: 229 | 230 | # print len(source) ,len(target_chunk) , len(target_chunk_words) 231 | print 'StopIteration' 232 | self.end_of_data = False 233 | self.reset() 234 | raise StopIteration 235 | 236 | return source, target_chunk, target_chunk_words 237 | -------------------------------------------------------------------------------- /translate.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Translates a source file using a translation model. 3 | ''' 4 | import argparse 5 | 6 | import numpy 7 | import theano 8 | import cPickle as pkl 9 | 10 | from nmt import (build_sampler, gen_sample, load_params, 11 | init_params, init_tparams) 12 | 13 | from multiprocessing import Process, Queue 14 | 15 | 16 | def translate_model(queue, rqueue, pid, model, options, ck, wk, normalize): 17 | 18 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 19 | trng = RandomStreams(1234) 20 | use_noise = theano.shared(numpy.float32(0.)) 21 | 22 | # allocate model parameters 23 | params = init_params(options) 24 | 25 | # load model parameters and set theano shared variables 26 | params = load_params(model, params) 27 | tparams = init_tparams(params) 28 | 29 | # word index 30 | f_init, f_next_chunk, f_next_word = build_sampler(tparams, options, trng, use_noise) 31 | 32 | def _translate(seq): 33 | 34 | be_stochastic = False 35 | # sample given an input sequence and obtain scores 36 | sample, score = gen_sample(tparams, f_init, f_next_chunk, f_next_word, 37 | numpy.array(seq).reshape([len(seq), 1]), 38 | options, trng=trng, maxlen_words=5, k_chunk=ck, k_word=wk, 39 | maxlen_chunks=50, 40 | stochastic=be_stochastic, argmax=True) 41 | 42 | if be_stochastic: 43 | return sample 44 | 45 | # normalize scores according to sequence lengths 46 | if normalize: 47 | lengths = numpy.array([len(s) for s in sample]) 48 | score = score / lengths 49 | 50 | # print 'score', score 51 | # print 'candidates', sample 52 | 53 | sidx = numpy.argmin(score) 54 | return sample[sidx] 55 | 56 | while True: 57 | req = queue.get() 58 | if req is None: 59 | break 60 | 61 | idx, x = req[0], req[1] 62 | print pid, '-', idx 63 | seq = _translate(x) 64 | 65 | rqueue.put((idx, seq)) 66 | 67 | return 68 | 69 | 70 | def main(model, pklmodel, dictionary, dictionary_target, source_file, saveto, ck=5, wk=5, 71 | normalize=False, n_process=5, chr_level=False): 72 | 73 | # load model model_options 74 | with open('%s' % pklmodel, 'rb') as f: 75 | options = pkl.load(f) 76 | 77 | # load source dictionary and invert 78 | with open(dictionary, 'rb') as f: 79 | word_dict = pkl.load(f) 80 | word_idict = dict() 81 | for kk, vv in word_dict.iteritems(): 82 | word_idict[vv] = kk 83 | word_idict[0] = '' 84 | word_idict[1] = 'UNK' 85 | 86 | # load target dictionary and invert 87 | with open(dictionary_target, 'rb') as f: 88 | word_dict_trg = pkl.load(f) 89 | word_idict_trg = dict() 90 | for kk, vv in word_dict_trg.iteritems(): 91 | word_idict_trg[vv] = kk 92 | word_idict_trg[0] = '' 93 | word_idict_trg[1] = 'UNK' 94 | 95 | # create input and output queues for processes 96 | queue = Queue() 97 | rqueue = Queue() 98 | processes = [None] * n_process 99 | for midx in xrange(n_process): 100 | processes[midx] = Process( 101 | target=translate_model, 102 | args=(queue, rqueue, midx, model, 
options, ck, wk, normalize)) 103 | processes[midx].start() 104 | 105 | # utility function 106 | def _seqs2words(caps): 107 | capsw = [] 108 | for cc in caps: 109 | ww = [] 110 | for w in cc: 111 | if w == 0: 112 | continue 113 | # if w == -10000: 114 | # ww.append('| NOTEND') 115 | # continue 116 | elif w < 0: 117 | # ww.append('|' + str(w)) 118 | continue 119 | ww.append(word_idict_trg[w]) 120 | capsw.append(' '.join(ww)) 121 | return capsw 122 | 123 | def _send_jobs(fname): 124 | with open(fname, 'r') as f: 125 | for idx, line in enumerate(f): 126 | if chr_level: 127 | words = list(line.decode('utf-8').strip()) 128 | else: 129 | words = line.strip().split() 130 | x = map(lambda w: word_dict[w] if w in word_dict else 1, words) 131 | x = map(lambda ii: ii if ii < options['n_words_src'] else 1, x) 132 | x += [0] 133 | queue.put((idx, x)) 134 | return idx+1 135 | 136 | def _finish_processes(): 137 | for midx in xrange(n_process): 138 | queue.put(None) 139 | 140 | def _retrieve_jobs(n_samples): 141 | trans = [None] * n_samples 142 | for idx in xrange(n_samples): 143 | resp = rqueue.get() 144 | trans[resp[0]] = resp[1] 145 | if numpy.mod(idx, 10) == 0: 146 | print 'Sample ', (idx+1), '/', n_samples, ' Done' 147 | return trans 148 | 149 | print 'Translating ', source_file, '...' 150 | n_samples = _send_jobs(source_file) 151 | trans = _seqs2words(_retrieve_jobs(n_samples)) 152 | _finish_processes() 153 | with open(saveto, 'w') as f: 154 | print >>f, '\n'.join(trans) 155 | print 'Done' 156 | 157 | 158 | if __name__ == "__main__": 159 | parser = argparse.ArgumentParser() 160 | parser.add_argument('-ck', type=int, default=3) 161 | parser.add_argument('-wk', type=int, default=5) 162 | parser.add_argument('-p', type=int, default=5) 163 | parser.add_argument('-n', action="store_true", default=False) 164 | parser.add_argument('-c', action="store_true", default=False) 165 | parser.add_argument('model', type=str) 166 | parser.add_argument('pklmodel', type=str) 167 | parser.add_argument('dictionary', type=str) 168 | parser.add_argument('dictionary_target', type=str) 169 | parser.add_argument('source', type=str) 170 | parser.add_argument('saveto', type=str) 171 | 172 | args = parser.parse_args() 173 | 174 | main(args.model, args.pklmodel, args.dictionary, args.dictionary_target, args.source, 175 | args.saveto, ck=args.ck, wk=args.wk, normalize=args.n, n_process=args.p, 176 | chr_level=args.c) 177 | -------------------------------------------------------------------------------- /translate_gpu.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Translates a source file using a translation model. 
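This GPU variant decodes in the main process (the worker-pool path of translate.py is not used)
and, besides the plain translations, writes '<saveto>chunk', where chunk boundaries and tag
indices are marked.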
3 | ''' 4 | import argparse 5 | 6 | import numpy 7 | import theano 8 | import cPickle as pkl 9 | 10 | from nmt import (build_sampler, gen_sample, load_params, 11 | init_params, init_tparams) 12 | 13 | from multiprocessing import Process, Queue 14 | 15 | 16 | def main(model, pklmodel, dictionary, dictionary_target, source_file, saveto, ck=5, wk=5, k=20, 17 | normalize=False, n_process=5, chr_level=False, jointProb=False, show_boundary=False): 18 | print 'load model model_options' 19 | with open('%s' % pklmodel, 'rb') as f: 20 | options = pkl.load(f) 21 | 22 | print 'load source dictionary and invert' 23 | with open(dictionary, 'rb') as f: 24 | word_dict = pkl.load(f) 25 | word_idict = dict() 26 | for kk, vv in word_dict.iteritems(): 27 | word_idict[vv] = kk 28 | word_idict[0] = '' 29 | word_idict[1] = 'UNK' 30 | 31 | print 'load target dictionary and invert' 32 | with open(dictionary_target, 'rb') as f: 33 | word_dict_trg = pkl.load(f) 34 | word_idict_trg = dict() 35 | for kk, vv in word_dict_trg.iteritems(): 36 | word_idict_trg[vv] = kk 37 | word_idict_trg[0] = '' 38 | word_idict_trg[1] = 'UNK' 39 | 40 | # utility function 41 | def _seqs2words(caps, boundary, chunk): 42 | capsw = [] 43 | for cc, bb, ch in zip(caps, boundary, chunk): 44 | ww = [] 45 | for w, b, c in zip(cc, bb, ch): 46 | if w == 0: 47 | continue 48 | # if w == -10000: 49 | # ww.append('| NOTEND') 50 | # continue 51 | elif w < 0: 52 | # ww.append('|' + str(w)) 53 | continue 54 | 55 | if show_boundary: 56 | if b == 1.0: 57 | ww.append('|') 58 | ww.append(word_idict_trg[w]) 59 | capsw.append(' '.join(ww)) 60 | return capsw 61 | 62 | def _seqs2wordsByChunk(caps, boundary, chunk): 63 | capsw = [] 64 | for cc, bb, ch in zip(caps, boundary, chunk): 65 | ww = [] 66 | for w, b, c in zip(cc, bb, ch): 67 | if w == 0: 68 | continue 69 | # if w == -10000: 70 | # ww.append('| NOTEND') 71 | # continue 72 | elif w < 0: 73 | # ww.append('|' + str(w)) 74 | continue 75 | 76 | if b == 1.0: 77 | ww.append('| ' + str(c)) 78 | ww.append(word_idict_trg[w]) 79 | capsw.append(' '.join(ww)) 80 | return capsw 81 | 82 | def _send_jobs(fname): 83 | retval = [] 84 | with open(fname, 'r') as f: 85 | for idx, line in enumerate(f): 86 | if chr_level: 87 | words = list(line.decode('utf-8').strip()) 88 | else: 89 | words = line.strip().split() 90 | x = map(lambda w: word_dict[w] if w in word_dict else 1, words) 91 | x = map(lambda ii: ii if ii < options['n_words_src'] else 1, x) 92 | x += [0] 93 | retval.append(x) 94 | return retval 95 | 96 | print 'Translating ', source_file, '...' 
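    # What follows: map each source sentence to word indices, rebuild the model from the
    # pickled options, load the trained parameters, compile the f_init / f_next_chunk /
    # f_next_word sampling functions, then decode every sentence with gen_sample
    # (beam sizes k, k_chunk=ck, k_word=wk), optionally length-normalising the scores (-n)
    # before keeping the best-scoring hypothesis.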
97 | 98 | print 'look up table' 99 | n_samples = _send_jobs(source_file) 100 | 101 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 102 | trng = RandomStreams(1234) 103 | use_noise = theano.shared(numpy.float32(0.)) 104 | 105 | # allocate model parameters 106 | params = init_params(options) 107 | 108 | # load model parameters and set theano shared variables 109 | params = load_params(model, params) 110 | tparams = init_tparams(params) 111 | 112 | # word index 113 | f_init, f_next_chunk, f_next_word = build_sampler(tparams, options, trng, use_noise) 114 | 115 | def _translate(seq): 116 | 117 | be_stochastic = False 118 | # sample given an input sequence and obtain scores 119 | sample, boundary, chunk, score = gen_sample(tparams, f_init, f_next_chunk, f_next_word, 120 | numpy.array(seq).reshape([len(seq), 1]), 121 | options, trng=trng, maxlen=200, k_chunk=ck, k_word=wk, k=k, 122 | stochastic=be_stochastic, argmax=True, jointProb=False) 123 | 124 | if be_stochastic: 125 | return sample 126 | 127 | # normalize scores according to sequence lengths 128 | if normalize: 129 | lengths = numpy.array([len(s) for s in sample]) 130 | score = score / lengths 131 | 132 | # print 'score', score 133 | # print 'candidates', sample 134 | 135 | sidx = numpy.argmin(score) 136 | return sample[sidx], boundary[sidx], chunk[sidx] 137 | 138 | 139 | 140 | ys = [] 141 | yb = [] 142 | yc = [] 143 | idx = 0 144 | for x in n_samples: 145 | y, y_boundary, y_chunk = _translate(x) 146 | ys.append(y) 147 | yb.append(y_boundary) 148 | yc.append(y_chunk) 149 | print idx 150 | idx += 1 151 | 152 | 153 | # print ys 154 | # print yb 155 | trans = _seqs2words(ys, yb, yc) 156 | trans_chunk = _seqs2wordsByChunk(ys, yb, yc) 157 | 158 | with open(saveto, 'w') as f: 159 | print >> f, '\n'.join(trans) 160 | with open(saveto+'chunk', 'w') as f: 161 | print >> f, '\n'.join(trans_chunk) 162 | print 'Done' 163 | 164 | 165 | if __name__ == "__main__": 166 | parser = argparse.ArgumentParser() 167 | parser.add_argument('-ck', type=int, default=3) 168 | parser.add_argument('-wk', type=int, default=5) 169 | parser.add_argument('-k', type=int, default=8) 170 | parser.add_argument('-p', type=int, default=5) 171 | parser.add_argument('-n', action="store_true", default=False) 172 | parser.add_argument('-jointProb', action="store_true", default=False) 173 | parser.add_argument('-c', action="store_true", default=False) 174 | parser.add_argument('-show_boundary', action="store_true", default=False) 175 | parser.add_argument('model', type=str) 176 | parser.add_argument('pklmodel', type=str) 177 | parser.add_argument('dictionary', type=str) 178 | parser.add_argument('dictionary_target', type=str) 179 | parser.add_argument('source', type=str) 180 | parser.add_argument('saveto', type=str) 181 | 182 | args = parser.parse_args() 183 | 184 | main(args.model, args.pklmodel, args.dictionary, args.dictionary_target, args.source, 185 | args.saveto, ck=args.ck, wk=args.wk, normalize=args.n, n_process=args.p, 186 | chr_level=args.c, jointProb=args.jointProb, show_boundary=args.show_boundary) 187 | -------------------------------------------------------------------------------- /valid.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -l nodes=1:ppn=24 3 | #PBS -l walltime=24:00:00 4 | #PBS -N session2_default 5 | #PBS -A course 6 | #PBS -q ShortQ 7 | 8 | export THEANO_FLAGS=device=gpu1,floatX=float32 9 | 10 | 11 | 12 | modeldir=. 
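# validate.py (below) waits for each model_hal.iter*.npz checkpoint from iteration 300000
# onwards, logs its validation cost via computeCost.py, and every 50000 iterations decodes
# the 5 lowest-cost checkpoints with translate_gpu.py, scoring them against the references
# with multi-bleu.perl and recording the best BLEU in bleu.log.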

datadir=/home/zhouh/Data/nmt/

modelfile=$modeldir/model_hal
python ./validate.py $modelfile ./model_hal.npz.pkl ./bleu.log ./outputs/test.result ../BLEU/multi-bleu.perl $datadir/hms.ch.filter.pkl $datadir/hms.en.filter.chunked.pkl $datadir/hms.en.filter.chunked.chunktag.pkl $datadir/devntest/MT02/MT02.src $datadir/devntest/MT02/reference0.tag.chunked.chunked $datadir/devntest/MT02/reference
--------------------------------------------------------------------------------
/validate.py:
--------------------------------------------------------------------------------
'''
Validates a training run: waits for each new checkpoint, scores it with
computeCost.py, and every 50k iterations decodes the lowest-cost checkpoints
and evaluates them with multi-bleu.perl.
'''
import argparse

import numpy
import theano
import cPickle as pkl
import heapq
import logging
import time
import subprocess
import re
import os

from nmt import (build_model, pred_probs, load_params,
                 init_params, init_tparams, prepare_training_data)

from training_data_iterator import TrainingTextIterator


def getBLEU():

    return 0


def main(model,
         pklmodel,
         logfile,
         outputfile,
         bleu_scrip,
         valid_datasets=['../data/dev/newstest2011.en.tok',
                         '../data/dev/newstest2011.fr.tok',
                         '../data/dev/newstest2011.fr.tok'],
         dictionaries=[
             '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl',
             '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl'],
         dictionary_chunk='/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl',
         beginModelIter=300000,
         k_best_keep=5):

    logfile = open(logfile, 'w')

    best_bleu = -1
    best_bleu_iter = beginModelIter

    cost_cache = []
    cost_iter = []

    for iter in range(beginModelIter, 500000, 1000):

        model_file_name = model + '.iter' + str(iter) + '.npz'

        # wait until the current model has been written to disk
        while not os.path.isfile(model_file_name):
            time.sleep(300)

        print iter

        cmd_get_cost = ['python',
                        'computeCost.py',
                        model_file_name,
                        pklmodel,
                        dictionaries[0],
                        dictionaries[1],
                        dictionary_chunk,
                        valid_datasets[0],
                        valid_datasets[1],
                        './cost.result']

        subprocess.check_call(" ".join(cmd_get_cost), shell=True)

        fin = open('./cost.result', 'rU')
        out = fin.readline()

        tokens = out.strip().split()

        totalCost = float(tokens[0])
        wordCost = float(tokens[1])

        fin.close()

        print >> logfile, '==========================='
        print >> logfile, 'Iter ' + str(iter) + ', Word Cost ' + str(wordCost), 'Total Cost', totalCost

        cost_cache.append(totalCost)
        cost_iter.append(iter)

        if iter % 50000 == 0 and iter != beginModelIter:

            # keep the k_best_keep checkpoints with the lowest validation cost
            costs = numpy.array(cost_cache)
            toDecode = costs.argsort()[:k_best_keep]

            for d in toDecode:

                d_iter = cost_iter[d]

                decode_model_name = model + '.iter' + str(d_iter) + '.npz'

                print >> logfile, 'To Decode for Iter ' + str(d_iter)

                output_iter = outputfile + str(d_iter)

                val_start_time = time.time()

                cmd_translate = ['python',
                                 'translate_gpu.py',
                                 '-n',
                                 decode_model_name,
                                 pklmodel,
                                 dictionaries[0],
                                 dictionaries[1],
                                 valid_datasets[0],
                                 output_iter]

                subprocess.check_call(" ".join(cmd_translate), shell=True)
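
                # The joined command is run through the shell; with
                # illustrative paths it looks like:
                #   python translate_gpu.py -n model_hal.iter303000.npz \
                #       model_hal.npz.pkl src.dict.pkl trg.dict.pkl MT02.src \
                #       outputs/test.result303000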
print >>logfile, "Decoding took {} minutes".format(float(time.time() - val_start_time) / 60.) 131 | 132 | 133 | 134 | cmd_bleu_cmd = ['perl', bleu_scrip, \ 135 | valid_datasets[2], \ 136 | '<', \ 137 | output_iter, \ 138 | '>' 139 | './output.eva'] 140 | 141 | subprocess.check_call(" ".join(cmd_bleu_cmd), shell=True) 142 | 143 | fin = open('./output.eva', 'rU') 144 | out = re.search('BLEU = [-.0-9]+', fin.readlines()[0]) 145 | fin.close() 146 | 147 | bleu_score = float(out.group()[7:]) 148 | 149 | print >>logfile, 'Iter '+str(d_iter) + 'BLEU: ' + str(bleu_score) 150 | 151 | if bleu_score > best_bleu: 152 | best_bleu = bleu_score 153 | best_bleu_iter = d_iter 154 | 155 | print >>logfile, '## Best BLEU: ' + str(best_bleu) + 'at Iter' + str(best_bleu_iter) 156 | 157 | logfile.flush() 158 | 159 | 160 | 161 | 162 | 163 | 164 | cost_cache = [] 165 | cost_iter = [] 166 | 167 | logfile.close() 168 | 169 | 170 | 171 | if __name__ == "__main__": 172 | parser = argparse.ArgumentParser() 173 | parser.add_argument('model', type=str) 174 | parser.add_argument('pklmodel', type=str) 175 | parser.add_argument('logfile', type=str) 176 | parser.add_argument('outputfile', type=str) 177 | parser.add_argument('bleu_scrip', type=str) 178 | parser.add_argument('dictionary', type=str) 179 | parser.add_argument('dictionary_target', type=str) 180 | parser.add_argument('dictionary_chunk', type=str) 181 | parser.add_argument('valid_source', type=str) 182 | parser.add_argument('valid_target', type=str) 183 | parser.add_argument('valid_reference', type=str) 184 | 185 | args = parser.parse_args() 186 | 187 | main(args.model, 188 | args.pklmodel, 189 | args.logfile, 190 | args.outputfile, 191 | args.bleu_scrip, 192 | valid_datasets=[args.valid_source, args.valid_target, args.valid_reference], 193 | dictionaries=[args.dictionary, args.dictionary_target], 194 | dictionary_chunk=args.dictionary_chunk ) 195 | 196 | --------------------------------------------------------------------------------