├── BLEUbyLength.py ├── README.md ├── baseline.py ├── beam_decode.sh ├── chunk_nmt_train.sh ├── clean.sh ├── codetest.py ├── computeCost.py ├── cpu_train_chunk_nmt.sh ├── data_iterator.py ├── haos.bib ├── lookup.py ├── multi-len-bleu.sh ├── nmt.py ├── output_align.py ├── rmmodel.sh ├── scp.sh ├── scp240.sh ├── scp5.sh ├── test.398000.sh ├── test.align.sh ├── test.batch.sh ├── test.scratch.sh ├── test.sh ├── test_zh2en.pc.sh ├── test_zh2en.sh ├── train.sh ├── train_all.sh ├── train_nmt.py ├── train_nmt_all.py ├── train_nmt_zh2en.py ├── train_nmt_zh2en_pc.py ├── training_data_iterator.py ├── translate.py ├── translate_gpu.py ├── valid.sh └── validate.py /BLEUbyLength.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Translates a source file using a translation model. 3 | ''' 4 | import argparse 5 | 6 | import subprocess 7 | import re 8 | import os 9 | 10 | 11 | def getBLEU(): 12 | 13 | return 0 14 | 15 | 16 | 17 | def main(bleu_scrip, 18 | valid_datasets=['../data/dev/newstest2011.en.tok', 19 | '../data/dev/newstest2011.fr.tok', 20 | '../data/dev/newstest2011.fr.tok'], 21 | length=10): 22 | 23 | len_chose = [10, 20, 30, 40, 50] 24 | 25 | 26 | source_file = open(valid_datasets[0], 'r') 27 | 28 | target_file = open(valid_datasets[1], 'r') 29 | 30 | reference0_file = open(valid_datasets[2]+'0', 'r') 31 | reference1_file = open(valid_datasets[2]+'1', 'r') 32 | reference2_file = open(valid_datasets[2]+'2', 'r') 33 | reference3_file = open(valid_datasets[2]+'3', 'r') 34 | 35 | source_sents = source_file.readlines() 36 | target_sents = target_file.readlines() 37 | reference0_sents = reference0_file.readlines() 38 | reference1_sents = reference1_file.readlines() 39 | reference2_sents = reference2_file.readlines() 40 | reference3_sents = reference3_file.readlines() 41 | 42 | idx_set=[ [], [], [], [], [], [] ] 43 | 44 | target_set = [ [] ] * len(idx_set) 45 | r0_set = [ [] ] * len(idx_set) 46 | r1_set = [ [] ] * len(idx_set) 47 | r2_set = [ [] ] * len(idx_set) 48 | r3_set = [ [] ] * len(idx_set) 49 | 50 | 51 | 52 | for idx, sent in enumerate(source_sents): 53 | tokens = sent.strip().split() 54 | l = len(tokens) 55 | # print l 56 | 57 | if l <= len_chose[0]: 58 | idx_set[0].append(idx) 59 | # print '< 10', l, idx 60 | elif l <= len_chose[1]: 61 | idx_set[1].append(idx) 62 | elif l <= len_chose[2]: 63 | idx_set[2].append(idx) 64 | elif l <= len_chose[3]: 65 | idx_set[3].append(idx) 66 | elif l <= len_chose[4]: 67 | idx_set[4].append(idx) 68 | else: 69 | idx_set[5].append(idx) 70 | 71 | 72 | 73 | 74 | # get the filter of the sentences 75 | for i in range(6): 76 | 77 | # print idx_set[i] 78 | target_set[i] = [target_sents[k].strip() for k in idx_set[i]] 79 | r0_set[i] = [reference0_sents[k].strip() for k in idx_set[i]] 80 | r1_set[i] = [reference1_sents[k].strip() for k in idx_set[i]] 81 | r2_set[i] = [reference2_sents[k].strip() for k in idx_set[i]] 82 | r3_set[i] = [reference3_sents[k].strip() for k in idx_set[i]] 83 | 84 | with open('./translate.tmp', 'w') as f: 85 | print >> f , '\n'.join(target_set[i]) 86 | 87 | with open('./tmp.reference0', 'w') as f: 88 | print >> f, '\n'.join(r0_set[i]) 89 | 90 | with open('./tmp.reference1', 'w') as f: 91 | print >> f, '\n'.join(r1_set[i]) 92 | 93 | with open('./tmp.reference2', 'w') as f: 94 | print >> f, '\n'.join(r2_set[i]) 95 | 96 | with open('./tmp.reference3', 'w') as f: 97 | print >> f, '\n'.join(r3_set[i]) 98 | 99 | 100 | cmd_bleu_cmd = ['perl', bleu_scrip, \ 101 | './tmp.reference', \ 102 | '<', \ 103 | 
'./translate.tmp', \ 104 | '>', \ 105 | './output.eva'] 106 | 107 | subprocess.check_call(" ".join(cmd_bleu_cmd), shell=True) 108 | 109 | fin = open('./output.eva', 'rU') 110 | out = re.search('BLEU = [-.0-9]+', fin.readlines()[0]) 111 | fin.close() 112 | 113 | bleu_score = float(out.group()[7:]) 114 | 115 | if i < len(len_chose): 116 | print '### Len <= ', len_chose[i] 117 | else: 118 | print '### Len > ', len_chose[i-1] 119 | print '### BLEU:', bleu_score, 'total: ', len(target_set[i]) 120 | 121 | 122 | 123 | if __name__ == "__main__": 124 | parser = argparse.ArgumentParser() 125 | parser.add_argument('-length', type=int, default=10) 126 | parser.add_argument('bleu_scrip', type=str) 127 | parser.add_argument('valid_source', type=str) 128 | parser.add_argument('valid_target', type=str) 129 | parser.add_argument('valid_reference', type=str) 130 | 131 | 132 | args = parser.parse_args() 133 | 134 | main(args.bleu_scrip, 135 | valid_datasets=[args.valid_source, args.valid_target, args.valid_reference], 136 | length=args.length) 137 | 138 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Chunk-Based Bi-Scaled Decoder for Neural Machine Translation 2 | 3 | ------ 4 | 5 | This is the code for the paper "Chunk-Based Bi-Scaled Decoder for Neural Machine Translation". 6 | 7 | The chunk-based neural machine translation system is built on session 2 of [dl4mt-tutorial](https://github.com/nyu-dl/dl4mt-tutorial), which is an attention-based encoder-decoder machine translation model. 8 | 9 | The main difference between our proposed model and dl4mt is that we use a bi-scaled decoder to leverage target-side phrase information for better translation, and we propose a phrase attention mechanism for phrase-level soft alignments. 10 | 11 | ## Required Software 12 | * Python 2.7 13 | * [Theano](http://deeplearning.net/software/theano/) 14 | 15 | ## Training 16 | 17 | export THEANO_FLAGS=device=gpu2,floatX=float32 18 | python ./train_nmt_zh2en.py 19 | 20 | ## Evaluation 21 | 22 | export THEANO_FLAGS=device=gpu2,floatX=float32 23 | datadir=/home/zhouh/Data/nmt 24 | modeldir=./ 25 | 26 | python ./translate_gpu.py -n -jointProb \ 27 | $modeldir/model_hal.iter.npz \ 28 | $modeldir/model_hal.npz.pkl \ 29 | $datadir/hms.ch.filter.pkl \ 30 | $datadir/hms.en.filter.chunked.pkl \ 31 | $datadir/devntest/MT0${i}/MT0${i}.src \ 32 | ./test.result.chunk.${i} 33 | 34 | 35 | 36 | ------ 37 | 38 | 39 | [1]: Hao Zhou, Zhaopeng Tu, Shujian Huang, Xiaohua Liu, Hang Li and Jiajun Chen. Chunk-based Bi-Scale Decoder for Neural Machine Translation. In Proceedings of ACL 2017, short paper.
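A note on length-bucketed scoring (added commentary; `BLEUbyLength.py` above implements it): the decoded output and its references are first split into source-length buckets, and each bucket is scored separately with the BLEU script. A minimal sketch of the bucketing step, assuming one tokenized sentence per line (the function name is illustrative, not part of this repository):

    def bucket_by_source_length(source_lines, bounds=(10, 20, 30, 40, 50)):
        # one bucket of sentence indices per upper bound, plus an overflow bucket
        buckets = [[] for _ in range(len(bounds) + 1)]
        for idx, line in enumerate(source_lines):
            length = len(line.strip().split())
            for b, bound in enumerate(bounds):
                if length <= bound:
                    buckets[b].append(idx)
                    break
            else:
                buckets[-1].append(idx)
        return buckets

Each index bucket then selects the matching hypothesis and reference lines, which are written to temporary files and passed through the same `perl` BLEU-script call that `BLEUbyLength.py` builds above.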
-------------------------------------------------------------------------------- /baseline.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Build a neural machine translation model with soft attention 3 | ''' 4 | import theano 5 | import theano.tensor as tensor 6 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 7 | 8 | import cPickle as pkl 9 | import ipdb 10 | import numpy 11 | import copy 12 | 13 | import os 14 | import warnings 15 | import sys 16 | import time 17 | 18 | from collections import OrderedDict 19 | 20 | from training_data_iterator import TrainingTextIterator 21 | from data_iterator import TextIterator 22 | 23 | 24 | profile = False 25 | 26 | 27 | # push parameters to Theano shared variables 28 | def zipp(params, tparams): 29 | for kk, vv in params.iteritems(): 30 | tparams[kk].set_value(vv) 31 | 32 | 33 | # pull parameters from Theano shared variables 34 | def unzip(zipped): 35 | new_params = OrderedDict() 36 | for kk, vv in zipped.iteritems(): 37 | new_params[kk] = vv.get_value() 38 | return new_params 39 | 40 | 41 | # get the list of parameters: Note that tparams must be OrderedDict 42 | def itemlist(tparams): 43 | return [vv for kk, vv in tparams.iteritems()] 44 | 45 | 46 | # dropout 47 | def dropout_layer(state_before, use_noise, trng): 48 | proj = tensor.switch( 49 | use_noise, 50 | state_before * trng.binomial(state_before.shape, p=0.5, n=1, 51 | dtype=state_before.dtype), 52 | state_before * 0.5) 53 | return proj 54 | 55 | 56 | # make prefix-appended name 57 | def _p(pp, name): 58 | return '%s_%s' % (pp, name) 59 | 60 | 61 | # initialize Theano shared variables according to the initial parameters 62 | def init_tparams(params): 63 | tparams = OrderedDict() 64 | for kk, pp in params.iteritems(): 65 | tparams[kk] = theano.shared(params[kk], name=kk) 66 | return tparams 67 | 68 | 69 | # load parameters 70 | def load_params(path, params): 71 | pp = numpy.load(path) 72 | for kk, vv in params.iteritems(): 73 | if kk not in pp: 74 | warnings.warn('%s is not in the archive' % kk) 75 | continue 76 | params[kk] = pp[kk] 77 | 78 | return params 79 | 80 | # layers: 'name': ('parameter initializer', 'feedforward') 81 | layers = {'ff': ('param_init_fflayer', 'fflayer'), 82 | 'gru': ('param_init_gru', 'gru_layer'), 83 | 'gru_cond': ('param_init_gru_cond', 'gru_cond_layer'), 84 | } 85 | 86 | 87 | def get_layer(name): 88 | fns = layers[name] 89 | return (eval(fns[0]), eval(fns[1])) 90 | 91 | 92 | # some utilities 93 | def ortho_weight(ndim): 94 | W = numpy.random.randn(ndim, ndim) 95 | u, s, v = numpy.linalg.svd(W) 96 | return u.astype('float32') 97 | 98 | 99 | def norm_weight(nin, nout=None, scale=0.01, ortho=True): 100 | if nout is None: 101 | nout = nin 102 | if nout == nin and ortho: 103 | W = ortho_weight(nin) 104 | else: 105 | W = scale * numpy.random.randn(nin, nout) 106 | return W.astype('float32') 107 | 108 | 109 | def get_tensor_weight(n, nin, nout, scale=0.01): 110 | 111 | W = scale * numpy.random.randn(n, nin, nout) 112 | return W.astype('float32') 113 | 114 | 115 | def tanh(x): 116 | return tensor.tanh(x) 117 | 118 | 119 | def linear(x): 120 | return x 121 | 122 | 123 | def concatenate(tensor_list, axis=0): 124 | """ 125 | Alternative implementation of `theano.tensor.concatenate`. 126 | This function does exactly the same thing, but contrary to Theano's own 127 | implementation, the gradient is implemented on the GPU. 
128 | Backpropagating through `theano.tensor.concatenate` yields slowdowns 129 | because the inverse operation (splitting) needs to be done on the CPU. 130 | This implementation does not have that problem. 131 | :usage: 132 | >>> x, y = theano.tensor.matrices('x', 'y') 133 | >>> c = concatenate([x, y], axis=1) 134 | :parameters: 135 | - tensor_list : list 136 | list of Theano tensor expressions that should be concatenated. 137 | - axis : int 138 | the tensors will be joined along this axis. 139 | :returns: 140 | - out : tensor 141 | the concatenated tensor expression. 142 | """ 143 | concat_size = sum(tt.shape[axis] for tt in tensor_list) 144 | 145 | output_shape = () 146 | for k in range(axis): 147 | output_shape += (tensor_list[0].shape[k],) 148 | output_shape += (concat_size,) 149 | for k in range(axis + 1, tensor_list[0].ndim): 150 | output_shape += (tensor_list[0].shape[k],) 151 | 152 | out = tensor.zeros(output_shape) 153 | offset = 0 154 | for tt in tensor_list: 155 | indices = () 156 | for k in range(axis): 157 | indices += (slice(None),) 158 | indices += (slice(offset, offset + tt.shape[axis]),) 159 | for k in range(axis + 1, tensor_list[0].ndim): 160 | indices += (slice(None),) 161 | 162 | out = tensor.set_subtensor(out[indices], tt) 163 | offset += tt.shape[axis] 164 | 165 | return out 166 | 167 | 168 | 169 | # batch preparation 170 | def prepare_training_data(seqs_x, seqs_y_c, seqs_y_cw, maxlen_chunk=None, maxlen_cw=None, n_words_src=30000, 171 | n_words=30000): 172 | # x: a list of sentences 173 | lengths_x = [len(s) for s in seqs_x] 174 | lengths_y = [ len(s) for s in seqs_y_cw] 175 | 176 | n_samples = len(seqs_x) 177 | maxlen_x = numpy.max(lengths_x) + 1 178 | maxlen_y = numpy.max(lengths_y) + 1 179 | 180 | x = numpy.zeros((maxlen_x, n_samples)).astype('int64') 181 | y_c = numpy.zeros((maxlen_y, n_samples)).astype('int64') 182 | y_cw = numpy.zeros((maxlen_y, n_samples)).astype('int64') 183 | x_mask = numpy.zeros((maxlen_x, n_samples)).astype('float32') 184 | y_mask = numpy.zeros((maxlen_y, n_samples)).astype('float32') 185 | chunk_indicator = numpy.zeros((maxlen_y, n_samples)).astype('float32') 186 | 187 | for idx, [s_x, s_y_c, s_y_cw] in enumerate(zip(seqs_x, seqs_y_c, seqs_y_cw)): 188 | x[:lengths_x[idx], idx] = s_x 189 | x_mask[:lengths_x[idx]+1, idx] = 1. 190 | # print 'yc', y_c 191 | # print 'shape yc', y_c.shape 192 | # print 'idx', idx 193 | # print 'max', maxlen_y[idx] 194 | # print 'syc', s_y_c 195 | # print 'shape syc', s_y_c.shape 196 | y_c[:lengths_y[idx], idx] = s_y_c 197 | y_cw[:lengths_y[idx], idx] = s_y_cw 198 | y_mask[:lengths_y[idx]+1, idx] = 1. 
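# Added commentary (not part of the original file): for a toy batch whose two
# target chunk-tag sequences are [2, 1, 3] and [2, 1] (with word sequences of
# the same lengths), maxlen_y is 3 + 1 = 4, so column 0 of y_c is padded to
# [2, 1, 3, 0] and column 1 to [2, 1, 0, 0], while y_mask marks the real
# positions plus one trailing slot. The list comprehension below then sets
# chunk_indicator to 1 exactly where a new chunk starts (column 0 becomes
# [1, 0, 1, 0]), assuming, as the comprehension implies, that tag 1 marks a
# word that continues the current chunk.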
199 | 200 | 201 | indicator_mask = [1 if cc != 1 else 0 for cc in s_y_c] 202 | 203 | # indicator here is a chunk begin or not (1 if True ) 204 | chunk_indicator[:lengths_y[idx], idx] = indicator_mask 205 | 206 | 207 | 208 | # print y_cw 209 | 210 | return x, x_mask, y_c, y_cw, chunk_indicator, y_mask 211 | 212 | # batch preparation 213 | def prepare_data(seqs_x, seqs_y, maxlen=None, n_words_src=30000, 214 | n_words=30000): 215 | # x: a list of sentences 216 | lengths_x = [len(s) for s in seqs_x] 217 | lengths_y = [len(s) for s in seqs_y] 218 | 219 | if maxlen is not None: 220 | new_seqs_x = [] 221 | new_seqs_y = [] 222 | new_lengths_x = [] 223 | new_lengths_y = [] 224 | for l_x, s_x, l_y, s_y in zip(lengths_x, seqs_x, lengths_y, seqs_y): 225 | if l_x < maxlen and l_y < maxlen: 226 | new_seqs_x.append(s_x) 227 | new_lengths_x.append(l_x) 228 | new_seqs_y.append(s_y) 229 | new_lengths_y.append(l_y) 230 | lengths_x = new_lengths_x 231 | seqs_x = new_seqs_x 232 | lengths_y = new_lengths_y 233 | seqs_y = new_seqs_y 234 | 235 | if len(lengths_x) < 1 or len(lengths_y) < 1: 236 | return None, None, None, None 237 | 238 | n_samples = len(seqs_x) 239 | maxlen_x = numpy.max(lengths_x) + 1 240 | maxlen_y = numpy.max(lengths_y) + 1 241 | 242 | x = numpy.zeros((maxlen_x, n_samples)).astype('int64') 243 | y = numpy.zeros((maxlen_y, n_samples)).astype('int64') 244 | x_mask = numpy.zeros((maxlen_x, n_samples)).astype('float32') 245 | y_mask = numpy.zeros((maxlen_y, n_samples)).astype('float32') 246 | for idx, [s_x, s_y] in enumerate(zip(seqs_x, seqs_y)): 247 | x[:lengths_x[idx], idx] = s_x 248 | x_mask[:lengths_x[idx]+1, idx] = 1. 249 | y[:lengths_y[idx], idx] = s_y 250 | y_mask[:lengths_y[idx]+1, idx] = 1. 251 | 252 | return x, x_mask, y, y_mask 253 | 254 | 255 | # feedforward layer: affine transformation + point-wise nonlinearity 256 | def param_init_fflayer(options, params, prefix='ff', nin=None, nout=None, 257 | ortho=True): 258 | if nin is None: 259 | nin = options['dim_proj'] 260 | if nout is None: 261 | nout = options['dim_proj'] 262 | params[_p(prefix, 'W')] = norm_weight(nin, nout, scale=0.01, ortho=ortho) 263 | params[_p(prefix, 'b')] = numpy.zeros((nout,)).astype('float32') 264 | 265 | return params 266 | 267 | 268 | def fflayer(tparams, state_below, options, prefix='rconv', 269 | activ='lambda x: tensor.tanh(x)', **kwargs): 270 | return eval(activ)( 271 | tensor.dot(state_below, tparams[_p(prefix, 'W')]) + 272 | tparams[_p(prefix, 'b')]) 273 | 274 | 275 | # GRU layer 276 | def param_init_gru(options, params, prefix='gru', nin=None, dim=None): 277 | if nin is None: 278 | nin = options['dim_proj'] 279 | if dim is None: 280 | dim = options['dim_proj'] 281 | 282 | # embedding to gates transformation weights, biases 283 | W = numpy.concatenate([norm_weight(nin, dim), 284 | norm_weight(nin, dim)], axis=1) 285 | params[_p(prefix, 'W')] = W 286 | params[_p(prefix, 'b')] = numpy.zeros((2 * dim,)).astype('float32') 287 | 288 | # recurrent transformation weights for gates 289 | U = numpy.concatenate([ortho_weight(dim), 290 | ortho_weight(dim)], axis=1) 291 | params[_p(prefix, 'U')] = U 292 | 293 | # embedding to hidden state proposal weights, biases 294 | Wx = norm_weight(nin, dim) 295 | params[_p(prefix, 'Wx')] = Wx 296 | params[_p(prefix, 'bx')] = numpy.zeros((dim,)).astype('float32') 297 | 298 | # recurrent transformation weights for hidden state proposal 299 | Ux = ortho_weight(dim) 300 | params[_p(prefix, 'Ux')] = Ux 301 | 302 | return params 303 | 304 | 305 | def gru_layer(tparams, state_below, 
options, prefix='gru', mask=None, 306 | **kwargs): 307 | nsteps = state_below.shape[0] 308 | if state_below.ndim == 3: 309 | n_samples = state_below.shape[1] 310 | else: 311 | n_samples = 1 312 | 313 | dim = tparams[_p(prefix, 'Ux')].shape[1] 314 | 315 | if mask is None: 316 | mask = tensor.alloc(1., state_below.shape[0], 1) 317 | 318 | # utility function to slice a tensor 319 | def _slice(_x, n, dim): 320 | if _x.ndim == 3: 321 | return _x[:, :, n*dim:(n+1)*dim] 322 | return _x[:, n*dim:(n+1)*dim] 323 | 324 | # state_below is the input word embeddings 325 | # input to the gates, concatenated 326 | state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + \ 327 | tparams[_p(prefix, 'b')] 328 | # input to compute the hidden state proposal 329 | state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + \ 330 | tparams[_p(prefix, 'bx')] 331 | 332 | # step function to be used by scan 333 | # arguments | sequences |outputs-info| non-seqs 334 | def _step_slice(m_, x_, xx_, h_, U, Ux): 335 | preact = tensor.dot(h_, U) 336 | preact += x_ 337 | 338 | # reset and update gates 339 | r = tensor.nnet.sigmoid(_slice(preact, 0, dim)) 340 | u = tensor.nnet.sigmoid(_slice(preact, 1, dim)) 341 | 342 | # compute the hidden state proposal 343 | preactx = tensor.dot(h_, Ux) 344 | preactx = preactx * r 345 | preactx = preactx + xx_ 346 | 347 | # hidden state proposal 348 | h = tensor.tanh(preactx) 349 | 350 | # leaky integrate and obtain next hidden state 351 | h = u * h_ + (1. - u) * h 352 | h = m_[:, None] * h + (1. - m_)[:, None] * h_ 353 | 354 | return h 355 | 356 | # prepare scan arguments 357 | seqs = [mask, state_below_, state_belowx] 358 | init_states = [tensor.alloc(0., n_samples, dim)] 359 | _step = _step_slice 360 | shared_vars = [tparams[_p(prefix, 'U')], 361 | tparams[_p(prefix, 'Ux')]] 362 | 363 | rval, updates = theano.scan(_step, 364 | sequences=seqs, 365 | outputs_info=init_states, 366 | non_sequences=shared_vars, 367 | name=_p(prefix, '_layers'), 368 | n_steps=nsteps, 369 | profile=profile, 370 | strict=True) 371 | rval = [rval] 372 | return rval 373 | 374 | 375 | # Conditional GRU layer with Attention 376 | def param_init_gru_cond(options, params, prefix='gru_cond', 377 | nin=None, dim=None, dimctx=None, 378 | nin_nonlin=None, dim_nonlin=None, nin_chunk=None, dim_chunk_hidden=None, nin_nonlin_chunk=None): 379 | 380 | 381 | if nin is None: 382 | nin = options['dim'] 383 | if dim is None: 384 | dim = options['dim'] 385 | if dimctx is None: 386 | dimctx = options['dim'] 387 | if nin_nonlin is None: 388 | nin_nonlin = nin 389 | if dim_nonlin is None: 390 | dim_nonlin = dim 391 | if nin_chunk is None: 392 | nin_chunk = nin 393 | if nin_nonlin_chunk is None: 394 | nin_nonlin_chunk = nin_chunk 395 | if dim_chunk_hidden is None: 396 | dim_chunk_hidden = dim 397 | 398 | 399 | chunk_label_num = options['n_chunks'] 400 | 401 | W = numpy.concatenate([norm_weight(nin, dim), 402 | norm_weight(nin, dim)], axis=1) 403 | params[_p(prefix, 'W')] = W 404 | params[_p(prefix, 'b')] = numpy.zeros((2 * dim,)).astype('float32') 405 | U = numpy.concatenate([ortho_weight(dim_nonlin), 406 | ortho_weight(dim_nonlin)], axis=1) 407 | params[_p(prefix, 'U')] = U 408 | 409 | Wx = norm_weight(nin_nonlin, dim_nonlin) 410 | params[_p(prefix, 'Wx')] = Wx 411 | 412 | W_use_current_chunk = norm_weight(dim_chunk_hidden, dim) # TODO the dimention here need to be careful 413 | params[_p(prefix, 'W_use_current_chunk')] = W_use_current_chunk 414 | 415 | 416 | W_current_chunk_c = norm_weight(dim_chunk_hidden, dim * 
2) 417 | params[_p(prefix, 'W_current_chunk_c')] = W_current_chunk_c 418 | 419 | 420 | Ux = ortho_weight(dim_nonlin) 421 | params[_p(prefix, 'Ux')] = Ux 422 | params[_p(prefix, 'bx')] = numpy.zeros((dim_nonlin,)).astype('float32') 423 | 424 | U_nl = numpy.concatenate([ortho_weight(dim_nonlin), 425 | ortho_weight(dim_nonlin)], axis=1) 426 | params[_p(prefix, 'U_nl')] = U_nl 427 | params[_p(prefix, 'b_nl')] = numpy.zeros((2 * dim_nonlin,)).astype('float32') 428 | 429 | Ux_nl = ortho_weight(dim_nonlin) 430 | params[_p(prefix, 'Ux_nl')] = Ux_nl 431 | params[_p(prefix, 'bx_nl')] = numpy.zeros((dim_nonlin,)).astype('float32') 432 | 433 | # context to LSTM 434 | Wc = norm_weight(dimctx, dim*2) 435 | params[_p(prefix, 'Wc')] = Wc 436 | 437 | Wcx = norm_weight(dimctx, dim) 438 | params[_p(prefix, 'Wcx')] = Wcx 439 | 440 | # attention: combined -> hidden 441 | W_comb_att = norm_weight(dim, dimctx) 442 | params[_p(prefix, 'W_comb_att')] = W_comb_att 443 | 444 | # attention: context -> hidden 445 | Wc_att = norm_weight(dimctx) 446 | params[_p(prefix, 'Wc_att')] = Wc_att 447 | 448 | 449 | # attention: combined -> hidden 450 | W_cu_chunk_att = norm_weight(dim_chunk_hidden, dimctx) 451 | params[_p(prefix, 'W_cu_chunk_att')] = W_cu_chunk_att 452 | 453 | 454 | # attention: hidden bias 455 | b_att = numpy.zeros((dimctx,)).astype('float32') 456 | params[_p(prefix, 'b_att')] = b_att 457 | 458 | # attention: 459 | U_att = norm_weight(dimctx, 1) 460 | params[_p(prefix, 'U_att')] = U_att 461 | c_att = numpy.zeros((1,)).astype('float32') 462 | params[_p(prefix, 'c_tt')] = c_att 463 | 464 | 465 | # new the chunking parameters 466 | 467 | 468 | params[_p(prefix, 'chunk_transform_matrix')] = get_tensor_weight(chunk_label_num, dim_nonlin, nin_chunk) 469 | 470 | 471 | W_chunk = numpy.concatenate([norm_weight(nin_chunk, dim_chunk_hidden), 472 | norm_weight(nin_chunk, dim_chunk_hidden)], axis=1) # nin * 2 dim 473 | params[_p(prefix, 'W_chunk')] = W_chunk 474 | params[_p(prefix, 'b_chunk')] = numpy.zeros((2 * dim_chunk_hidden,)).astype('float32') 475 | 476 | U_chunk = numpy.concatenate([ortho_weight(dim_chunk_hidden), 477 | ortho_weight(dim_chunk_hidden)], axis=1) 478 | params[_p(prefix, 'U_chunk')] = U_chunk 479 | 480 | Wx_chunk = norm_weight(nin_nonlin_chunk, dim_chunk_hidden) 481 | params[_p(prefix, 'Wx_chunk')] = Wx_chunk 482 | Ux_chunk = ortho_weight(dim_chunk_hidden) 483 | params[_p(prefix, 'Ux_chunk')] = Ux_chunk 484 | params[_p(prefix, 'bx_chunk')] = numpy.zeros((dim_chunk_hidden,)).astype('float32') 485 | 486 | U_nl_chunk = numpy.concatenate([ortho_weight(dim_chunk_hidden), 487 | ortho_weight(dim_chunk_hidden)], axis=1) 488 | params[_p(prefix, 'U_nl_chunk')] = U_nl_chunk 489 | params[_p(prefix, 'b_nl_chunk')] = numpy.zeros((2 * dim_chunk_hidden,)).astype('float32') 490 | 491 | Ux_nl_chunk = ortho_weight(dim_chunk_hidden) 492 | params[_p(prefix, 'Ux_nl_chunk')] = Ux_nl_chunk 493 | params[_p(prefix, 'bx_nl_chunk')] = numpy.zeros((dim_chunk_hidden,)).astype('float32') 494 | 495 | # context to LSTM 496 | Wc_chunk = norm_weight(dimctx, dim_chunk_hidden*2) 497 | params[_p(prefix, 'Wc_chunk')] = Wc_chunk 498 | 499 | Wcx_chunk = norm_weight(dimctx, dim_chunk_hidden) 500 | params[_p(prefix, 'Wcx_chunk')] = Wcx_chunk 501 | 502 | # attention: combined -> hidden 503 | W_comb_att_chunk = norm_weight(dim_chunk_hidden, dimctx) 504 | params[_p(prefix, 'W_comb_att_chunk')] = W_comb_att_chunk 505 | 506 | # attention: context -> hidden 507 | Wc_att_chunk = norm_weight(dimctx) 508 | params[_p(prefix, 'Wc_att_chunk')] = Wc_att_chunk 
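# Added aside (not part of the original file): the parameters created in this
# function come in two parallel sets, the word-scale weights (W, U, Wx, Ux, the
# *_nl second-stage weights and the attention weights) and the *_chunk copies
# sized with dim_chunk_hidden, which is what makes the decoder bi-scale. Both
# sets drive the same GRU update that gru_layer above implements; a minimal
# NumPy sketch of one such step, with illustrative names, would be:
#
#     def gru_step(h_prev, x_gates, x_cand, U, Ux):
#         # x_gates = x.W + b and x_cand = x.Wx + bx, precomputed outside scan
#         dim = h_prev.shape[-1]
#         preact = 1.0 / (1.0 + numpy.exp(-(numpy.dot(h_prev, U) + x_gates)))
#         r, u = preact[:, :dim], preact[:, dim:]  # reset and update gates
#         h_tilde = numpy.tanh(r * numpy.dot(h_prev, Ux) + x_cand)  # proposal
#         return u * h_prev + (1.0 - u) * h_tilde  # leaky integration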
509 | 510 | # attention: hidden bias 511 | b_att_chunk = numpy.zeros((dimctx,)).astype('float32') 512 | params[_p(prefix, 'b_att_chunk')] = b_att_chunk 513 | 514 | # attention: 515 | U_att_chunk = norm_weight(dimctx, 1) 516 | params[_p(prefix, 'U_att_chunk')] = U_att_chunk 517 | c_att_chunk = numpy.zeros((1,)).astype('float32') 518 | params[_p(prefix, 'c_tt_chunk')] = c_att_chunk 519 | 520 | 521 | 522 | return params 523 | 524 | 525 | def gru_cond_layer(tparams, emb, chunk_index, options, prefix='gru', 526 | mask=None, chunk_boundary_indicator=None, context=None, 527 | one_step=False, one_step_chunk=False, one_step_word=False, 528 | init_state_chunk=None,init_state_chunk_words=None, 529 | current_chunk_hidden=None,last_chunk_end_word_hidden1=None, current_word_hidden1=None, 530 | context_mask=None, **kwargs): 531 | 532 | 533 | assert context, 'Context must be provided' 534 | 535 | 536 | # nsteps = chunk_index.shape[0] 537 | 538 | if chunk_index is not None: 539 | nsteps = chunk_index.shape[0] 540 | 541 | if one_step_chunk: 542 | assert init_state_chunk, 'previous state must be provided' 543 | assert init_state_chunk_words, 'previous state must be provided' 544 | 545 | # if this is a sample or decode process, we may use a sample = 1 predict 546 | if emb is not None: 547 | if emb.ndim == 3: 548 | n_samples = emb.shape[1] 549 | else: 550 | n_samples = emb.shape[0] 551 | else: 552 | n_samples = current_word_hidden1.shape[0] 553 | 554 | 555 | # the hidden dim 556 | dim = tparams[_p(prefix, 'Wcx')].shape[1] 557 | 558 | # chunk hidden dim 559 | chunk_hidden_dim = tparams[_p(prefix, 'Wcx_chunk')].shape[1] 560 | 561 | # if mask is None, it is the sample process 562 | if mask is None: 563 | mask = tensor.alloc(1., n_samples, 1) 564 | 565 | # initial/previous state 566 | if init_state_chunk is None: 567 | init_state_chunk = tensor.alloc(0., n_samples, chunk_hidden_dim) 568 | if init_state_chunk_words is None: 569 | init_state_chunk_words = tensor.alloc(0., n_samples, dim) 570 | 571 | # projected context 572 | assert context.ndim == 3, \ 573 | 'Context must be 3-d: #annotation x #sample x dim' 574 | 575 | def _slice(_x, n, dim): 576 | if _x.ndim == 3: 577 | return _x[:, :, n*dim:(n+1)*dim] 578 | return _x[:, n*dim:(n+1)*dim] 579 | 580 | # 581 | # chunking prediction 582 | # 583 | # We have to firstly compute the word hidden 1 and then compute the 584 | # chunk hidden given the word hidden 1 585 | def predict_chunk( m_, cw_x_, cw_xx_, h_chunk, 586 | h_cw, h1_last_chunk_end_word, 587 | # non sequences 588 | pctx_chunk, cc, 589 | chunk_transform_matrix, U_chunk, Wc_chunk, W_comb_att_chunk, U_att_chunk, c_tt_chunk, 590 | Ux_chunk, Wcx_chunk, U_nl_chunk, Ux_nl_chunk, b_nl_chunk, bx_nl_chunk, Wx_chunk, bx_chunk, 591 | W_chunk, b_chunk, 592 | W_current_chunk_hidden, W_current_chunk_c, U, Wc, W_comb_att, W_cu_chunk_att, U_att, c_tt, Ux, Wcx, U_nl, Ux_nl, 593 | b_nl, bx_nl): 594 | 595 | 596 | # 597 | # incorporate the last words into word hidden 1 : h1 598 | # 599 | preact1 = tensor.dot(h_cw, U) 600 | preact1 += cw_x_ 601 | preact1 = tensor.nnet.sigmoid(preact1) 602 | 603 | r1 = _slice(preact1, 0, dim) 604 | u1 = _slice(preact1, 1, dim) 605 | 606 | preactx1 = tensor.dot(h_cw, Ux) 607 | preactx1 *= r1 608 | preactx1 += cw_xx_ 609 | 610 | h1 = tensor.tanh(preactx1) 611 | 612 | h1 = u1 * h_cw + (1. - u1) * h1 613 | h1 = m_[:, None] * h1 + (1. 
- m_)[:, None] * h_cw 614 | 615 | ret_word_hidden1 = h1 616 | 617 | ########### end compute word h1 618 | 619 | # 620 | # compute the chunk embedding ###################### 621 | # 622 | last_chunk_emb = h1 - h1_last_chunk_end_word 623 | 624 | 625 | transform = chunk_transform_matrix[0] 626 | last_chunk_emb = tensor.dot(last_chunk_emb, transform) # TODO, make sure that here the chunkindex is last chunk index 627 | 628 | 629 | # 630 | # compute the current chunk hidden 631 | # 632 | chunk_xx_ = tensor.dot(last_chunk_emb, Wx_chunk) + \ 633 | bx_chunk 634 | chunk_x_ = tensor.dot(last_chunk_emb, W_chunk) + \ 635 | b_chunk 636 | 637 | 638 | preact1 = tensor.dot(h_chunk, U_chunk) 639 | preact1 += chunk_x_ 640 | preact1 = tensor.nnet.sigmoid(preact1) 641 | 642 | r1 = _slice(preact1, 0, chunk_hidden_dim) 643 | u1 = _slice(preact1, 1, chunk_hidden_dim) 644 | 645 | preactx1 = tensor.dot(h_chunk, Ux_chunk) 646 | preactx1 *= r1 647 | preactx1 += chunk_xx_ 648 | 649 | h1 = tensor.tanh(preactx1) 650 | 651 | 652 | h1 = u1 * h_chunk + (1. - u1) * h1 653 | 654 | 655 | h1 = m_[:, None] * h1 + (1. - m_)[:, None] * h_chunk 656 | 657 | # 658 | # attention 659 | # 660 | pstate_ = tensor.dot(h1, W_comb_att_chunk) 661 | pctx__ = pctx_chunk + pstate_[None, :, :] 662 | #pctx__ += xc_ 663 | pctx__ = tensor.tanh(pctx__) 664 | alpha = tensor.dot(pctx__, U_att_chunk)+c_tt_chunk 665 | alpha = alpha.reshape([alpha.shape[0], alpha.shape[1]]) 666 | alpha = tensor.exp(alpha) 667 | if context_mask: 668 | alpha = alpha * context_mask 669 | alpha = alpha / alpha.sum(0, keepdims=True) 670 | ctx_ = (cc * alpha[:, :, None]).sum(0) # current context 671 | 672 | 673 | preact2 = tensor.dot(h1, U_nl_chunk)+b_nl_chunk 674 | preact2 += tensor.dot(ctx_, Wc_chunk) 675 | preact2 = tensor.nnet.sigmoid(preact2) 676 | 677 | r2 = _slice(preact2, 0, chunk_hidden_dim) 678 | u2 = _slice(preact2, 1, chunk_hidden_dim) 679 | 680 | preactx2 = tensor.dot(h1, Ux_nl_chunk)+bx_nl_chunk 681 | preactx2 *= r2 682 | preactx2 += tensor.dot(ctx_, Wcx_chunk) 683 | 684 | h2 = tensor.tanh(preactx2) 685 | 686 | h2 = u2 * h1 + (1. - u2) * h2 687 | h2 = m_[:, None] * h2 + (1. 
- m_)[:, None] * h1 688 | 689 | chunk_hidden2 = h2 690 | chunk_ctx = ctx_ 691 | chunk_alpha = alpha.T 692 | 693 | 694 | 695 | return ret_word_hidden1, last_chunk_emb, chunk_hidden2, chunk_ctx, chunk_alpha 696 | 697 | 698 | # 699 | # given word hidden1, chunk hidden, compute the 700 | # word hidden2 701 | # 702 | def predict_word_hidden2(m_, word_hidden1, chunk_hidden, 703 | pctx_, cc_, 704 | W_current_chunk_hidden, W_current_chunk_c, U, Wc, W_comb_att, 705 | W_cu_chunk_att, U_att, c_tt, Ux, Wcx, U_nl, Ux_nl, b_nl, bx_nl): 706 | 707 | 708 | m = tensor.alloc(0., chunk_hidden.shape[0], chunk_hidden.shape[1]) 709 | 710 | chunk_hidden = m * chunk_hidden 711 | # 712 | # given the word hidden1 and chunk hidden , compute 713 | # word attention 714 | pstate_ = tensor.dot(word_hidden1, W_comb_att) 715 | pstate_chunk = tensor.dot(chunk_hidden, W_cu_chunk_att) 716 | 717 | 718 | 719 | ################ 720 | # revise 721 | ################ 722 | pctx__ = pctx_ + pstate_[None, :, :] + pstate_chunk[None, :, :] 723 | # pctx__ = pctx_ + pstate_[None, :, :] 724 | pctx__ = tensor.tanh(pctx__) 725 | alpha = tensor.dot(pctx__, U_att)+c_tt 726 | alpha = alpha.reshape([alpha.shape[0], alpha.shape[1]]) 727 | alpha = tensor.exp(alpha) 728 | if context_mask: 729 | alpha = alpha * context_mask 730 | alpha = alpha / alpha.sum(0, keepdims=True) 731 | ctx_ = (cc_ * alpha[:, :, None]).sum(0) # current context 732 | 733 | 734 | 735 | preact2 = tensor.dot(word_hidden1, U_nl)+b_nl 736 | 737 | 738 | ################ 739 | # revise 740 | ################ 741 | preact2 += tensor.dot(ctx_, Wc) + tensor.dot(chunk_hidden, W_current_chunk_c) 742 | # preact2 += tensor.dot(ctx_, Wc) 743 | preact2 = tensor.nnet.sigmoid(preact2) 744 | 745 | r2 = _slice(preact2, 0, dim) 746 | u2 = _slice(preact2, 1, dim) 747 | 748 | preactx2 = tensor.dot(word_hidden1, Ux_nl)+bx_nl 749 | preactx2 *= r2 750 | 751 | 752 | ################ 753 | # revise 754 | ################ 755 | preactx2 += tensor.dot(ctx_, Wcx) + tensor.dot(chunk_hidden, W_current_chunk_hidden) # here we add current chunk representation 756 | # preactx2 += tensor.dot(ctx_, Wcx) # here we add current chunk representation 757 | 758 | 759 | h2 = tensor.tanh(preactx2) 760 | 761 | h2 = u2 * word_hidden1 + (1. - u2) * h2 762 | h2 = m_[:, None] * h2 + (1. 
- m_)[:, None] * word_hidden1 763 | 764 | return h2, ctx_, alpha.T # pstate_, preact, preactx, r, u 765 | 766 | 767 | 768 | def scan_step( # seq 769 | m_, chunk_boundary, cw_x_, cw_xx_, 770 | # outputs info 771 | h_chunk, position_chunk_hidden2, ctx_chunk, alpha_chunk, chunk_true, 772 | h_cw, position_h1, h1_last_chunk_end_word, ctx_cw, alpha_cw, 773 | # non sequences 774 | pctx_chunk, pctx_cw, cc, 775 | chunk_transform_matrix, U_chunk, Wc_chunk, W_comb_att_chunk, U_att_chunk, c_tt_chunk, 776 | Ux_chunk, Wcx_chunk, U_nl_chunk, Ux_nl_chunk, b_nl_chunk, bx_nl_chunk, Wx_chunk, bx_chunk, 777 | W_chunk, b_chunk, 778 | W_current_chunk_hidden, W_current_chunk_c, U, Wc, W_comb_att, W_cu_chunk_att, U_att, c_tt, Ux, Wcx, 779 | U_nl, Ux_nl, b_nl, bx_nl): 780 | 781 | 782 | word_hidden1, \ 783 | last_chunk_emb, \ 784 | current_position_hypo_chunk_hidden, \ 785 | chunk_ctx, chunk_alpha = \ 786 | predict_chunk(m_, cw_x_, cw_xx_, h_chunk, 787 | h_cw, h1_last_chunk_end_word, 788 | pctx_chunk, cc, 789 | chunk_transform_matrix, U_chunk, Wc_chunk, W_comb_att_chunk, U_att_chunk, c_tt_chunk, 790 | Ux_chunk, Wcx_chunk, U_nl_chunk, Ux_nl_chunk, b_nl_chunk, bx_nl_chunk, Wx_chunk, bx_chunk, 791 | W_chunk, b_chunk, 792 | W_current_chunk_hidden, W_current_chunk_c, U, Wc, W_comb_att, W_cu_chunk_att, U_att, c_tt, Ux, 793 | Wcx, U_nl, Ux_nl, b_nl, bx_nl) 794 | 795 | 796 | # 797 | # if current chunk indocator is 1, then this is a begin of a new chunk, 798 | # the chunk hidden is cuttent chunk hidde, otherwise, this word is still in the old chunk 799 | # the last chunk hidden will still be used. 800 | # 801 | chunk_hidden = chunk_boundary[:, None] * current_position_hypo_chunk_hidden \ 802 | + (1. - chunk_boundary)[:, None] * h_chunk 803 | 804 | h1_last_chunk_end_word = chunk_boundary[:, None] * word_hidden1 \ 805 | + (1. 
- chunk_boundary)[:, None] * h1_last_chunk_end_word 806 | 807 | 808 | word_hidden2, \ 809 | word_ctx_, \ 810 | word_alpha = predict_word_hidden2(m_, word_hidden1, chunk_hidden, 811 | pctx_cw, cc, 812 | W_current_chunk_hidden, W_current_chunk_c, U, Wc, W_comb_att, 813 | W_cu_chunk_att, U_att, c_tt, Ux, Wcx, U_nl, Ux_nl, b_nl, bx_nl) 814 | 815 | 816 | 817 | return chunk_hidden, current_position_hypo_chunk_hidden, chunk_ctx, chunk_alpha, last_chunk_emb, \ 818 | word_hidden2, word_hidden1, h1_last_chunk_end_word, word_ctx_, word_alpha 819 | 820 | 821 | 822 | _step = scan_step 823 | 824 | word_shared_vars = [tparams[_p(prefix, 'W_use_current_chunk')], 825 | tparams[_p(prefix, 'W_current_chunk_c')], 826 | tparams[_p(prefix, 'U')], 827 | tparams[_p(prefix, 'Wc')], 828 | tparams[_p(prefix, 'W_comb_att')], 829 | tparams[_p(prefix, 'W_cu_chunk_att')], 830 | tparams[_p(prefix, 'U_att')], 831 | tparams[_p(prefix, 'c_tt')], 832 | tparams[_p(prefix, 'Ux')], 833 | tparams[_p(prefix, 'Wcx')], 834 | tparams[_p(prefix, 'U_nl')], 835 | tparams[_p(prefix, 'Ux_nl')], 836 | tparams[_p(prefix, 'b_nl')], 837 | tparams[_p(prefix, 'bx_nl')]] 838 | 839 | chunk_shared_vars = [tparams[_p(prefix, 'chunk_transform_matrix')], 840 | tparams[_p(prefix, 'U_chunk')], 841 | tparams[_p(prefix, 'Wc_chunk')], 842 | tparams[_p(prefix, 'W_comb_att_chunk')], 843 | tparams[_p(prefix, 'U_att_chunk')], 844 | tparams[_p(prefix, 'c_tt_chunk')], 845 | tparams[_p(prefix, 'Ux_chunk')], 846 | tparams[_p(prefix, 'Wcx_chunk')], 847 | tparams[_p(prefix, 'U_nl_chunk')], 848 | tparams[_p(prefix, 'Ux_nl_chunk')], 849 | tparams[_p(prefix, 'b_nl_chunk')], 850 | tparams[_p(prefix, 'bx_nl_chunk')], 851 | tparams[_p(prefix, 'Wx_chunk')], 852 | tparams[_p(prefix, 'bx_chunk')], 853 | tparams[_p(prefix, 'W_chunk')], 854 | tparams[_p(prefix, 'b_chunk')]] 855 | 856 | # compute the word hidden1 and chunk hidden during sample 857 | if one_step_chunk: 858 | 859 | chunk_pctx_ = tensor.dot(context, tparams[_p(prefix, 'Wc_att_chunk')]) + \ 860 | tparams[_p(prefix, 'b_att_chunk')] 861 | 862 | 863 | # projected x 864 | state_belowx = tensor.dot(emb, tparams[_p(prefix, 'Wx')]) + \ 865 | tparams[_p(prefix, 'bx')] 866 | state_below_ = tensor.dot(emb, tparams[_p(prefix, 'W')]) + \ 867 | tparams[_p(prefix, 'b')] 868 | 869 | 870 | 871 | seqs = [mask, state_below_, state_belowx, init_state_chunk, 872 | init_state_chunk_words, last_chunk_end_word_hidden1] 873 | rval = predict_chunk(*(seqs + [chunk_pctx_, context] + 874 | chunk_shared_vars + word_shared_vars)) 875 | return rval[0], rval[1], rval[2], rval[3], rval[4], None, None, None, None, None 876 | # ret_word_hidden1, last_chunk_emb, chunk_hidden2, chunk_ctx, chunk_alpha 877 | 878 | # given the word hidden1 and chunk hidden, compute the word hidden 2 879 | elif one_step_word: 880 | 881 | # word pctx 882 | pctx_ = tensor.dot(context, tparams[_p(prefix, 'Wc_att')]) + \ 883 | tparams[_p(prefix, 'b_att')] 884 | 885 | seqs = [mask, current_word_hidden1, current_chunk_hidden, 886 | pctx_, context] 887 | 888 | rval = predict_word_hidden2(*(seqs + word_shared_vars )) 889 | return rval[0], rval[1], rval[2], None, None, None, None, None, None, None 890 | # word hidden2, word ctx, word attention alpha 891 | 892 | 893 | # word pctx 894 | pctx_ = tensor.dot(context, tparams[_p(prefix, 'Wc_att')]) + \ 895 | tparams[_p(prefix, 'b_att')] 896 | 897 | # chunk pctx 898 | chunk_pctx_ = tensor.dot(context, tparams[_p(prefix, 'Wc_att_chunk')]) + \ 899 | tparams[_p(prefix, 'b_att_chunk')] 900 | 901 | 902 | 903 | # projected x 904 | 
state_belowx = tensor.dot(emb, tparams[_p(prefix, 'Wx')]) + \ 905 | tparams[_p(prefix, 'bx')] 906 | state_below_ = tensor.dot(emb, tparams[_p(prefix, 'W')]) + \ 907 | tparams[_p(prefix, 'b')] 908 | 909 | # the sequence is 910 | # @mask the word mask for batch training 911 | # @chunk_boundary_indicator 1: this is a begin of a chunk, 0: this is a inter part of a chunk 912 | # @state_below_ W*y_emb 913 | # @state_belowx W*y_emb, for different usage 914 | seqs = [mask, chunk_boundary_indicator, state_below_, state_belowx] 915 | 916 | 917 | 918 | # outputs_info of the training scan process 919 | init_chunk_ctx = tensor.alloc(0., n_samples, context.shape[2]) 920 | init_chunk_alpha = tensor.alloc(0., n_samples, context.shape[0]) 921 | h1_last_chunk_end_word = tensor.alloc(0., n_samples, dim) # set last chunk hidden 0 922 | position_h1 = tensor.alloc(0., n_samples, dim) 923 | last_chunk_emb = tensor.alloc(0., n_samples, options['dim_chunk']) 924 | 925 | init_word_ctx = tensor.alloc(0., n_samples, context.shape[2]) 926 | init_word_alpha = tensor.alloc(0., n_samples, context.shape[0]) 927 | 928 | 929 | # chunk_hidden, current_position_hypo_chunk_hidden, chunk_ctx, chunk_alpha, last_chunk_emb, \ 930 | # word_hidden1, word_hidden2, h1_last_chunk_end_word, word_ctx_, word_alpha 931 | 932 | outputs = [init_state_chunk, 933 | init_state_chunk, # only for output 934 | init_chunk_ctx, 935 | init_chunk_alpha, 936 | last_chunk_emb, 937 | init_state_chunk_words, 938 | position_h1, # current position computed word hidden1 939 | h1_last_chunk_end_word, 940 | init_word_ctx, 941 | init_word_alpha] 942 | 943 | 944 | rval, updates = theano.scan(_step, 945 | sequences=seqs, 946 | outputs_info=outputs, 947 | # here pctx is the tranformation of the source context 948 | non_sequences=[chunk_pctx_, pctx_, context]+chunk_shared_vars+word_shared_vars, 949 | name=_p(prefix, '_layers'), 950 | #n_steps=n_chunk_step, 951 | n_steps=nsteps, 952 | profile=profile, 953 | strict=True) 954 | 955 | return rval 956 | # chunk_hidden, chunk_ctx, chunk_alpha, word_hidden2, h1_last_chunk_end_word, word_ctx_, word_alpha 957 | 958 | 959 | 960 | 961 | # initialize all parameters 962 | def init_params(options): 963 | params = OrderedDict() 964 | 965 | # embedding 966 | params['Wemb'] = norm_weight(options['n_words_src'], options['dim_word']) 967 | 968 | params['Wemb_dec'] = norm_weight(options['n_words'], options['dim_word']) 969 | 970 | # encoder: bidirectional RNN 971 | params = get_layer(options['encoder'])[0](options, params, 972 | prefix='encoder', 973 | nin=options['dim_word'], 974 | dim=options['dim']) 975 | params = get_layer(options['encoder'])[0](options, params, 976 | prefix='encoder_r', 977 | nin=options['dim_word'], 978 | dim=options['dim']) 979 | 980 | 981 | ctxdim = 2 * options['dim'] 982 | 983 | 984 | # 985 | # generate the initial hidden representation for word and chunk 986 | # 987 | 988 | # init_state, init_cell 989 | params = get_layer('ff')[0](options, params, prefix='ff_state_chunk', 990 | nin=ctxdim, nout=options['dim_chunk_hidden']) 991 | 992 | 993 | # init_state, init_cell 994 | params = get_layer('ff')[0](options, params, prefix='ff_state_chunk_words', 995 | nin=ctxdim, nout=options['dim']) 996 | 997 | 998 | 999 | # decoder 1000 | params = get_layer(options['decoder'])[0](options, params, 1001 | prefix='decoder', 1002 | nin=options['dim_word'], 1003 | dim=options['dim'], 1004 | dimctx=ctxdim, 1005 | nin_chunk=options['dim_chunk'], 1006 | dim_chunk_hidden=options['dim_chunk_hidden']) 1007 | 1008 | 1009 | # readout 
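# (Added note, not part of the original file) These feed-forward blocks form
# the word-level readout used in build_model below: the word distribution is
# softmax(ff_logit(tanh(ff_logit_lstm(word_hidden2) + ff_logit_prev(previous
# word embedding) + ff_logit_ctx(word attention context) +
# ff_logit_using_chunk_hidden(chunk hidden)))), with the chunk-hidden term
# currently zeroed out by a mask in build_model. The *_chunk blocks defined
# next build the analogous readout over the n_chunks chunk tags.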
1010 | params = get_layer('ff')[0](options, params, prefix='ff_logit_lstm', 1011 | nin=options['dim'], nout=options['dim_word'], 1012 | ortho=False) 1013 | params = get_layer('ff')[0](options, params, prefix='ff_logit_prev', 1014 | nin=options['dim_word'], 1015 | nout=options['dim_word'], ortho=False) 1016 | params = get_layer('ff')[0](options, params, prefix='ff_logit_ctx', 1017 | nin=ctxdim, nout=options['dim_word'], 1018 | ortho=False) 1019 | params = get_layer('ff')[0](options, params, prefix='ff_logit_using_chunk_hidden', 1020 | nin=options['dim_chunk_hidden'], nout=options['dim_word'], 1021 | ortho=False) 1022 | # params = get_layer('ff')[0](options, params, prefix='ff_logit_chunk_hidden', 1023 | # nin=ctxdim, nout=options['dim_word'], 1024 | # ortho=False) 1025 | params = get_layer('ff')[0](options, params, prefix='ff_logit', 1026 | nin=options['dim_word'], 1027 | nout=options['n_words']) 1028 | 1029 | # readout 1030 | 1031 | params = get_layer('ff')[0](options, params, prefix='ff_logit_lstm_chunk', 1032 | nin=options['dim_chunk_hidden'], nout=options['dim_chunk'], 1033 | ortho=False) 1034 | 1035 | # we should note here, we use word dim 1036 | params = get_layer('ff')[0](options, params, prefix='ff_logit_prev_chunk', 1037 | nin=options['dim_chunk'], 1038 | nout=options['dim_chunk'], ortho=False) 1039 | params = get_layer('ff')[0](options, params, prefix='ff_logit_ctx_chunk', 1040 | nin=ctxdim, nout=options['dim_chunk'], 1041 | ortho=False) 1042 | 1043 | params = get_layer('ff')[0](options, params, prefix='logit_ctx_last_word', 1044 | nin=options['dim_word'], 1045 | nout=options['dim_chunk'], 1046 | ortho=False) 1047 | 1048 | params = get_layer('ff')[0](options, params, prefix='logit_ctx_current_word_hidden1', 1049 | nin=options['dim'], 1050 | nout=options['dim_chunk'], 1051 | ortho=False) 1052 | 1053 | 1054 | params = get_layer('ff')[0](options, params, prefix='ff_logit_chunk', 1055 | nin=options['dim_chunk'], 1056 | nout=options['dim_chunk'], 1057 | ortho=False) 1058 | 1059 | 1060 | return params 1061 | 1062 | 1063 | # build a training model 1064 | def build_model(tparams, options): 1065 | opt_ret = dict() 1066 | 1067 | trng = RandomStreams(1234) 1068 | use_noise = theano.shared(numpy.float32(0.)) 1069 | 1070 | # description string: #words x #samples 1071 | x = tensor.matrix('x', dtype='int64') 1072 | x_mask = tensor.matrix('x_mask', dtype='float32') 1073 | 1074 | y_chunk = tensor.matrix('y_chunk', dtype='int64') 1075 | y_chunk_words = tensor.matrix('y_chunk_words', dtype='int64') 1076 | y_mask = tensor.matrix('y_mask', dtype='float32') 1077 | chunk_indicator = tensor.matrix('chunk_indicator', dtype='float32') 1078 | 1079 | # for the backward rnn, we just need to invert x and x_mask 1080 | xr = x[::-1] 1081 | xr_mask = x_mask[::-1] 1082 | 1083 | n_timesteps = x.shape[0] 1084 | n_timesteps_y = y_chunk.shape[0] 1085 | n_samples = x.shape[1] 1086 | 1087 | # word embedding for forward rnn (source) 1088 | emb = tparams['Wemb'][x.flatten()] 1089 | emb = emb.reshape([n_timesteps, n_samples, options['dim_word']]) 1090 | proj = get_layer(options['encoder'])[1](tparams, emb, options, 1091 | prefix='encoder', 1092 | mask=x_mask) 1093 | # word embedding for backward rnn (source) 1094 | embr = tparams['Wemb'][xr.flatten()] 1095 | embr = embr.reshape([n_timesteps, n_samples, options['dim_word']]) 1096 | projr = get_layer(options['encoder'])[1](tparams, embr, options, 1097 | prefix='encoder_r', 1098 | mask=xr_mask) 1099 | 1100 | # context will be the concatenation of forward and backward rnns 
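# (Added note, not part of the original file) proj[0] and projr[0] each have
# shape (n_timesteps, n_samples, dim); projr[0] is reversed in time so the
# backward states line up with the forward ones, and concatenating along the
# last axis gives ctx of shape (n_timesteps, n_samples, 2 * dim). Its masked
# time-average ctx_mean then initializes both decoder states below.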
1101 | ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim-1) 1102 | 1103 | # mean of the context (across time) will be used to initialize decoder rnn 1104 | ctx_mean = (ctx * x_mask[:, :, None]).sum(0) / x_mask.sum(0)[:, None] 1105 | 1106 | # or you can use the last state of forward + backward encoder rnns 1107 | # ctx_mean = concatenate([proj[0][-1], projr[0][-1]], axis=proj[0].ndim-2) 1108 | 1109 | # initial decoder state for both 1110 | init_state_chunk = get_layer('ff')[1](tparams, ctx_mean, options, 1111 | prefix='ff_state_chunk', activ='tanh') 1112 | init_state_chunk_words = get_layer('ff')[1](tparams, ctx_mean, options, 1113 | prefix='ff_state_chunk_words', activ='tanh') 1114 | 1115 | # word embedding (target), we will shift the target sequence one time step 1116 | # to the right. This is done because of the bi-gram connections in the 1117 | # readout and decoder rnn. The first target will be all zeros and we will 1118 | # not condition on the last output. 1119 | 1120 | 1121 | # shift the word embeddings in the chunk 1122 | emb = tparams['Wemb_dec'][y_chunk_words.flatten()] 1123 | emb = emb.reshape([n_timesteps_y, n_samples, options['dim_word']]) 1124 | 1125 | emb_shifted = tensor.zeros_like(emb) 1126 | emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1]) 1127 | emb = emb_shifted 1128 | 1129 | y_chunk_shift = tensor.zeros_like(y_chunk) 1130 | y_chunk_shift = tensor.set_subtensor(y_chunk_shift[1:], y_chunk[:-1]) 1131 | 1132 | # 1133 | # decoder 1134 | chunk_hidden, \ 1135 | current_position_hypo_chunk_hidden, \ 1136 | chunk_ctx, \ 1137 | chunk_alpha, \ 1138 | last_chunk_emb, \ 1139 | word_hidden2, \ 1140 | word_hidden1, \ 1141 | h1_last_chunk_end_word, \ 1142 | word_ctx_, \ 1143 | word_alpha = get_layer(options['decoder'])[1](tparams, emb, y_chunk_shift, 1144 | options, 1145 | prefix='decoder', 1146 | mask=y_mask, 1147 | chunk_boundary_indicator=chunk_indicator, 1148 | context=ctx, 1149 | context_mask=x_mask, 1150 | init_state_chunk=init_state_chunk, 1151 | init_state_chunk_words=init_state_chunk_words) 1152 | # 1153 | opt_ret['dec_alphas_chunk'] = chunk_alpha 1154 | 1155 | 1156 | logit_lstm_chunk = get_layer('ff')[1](tparams, current_position_hypo_chunk_hidden, options, 1157 | prefix='ff_logit_lstm_chunk', activ='linear') 1158 | logit_prev_chunk = get_layer('ff')[1](tparams, last_chunk_emb, options, 1159 | prefix='ff_logit_prev_chunk', activ='linear') 1160 | logit_ctx_chunk = get_layer('ff')[1](tparams, chunk_ctx, options, 1161 | prefix='ff_logit_ctx_chunk', activ='linear') 1162 | 1163 | logit_ctx_last_word = get_layer('ff')[1](tparams, emb, options, 1164 | prefix='logit_ctx_last_word', activ='linear') 1165 | logit_ctx_current_word_hidden1 = get_layer('ff')[1](tparams, word_hidden1, options, 1166 | prefix='logit_ctx_current_word_hidden1', activ='linear') 1167 | 1168 | 1169 | 1170 | logit_chunk = tensor.tanh(logit_lstm_chunk+logit_prev_chunk+logit_ctx_chunk+logit_ctx_last_word+logit_ctx_current_word_hidden1) 1171 | 1172 | if options['use_dropout']: 1173 | logit_chunk = dropout_layer(logit_chunk, use_noise, trng) 1174 | logit_chunk = get_layer('ff')[1](tparams, logit_chunk, options, 1175 | prefix='ff_logit_chunk', activ='linear') 1176 | logit_shp_chunk = logit_chunk.shape 1177 | probs_chunk = tensor.nnet.softmax(logit_chunk.reshape([logit_shp_chunk[0]*logit_shp_chunk[1], 1178 | logit_shp_chunk[2]])) 1179 | 1180 | # cost 1181 | y_flat_chunk = y_chunk.flatten() 1182 | y_flat_idx_chunk = tensor.arange(y_flat_chunk.shape[0]) * options['n_chunks'] + y_flat_chunk 1183 | 
cost = -tensor.log(probs_chunk.flatten()[y_flat_idx_chunk]) 1184 | cost = cost.reshape([y_chunk.shape[0], y_chunk.shape[1]]) 1185 | 1186 | 1187 | m = tensor.alloc(0., y_mask.shape[0], y_mask.shape[1]) 1188 | cost = m * cost 1189 | 1190 | 1191 | # weights (alignment matrix) 1192 | opt_ret['dec_alphas_cw'] = word_ctx_ 1193 | 1194 | # compute word probabilities 1195 | logit_lstm_cw = get_layer('ff')[1](tparams, word_hidden2, options, 1196 | prefix='ff_logit_lstm', activ='linear') 1197 | logit_prev_cw = get_layer('ff')[1](tparams, emb, options, 1198 | prefix='ff_logit_prev', activ='linear') 1199 | logit_ctx_cw = get_layer('ff')[1](tparams, word_ctx_, options, 1200 | prefix='ff_logit_ctx', activ='linear') 1201 | 1202 | 1203 | logit_ctx_using_current_chunk_hidden = get_layer('ff')[1](tparams, chunk_hidden, options, 1204 | prefix='ff_logit_using_chunk_hidden', activ='linear') 1205 | 1206 | m = tensor.alloc(0., logit_ctx_using_current_chunk_hidden.shape[0], logit_ctx_using_current_chunk_hidden.shape[1], logit_ctx_using_current_chunk_hidden.shape[2]) 1207 | 1208 | logit_ctx_using_current_chunk_hidden = m * logit_ctx_using_current_chunk_hidden 1209 | 1210 | 1211 | logit_cw = tensor.tanh(logit_lstm_cw+logit_prev_cw+logit_ctx_cw+logit_ctx_using_current_chunk_hidden) 1212 | # logit_cw = tensor.tanh(logit_lstm_cw+logit_prev_cw+logit_ctx_cw) 1213 | 1214 | if options['use_dropout']: 1215 | logit_cw = dropout_layer(logit_cw, use_noise, trng) 1216 | logit_cw = get_layer('ff')[1](tparams, logit_cw, options, 1217 | prefix='ff_logit', activ='linear') 1218 | logit_shp_cw = logit_cw.shape 1219 | probs_cw = tensor.nnet.softmax(logit_cw.reshape([logit_shp_cw[0]*logit_shp_cw[1], 1220 | logit_shp_cw[2]])) 1221 | 1222 | # cost 1223 | y_flat_cw = y_chunk_words.flatten() 1224 | y_flat_idx_cw = tensor.arange(y_flat_cw.shape[0]) * options['n_words'] + y_flat_cw 1225 | 1226 | cost_cw = -tensor.log(probs_cw.flatten()[y_flat_idx_cw]) 1227 | cost_cw = cost_cw.reshape([y_chunk_words.shape[0], y_chunk_words.shape[1]]) 1228 | 1229 | 1230 | cost = cost + cost_cw 1231 | # cost = cost_cw 1232 | cost = (cost * y_mask).sum(0) 1233 | 1234 | return trng, use_noise, x, x_mask, y_chunk, y_mask, y_chunk_words, chunk_indicator,\ 1235 | opt_ret, cost, cost_cw 1236 | 1237 | # build a sampler 1238 | def build_sampler(tparams, options, trng, use_noise): 1239 | 1240 | 1241 | x = tensor.matrix('x', dtype='int64') 1242 | 1243 | xr = x[::-1] 1244 | 1245 | n_timesteps = x.shape[0] 1246 | n_samples = x.shape[1] 1247 | 1248 | # word embedding (source), forward and backward 1249 | emb = tparams['Wemb'][x.flatten()] 1250 | emb = emb.reshape([n_timesteps, n_samples, options['dim_word']]) 1251 | embr = tparams['Wemb'][xr.flatten()] 1252 | embr = embr.reshape([n_timesteps, n_samples, options['dim_word']]) 1253 | 1254 | # encoder 1255 | proj = get_layer(options['encoder'])[1](tparams, emb, options, 1256 | prefix='encoder') 1257 | projr = get_layer(options['encoder'])[1](tparams, embr, options, 1258 | prefix='encoder_r') 1259 | 1260 | # concatenate forward and backward rnn hidden states 1261 | ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim-1) 1262 | 1263 | # get the input for decoder rnn initializer mlp 1264 | ctx_mean = ctx.mean(0) 1265 | # ctx_mean = concatenate([proj[0][-1],projr[0][-1]], axis=proj[0].ndim-2) 1266 | 1267 | # initial decoder state for both 1268 | init_state_chunk = get_layer('ff')[1](tparams, ctx_mean, options, 1269 | prefix='ff_state_chunk', activ='tanh') 1270 | init_state_chunk_words = get_layer('ff')[1](tparams, 
ctx_mean, options, 1271 | prefix='ff_state_chunk_words', activ='tanh') 1272 | 1273 | 1274 | print 'Building f_init...', 1275 | outs = [init_state_chunk, init_state_chunk_words, ctx] 1276 | f_init = theano.function([x], outs, name='f_init', profile=profile) 1277 | print 'Done' 1278 | 1279 | 1280 | 1281 | 1282 | # 1283 | # build predict word hidden 1 and chunk hidden2 1284 | # 1285 | 1286 | # TODO note that here the y_chunk and y_chunk_words are both vector, because it only conduct one steps! 1287 | # y_chunk = tensor.vector('y_sample_chunk', dtype='int64') 1288 | y_chunk_words = tensor.vector('y_sample_chunk_words', dtype='int64') 1289 | 1290 | chunk_boundary = tensor.vector('chunk_boundary', dtype='float32') 1291 | 1292 | init_state_chunk = tensor.matrix('init_state_chunk', dtype='float32') 1293 | init_state_chunk_words = tensor.matrix('init_state_chunk_words', dtype='float32') 1294 | 1295 | last_chunk_end_word_hidden1 = tensor.matrix('last_chunk_end_word_hidden1', dtype='float32') 1296 | 1297 | 1298 | current_chunk_hidden = tensor.matrix('current_chunk_hidden', dtype='float32') 1299 | 1300 | # if it's the first word, emb should be all zero and it is indicated by -1 1301 | emb_chunk_word = tensor.switch(y_chunk_words[:, None] < 0, 1302 | tensor.alloc(0., 1, tparams['Wemb_dec'].shape[1]), 1303 | tparams['Wemb_dec'][y_chunk_words]) 1304 | 1305 | 1306 | 1307 | # 1308 | # decoder 1309 | # 1310 | retval_predict_chunk = get_layer(options['decoder'])[1](tparams, 1311 | emb_chunk_word, 1312 | None, 1313 | options, 1314 | prefix='decoder', 1315 | context=ctx, 1316 | one_step=True, 1317 | one_step_word=False, 1318 | one_step_chunk=True, 1319 | init_state_chunk=init_state_chunk, 1320 | init_state_chunk_words=init_state_chunk_words, 1321 | last_chunk_end_word_hidden1=last_chunk_end_word_hidden1) 1322 | word_hidden1 = retval_predict_chunk[0] 1323 | last_chunk_emb = retval_predict_chunk[1] 1324 | current_position_hypo_chunk_hidden = retval_predict_chunk[2] 1325 | chunk_ctx = retval_predict_chunk[3] 1326 | chunk_alpha = retval_predict_chunk[4] 1327 | 1328 | 1329 | # 1330 | # get the chunk prediction 1331 | # 1332 | logit_lstm_chunk = get_layer('ff')[1](tparams, current_position_hypo_chunk_hidden, options, 1333 | prefix='ff_logit_lstm_chunk', activ='linear') 1334 | logit_prev_chunk = get_layer('ff')[1](tparams, last_chunk_emb, options, 1335 | prefix='ff_logit_prev_chunk', activ='linear') 1336 | logit_ctx_chunk = get_layer('ff')[1](tparams, chunk_ctx, options, 1337 | prefix='ff_logit_ctx_chunk', activ='linear') 1338 | 1339 | logit_ctx_last_word = get_layer('ff')[1](tparams, emb_chunk_word, options, 1340 | prefix='logit_ctx_last_word', activ='linear') 1341 | logit_ctx_current_word_hidden1 = get_layer('ff')[1](tparams, word_hidden1, options, 1342 | prefix='logit_ctx_current_word_hidden1', activ='linear') 1343 | 1344 | 1345 | 1346 | logit_chunk = tensor.tanh(logit_lstm_chunk+logit_prev_chunk+logit_ctx_chunk+logit_ctx_last_word+logit_ctx_current_word_hidden1) 1347 | 1348 | if options['use_dropout']: 1349 | logit_chunk = dropout_layer(logit_chunk, use_noise, trng) 1350 | logit_chunk = get_layer('ff')[1](tparams, logit_chunk, options, 1351 | prefix='ff_logit_chunk', activ='linear') 1352 | probs_chunk = tensor.nnet.softmax(logit_chunk) 1353 | 1354 | next_sample_chunk = trng.multinomial(pvals=probs_chunk).argmax(1) 1355 | 1356 | 1357 | print 'Building f_next_chunk..' 
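# Added aside (mirrors how gen_sample below drives these functions; names are
# illustrative): for a single live hypothesis, the first decoding step looks
# like
#
#     next_state_chunk, next_state_word, ctx0 = f_init(x)
#     next_w = -1 * numpy.ones((1,)).astype('int64')  # -1 marks <bos>
#     h1_prev_chunk_end = numpy.zeros((1, options['dim'])).astype('float32')
#     probs_c, next_c, h1, hypo_chunk_h = f_next_chunk(
#         next_w, ctx0, next_state_chunk, next_state_word, h1_prev_chunk_end)
#
# where probs_c is the distribution over the n_chunks chunk tags and
# hypo_chunk_h is the chunk hidden state that would apply if a new chunk
# started at this position.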
1358 | inps = [y_chunk_words, ctx, init_state_chunk, init_state_chunk_words, last_chunk_end_word_hidden1] 1359 | outs = [probs_chunk, next_sample_chunk, word_hidden1, current_position_hypo_chunk_hidden] 1360 | f_next_chunk = theano.function(inps, outs, name='f_next_chunk', profile=profile) 1361 | print 'End Building f_next_chunk..' 1362 | 1363 | 1364 | 1365 | 1366 | 1367 | 1368 | 1369 | 1370 | # 1371 | # begin to predict the word hidden2 1372 | # 1373 | 1374 | 1375 | chunk_boundary = tensor.vector('chunk_boundary', dtype='float32') 1376 | current_chunk_hidden = tensor.matrix('current_chunk_hidden', dtype='float32') 1377 | current_position_hypo_chunk_hidden = tensor.matrix('current_position_hypo_chunk_hidden', dtype='float32') 1378 | word_hidden1 = tensor.matrix('word_hidden1', dtype='float32') 1379 | last_chunk_end_word_hidden1 = tensor.matrix('last_chunk_end_word_hidden1', dtype='float32') 1380 | 1381 | 1382 | 1383 | # given the chunk indicator, compute the word hidden2 1384 | chunk_hidden = chunk_boundary[:, None] * current_position_hypo_chunk_hidden \ 1385 | + (1. - chunk_boundary)[:, None] * current_chunk_hidden 1386 | 1387 | h1_last_chunk_end_word = chunk_boundary[:, None] * word_hidden1 \ 1388 | + (1. - chunk_boundary)[:, None] * last_chunk_end_word_hidden1 1389 | 1390 | 1391 | # 1392 | # decoder for word hidden2 1393 | # 1394 | retval_predict_chunk = get_layer(options['decoder'])[1](tparams, 1395 | None, 1396 | None, 1397 | options, 1398 | prefix='decoder', 1399 | context=ctx, 1400 | one_step=True, 1401 | one_step_word=True, 1402 | one_step_chunk=False, 1403 | current_chunk_hidden=chunk_hidden, 1404 | current_word_hidden1=word_hidden1) 1405 | 1406 | 1407 | word_hidden2 = retval_predict_chunk[0] 1408 | word_ctx = retval_predict_chunk[1] 1409 | word_alpha = retval_predict_chunk[2] 1410 | 1411 | 1412 | # compute word probabilities 1413 | logit_lstm_cw = get_layer('ff')[1](tparams, word_hidden2, options, 1414 | prefix='ff_logit_lstm', activ='linear') 1415 | logit_prev_cw = get_layer('ff')[1](tparams, emb_chunk_word, options, 1416 | prefix='ff_logit_prev', activ='linear') 1417 | logit_ctx_cw = get_layer('ff')[1](tparams, word_ctx, options, 1418 | prefix='ff_logit_ctx', activ='linear') 1419 | 1420 | 1421 | logit_ctx_using_current_chunk_hidden = get_layer('ff')[1](tparams, chunk_hidden, options, 1422 | prefix='ff_logit_using_chunk_hidden', activ='linear') 1423 | 1424 | 1425 | m = tensor.alloc(0., logit_ctx_using_current_chunk_hidden.shape[0], logit_ctx_using_current_chunk_hidden.shape[1]) 1426 | 1427 | logit_ctx_using_current_chunk_hidden = m * logit_ctx_using_current_chunk_hidden 1428 | 1429 | 1430 | 1431 | logit_cw = tensor.tanh(logit_lstm_cw+logit_prev_cw+logit_ctx_cw+logit_ctx_using_current_chunk_hidden) 1432 | 1433 | if options['use_dropout']: 1434 | logit_cw = dropout_layer(logit_cw, use_noise, trng) 1435 | logit_cw = get_layer('ff')[1](tparams, logit_cw, options, 1436 | prefix='ff_logit', activ='linear') 1437 | probs_cw = tensor.nnet.softmax(logit_cw) 1438 | next_sample_cw = trng.multinomial(pvals=probs_cw).argmax(1) 1439 | 1440 | 1441 | 1442 | 1443 | # sample from softmax distribution to get the sample 1444 | # compile a function to do the whole thing above, next word probability, 1445 | # sampled word for the next target, next hidden state to be used 1446 | print 'Building f_next_word..' 
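# Added aside (mirrors gen_sample below; names are illustrative): given probs_c
# from f_next_chunk, the caller derives the boundary indicator and then asks
# for the word distribution:
#
#     chunk_boundary = (probs_c.argmax(1) != 1).astype('float32')  # tag 1 continues the chunk
#     probs_w, next_w, next_state_word, h1_prev_chunk_end, next_state_chunk = \
#         f_next_chunk_word(next_w, ctx0, chunk_boundary, next_state_chunk,
#                           hypo_chunk_h, h1, h1_prev_chunk_end)
#
# so the returned chunk state switches to hypo_chunk_h when chunk_boundary is 1
# and keeps the previous chunk state otherwise, exactly as the gating just
# above computes.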
1447 | inps = [y_chunk_words, 1448 | ctx, 1449 | chunk_boundary, 1450 | current_chunk_hidden, 1451 | current_position_hypo_chunk_hidden, 1452 | word_hidden1, 1453 | last_chunk_end_word_hidden1] 1454 | outs = [probs_cw, next_sample_cw, word_hidden2, h1_last_chunk_end_word, chunk_hidden] 1455 | f_next_chunk_word = theano.function(inps, outs, name='f_next_chunk_word', profile=profile) 1456 | print 'Done' 1457 | 1458 | return f_init, f_next_chunk, f_next_chunk_word 1459 | 1460 | 1461 | 1462 | 1463 | # generate sample, either with stochastic sampling or beam search. Note that, 1464 | # this function iteratively calls f_init and f_next functions. 1465 | def gen_sample(tparams, f_init, f_next_chunk, f_next_word, x, 1466 | options, trng=None, k_chunk=1, k_word=1, k=5, maxlen=50, 1467 | stochastic=True, argmax=False, jointProb=True): 1468 | 1469 | # k is the beam size we have 1470 | if k > 1: 1471 | assert not stochastic, \ 1472 | 'Beam search does not support stochastic sampling' 1473 | 1474 | sample = [] 1475 | sample_score = [] 1476 | if stochastic: 1477 | sample_score = 0 1478 | 1479 | live_k = 1 1480 | dead_k = 0 1481 | 1482 | hyp_samples = [[]] * live_k 1483 | hyp_scores = numpy.zeros(live_k).astype('float32') 1484 | hyp_states = [] 1485 | hyp_chunk_states = [] 1486 | hyp_last_chunk_last_word_hidden1 = [] 1487 | 1488 | # get initial state of decoder rnn and encoder context 1489 | ret = f_init(x) 1490 | next_state_chunk, next_state_word, ctx0 = ret[0], ret[1], ret[2] 1491 | last_chunk_last_word_hidden1 = numpy.zeros((1, options['dim'])).astype('float32') 1492 | 1493 | 1494 | next_w = -1 * numpy.ones((1,)).astype('int64') # bos indicator 1495 | 1496 | # next_chunk = -1 * numpy.ones((1,)).astype('int64') # bos indicator 1497 | # 1498 | # word_hidden1 = None 1499 | 1500 | 1501 | for ii in xrange(maxlen): 1502 | ctx = numpy.tile(ctx0, [live_k, 1]) 1503 | inps = [next_w, 1504 | ctx, 1505 | next_state_chunk, 1506 | next_state_word, 1507 | last_chunk_last_word_hidden1] 1508 | ret = f_next_chunk(*inps) 1509 | next_chunk_p, next_chunk, word_hidden1, hypo_chunk_hidden = ret[0], ret[1], ret[2], ret[3] 1510 | 1511 | 1512 | # get the chunk boundrary indocator 1513 | next_chunk = next_chunk_p.argmax(1) 1514 | chunk_boundary = numpy.zeros((next_chunk.shape[0],)).astype('float32') 1515 | 1516 | for i in xrange(next_chunk.shape[0]): 1517 | if next_chunk[i] != 1: 1518 | chunk_boundary[i] = 1.0 1519 | 1520 | inps = [next_w, 1521 | ctx, 1522 | chunk_boundary, 1523 | next_state_chunk, 1524 | hypo_chunk_hidden, 1525 | word_hidden1, 1526 | last_chunk_last_word_hidden1] 1527 | 1528 | ret = f_next_word(*inps) 1529 | 1530 | next_word_p, \ 1531 | next_w, \ 1532 | next_state_word, \ 1533 | last_chunk_last_word_hidden1, \ 1534 | next_state_chunk \ 1535 | = ret[0], ret[1], ret[2], ret[3], ret[4] 1536 | 1537 | 1538 | if jointProb: 1539 | indicator_score = next_chunk_p.max(1) 1540 | indicator_score = indicator_score.reshape(indicator_score.shape[0], 1) 1541 | next_word_p = indicator_score * next_word_p 1542 | 1543 | 1544 | if stochastic: 1545 | if argmax: 1546 | nw = next_word_p[0].argmax() 1547 | else: 1548 | nw = next_w[0] 1549 | sample.append(nw) 1550 | sample_score -= numpy.log(next_word_p[0, nw]) 1551 | if nw == 0: 1552 | break 1553 | else: 1554 | cand_scores = hyp_scores[:, None] - numpy.log(next_word_p) 1555 | cand_flat = cand_scores.flatten() 1556 | ranks_flat = cand_flat.argsort()[:(k-dead_k)] 1557 | 1558 | voc_size = next_word_p.shape[1] 1559 | trans_indices = ranks_flat / voc_size 1560 | word_indices = ranks_flat 
% voc_size 1561 | costs = cand_flat[ranks_flat] 1562 | 1563 | new_hyp_samples = [] 1564 | new_hyp_scores = numpy.zeros(k-dead_k).astype('float32') 1565 | new_hyp_states = [] 1566 | new_hyp_chunk_states = [] 1567 | new_hyp_last_chunk_last_word_hidden1 = [] 1568 | 1569 | for idx, [ti, wi] in enumerate(zip(trans_indices, word_indices)): 1570 | new_hyp_samples.append(hyp_samples[ti]+[wi]) 1571 | new_hyp_scores[idx] = copy.copy(costs[idx]) 1572 | new_hyp_states.append(copy.copy(next_state_word[ti])) 1573 | new_hyp_chunk_states.append(copy.copy(next_state_chunk[ti])) 1574 | new_hyp_last_chunk_last_word_hidden1.append(copy.copy(last_chunk_last_word_hidden1[ti])) 1575 | 1576 | # check the finished samples 1577 | new_live_k = 0 1578 | hyp_samples = [] 1579 | hyp_scores = [] 1580 | hyp_states = [] 1581 | hyp_chunk_states = [] 1582 | hyp_last_chunk_last_word_hidden1 = [] 1583 | 1584 | for idx in xrange(len(new_hyp_samples)): 1585 | if new_hyp_samples[idx][-1] == 0: 1586 | sample.append(new_hyp_samples[idx]) 1587 | sample_score.append(new_hyp_scores[idx]) 1588 | dead_k += 1 1589 | else: 1590 | new_live_k += 1 1591 | hyp_samples.append(new_hyp_samples[idx]) 1592 | hyp_scores.append(new_hyp_scores[idx]) 1593 | hyp_states.append(new_hyp_states[idx]) 1594 | hyp_chunk_states.append(new_hyp_chunk_states[idx]) 1595 | hyp_last_chunk_last_word_hidden1.append(new_hyp_last_chunk_last_word_hidden1[idx]) 1596 | 1597 | hyp_scores = numpy.array(hyp_scores) 1598 | live_k = new_live_k 1599 | 1600 | if new_live_k < 1: 1601 | break 1602 | if dead_k >= k: 1603 | break 1604 | 1605 | next_w = numpy.array([w[-1] for w in hyp_samples]) 1606 | next_state_word = numpy.array(hyp_states) 1607 | next_state_chunk = numpy.array(hyp_chunk_states) 1608 | last_chunk_last_word_hidden1 = numpy.array(hyp_last_chunk_last_word_hidden1) 1609 | 1610 | if not stochastic: 1611 | # dump every remaining one 1612 | if live_k > 0: 1613 | for idx in xrange(live_k): 1614 | sample.append(hyp_samples[idx]) 1615 | sample_score.append(hyp_scores[idx]) 1616 | 1617 | return sample, sample_score 1618 | 1619 | 1620 | # calculate the log probablities on a given corpus using translation model 1621 | def pred_probs(f_log_probs, prepare_data, options, iterator, verbose=True): 1622 | probs = [] 1623 | 1624 | n_done = 0 1625 | 1626 | for x, y_chunk, y_cw in iterator: 1627 | n_done += len(x) 1628 | 1629 | x, x_mask, y_c, y_cw, chunk_indicator, y_mask = prepare_data(x, y_chunk, y_cw, 1630 | n_words_src=options['n_words_src'], 1631 | n_words=options['n_words']) 1632 | 1633 | pprobs = f_log_probs(x, x_mask, y_c, y_mask, y_cw, chunk_indicator) 1634 | for pp in pprobs: 1635 | probs.append(pp) 1636 | 1637 | if numpy.isnan(numpy.mean(probs)): 1638 | ipdb.set_trace() 1639 | 1640 | if verbose: 1641 | print >>sys.stderr, '%d samples computed' % (n_done) 1642 | 1643 | return numpy.array(probs) 1644 | 1645 | 1646 | # optimizers 1647 | # name(hyperp, tparams, grads, inputs (list), cost) = f_grad_shared, f_update 1648 | def adam(lr, tparams, grads, inp, cost, beta1=0.9, beta2=0.999, e=1e-8): 1649 | 1650 | gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k) 1651 | for k, p in tparams.iteritems()] 1652 | gsup = [(gs, g) for gs, g in zip(gshared, grads)] 1653 | 1654 | f_grad_shared = theano.function(inp, cost, updates=gsup, profile=profile) 1655 | 1656 | updates = [] 1657 | 1658 | t_prev = theano.shared(numpy.float32(0.)) 1659 | t = t_prev + 1. 1660 | lr_t = lr * tensor.sqrt(1. - beta2**t) / (1. 
- beta1**t) 1661 | 1662 | for p, g in zip(tparams.values(), gshared): 1663 | m = theano.shared(p.get_value() * 0., p.name + '_mean') 1664 | v = theano.shared(p.get_value() * 0., p.name + '_variance') 1665 | m_t = beta1 * m + (1. - beta1) * g 1666 | v_t = beta2 * v + (1. - beta2) * g**2 1667 | step = lr_t * m_t / (tensor.sqrt(v_t) + e) 1668 | p_t = p - step 1669 | updates.append((m, m_t)) 1670 | updates.append((v, v_t)) 1671 | updates.append((p, p_t)) 1672 | updates.append((t_prev, t)) 1673 | 1674 | f_update = theano.function([lr], [], updates=updates, 1675 | on_unused_input='ignore', profile=profile) 1676 | 1677 | return f_grad_shared, f_update 1678 | 1679 | 1680 | def adadelta(lr, tparams, grads, inp, cost): 1681 | zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), 1682 | name='%s_grad' % k) 1683 | for k, p in tparams.iteritems()] 1684 | running_up2 = [theano.shared(p.get_value() * numpy.float32(0.), 1685 | name='%s_rup2' % k) 1686 | for k, p in tparams.iteritems()] 1687 | running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.), 1688 | name='%s_rgrad2' % k) 1689 | for k, p in tparams.iteritems()] 1690 | 1691 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 1692 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 1693 | for rg2, g in zip(running_grads2, grads)] 1694 | 1695 | f_grad_shared = theano.function(inp, cost, updates=zgup+rg2up, 1696 | profile=profile) 1697 | 1698 | updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg 1699 | for zg, ru2, rg2 in zip(zipped_grads, running_up2, 1700 | running_grads2)] 1701 | ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2)) 1702 | for ru2, ud in zip(running_up2, updir)] 1703 | param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)] 1704 | 1705 | f_update = theano.function([lr], [], updates=ru2up+param_up, 1706 | on_unused_input='ignore', profile=profile) 1707 | 1708 | return f_grad_shared, f_update 1709 | 1710 | 1711 | def rmsprop(lr, tparams, grads, inp, cost): 1712 | zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), 1713 | name='%s_grad' % k) 1714 | for k, p in tparams.iteritems()] 1715 | running_grads = [theano.shared(p.get_value() * numpy.float32(0.), 1716 | name='%s_rgrad' % k) 1717 | for k, p in tparams.iteritems()] 1718 | running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.), 1719 | name='%s_rgrad2' % k) 1720 | for k, p in tparams.iteritems()] 1721 | 1722 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 1723 | rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)] 1724 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 1725 | for rg2, g in zip(running_grads2, grads)] 1726 | 1727 | f_grad_shared = theano.function(inp, cost, updates=zgup+rgup+rg2up, 1728 | profile=profile) 1729 | 1730 | updir = [theano.shared(p.get_value() * numpy.float32(0.), 1731 | name='%s_updir' % k) 1732 | for k, p in tparams.iteritems()] 1733 | updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4)) 1734 | for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads, 1735 | running_grads2)] 1736 | param_up = [(p, p + udn[1]) 1737 | for p, udn in zip(itemlist(tparams), updir_new)] 1738 | f_update = theano.function([lr], [], updates=updir_new+param_up, 1739 | on_unused_input='ignore', profile=profile) 1740 | 1741 | return f_grad_shared, f_update 1742 | 1743 | 1744 | def train(dim_word=100, # word vector dimensionality 1745 | dim_chunk=50, 1746 | dim=1000, # the number of LSTM units 1747 | dim_chunk_hidden=2000, 1748 | encoder='gru', 1749 | decoder='gru_cond', 
1750 | patience=10, # early stopping patience 1751 | max_epochs=5000, 1752 | finish_after=10000000, # finish after this many updates 1753 | dispFreq=100, 1754 | decay_c=0., # L2 regularization penalty 1755 | alpha_c=0., # alignment regularization 1756 | clip_c=-1., # gradient clipping threshold 1757 | lrate=0.01, # learning rate 1758 | n_words_src=100000, # source vocabulary size 1759 | n_words=100000, # target vocabulary size 1760 | n_chunks=1000, # target vocabulary size 1761 | maxlen_chunk=10, # maximum length of the description 1762 | maxlen_chunk_words=50, # maximum length of the description 1763 | optimizer='rmsprop', 1764 | batch_size=16, 1765 | valid_batch_size=16, 1766 | saveto='model.npz', 1767 | validFreq=1000, 1768 | saveFreq=1000, # save the parameters after every saveFreq updates 1769 | sampleFreq=100, # generate some samples after every sampleFreq 1770 | datasets=[ 1771 | '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok', 1772 | '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok'], 1773 | valid_datasets=['../data/dev/newstest2011.en.tok', 1774 | '../data/dev/newstest2011.fr.tok'], 1775 | dictionaries=[ 1776 | '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl', 1777 | '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl'], 1778 | dictionary_chunk='/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl', 1779 | use_dropout=False, 1780 | reload_=False, 1781 | overwrite=False): 1782 | 1783 | # Model options 1784 | model_options = locals().copy() 1785 | 1786 | # load the dictionaries of both source and target 1787 | # load dictionaries and invert them 1788 | worddicts = [None] * len(dictionaries) 1789 | worddicts_r = [None] * len(dictionaries) 1790 | for ii, dd in enumerate(dictionaries): 1791 | with open(dd, 'rb') as f: 1792 | worddicts[ii] = pkl.load(f) 1793 | worddicts_r[ii] = dict() 1794 | for kk, vv in worddicts[ii].iteritems(): 1795 | worddicts_r[ii][vv] = kk 1796 | 1797 | # dict for chunk label 1798 | worddict_chunk = [None] 1799 | worddict_r_chunk = [None] 1800 | with open(dictionary_chunk, 'rb') as f: 1801 | worddict_chunk = pkl.load(f) 1802 | worddict_r_chunk = dict() 1803 | for kk, vv in worddict_chunk.iteritems(): 1804 | worddict_r_chunk[vv] = kk 1805 | model_options['n_chunks'] = len(worddict_chunk) 1806 | print 'chunk_dict size: ', model_options['n_chunks'] 1807 | print worddict_chunk 1808 | 1809 | # reload options 1810 | if reload_ and os.path.exists(saveto) and os.path.exists(saveto + '.pkl'): 1811 | print 'Reloading model options' 1812 | with open('%s.pkl' % saveto, 'rb') as f: 1813 | model_options = pkl.load(f) 1814 | 1815 | print 'Loading data' 1816 | 1817 | # begin to read by iterators 1818 | train = TrainingTextIterator(datasets[0], datasets[1], 1819 | dictionaries[0], dictionaries[1], dictionary_chunk, 1820 | n_words_source=n_words_src, n_words_target=n_words, 1821 | batch_size=batch_size, 1822 | max_chunk_len=maxlen_chunk, max_word_len=maxlen_chunk_words) 1823 | valid = TrainingTextIterator(valid_datasets[0], valid_datasets[1], 1824 | dictionaries[0], dictionaries[1], dictionary_chunk, 1825 | n_words_source=n_words_src, n_words_target=n_words, 1826 | batch_size=valid_batch_size, 1827 | max_chunk_len=maxlen_chunk, max_word_len=maxlen_chunk_words) 1828 | 1829 | print 'Building model' 1830 | 1831 | 1832 | # init all the parameters for model 1833 | params = init_params(model_options) 1834 | 1835 | 1836 | # reload parameters 1837 | if reload_ and os.path.exists(saveto): 1838 | print 'Reloading model parameters' 1839 | 
params = load_params(saveto, params) 1840 | 1841 | 1842 | tparams = init_tparams(params) 1843 | # modify the module of build model! 1844 | # especially the inputs and outputs 1845 | trng, use_noise, \ 1846 | x, x_mask, y_chunk, y_mask, y_cw, y_chunk_indicator, \ 1847 | opt_ret, \ 1848 | cost, cost_cw= \ 1849 | build_model(tparams, model_options) 1850 | 1851 | inps = [x, x_mask, y_chunk, y_mask, y_cw, y_chunk_indicator] 1852 | 1853 | print 'Building sampler' 1854 | f_init, f_next_chunk, f_next_word = build_sampler(tparams, model_options, trng, use_noise) 1855 | 1856 | # before any regularizer 1857 | print 'Building f_log_probs...', 1858 | f_log_probs = theano.function(inps, cost, profile=profile) 1859 | print 'Done' 1860 | 1861 | cost = cost.mean() 1862 | 1863 | # apply L2 regularization on weights 1864 | if decay_c > 0.: 1865 | decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') 1866 | weight_decay = 0. 1867 | for kk, vv in tparams.iteritems(): 1868 | weight_decay += (vv ** 2).sum() 1869 | weight_decay *= decay_c 1870 | cost += weight_decay 1871 | 1872 | # regularize the alpha weights 1873 | if alpha_c > 0. and not model_options['decoder'].endswith('simple'): 1874 | alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c') 1875 | alpha_reg = alpha_c * ( 1876 | (tensor.cast(y_mask.sum(0)//x_mask.sum(0), 'float32')[:, None] - 1877 | opt_ret['dec_alphas_chunk'].sum(0))**2).sum(1).mean() 1878 | alpha_reg += alpha_c * ( 1879 | (tensor.cast(y_mask.sum(0).sum(0)//x_mask.sum(0), 'float32')[:, None] - 1880 | opt_ret['dec_alphas_cw'].sum(0).sum(0))**2).sum(1).mean() 1881 | cost += alpha_reg 1882 | 1883 | # after all regularizers - compile the computational graph for cost 1884 | print 'Building f_cost...', 1885 | f_cost = theano.function(inps, cost, profile=profile) 1886 | print 'Done' 1887 | 1888 | print 'Computing gradient...', 1889 | grads = tensor.grad(cost, wrt=itemlist(tparams)) 1890 | print 'Done' 1891 | 1892 | # apply gradient clipping here 1893 | if clip_c > 0.: 1894 | g2 = 0. 1895 | for g in grads: 1896 | g2 += (g**2).sum() 1897 | new_grads = [] 1898 | for g in grads: 1899 | new_grads.append(tensor.switch(g2 > (clip_c**2), 1900 | g / tensor.sqrt(g2) * clip_c, 1901 | g)) 1902 | grads = new_grads 1903 | 1904 | # compile the optimizer, the actual computational graph is compiled here 1905 | lr = tensor.scalar(name='lr') 1906 | print 'Building optimizers...', 1907 | f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost) 1908 | print 'Done' 1909 | 1910 | print 'Optimization' 1911 | 1912 | best_p = None 1913 | bad_counter = 0 1914 | uidx = 0 1915 | estop = False 1916 | history_errs = [] 1917 | # reload history 1918 | if reload_ and os.path.exists(saveto): 1919 | rmodel = numpy.load(saveto) 1920 | history_errs = list(rmodel['history_errs']) 1921 | if 'uidx' in rmodel: 1922 | uidx = rmodel['uidx'] 1923 | 1924 | if validFreq == -1: 1925 | validFreq = len(train[0])/batch_size 1926 | if saveFreq == -1: 1927 | saveFreq = len(train[0])/batch_size 1928 | if sampleFreq == -1: 1929 | sampleFreq = len(train[0])/batch_size 1930 | 1931 | # print 'train length', len(train) 1932 | 1933 | for eidx in xrange(max_epochs): 1934 | n_samples = 0 1935 | 1936 | for x, y_chunk, y_cw in train: 1937 | n_samples += len(x) 1938 | uidx += 1 1939 | use_noise.set_value(1.) 
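            # Added note (hedged, not part of the original source): each minibatch is
            # padded below into x / x_mask (source word ids and mask), y_c (chunk-tag
            # sequence), y_cw (target words grouped into chunks), chunk_indicator
            # (0/1 chunk-boundary markers) and y_mask, the same six arrays exercised by
            # the small driver in codetest.py. A returned x of None signals a minibatch
            # that was entirely filtered out by the chunk / word length limits, in which
            # case the update counter is rolled back.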
1940 | 1941 | x, x_mask, y_c, y_cw, chunk_indicator, y_mask = prepare_training_data(x, y_chunk, y_cw, maxlen_chunk=maxlen_chunk, maxlen_cw=maxlen_chunk_words, 1942 | n_words_src=n_words_src, 1943 | n_words=n_words) 1944 | 1945 | if x is None: 1946 | print 'Minibatch with zero sample under chunk length ', maxlen_chunk, 'word length: ', maxlen_chunk_words 1947 | uidx -= 1 1948 | continue 1949 | 1950 | ud_start = time.time() 1951 | 1952 | 1953 | 1954 | # compute cost, grads and copy grads to shared variables 1955 | cost = f_grad_shared(x, x_mask, y_c, y_mask, y_cw, chunk_indicator) 1956 | 1957 | # print 'Epoch ', eidx, 'processed one batch' 1958 | 1959 | # do the update on parameters 1960 | f_update(lrate) 1961 | 1962 | ud = time.time() - ud_start 1963 | 1964 | # check for bad numbers, usually we remove non-finite elements 1965 | # and continue training - but not done here 1966 | if numpy.isnan(cost) or numpy.isinf(cost): 1967 | print 'NaN detected' 1968 | return 1., 1., 1. 1969 | 1970 | # verbose 1971 | if numpy.mod(uidx, dispFreq) == 0: 1972 | print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud 1973 | 1974 | # save the best model so far, in addition, save the latest model 1975 | # into a separate file with the iteration number for external eval 1976 | if numpy.mod(uidx, saveFreq) == 0: 1977 | print 'Saving the best model...', 1978 | if best_p is not None: 1979 | params = best_p 1980 | else: 1981 | params = unzip(tparams) 1982 | numpy.savez(saveto, history_errs=history_errs, uidx=uidx, **params) 1983 | pkl.dump(model_options, open('%s.pkl' % saveto, 'wb')) 1984 | print 'Done' 1985 | 1986 | # save with uidx 1987 | if not overwrite: 1988 | print 'Saving the model at iteration {}...'.format(uidx), 1989 | saveto_uidx = '{}.iter{}.npz'.format( 1990 | os.path.splitext(saveto)[0], uidx) 1991 | numpy.savez(saveto_uidx, history_errs=history_errs, 1992 | uidx=uidx, **unzip(tparams)) 1993 | print 'Done' 1994 | 1995 | 1996 | # generate some samples with the model and display them 1997 | if numpy.mod(uidx, sampleFreq) == 0: 1998 | # FIXME: random selection?
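                # Added note (hedged, not part of the original source): every sampleFreq
                # updates, up to five sentences from the current minibatch are re-decoded
                # with gen_sample in stochastic mode (k=1) and printed next to the source
                # and the chunked reference, with '|' plus the chunk tag marking each
                # predicted chunk boundary. Beam search would be invoked the same way,
                # e.g. (a sketch; arguments assumed from the gen_sample signature):
                #
                #   samples, scores = gen_sample(tparams, f_init, f_next_chunk, f_next_word,
                #                                x[:, jj][:, None], model_options, trng=trng,
                #                                k=5, maxlen=50, stochastic=False)
                #   lengths = numpy.array([len(s) for s in samples])
                #   best = samples[numpy.argmin(numpy.array(scores) / lengths)]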
1999 | for jj in xrange(numpy.minimum(5, x.shape[1])): 2000 | stochastic = True 2001 | sample, score = gen_sample(tparams, f_init, f_next_chunk, f_next_word, 2002 | x[:, jj][:, None], 2003 | model_options, trng=trng, k=1, 2004 | stochastic=stochastic, 2005 | argmax=False) 2006 | print 'Source ', jj, ': ', 2007 | for vv in x[:, jj]: 2008 | if vv == 0: 2009 | break 2010 | if vv in worddicts_r[0]: 2011 | print worddicts_r[0][vv], 2012 | else: 2013 | print 'UNK', 2014 | print 2015 | print 'Truth ', jj, ' : ', 2016 | ci = 0 2017 | # print y_chunk[: , jj] 2018 | for chunk_index, word_index in zip(y_c[:, jj], y_cw[:, jj]): 2019 | 2020 | if word_index == 0: 2021 | break 2022 | if chunk_index in worddict_r_chunk and chunk_index != 1: # not NULL 2023 | print '|', worddict_r_chunk[chunk_index], 2024 | if word_index in worddicts_r[1]: 2025 | print worddicts_r[1][word_index], 2026 | else: 2027 | print 'UNK', 2028 | ci += 1 2029 | print 2030 | print 'Sample ', jj, ': ', 2031 | if stochastic: 2032 | ss = sample 2033 | else: 2034 | score = score / numpy.array([len(s) for s in sample]) 2035 | ss = sample[score.argmin()] 2036 | for vv in ss: 2037 | if vv == 0: 2038 | continue 2039 | if vv < 0: 2040 | vv = vv * -1 2041 | # print vv, 2042 | print '|', worddict_r_chunk[vv], 2043 | continue 2044 | if vv in worddicts_r[1]: 2045 | print worddicts_r[1][vv], 2046 | else: 2047 | print 'UNK', 2048 | print 2049 | 2050 | # validate model on validation set and early stop if necessary 2051 | if numpy.mod(uidx, validFreq) == 0: 2052 | use_noise.set_value(0.) 2053 | valid_errs = pred_probs(f_log_probs, prepare_training_data, 2054 | model_options, valid) 2055 | valid_err = valid_errs.mean() 2056 | history_errs.append(valid_err) 2057 | 2058 | if uidx == 0 or valid_err <= numpy.array(history_errs).min(): 2059 | best_p = unzip(tparams) 2060 | bad_counter = 0 2061 | if len(history_errs) > patience and valid_err >= \ 2062 | numpy.array(history_errs)[:-patience].min(): 2063 | bad_counter += 1 2064 | if bad_counter > patience: 2065 | print 'Early Stop!' 2066 | estop = True 2067 | break 2068 | 2069 | if numpy.isnan(valid_err): 2070 | ipdb.set_trace() 2071 | 2072 | print 'Valid ', valid_err 2073 | 2074 | # finish after this many updates 2075 | if uidx >= finish_after: 2076 | print 'Finishing after %d iterations!' % uidx 2077 | estop = True 2078 | break 2079 | 2080 | print 'Seen %d samples' % n_samples 2081 | 2082 | if estop: 2083 | break 2084 | 2085 | if best_p is not None: 2086 | zipp(best_p, tparams) 2087 | 2088 | use_noise.set_value(0.) 
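    # Added note (hedged, not part of the original source): once training stops, either
    # through early stopping on the validation cost or the finish_after limit, the best
    # parameters seen so far are zipped back into the Theano shared variables, dropout
    # noise is switched off, and a final mean validation cost is reported before the
    # model is saved one last time together with history_errs and uidx.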
2089 | valid_err = pred_probs(f_log_probs, prepare_training_data, 2090 | model_options, valid).mean() 2091 | 2092 | print 'Valid ', valid_err 2093 | 2094 | params = copy.copy(best_p) 2095 | numpy.savez(saveto, zipped_params=best_p, 2096 | history_errs=history_errs, 2097 | uidx=uidx, 2098 | **params) 2099 | 2100 | return valid_err 2101 | 2102 | 2103 | def sgd(lr, tparams, grads, x, mask, y, cost): 2104 | gshared = [theano.shared(p.get_value() * 0., 2105 | name='%s_grad' % k) 2106 | for k, p in tparams.iteritems()] 2107 | gsup = [(gs, g) for gs, g in zip(gshared, grads)] 2108 | 2109 | f_grad_shared = theano.function([x, mask, y], cost, updates=gsup, 2110 | profile=profile) 2111 | 2112 | pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)] 2113 | f_update = theano.function([lr], [], updates=pup, profile=profile) 2114 | 2115 | return f_grad_shared, f_update 2116 | 2117 | 2118 | if __name__ == '__main__': 2119 | pass 2120 | -------------------------------------------------------------------------------- /beam_decode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -l nodes=1:ppn=24 3 | #PBS -l walltime=24:00:00 4 | #PBS -N session2/models/memory-set_default 5 | #PBS -A course 6 | #PBS -q ShortQ 7 | 8 | export THEANO_FLAGS=device=cpu,floatX=float32 9 | 10 | #cd $PBS_O_WORKDIR 11 | python ./translate_gpu.py -n \ 12 | ./model_hal.npz \ 13 | ./model_hal.npz.pkl \ 14 | ././../../nmtdata/small.ch.pkl \ 15 | ././../../nmtdata/small.en.chunked.chunktag.pkl \ 16 | ././../../nmtdata/small.test \ 17 | ./small.result 18 | -------------------------------------------------------------------------------- /chunk_nmt_train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -l nodes=1:ppn=20 3 | #PBS -l walltime=168:00:00 4 | #PBS -N session2_default 5 | #PBS -A course 6 | #PBS -q GpuQ 7 | 8 | #export THEANO_FLAGS=device=gpu0,optimizer=None,floatX=float32,exception_verbosity=high 9 | export THEANO_FLAGS=device=gpu2,floatX=float32 10 | 11 | python ./train_nmt_zh2en.py 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /clean.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo rm model* 4 | sudo rm nohup.out 5 | -------------------------------------------------------------------------------- /codetest.py: -------------------------------------------------------------------------------- 1 | __author__ = 'zhouh' 2 | 3 | 4 | 5 | from training_data_iterator import TrainingTextIterator 6 | from nmt import prepare_training_data 7 | 8 | 9 | # train = TrainingTextIterator('/home/zhouh/workspace/python/nmtdata/small.ch', 10 | # '/home/zhouh/workspace/python/nmtdata/small.en.chunked', 11 | # '/home/zhouh/workspace/python/nmtdata/small.ch.pkl', 12 | # '/home/zhouh/workspace/python/nmtdata/small.en.chunked.pkl', 13 | # '/home/zhouh/workspace/python/nmtdata/small.en.chunked.chunktag.pkl', 14 | # n_words_source=10000, n_words_target=10000, 15 | # batch_size=2, max_chunk_len=50, max_word_len=50) 16 | 17 | train = TrainingTextIterator('/home/zhouh/workspace/python/nmtdata/hms.ch.filter', 18 | '/home/zhouh/workspace/python/nmtdata/hms.en.filter.chunked', 19 | '/home/zhouh/workspace/python/nmtdata/hms.ch.filter.pkl', 20 | '/home/zhouh/workspace/python/nmtdata/hms.en.filter.chunked.pkl', 21 | '/home/zhouh/workspace/python/nmtdata/hms.en.filter.chunked.chunktag.pkl', 22 | 
n_words_source=10000, n_words_target=10000, 23 | batch_size=1, max_chunk_len=30, max_word_len=50) 24 | 25 | 26 | 27 | n = 0 28 | batch = 0 29 | for i in train: 30 | print batch 31 | batch += 1 32 | 33 | s = i[0] 34 | tc = i[1] 35 | tcw = i[2] 36 | 37 | print 's', s 38 | print 'tc', tc 39 | print 'tcw', tcw 40 | 41 | 42 | x, x_mask, y_c, y_cw, chunk_indicator, y_mask = prepare_training_data(s, tc, tcw, maxlen_chunk=10, maxlen_cw=50, 43 | n_words_src=1000, 44 | n_words=1000) 45 | print 'x', x 46 | print 'x_mask', x_mask 47 | print 'y_c', y_c 48 | print 'chunk_indicator', chunk_indicator 49 | print 'y_cw', y_cw 50 | print 'y_mask', y_mask 51 | 52 | 53 | 54 | 55 | print batch 56 | 57 | -------------------------------------------------------------------------------- /computeCost.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Translates a source file using a translation model. 3 | ''' 4 | import argparse 5 | 6 | import numpy 7 | import theano 8 | import cPickle as pkl 9 | 10 | from nmt import (build_model, pred_probs, load_params, 11 | init_params, init_tparams, prepare_training_data) 12 | 13 | from training_data_iterator import TrainingTextIterator 14 | 15 | 16 | 17 | def main(model, 18 | pklmodel, 19 | valid_datasets=['../data/dev/newstest2011.en.tok', 20 | '../data/dev/newstest2011.fr.tok'], 21 | dictionaries=[ 22 | '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl', 23 | '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl'], 24 | dictionary_chunk='/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl', 25 | result_file='./cost.result'): 26 | 27 | 28 | 29 | 30 | 31 | # load the dictionaries of both source and target 32 | # load dictionaries and invert them 33 | worddicts = [None] * len(dictionaries) 34 | worddicts_r = [None] * len(dictionaries) 35 | for ii, dd in enumerate(dictionaries): 36 | with open(dd, 'rb') as f: 37 | worddicts[ii] = pkl.load(f) 38 | worddicts_r[ii] = dict() 39 | for kk, vv in worddicts[ii].iteritems(): 40 | worddicts_r[ii][vv] = kk 41 | 42 | # dict for chunk label 43 | worddict_chunk = [None] 44 | worddict_r_chunk = [None] 45 | with open(dictionary_chunk, 'rb') as f: 46 | worddict_chunk = pkl.load(f) 47 | worddict_r_chunk = dict() 48 | for kk, vv in worddict_chunk.iteritems(): 49 | worddict_r_chunk[vv] = kk 50 | print worddict_chunk 51 | 52 | print 'load model model_options' 53 | with open('%s' % pklmodel, 'rb') as f: 54 | options = pkl.load(f) 55 | 56 | 57 | # build valid set 58 | valid = TrainingTextIterator(valid_datasets[0], valid_datasets[1], 59 | dictionaries[0], dictionaries[1], dictionary_chunk, 60 | n_words_source=options['n_words_src'], n_words_target=options['n_words'], 61 | batch_size=options['batch_size'], 62 | max_chunk_len=options['maxlen_chunk'], max_word_len=options['maxlen_chunk_words']) 63 | 64 | 65 | # allocate model parameters 66 | params = init_params(options) 67 | 68 | # load model parameters and set theano shared variables 69 | params = load_params(model, params) 70 | tparams = init_tparams(params) 71 | 72 | trng, use_noise, \ 73 | x, x_mask, y_chunk, y_mask, y_cw, y_chunk_indicator, \ 74 | opt_ret, \ 75 | cost, cost_cw= \ 76 | build_model(tparams, options) 77 | 78 | 79 | inps = [x, x_mask, y_chunk, y_mask, y_cw, y_chunk_indicator] 80 | 81 | 82 | 83 | # before any regularizer 84 | print 'Building f_log_probs...', 85 | f_log_probs = theano.function(inps, cost, profile=False) 86 | f_log_probs_cw = theano.function(inps, cost_cw, profile=False) 87 | print 'Done' 88 | 89 | 
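    # Added note (hedged, not part of the original source): this script reloads a trained
    # model and reports the mean per-sentence costs on a held-out set, split into the
    # chunk-level cost and the word-level cost, written to result_file. Note that the
    # call below passes two probability functions, whereas the pred_probs defined in
    # nmt.py above accepts a single f_log_probs; it presumably targets a variant of
    # pred_probs that evaluates both costs.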
valid_errs, valid_errs_cw = pred_probs(f_log_probs, f_log_probs_cw, prepare_training_data, 90 | options, valid) 91 | 92 | valid_err = valid_errs.mean() 93 | valid_err_cw = valid_errs_cw.mean() 94 | 95 | with open(result_file, 'w') as result_file: 96 | print >> result_file, valid_err, valid_err_cw 97 | 98 | 99 | 100 | if __name__ == "__main__": 101 | parser = argparse.ArgumentParser() 102 | parser.add_argument('model', type=str) 103 | parser.add_argument('pklmodel', type=str) 104 | parser.add_argument('dictionary', type=str) 105 | parser.add_argument('dictionary_target', type=str) 106 | parser.add_argument('dictionary_chunk', type=str) 107 | parser.add_argument('valid_source', type=str) 108 | parser.add_argument('valid_target', type=str) 109 | parser.add_argument('result_file', type=str) 110 | 111 | args = parser.parse_args() 112 | 113 | main(args.model, 114 | args.pklmodel, 115 | valid_datasets=[args.valid_source, args.valid_target], 116 | dictionaries=[args.dictionary, args.dictionary_target], 117 | dictionary_chunk=args.dictionary_chunk, 118 | result_file=args.result_file) 119 | -------------------------------------------------------------------------------- /cpu_train_chunk_nmt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -l nodes=1:ppn=20 3 | #PBS -l walltime=168:00:00 4 | #PBS -N session2_default 5 | #PBS -A course 6 | #PBS -q GpuQ 7 | 8 | export THEANO_FLAGS=device=cpu,optimizer=None,floatX=float32,exception_verbosity=high 9 | 10 | python ./train_nmt_zh2en_pc.py 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /data_iterator.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | import cPickle as pkl 4 | import gzip 5 | 6 | 7 | def fopen(filename, mode='r'): 8 | if filename.endswith('.gz'): 9 | return gzip.open(filename, mode) 10 | return open(filename, mode) 11 | 12 | 13 | class TextIterator: 14 | """Simple Bitext iterator.""" 15 | def __init__(self, source, target, 16 | source_dict, target_dict, 17 | batch_size=128, 18 | maxlen=100, 19 | n_words_source=-1, 20 | n_words_target=-1): 21 | self.source = fopen(source, 'r') 22 | self.target = fopen(target, 'r') 23 | with open(source_dict, 'rb') as f: 24 | self.source_dict = pkl.load(f) 25 | with open(target_dict, 'rb') as f: 26 | self.target_dict = pkl.load(f) 27 | 28 | self.batch_size = batch_size 29 | self.maxlen = maxlen 30 | 31 | self.n_words_source = n_words_source 32 | self.n_words_target = n_words_target 33 | 34 | self.source_buffer = [] 35 | self.target_buffer = [] 36 | self.k = batch_size * 20 37 | 38 | self.end_of_data = False 39 | 40 | def __iter__(self): 41 | return self 42 | 43 | def reset(self): 44 | self.source.seek(0) 45 | self.target.seek(0) 46 | 47 | def next(self): 48 | if self.end_of_data: 49 | self.end_of_data = False 50 | self.reset() 51 | raise StopIteration 52 | 53 | source = [] 54 | target = [] 55 | 56 | # fill buffer, if it's empty 57 | assert len(self.source_buffer) == len(self.target_buffer), 'Buffer size mismatch!' 
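        # Added note (hedged, not part of the original source): the iterator keeps a
        # buffer of k = batch_size * 20 sentence pairs, sorts it by target length so
        # that minibatches contain similarly long sentences (less padding), and then
        # pops batch_size pairs per call, mapping words to ids and replacing
        # out-of-vocabulary items with index 1. A minimal usage sketch (file names
        # are placeholders):
        #
        #   it = TextIterator('train.src', 'train.tgt', 'src.vocab.pkl',
        #                     'tgt.vocab.pkl', batch_size=128, maxlen=100)
        #   for source_batch, target_batch in it:   # lists of word-id lists
        #       pass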
58 | 59 | if len(self.source_buffer) == 0: 60 | for k_ in xrange(self.k): 61 | ss = self.source.readline() 62 | if ss == "": 63 | break 64 | tt = self.target.readline() 65 | if tt == "": 66 | break 67 | 68 | self.source_buffer.append(ss.strip().split()) 69 | self.target_buffer.append(tt.strip().split()) 70 | 71 | # sort by target buffer 72 | tlen = numpy.array([len(t) for t in self.target_buffer]) 73 | tidx = tlen.argsort() 74 | 75 | _sbuf = [self.source_buffer[i] for i in tidx] 76 | _tbuf = [self.target_buffer[i] for i in tidx] 77 | 78 | self.source_buffer = _sbuf 79 | self.target_buffer = _tbuf 80 | 81 | if len(self.source_buffer) == 0 or len(self.target_buffer) == 0: 82 | self.end_of_data = False 83 | self.reset() 84 | raise StopIteration 85 | 86 | try: 87 | 88 | # actual work here 89 | while True: 90 | 91 | # read from source file and map to word index 92 | try: 93 | ss = self.source_buffer.pop() 94 | except IndexError: 95 | break 96 | ss = [self.source_dict[w] if w in self.source_dict else 1 97 | for w in ss] 98 | if self.n_words_source > 0: 99 | ss = [w if w < self.n_words_source else 1 for w in ss] 100 | 101 | # read from source file and map to word index 102 | tt = self.target_buffer.pop() 103 | tt = [self.target_dict[w] if w in self.target_dict else 1 104 | for w in tt] 105 | if self.n_words_target > 0: 106 | tt = [w if w < self.n_words_target else 1 for w in tt] 107 | 108 | if len(ss) > self.maxlen and len(tt) > self.maxlen: 109 | continue 110 | 111 | source.append(ss) 112 | target.append(tt) 113 | 114 | if len(source) >= self.batch_size or \ 115 | len(target) >= self.batch_size: 116 | break 117 | except IOError: 118 | self.end_of_data = True 119 | 120 | if len(source) <= 0 or len(target) <= 0: 121 | self.end_of_data = False 122 | self.reset() 123 | raise StopIteration 124 | 125 | return source, target 126 | -------------------------------------------------------------------------------- /haos.bib: -------------------------------------------------------------------------------- 1 | @InProceedings{chen2022mtg, 2 | author = {Chen, Yiran and Song, Zhenqiao and Wu, Xianze and Wang, Danqing and Xu, Jingjing and Chen, Jiaze and Zhou, Hao and Li, Lei}, 3 | booktitle = {Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT Findings)}, 4 | title = {{MTG}: A Benchmark Suite for Multilingual Text Generation}, 5 | year = {2022}, 6 | month = jul, 7 | abstract = {We introduce MTG, a new benchmark suite for training and evaluating multilingual text generation. It is the first and largest multilingual multiway text generation benchmark with 400k human-annotated data for four generation tasks (story generation, question generation, title generation and text summarization) across five languages (English, German, French, Spanish and Chinese). Its multiway characteristic makes it possible to achieve direct cross-lingual generation between any two languages, thus facilitating knowledge transfer. Based on MTG, we set various evaluation scenarios and conduct deep analyses of several popular multilingual generation models from different aspects. 
Our benchmark suite can foster model performance enhancement with more human-annotated parallel data and encourage model evaluation with more diverse generation scenarios.}, 8 | eprint = {https://arxiv.org/abs/2108.07140}, 9 | author+an = {7=highlight} 10 | } 11 | @InProceedings{bao2022latent, 12 | author = {Yu Bao and Hao Zhou and Shujian Huang and Dongqi Wang and Lihua Qian and Xinyu Dai and Jiajun Chen and Lei Li}, 13 | booktitle = {the 60th Annual Meeting of the Association for Computational Linguistics (ACL)}, 14 | title = {latent-{GLAT}: Glancing at Latent Variables for Parallel Text Generation}, 15 | year = {2022}, 16 | month = may, 17 | abstract = {Recently, parallel text generation has received widespread attention due to its success in generation efficiency. Although many advanced techniques are proposed to improve its generation quality, they still need the help of an autoregressive model for training to overcome the one-to-many multi-modal phenomenon in the dataset, limiting their applications. In this paper, we propose latent-GLAT, which employs the discrete latent variables to capture word categorical information and invoke an advanced curriculum learning technique, alleviating the multi-modality problem. Experiment results show that our method outperforms strong baselines without the help of an autoregressive model, which further broadens the application scenarios of the parallel decoding paradigm.}, 18 | code = {https://github.com/baoy-nlp/Latent-GLAT}, 19 | eprint = {https://openreview.net/forum?id=y4xCe0MSoWx}, 20 | author+an = {1=student; 2=highlight} 21 | } 22 | @InProceedings{fu2022contextual, 23 | author = {Zhiyi Fu and Wangchunshu Zhou and Jingjing Xu and Hao Zhou and Lei Li}, 24 | booktitle = {the 60th Annual Meeting of the Association for Computational Linguistics (ACL)}, 25 | title = {Contextual Representation Learning beyond Masked Language Modeling}, 26 | year = {2022}, 27 | month = may, 28 | abstract = {How do masked language models (MLMs) such as BERT learn contextual representations? In this work, we analyze the learning dynamics of MLMs. We find that MLMs adopt sampled embeddings as anchors to estimate and inject contextual semantics to representations, which limits the efficiency and effectiveness of MLMs. To address these issues, we propose TACO, a simple yet effective representation learning approach to directly model global semantics. TACO extracts and aligns contextual semantics hidden in contextualized representations to encourage models to attend global semantics when generating contextualized representations. Experiments on the GLUE benchmark show that TACO achieves up to 5x speedup and up to 1.2 points average improvement over existing MLMs.}, 29 | code = {https:// github.com/FUZHIYI/TACO}, 30 | eprint = {https://openreview.net/forum?id=KWL_ElhUejN}, 31 | author+an = {4=highlight} 32 | } 33 | @InProceedings{chen2022e, 34 | author = {Jiangjie Chen and Rui Xu and Ziquan Fu and Wei Shi and Zhongqiao Li and Xinbo Zhang and Changzhi Sun and Lei Li and Yanghua Xiao and Hao Zhou}, 35 | booktitle = {the 60th Annual Meeting of the Association for Computational Linguistics (ACL) - Findings}, 36 | title = {{E-KAR}: A Benchmark for Rationalizing Natural Language Analogical Reasoning}, 37 | year = {2022}, 38 | month = may, 39 | abstract = {The ability to recognize analogies is fundamental to human cognition. Existing benchmarks to test word analogy do not reveal the underneath process of analogical reasoning of neural models. 
Holding the belief that models capable of reasoning should be right for the right reasons, we propose a first-of-its- kind Explainable Knowledge-intensive Analogical Reasoning benchmark (E-KAR). Our benchmark consists of 1,655 (in Chinese) and 1,251 (in English) problems sourced from the Civil Service Exams, which require intensive background knowledge to solve. More importantly, we design a free-text explanation scheme to explain whether an analogy should be drawn, and manually annotate them for each and every question and candidate answer. Empirical results suggest that this benchmark is very challenging for some state-of-the-art models for both explanation generation and analogical question answering tasks, which invites further research in this area. Project page of E-KAR can be found at https:// ekar-leaderboard.github.io.}, 40 | eprint = {https://openreview.net/forum?id=9kXOFRtrEj}, 41 | url = {https://ekar-leaderboard.github.io}, 42 | author+an = {1=student; 10=highlight;} 43 | } 44 | @InProceedings{sun2022rethinking, 45 | author = {Zewei Sun and Mingxuan Wang and Hao Zhou and Chengqi Zhao and Shujian Huang and Jiajun Chen and Lei Li}, 46 | booktitle = {the 60th Annual Meeting of the Association for Computational Linguistics (ACL) - Findings}, 47 | title = {Rethinking Document-level Neural Machine Translation}, 48 | year = {2022}, 49 | month = may, 50 | abstract = {This paper does not aim at introducing a novel model for document-level neural machine translation. Instead, we head back to the original Transformer model and hope to answer the following question: Is the capacity of current models strong enough for document-level translation? Interestingly, we observe that the original Transformer with appropriate training techniques can achieve strong results for document translation, even with a length of 2000 words. We evaluate this model and several recent approaches on nine document-level datasets and two sentence-level datasets across six languages. Experiments show that document-level Transformer models outperforms sentence-level ones and many previous methods in a comprehensive set of metrics, including BLEU, four lexical indices, three newly proposed assistant linguistic indicators, and human evaluation. Our new datasets and evaluation scripts are in https://github. com/sunzewei2715/Doc2Doc_NMT.}, 51 | code = {https://github. 
com/sunzewei2715/Doc2Doc_NMT}, 52 | eprint = {https://openreview.net/forum?id=sU9fYzNZ3xX}, 53 | author+an = {3=highlight} 54 | } 55 | @InProceedings{song2022switch, 56 | author = {Zhenqiao Song and Hao Zhou and Lihua Qian and Jingjing Xu and Shanbo Cheng and Mingxuan Wang and Lei Li}, 57 | booktitle = {International Conference on Learning Representations (ICLR)}, 58 | title = {{switch-GLAT}: Multilingual Parallel Machine Translation via Code-switch Decoder}, 59 | year = {2022}, 60 | month = apr, 61 | eprint = {https://openreview.net/forum?id=5HvpvYd68b}, 62 | author+an = {1=student; 2=highlight; 3=student} 63 | } 64 | @InProceedings{yang2022enhancing, 65 | author = {Huiyun Yang and Huadong Chen and Hao Zhou and Lei Li}, 66 | booktitle = {International Conference on Learning Representations (ICLR)}, 67 | title = {Enhancing Cross-lingual Transfer by Manifold Mixup}, 68 | year = {2022}, 69 | month = apr, 70 | eprint = {https://openreview.net/forum?id=OjPmfr9GkVv}, 71 | author+an = {1=student; 3=highlight} 72 | } 73 | @InProceedings{huang2022non, 74 | author = {Chenyang Huang and Hao Zhou and Osmar Zaiane and Lili Mou and Lei Li}, 75 | booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)}, 76 | title = {Non-Autoregressive Translation with Layer-Wise Prediction and Deep Supervision}, 77 | year = {2022}, 78 | month = feb, 79 | abstract = {How do we perform efficient inference while retaining high translation quality? Existing neural machine translation models, such as Transformer, achieve high performance, but they decode words one by one, which is inefficient. Recent non-autoregressive translation models speed up the inference, but their quality is still inferior. In this work, we propose DSLP, a highly efficient and high-performance model for machine translation. The key insight is to train a non-autoregressive Transformer with Deep Supervision and feed additional Layer-wise Predictions. We conducted extensive experiments on four translation tasks (both directions of WMT'14 EN-DE and WMT'16 EN-RO). Results show that our approach consistently improves the BLEU scores compared with respective base models. 
Specifically, our best variant outperforms the autoregressive model on three translation tasks, while being 14.8 times more efficient in inference.}, 80 | eprint = {https://arxiv.org/abs/2110.07515}, 81 | author+an = {1=student; 2=highlight} 82 | } 83 | @InProceedings{chen-gan2022aaai, 84 | title={Unsupervised Editing for Counterfactual Stories}, 85 | author={Chen, Jiangjie and Gan, Chun and Chen, Sijie and Zhou, Hao and Xiao, Yanghua and Li, Lei}, 86 | year={2022}, 87 | booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)}, 88 | author+an = {1=student; 2=student; 4=highlight} 89 | } 90 | 91 | @InProceedings{chen2022loren, 92 | title={LOREN: Logic-Regularized Reasoning for Interpretable Fact Verification}, 93 | author={Chen, Jiangjie and Bao, Qiaoben and Sun, Changzhi and Zhang, Xinbo and Chen, Jiaze and Zhou, Hao and Xiao, Yanghua and Li, Lei}, 94 | year={2022}, 95 | booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)}, 96 | author+an = {1=student; 6=highlight} 97 | } 98 | @InProceedings{zheng2021duplex, 99 | author = {Zaixiang Zheng and Hao Zhou and Shujian Huang and Jiajun Chen and Jingjing Xu and Lei Li}, 100 | booktitle = {the 35th Conference on Neural Information Processing Systems (NeurIPS)}, 101 | title = {Duplex Sequence-to-Sequence Learning for Reversible Machine Translation}, 102 | year = {2021}, 103 | month = dec, 104 | abstract = {In this work, we design a simple, direct, and fast framework for instance segmentation with strong performance. To this end, we propose a novel and effective approach, termed SOLOv2, following the principle of the SOLO method. First, our new framework is empowered by an efficient and holistic instance mask representation scheme, which dynamically segments each instance in the image, without resorting to bounding box detection. Specifically, the object mask generation is decoupled into a mask kernel prediction and mask feature learning, which are responsible for generating convolution kernels and the feature maps to be convolved with, respectively. Second, SOLOv2 significantly reduces inference overhead with our novel matrix non-maximum suppression (NMS) technique. Our Matrix NMS performs NMS with parallel matrix operations in one shot, and yields better results. We demonstrate that the proposed SOLOv2 achieves the state-of-the- art performance with high efficiency, making it suitable for both mobile and cloud applications. A light-weight version of SOLOv2 executes at 31.3 FPS and yields 37.1\% AP on COCO test-dev. Moreover, our state-of-the-art results in object detection (from our mask byproduct) and panoptic segmentation show the potential of SOLOv2 to serve as a new strong baseline for many instance-level recognition tasks.}, 105 | eprint = {https://arxiv.org/abs/2105.03458}, 106 | author+an = {1=student; 2=highlight} 107 | } 108 | @InProceedings{qian2021volctrans, 109 | author = {Lihua Qian and Yi Zhou and Zaixiang Zheng and Yaoming Zhu and Zehui Lin and Jiangtao Feng and Shanbo Cheng and Lei Li and Mingxuan Wang and Hao Zhou}, 110 | booktitle = {Sixth Conference on Machine Translation (WMT21)}, 111 | title = {The {Volctrans} {GLAT} System: Non-autoregressive Translation Meets {WMT21}}, 112 | year = {2021}, 113 | month = nov, 114 | abstract = {This paper describes the Volctrans' submission to the WMT21 news translation shared task for German->English translation. 
We build a parallel (i.e., non-autoregressive) translation system using the Glancing Transformer, which enables fast and accurate parallel decoding in contrast to the currently prevailing autoregressive models. To the best of our knowledge, this is the first parallel translation system that can be scaled to such a practical scenario like WMT competition. More importantly, our parallel translation system achieves the best BLEU score (35.0) on German->English translation task, outperforming all strong autoregressive counterparts.}, 115 | entrysubtype = {workshop}, 116 | eprint = {https://arxiv.org/abs/2109.11247}, 117 | author+an = {1=student; 2=student; 3=student; 10=highlight} 118 | } 119 | @InProceedings{ru2021learning, 120 | author = {Dongyu Ru and Changzhi Sun and Jiangtao Feng and Lin Qiu and Hao Zhou and Weinan Zhang and Yong Yu and Lei Li}, 121 | booktitle = {the Conference on Empirical Methods in Natural Language Processing (EMNLP)}, 122 | title = {Learning Logic Rules for Document-level Relation Extraction}, 123 | year = {2021}, 124 | month = nov, 125 | abstract = {Document-level relation extraction aims to identify relations between entities in a whole document. Prior efforts to capture long-range dependencies have relied heavily on implicitly powerful representations learned through (graph) neural networks, which makes the model less transparent. To tackle this challenge, in this paper, we propose LogiRE, a novel probabilistic model for document-level relation extraction by learning logic rules. LogiRE treats logic rules as latent variables and consists of two modules: a rule generator and a relation extractor. The rule generator is to generate logic rules potentially contributing to final predictions, and the relation extractor outputs final predictions based on the generated logic rules. Those two modules can be efficiently optimized with the expectation--maximization (EM) algorithm. By introducing logic rules into neural networks, LogiRE can explicitly capture long-range dependencies as well as enjoy better interpretation. Empirical results show that LogiRE significantly outperforms several strong baselines in terms of relation performance (∼1.8 F1 score) and logical consistency (over 3.3 logic score). Our code is available at https://github. com/rudongyu/LogiRE.}, 126 | code = {https://github.com/rudongyu/LogiRE}, 127 | video = {https://underline.io/lecture/38055-learning-logic-rules-for-document-level-relation-extraction}, 128 | author+an = {5=highlight} 129 | } 130 | @InProceedings{wang2021cnewsum, 131 | author = {Danqing Wang and Jiaze Chen and Xianze Wu and Hao Zhou and Lei Li}, 132 | booktitle = {The 10th CCF International Conference on Natural Language Processing and Chinese Computing (NLPCC)}, 133 | title = {{CNewSum}: A Large-scale Chinese News Summarization Dataset with Human-annotated Adequacy and Deducibility Level}, 134 | year = {2021}, 135 | address = {Qingdao, China}, 136 | month = oct, 137 | abstract = {Automatic text summarization aims to produce a brief but crucial summary for the input documents. Both extractive and abstractive methods have witnessed great success in English datasets in recent years. However, there has been a minimal exploration of text summarization in Chinese, limited by the lack of large-scale datasets. In this paper, we present a large-scale Chinese news summarization dataset CNewSum, which consists of 304,307 documents and human-written summaries for the news feed. 
It has long documents with high-abstractive summaries, which can encourage document-level understanding and generation for current summarization models. An additional distinguishing feature of CNewSum is that its test set contains adequacy and deducibility annotations for the summaries. The adequacy level measures the degree of summary information covered by the document, and the deducibility indicates the reasoning ability the model needs to generate the summary. These annotations can help researchers analyze and target their model performance bottleneck. We examine recent methods on CNewSum and release our dataset to provide a solid testbed for automatic Chinese summarization research.}, 138 | eprint = {https://arxiv.org/abs/2110.10874}, 139 | url = {https://dqwang122.github.io/projects/CNewSum/}, 140 | author+an = {4=highlight} 141 | } 142 | @InProceedings{shi-song2021ecml, 143 | author = {Wenxian Shi and Yuxuan Song and Bohan Li and Hao Zhou and Lei Li}, 144 | booktitle = {the European Conference on Machine Learning and Principles and Practice of Knowledge Discovery in Databases (ECML-PKDD)}, 145 | title = {Follow Your Path: a Progressive Method for Knowledge Distillation}, 146 | year = {2021}, 147 | month = jul, 148 | author+an = {1=student; 2=student; 3=student; 4=highlight} 149 | } 150 | @InProceedings{qian2021acl, 151 | author = {Lihua Qian and Hao Zhou and Yu Bao and Mingxuan Wang and Lin Qiu and Weinan Zhang and Yong Yu and Lei Li}, 152 | booktitle = {the 59th Annual Meeting of the Association for Computational Linguistics (ACL)}, 153 | title = {Glancing Transformer for Non-Autoregressive Neural Machine Translation}, 154 | year = {2021}, 155 | month = jul, 156 | author+an = {1=student; 2=highlight; 3=student} 157 | } 158 | @InProceedings{xu2021acl, 159 | author = {Jingjing Xu and Hao Zhou and Chun Gan and Zaixiang Zheng and Lei Li}, 160 | booktitle = {the 59th Annual Meeting of the Association for Computational Linguistics (ACL) - Best Paper Award}, 161 | title = {Vocabularization via Optimal Transport for Neural Machine Translation}, 162 | year = {2021}, 163 | month = jul, 164 | author+an = {1=student; 2=highlight; 3=student; 4=student} 165 | } 166 | @InProceedings{wang2021acl, 167 | author = {Yijun Wang and Changzhi Sun and Yuanbin Wu and Hao Zhou and Lei Li and Junchi Yan}, 168 | booktitle = {the 59th Annual Meeting of the Association for Computational Linguistics (ACL)}, 169 | title = {A Unified Label Space for Entity Relation Extraction}, 170 | year = {2021}, 171 | month = jul, 172 | author+an = {4=highlight} 173 | } 174 | @InProceedings{sunzhang2021acl, 175 | author = {Changzhi Sun and Xinbo Zhang and Jiangjie Chen and Chun Gan and Yuanbin Wu and Jiaze Chen and Hao Zhou and Lei Li}, 176 | booktitle = {the 59th Annual Meeting of the Association for Computational Linguistics (ACL) - Finding}, 177 | title = {Probabilistic Graph Reasoning for Natural Proof Generation}, 178 | year = {2021}, 179 | month = jul, 180 | author+an = {7=highlight} 181 | } 182 | @InProceedings{wangdq2021acl, 183 | author = {Danqing Wang and Jiaze Chen and Hao Zhou and Xipeng Qiu and Lei Li}, 184 | booktitle = {the 59th Annual Meeting of the Association for Computational Linguistics (ACL) - Finding}, 185 | title = {Contrastive Aligned Joint Learning for Multilingual Summarization}, 186 | year = {2021}, 187 | month = jul, 188 | author+an = {3=highlight} 189 | } 190 | @InProceedings{wang2021enpar, 191 | author = {Yijun Wang and Changzhi Sun and Yuanbin Wu and Hao Zhou and Lei Li and Junchi Yan}, 192 | 
booktitle = {Proceedings of European Chapter of the Association for Computational Linguistics (EACL)}, 193 | title = {{ENPAR}: Enhancing Entity and Entity Pair Representations for Joint Entity Relation Extraction}, 194 | year = {2021}, 195 | month = apr, 196 | author+an = {4=highlight} 197 | } 198 | @InProceedings{yutongICLR, 199 | author = {Yutong Xie and Chence Shi and Hao Zhou and Yuwei Yang and Weinan Zhang and Yong Yu and Lei Li}, 200 | booktitle = {International Conference on Learning Representations (ICLR) - Spotlight}, 201 | title = {MARS: Markov Molecular Sampling for Multi-objective Drug Discovery}, 202 | year = {2021}, 203 | month = mar, 204 | author+an = {1=student; 2=student; 3=highlight} 205 | } 206 | @InProceedings{huang2021acmo, 207 | author = {Xunpeng Huang and Runxin Xu and Hao Zhou and Zhe Wang and Zhengyang Liu and Lei Li}, 208 | booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)}, 209 | title = {ACMo: Angle-Calibrated Moment Methods for Stochastic Optimization}, 210 | year = {2021}, 211 | month = feb, 212 | author+an = {1=student; 2=student; 3=highlight} 213 | } 214 | @InProceedings{dong2021listen, 215 | author = {Qianqian Dong and Rong Ye and Mingxuan Wang and Hao Zhou and Shuang Xu and Bo Xu and Lei Li}, 216 | booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)}, 217 | title = {Listen, Understand and Translate: Triple Supervision Decouples End-to-end Speech-to-text Translation}, 218 | year = {2021}, 219 | month = feb, 220 | author+an = {4=highlight} 221 | } 222 | @InProceedings{dong2021consecutive, 223 | author = {Qianqian Dong and Mingxuan Wang and Hao Zhou and Shuang Xu and Bo Xu and Lei Li}, 224 | booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)}, 225 | title = {Consecutive Decoding for Speech-to-text Translation}, 226 | year = {2021}, 227 | month = feb, 228 | author+an = {3=highlight} 229 | } 230 | @InProceedings{song2021triangular, 231 | author = {Zhenqiao Song and Jiaze Chen and Hao Zhou and Lei Li}, 232 | booktitle = {Proceedings of the 14th International Conference on Web Search and Data Mining (WSDM)}, 233 | title = {Triangular Bidword Generation for Sponsored Search Auction}, 234 | year = {2021}, 235 | author+an = {1=student; 3=highlight} 236 | } 237 | @InProceedings{li2020sentence, 238 | author = {Bohan Li and Hao Zhou and Junxian He and Mingxuan Wang and Yiming Yang and Lei Li}, 239 | booktitle = {the Conference on Empirical Methods in Natural Language Processing (EMNLP)}, 240 | title = {On the Sentence Embeddings from Pre-trained Language Models}, 241 | year = {2020}, 242 | month = nov, 243 | author+an = {1=student; 2=highlight} 244 | } 245 | @InProceedings{lin2020pre, 246 | author = {Zehui Lin and Xiao Pan and Mingxuan Wang and Xipeng Qiu and Jiangtao Feng and Hao Zhou and Lei Li}, 247 | booktitle = {the Conference on Empirical Methods in Natural Language Processing (EMNLP)}, 248 | title = {Pre-training Multilingual Neural Machine Translation by Leveraging Alignment Information}, 249 | year = {2020}, 250 | month = nov, 251 | author+an = {6=highlight} 252 | } 253 | @InProceedings{ru2020active, 254 | author = {Dongyu Ru and Jiangtao Feng and Lin Qiu and Hao Zhou and Mngxuan Wang and Weinan Zhang and Yong Yu and Lei Li}, 255 | booktitle = {the Conference on Empirical Methods in Natural Language Processing (EMNLP) - Findings}, 256 | title = {Active Sentence Learning by Adversarial Uncertainty Sampling in Discrete Space}, 257 | year = {2020}, 258 | month = nov, 259 
| author+an = {1=student; 1=student; 3=student; 4=highlight} 260 | } 261 | @InProceedings{shi2020dispersed, 262 | author = {Wenxian Shi and Hao Zhou and Ning Miao and Lei Li}, 263 | booktitle = {Proceedings of the 37th International Conference on Machine learning (ICML)}, 264 | title = {Dispersing Exponential Family Mixture {VAE}s for Interpretable Text Generation}, 265 | year = {2020}, 266 | month = jul, 267 | author+an = {1=student; 2=highlight}, 268 | } 269 | @InProceedings{ru2020quachie, 270 | author = {Dongyu Ru and Zhenghui Wang and Lin Qiu and Hao Zhou and Lei Li and Weinan Zhang and Yong Yu}, 271 | booktitle = {the 43rd International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR) - System Demonstrations}, 272 | title = {{QuAChIE}: Question Answering based {Chinese} Information Extraction System}, 273 | year = {2020}, 274 | month = jul, 275 | entrysubtype = {demo}, 276 | author+an = {1=student; 2=student; 3=student; 4=highlight} 277 | } 278 | @InProceedings{miao2020do, 279 | author = {Ning Miao and Yuxuan Song and Hao Zhou and Lei Li}, 280 | booktitle = {the 58th Annual Meeting of the Association for Computational Linguistics (ACL) - short papers}, 281 | title = {Do you have the right scissors? Tailoring Pre-trained Language Models via {Monte}-{Carlo} Methods}, 282 | year = {2020}, 283 | month = jul, 284 | author+an = {1=student; 2=student; 3=highlight} 285 | } 286 | @inproceedings{liu-etal-2020-unsupervised, 287 | title = "Unsupervised Paraphrasing by Simulated Annealing", 288 | author = "Liu, Xianggen and 289 | Mou, Lili and 290 | Meng, Fandong and 291 | Zhou, Hao and 292 | Zhou, Jie and 293 | Song, Sen", 294 | booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics (ACL)", 295 | month = jul, 296 | year = "2020", 297 | author+an = {1=student; 4=highlight} 298 | } 299 | @InProceedings{xu2020xiaomingbot, 300 | author = {Runxin Xu and Jun Cao and Mingxuan Wang and Jiaze Chen and Hao Zhou and Ying Zeng and Yuping Wang and Li Chen and Xiang Yin and Xijin Zhang and Songcheng Jiang and Yuxuan Wang and Lei Li}, 301 | booktitle = {the 58th Annual Meeting of the Association for Computational Linguistics (ACL) - System Demonstrations}, 302 | title = {Xiaomingbot: A Multilingual Robot News Reporter}, 303 | year = {2020}, 304 | month = jul, 305 | author+an = {1=student; 5=highlight} 306 | } 307 | @InProceedings{song2020improving, 308 | author = {Yuxuan Song and Ning Miao and Hao Zhou and Lantao Yu and Mingxuan Wang and Lei Li}, 309 | booktitle = {The 23rd International Conference on Artificial Intelligence and Statistics (AISTATS)}, 310 | title = {Improving Maximum Likelihood Training for Text Generation with Density Ratio Estimation}, 311 | year = {2020}, 312 | month = aug, 313 | author+an = {1=student; 2=student; 3=highlight} 314 | } 315 | @InProceedings{ye2020variational, 316 | author = {Rong Ye and Wenxian Shi and Hao Zhou and Zhongyu Wei and Lei Li}, 317 | booktitle = {International Conference on Learning Representations (ICLR)}, 318 | title = {Variational Template Machine for Data-to-Text Generation}, 319 | year = {2020}, 320 | month = apr, 321 | author+an = {1=student; 2=student; 3=highlight} 322 | } 323 | @InProceedings{zheng2020mirror, 324 | author = {Zaixiang Zheng and Hao Zhou and Shujian Huang and Lei Li and Xinyu Dai and Jiajun Chen}, 325 | booktitle = {International Conference on Learning Representations (ICLR) - Oral}, 326 | title = {Mirror Generative Models for Neural Machine Translation}, 327 | 
year = {2020}, 328 | month = apr, 329 | author+an = {1=student; 2=highlight} 330 | } 331 | @InProceedings{song2020infomax, 332 | title={Infomax Neural Joint Source-Channel Coding via Adversarial Bit Flip}, 333 | author={Song, Yuxuan and Xu, Minkai and Yu, Lantao and Zhou, Hao and Shao, Shuo and Yu, Yong}, 334 | booktitle = {the 34th {AAAI} Conference on Artificial Intelligence ({AAAI})}, 335 | year={2020}, 336 | month = feb, 337 | author+an = {1=student; 2=student; 4=highlight} 338 | } 339 | @InProceedings{yang2020towards, 340 | author = {Jiacheng Yang and Mingxuan Wang and Hao Zhou and Chengqi Zhao and Weinan Zhang and Yong Yu and Lei Li}, 341 | booktitle = {the 34th {AAAI} Conference on Artificial Intelligence ({AAAI})}, 342 | title = {Towards Making the Most of {BERT} in Neural Machine Translation}, 343 | year = {2020}, 344 | month = feb, 345 | author+an = {3=highlight} 346 | 347 | } 348 | @InProceedings{wu2020importance, 349 | author = {Qingyang Wu and Lei Li and Hao Zhou and Ying Zeng and Zhou Yu}, 350 | booktitle = {the 34th {AAAI} Conference on Artificial Intelligence (AAAI)}, 351 | title = {Importance-Aware Learning for Neural Headline Editing}, 352 | year = {2020}, 353 | month = feb, 354 | author+an = {1=student; 3=highlight} 355 | } 356 | @InProceedings{fu2019rethinking, 357 | author = {Fu, Yao and Zhou, Hao and Chen, Jiaze and Li, Lei}, 358 | booktitle = {the 12th International Conference on Natural Language Generation (INLG)}, 359 | title = {Rethinking Text Attribute Transfer: A Lexical Analysis}, 360 | year = {2019}, 361 | month = oct, 362 | author+an = {1=student; 2=highlight} 363 | } 364 | @InProceedings{miao2019kernelized, 365 | author = {Miao, Ning and Zhou, Hao and Zhao, Chengqi and Shi, Wenxian and Li, Lei}, 366 | booktitle = {the 33rd Conference on Neural Information Processing Systems (NeurIPS)}, 367 | title = {Kernelized {Bayesian} Softmax for Text Generation}, 368 | year = {2019}, 369 | month = dec, 370 | author+an = {1=student; 2=highlight} 371 | } 372 | @inproceedings{qiu-etal-2019-dynamically, 373 | title = "Dynamically Fused Graph Network for Multi-hop Reasoning", 374 | author = "Qiu, Lin and 375 | Xiao, Yunxuan and 376 | Qu, Yanru and 377 | Zhou, Hao and 378 | Li, Lei and 379 | Zhang, Weinan and 380 | Yu, Yong", 381 | booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics (ACL)", 382 | month = jul, 383 | year = "2019", 384 | author+an = {1=student; 2=student; 3=student; 4=highlight} 385 | } 386 | @inproceedings{zhang-etal-2019-generating-fluent, 387 | title = "Generating Fluent Adversarial Examples for Natural Languages", 388 | author = "Zhang, Huangzhao and 389 | Zhou, Hao and 390 | Miao, Ning and 391 | Li, Lei", 392 | booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics (ACL) - short papers", 393 | month = jul, 394 | year = "2019", 395 | author+an = {1=student; 3=student; 2=highlight} 396 | } 397 | @inproceedings{bao-etal-2019-generating, 398 | title = "Generating Sentences from Disentangled Syntactic and Semantic Spaces", 399 | author = "Bao, Yu and 400 | Zhou, Hao and 401 | Huang, Shujian and 402 | Li, Lei and 403 | Mou, Lili and 404 | Vechtomova, Olga and 405 | Dai, Xin-yu and 406 | Chen, Jiajun", 407 | booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics (ACL)", 408 | month = jul, 409 | year = "2019", 410 | author+an = {1=student; 2=highlight} 411 | } 412 | @inproceedings{wei-etal-2019-imitation, 413 | title = 
"Imitation Learning for Non-Autoregressive Neural Machine Translation", 414 | author = "Wei, Bingzhen and 415 | Wang, Mingxuan and 416 | Zhou, Hao and 417 | Lin, Junyang and 418 | Sun, Xu", 419 | booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics (ACL)", 420 | month = jul, 421 | year = "2019", 422 | author+an = {3=highlight} 423 | } 424 | @inproceedings{ijcai2019-0730, 425 | title = {Correct-and-Memorize: Learning to Translate from Interactive Revisions}, 426 | author = {Weng, Rongxiang and Zhou, Hao and Huang, Shujian and Li, Lei and Xia, Yifan and Chen, Jiajun}, 427 | booktitle = {Proceedings of the Twenty-Eighth International Joint Conference on 428 | Artificial Intelligence (IJCAI)}, 429 | year = {2019}, 430 | month = {jul}, 431 | author+an = {1=student; 2=highlight} 432 | } 433 | 434 | @InProceedings{sun2019graspsnooker, 435 | author = {Sun, Zhaoyue and Chen, Jiaze and Zhou, Hao and Zhou, Deyu and Li, Lei and Jiang, Mingmin}, 436 | booktitle = {the 28th International Joint Conference on Artificial Intelligence (IJCAI) - System Demonstrations}, 437 | title = {{GraspSnooker}: Automatic {Chinese} Commentary Generation for Snooker Videos}, 438 | year = {2019}, 439 | month = aug, 440 | author+an = {1=student; 3=highlight} 441 | } 442 | @inproceedings{bahuleyan-etal-2019-stochastic, 443 | title = "Stochastic {W}asserstein Autoencoder for Probabilistic Sentence Generation", 444 | author = "Bahuleyan, Hareesh and 445 | Mou, Lili and 446 | Zhou, Hao and 447 | Vechtomova, Olga", 448 | booktitle = "Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics (NAACL)", 449 | month = jun, 450 | year = "2019", 451 | author+an = {3=highlight} 452 | } 453 | @inproceedings{wei2019neural, 454 | title={Why do neural dialog systems generate short and meaningless replies? 
a comparison between dialog and translation}, 455 | author={Wei, Bolin and Lu, Shuai and Mou, Lili and Zhou, Hao and Poupart, Pascal and Li, Ge and Jin, Zhi}, 456 | booktitle={International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, 457 | year={2019}, 458 | author+an = {4=highlight} 459 | } 460 | @inproceedings{miao2019cgmh, 461 | title={Cgmh: Constrained sentence generation by metropolis-hastings sampling}, 462 | author={Miao, Ning and Zhou, Hao and Mou, Lili and Yan, Rui and Li, Lei}, 463 | booktitle={Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)}, 464 | year={2019}, 465 | author+an = {1=student; 2=highlight} 466 | } 467 | @inproceedings{NEURIPS2018_734e6bfc, 468 | author = {Cao, Wei and Wang, Dong and Li, Jian and Zhou, Hao and Li, Lei and Li, Yitan}, 469 | booktitle = {Advances in Neural Information Processing Systems (NIPS)}, 470 | title = {BRITS: Bidirectional Recurrent Imputation for Time Series}, 471 | year = {2018}, 472 | author+an = {1=student; 4=highlight} 473 | } 474 | @inproceedings{shi2018tree, 475 | title={On Tree-Based Neural Sentence Modeling}, 476 | author={Shi, Haoyue and Zhou, Hao and Chen, Jiaze and Li, Lei}, 477 | booktitle={Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP)}, 478 | year={2018}, 479 | author+an = {1=student; 2=highlight} 480 | } 481 | @article{zheng-etal-2018-modeling, 482 | title = "Modeling Past and Future for Neural Machine Translation", 483 | author = "Zheng, Zaixiang and 484 | Zhou, Hao and 485 | Huang, Shujian and 486 | Mou, Lili and 487 | Dai, Xinyu and 488 | Chen, Jiajun and 489 | Tu, Zhaopeng", 490 | journal = "Transactions of the Association for Computational Linguistics (TACL)", 491 | year = "2018", 492 | author+an = {1=student; 2=highlight} 493 | } 494 | @inproceedings{zhou-etal-2017-word, 495 | title = "Word-Context Character Embeddings for {C}hinese Word Segmentation", 496 | author = "Zhou, Hao and 497 | Yu, Zhenting and 498 | Zhang, Yue and 499 | Huang, Shujian and 500 | Dai, Xinyu and 501 | Chen, Jiajun", 502 | booktitle = "Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing (EMNLP)", 503 | month = sep, 504 | year = "2017", 505 | author+an = {1=highlight} 506 | } 507 | @inproceedings{zhou-etal-2017-chunk, 508 | title = "Chunk-Based Bi-Scale Decoder for Neural Machine Translation", 509 | author = "Zhou, Hao and 510 | Tu, Zhaopeng and 511 | Huang, Shujian and 512 | Liu, Xiaohua and 513 | Li, Hang and 514 | Chen, Jiajun", 515 | booktitle = "Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (ACL) - short papers", 516 | month = jul, 517 | year = "2017", 518 | author+an = {1=highlight} 519 | } 520 | @inproceedings{zhou-etal-2016-search, 521 | title = "A Search-Based Dynamic Reranking Model for Dependency Parsing", 522 | author = "Zhou, Hao and 523 | Zhang, Yue and 524 | Huang, Shujian and 525 | Zhou, Junsheng and 526 | Dai, Xin-Yu and 527 | Chen, Jiajun", 528 | booktitle = "Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL)", 529 | month = aug, 530 | year = "2016", 531 | author+an = {1=highlight} 532 | } 533 | @InProceedings{ZHOU16.150, 534 | author = {Hao Zhou and Yue Zhang and Shujian Huang and Xin-Yu Dai and Jiajun Chen}, 535 | title = {Evaluating a Deterministic Shift-Reduce Neural Parser for Constituent Parsing}, 536 | booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC)}, 537 | year = 
{2016}, 538 | month = {may}, 539 | author+an = {1=highlight} 540 | } 541 | @article{zhou15-jair, 542 | title = "A Neural Probabilistic Structured-Prediction Method for Transition-Based Natural Language Processing", 543 | author = "Zhou, Hao and 544 | Zhang, Yue and 545 | Chen, Chuan and 546 | Huang, Shujian and 547 | Dai, Xinyu and 548 | Chen, Jiajun", 549 | journal = "Journal of Artificial Intelligence Research (JAIR)", 550 | year = "2016", 551 | author+an = {1=highlight} 552 | } 553 | 554 | @article{zhou15-talip, 555 | title = "Enhancing Shift-Reduce Constituent Parsing with Action N-Gram Model", 556 | author = "Zhou, Hao and 557 | Huang, Shujian and 558 | Zhou, Junsheng and 559 | Zhang, Yue and 560 | Chen, Huadong and 561 | Dai, Xinyu and 562 | Chen, Chuan and 563 | Chen, Jiajun", 564 | journal = "ACM Transactions on Asian and Low-Resource Language Information Processing (TALLIP)", 565 | year = "2015", 566 | author+an = {1=highlight} 567 | } 568 | @inproceedings{zhou-etal-2015-neural, 569 | title = "A Neural Probabilistic Structured-Prediction Model for Transition-Based Dependency Parsing", 570 | author = "Zhou, Hao and 571 | Zhang, Yue and 572 | Huang, Shujian and 573 | Chen, Jiajun", 574 | booktitle = "Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics (ACL)", 575 | year = "2015", 576 | author+an = {1=highlight} 577 | } 578 | -------------------------------------------------------------------------------- /lookup.py: -------------------------------------------------------------------------------- 1 | __author__ = 'zhouh' 2 | -------------------------------------------------------------------------------- /multi-len-bleu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | datadir=/home/zhouh/Data/nmt/ 4 | 5 | for i in $(seq 2 1 6) 6 | do 7 | echo '====== 0'$i'=======' 8 | python ./BLEUbyLength.py ../BLEU/multi-bleu.perl $datadir/devntest/MT0${i}/MT0${i}.src ./test.result.chunk.$i $datadir/devntest/MT0${i}/reference 9 | done 10 | -------------------------------------------------------------------------------- /output_align.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Computes alignments and chunk boundary/tag predictions for a parallel source/target file with a trained model, and reports boundary and tag precision.
3 | ''' 4 | import argparse 5 | 6 | import numpy 7 | import theano 8 | import cPickle as pkl 9 | from training_data_iterator import TrainingTextIterator 10 | 11 | from nmt import (build_sampler, gen_sample, load_params, 12 | init_params, init_tparams, build_alignment, prepare_training_data) 13 | 14 | 15 | from multiprocessing import Process, Queue 16 | 17 | 18 | def main(model, pklmodel, dictionary, dictionary_target,dictionary_chunk, source_file,target_file, saveto, ck=5, wk=5, k=20, 19 | normalize=False, n_process=5, chr_level=False, jointProb=False, show_boundary=False): 20 | print 'load model model_options' 21 | with open('%s' % pklmodel, 'rb') as f: 22 | options = pkl.load(f) 23 | 24 | print 'load source dictionary and invert' 25 | with open(dictionary, 'rb') as f: 26 | word_dict = pkl.load(f) 27 | word_idict = dict() 28 | for kk, vv in word_dict.iteritems(): 29 | word_idict[vv] = kk 30 | word_idict[0] = '' 31 | word_idict[1] = 'UNK' 32 | 33 | print 'load target dictionary and invert' 34 | with open(dictionary_target, 'rb') as f: 35 | word_dict_trg = pkl.load(f) 36 | word_idict_trg = dict() 37 | for kk, vv in word_dict_trg.iteritems(): 38 | word_idict_trg[vv] = kk 39 | word_idict_trg[0] = '' 40 | word_idict_trg[1] = 'UNK' 41 | 42 | 43 | 44 | # dict for chunk label 45 | worddict_chunk = [None] 46 | worddict_r_chunk = [None] 47 | with open(dictionary_chunk, 'rb') as f: 48 | worddict_chunk = pkl.load(f) 49 | worddict_r_chunk = dict() 50 | for kk, vv in worddict_chunk.iteritems(): 51 | worddict_r_chunk[vv] = kk 52 | 53 | 54 | def _seqs2wordsByChunk(caps, boundary, chunk, dictionary): 55 | capsw = [] 56 | for cc, bb, ch in zip(caps, boundary, chunk): 57 | if cc == 0: 58 | continue 59 | # if w == -10000: 60 | # ww.append('| NOTEND') 61 | # continue 62 | if cc < 0: 63 | # ww.append('|' + str(w)) 64 | continue 65 | 66 | 67 | if bb == 0: 68 | 69 | capsw[-1] = capsw[-1] + "_" + (dictionary[cc]) 70 | 71 | else: 72 | capsw.append(dictionary[cc]) 73 | 74 | 75 | return capsw 76 | 77 | 78 | # output in the chunk format: 79 | # w1, POS, chunk_boundary-chunk_tag 80 | def _seqs2wordsByChunkFormat(caps, boundary, chunk, dictionary, chunk_dic): 81 | capsw = [] 82 | current_tag = '' 83 | 84 | for cc, bb, ch in zip(caps, boundary, chunk): 85 | if cc == 0: 86 | continue 87 | # if w == -10000: 88 | # ww.append('| NOTEND') 89 | # continue 90 | if cc < 0: 91 | # ww.append('|' + str(w)) 92 | continue 93 | 94 | 95 | if bb == 0: 96 | 97 | capsw.append(dictionary[cc] + ' ' + 'I-'+chunk_dic[ch]) 98 | 99 | else: 100 | capsw.append(dictionary[cc] + ' ' + 'B-'+chunk_dic[ch]) 101 | 102 | 103 | return capsw 104 | 105 | 106 | # utility function 107 | def _seqs2words(caps, dictionary): 108 | capsw = [] 109 | ww = [] 110 | for w in caps: 111 | if w == 0: 112 | continue 113 | ww.append(dictionary[w]) 114 | return ww 115 | 116 | 117 | 118 | 119 | # allocate model parameters 120 | params = init_params(options) 121 | 122 | # load model parameters and set theano shared variables 123 | params = load_params(model, params) 124 | tparams = init_tparams(params) 125 | 126 | 127 | f_align = build_alignment(tparams, options) 128 | 129 | 130 | # begin to read by iterators 131 | train = TrainingTextIterator(source_file, target_file, 132 | dictionary, dictionary_target, dictionary_chunk, 133 | n_words_source=30000, n_words_target=30000, 134 | batch_size=1, 135 | max_chunk_len=50, max_word_len=10000) 136 | 137 | 138 | boundary_right = 0.0 139 | tag_right = 0.0 140 | 141 | boundary_total = 0.0 142 | tag_total = 0.0 143 | 144 | for x, 
y_chunk, y_cw in train: 145 | 146 | x, x_mask, y_c, y_cw, chunk_indicator, y_mask = \ 147 | prepare_training_data(x, 148 | y_chunk, 149 | y_cw, 150 | maxlen_chunk=100000, 151 | maxlen_cw=100000, 152 | n_words_src=30000, 153 | n_words=30000) 154 | 155 | 156 | 157 | 158 | align, chunk_tag, chunk_boundary = f_align(x, x_mask, y_c, y_cw, y_mask, chunk_indicator) 159 | 160 | 161 | x = x.reshape((x.shape[0],) ) 162 | y_cw = y_cw.reshape((y_cw.shape[0],) ) 163 | y_c = y_c.reshape((y_c.shape[0],) ) 164 | chunk_indicator = chunk_indicator.reshape((chunk_indicator.shape[0],)) 165 | 166 | 167 | print '\n'.join(_seqs2wordsByChunkFormat(numpy.ndarray.tolist(y_cw), 168 | numpy.ndarray.tolist(chunk_boundary), 169 | numpy.ndarray.tolist(chunk_tag), 170 | word_idict_trg, worddict_r_chunk)) 171 | 172 | for gold_boundary, gold_chunk_tag, predict_boundary, predict_chunk_tag in zip(numpy.ndarray.tolist(chunk_indicator), 173 | numpy.ndarray.tolist(y_c), 174 | numpy.ndarray.tolist(chunk_boundary), 175 | numpy.ndarray.tolist(chunk_tag)): 176 | boundary_total += 1 177 | tag_total += 1 178 | 179 | if gold_boundary == predict_boundary: 180 | boundary_right += 1 181 | 182 | if gold_chunk_tag == predict_chunk_tag: 183 | tag_right += 1 184 | 185 | 186 | # for tag, boundary in zip(numpy.ndarray.tolist(chunk_tag), numpy.ndarray.tolist(chunk_boundary)): 187 | # print 188 | # 189 | # # filter alignment 190 | # filter_align = [] 191 | # for b, align in zip(numpy.ndarray.tolist(chunk_indicator), numpy.ndarray.tolist(align[0])): 192 | # if b == 1.0: 193 | # filter_align.append(align) 194 | # 195 | # 196 | # print 'align =', 197 | # # a = numpy.ndarray.tolist(filter_align) 198 | # a = numpy.array(filter_align) 199 | # a = numpy.transpose(a) 200 | # a = numpy.ndarray.tolist(a) 201 | # 202 | # print a 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | print 'boundary prec: ', boundary_right / boundary_total 211 | print 'tag prec: ', tag_right / tag_total 212 | print 'Done' 213 | 214 | 215 | if __name__ == "__main__": 216 | parser = argparse.ArgumentParser() 217 | parser.add_argument('-ck', type=int, default=3) 218 | parser.add_argument('-wk', type=int, default=5) 219 | parser.add_argument('-k', type=int, default=8) 220 | parser.add_argument('-p', type=int, default=5) 221 | parser.add_argument('-n', action="store_true", default=False) 222 | parser.add_argument('-jointProb', action="store_true", default=False) 223 | parser.add_argument('-c', action="store_true", default=False) 224 | parser.add_argument('-show_boundary', action="store_true", default=False) 225 | parser.add_argument('model', type=str) 226 | parser.add_argument('pklmodel', type=str) 227 | parser.add_argument('dictionary', type=str) 228 | parser.add_argument('dictionary_target', type=str) 229 | parser.add_argument('dictionary_chunk', type=str) 230 | parser.add_argument('source', type=str) 231 | parser.add_argument('target', type=str) 232 | parser.add_argument('saveto', type=str) 233 | 234 | args = parser.parse_args() 235 | 236 | main(args.model, args.pklmodel, args.dictionary, args.dictionary_target,args.dictionary_chunk, args.source,args.target, 237 | args.saveto, ck=args.ck, wk=args.wk, normalize=args.n, n_process=args.p, 238 | chr_level=args.c, jointProb=args.jointProb, show_boundary=args.show_boundary) 239 | -------------------------------------------------------------------------------- /rmmodel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -l nodes=1:ppn=24 3 | #PBS -l walltime=24:00:00 4 | #PBS -N 
session2_default 5 | #PBS -A course 6 | #PBS -q ShortQ 7 | 8 | export THEANO_FLAGS=device=gpu0,floatX=float32 9 | 10 | modeldir=./ 11 | 12 | for i in $(seq 1000 1000 200000) 13 | do 14 | modelfile=$modeldir/model_hal.iter${i}.npz 15 | rm $modelfile 16 | done 17 | 18 | -------------------------------------------------------------------------------- /scp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | scp ./* zhouhao@192.168.88.125:/home/zhouhao/workspace/python/exp-nmt/HMS-chunk-nmt/$1 4 | -------------------------------------------------------------------------------- /scp240.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | scp ./* zhouh@114.212.189.240:/home/zhouh/workspace/python/chunk-nmt/$1 4 | -------------------------------------------------------------------------------- /scp5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | scp ./* zptu@192.168.88.129:/home/zptu/zhouhao/workspace/python/exp-nmt/HMS-chunk-nmt/$1 4 | -------------------------------------------------------------------------------- /test.398000.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -l nodes=1:ppn=24 3 | #PBS -l walltime=24:00:00 4 | #PBS -N session2_default 5 | #PBS -A course 6 | #PBS -q ShortQ 7 | 8 | 9 | export THEANO_FLAGS=device=gpu2,floatX=float32 10 | datadir=/home/zhouh/Data/nmt 11 | modeldir=../16 12 | iter=398000 13 | 14 | echo 'joint prob beam = 20' >> 3to6.log 15 | #cd $PBS_O_WORKDIR 16 | for i in $(seq 2 1 6) 17 | do 18 | { 19 | python ./translate_gpu.py -n -jointProb \ 20 | $modeldir/model_hal.iter${iter}.npz \ 21 | $modeldir/model_hal.npz.pkl \ 22 | $datadir/hms.ch.filter.pkl \ 23 | $datadir/hms.en.filter.chunked.pkl \ 24 | $datadir/devntest/MT0${i}/MT0${i}.src \ 25 | ./test.result.chunk.${i} 26 | echo $i >> 3to6.log 27 | perl ../BLEU/multi-bleu.perl /home/zhouh/Data/nmt/devntest/MT0${i}/reference < test.result.chunk.${i} >> 3to6.log 28 | }& 29 | done 30 | 31 | 32 | -------------------------------------------------------------------------------- /test.align.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -l nodes=1:ppn=24 3 | #PBS -l walltime=24:00:00 4 | #PBS -N session2_default 5 | #PBS -A course 6 | #PBS -q ShortQ 7 | 8 | export THEANO_FLAGS=device=gpu3,floatX=float32 9 | #export THEANO_FLAGS=device=gpu0,optimizer=None,floatX=float32,exception_verbosity=high 10 | datadir=/home/zhouh/Data/nmt 11 | 12 | i=3 13 | for i in $(seq 2 1 5) 14 | do 15 | { python ./output_align.py \ 16 | ./model_hal.iter398000.npz \ 17 | ./model_hal.npz.pkl \ 18 | $datadir/hms.ch.filter.pkl \ 19 | $datadir/hms.en.filter.chunked.pkl \ 20 | $datadir/hms.en.filter.chunked.chunktag.pkl \ 21 | $datadir/devntest/MT0${i}/MT0${i}.src\ 22 | $datadir/devntest/MT0${i}/reference0.tag.chunked.chunked\ 23 | ./align.output >> boundary.log.$i 24 | 25 | }& 26 | done 27 | -------------------------------------------------------------------------------- /test.batch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -l nodes=1:ppn=24 3 | #PBS -l walltime=24:00:00 4 | #PBS -N session2_default 5 | #PBS -A course 6 | #PBS -q ShortQ 7 | 8 | 9 | export THEANO_FLAGS=device=gpu1,floatX=float32 10 | 11 | #cd $PBS_O_WORKDIR 12 | 13 | modeldir=./ 14 | gap=`expr $1 \* 1000` 15 | echo $gap 16 | 
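# Evaluation loop: each window covers $gap (= $1 * 1000) checkpoints starting from iteration
# 250000; every job waits for its model_hal.iter*.npz file to appear, translates MT02 with
# translate_gpu.py in the background, and appends the iteration number and its multi-bleu.perl
# score to test.log. `wait` holds the next window until the current batch of jobs finishes.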
17 | mkdir outputs 18 | rm test.log 19 | 20 | for i in $(seq 250000 $gap 500000) 21 | do 22 | for j in $(seq 1000 1000 $gap) 23 | do 24 | { iter=`expr $i + $j` 25 | 26 | modelfile=$modeldir/model_hal.iter${iter}.npz 27 | while [ ! -f $modelfile ];do 28 | sleep 1m; 29 | done; 30 | 31 | outputfile=./outputs/MT02.trans${iter}.en 32 | 33 | python ./translate_gpu.py -n $modelfile $modeldir/model_hal.npz.pkl /home/zhouh/Data/nmt/hms.ch.filter.pkl /home/zhouh/Data/nmt/hms.en.filter.chunked.pkl /home/zhouh/Data/nmt/devntest/MT02/MT02.src $outputfile 34 | echo ${iter} >> test.log 35 | perl ../BLEU/multi-bleu.perl ~/Data/nmt/devntest/MT02/reference < $outputfile >>test.log 36 | }& 37 | 38 | 39 | done 40 | wait 41 | done 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /test.scratch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -l nodes=1:ppn=24 3 | #PBS -l walltime=24:00:00 4 | #PBS -N session2_default 5 | #PBS -A course 6 | #PBS -q ShortQ 7 | 8 | export THEANO_FLAGS=device=gpu0,floatX=float32 9 | 10 | modeldir=./ 11 | output=outputs 12 | 13 | #cd $PBS_O_WORKDIR 14 | #python ./translate_gpu.py -n -p 4 -ck 8 -wk 3 $modeldir/model_hal.npz $modeldir/model_hal.npz.pkl /home/zhouh/Data/nmt/corpus.ch.pkl /home/zhouh/Data/nmt/corpus.en.pkl /home/zhouh/Data/nmt/devntest/MT02/MT02.src ./outputs/$output 15 | if [ ! -d $output ]; then 16 | mkdir $output 17 | fi 18 | 19 | 20 | for i in $(seq 20000 10000 200000) 21 | do 22 | modelfile=$modeldir/model_hal.iter${i}.npz 23 | while [ ! -f $modelfile ];do 24 | sleep 1m; 25 | done; 26 | sleep 1m 27 | python ./translate_gpu.py -n -p 4 -ck $1 -wk $2 $modeldir/model_hal.iter${i}.npz $modeldir/model_hal.npz.pkl /home/zhouh/Data/nmt/corpus.ch.filter.pkl /home/zhouh/Data/nmt/corpus.en.filter.pkl /home/zhouh/Data/nmt/devntest/MT02/MT02.src ./outputs/MT02.trans${i}.scratch.en.$1.$2 28 | done 29 | 30 | for i in $(seq 201000 1000 500000) 31 | do 32 | modelfile=$modeldir/model_hal.iter${i}.npz 33 | while [ !
-f $modelfile ];do 34 | sleep 1m; 35 | done; 36 | sleep 1m 37 | python ./translate_gpu.py -n -p 4 -ck $1 -wk $2 $modeldir/model_hal.iter${i}.npz $modeldir/model_hal.npz.pkl /home/zhouh/Data/nmt/corpus.ch.filter.pkl /home/zhouh/Data/nmt/corpus.en.filter.pkl /home/zhouh/Data/nmt/devntest/MT02/MT02.src ./outputs/MT02.trans${i}.scratch.en.$1.$2 38 | done 39 | 40 | -------------------------------------------------------------------------------- /test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -l nodes=1:ppn=24 3 | #PBS -l walltime=24:00:00 4 | #PBS -N session2_default 5 | #PBS -A course 6 | #PBS -q ShortQ 7 | 8 | 9 | export THEANO_FLAGS=device=gpu2,floatX=float32 10 | datadir=/home/zhouh/Data/nmt 11 | 12 | iter=398000 13 | 14 | #cd $PBS_O_WORKDIR 15 | for i in $(seq 2 1 7) 16 | do 17 | python ./translate_gpu.py -n \ 18 | ./model_hal.npz \ 19 | ./model_hal.npz.pkl \ 20 | $datadir/hms.ch.filter.pkl \ 21 | $datadir/hms.en.filter.chunked.pkl \ 22 | $datadir/devntest/MT0${i}/MT0${i}.src \ 23 | ./test.result.chunk.${i} 24 | echo $i >> 3to6.log 25 | perl ../BLEU/multi-bleu.perl /home/zhouh/Data/nmt/devntest/MT0${i}/reference < test.result.chunk.${i} >> 3to6.log 26 | done 27 | 28 | 29 | -------------------------------------------------------------------------------- /test_zh2en.pc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -l nodes=1:ppn=24 3 | #PBS -l walltime=24:00:00 4 | #PBS -N session2/models/memory-set_default 5 | #PBS -A course 6 | #PBS -q ShortQ 7 | 8 | export THEANO_FLAGS=device=cpu,floatX=float32 9 | 10 | #cd $PBS_O_WORKDIR 11 | python ./translate_gpu.py -n -p 3 \ 12 | ./model_hal.npz \ 13 | ./model_hal.npz.pkl \ 14 | ././../../nmtdata/small.ch.pkl \ 15 | ././../../nmtdata/small.en.chunked.chunktag.pkl \ 16 | ././../../nmtdata/small.test \ 17 | ./small.result 18 | -------------------------------------------------------------------------------- /test_zh2en.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -l nodes=1:ppn=24 3 | #PBS -l walltime=24:00:00 4 | #PBS -N session2/models/memory-set_default 5 | #PBS -A course 6 | #PBS -q ShortQ 7 | 8 | export THEANO_FLAGS=device=cpu,floatX=float32 9 | 10 | #cd $PBS_O_WORKDIR 11 | python ./translate.py -n -p 8 \ 12 | ./models/model_hal.npz \ 13 | $HOME/Data/nmt/corpus.ch.pkl \ 14 | $HOME/Data/nmt/corpus.en.pkl \ 15 | $HOME/Data/nmt/devntest/MT02/MT02.src\ 16 | ./result/MT02.trans.en 17 | 18 | python ./translate.py -n -p 8 \ 19 | ./models/model_hal.npz \ 20 | $HOME/Data/nmt/corpus.ch.pkl \ 21 | $HOME/Data/nmt/corpus.en.pkl \ 22 | $HOME/Data/nmt/devntest/MT03/MT03.src\ 23 | ./result/MT03.trans.en 24 | 25 | python ./translate.py -n -p 8 \ 26 | ./models/model_hal.npz \ 27 | $HOME/Data/nmt/corpus.ch.pkl \ 28 | $HOME/Data/nmt/corpus.en.pkl \ 29 | $HOME/Data/nmt/devntest/MT04/MT04.src\ 30 | ./result/MT04.trans.en 31 | 32 | python ./translate.py -n -p 8 \ 33 | ./models/model_hal.npz \ 34 | $HOME/Data/nmt/corpus.ch.pkl \ 35 | $HOME/Data/nmt/corpus.en.pkl \ 36 | $HOME/Data/nmt/devntest/MT05/MT05.src\ 37 | ./result/MT05.trans.en 38 | 39 | python ./translate.py -n -p 8 \ 40 | ./models/model_hal.npz \ 41 | $HOME/Data/nmt/corpus.ch.pkl \ 42 | $HOME/Data/nmt/corpus.en.pkl \ 43 | $HOME/Data/nmt/devntest/MT06/MT06.src\ 44 | ./result/MT06.trans.en 45 | -------------------------------------------------------------------------------- /train.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -l nodes=1:ppn=20 3 | #PBS -l walltime=168:00:00 4 | #PBS -N session2_default 5 | #PBS -A course 6 | #PBS -q GpuQ 7 | 8 | export THEANO_FLAGS=device=gpu,floatX=float32 9 | 10 | cd $PBS_O_WORKDIR 11 | python ./train_nmt.py 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /train_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -l nodes=1:ppn=20 3 | #PBS -l walltime=168:00:00 4 | #PBS -N session2_default 5 | #PBS -A course 6 | #PBS -q GpuQ 7 | 8 | export THEANO_FLAGS=device=gpu,floatX=float32 9 | 10 | cd $PBS_O_WORKDIR 11 | python ./train_nmt_all.py 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /train_nmt.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import os 3 | 4 | import numpy 5 | import os 6 | 7 | from nmt import train 8 | 9 | def main(job_id, params): 10 | print params 11 | validerr = train(saveto=params['model'][0], 12 | reload_=params['reload'][0], 13 | dim_word=params['dim_word'][0], 14 | dim=params['dim'][0], 15 | n_words=params['n-words'][0], 16 | n_words_src=params['n-words'][0], 17 | decay_c=params['decay-c'][0], 18 | clip_c=params['clip-c'][0], 19 | lrate=params['learning-rate'][0], 20 | optimizer=params['optimizer'][0], 21 | patience=1000, 22 | maxlen=50, 23 | batch_size=32, 24 | valid_batch_size=32, 25 | validFreq=100, 26 | dispFreq=10, 27 | saveFreq=100, 28 | sampleFreq=100, 29 | datasets=['../data/hal/train/tok/en', 30 | '../data/hal/train/tok/fr'], 31 | valid_datasets=['../data/hal/dev/tok/en', 32 | '../data/hal/dev/tok/fr'], 33 | dictionaries=['../data/hal/train/tok/en.pkl', 34 | '../data/hal/train/tok/fr.pkl'], 35 | use_dropout=params['use-dropout'][0], 36 | overwrite=False) 37 | return validerr 38 | 39 | if __name__ == '__main__': 40 | main(0, { 41 | 'model': ['model_hal.npz'], 42 | 'dim_word': [512], 43 | 'dim': [1024], 44 | 'n-words': [30000], 45 | 'optimizer': ['adadelta'], 46 | 'decay-c': [0.], 47 | 'clip-c': [1.], 48 | 'use-dropout': [False], 49 | 'learning-rate': [0.0001], 50 | 'reload': [True]}) 51 | -------------------------------------------------------------------------------- /train_nmt_all.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import os 3 | 4 | from nmt import train 5 | 6 | def main(job_id, params): 7 | print params 8 | validerr = train(saveto=params['model'][0], 9 | reload_=params['reload'][0], 10 | dim_word=params['dim_word'][0], 11 | dim=params['dim'][0], 12 | n_words=params['n-words'][0], 13 | n_words_src=params['n-words'][0], 14 | decay_c=params['decay-c'][0], 15 | clip_c=params['clip-c'][0], 16 | lrate=params['learning-rate'][0], 17 | optimizer=params['optimizer'][0], 18 | maxlen=50, 19 | batch_size=32, 20 | valid_batch_size=32, 21 | datasets=['/ichec/home/users/%s/data/all.en.concat.shuf.gz'%os.environ['USER'], 22 | '/ichec/home/users/%s/data/all.fr.concat.shuf.gz'%os.environ['USER']], 23 | valid_datasets=['/ichec/home/users/%s/data/newstest2011.en.tok'%os.environ['USER'], 24 | '/ichec/home/users/%s/data/newstest2011.fr.tok'%os.environ['USER']], 25 | dictionaries=['/ichec/home/users/%s/data/all.en.concat.gz.pkl'%os.environ['USER'], 26 | '/ichec/home/users/%s/data/all.fr.concat.gz.pkl'%os.environ['USER']], 27 | validFreq=5000, 28 | dispFreq=10, 29 | 
saveFreq=5000, 30 | sampleFreq=1000, 31 | use_dropout=params['use-dropout'][0], 32 | overwrite=False) 33 | return validerr 34 | 35 | if __name__ == '__main__': 36 | main(0, { 37 | 'model': ['/ichec/home/users/%s/models/model_session2_all.npz'%os.environ['USER']], 38 | 'dim_word': [500], 39 | 'dim': [1024], 40 | 'n-words': [30000], 41 | 'optimizer': ['adadelta'], 42 | 'decay-c': [0.], 43 | 'clip-c': [1.], 44 | 'use-dropout': [False], 45 | 'learning-rate': [0.0001], 46 | 'reload': [False]}) 47 | 48 | 49 | -------------------------------------------------------------------------------- /train_nmt_zh2en.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import os 3 | 4 | import numpy 5 | import os 6 | 7 | from nmt import train 8 | 9 | def main(job_id, params): 10 | print params 11 | validerr = train(saveto=params['model'][0], 12 | reload_=params['reload'][0], 13 | dim_word=params['dim_word'][0], 14 | dim_chunk=params['dim_chunk'][0], 15 | dim_chunk_hidden=params['dim_chunk_hidden'][0], 16 | dim=params['dim'][0], 17 | n_words=params['n-words'][0], 18 | n_words_src=params['n-words'][0], 19 | decay_c=params['decay-c'][0], 20 | clip_c=params['clip-c'][0], 21 | lrate=params['learning-rate'][0], 22 | optimizer=params['optimizer'][0], 23 | patience=10000, 24 | batch_size=32, 25 | valid_batch_size=32, 26 | validFreq=100, 27 | dispFreq=10, 28 | saveFreq=1000, 29 | sampleFreq=100, 30 | maxlen_chunk_words=50, # maximum length of the description 31 | datasets=['/home/zhouh/Data/nmt/hms.ch.filter', 32 | '/home/zhouh/Data/nmt/hms.en.filter.chunked'], 33 | valid_datasets=['/home/zhouh/Data/nmt/devntest/MT02/MT02.src', 34 | '/home/zhouh/Data/nmt/devntest/MT02/reference0.tag.chunked.chunked'], 35 | dictionaries=['/home/zhouh/Data/nmt/hms.ch.filter.pkl', 36 | '/home/zhouh/Data/nmt/hms.en.filter.chunked.pkl'], 37 | dictionary_chunk='/home/zhouh/Data/nmt/hms.en.filter.chunked.chunktag.pkl', 38 | use_dropout=params['use-dropout'][0], 39 | overwrite=False) 40 | return validerr 41 | 42 | if __name__ == '__main__': 43 | main(0, { 44 | 'model': ['model_hal.npz'], 45 | 'dim_word': [600], 46 | 'dim_chunk': [1000], 47 | 'dim': [1000], 48 | 'dim_chunk_hidden': [1000], 49 | 'n-words': [30000], 50 | 'optimizer': ['adadelta'], 51 | 'decay-c': [0.], 52 | 'clip-c': [1.], 53 | 'use-dropout': [False], 54 | 'learning-rate': [0.0001], 55 | 'reload': [True]}) 56 | -------------------------------------------------------------------------------- /train_nmt_zh2en_pc.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import os 3 | 4 | import numpy 5 | import os 6 | 7 | from nmt import train 8 | 9 | def main(job_id, params): 10 | print params 11 | validerr = train(saveto=params['model'][0], 12 | reload_=params['reload'][0], 13 | dim_word=params['dim_word'][0], 14 | dim_chunk=params['dim_chunk'][0], 15 | dim_chunk_hidden=params['dim_chunk_hidden'][0], 16 | dim=params['dim'][0], 17 | n_words=params['n-words'][0], 18 | n_words_src=params['n-words'][0], 19 | decay_c=params['decay-c'][0], 20 | clip_c=params['clip-c'][0], 21 | lrate=params['learning-rate'][0], 22 | optimizer=params['optimizer'][0], 23 | patience=1000, 24 | batch_size=2, 25 | valid_batch_size=2, 26 | validFreq=3, 27 | dispFreq=10, 28 | saveFreq=10, 29 | sampleFreq=10, 30 | maxlen_chunk=30, # maximum length of the description 31 | maxlen_chunk_words=50, # maximum length of the description 32 | datasets=['/home/zhouh/workspace/python/nmtdata/small.ch', 33 | 
'/home/zhouh/workspace/python/nmtdata/small.en.chunked'], 34 | valid_datasets=['/home/zhouh/workspace/python/nmtdata/small.ch', 35 | '/home/zhouh/workspace/python/nmtdata/small.en.chunked'], 36 | dictionaries=['/home/zhouh/workspace/python/nmtdata/small.ch.pkl', 37 | '/home/zhouh/workspace/python/nmtdata/small.en.chunked.pkl'], 38 | dictionary_chunk='/home/zhouh/workspace/python/nmtdata/small.en.chunked.chunktag.pkl', 39 | use_dropout=params['use-dropout'][0], 40 | overwrite=False) 41 | return validerr 42 | 43 | if __name__ == '__main__': 44 | main(0, { 45 | 'model': ['model_hal.npz'], 46 | 'dim_word': [30], 47 | 'dim_chunk': [50], 48 | 'dim_chunk_hidden' : [60], 49 | 'dim': [40], 50 | 'n-words': [100], 51 | 'optimizer': ['adadelta'], 52 | 'decay-c': [0.], 53 | 'clip-c': [1.], 54 | 'use-dropout': [False], 55 | 'learning-rate': [0.0001], 56 | 'reload': [True]}) 57 | -------------------------------------------------------------------------------- /training_data_iterator.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | import cPickle as pkl 4 | import gzip 5 | import sys 6 | import codecs 7 | 8 | 9 | def fopen(filename, mode='r'): 10 | if filename.endswith('.gz'): 11 | return gzip.open(filename, mode) 12 | return open(filename, mode) 13 | 14 | 15 | unk_idx = 1 16 | sentence_end_idx = 0 17 | 18 | 19 | class TrainingTextIterator: 20 | """Simple Bitext iterator.""" 21 | # 22 | # max_chunk_len, # max size of chunks in a sentence 23 | # max_word_len, # max size of words in a chunk 24 | # 25 | def __init__(self, source, target, 26 | source_dict, target_dict, target_chunk_dict, 27 | batch_size=128, 28 | max_chunk_len=20, # max size of chunks in a sentence 29 | max_word_len=5, # max size of words in a chunk 30 | n_words_source=-1, 31 | n_words_target=-1): 32 | self.source = fopen(source, 'r') 33 | self.target = fopen(target, 'r') 34 | with open(source_dict, 'rb') as f: 35 | self.source_dict = pkl.load(f) 36 | with open(target_dict, 'rb') as f: 37 | self.target_dict = pkl.load(f) 38 | 39 | with open(target_chunk_dict, 'rb') as f: 40 | self.target_chunk_dict = pkl.load(f) 41 | 42 | self.batch_size = batch_size 43 | self.max_chunk_len = max_chunk_len 44 | self.max_word_len = max_word_len 45 | 46 | self.n_words_source = n_words_source 47 | self.n_words_target = n_words_target 48 | 49 | self.source_buffer = [] 50 | self.target_chunk_buffer = [] 51 | self.target_chunk_words_buffer = [] 52 | self.k = batch_size * 50 53 | 54 | self.end_of_data = False 55 | 56 | 57 | 58 | def __iter__(self): 59 | return self 60 | 61 | def reset(self): 62 | self.source.seek(0) 63 | self.target.seek(0) 64 | 65 | def readNextChunkSent(self): 66 | chunk_words = [] 67 | chunk_tag = [] 68 | 69 | while(True): 70 | chunk_line = self.target.readline() 71 | 72 | if(chunk_line == '' and len(chunk_tag) == 0): 73 | return None, None 74 | 75 | # read until meeting empty line 76 | if(len(chunk_line.strip()) == 0): 77 | break 78 | 79 | # the chunk and the is seperated by \t, and words are sperated by space 80 | tokens = chunk_line.strip().split('\t') 81 | 82 | words = tokens[1].strip().split() 83 | ctags = ['NULL'] * len(words) # index of 'NULL' in chunk dictionary is 1 84 | ctags[0] = tokens[0] 85 | chunk_tag.extend( ctags ) 86 | chunk_words.extend( words ) 87 | 88 | assert len(chunk_tag) == len(chunk_words) 89 | 90 | return chunk_tag, chunk_words 91 | 92 | def readBuffer(self): 93 | 94 | # print 'read the buffer' 95 | 96 | # read k items into the buffer 97 | for k_ in 
xrange(self.k): 98 | ss = self.source.readline() 99 | 100 | if ss == "": 101 | break 102 | 103 | # print ss 104 | chunk_tags, chunk_words = self.readNextChunkSent() 105 | if chunk_tags is None and chunk_words is None: 106 | break 107 | 108 | # print chunk_words 109 | 110 | self.source_buffer.append(ss.strip().split()) 111 | self.target_chunk_buffer.append(chunk_tags) 112 | self.target_chunk_words_buffer.append(chunk_words) 113 | 114 | # sort by target buffer 115 | tlen = numpy.array([len(t) for t in self.target_chunk_buffer]) 116 | tidx = tlen.argsort() 117 | 118 | _sbuf = [self.source_buffer[i] for i in tidx] 119 | _tcbuf = [self.target_chunk_buffer[i] for i in tidx] 120 | _tcwbuf = [self.target_chunk_words_buffer[i] for i in tidx] 121 | 122 | self.source_buffer = _sbuf 123 | self.target_chunk_buffer = _tcbuf 124 | self.target_chunk_words_buffer = _tcwbuf 125 | 126 | if len(self.source_buffer) == 0 or len(self.target_chunk_buffer) == 0: 127 | 128 | # print len(self.source_buffer), len(self.target_chunk_buffer) 129 | self.end_of_data = False 130 | self.reset() 131 | raise StopIteration 132 | 133 | 134 | def next(self): 135 | if self.end_of_data: 136 | self.end_of_data = False 137 | self.reset() 138 | raise StopIteration 139 | 140 | source = [] 141 | target_chunk = [] 142 | target_chunk_words = [] 143 | 144 | get_none_items = False 145 | 146 | # fill buffer, if it's empty 147 | assert len(self.source_buffer) == len(self.target_chunk_buffer), 'Buffer size mismatch!' 148 | 149 | if len(self.source_buffer) == 0: 150 | 151 | self.readBuffer() 152 | 153 | 154 | # retrieval index for each string token 155 | try: 156 | 157 | # print 'get next' 158 | 159 | # actual work here 160 | while True: 161 | 162 | if len(self.source_buffer) == 0 and len(source) == 0: 163 | self.readBuffer() 164 | 165 | # read from source file and map to word index 166 | try: 167 | ss = self.source_buffer.pop() 168 | except IndexError: 169 | break 170 | 171 | # print 'source before', ' '.join(ss) 172 | ss = [self.source_dict[w] if w in self.source_dict else 1 173 | for w in ss] 174 | if self.n_words_source > 0: 175 | ss = [w if w < self.n_words_source else 1 for w in ss] 176 | 177 | # read from target file and map to word index 178 | tt = self.target_chunk_buffer.pop() 179 | 180 | # print 'target chunk before', tt 181 | 182 | 183 | tt = [self.target_chunk_dict[w] for w in tt] 184 | 185 | # 186 | # mark all the chunk tag in the dictionary as 0 and 1, 187 | # we only want to predict the boundary 188 | # 189 | # tt = [1 if w == 1 else 0 for w in tt] 190 | 191 | # print 'target chunk after', tt 192 | # tt = [w if w < self.n_words_target else 1 for w in tt] 193 | 194 | # read from target file and map to word index 195 | tcw = self.target_chunk_words_buffer.pop() 196 | 197 | # print 'target before', tcw 198 | tcw = [self.target_dict[w] if w in self.target_dict else 1 for w in tcw] 199 | if self.n_words_target > 0: 200 | tcw = [w if w < self.n_words_target else 1 for w in tcw] 201 | 202 | # print 'target after', tcw 203 | 204 | 205 | # if the source or target chunk or words in target chunk exceed max len, just skip 206 | # if len(ss) > self.max_word_len and len(tt) > self.max_chunk_len: 207 | # continue 208 | if len(ss) > self.max_word_len or len(tt) > self.max_word_len: 209 | 210 | # print 'skip', len(ss), len(tt) 211 | continue 212 | # else: 213 | # print 'not skip', len(ss), len(tt) 214 | 215 | source.append(ss) 216 | target_chunk.append(tt) 217 | target_chunk_words.append(tcw) 218 | 219 | if len(source) >= self.batch_size or \ 
220 | len(target_chunk) >= self.batch_size: 221 | break 222 | 223 | 224 | except IOError: 225 | print 'IOError' 226 | self.end_of_data = True 227 | 228 | if len(source) <= 0 or len(target_chunk) <= 0 or len(target_chunk_words) <= 0: 229 | 230 | # print len(source) ,len(target_chunk) , len(target_chunk_words) 231 | print 'StopIteration' 232 | self.end_of_data = False 233 | self.reset() 234 | raise StopIteration 235 | 236 | return source, target_chunk, target_chunk_words 237 | -------------------------------------------------------------------------------- /translate.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Translates a source file using a translation model. 3 | ''' 4 | import argparse 5 | 6 | import numpy 7 | import theano 8 | import cPickle as pkl 9 | 10 | from nmt import (build_sampler, gen_sample, load_params, 11 | init_params, init_tparams) 12 | 13 | from multiprocessing import Process, Queue 14 | 15 | 16 | def translate_model(queue, rqueue, pid, model, options, ck, wk, normalize): 17 | 18 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 19 | trng = RandomStreams(1234) 20 | use_noise = theano.shared(numpy.float32(0.)) 21 | 22 | # allocate model parameters 23 | params = init_params(options) 24 | 25 | # load model parameters and set theano shared variables 26 | params = load_params(model, params) 27 | tparams = init_tparams(params) 28 | 29 | # word index 30 | f_init, f_next_chunk, f_next_word = build_sampler(tparams, options, trng, use_noise) 31 | 32 | def _translate(seq): 33 | 34 | be_stochastic = False 35 | # sample given an input sequence and obtain scores 36 | sample, score = gen_sample(tparams, f_init, f_next_chunk, f_next_word, 37 | numpy.array(seq).reshape([len(seq), 1]), 38 | options, trng=trng, maxlen_words=5, k_chunk=ck, k_word=wk, 39 | maxlen_chunks=50, 40 | stochastic=be_stochastic, argmax=True) 41 | 42 | if be_stochastic: 43 | return sample 44 | 45 | # normalize scores according to sequence lengths 46 | if normalize: 47 | lengths = numpy.array([len(s) for s in sample]) 48 | score = score / lengths 49 | 50 | # print 'score', score 51 | # print 'candidates', sample 52 | 53 | sidx = numpy.argmin(score) 54 | return sample[sidx] 55 | 56 | while True: 57 | req = queue.get() 58 | if req is None: 59 | break 60 | 61 | idx, x = req[0], req[1] 62 | print pid, '-', idx 63 | seq = _translate(x) 64 | 65 | rqueue.put((idx, seq)) 66 | 67 | return 68 | 69 | 70 | def main(model, pklmodel, dictionary, dictionary_target, source_file, saveto, ck=5, wk=5, 71 | normalize=False, n_process=5, chr_level=False): 72 | 73 | # load model model_options 74 | with open('%s' % pklmodel, 'rb') as f: 75 | options = pkl.load(f) 76 | 77 | # load source dictionary and invert 78 | with open(dictionary, 'rb') as f: 79 | word_dict = pkl.load(f) 80 | word_idict = dict() 81 | for kk, vv in word_dict.iteritems(): 82 | word_idict[vv] = kk 83 | word_idict[0] = '' 84 | word_idict[1] = 'UNK' 85 | 86 | # load target dictionary and invert 87 | with open(dictionary_target, 'rb') as f: 88 | word_dict_trg = pkl.load(f) 89 | word_idict_trg = dict() 90 | for kk, vv in word_dict_trg.iteritems(): 91 | word_idict_trg[vv] = kk 92 | word_idict_trg[0] = '' 93 | word_idict_trg[1] = 'UNK' 94 | 95 | # create input and output queues for processes 96 | queue = Queue() 97 | rqueue = Queue() 98 | processes = [None] * n_process 99 | for midx in xrange(n_process): 100 | processes[midx] = Process( 101 | target=translate_model, 102 | args=(queue, rqueue, midx, model, 
options, ck, wk, normalize)) 103 | processes[midx].start() 104 | 105 | # utility function 106 | def _seqs2words(caps): 107 | capsw = [] 108 | for cc in caps: 109 | ww = [] 110 | for w in cc: 111 | if w == 0: 112 | continue 113 | # if w == -10000: 114 | # ww.append('| NOTEND') 115 | # continue 116 | elif w < 0: 117 | # ww.append('|' + str(w)) 118 | continue 119 | ww.append(word_idict_trg[w]) 120 | capsw.append(' '.join(ww)) 121 | return capsw 122 | 123 | def _send_jobs(fname): 124 | with open(fname, 'r') as f: 125 | for idx, line in enumerate(f): 126 | if chr_level: 127 | words = list(line.decode('utf-8').strip()) 128 | else: 129 | words = line.strip().split() 130 | x = map(lambda w: word_dict[w] if w in word_dict else 1, words) 131 | x = map(lambda ii: ii if ii < options['n_words_src'] else 1, x) 132 | x += [0] 133 | queue.put((idx, x)) 134 | return idx+1 135 | 136 | def _finish_processes(): 137 | for midx in xrange(n_process): 138 | queue.put(None) 139 | 140 | def _retrieve_jobs(n_samples): 141 | trans = [None] * n_samples 142 | for idx in xrange(n_samples): 143 | resp = rqueue.get() 144 | trans[resp[0]] = resp[1] 145 | if numpy.mod(idx, 10) == 0: 146 | print 'Sample ', (idx+1), '/', n_samples, ' Done' 147 | return trans 148 | 149 | print 'Translating ', source_file, '...' 150 | n_samples = _send_jobs(source_file) 151 | trans = _seqs2words(_retrieve_jobs(n_samples)) 152 | _finish_processes() 153 | with open(saveto, 'w') as f: 154 | print >>f, '\n'.join(trans) 155 | print 'Done' 156 | 157 | 158 | if __name__ == "__main__": 159 | parser = argparse.ArgumentParser() 160 | parser.add_argument('-ck', type=int, default=3) 161 | parser.add_argument('-wk', type=int, default=5) 162 | parser.add_argument('-p', type=int, default=5) 163 | parser.add_argument('-n', action="store_true", default=False) 164 | parser.add_argument('-c', action="store_true", default=False) 165 | parser.add_argument('model', type=str) 166 | parser.add_argument('pklmodel', type=str) 167 | parser.add_argument('dictionary', type=str) 168 | parser.add_argument('dictionary_target', type=str) 169 | parser.add_argument('source', type=str) 170 | parser.add_argument('saveto', type=str) 171 | 172 | args = parser.parse_args() 173 | 174 | main(args.model, args.pklmodel, args.dictionary, args.dictionary_target, args.source, 175 | args.saveto, ck=args.ck, wk=args.wk, normalize=args.n, n_process=args.p, 176 | chr_level=args.c) 177 | -------------------------------------------------------------------------------- /translate_gpu.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Translates a source file using a translation model. 
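This GPU variant decodes in the main process (the worker-pool path of translate.py is not used)
and, besides the plain translations, writes '<saveto>chunk', where chunk boundaries and tag
indices are marked.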
3 | ''' 4 | import argparse 5 | 6 | import numpy 7 | import theano 8 | import cPickle as pkl 9 | 10 | from nmt import (build_sampler, gen_sample, load_params, 11 | init_params, init_tparams) 12 | 13 | from multiprocessing import Process, Queue 14 | 15 | 16 | def main(model, pklmodel, dictionary, dictionary_target, source_file, saveto, ck=5, wk=5, k=20, 17 | normalize=False, n_process=5, chr_level=False, jointProb=False, show_boundary=False): 18 | print 'load model model_options' 19 | with open('%s' % pklmodel, 'rb') as f: 20 | options = pkl.load(f) 21 | 22 | print 'load source dictionary and invert' 23 | with open(dictionary, 'rb') as f: 24 | word_dict = pkl.load(f) 25 | word_idict = dict() 26 | for kk, vv in word_dict.iteritems(): 27 | word_idict[vv] = kk 28 | word_idict[0] = '' 29 | word_idict[1] = 'UNK' 30 | 31 | print 'load target dictionary and invert' 32 | with open(dictionary_target, 'rb') as f: 33 | word_dict_trg = pkl.load(f) 34 | word_idict_trg = dict() 35 | for kk, vv in word_dict_trg.iteritems(): 36 | word_idict_trg[vv] = kk 37 | word_idict_trg[0] = '' 38 | word_idict_trg[1] = 'UNK' 39 | 40 | # utility function 41 | def _seqs2words(caps, boundary, chunk): 42 | capsw = [] 43 | for cc, bb, ch in zip(caps, boundary, chunk): 44 | ww = [] 45 | for w, b, c in zip(cc, bb, ch): 46 | if w == 0: 47 | continue 48 | # if w == -10000: 49 | # ww.append('| NOTEND') 50 | # continue 51 | elif w < 0: 52 | # ww.append('|' + str(w)) 53 | continue 54 | 55 | if show_boundary: 56 | if b == 1.0: 57 | ww.append('|') 58 | ww.append(word_idict_trg[w]) 59 | capsw.append(' '.join(ww)) 60 | return capsw 61 | 62 | def _seqs2wordsByChunk(caps, boundary, chunk): 63 | capsw = [] 64 | for cc, bb, ch in zip(caps, boundary, chunk): 65 | ww = [] 66 | for w, b, c in zip(cc, bb, ch): 67 | if w == 0: 68 | continue 69 | # if w == -10000: 70 | # ww.append('| NOTEND') 71 | # continue 72 | elif w < 0: 73 | # ww.append('|' + str(w)) 74 | continue 75 | 76 | if b == 1.0: 77 | ww.append('| ' + str(c)) 78 | ww.append(word_idict_trg[w]) 79 | capsw.append(' '.join(ww)) 80 | return capsw 81 | 82 | def _send_jobs(fname): 83 | retval = [] 84 | with open(fname, 'r') as f: 85 | for idx, line in enumerate(f): 86 | if chr_level: 87 | words = list(line.decode('utf-8').strip()) 88 | else: 89 | words = line.strip().split() 90 | x = map(lambda w: word_dict[w] if w in word_dict else 1, words) 91 | x = map(lambda ii: ii if ii < options['n_words_src'] else 1, x) 92 | x += [0] 93 | retval.append(x) 94 | return retval 95 | 96 | print 'Translating ', source_file, '...' 
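    # What follows: map each source sentence to word indices, rebuild the model from the
    # pickled options, load the trained parameters, compile the f_init / f_next_chunk /
    # f_next_word sampling functions, then decode every sentence with gen_sample
    # (beam sizes k, k_chunk=ck, k_word=wk), optionally length-normalising the scores (-n)
    # before keeping the best-scoring hypothesis.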
97 | 98 | print 'look up table' 99 | n_samples = _send_jobs(source_file) 100 | 101 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 102 | trng = RandomStreams(1234) 103 | use_noise = theano.shared(numpy.float32(0.)) 104 | 105 | # allocate model parameters 106 | params = init_params(options) 107 | 108 | # load model parameters and set theano shared variables 109 | params = load_params(model, params) 110 | tparams = init_tparams(params) 111 | 112 | # word index 113 | f_init, f_next_chunk, f_next_word = build_sampler(tparams, options, trng, use_noise) 114 | 115 | def _translate(seq): 116 | 117 | be_stochastic = False 118 | # sample given an input sequence and obtain scores 119 | sample, boundary, chunk, score = gen_sample(tparams, f_init, f_next_chunk, f_next_word, 120 | numpy.array(seq).reshape([len(seq), 1]), 121 | options, trng=trng, maxlen=200, k_chunk=ck, k_word=wk, k=k, 122 | stochastic=be_stochastic, argmax=True, jointProb=False) 123 | 124 | if be_stochastic: 125 | return sample 126 | 127 | # normalize scores according to sequence lengths 128 | if normalize: 129 | lengths = numpy.array([len(s) for s in sample]) 130 | score = score / lengths 131 | 132 | # print 'score', score 133 | # print 'candidates', sample 134 | 135 | sidx = numpy.argmin(score) 136 | return sample[sidx], boundary[sidx], chunk[sidx] 137 | 138 | 139 | 140 | ys = [] 141 | yb = [] 142 | yc = [] 143 | idx = 0 144 | for x in n_samples: 145 | y, y_boundary, y_chunk = _translate(x) 146 | ys.append(y) 147 | yb.append(y_boundary) 148 | yc.append(y_chunk) 149 | print idx 150 | idx += 1 151 | 152 | 153 | # print ys 154 | # print yb 155 | trans = _seqs2words(ys, yb, yc) 156 | trans_chunk = _seqs2wordsByChunk(ys, yb, yc) 157 | 158 | with open(saveto, 'w') as f: 159 | print >> f, '\n'.join(trans) 160 | with open(saveto+'chunk', 'w') as f: 161 | print >> f, '\n'.join(trans_chunk) 162 | print 'Done' 163 | 164 | 165 | if __name__ == "__main__": 166 | parser = argparse.ArgumentParser() 167 | parser.add_argument('-ck', type=int, default=3) 168 | parser.add_argument('-wk', type=int, default=5) 169 | parser.add_argument('-k', type=int, default=8) 170 | parser.add_argument('-p', type=int, default=5) 171 | parser.add_argument('-n', action="store_true", default=False) 172 | parser.add_argument('-jointProb', action="store_true", default=False) 173 | parser.add_argument('-c', action="store_true", default=False) 174 | parser.add_argument('-show_boundary', action="store_true", default=False) 175 | parser.add_argument('model', type=str) 176 | parser.add_argument('pklmodel', type=str) 177 | parser.add_argument('dictionary', type=str) 178 | parser.add_argument('dictionary_target', type=str) 179 | parser.add_argument('source', type=str) 180 | parser.add_argument('saveto', type=str) 181 | 182 | args = parser.parse_args() 183 | 184 | main(args.model, args.pklmodel, args.dictionary, args.dictionary_target, args.source, 185 | args.saveto, ck=args.ck, wk=args.wk, normalize=args.n, n_process=args.p, 186 | chr_level=args.c, jointProb=args.jointProb, show_boundary=args.show_boundary) 187 | -------------------------------------------------------------------------------- /valid.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -l nodes=1:ppn=24 3 | #PBS -l walltime=24:00:00 4 | #PBS -N session2_default 5 | #PBS -A course 6 | #PBS -q ShortQ 7 | 8 | export THEANO_FLAGS=device=gpu1,floatX=float32 9 | 10 | 11 | 12 | modeldir=. 
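# validate.py (below) waits for each model_hal.iter*.npz checkpoint from iteration 300000
# onwards, logs its validation cost via computeCost.py, and every 50000 iterations decodes
# the 5 lowest-cost checkpoints with translate_gpu.py, scoring them against the references
# with multi-bleu.perl and recording the best BLEU in bleu.log.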

datadir=/home/zhouh/Data/nmt/

modelfile=$modeldir/model_hal
python ./validate.py $modelfile ./model_hal.npz.pkl ./bleu.log ./outputs/test.result ../BLEU/multi-bleu.perl $datadir/hms.ch.filter.pkl $datadir/hms.en.filter.chunked.pkl $datadir/hms.en.filter.chunked.chunktag.pkl $datadir/devntest/MT02/MT02.src $datadir/devntest/MT02/reference0.tag.chunked.chunked $datadir/devntest/MT02/reference
--------------------------------------------------------------------------------
/validate.py:
--------------------------------------------------------------------------------
'''
Validates a training run: waits for each new checkpoint, scores it with
computeCost.py, and every 50k iterations decodes the lowest-cost checkpoints
and evaluates them with multi-bleu.perl.
'''
import argparse

import numpy
import theano
import cPickle as pkl
import heapq
import logging
import time
import subprocess
import re
import os

from nmt import (build_model, pred_probs, load_params,
                 init_params, init_tparams, prepare_training_data)

from training_data_iterator import TrainingTextIterator


def getBLEU():

    return 0


def main(model,
         pklmodel,
         logfile,
         outputfile,
         bleu_scrip,
         valid_datasets=['../data/dev/newstest2011.en.tok',
                         '../data/dev/newstest2011.fr.tok',
                         '../data/dev/newstest2011.fr.tok'],
         dictionaries=[
             '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl',
             '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl'],
         dictionary_chunk='/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl',
         beginModelIter=300000,
         k_best_keep=5):

    logfile = open(logfile, 'w')

    best_bleu = -1
    best_bleu_iter = beginModelIter

    cost_cache = []
    cost_iter = []

    for iter in range(beginModelIter, 500000, 1000):

        model_file_name = model + '.iter' + str(iter) + '.npz'

        # wait until the current model has been written to disk
        while not os.path.isfile(model_file_name):
            time.sleep(300)

        print iter

        cmd_get_cost = ['python',
                        'computeCost.py',
                        model_file_name,
                        pklmodel,
                        dictionaries[0],
                        dictionaries[1],
                        dictionary_chunk,
                        valid_datasets[0],
                        valid_datasets[1],
                        './cost.result']

        subprocess.check_call(" ".join(cmd_get_cost), shell=True)

        fin = open('./cost.result', 'rU')
        out = fin.readline()

        tokens = out.strip().split()

        totalCost = float(tokens[0])
        wordCost = float(tokens[1])

        fin.close()

        print >> logfile, '==========================='
        print >> logfile, 'Iter ' + str(iter) + ', Word Cost ' + str(wordCost), 'Total Cost', totalCost

        cost_cache.append(totalCost)
        cost_iter.append(iter)

        if iter % 50000 == 0 and iter != beginModelIter:

            # keep the k_best_keep checkpoints with the lowest validation cost
            costs = numpy.array(cost_cache)
            toDecode = costs.argsort()[:k_best_keep]

            for d in toDecode:

                d_iter = cost_iter[d]

                decode_model_name = model + '.iter' + str(d_iter) + '.npz'

                print >> logfile, 'To Decode for Iter ' + str(d_iter)

                output_iter = outputfile + str(d_iter)

                val_start_time = time.time()

                cmd_translate = ['python',
                                 'translate_gpu.py',
                                 '-n',
                                 decode_model_name,
                                 pklmodel,
                                 dictionaries[0],
                                 dictionaries[1],
                                 valid_datasets[0],
                                 output_iter]

                subprocess.check_call(" ".join(cmd_translate), shell=True)
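
                # The joined command is run through the shell; with
                # illustrative paths it looks like:
                #   python translate_gpu.py -n model_hal.iter303000.npz \
                #       model_hal.npz.pkl src.dict.pkl trg.dict.pkl MT02.src \
                #       outputs/test.result303000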
print >>logfile, "Decoding took {} minutes".format(float(time.time() - val_start_time) / 60.) 131 | 132 | 133 | 134 | cmd_bleu_cmd = ['perl', bleu_scrip, \ 135 | valid_datasets[2], \ 136 | '<', \ 137 | output_iter, \ 138 | '>' 139 | './output.eva'] 140 | 141 | subprocess.check_call(" ".join(cmd_bleu_cmd), shell=True) 142 | 143 | fin = open('./output.eva', 'rU') 144 | out = re.search('BLEU = [-.0-9]+', fin.readlines()[0]) 145 | fin.close() 146 | 147 | bleu_score = float(out.group()[7:]) 148 | 149 | print >>logfile, 'Iter '+str(d_iter) + 'BLEU: ' + str(bleu_score) 150 | 151 | if bleu_score > best_bleu: 152 | best_bleu = bleu_score 153 | best_bleu_iter = d_iter 154 | 155 | print >>logfile, '## Best BLEU: ' + str(best_bleu) + 'at Iter' + str(best_bleu_iter) 156 | 157 | logfile.flush() 158 | 159 | 160 | 161 | 162 | 163 | 164 | cost_cache = [] 165 | cost_iter = [] 166 | 167 | logfile.close() 168 | 169 | 170 | 171 | if __name__ == "__main__": 172 | parser = argparse.ArgumentParser() 173 | parser.add_argument('model', type=str) 174 | parser.add_argument('pklmodel', type=str) 175 | parser.add_argument('logfile', type=str) 176 | parser.add_argument('outputfile', type=str) 177 | parser.add_argument('bleu_scrip', type=str) 178 | parser.add_argument('dictionary', type=str) 179 | parser.add_argument('dictionary_target', type=str) 180 | parser.add_argument('dictionary_chunk', type=str) 181 | parser.add_argument('valid_source', type=str) 182 | parser.add_argument('valid_target', type=str) 183 | parser.add_argument('valid_reference', type=str) 184 | 185 | args = parser.parse_args() 186 | 187 | main(args.model, 188 | args.pklmodel, 189 | args.logfile, 190 | args.outputfile, 191 | args.bleu_scrip, 192 | valid_datasets=[args.valid_source, args.valid_target, args.valid_reference], 193 | dictionaries=[args.dictionary, args.dictionary_target], 194 | dictionary_chunk=args.dictionary_chunk ) 195 | 196 | --------------------------------------------------------------------------------