├── LICENSE
├── README.md
├── bpe2char
│   ├── char_base.py
│   ├── char_base_multi_b2c.py
│   ├── data_iterator.py
│   ├── many_data_iterator.py
│   ├── mixer.py
│   ├── nmt.py
│   ├── nmt_many.py
│   ├── print_batch.py
│   ├── train_bi_bpe2char.py
│   ├── train_multi_bpe2char.py
│   ├── wmt15_manyen_bpe2char_adam.txt
│   └── wmt_path.py
├── char2char
│   ├── char_base.py
│   ├── conv_tools.py
│   ├── data_iterator.py
│   ├── many_data_iterator.py
│   ├── mixer.py
│   ├── nmt.py
│   ├── nmt_many.py
│   ├── prepare_data.py
│   ├── print_batch.py
│   ├── train_bi_char2char.py
│   ├── train_multi_char2char.py
│   ├── wmt_path.py
│   └── wmt_path_iso9.py
├── preprocess
│   ├── build_dictionary_char.py
│   ├── build_dictionary_word.py
│   ├── clean_tags.py
│   ├── fix_appo.sh
│   ├── iso.py
│   ├── iso9
│   ├── merge.sh
│   ├── multi-bleu.perl
│   ├── nonbreaking_prefixes
│   │   ├── README.txt
│   │   ├── nonbreaking_prefix.ca
│   │   ├── nonbreaking_prefix.cs
│   │   ├── nonbreaking_prefix.de
│   │   ├── nonbreaking_prefix.el
│   │   ├── nonbreaking_prefix.en
│   │   ├── nonbreaking_prefix.es
│   │   ├── nonbreaking_prefix.fi
│   │   ├── nonbreaking_prefix.fr
│   │   ├── nonbreaking_prefix.hu
│   │   ├── nonbreaking_prefix.is
│   │   ├── nonbreaking_prefix.it
│   │   ├── nonbreaking_prefix.lv
│   │   ├── nonbreaking_prefix.nl
│   │   ├── nonbreaking_prefix.pl
│   │   ├── nonbreaking_prefix.pt
│   │   ├── nonbreaking_prefix.ro
│   │   ├── nonbreaking_prefix.ru
│   │   ├── nonbreaking_prefix.sk
│   │   ├── nonbreaking_prefix.sl
│   │   ├── nonbreaking_prefix.sv
│   │   └── nonbreaking_prefix.ta
│   ├── normalize-punctuation.perl
│   ├── preprocess.sh
│   ├── tokenizer.perl
│   └── tokenizer_apos.perl
└── translate
    ├── translate_bpe2char.py
    └── translate_char2char.py

/LICENSE:
--------------------------------------------------------------------------------
Copyright (c) 2016, Jason Lee and New York University (Kyunghyun Cho)
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of dl4mt-c2c nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Fully Character-Level Neural Machine Translation
==================================

Theano implementation of the models described in the paper [Fully Character-Level Neural Machine Translation without Explicit Segmentation](https://arxiv.org/abs/1610.03017 "Fully Character-Level Neural Machine Translation without Explicit Segmentation").

We present code for training and decoding four different models:

1. bilingual bpe2char (from [Chung et al., 2016](https://arxiv.org/abs/1603.06147))
2. bilingual char2char
3. multilingual bpe2char
4. multilingual char2char

Dependencies
------------------
### Python
* Theano
* Numpy
* NLTK

### GPU
* CUDA (we recommend the latest version; version 8.0 was used in all our experiments)

### Related code
* For preprocessing and evaluation, we used scripts from [MOSES](https://github.com/moses-smt/mosesdecoder "MOSES").
* This code is based on [Subword-NMT](http://arxiv.org/abs/1508.07909 "Subword-NMT") and [dl4mt-cdec](https://github.com/nyu-dl/dl4mt-cdec "dl4mt-cdec").

Downloading Datasets & Pre-trained Models
------------------
The original WMT'15 corpora can be downloaded from [here](http://www.statmt.org/wmt15/translation-task.html). The preprocessed corpora used in our experiments are available below.
* WMT'15 preprocessed corpora
  * [Standard version (for bilingual models, 3.5GB)](https://drive.google.com/open?id=0BxmEQ91VZAPQam5pc2ltQ1BBTTQ)
  * [Cyrillic converted to Latin (for multilingual models, 2.6GB)](https://drive.google.com/open?id=0BxmEQ91VZAPQS0oxTDJINng5b1k)

The pre-trained top-performing models are available below.
* [Pre-trained models (6.0GB)](https://drive.google.com/open?id=0BxmEQ91VZAPQcGx4VGI2N3dMNEE): **Tarball updated** on Nov 21st 2016. The CS-EN bi-char2char model in the previous tarball was not the best-performing model.

Training Details
------------------
### Using GPUs
Do the following before executing `train*.py`:
```bash
$ export THEANO_FLAGS=device=gpu,floatX=float32
```
If your GPU has enough memory, enabling `cnmem` may speed up training:
```bash
$ export THEANO_FLAGS=device=gpu,floatX=float32,lib.cnmem=0.95,allow_gc=False
```

On a pre-2016 Titan X GPU with 12GB of RAM, our bpe2char models were trained with `cnmem`, while our char2char models (both bilingual and multilingual) were trained without it (due to lack of RAM).

### Training models
Before executing the following, modify `train*.py` so that it points to the directory containing the WMT'15 corpora.

#### Bilingual bpe2char
```bash
$ python bpe2char/train_bi_bpe2char.py -translate <LANGUAGE_PAIR>
```
where `<LANGUAGE_PAIR>` is one of `de_en`, `cs_en`, `fi_en` or `ru_en`.
#### Bilingual char2char
```bash
$ python char2char/train_bi_char2char.py -translate <LANGUAGE_PAIR>
```
#### Multilingual bpe2char
```bash
$ python bpe2char/train_multi_bpe2char.py
```
#### Multilingual char2char
```bash
$ python char2char/train_multi_char2char.py
```
#### Checkpoint
To resume training a model from a checkpoint, simply append `-re_load` and `-re_load_old_setting` to the commands above. Make sure the checkpoint resides in the correct directory (`.../dl4mt-c2c/models`).
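For example, a minimal sketch of resuming the bilingual DE-EN bpe2char run from its latest checkpoint, assuming the save and data paths inside `train_bi_bpe2char.py` have already been edited for your machine:
```bash
$ python bpe2char/train_bi_bpe2char.py -translate de_en -re_load -re_load_old_setting
```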
### Using Custom Datasets
To train models on your own dataset (rather than the WMT'15 corpora), you first need to build your vocabulary with `preprocess/build_dictionary_char.py` (for char2char models) or `preprocess/build_dictionary_word.py` (for bpe2char models). For the bpe2char model, you additionally need to learn your BPE segmentation rules on the source corpus using the Subword-NMT repository (see below).

Decoding
------------------

### Decoding WMT'15 validation / test files
Before executing the following, modify `translate*.py` so that it points to the directory containing the WMT'15 corpora.

```bash
$ export THEANO_FLAGS=device=gpu,floatX=float32,lib.cnmem=0.95,allow_gc=False
$ python translate/translate_bpe2char.py -model <PATH_TO_MODEL> -translate <LANGUAGE_PAIR> -saveto <OUTPUT_FILE> -which <VALID/TEST_SET> # for bpe2char models
$ python translate/translate_char2char.py -model <PATH_TO_MODEL> -translate <LANGUAGE_PAIR> -saveto <OUTPUT_FILE> -which <VALID/TEST_SET> # for char2char models
```

When choosing which pre-trained model to pass to `-model`, make sure to pick one with `.grads` in its name, e.g. `.grads.123000.npz`. The `.grads` models are the best-performing checkpoints, and you should decode from those.

### Decoding an arbitrary file
Remove `-which <VALID/TEST_SET>` and append `-source <PATH_TO_SOURCE_FILE>`.

If you decode your own source file, make sure it is:

1. properly tokenized (using `preprocess/preprocess.sh`);
2. BPE-tokenized, for bpe2char models;
3. converted from Cyrillic to Latin, for multilingual models.

### Decoding multilingual models
Append `-many` (and, of course, provide a path to a multilingual model for `-model`).

Evaluation
------------------
We use the `multi-bleu.perl` script from MOSES to compute BLEU scores. The reference translations can be found in `.../wmt15`.
```bash
perl preprocess/multi-bleu.perl reference.txt < model_output.txt
```

Extra
-----------------
### Extracting & applying BPE rules

Clone the Subword-NMT repository:
```bash
git clone https://github.com/rsennrich/subword-nmt
```

Use the following commands (more details in [Subword-NMT](https://github.com/rsennrich/subword-nmt)):
```bash
./learn_bpe.py -s {num_operations} < {train_file} > {codes_file}
./apply_bpe.py -c {codes_file} < {test_file}
```

### Converting Cyrillic to Latin

```bash
$ python preprocess/iso.py russian_source.txt
```
This will produce an output file at `russian_source.txt.iso9`.
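Putting the pieces above together, here is a hypothetical end-to-end sketch for decoding your own Russian source file with a bilingual bpe2char model. All file names, the BPE codes file and the checkpoint name are placeholders; the flags are the ones described above, and the preprocessed corpora use 50,000 BPE merge operations (the `.bpe.50000` suffix).
```bash
# learn BPE codes on the tokenized training source (50,000 merge operations)
$ ./subword-nmt/learn_bpe.py -s 50000 < train.ru.tok > ru.codes.50000
# apply the codes to the tokenized file you want to translate
$ ./subword-nmt/apply_bpe.py -c ru.codes.50000 < my_input.ru.tok > my_input.ru.tok.bpe
# decode with a pre-trained bilingual bpe2char checkpoint (note the .grads file)
$ python translate/translate_bpe2char.py -model models/ru_en/bi-bpe2char.grads.123000.npz \
    -translate ru_en -saveto my_output.en -source my_input.ru.tok.bpe
```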
130 | 131 | Citation 132 | ------------------ 133 | 134 | ``` 135 | @article{Lee:16, 136 | author = {Jason Lee and Kyunghyun Cho and Thomas Hofmann}, 137 | title = {Fully Character-Level Neural Machine Translation without Explicit Segmentation}, 138 | year = {2016}, 139 | journal = {arXiv preprint arXiv:1610.03017}, 140 | } 141 | ``` 142 | -------------------------------------------------------------------------------- /bpe2char/char_base.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Build a simple neural language model using GRU units 3 | ''' 4 | import theano 5 | from theano import tensor 6 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 7 | 8 | import cPickle 9 | import numpy 10 | import copy 11 | 12 | import os 13 | import warnings 14 | import sys 15 | import time 16 | 17 | from collections import OrderedDict 18 | from mixer import * 19 | 20 | 21 | def init_params(options): 22 | params = OrderedDict() 23 | 24 | print "source dictionary size: %d" % options['n_words_src'] 25 | # embedding 26 | params['Wemb'] = norm_weight(options['n_words_src'], options['dim_word_src']) 27 | params['Wemb_dec'] = norm_weight(options['n_words'], options['dim_word']) 28 | 29 | # encoder 30 | params = get_layer('gru')[0](options, params, 31 | prefix='encoder', 32 | nin=options['dim_word_src'], 33 | dim=options['enc_dim']) 34 | params = get_layer('gru')[0](options, params, 35 | prefix='encoderr', 36 | nin=options['dim_word_src'], 37 | dim=options['enc_dim']) 38 | ctxdim = 2 * options['enc_dim'] 39 | 40 | # init_state of decoder 41 | params = get_layer('ff')[0](options, params, 42 | prefix='ff_init_state_char', 43 | nin=ctxdim, 44 | nout=options['dec_dim']) 45 | params = get_layer('ff')[0](options, params, 46 | prefix='ff_init_state_word', 47 | nin=ctxdim, 48 | nout=options['dec_dim']) 49 | 50 | print "target dictionary size: %d" % options['n_words'] 51 | # decoder 52 | params = get_layer('two_layer_gru_decoder')[0](options, params, 53 | prefix='decoder', 54 | nin=options['dim_word'], 55 | dim_char=options['dec_dim'], 56 | dim_word=options['dec_dim'], 57 | dimctx=ctxdim) 58 | 59 | # readout 60 | params = get_layer('fff')[0](options, params, prefix='ff_logit_rnn', 61 | nin1=options['dec_dim'], nin2=options['dec_dim'], 62 | nout=options['dim_word'], ortho=False) 63 | params = get_layer('ff')[0](options, params, prefix='ff_logit_prev', 64 | nin=options['dim_word'], 65 | nout=options['dim_word'], 66 | ortho=False) 67 | params = get_layer('ff')[0](options, params, prefix='ff_logit_ctx', 68 | nin=ctxdim, 69 | nout=options['dim_word'], 70 | ortho=False) 71 | params = get_layer('ff')[0](options, params, prefix='ff_logit', 72 | nin=options['dim_word'], 73 | nout=options['n_words']) 74 | 75 | return params 76 | 77 | 78 | def build_model(tparams, options): 79 | opt_ret = OrderedDict() 80 | 81 | trng = RandomStreams(numpy.random.RandomState(numpy.random.randint(1024)).randint(numpy.iinfo(numpy.int32).max)) 82 | use_noise = theano.shared(numpy.float32(0.)) 83 | 84 | # description string: #words x #samples 85 | x = tensor.matrix('x', dtype='int64') 86 | x_mask = tensor.matrix('x_mask', dtype='float32') 87 | y = tensor.matrix('y', dtype='int64') 88 | y_mask = tensor.matrix('y_mask', dtype='float32') 89 | x.tag.test_value = numpy.zeros((5, 63), dtype='int64') 90 | x_mask.tag.test_value = numpy.ones((5, 63), dtype='float32') 91 | y.tag.test_value = numpy.zeros((7, 63), dtype='int64') 92 | y_mask.tag.test_value = numpy.ones((7, 63), dtype='float32') 93 | 
94 | xr = x[::-1] 95 | xr_mask = x_mask[::-1] 96 | 97 | n_samples = x.shape[1] 98 | n_timesteps = x.shape[0] 99 | n_timesteps_trg = y.shape[0] 100 | 101 | # word embedding for forward RNN (source) 102 | emb = tparams['Wemb'][x.flatten()] 103 | emb = emb.reshape([n_timesteps, n_samples, options['dim_word_src']]) 104 | 105 | # word embedding for backward RNN (source) 106 | embr = tparams['Wemb'][xr.flatten()] 107 | embr = embr.reshape([n_timesteps, n_samples, options['dim_word_src']]) 108 | 109 | # pass through gru layer, recurrence here 110 | proj = get_layer('gru')[1](tparams, emb, options, 111 | prefix='encoder', mask=x_mask) 112 | projr = get_layer('gru')[1](tparams, embr, options, 113 | prefix='encoderr', mask=xr_mask) 114 | 115 | # context 116 | ctx = concatenate([proj, projr[::-1]], axis=proj.ndim-1) 117 | 118 | # context mean 119 | ctx_mean = (ctx * x_mask[:, :, None]).sum(0) / x_mask.sum(0)[:, None] 120 | 121 | # initial decoder state 122 | init_state_char = get_layer('ff')[1](tparams, ctx_mean, options, 123 | prefix='ff_init_state_char', activ='tanh') 124 | init_state_word = get_layer('ff')[1](tparams, ctx_mean, options, 125 | prefix='ff_init_state_word', activ='tanh') 126 | 127 | # word embedding and shifting for targets 128 | yemb = tparams['Wemb_dec'][y.flatten()] 129 | yemb = yemb.reshape([n_timesteps_trg, n_samples, options['dim_word']]) 130 | yemb_shited = tensor.zeros_like(yemb) 131 | yemb_shited = tensor.set_subtensor(yemb_shited[1:], yemb[:-1]) 132 | yemb = yemb_shited 133 | 134 | char_h, word_h, ctxs, alphas = \ 135 | get_layer('two_layer_gru_decoder')[1](tparams, yemb, options, 136 | prefix='decoder', 137 | mask=y_mask, 138 | context=ctx, 139 | context_mask=x_mask, 140 | one_step=False, 141 | init_state_char=init_state_char, 142 | init_state_word=init_state_word) 143 | 144 | opt_ret['dec_alphas'] = alphas 145 | 146 | # compute word probabilities 147 | logit_rnn = get_layer('fff')[1](tparams, char_h, word_h, options, 148 | prefix='ff_logit_rnn', activ='linear') 149 | logit_prev = get_layer('ff')[1](tparams, yemb, options, 150 | prefix='ff_logit_prev', activ='linear') 151 | logit_ctx = get_layer('ff')[1](tparams, ctxs, options, 152 | prefix='ff_logit_ctx', activ='linear') 153 | logit = tensor.tanh(logit_rnn + logit_prev + logit_ctx) 154 | 155 | if options['use_dropout']: 156 | print 'Using dropout' 157 | logit = dropout_layer(logit, use_noise, trng) 158 | 159 | logit = get_layer('ff')[1](tparams, logit, options, 160 | prefix='ff_logit', activ='linear') 161 | logit_shp = logit.shape 162 | probs = tensor.nnet.softmax(logit.reshape([logit_shp[0]*logit_shp[1], logit_shp[2]])) 163 | 164 | # cost 165 | y_flat = y.flatten() 166 | y_flat_idx = tensor.arange(y_flat.shape[0]) * options['n_words'] + y_flat 167 | cost = -tensor.log(probs.flatten()[y_flat_idx]) 168 | cost = cost.reshape([y.shape[0], y.shape[1]]) 169 | cost = (cost * y_mask).sum(0) 170 | 171 | return trng, use_noise, x, x_mask, y, y_mask, opt_ret, cost 172 | 173 | 174 | def build_sampler(tparams, options, trng, use_noise): 175 | x = tensor.matrix('x', dtype='int64') 176 | xr = x[::-1] 177 | 178 | n_timesteps = x.shape[0] 179 | n_samples = x.shape[1] 180 | 181 | emb = tparams['Wemb'][x.flatten()] 182 | emb = emb.reshape([n_timesteps, n_samples, options['dim_word_src']]) 183 | embr = tparams['Wemb'][xr.flatten()] 184 | embr = embr.reshape([n_timesteps, n_samples, options['dim_word_src']]) 185 | 186 | proj = get_layer('gru')[1](tparams, emb, options, prefix='encoder') 187 | projr = get_layer('gru')[1](tparams, embr, 
options, prefix='encoderr') 188 | 189 | ctx = concatenate([proj, projr[::-1]], axis=proj.ndim-1) 190 | ctx_mean = ctx.mean(0) 191 | 192 | init_state_char = get_layer('ff')[1](tparams, ctx_mean, options, 193 | prefix='ff_init_state_char', activ='tanh') 194 | init_state_word = get_layer('ff')[1](tparams, ctx_mean, options, 195 | prefix='ff_init_state_word', activ='tanh') 196 | 197 | print 'Building f_init...', 198 | outs = [init_state_char, init_state_word, ctx] 199 | f_init = theano.function([x], outs, name='f_init', profile=profile) 200 | print 'Done' 201 | 202 | y = tensor.vector('y_sampler', dtype='int64') 203 | init_state_char = tensor.matrix('init_state_char', dtype='float32') 204 | init_state_word = tensor.matrix('init_state_word', dtype='float32') 205 | 206 | # if it's the first word, emb should be all zero and it is indicated by -1 207 | yemb = tensor.switch(y[:, None] < 0, 208 | tensor.alloc(0., 1, tparams['Wemb_dec'].shape[1]), 209 | tparams['Wemb_dec'][y]) 210 | 211 | next_state_char, next_state_word, next_ctx, next_alpha = \ 212 | get_layer('two_layer_gru_decoder')[1](tparams, yemb, options, 213 | prefix='decoder', 214 | context=ctx, 215 | mask=None, 216 | one_step=True, 217 | init_state_char=init_state_char, 218 | init_state_word=init_state_word) 219 | 220 | logit_rnn = get_layer('fff')[1](tparams, 221 | next_state_char, 222 | next_state_word, 223 | options, 224 | prefix='ff_logit_rnn', 225 | activ='linear') 226 | logit_prev = get_layer('ff')[1](tparams, 227 | yemb, 228 | options, 229 | prefix='ff_logit_prev', 230 | activ='linear') 231 | logit_ctx = get_layer('ff')[1](tparams, 232 | next_ctx, 233 | options, 234 | prefix='ff_logit_ctx', 235 | activ='linear') 236 | logit = tensor.tanh(logit_rnn + logit_prev + logit_ctx) 237 | 238 | if options['use_dropout']: 239 | print 'Sampling for dropoutted model' 240 | logit = dropout_layer(logit, use_noise, trng) 241 | 242 | logit = get_layer('ff')[1](tparams, logit, options, 243 | prefix='ff_logit', 244 | activ='linear') 245 | next_probs = tensor.nnet.softmax(logit) 246 | next_sample = trng.multinomial(pvals=next_probs).argmax(1) 247 | 248 | # next word probability 249 | print 'Building f_next...', 250 | inps = [y, ctx, init_state_char, init_state_word] 251 | outs = [next_probs, next_sample, next_state_char, next_state_word] 252 | f_next = theano.function(inps, outs, name='f_next', profile=profile) 253 | print 'Done' 254 | 255 | return f_init, f_next 256 | 257 | 258 | def gen_sample(tparams, f_init, f_next, x, options, trng=None, 259 | k=1, maxlen=500, stochastic=True, argmax=False): 260 | 261 | # k is the beam size we have 262 | if k > 1: 263 | assert not stochastic, \ 264 | 'Beam search does not support stochastic sampling' 265 | 266 | sample = [] 267 | sample_score = [] 268 | if stochastic: 269 | sample_score = 0 270 | 271 | live_k = 1 272 | dead_k = 0 273 | 274 | hyp_samples = [[]] * live_k 275 | hyp_scores = numpy.zeros(live_k).astype('float32') 276 | hyp_states = [] 277 | 278 | # get initial state of decoder rnn and encoder context 279 | ret = f_init(x) 280 | next_state_char, next_state_word, ctx0 = ret[0], ret[1], ret[2] 281 | next_w = -1 * numpy.ones((1,)).astype('int64') # bos indicator 282 | 283 | for ii in xrange(maxlen): 284 | ctx = numpy.tile(ctx0, [live_k, 1]) 285 | inps = [next_w, ctx, next_state_char, next_state_word] 286 | ret = f_next(*inps) 287 | next_p, next_w, next_state_char, next_state_word = ret[0], ret[1], ret[2], ret[3] 288 | if stochastic: 289 | if argmax: 290 | nw = next_p[0].argmax() 291 | else: 292 | nw = 
next_w[0] 293 | sample.append(nw) 294 | sample_score += next_p[0, nw] 295 | if nw == 0: 296 | break 297 | else: 298 | cand_scores = hyp_scores[:, None] - numpy.log(next_p) 299 | cand_flat = cand_scores.flatten() 300 | ranks_flat = cand_flat.argsort()[:(k-dead_k)] 301 | 302 | voc_size = next_p.shape[1] 303 | trans_indices = ranks_flat / voc_size 304 | word_indices = ranks_flat % voc_size 305 | costs = cand_flat[ranks_flat] 306 | 307 | new_hyp_samples = [] 308 | new_hyp_scores = numpy.zeros(k-dead_k).astype('float32') 309 | new_hyp_states_char = [] 310 | new_hyp_states_word = [] 311 | 312 | for idx, [ti, wi] in enumerate(zip(trans_indices, word_indices)): 313 | new_hyp_samples.append(hyp_samples[ti]+[wi]) 314 | new_hyp_scores[idx] = copy.copy(costs[idx]) 315 | new_hyp_states_char.append(copy.copy(next_state_char[ti])) 316 | new_hyp_states_word.append(copy.copy(next_state_word[ti])) 317 | 318 | # check the finished samples 319 | new_live_k = 0 320 | hyp_samples = [] 321 | hyp_scores = [] 322 | hyp_states_char = [] 323 | hyp_states_word = [] 324 | 325 | for idx in xrange(len(new_hyp_samples)): 326 | if new_hyp_samples[idx][-1] == 0: 327 | sample.append(new_hyp_samples[idx]) 328 | sample_score.append(new_hyp_scores[idx]) 329 | dead_k += 1 330 | else: 331 | new_live_k += 1 332 | hyp_samples.append(new_hyp_samples[idx]) 333 | hyp_scores.append(new_hyp_scores[idx]) 334 | hyp_states_char.append(new_hyp_states_char[idx]) 335 | hyp_states_word.append(new_hyp_states_word[idx]) 336 | hyp_scores = numpy.array(hyp_scores) 337 | live_k = new_live_k 338 | 339 | if new_live_k < 1: 340 | break 341 | if dead_k >= k: 342 | break 343 | 344 | next_w = numpy.array([w[-1] for w in hyp_samples]) 345 | next_state_char = numpy.array(hyp_states_char) 346 | next_state_word = numpy.array(hyp_states_word) 347 | 348 | if not stochastic: 349 | # dump every remaining one 350 | if live_k > 0: 351 | for idx in xrange(live_k): 352 | sample.append(hyp_samples[idx]) 353 | sample_score.append(hyp_scores[idx]) 354 | 355 | return sample, sample_score 356 | -------------------------------------------------------------------------------- /bpe2char/char_base_multi_b2c.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Build a simple neural language model using GRU units 3 | ''' 4 | import theano 5 | from theano import tensor 6 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 7 | 8 | import cPickle 9 | import numpy 10 | import copy 11 | 12 | import os 13 | import warnings 14 | import sys 15 | import time 16 | 17 | from collections import OrderedDict 18 | from mixer import * 19 | 20 | def init_params(options): 21 | params = OrderedDict() 22 | 23 | print "source dictionary size: %d" % options['n_words_src'] 24 | # embedding 25 | params['Wemb'] = norm_weight(options['n_words_src'], options['dim_word_src']) 26 | params['Wemb_dec'] = norm_weight(options['n_words'], options['dim_word']) 27 | 28 | # encoder 29 | params = get_layer('gru')[0](options, params, 30 | prefix='encoder', 31 | nin=options['dim_word_src'], 32 | dim=options['enc_dim']) 33 | params = get_layer('gru')[0](options, params, 34 | prefix='encoderr', 35 | nin=options['dim_word_src'], 36 | dim=options['enc_dim']) 37 | ctxdim = 2 * options['enc_dim'] 38 | 39 | # init_state of decoder 40 | params = get_layer('ff')[0](options, params, 41 | prefix='ff_init_state_char', 42 | nin=ctxdim, 43 | nout=options['dec_dim']) 44 | params = get_layer('ff')[0](options, params, 45 | prefix='ff_init_state_word', 46 | nin=ctxdim, 
47 | nout=options['dec_dim']) 48 | 49 | print "target dictionary size: %d" % options['n_words'] 50 | # decoder 51 | params = get_layer('two_layer_gru_decoder')[0](options, params, 52 | prefix='decoder', 53 | nin=options['dim_word'], 54 | dim_char=options['dec_dim'], 55 | dim_word=options['dec_dim'], 56 | dimctx=ctxdim) 57 | 58 | # readout 59 | params = get_layer('fff')[0](options, params, prefix='ff_logit_rnn', 60 | nin1=options['dec_dim'], nin2=options['dec_dim'], 61 | nout=options['dim_word'], ortho=False) 62 | params = get_layer('ff')[0](options, params, prefix='ff_logit_prev', 63 | nin=options['dim_word'], 64 | nout=options['dim_word'], 65 | ortho=False) 66 | params = get_layer('ff')[0](options, params, prefix='ff_logit_ctx', 67 | nin=ctxdim, 68 | nout=options['dim_word'], 69 | ortho=False) 70 | params = get_layer('ff')[0](options, params, prefix='ff_logit', 71 | nin=options['dim_word'], 72 | nout=options['n_words']) 73 | 74 | return params 75 | 76 | 77 | def build_model(tparams, options): 78 | opt_ret = OrderedDict() 79 | 80 | trng = RandomStreams(numpy.random.RandomState(numpy.random.randint(1024)).randint(numpy.iinfo(numpy.int32).max)) 81 | use_noise = theano.shared(numpy.float32(0.)) 82 | 83 | # description string: #words x #samples 84 | x = tensor.matrix('x', dtype='int64') 85 | x_mask = tensor.matrix('x_mask', dtype='float32') 86 | y = tensor.matrix('y', dtype='int64') 87 | y_mask = tensor.matrix('y_mask', dtype='float32') 88 | x.tag.test_value = numpy.zeros((5, 63), dtype='int64') 89 | x_mask.tag.test_value = numpy.ones((5, 63), dtype='float32') 90 | y.tag.test_value = numpy.zeros((7, 63), dtype='int64') 91 | y_mask.tag.test_value = numpy.ones((7, 63), dtype='float32') 92 | 93 | xr = x[::-1] 94 | xr_mask = x_mask[::-1] 95 | 96 | n_samples = x.shape[1] 97 | n_timesteps = x.shape[0] 98 | n_timesteps_trg = y.shape[0] 99 | 100 | # word embedding for forward RNN (source) 101 | emb = tparams['Wemb'][x.flatten()] 102 | emb = emb.reshape([n_timesteps, n_samples, options['dim_word_src']]) 103 | 104 | # word embedding for backward RNN (source) 105 | embr = tparams['Wemb'][xr.flatten()] 106 | embr = embr.reshape([n_timesteps, n_samples, options['dim_word_src']]) 107 | 108 | # pass through gru layer, recurrence here 109 | proj = get_layer('gru')[1](tparams, emb, options, 110 | prefix='encoder', mask=x_mask) 111 | projr = get_layer('gru')[1](tparams, embr, options, 112 | prefix='encoderr', mask=xr_mask) 113 | 114 | # context 115 | ctx = concatenate([proj, projr[::-1]], axis=proj.ndim-1) 116 | 117 | # context mean 118 | ctx_mean = (ctx * x_mask[:, :, None]).sum(0) / x_mask.sum(0)[:, None] 119 | 120 | # initial decoder state 121 | init_state_char = get_layer('ff')[1](tparams, ctx_mean, options, 122 | prefix='ff_init_state_char', activ='tanh') 123 | init_state_word = get_layer('ff')[1](tparams, ctx_mean, options, 124 | prefix='ff_init_state_word', activ='tanh') 125 | 126 | # word embedding and shifting for targets 127 | yemb = tparams['Wemb_dec'][y.flatten()] 128 | yemb = yemb.reshape([n_timesteps_trg, n_samples, options['dim_word']]) 129 | yemb_shited = tensor.zeros_like(yemb) 130 | yemb_shited = tensor.set_subtensor(yemb_shited[1:], yemb[:-1]) 131 | yemb = yemb_shited 132 | 133 | char_h, word_h, ctxs, alphas = \ 134 | get_layer('two_layer_gru_decoder')[1](tparams, yemb, options, 135 | prefix='decoder', 136 | mask=y_mask, 137 | context=ctx, 138 | context_mask=x_mask, 139 | one_step=False, 140 | init_state_char=init_state_char, 141 | init_state_word=init_state_word) 142 | 143 | 
opt_ret['dec_alphas'] = alphas 144 | 145 | # compute word probabilities 146 | logit_rnn = get_layer('fff')[1](tparams, char_h, word_h, options, 147 | prefix='ff_logit_rnn', activ='linear') 148 | logit_prev = get_layer('ff')[1](tparams, yemb, options, 149 | prefix='ff_logit_prev', activ='linear') 150 | logit_ctx = get_layer('ff')[1](tparams, ctxs, options, 151 | prefix='ff_logit_ctx', activ='linear') 152 | logit = tensor.tanh(logit_rnn + logit_prev + logit_ctx) 153 | 154 | if options['use_dropout']: 155 | print 'Using dropout' 156 | logit = dropout_layer(logit, use_noise, trng) 157 | 158 | logit = get_layer('ff')[1](tparams, logit, options, 159 | prefix='ff_logit', activ='linear') 160 | logit_shp = logit.shape 161 | probs = tensor.nnet.softmax(logit.reshape([logit_shp[0]*logit_shp[1], logit_shp[2]])) 162 | 163 | # cost 164 | y_flat = y.flatten() 165 | y_flat_idx = tensor.arange(y_flat.shape[0]) * options['n_words'] + y_flat 166 | cost = -tensor.log(probs.flatten()[y_flat_idx]) 167 | cost = cost.reshape([y.shape[0], y.shape[1]]) 168 | cost = (cost * y_mask).sum(0) 169 | 170 | return trng, use_noise, x, x_mask, y, y_mask, opt_ret, cost 171 | 172 | def build_sampler(tparams, options, trng, use_noise): 173 | x = tensor.matrix('x', dtype='int64') 174 | xr = x[::-1] 175 | 176 | n_timesteps = x.shape[0] 177 | n_samples = x.shape[1] 178 | 179 | emb = tparams['Wemb'][x.flatten()] 180 | emb = emb.reshape([n_timesteps, n_samples, options['dim_word_src']]) 181 | embr = tparams['Wemb'][xr.flatten()] 182 | embr = embr.reshape([n_timesteps, n_samples, options['dim_word_src']]) 183 | 184 | proj = get_layer('gru')[1](tparams, emb, options, prefix='encoder') 185 | projr = get_layer('gru')[1](tparams, embr, options, prefix='encoderr') 186 | 187 | ctx = concatenate([proj, projr[::-1]], axis=proj.ndim-1) 188 | ctx_mean = ctx.mean(0) 189 | 190 | init_state_char = get_layer('ff')[1](tparams, ctx_mean, options, 191 | prefix='ff_init_state_char', activ='tanh') 192 | init_state_word = get_layer('ff')[1](tparams, ctx_mean, options, 193 | prefix='ff_init_state_word', activ='tanh') 194 | 195 | print 'Building f_init...', 196 | outs = [init_state_char, init_state_word, ctx] 197 | f_init = theano.function([x], outs, name='f_init', profile=profile) 198 | print 'Done' 199 | 200 | y = tensor.vector('y_sampler', dtype='int64') 201 | init_state_char = tensor.matrix('init_state_char', dtype='float32') 202 | init_state_word = tensor.matrix('init_state_word', dtype='float32') 203 | 204 | # if it's the first word, emb should be all zero and it is indicated by -1 205 | yemb = tensor.switch(y[:, None] < 0, 206 | tensor.alloc(0., 1, tparams['Wemb_dec'].shape[1]), 207 | tparams['Wemb_dec'][y]) 208 | 209 | next_state_char, next_state_word, next_ctx, next_alpha = \ 210 | get_layer('two_layer_gru_decoder')[1](tparams, yemb, options, 211 | prefix='decoder', 212 | context=ctx, 213 | mask=None, 214 | one_step=True, 215 | init_state_char=init_state_char, 216 | init_state_word=init_state_word) 217 | 218 | logit_rnn = get_layer('fff')[1](tparams, 219 | next_state_char, 220 | next_state_word, 221 | options, 222 | prefix='ff_logit_rnn', 223 | activ='linear') 224 | logit_prev = get_layer('ff')[1](tparams, 225 | yemb, 226 | options, 227 | prefix='ff_logit_prev', 228 | activ='linear') 229 | logit_ctx = get_layer('ff')[1](tparams, 230 | next_ctx, 231 | options, 232 | prefix='ff_logit_ctx', 233 | activ='linear') 234 | logit = tensor.tanh(logit_rnn + logit_prev + logit_ctx) 235 | 236 | if options['use_dropout']: 237 | print 'Sampling for dropoutted 
model' 238 | logit = dropout_layer(logit, use_noise, trng) 239 | 240 | logit = get_layer('ff')[1](tparams, logit, options, 241 | prefix='ff_logit', 242 | activ='linear') 243 | next_probs = tensor.nnet.softmax(logit) 244 | next_sample = trng.multinomial(pvals=next_probs).argmax(1) 245 | 246 | # next word probability 247 | print 'Building f_next...', 248 | inps = [y, ctx, init_state_char, init_state_word] 249 | outs = [next_probs, next_sample, next_state_char, next_state_word] 250 | f_next = theano.function(inps, outs, name='f_next', profile=profile) 251 | print 'Done' 252 | 253 | return f_init, f_next 254 | 255 | 256 | def gen_sample(tparams, f_init, f_next, x, options, trng=None, 257 | k=1, maxlen=500, stochastic=True, argmax=False): 258 | 259 | # k is the beam size we have 260 | if k > 1: 261 | assert not stochastic, \ 262 | 'Beam search does not support stochastic sampling' 263 | 264 | sample = [] 265 | sample_score = [] 266 | if stochastic: 267 | sample_score = 0 268 | 269 | live_k = 1 270 | dead_k = 0 271 | 272 | hyp_samples = [[]] * live_k 273 | hyp_scores = numpy.zeros(live_k).astype('float32') 274 | hyp_states = [] 275 | 276 | # get initial state of decoder rnn and encoder context 277 | ret = f_init(x) 278 | next_state_char, next_state_word, ctx0 = ret[0], ret[1], ret[2] 279 | next_w = -1 * numpy.ones((1,)).astype('int64') # bos indicator 280 | 281 | for ii in xrange(maxlen): 282 | ctx = numpy.tile(ctx0, [live_k, 1]) 283 | inps = [next_w, ctx, next_state_char, next_state_word] 284 | ret = f_next(*inps) 285 | next_p, next_w, next_state_char, next_state_word = ret[0], ret[1], ret[2], ret[3] 286 | if stochastic: 287 | if argmax: 288 | nw = next_p[0].argmax() 289 | else: 290 | nw = next_w[0] 291 | sample.append(nw) 292 | sample_score += next_p[0, nw] 293 | if nw == 0: 294 | break 295 | else: 296 | cand_scores = hyp_scores[:, None] - numpy.log(next_p) 297 | cand_flat = cand_scores.flatten() 298 | ranks_flat = cand_flat.argsort()[:(k-dead_k)] 299 | 300 | voc_size = next_p.shape[1] 301 | trans_indices = ranks_flat / voc_size 302 | word_indices = ranks_flat % voc_size 303 | costs = cand_flat[ranks_flat] 304 | 305 | new_hyp_samples = [] 306 | new_hyp_scores = numpy.zeros(k-dead_k).astype('float32') 307 | new_hyp_states_char = [] 308 | new_hyp_states_word = [] 309 | 310 | for idx, [ti, wi] in enumerate(zip(trans_indices, word_indices)): 311 | new_hyp_samples.append(hyp_samples[ti]+[wi]) 312 | new_hyp_scores[idx] = copy.copy(costs[idx]) 313 | new_hyp_states_char.append(copy.copy(next_state_char[ti])) 314 | new_hyp_states_word.append(copy.copy(next_state_word[ti])) 315 | 316 | # check the finished samples 317 | new_live_k = 0 318 | hyp_samples = [] 319 | hyp_scores = [] 320 | hyp_states_char = [] 321 | hyp_states_word = [] 322 | 323 | for idx in xrange(len(new_hyp_samples)): 324 | if new_hyp_samples[idx][-1] == 0: 325 | sample.append(new_hyp_samples[idx]) 326 | sample_score.append(new_hyp_scores[idx]) 327 | dead_k += 1 328 | else: 329 | new_live_k += 1 330 | hyp_samples.append(new_hyp_samples[idx]) 331 | hyp_scores.append(new_hyp_scores[idx]) 332 | hyp_states_char.append(new_hyp_states_char[idx]) 333 | hyp_states_word.append(new_hyp_states_word[idx]) 334 | hyp_scores = numpy.array(hyp_scores) 335 | live_k = new_live_k 336 | 337 | if new_live_k < 1: 338 | break 339 | if dead_k >= k: 340 | break 341 | 342 | next_w = numpy.array([w[-1] for w in hyp_samples]) 343 | next_state_char = numpy.array(hyp_states_char) 344 | next_state_word = numpy.array(hyp_states_word) 345 | 346 | if not stochastic: 347 
| # dump every remaining one 348 | if live_k > 0: 349 | for idx in xrange(live_k): 350 | sample.append(hyp_samples[idx]) 351 | sample_score.append(hyp_scores[idx]) 352 | 353 | return sample, sample_score 354 | -------------------------------------------------------------------------------- /bpe2char/data_iterator.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import numpy 3 | import os 4 | import random 5 | 6 | import cPickle 7 | import gzip 8 | import codecs 9 | 10 | from tempfile import mkstemp 11 | 12 | 13 | def fopen(filename, mode='r'): 14 | if filename.endswith('.gz'): 15 | return gzip.open(filename, mode) 16 | return open(filename, mode) 17 | 18 | 19 | class TextIterator: 20 | """Simple Bitext iterator.""" 21 | def __init__(self, 22 | source, source_dict, 23 | target=None, target_dict=None, 24 | source_word_level=0, 25 | target_word_level=0, 26 | batch_size=128, 27 | job_id=0, 28 | sort_size=20, 29 | n_words_source=-1, 30 | n_words_target=-1, 31 | shuffle_per_epoch=False): 32 | self.source_file = source 33 | self.target_file = target 34 | self.source = fopen(source, 'r') 35 | with open(source_dict, 'rb') as f: 36 | self.source_dict = cPickle.load(f) 37 | if target is not None: 38 | self.target = fopen(target, 'r') 39 | if target_dict is not None: 40 | with open(target_dict, 'rb') as f: 41 | self.target_dict = cPickle.load(f) 42 | else: 43 | self.target = None 44 | 45 | self.source_word_level = source_word_level 46 | self.target_word_level = target_word_level 47 | self.batch_size = batch_size 48 | 49 | self.n_words_source = n_words_source 50 | self.n_words_target = n_words_target 51 | self.shuffle_per_epoch = shuffle_per_epoch 52 | 53 | self.source_buffer = [] 54 | self.target_buffer = [] 55 | self.k = batch_size * sort_size 56 | 57 | self.end_of_data = False 58 | self.job_id = job_id 59 | 60 | def __iter__(self): 61 | return self 62 | 63 | def reset(self): 64 | if self.shuffle_per_epoch: 65 | # close current files 66 | self.source.close() 67 | if self.target is None: 68 | self.shuffle([self.source_file]) 69 | self.source = fopen(self.source_file + '.reshuf_%d' % self.job_id, 'r') 70 | else: 71 | self.target.close() 72 | # shuffle *original* source files, 73 | self.shuffle([self.source_file, self.target_file]) 74 | # open newly 're-shuffled' file as input 75 | self.source = fopen(self.source_file + '.reshuf_%d' % self.job_id, 'r') 76 | self.target = fopen(self.target_file + '.reshuf_%d' % self.job_id, 'r') 77 | else: 78 | self.source.seek(0) 79 | if self.target is not None: 80 | self.target.seek(0) 81 | 82 | @staticmethod 83 | def shuffle(files): 84 | tf_os, tpath = mkstemp() 85 | tf = open(tpath, 'w') 86 | fds = [open(ff) for ff in files] 87 | for l in fds[0]: 88 | lines = [l.strip()] + [ff.readline().strip() for ff in fds[1:]] 89 | print >>tf, "|||".join(lines) 90 | [ff.close() for ff in fds] 91 | tf.close() 92 | tf = open(tpath, 'r') 93 | lines = tf.readlines() 94 | random.shuffle(lines) 95 | fds = [open(ff+'.reshuf','w') for ff in files] 96 | for l in lines: 97 | s = l.strip().split('|||') 98 | for ii, fd in enumerate(fds): 99 | print >>fd, s[ii] 100 | [ff.close() for ff in fds] 101 | os.remove(tpath) 102 | return 103 | 104 | def next(self): 105 | if self.end_of_data: 106 | self.end_of_data = False 107 | self.reset() 108 | raise StopIteration 109 | 110 | source = [] 111 | target = [] 112 | 113 | # fill buffer, if it's empty 114 | if self.target is not None: 115 | assert len(self.source_buffer) == len(self.target_buffer), 
'Buffer size mismatch!' 116 | 117 | if len(self.source_buffer) == 0: 118 | for k_ in xrange(self.k): 119 | ss = self.source.readline() 120 | 121 | if ss == "": 122 | break 123 | 124 | if self.source_word_level: 125 | ss = ss.strip().split() 126 | else: 127 | ss = ss.strip() 128 | ss = list(ss.decode('utf8')) 129 | 130 | self.source_buffer.append(ss) 131 | 132 | if self.target is not None: 133 | tt = self.target.readline() 134 | 135 | if tt == "": 136 | break 137 | 138 | if self.target_word_level: 139 | tt = tt.strip().split() 140 | else: 141 | tt = tt.strip() 142 | tt = list(tt.decode('utf8')) 143 | 144 | self.target_buffer.append(tt) 145 | 146 | if self.target is not None: 147 | # sort by target buffer 148 | tlen = numpy.array([len(t) for t in self.target_buffer]) 149 | tidx = tlen.argsort() 150 | _sbuf = [self.source_buffer[i] for i in tidx] 151 | _tbuf = [self.target_buffer[i] for i in tidx] 152 | self.target_buffer = _tbuf 153 | else: 154 | slen = numpy.array([len(s) for s in self.source_buffer]) 155 | sidx = slen.argsort() 156 | _sbuf = [self.source_buffer[i] for i in sidx] 157 | 158 | self.source_buffer = _sbuf 159 | 160 | if self.target is not None: 161 | if len(self.source_buffer) == 0 or len(self.target_buffer) == 0: 162 | self.end_of_data = False 163 | self.reset() 164 | raise StopIteration 165 | elif len(self.source_buffer) == 0: 166 | self.end_of_data = False 167 | self.reset() 168 | raise StopIteration 169 | 170 | try: 171 | # actual work here 172 | while True: 173 | # read from source file and map to word index 174 | try: 175 | ss_ = self.source_buffer.pop() 176 | except IndexError: 177 | break 178 | ss = [self.source_dict[w] if w in self.source_dict else 1 for w in ss_] 179 | if self.n_words_source > 0: 180 | ss = [w if w < self.n_words_source else 1 for w in ss] 181 | source.append(ss) 182 | if self.target is not None: 183 | # read from target file and map to word index 184 | tt_ = self.target_buffer.pop() 185 | tt = [self.target_dict[w] if w in self.target_dict else 1 for w in tt_] 186 | if self.n_words_target > 0: 187 | tt = [w if w < self.n_words_target else 1 for w in tt] 188 | target.append(tt) 189 | 190 | if len(source) >= self.batch_size: 191 | break 192 | except IOError: 193 | self.end_of_data = True 194 | 195 | if self.target is not None: 196 | if len(source) <= 0 or len(target) <= 0: 197 | self.end_of_data = False 198 | self.reset() 199 | raise StopIteration 200 | return source, target 201 | else: 202 | if len(source) <= 0: 203 | self.end_of_data = False 204 | self.reset() 205 | raise StopIteration 206 | return source 207 | -------------------------------------------------------------------------------- /bpe2char/many_data_iterator.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import numpy 3 | import os 4 | import random 5 | 6 | import cPickle 7 | import gzip 8 | import codecs 9 | 10 | from tempfile import mkstemp 11 | 12 | random.seed(1029381209) 13 | 14 | def fopen(filename, mode='r'): 15 | if filename.endswith('.gz'): 16 | return gzip.open(filename, mode) 17 | return open(filename, mode) 18 | 19 | class MultiTextIterator: 20 | """Simple Bitext iterator.""" 21 | def __init__(self, 22 | source, source_dict, 23 | target=None, target_dict=None, 24 | source_word_level=0, 25 | target_word_level=0, 26 | batch_size=[128,1,2,3], 27 | job_id=0, 28 | sort_size=20, 29 | n_words_source=302, 30 | n_words_target=302, 31 | shuffle_per_epoch=False): 32 | 33 | self.source_files = source 34 | self.target_files = target 35 
| 36 | self.sources = [fopen(s, 'r') for s in source] 37 | with open(source_dict, 'rb') as f: 38 | self.source_dict = cPickle.load(f) 39 | # one source dictionary 40 | 41 | self.targets = [fopen(t, 'r') for t in target] 42 | with open(target_dict, 'rb') as f: 43 | self.target_dict = cPickle.load(f) 44 | # one target dictionary 45 | 46 | self.source_word_level = source_word_level 47 | self.target_word_level = target_word_level 48 | self.batch_sizes = batch_size 49 | # list 50 | 51 | self.n_words_source = n_words_source 52 | self.n_words_target = n_words_target 53 | self.shuffle_per_epoch = shuffle_per_epoch 54 | 55 | self.source_buffers = [[],[],[],[]] 56 | self.target_buffers = [[],[],[],[]] 57 | self.k = [bs * sort_size for bs in batch_size] 58 | # at once, fetch 20 items 59 | # we're good for 20 updates 60 | 61 | self.end_of_data = False 62 | self.job_id = job_id 63 | 64 | def __iter__(self): 65 | return self 66 | 67 | def reset(self): 68 | if self.shuffle_per_epoch: 69 | raise Exception("hi") 70 | # close current files 71 | for s in self.sources: 72 | s.close() 73 | 74 | if self.targets is None: 75 | self.shuffle([self.source_file]) 76 | self.source = fopen(self.source_file + '.reshuf_%d' % self.job_id, 'r') 77 | else: 78 | for t in self.targets: 79 | t.close() 80 | 81 | # shuffle *original* source files, 82 | self.shuffle([self.source_file, self.target_file]) 83 | # open newly 're-shuffled' file as input 84 | self.source = fopen(self.source_file + '.reshuf_%d' % self.job_id, 'r') 85 | self.target = fopen(self.target_file + '.reshuf_%d' % self.job_id, 'r') 86 | else: 87 | for idx in xrange(4): 88 | self.sources[idx].seek(0) 89 | self.targets[idx].seek(0) 90 | 91 | @staticmethod 92 | def shuffle(files): 93 | tf_os, tpath = mkstemp() 94 | tf = open(tpath, 'w') 95 | fds = [open(ff) for ff in files] 96 | for l in fds[0]: 97 | lines = [l.strip()] + [ff.readline().strip() for ff in fds[1:]] 98 | print >>tf, "|||".join(lines) 99 | [ff.close() for ff in fds] 100 | tf.close() 101 | tf = open(tpath, 'r') 102 | lines = tf.readlines() 103 | random.shuffle(lines) 104 | fds = [open(ff+'.reshuf','w') for ff in files] 105 | for l in lines: 106 | s = l.strip().split('|||') 107 | for ii, fd in enumerate(fds): 108 | print >>fd, s[ii] 109 | [ff.close() for ff in fds] 110 | os.remove(tpath) 111 | return 112 | 113 | def next(self): 114 | # if end_of_data reaches, stop for loop 115 | if self.end_of_data: 116 | self.end_of_data = False 117 | self.reset() 118 | raise StopIteration 119 | 120 | sources = [[],[],[],[]] 121 | targets = [[],[],[],[]] 122 | # NOTE : this is the data to be used for "this" round of updates 123 | 124 | # fill buffer, if it's empty 125 | for idx in xrange(4): 126 | assert len(self.source_buffers[idx]) == len(self.target_buffers[idx]), 'Buffer size mismatch!' 127 | 128 | for idx in xrange(4): 129 | # NOTE : in buffer: don't put the whole dataset in... only for 'k' many updates 130 | # after 'k' updates, self.source_buffers[idx] will be empty, in which case we will put new things in 131 | 132 | #if len(self.source_buffers[idx]) == 0: 133 | if len(self.source_buffers[idx]) < self.batch_sizes[idx]: 134 | # NOTE : change this to : if less than one out... 
135 | for k_ in xrange(self.k[idx]): 136 | 137 | ss = self.sources[idx].readline() 138 | # NOTE: self.sources is where we keep the RAW data 139 | if ss == "": 140 | break 141 | if self.source_word_level: 142 | ss = ss.strip().split() 143 | else: 144 | ss = ss.strip() 145 | ss = list(ss.decode('utf8')) 146 | self.source_buffers[idx].append(ss) 147 | 148 | tt = self.targets[idx].readline() 149 | if tt == "": 150 | break 151 | if self.target_word_level: 152 | tt = tt.strip().split() 153 | else: 154 | tt = tt.strip() 155 | tt = list(tt.decode('utf8')) 156 | self.target_buffers[idx].append(tt) 157 | 158 | tlen = numpy.array([len(t) for t in self.target_buffers[idx]]) 159 | tidx = tlen.argsort() 160 | _sbuf = [self.source_buffers[idx][i] for i in tidx] 161 | _tbuf = [self.target_buffers[idx][i] for i in tidx] 162 | self.target_buffers[idx] = _tbuf 163 | self.source_buffers[idx] = _sbuf 164 | 165 | stop = False 166 | for idx in xrange(4): 167 | if len(self.source_buffers[idx]) < self.batch_sizes[idx]: 168 | stop = True 169 | 170 | if stop: 171 | self.end_of_data = False 172 | self.reset() 173 | raise StopIteration 174 | 175 | try: 176 | # actual work here 177 | for idx in xrange(4): 178 | while True: 179 | # read from source file and map to word index 180 | try: 181 | ss_ = self.source_buffers[idx].pop() 182 | except IndexError: 183 | # NOTE : just because source_buffers is empty, doesn't mean file scanned 184 | # we do add partial batches. We proceed until len(source_buffers) = 0 185 | break 186 | 187 | ss = [self.source_dict[w] if w in self.source_dict else 1 for w in ss_] 188 | if self.n_words_source > 0: 189 | ss = [w if w < self.n_words_source else 1 for w in ss] 190 | sources[idx].append(ss) 191 | 192 | tt_ = self.target_buffers[idx].pop() 193 | tt = [self.target_dict[w] if w in self.target_dict else 1 for w in tt_] 194 | if self.n_words_target > 0: 195 | tt = [w if w < self.n_words_target else 1 for w in tt] 196 | targets[idx].append(tt) 197 | 198 | if len(sources[idx]) >= self.batch_sizes[idx]: 199 | break 200 | 201 | except IOError: 202 | self.end_of_data = True 203 | 204 | source = sources[0] + sources[1] + sources[2] + sources[3] 205 | target = targets[0] + targets[1] + targets[2] + targets[3] 206 | 207 | # NOTE : just add anything, if still nothing, reset 208 | min_batch_size = numpy.sum(self.batch_sizes) 209 | # NOTE : this CANT BE ZERO!!!! 
bc buffer not multiple of things 210 | if len(source) < min_batch_size or len(target) < min_batch_size: 211 | self.end_of_data = False 212 | self.reset() 213 | raise StopIteration 214 | 215 | return source, target 216 | -------------------------------------------------------------------------------- /bpe2char/print_batch.py: -------------------------------------------------------------------------------- 1 | import pprint 2 | import numpy as np 3 | 4 | def pbatch(source, dic): 5 | ss = np.transpose(source) 6 | for line in ss[:10]: 7 | for word in line: 8 | a = dic[word] 9 | b = a 10 | if a == "eos": 11 | b = "_" 12 | elif a == "UNK": 13 | b = "|" 14 | print b, 15 | print " " 16 | print "" 17 | -------------------------------------------------------------------------------- /bpe2char/train_bi_bpe2char.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import sys 4 | from collections import OrderedDict 5 | from nmt import train 6 | from wmt_path import wmts 7 | from char_base import * 8 | 9 | layers = {'ff': ('param_init_fflayer', 'fflayer'), 10 | 'fff': ('param_init_ffflayer', 'ffflayer'), 11 | 'gru': ('param_init_gru', 'gru_layer'), 12 | 'two_layer_gru_decoder': ('param_init_two_layer_gru_decoder', 13 | 'two_layer_gru_decoder'), 14 | } 15 | 16 | def main(job_id, params): 17 | save_file_name = args.model_name 18 | source_dataset = args.data_path + wmts[args.translate]['train'][1][0] 19 | target_dataset = args.data_path + wmts[args.translate]['train'][0][1] 20 | valid_source_dataset = args.data_path + wmts[args.translate]['dev'][1][0] 21 | valid_target_dataset = args.data_path + wmts[args.translate]['dev'][0][1] 22 | source_dictionary = args.data_path + wmts[args.translate]['dic'][1][0] 23 | target_dictionary = args.data_path + wmts[args.translate]['dic'][0][1] 24 | 25 | print args.save_path, save_file_name 26 | print source_dataset 27 | print target_dataset 28 | print valid_source_dataset 29 | print valid_target_dataset 30 | print source_dictionary 31 | print target_dictionary 32 | print params, params.save_path, save_file_name 33 | 34 | validerr = train( 35 | max_epochs=args.max_epochs, 36 | patience=args.patience, 37 | 38 | dim_word_src=args.dim_word_src, 39 | dim_word=args.dim_word, 40 | 41 | save_path=args.save_path, 42 | save_file_name=save_file_name, 43 | re_load=args.re_load, 44 | re_load_old_setting=args.re_load_old_setting, 45 | 46 | enc_dim=args.enc_dim, 47 | dec_dim=args.dec_dim, 48 | 49 | n_words_src=args.n_words_src, 50 | n_words=args.n_words, 51 | decay_c=args.decay_c, 52 | lrate=args.learning_rate, 53 | optimizer=args.optimizer, 54 | maxlen=args.maxlen, 55 | maxlen_trg=args.maxlen_trg, 56 | maxlen_sample=args.maxlen_sample, 57 | batch_size=args.batch_size, 58 | valid_batch_size=args.valid_batch_size, 59 | sort_size=args.sort_size, 60 | validFreq=args.validFreq, 61 | dispFreq=args.dispFreq, 62 | saveFreq=args.saveFreq, 63 | sampleFreq=args.sampleFreq, 64 | pbatchFreq=args.pbatchFreq, 65 | clip_c=args.clip_c, 66 | 67 | datasets=[source_dataset, target_dataset], 68 | valid_datasets=[valid_source_dataset, valid_target_dataset], 69 | dictionaries=[source_dictionary, target_dictionary], 70 | 71 | use_dropout=args.use_dropout, 72 | source_word_level=args.source_word_level, 73 | target_word_level=args.target_word_level, 74 | save_every_saveFreq=1, 75 | use_bpe=1, 76 | gru=args.gru, 77 | 78 | quit_immediately=args.quit_immediately, 79 | init_params=init_params, 80 | build_model=build_model, 81 | 
build_sampler=build_sampler, 82 | gen_sample=gen_sample, 83 | ) 84 | return validerr 85 | 86 | if __name__ == '__main__': 87 | 88 | parser = argparse.ArgumentParser() 89 | parser.add_argument('-model_name', type=str, help="", default="bi-bpe2char") 90 | parser.add_argument('-translate', type=str, default="de_en", help="de_en / cs_en / fi_en / ru_en") 91 | 92 | parser.add_argument('-enc_dim', type=int, default=512, help="") 93 | parser.add_argument('-dec_dim', type=int, default=1024, help="") 94 | 95 | parser.add_argument('-dim_word', type=int, default=512, help="") 96 | parser.add_argument('-dim_word_src', type=int, default=512, help="") 97 | 98 | parser.add_argument('-batch_size', type=int, default=128, help="") 99 | parser.add_argument('-valid_batch_size', type=int, default=128, help="") 100 | 101 | parser.add_argument('-maxlen', type=int, default=50, help="") 102 | parser.add_argument('-maxlen_trg', type=int, default=500, help="") 103 | parser.add_argument('-maxlen_sample', type=int, default=500, help="") 104 | 105 | parser.add_argument('-re_load', action="store_true", default=False) 106 | parser.add_argument('-re_load_old_setting', action="store_true", default=False) 107 | parser.add_argument('-quit_immediately', action="store_true", default=False) 108 | 109 | parser.add_argument('-use_dropout', action="store_true", default=False) 110 | 111 | parser.add_argument('-max_epochs', type=int, default=1000000000000, help="") 112 | parser.add_argument('-patience', type=int, default=-1, help="") 113 | parser.add_argument('-learning_rate', type=float, default=0.0001, help="") 114 | 115 | parser.add_argument('-n_words_src', type=int, default=302, help="298 for FI") 116 | parser.add_argument('-n_words', type=int, default=302, help="292 for FI") 117 | 118 | parser.add_argument('-optimizer', type=str, default="adam", help="") 119 | parser.add_argument('-decay_c', type=int, default=0, help="") 120 | parser.add_argument('-clip_c', type=int, default=1, help="") 121 | 122 | parser.add_argument('-gru', type=str, default="gru", help="gru/lngru") 123 | 124 | parser.add_argument('-saveFreq', type=int, default=5000, help="") 125 | parser.add_argument('-sampleFreq', type=int, default=5000, help="") 126 | parser.add_argument('-dispFreq', type=int, default=1000, help="") 127 | parser.add_argument('-validFreq', type=int, default=5000, help="") 128 | parser.add_argument('-pbatchFreq', type=int, default=5000, help="") 129 | parser.add_argument('-sort_size', type=int, default=20, help="") 130 | 131 | parser.add_argument('-source_word_level', type=int, default=1, help="") 132 | parser.add_argument('-target_word_level', type=int, default=0, help="") 133 | 134 | args = parser.parse_args() 135 | 136 | n_words_dic = {'de_en': [24254, 302], 'cs_en': [21816, 302], 'fi_en':[20783, 292], 'ru_en':[22106, 302]} 137 | 138 | args.n_words_src = n_words_dic[args.translate][0] 139 | args.n_words= n_words_dic[args.translate][1] 140 | 141 | args.save_path = "/misc/kcgscratch1/ChoGroup/jasonlee/dl4mt-c2c/models/" # change accordingly 142 | args.data_path = "/misc/kcgscratch1/ChoGroup/jasonlee/temp_data/wmt15/" # change accordingly 143 | args.save_path = args.save_path + args.translate + "/" 144 | 145 | main(0, args) 146 | -------------------------------------------------------------------------------- /bpe2char/train_multi_bpe2char.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | import string 5 | import math 6 | import numpy 7 | from 
char_base_multi_b2c import * 8 | from nmt_many import train 9 | 10 | from collections import OrderedDict 11 | 12 | def main(job_id, params, args): 13 | print args 14 | save_file_name = args.model_name 15 | source_dataset = [args.data_path + path + tr for path, tr in zip(params['train_data_path'], params['source_dataset'])] 16 | target_dataset = [args.data_path + path + tr for path, tr in zip(params['train_data_path'], params['target_dataset'])] 17 | 18 | valid_source_dataset = [args.data_path + path + tr for path, tr in zip(params['dev_data_path'], params['valid_source_dataset'])] 19 | valid_target_dataset = [args.data_path + path + tr for path, tr in zip(params['dev_data_path'], params['valid_target_dataset'])] 20 | 21 | source_dictionary = args.dic_path + args.source_dictionary 22 | target_dictionary = args.dic_path + args.target_dictionary 23 | 24 | args.save_path = args.save_path + args.translate + "/" 25 | 26 | print params, args.save_path, save_file_name 27 | validerr = train( 28 | max_epochs=int(params['max_epochs']), 29 | patience=int(params['patience']), 30 | 31 | dim_word=args.dim_word, 32 | dim_word_src=args.dim_word_src, 33 | 34 | save_path=args.save_path, 35 | save_file_name=save_file_name, 36 | re_load=args.re_load, 37 | re_load_old_setting=args.re_load_old_setting, 38 | 39 | enc_dim=args.enc_dim, 40 | dec_dim=args.dec_dim, 41 | 42 | n_words=args.n_words, 43 | n_words_src=args.n_words_src, 44 | decay_c=float(params['decay_c']), 45 | lrate=float(params['learning_rate']), 46 | optimizer=params['optimizer'], 47 | maxlen=args.maxlen, 48 | maxlen_trg=args.maxlen_trg, 49 | maxlen_sample=args.maxlen_sample, 50 | batch_size=args.train_batch_size, 51 | valid_batch_size=args.valid_batch_size, 52 | sort_size=args.sort_size, 53 | validFreq=args.validFreq, 54 | dispFreq=args.dispFreq, 55 | saveFreq=args.saveFreq, 56 | sampleFreq=args.sampleFreq, 57 | pbatchFreq=args.pbatchFreq, 58 | clip_c=int(params['clip_c']), 59 | 60 | datasets=[source_dataset, target_dataset], 61 | valid_datasets=[[s,t] for s,t in zip(valid_source_dataset, valid_target_dataset)], 62 | dictionaries=[source_dictionary, target_dictionary], 63 | 64 | use_dropout=int(params['use_dropout']), 65 | source_word_level=int(params['source_word_level']), 66 | target_word_level=int(params['target_word_level']), 67 | save_every_saveFreq=1, 68 | use_bpe=0, 69 | init_params=init_params, 70 | build_model=build_model, 71 | build_sampler=build_sampler, 72 | gen_sample=gen_sample, 73 | ) 74 | return validerr 75 | 76 | if __name__ == '__main__': 77 | 78 | import sys, time 79 | 80 | parser = argparse.ArgumentParser() 81 | parser.add_argument('-model_name', type=str, help="", default="multi-bpe2char") 82 | parser.add_argument('-translate', type=str, default="many_en") 83 | 84 | parser.add_argument('-enc_dim', type=int, default=512, help="") 85 | parser.add_argument('-dec_dim', type=int, default=1024, help="") 86 | 87 | parser.add_argument('-dim_word', type=int, default=512, help="") 88 | parser.add_argument('-dim_word_src', type=int, default=512, help="") 89 | 90 | parser.add_argument('-n_words', type=int, default=402, help="") 91 | parser.add_argument('-n_words_src', type=int, default=54541, help="") 92 | 93 | parser.add_argument('-source_dictionary', type=str, default="bpe-source-for-dic.word.pkl", help="") 94 | parser.add_argument('-target_dictionary', type=str, default="target.402.pkl", help="") 95 | 96 | parser.add_argument('-saveFreq', type=int, default=5000, help="") 97 | parser.add_argument('-sampleFreq', type=int, default=5000, 
help="") 98 | parser.add_argument('-dispFreq', type=int, default=1000, help="") 99 | parser.add_argument('-validFreq', type=int, default=5000, help="") 100 | parser.add_argument('-pbatchFreq', type=int, default=-1, help="") 101 | parser.add_argument('-sort_size', type=int, default=20, help="") 102 | 103 | parser.add_argument('-maxlen', type=int, default=50, help="") 104 | parser.add_argument('-maxlen_trg', type=int, default=500, help="") 105 | parser.add_argument('-maxlen_sample', type=int, default=500, help="") 106 | 107 | parser.add_argument('-train_batch_size', type=str, default="4535523/12122376/1926115/2326893", help="") 108 | parser.add_argument('-valid_batch_size', type=int, default=60, help="") 109 | parser.add_argument('-batch_size', type=int, default=60, help="") 110 | 111 | parser.add_argument('-re_load', action="store_true", default=False) 112 | parser.add_argument('-re_load_old_setting', action="store_true", default=False) 113 | 114 | args = parser.parse_args() 115 | 116 | args.train_batch_size = [ int(x) for x in args.train_batch_size.split("/") ] 117 | 118 | train_batch_sum = numpy.sum(args.train_batch_size) 119 | 120 | args.train_batch_size = [ int(numpy.ceil(args.batch_size * x / float(train_batch_sum))) for x in args.train_batch_size ] 121 | args.train_batch_size = [ 14, 37, 6, 7 ] 122 | 123 | args.save_path = "/misc/kcgscratch1/ChoGroup/jasonlee/dl4mt-c2c/models/" # change accordingly 124 | args.data_path = "/misc/kcgscratch1/ChoGroup/jasonlee/temp_data/multi-wmt15/" # change accordingly 125 | args.dic_path = "/misc/kcgscratch1/ChoGroup/jasonlee/temp_data/multi-wmt15/dic/" # change accordingly 126 | 127 | config_file_name = '/misc/kcgscratch1/ChoGroup/jasonlee/dl4mt-c2c/bpe2char/wmt15_manyen_bpe2char_adam.txt' # change accordingly 128 | 129 | f = open(config_file_name, 'r') 130 | lines = f.readlines() 131 | params = OrderedDict() 132 | 133 | for line in lines: 134 | line = line.split('\n')[0] 135 | param_list = line.split(' ') 136 | 137 | if len(param_list) < 2: 138 | continue 139 | elif len(param_list) == 2: 140 | param_name = param_list[0] 141 | param_value = param_list[1] 142 | params[param_name] = param_value 143 | else: 144 | param_name = param_list[0] 145 | param_value = param_list[1:] 146 | params[param_name] = param_value 147 | 148 | main(0, params, args) 149 | -------------------------------------------------------------------------------- /bpe2char/wmt15_manyen_bpe2char_adam.txt: -------------------------------------------------------------------------------- 1 | train_data_path deen/train/ csen/train/ fien/train/ ruen/train/ 2 | dev_data_path deen/dev/ csen/dev/ fien/dev/ ruen/dev/ 3 | 4 | max_epochs 1000000000000 5 | patience -1 6 | learning_rate 0.0001 7 | 8 | optimizer adam 9 | decay_c 0 10 | use_dropout 0 11 | clip_c 1 12 | 13 | source_word_level 1 14 | target_word_level 0 15 | 16 | source_dataset all_de-en.de.tok.shuf.iso9.bpe.50000 all_cs-en.cs.tok.iso9.bpe.50000 all_fi-en.fi.tok.shuf.iso9.bpe.50000 all_ru-en.ru.tok.iso9.bpe.50000 17 | target_dataset all_de-en.en.tok.shuf.iso9 all_cs-en.en.tok.iso9 all_fi-en.en.tok.shuf.iso9 all_ru-en.en.tok.iso9 18 | 19 | valid_source_dataset newstest2013.de.tok.iso9.bpe.50000 newstest2013-ref.cs.tok.iso9.bpe.50000 newsdev2015-enfi-ref.fi.tok.iso9.bpe.50000 newstest2013-ref.ru.tok.iso9.bpe.50000 20 | valid_target_dataset newstest2013.en.tok.iso9 newstest2013-src.en.tok.iso9 newsdev2015-enfi-src.en.tok.iso9 newstest2013-src.en.tok.iso9 21 | -------------------------------------------------------------------------------- 
/bpe2char/wmt_path.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | deen={ 4 | "dic": [ 5 | ["deen/train/all_de-en.de.tok.304.pkl", 6 | "deen/train/all_de-en.en.tok.300.pkl",], 7 | 8 | ["deen/train/all_de-en.de.tok.bpe.word.pkl"], 9 | ], 10 | 11 | "train": [ 12 | ["deen/train/all_de-en.de.tok.shuf", 13 | "deen/train/all_de-en.en.tok.shuf",], 14 | 15 | ["deen/train/all_de-en.de.tok.bpe.shuf", 16 | "deen/train/all_de-en.en.tok.bpe.shuf",], 17 | ], 18 | 19 | "dev": [ 20 | ["deen/dev/newstest2013.de.tok", 21 | "deen/dev/newstest2013.en.tok",], 22 | 23 | ["deen/dev/newstest2013.de.tok.bpe", 24 | "deen/dev/newstest2013.en.tok.bpe",], 25 | ], 26 | 27 | "test1" :[ 28 | ["deen/test/newstest2014-deen-ref.de.tok", 29 | "deen/test/newstest2014-deen-src.en.tok",], 30 | 31 | ["deen/test/newstest2014-deen-ref.de.tok.bpe", 32 | "deen/test/newstest2014-deen-src.en.tok.bpe",], 33 | ], 34 | 35 | "test2":[ 36 | ["deen/test/newstest2015-deen-ref.de.tok", 37 | "deen/test/newstest2015-deen-src.en.tok",], 38 | 39 | ["deen/test/newstest2015-deen-ref.de.tok.bpe", 40 | "deen/test/newstest2015-deen-src.en.tok.bpe",], 41 | ], 42 | } 43 | 44 | csen={ 45 | 46 | "dic":[ 47 | ["csen/train/all_cs-en.cs.tok.304.pkl", 48 | "csen/train/all_cs-en.en.tok.300.pkl",], 49 | 50 | ["csen/train/all_cs-en.cs.tok.bpe.word.pkl"], 51 | ], 52 | 53 | "train":[ 54 | ["csen/train/all_cs-en.cs.tok", 55 | "csen/train/all_cs-en.en.tok",], 56 | 57 | ["csen/train/all_cs-en.cs.tok.bpe", 58 | "csen/train/all_cs-en.en.tok.bpe",], 59 | ], 60 | 61 | "dev": [ 62 | ["csen/dev/newstest2013-ref.cs.tok", 63 | "csen/dev/newstest2013-src.en.tok",], 64 | 65 | ["csen/dev/newstest2013-ref.cs.tok.bpe", 66 | "csen/dev/newstest2013-src.en.tok.bpe",], 67 | ], 68 | 69 | "test1":[ 70 | ["csen/test/newstest2014-csen-ref.cs.tok", 71 | "csen/test/newstest2014-csen-src.en.tok",], 72 | 73 | ["csen/test/newstest2014-csen-ref.cs.tok.bpe", 74 | "csen/test/newstest2014-csen-src.en.tok.bpe",], 75 | ], 76 | 77 | "test2":[ 78 | ["csen/test/newstest2015-csen-ref.cs.tok", 79 | "csen/test/newstest2015-csen-src.en.tok",], 80 | 81 | ["csen/test/newstest2015-csen-ref.cs.tok.bpe", 82 | "csen/test/newstest2015-csen-src.en.tok.bpe",], 83 | ] 84 | } 85 | 86 | fien={ 87 | "dic":[ 88 | ["fien/train/all_fi-en.fi.tok.304.pkl", 89 | "fien/train/all_fi-en.en.tok.300.pkl",], 90 | 91 | ["fien/train/all_fi-en.fi.tok.bpe.word.pkl"], 92 | ], 93 | 94 | "train":[ 95 | ["fien/train/all_fi-en.fi.tok", 96 | "fien/train/all_fi-en.en.tok",], 97 | 98 | ["fien/train/all_fi-en.fi.tok.bpe", 99 | "fien/train/all_fi-en.en.tok.bpe",], 100 | ], 101 | 102 | "dev":[ 103 | ["fien/dev/newsdev2015-enfi-ref.fi.tok", 104 | "fien/dev/newsdev2015-enfi-src.en.tok",], 105 | 106 | ["fien/dev/newsdev2015-enfi-ref.fi.tok.bpe", 107 | "fien/dev/newsdev2015-enfi-src.en.tok.bpe",], 108 | ], 109 | 110 | "test1":[ 111 | ["fien/test/newstest2015-fien-ref.fi.tok", 112 | "fien/test/newstest2015-fien-src.en.tok",], 113 | 114 | ["fien/test/newstest2015-fien-ref.fi.tok.bpe", 115 | "fien/test/newstest2015-fien-src.en.tok.bpe",], 116 | ], 117 | } 118 | 119 | ruen={ 120 | 121 | "dic":[ 122 | ["ruen/train/all_ru-en.ru.tok.304.pkl", 123 | "ruen/train/all_ru-en.en.tok.300.pkl",], 124 | 125 | ["ruen/train/all_ru-en.ru.tok.bpe.word.pkl"], 126 | ], 127 | 128 | "train":[ 129 | ["ruen/train/all_ru-en.ru.tok", 130 | "ruen/train/all_ru-en.en.tok",], 131 | 132 | ["ruen/train/all_ru-en.ru.tok.bpe", 133 | "ruen/train/all_ru-en.en.tok.bpe",], 134 | ], 135 | 136 | "dev":[ 137 | 
["ruen/dev/newstest2013-ref.ru.tok", 138 | "ruen/dev/newstest2013-src.en.tok",], 139 | 140 | ["ruen/dev/newstest2013-ref.ru.tok.bpe", 141 | "ruen/dev/newstest2013-src.en.tok.bpe",], 142 | ], 143 | 144 | "test1":[ 145 | ["ruen/test/newstest2014-ruen-ref.ru.tok", 146 | "ruen/test/newstest2014-ruen-src.en.tok",], 147 | 148 | ["ruen/test/newstest2014-ruen-ref.ru.tok.bpe", 149 | "ruen/test/newstest2014-ruen-src.en.tok.bpe",], 150 | ], 151 | 152 | "test2":[ 153 | ["ruen/test/newstest2015-ruen-ref.ru.tok", 154 | "ruen/test/newstest2015-ruen-src.en.tok",], 155 | 156 | ["ruen/test/newstest2015-ruen-ref.ru.tok.bpe", 157 | "ruen/test/newstest2015-ruen-src.en.tok.bpe",], 158 | ] 159 | } 160 | 161 | manyen = { 162 | "dic":[ 163 | ["char-source-for-dic.300.pkl", 164 | "char-target-for-dic.300.pkl"], 165 | 166 | ["bpe-source-for-dic.word.pkl"] 167 | ] 168 | } 169 | 170 | wmts = dict() 171 | wmts["de_en"] = deen 172 | wmts["cs_en"] = csen 173 | wmts["fi_en"] = fien 174 | wmts["ru_en"] = ruen 175 | wmts["many_en"] = manyen 176 | -------------------------------------------------------------------------------- /char2char/char_base.py: -------------------------------------------------------------------------------- 1 | import theano 2 | from theano import tensor 3 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 4 | 5 | import cPickle 6 | import numpy 7 | import copy 8 | 9 | import os 10 | import warnings 11 | import sys 12 | import time 13 | 14 | from collections import OrderedDict 15 | from mixer import * 16 | 17 | def init_params(options): 18 | params = OrderedDict() 19 | 20 | print "new char_base initialise..." 21 | print "source dictionary size: %d" % options['n_words_src'] 22 | # embedding 23 | params['Wemb'] = norm_weight(options['n_words_src'], options['dim_word_src']) 24 | params['Wemb_dec'] = norm_weight(options['n_words'], options['dim_word']) 25 | 26 | params = get_layer('multi_scale_conv_encoder')[0](options, params, prefix='multi_scale_conv_enc1', dim=options['dim_word_src'], width=options['conv_width'], nkernels=options['conv_nkernels']) 27 | 28 | for ii in xrange(options['highway']): 29 | params = get_layer('hw')[0](options, params, prefix="hw_network{}".format(ii+1), dim=numpy.sum(options['conv_nkernels'])) 30 | 31 | params = get_layer('gru')[0](options, params, 32 | prefix='encoder', 33 | nin=numpy.sum(options['conv_nkernels']), 34 | dim=options['enc_dim']) 35 | params = get_layer('gru')[0](options, params, 36 | prefix='encoderr', 37 | nin=numpy.sum(options['conv_nkernels']), 38 | dim=options['enc_dim']) 39 | ctxdim = 2 * options['enc_dim'] 40 | 41 | params = get_layer('ff')[0](options, params, 42 | prefix='ff_init_state_char', 43 | nin=ctxdim, 44 | nout=options['dec_dim']) 45 | params = get_layer('ff')[0](options, params, 46 | prefix='ff_init_state_word', 47 | nin=ctxdim, 48 | nout=options['dec_dim']) 49 | 50 | print "target dictionary size: %d" % options['n_words'] 51 | # decoder 52 | params = get_layer('two_layer_gru_decoder')[0](options, params, 53 | prefix='decoder', 54 | nin=options['dim_word'], 55 | dim_char=options['dec_dim'], 56 | dim_word=options['dec_dim'], 57 | dimctx=ctxdim) 58 | 59 | # readout 60 | params = get_layer('fff')[0](options, params, prefix='ff_logit_rnn', 61 | nin1=options['dec_dim'], nin2=options['dec_dim'], 62 | nout=options['dim_word'], ortho=False) 63 | params = get_layer('ff')[0](options, params, prefix='ff_logit_prev', 64 | nin=options['dim_word'], 65 | nout=options['dim_word'], 66 | ortho=False) 67 | params = get_layer('ff')[0](options, 
params, prefix='ff_logit_ctx', 68 | nin=ctxdim, 69 | nout=options['dim_word'], 70 | ortho=False) 71 | params = get_layer('ff')[0](options, params, prefix='ff_logit', 72 | nin=options['dim_word'], 73 | nout=options['n_words']) 74 | 75 | return params 76 | 77 | 78 | def build_model(tparams, options): 79 | opt_ret = OrderedDict() 80 | 81 | trng = RandomStreams(numpy.random.RandomState(numpy.random.randint(1024)).randint(numpy.iinfo(numpy.int32).max)) 82 | use_noise = theano.shared(numpy.float32(0.)) 83 | 84 | # description string: #words x #samples 85 | x = tensor.matrix('x', dtype='int64') 86 | x_mask = tensor.matrix('x_mask', dtype='float32') 87 | 88 | y = tensor.matrix('y', dtype='int64') 89 | y_mask = tensor.matrix('y_mask', dtype='float32') 90 | 91 | xr_mask = x_mask[::-1] 92 | 93 | n_samples = x.shape[1] 94 | n_timesteps = x.shape[0] 95 | n_timesteps_trg = y.shape[0] 96 | 97 | # word embedding for forward RNN (source) 98 | emb = tparams['Wemb'][x.flatten()] 99 | emb = emb.reshape([n_timesteps, n_samples, options['dim_word_src']]) 100 | # emb.shape = (maxlen_x_pad + 2*pool_stride, n_samples, dim_word_src) 101 | 102 | conv_out = get_layer('multi_scale_conv_encoder')[1](tparams, emb, options, prefix='multi_scale_conv_enc1', width=options['conv_width'], nkernels=options['conv_nkernels'], pool_window=options['pool_window'], pool_stride=options['pool_stride']) 103 | # conv_out.shape = (maxlen_x_pad/pool_stride, n_samples, sum(nkernels)) 104 | 105 | hw_in = conv_out.reshape([conv_out.shape[0] * conv_out.shape[1], conv_out.shape[2]]) 106 | for ii in xrange(options['highway']): 107 | hw_in = get_layer('hw')[1](tparams, hw_in, options, prefix="hw_network{}".format(ii+1)) 108 | hw_out = hw_in.reshape([conv_out.shape[0], conv_out.shape[1], conv_out.shape[2]]) 109 | # hw_out.shape = (maxlen_x_pad/pool_stride, n_samples, sum(nkernels)) 110 | 111 | if options['dropout_gru']: 112 | print "Dropout before GRUs." 
113 | hw_out = hw_out * trng.binomial(hw_out.shape, p=0.5, n=1, dtype=hw_out.dtype) * 2.0 114 | 115 | # pass through gru layer, recurrence here 116 | proj = get_layer('gru')[1](tparams, hw_out, options, prefix='encoder', mask=x_mask) 117 | projr = get_layer('gru')[1](tparams, hw_out[::-1], options, prefix='encoderr', mask=xr_mask) 118 | 119 | # context 120 | ctx = concatenate([proj, projr[::-1]], axis=proj.ndim-1) 121 | 122 | # context mean 123 | ctx_mean = (ctx * x_mask[:, :, None]).sum(0) / x_mask.sum(0)[:, None] 124 | 125 | # initial decoder state 126 | init_state_char = get_layer('ff')[1](tparams, ctx_mean, options, 127 | prefix='ff_init_state_char', activ='tanh') 128 | init_state_word = get_layer('ff')[1](tparams, ctx_mean, options, 129 | prefix='ff_init_state_word', activ='tanh') 130 | 131 | # word embedding and shifting for targets 132 | yemb = tparams['Wemb_dec'][y.flatten()] 133 | yemb = yemb.reshape([n_timesteps_trg, n_samples, options['dim_word']]) 134 | yemb_shited = tensor.zeros_like(yemb) 135 | yemb_shited = tensor.set_subtensor(yemb_shited[1:], yemb[:-1]) 136 | yemb = yemb_shited 137 | 138 | char_h, word_h, ctxs, alphas = \ 139 | get_layer('two_layer_gru_decoder')[1](tparams, yemb, options, 140 | prefix='decoder', 141 | mask=y_mask, 142 | context=ctx, 143 | context_mask=x_mask, 144 | one_step=False, 145 | init_state_char=init_state_char, 146 | init_state_word=init_state_word) 147 | 148 | opt_ret['dec_alphas'] = alphas 149 | 150 | # compute word probabilities 151 | logit_rnn = get_layer('fff')[1](tparams, char_h, word_h, options, 152 | prefix='ff_logit_rnn', activ='linear') 153 | logit_prev = get_layer('ff')[1](tparams, yemb, options, 154 | prefix='ff_logit_prev', activ='linear') 155 | logit_ctx = get_layer('ff')[1](tparams, ctxs, options, 156 | prefix='ff_logit_ctx', activ='linear') 157 | logit = tensor.tanh(logit_rnn + logit_prev + logit_ctx) 158 | 159 | if options['dropout_softmax']: 160 | print "Dropout before Softmax" 161 | logit = logit * trng.binomial(logit.shape, p=0.5, n=1, dtype=logit.dtype) * 2.0 162 | 163 | logit = get_layer('ff')[1](tparams, logit, options, 164 | prefix='ff_logit', activ='linear') 165 | logit_shp = logit.shape 166 | probs = tensor.nnet.softmax(logit.reshape([logit_shp[0]*logit_shp[1], logit_shp[2]])) 167 | 168 | # cost 169 | y_flat = y.flatten() 170 | y_flat_idx = tensor.arange(y_flat.shape[0]) * options['n_words'] + y_flat 171 | cost = -tensor.log(probs.flatten()[y_flat_idx]) 172 | cost = cost.reshape([y.shape[0], y.shape[1]]) 173 | cost = (cost * y_mask).sum(0) 174 | 175 | return trng, use_noise, x, x_mask, y, y_mask, opt_ret, cost 176 | 177 | def build_sampler(tparams, options, trng, use_noise): 178 | 179 | x = tensor.matrix('x', dtype='int64') 180 | 181 | n_timesteps = x.shape[0] 182 | n_samples = x.shape[1] 183 | 184 | emb = tparams['Wemb'][x.flatten()] 185 | emb = emb.reshape([n_timesteps, n_samples, options['dim_word_src']]) 186 | 187 | conv_out = get_layer('multi_scale_conv_encoder')[1](tparams, emb, options, prefix='multi_scale_conv_enc1', width=options['conv_width'], nkernels=options['conv_nkernels'], pool_window=options['pool_window'], pool_stride=options['pool_stride']) 188 | 189 | hw_in = conv_out.reshape([conv_out.shape[0] * conv_out.shape[1], conv_out.shape[2]]) 190 | for ii in xrange(options['highway']): 191 | hw_in = get_layer('hw')[1](tparams, hw_in, options, prefix="hw_network{}".format(ii+1)) 192 | hw_out = hw_in.reshape([conv_out.shape[0], conv_out.shape[1], conv_out.shape[2]]) 193 | 194 | # pass through gru layer, recurrence 
here 195 | proj = get_layer('gru')[1](tparams, hw_out, options, prefix='encoder') 196 | projr = get_layer('gru')[1](tparams, hw_out[::-1], options, prefix='encoderr') 197 | 198 | ctx = concatenate([proj, projr[::-1]], axis=proj.ndim-1) 199 | ctx_mean = ctx.mean(0) 200 | 201 | init_state_char = get_layer('ff')[1](tparams, ctx_mean, options, 202 | prefix='ff_init_state_char', activ='tanh') 203 | init_state_word = get_layer('ff')[1](tparams, ctx_mean, options, 204 | prefix='ff_init_state_word', activ='tanh') 205 | 206 | print 'Building f_init...', 207 | outs = [init_state_char, init_state_word, ctx] 208 | f_init = theano.function([x], outs, name='f_init', profile=profile) 209 | print 'Done' 210 | 211 | y = tensor.vector('y_sampler', dtype='int64') 212 | init_state_char = tensor.matrix('init_state_char', dtype='float32') 213 | init_state_word = tensor.matrix('init_state_word', dtype='float32') 214 | 215 | # if it's the first word, emb should be all zero and it is indicated by -1 216 | yemb = tensor.switch(y[:, None] < 0, 217 | tensor.alloc(0., 1, tparams['Wemb_dec'].shape[1]), 218 | tparams['Wemb_dec'][y]) 219 | 220 | next_state_char, next_state_word, next_ctx, next_alpha = \ 221 | get_layer('two_layer_gru_decoder')[1](tparams, yemb, options, 222 | prefix='decoder', 223 | context=ctx, 224 | mask=None, 225 | one_step=True, 226 | init_state_char=init_state_char, 227 | init_state_word=init_state_word) 228 | 229 | logit_rnn = get_layer('fff')[1](tparams, 230 | next_state_char, 231 | next_state_word, 232 | options, 233 | prefix='ff_logit_rnn', 234 | activ='linear') 235 | # dec_dim, dec_dim => dim_word 236 | logit_prev = get_layer('ff')[1](tparams, 237 | yemb, 238 | options, 239 | prefix='ff_logit_prev', 240 | activ='linear') 241 | # dim_word => dim_word 242 | logit_ctx = get_layer('ff')[1](tparams, 243 | next_ctx, 244 | options, 245 | prefix='ff_logit_ctx', 246 | activ='linear') 247 | # ctx_dim => dim_word 248 | logit = tensor.tanh(logit_rnn + logit_prev + logit_ctx) 249 | 250 | logit = get_layer('ff')[1](tparams, logit, options, 251 | prefix='ff_logit', 252 | activ='linear') 253 | next_probs = tensor.nnet.softmax(logit) 254 | next_sample = trng.multinomial(pvals=next_probs).argmax(1) 255 | 256 | # next word probability 257 | print 'Building f_next...', 258 | inps = [y, ctx, init_state_char, init_state_word] 259 | outs = [next_probs, next_sample, next_state_char, next_state_word] 260 | f_next = theano.function(inps, outs, name='f_next', profile=profile) 261 | print 'Done' 262 | 263 | return f_init, f_next 264 | 265 | def gen_sample(tparams, f_init, f_next, x, options, trng=None, 266 | k=1, maxlen=500, stochastic=True, argmax=False): 267 | 268 | # k is the beam size we have 269 | if k > 1: 270 | assert not stochastic, \ 271 | 'Beam search does not support stochastic sampling' 272 | 273 | sample = [] 274 | sample_score = [] 275 | if stochastic: 276 | sample_score = 0 277 | 278 | live_k = 1 279 | dead_k = 0 280 | 281 | hyp_samples = [[]] * live_k 282 | hyp_scores = numpy.zeros(live_k).astype('float32') 283 | hyp_states = [] 284 | 285 | # get initial state of decoder rnn and encoder context 286 | ret = f_init(x) 287 | 288 | next_state_char, next_state_word, ctx0 = ret[0], ret[1], ret[2] 289 | next_w = -1 * numpy.ones((1,)).astype('int64') # bos indicator 290 | 291 | for ii in xrange(maxlen): 292 | ctx = numpy.tile(ctx0, [live_k, 1]) 293 | inps = [next_w, ctx, next_state_char, next_state_word] 294 | 295 | ret = f_next(*inps) 296 | 297 | next_p, next_w, next_state_char, next_state_word = ret[0], ret[1], 
ret[2], ret[3] 298 | 299 | # FALSE while decoding 300 | if stochastic: 301 | if argmax: 302 | nw = next_p[0].argmax() 303 | else: 304 | nw = next_w[0] 305 | sample.append(nw) 306 | sample_score += next_p[0, nw] 307 | if nw == 0: 308 | break 309 | else: 310 | cand_scores = hyp_scores[:, None] - numpy.log(next_p) 311 | cand_flat = cand_scores.flatten() 312 | 313 | ranks_flat = cand_flat.argsort()[:(k-dead_k)] 314 | # k: beam width 315 | # dead_k : initially 0, increments 1 by 1 316 | 317 | voc_size = next_p.shape[1] 318 | trans_indices = ranks_flat / voc_size 319 | word_indices = ranks_flat % voc_size 320 | costs = cand_flat[ranks_flat] 321 | # here, basically sort cand_flat 322 | 323 | new_hyp_samples = [] 324 | # k : beam width 325 | new_hyp_scores = numpy.zeros(k-dead_k).astype('float32') 326 | new_hyp_states_char = [] 327 | new_hyp_states_word = [] 328 | 329 | for idx, [ti, wi] in enumerate(zip(trans_indices, word_indices)): 330 | new_hyp_samples.append(hyp_samples[ti]+[wi]) 331 | new_hyp_scores[idx] = copy.copy(costs[idx]) 332 | new_hyp_states_char.append(copy.copy(next_state_char[ti])) 333 | new_hyp_states_word.append(copy.copy(next_state_word[ti])) 334 | 335 | # check the finished samples 336 | new_live_k = 0 337 | hyp_samples = [] 338 | hyp_scores = [] 339 | hyp_states_char = [] 340 | hyp_states_word = [] 341 | 342 | for idx in xrange(len(new_hyp_samples)): 343 | if new_hyp_samples[idx][-1] == 0: 344 | sample.append(new_hyp_samples[idx]) 345 | sample_score.append(new_hyp_scores[idx]) 346 | dead_k += 1 347 | else: 348 | new_live_k += 1 349 | hyp_samples.append(new_hyp_samples[idx]) 350 | hyp_scores.append(new_hyp_scores[idx]) 351 | hyp_states_char.append(new_hyp_states_char[idx]) 352 | hyp_states_word.append(new_hyp_states_word[idx]) 353 | hyp_scores = numpy.array(hyp_scores) 354 | live_k = new_live_k 355 | 356 | if new_live_k < 1: 357 | break 358 | if dead_k >= k: 359 | break 360 | 361 | next_w = numpy.array([w[-1] for w in hyp_samples]) 362 | next_state_char = numpy.array(hyp_states_char) 363 | next_state_word = numpy.array(hyp_states_word) 364 | 365 | if not stochastic: 366 | # dump every remaining one 367 | if live_k > 0: 368 | for idx in xrange(live_k): 369 | sample.append(hyp_samples[idx]) 370 | sample_score.append(hyp_scores[idx]) 371 | 372 | return sample, sample_score 373 | -------------------------------------------------------------------------------- /char2char/conv_tools.py: -------------------------------------------------------------------------------- 1 | import theano 2 | from theano import tensor 3 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 4 | 5 | import cPickle 6 | import numpy 7 | import copy 8 | 9 | import os 10 | import warnings 11 | import sys 12 | import time 13 | 14 | def conv_mask_pool(x_mask, pool_stride): 15 | # x_mask.shape = (maxlen_x_pad, n_samples) 16 | maxlen_x_pad, n_samples = x_mask.shape[0], x_mask.shape[1] 17 | maxlen_pooled = maxlen_x_pad / pool_stride 18 | 19 | x_m = numpy.zeros((maxlen_pooled, n_samples)).astype('float32') 20 | 21 | for idx in range(n_samples): 22 | x_sum = numpy.sum(x_mask[:,idx]) 23 | x_num = numpy.ceil( x_sum / float(pool_stride)) 24 | x_num = int(x_num) 25 | x_m[:x_num, idx] = 1.0 26 | 27 | return x_m 28 | -------------------------------------------------------------------------------- /char2char/data_iterator.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import numpy 3 | import os 4 | import random 5 | 6 | import cPickle 7 | import gzip 
8 | import codecs 9 | 10 | from tempfile import mkstemp 11 | 12 | random.seed(1029381209) 13 | 14 | def fopen(filename, mode='r'): 15 | if filename.endswith('.gz'): 16 | return gzip.open(filename, mode) 17 | return open(filename, mode) 18 | 19 | class TextIterator: 20 | """Simple Bitext iterator.""" 21 | def __init__(self, 22 | source, source_dict, 23 | target=None, target_dict=None, 24 | source_word_level=0, 25 | target_word_level=0, 26 | batch_size=128, 27 | job_id=0, 28 | sort_size=20, 29 | n_words_source=-1, 30 | n_words_target=-1, 31 | shuffle_per_epoch=False): 32 | self.source_file = source 33 | self.target_file = target 34 | self.source = fopen(source, 'r') 35 | with open(source_dict, 'rb') as f: 36 | self.source_dict = cPickle.load(f) 37 | if target is not None: 38 | self.target = fopen(target, 'r') 39 | if target_dict is not None: 40 | with open(target_dict, 'rb') as f: 41 | self.target_dict = cPickle.load(f) 42 | else: 43 | self.target = None 44 | 45 | self.source_word_level = source_word_level 46 | self.target_word_level = target_word_level 47 | self.batch_size = batch_size 48 | 49 | self.n_words_source = n_words_source 50 | self.n_words_target = n_words_target 51 | self.shuffle_per_epoch = shuffle_per_epoch 52 | 53 | self.source_buffer = [] 54 | self.target_buffer = [] 55 | self.k = batch_size * sort_size 56 | 57 | self.end_of_data = False 58 | self.job_id = job_id 59 | 60 | def __iter__(self): 61 | return self 62 | 63 | def reset(self): 64 | if self.shuffle_per_epoch: 65 | # close current files 66 | self.source.close() 67 | if self.target is None: 68 | self.shuffle([self.source_file]) 69 | self.source = fopen(self.source_file + '.reshuf_%d' % self.job_id, 'r') 70 | else: 71 | self.target.close() 72 | # shuffle *original* source files, 73 | self.shuffle([self.source_file, self.target_file]) 74 | # open newly 're-shuffled' file as input 75 | self.source = fopen(self.source_file + '.reshuf_%d' % self.job_id, 'r') 76 | self.target = fopen(self.target_file + '.reshuf_%d' % self.job_id, 'r') 77 | else: 78 | self.source.seek(0) 79 | if self.target is not None: 80 | self.target.seek(0) 81 | 82 | @staticmethod 83 | def shuffle(files): 84 | tf_os, tpath = mkstemp() 85 | tf = open(tpath, 'w') 86 | fds = [open(ff) for ff in files] 87 | for l in fds[0]: 88 | lines = [l.strip()] + [ff.readline().strip() for ff in fds[1:]] 89 | print >>tf, "|||".join(lines) 90 | [ff.close() for ff in fds] 91 | tf.close() 92 | tf = open(tpath, 'r') 93 | lines = tf.readlines() 94 | random.shuffle(lines) 95 | fds = [open(ff+'.reshuf','w') for ff in files] 96 | for l in lines: 97 | s = l.strip().split('|||') 98 | for ii, fd in enumerate(fds): 99 | print >>fd, s[ii] 100 | [ff.close() for ff in fds] 101 | os.remove(tpath) 102 | return 103 | 104 | def next(self): 105 | if self.end_of_data: 106 | self.end_of_data = False 107 | self.reset() 108 | raise StopIteration 109 | 110 | source = [] 111 | target = [] 112 | 113 | # fill buffer, if it's empty 114 | if self.target is not None: 115 | assert len(self.source_buffer) == len(self.target_buffer), 'Buffer size mismatch!' 
116 | 117 | if len(self.source_buffer) == 0: 118 | for k_ in xrange(self.k): 119 | #rand_idx = random.randint(0,len(self.source_buffer)) 120 | 121 | ss = self.source.readline() 122 | 123 | if ss == "": 124 | break 125 | 126 | if self.source_word_level: 127 | ss = ss.strip().split() 128 | else: 129 | ss = ss.strip() 130 | ss = list(ss.decode('utf8')) 131 | 132 | #self.source_buffer.insert(rand_idx, ss) 133 | self.source_buffer.append(ss) 134 | 135 | if self.target is not None: 136 | tt = self.target.readline() 137 | 138 | if tt == "": 139 | break 140 | 141 | if self.target_word_level: 142 | tt = tt.strip().split() 143 | else: 144 | tt = tt.strip() 145 | tt = list(tt.decode('utf8')) 146 | 147 | #self.target_buffer.insert(rand_idx, tt) 148 | self.target_buffer.append(tt) 149 | 150 | if self.target is not None: 151 | # sort by target buffer 152 | tlen = numpy.array([len(t) for t in self.target_buffer]) 153 | tidx = tlen.argsort() 154 | _sbuf = [self.source_buffer[i] for i in tidx] 155 | _tbuf = [self.target_buffer[i] for i in tidx] 156 | self.target_buffer = _tbuf 157 | else: 158 | slen = numpy.array([len(s) for s in self.source_buffer]) 159 | sidx = slen.argsort() 160 | _sbuf = [self.source_buffer[i] for i in sidx] 161 | 162 | self.source_buffer = _sbuf 163 | 164 | if self.target is not None: 165 | if len(self.source_buffer) == 0 or len(self.target_buffer) == 0: 166 | self.end_of_data = False 167 | self.reset() 168 | raise StopIteration 169 | elif len(self.source_buffer) == 0: 170 | self.end_of_data = False 171 | self.reset() 172 | raise StopIteration 173 | 174 | try: 175 | # actual work here 176 | while True: 177 | # read from source file and map to word index 178 | try: 179 | ss_ = self.source_buffer.pop() 180 | except IndexError: 181 | break 182 | ss = [self.source_dict[w] if w in self.source_dict else 1 for w in ss_] 183 | if self.n_words_source > 0: 184 | ss = [w if w < self.n_words_source else 1 for w in ss] 185 | 186 | # NOTE : prepending and appending with SOS and EOS symbols 187 | # see preprocess/build_dictionary_char.py to see why 2 and 3. 
188 | ss = [2] + ss + [3] 189 | source.append(ss) 190 | 191 | if self.target is not None: 192 | # read from target file and map to word index 193 | tt_ = self.target_buffer.pop() 194 | tt = [self.target_dict[w] if w in self.target_dict else 1 for w in tt_] 195 | if self.n_words_target > 0: 196 | tt = [w if w < self.n_words_target else 1 for w in tt] 197 | target.append(tt) 198 | 199 | if len(source) >= self.batch_size: 200 | break 201 | except IOError: 202 | self.end_of_data = True 203 | 204 | if self.target is not None: 205 | if len(source) <= 0 or len(target) <= 0: 206 | self.end_of_data = False 207 | self.reset() 208 | raise StopIteration 209 | return source, target 210 | else: 211 | if len(source) <= 0: 212 | self.end_of_data = False 213 | self.reset() 214 | raise StopIteration 215 | return source 216 | -------------------------------------------------------------------------------- /char2char/many_data_iterator.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import numpy 3 | import os 4 | import random 5 | 6 | import cPickle 7 | import gzip 8 | import codecs 9 | 10 | from tempfile import mkstemp 11 | 12 | random.seed(1029381209) 13 | 14 | def fopen(filename, mode='r'): 15 | if filename.endswith('.gz'): 16 | return gzip.open(filename, mode) 17 | return open(filename, mode) 18 | 19 | class MultiTextIterator: 20 | """Simple Bitext iterator.""" 21 | def __init__(self, 22 | source, source_dict, 23 | target=None, target_dict=None, 24 | source_word_level=0, 25 | target_word_level=0, 26 | batch_size=[128,1,2,3], 27 | job_id=0, 28 | sort_size=20, 29 | n_words_source=302, 30 | n_words_target=302, 31 | shuffle_per_epoch=False): 32 | 33 | self.source_files = source 34 | self.target_files = target 35 | 36 | self.sources = [fopen(s, 'r') for s in source] 37 | with open(source_dict, 'rb') as f: 38 | self.source_dict = cPickle.load(f) 39 | # one source dictionary 40 | 41 | self.targets = [fopen(t, 'r') for t in target] 42 | with open(target_dict, 'rb') as f: 43 | self.target_dict = cPickle.load(f) 44 | # one target dictionary 45 | 46 | self.source_word_level = source_word_level 47 | self.target_word_level = target_word_level 48 | self.batch_sizes = batch_size 49 | # list 50 | 51 | self.n_words_source = n_words_source 52 | self.n_words_target = n_words_target 53 | self.shuffle_per_epoch = shuffle_per_epoch 54 | 55 | self.source_buffers = [[],[],[],[]] 56 | self.target_buffers = [[],[],[],[]] 57 | self.k = [bs * sort_size for bs in batch_size] 58 | # at once, fetch 20 items 59 | # we're good for 20 updates 60 | 61 | self.end_of_data = False 62 | self.job_id = job_id 63 | 64 | def __iter__(self): 65 | return self 66 | 67 | def reset(self): 68 | if self.shuffle_per_epoch: 69 | raise Exception("hi") 70 | # close current files 71 | for s in self.sources: 72 | s.close() 73 | 74 | if self.targets is None: 75 | self.shuffle([self.source_file]) 76 | self.source = fopen(self.source_file + '.reshuf_%d' % self.job_id, 'r') 77 | else: 78 | for t in self.targets: 79 | t.close() 80 | 81 | # shuffle *original* source files, 82 | self.shuffle([self.source_file, self.target_file]) 83 | # open newly 're-shuffled' file as input 84 | self.source = fopen(self.source_file + '.reshuf_%d' % self.job_id, 'r') 85 | self.target = fopen(self.target_file + '.reshuf_%d' % self.job_id, 'r') 86 | else: 87 | for idx in xrange(4): 88 | self.sources[idx].seek(0) 89 | self.targets[idx].seek(0) 90 | 91 | @staticmethod 92 | def shuffle(files): 93 | tf_os, tpath = mkstemp() 94 | tf = 
open(tpath, 'w') 95 | fds = [open(ff) for ff in files] 96 | for l in fds[0]: 97 | lines = [l.strip()] + [ff.readline().strip() for ff in fds[1:]] 98 | print >>tf, "|||".join(lines) 99 | [ff.close() for ff in fds] 100 | tf.close() 101 | tf = open(tpath, 'r') 102 | lines = tf.readlines() 103 | random.shuffle(lines) 104 | fds = [open(ff+'.reshuf','w') for ff in files] 105 | for l in lines: 106 | s = l.strip().split('|||') 107 | for ii, fd in enumerate(fds): 108 | print >>fd, s[ii] 109 | [ff.close() for ff in fds] 110 | os.remove(tpath) 111 | return 112 | 113 | def next(self): 114 | # if end_of_data reaches, stop for loop 115 | if self.end_of_data: 116 | self.end_of_data = False 117 | self.reset() 118 | raise StopIteration 119 | 120 | sources = [[],[],[],[]] 121 | targets = [[],[],[],[]] 122 | # NOTE : this is the data to be used for "this" round of updates 123 | 124 | # fill buffer, if it's empty 125 | for idx in xrange(4): 126 | assert len(self.source_buffers[idx]) == len(self.target_buffers[idx]), 'Buffer size mismatch!' 127 | 128 | for idx in xrange(4): 129 | 130 | # NOTE : in buffer: don't put the whole dataset in... only for 'k' many updates 131 | # after 'k' updates, self.source_buffers[idx] will be empty, in which case we will put new things in 132 | 133 | #if len(self.source_buffers[idx]) == 0: 134 | if len(self.source_buffers[idx]) < self.batch_sizes[idx]: 135 | for k_ in xrange(self.k[idx]): 136 | 137 | ss = self.sources[idx].readline() 138 | # NOTE: self.sources is where we keep the RAW data 139 | if ss == "": 140 | break 141 | if self.source_word_level: 142 | ss = ss.strip().split() 143 | else: 144 | ss = ss.strip() 145 | ss = list(ss.decode('utf8')) 146 | self.source_buffers[idx].append(ss) 147 | 148 | tt = self.targets[idx].readline() 149 | if tt == "": 150 | break 151 | if self.target_word_level: 152 | tt = tt.strip().split() 153 | else: 154 | tt = tt.strip() 155 | tt = list(tt.decode('utf8')) 156 | self.target_buffers[idx].append(tt) 157 | 158 | tlen = numpy.array([len(t) for t in self.target_buffers[idx]]) 159 | tidx = tlen.argsort() 160 | _sbuf = [self.source_buffers[idx][i] for i in tidx] 161 | _tbuf = [self.target_buffers[idx][i] for i in tidx] 162 | self.target_buffers[idx] = _tbuf 163 | self.source_buffers[idx] = _sbuf 164 | 165 | stop = False 166 | for idx in xrange(4): 167 | if len(self.source_buffers[idx]) < self.batch_sizes[idx]: 168 | stop = True 169 | 170 | if stop: 171 | self.end_of_data = False 172 | self.reset() 173 | raise StopIteration 174 | 175 | try: 176 | # actual work here 177 | for idx in xrange(4): 178 | while True: 179 | # read from source file and map to word index 180 | try: 181 | ss_ = self.source_buffers[idx].pop() 182 | except IndexError: 183 | # NOTE : just because source_buffers is empty, doesn't mean file scanned 184 | # we do add partial batches. 
We proceed until len(source_buffers) = 0 185 | break 186 | 187 | ss = [self.source_dict[w] if w in self.source_dict else 1 for w in ss_] 188 | if self.n_words_source > 0: 189 | ss = [w if w < self.n_words_source else 1 for w in ss] 190 | 191 | # NOTE : prepending and appending with SOS and EOS symbols 192 | ss = [2] + ss + [3] 193 | sources[idx].append(ss) 194 | 195 | tt_ = self.target_buffers[idx].pop() 196 | tt = [self.target_dict[w] if w in self.target_dict else 1 for w in tt_] 197 | if self.n_words_target > 0: 198 | tt = [w if w < self.n_words_target else 1 for w in tt] 199 | targets[idx].append(tt) 200 | 201 | if len(sources[idx]) >= self.batch_sizes[idx]: 202 | break 203 | 204 | except IOError: 205 | self.end_of_data = True 206 | 207 | source = sources[0] + sources[1] + sources[2] + sources[3] 208 | target = targets[0] + targets[1] + targets[2] + targets[3] 209 | 210 | # NOTE : just add anything, if still nothing, reset 211 | min_batch_size = numpy.sum(self.batch_sizes) / float(1.0) 212 | if len(source) < min_batch_size or len(target) < min_batch_size: 213 | self.end_of_data = False 214 | self.reset() 215 | raise StopIteration 216 | 217 | return source, target 218 | -------------------------------------------------------------------------------- /char2char/prepare_data.py: -------------------------------------------------------------------------------- 1 | import theano 2 | from theano import tensor 3 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 4 | 5 | import cPickle 6 | import numpy 7 | import copy 8 | 9 | import os 10 | import warnings 11 | import sys 12 | import time 13 | 14 | from conv_tools import * 15 | 16 | from collections import OrderedDict 17 | from mixer import * 18 | 19 | # batch preparation for char2char models 20 | def prepare_data(seqs_x, seqs_y, pool_stride, maxlen=None, maxlen_trg=None): 21 | # x: a list of sentences 22 | lengths_x = [len(s) for s in seqs_x] 23 | lengths_y = [len(s) for s in seqs_y] 24 | 25 | if maxlen is not None: 26 | new_seqs_x = [] 27 | new_seqs_y = [] 28 | new_lengths_x = [] 29 | new_lengths_y = [] 30 | for l_x, s_x, l_y, s_y in zip(lengths_x, seqs_x, lengths_y, seqs_y): 31 | if l_x < maxlen and l_y < maxlen_trg: 32 | new_seqs_x.append(s_x) 33 | new_lengths_x.append(l_x) 34 | new_seqs_y.append(s_y) 35 | new_lengths_y.append(l_y) 36 | lengths_x = new_lengths_x 37 | seqs_x = new_seqs_x 38 | lengths_y = new_lengths_y 39 | seqs_y = new_seqs_y 40 | 41 | if len(lengths_x) < 1 or len(lengths_y) < 1: 42 | return None, None, None, None, None 43 | 44 | # n_samples is not always equal to batch_size, can be smaller! 45 | n_samples = len(seqs_x) 46 | maxlen_x = numpy.max(lengths_x) # SOS, EOS symbols are already added in data_iterator.py, hence no extra trick here. 47 | maxlen_y = numpy.max(lengths_y) + 1 # account for EOS symbol at the end of the target sentence. 48 | 49 | maxlen_x_pad = int( numpy.ceil( maxlen_x / float(pool_stride) ) * pool_stride ) 50 | # 1st round padding, such that the length is a multiple of pool_stride 51 | 52 | x = numpy.zeros((maxlen_x_pad + 2*pool_stride, n_samples)).astype('int64') 53 | # 2nd round padding at the beginning & the end for consistency, because theano's "half convolution" pads with zero-vectors by default. We want to ensure we don't pad with actual zero vectors, but rather with PAD embeddings. This is for consistency. 
For more information, consult http://deeplearning.net/software/theano/library/tensor/nnet/conv.html#theano.tensor.nnet.conv2d 54 | 55 | y = numpy.zeros((maxlen_y, n_samples)).astype('int64') 56 | x_mask = numpy.zeros((maxlen_x_pad, n_samples)).astype('float32') 57 | 58 | y_mask = numpy.zeros((maxlen_y, n_samples)).astype('float32') 59 | for idx, [s_x, s_y] in enumerate(zip(seqs_x, seqs_y)): 60 | x[ pool_stride : pool_stride + lengths_x[idx], idx] = s_x 61 | x_mask[:lengths_x[idx], idx] = 1. 62 | 63 | y[:lengths_y[idx], idx] = s_y 64 | y_mask[:lengths_y[idx]+1, idx] = 1. 65 | 66 | x_m = conv_mask_pool(x_mask, pool_stride) 67 | # x_m.shape = (maxlen_x_pad/pool_stride, n_samples) 68 | # x_m is used as masks at the GRU layer, note its length is reduced by pool_stride. 69 | 70 | return x, x_m, y, y_mask, n_samples 71 | -------------------------------------------------------------------------------- /char2char/print_batch.py: -------------------------------------------------------------------------------- 1 | import pprint 2 | import sys 3 | import numpy as np 4 | 5 | def pbatch(source, dic): 6 | ss = np.transpose(source) 7 | for line in ss[:10]: 8 | for word in line: 9 | a = dic[word] 10 | b = a 11 | 12 | if a == "SOS": 13 | b = "{" 14 | elif a == "EOS": 15 | b = "}" 16 | elif a == "ZERO": 17 | b = "_" 18 | elif a == "UNK": 19 | b = "|" 20 | 21 | sys.stdout.write(b) 22 | print " " 23 | print "" 24 | 25 | def pbatch_many(source, dic, n_x): 26 | ss = np.transpose(source) 27 | iis = [0, 20, n_x-8,n_x-1] 28 | 29 | for ii in iis: 30 | line = ss[ii] 31 | for word in line: 32 | a = dic[word] 33 | b = a 34 | 35 | if a == "SOS": 36 | b = "{" 37 | elif a == "EOS": 38 | b = "}" 39 | elif a == "ZERO": 40 | b = "_" 41 | elif a == "UNK": 42 | b = "|" 43 | 44 | sys.stdout.write(b) 45 | print " " 46 | print "" 47 | -------------------------------------------------------------------------------- /char2char/train_bi_char2char.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | import string 5 | from collections import OrderedDict 6 | from wmt_path import * 7 | from char_base import * 8 | from nmt import train 9 | from conv_tools import * 10 | from prepare_data import * 11 | 12 | def main(job_id, args): 13 | save_file_name = args.model_name 14 | source_dataset = args.data_path + wmts[args.translate]['train'][0][0] 15 | target_dataset = args.data_path + wmts[args.translate]['train'][0][1] 16 | valid_source_dataset = args.data_path + wmts[args.translate]['dev'][0][0] 17 | valid_target_dataset = args.data_path + wmts[args.translate]['dev'][0][1] 18 | source_dictionary = args.data_path + wmts[args.translate]['dic'][0][0] 19 | target_dictionary = args.data_path + wmts[args.translate]['dic'][0][1] 20 | 21 | print args.model_path, save_file_name 22 | print source_dataset 23 | print target_dataset 24 | print valid_source_dataset 25 | print valid_target_dataset 26 | print source_dictionary 27 | print target_dictionary 28 | validerr = train( 29 | highway=args.highway, 30 | 31 | max_epochs=args.max_epochs, 32 | patience=args.patience, 33 | 34 | dim_word_src=args.dim_word_src, 35 | dim_word=args.dim_word, 36 | 37 | conv_width=args.conv_width, 38 | conv_nkernels=args.conv_nkernels, 39 | 40 | pool_window=args.pool_window, 41 | pool_stride=args.pool_stride, 42 | 43 | model_path=args.model_path, 44 | save_file_name=save_file_name, 45 | re_load=args.re_load, 46 | re_load_old_setting=args.re_load_old_setting, 47 | 48 | enc_dim=args.enc_dim, 49 | 
dec_dim=args.dec_dim, 50 | 51 | n_words_src=args.n_words_src, 52 | n_words=args.n_words, 53 | decay_c=args.decay_c, 54 | lrate=args.learning_rate, 55 | optimizer=args.optimizer, 56 | maxlen=args.maxlen, 57 | maxlen_trg=args.maxlen_trg, 58 | maxlen_sample=args.maxlen_sample, 59 | batch_size=args.batch_size, 60 | valid_batch_size=args.valid_batch_size, 61 | sort_size=args.sort_size, 62 | validFreq=args.validFreq, 63 | dispFreq=args.dispFreq, 64 | saveFreq=args.saveFreq, 65 | sampleFreq=args.sampleFreq, 66 | pbatchFreq=args.pbatchFreq, 67 | clip_c=args.clip_c, 68 | 69 | datasets=[source_dataset, target_dataset], 70 | valid_datasets=[valid_source_dataset, valid_target_dataset], 71 | dictionaries=[source_dictionary, target_dictionary], 72 | 73 | dropout_gru=args.dropout_gru, 74 | dropout_softmax=args.dropout_softmax, 75 | source_word_level=args.source_word_level, 76 | target_word_level=args.target_word_level, 77 | save_every_saveFreq=1, 78 | use_bpe=0, 79 | quit_immediately=args.quit_immediately, 80 | init_params=init_params, 81 | build_model=build_model, 82 | build_sampler=build_sampler, 83 | gen_sample=gen_sample, 84 | prepare_data=prepare_data, 85 | ) 86 | return validerr 87 | 88 | if __name__ == '__main__': 89 | 90 | import sys, time 91 | 92 | parser = argparse.ArgumentParser() 93 | parser.add_argument('-translate', type=str, default="de_en", help="de_en / cs_en / fi_en / ru_en") 94 | parser.add_argument('-highway', type=int, default=4) 95 | 96 | parser.add_argument('-conv_width', type=str, default="1-2-3-4-5-6-7-8") 97 | parser.add_argument('-conv_nkernels', type=str, default="200-200-250-250-300-300-300-300") 98 | 99 | parser.add_argument('-pool_window', type=int, default=5) 100 | parser.add_argument('-pool_stride', type=int, default=5) 101 | 102 | parser.add_argument('-enc_dim', type=int, default=512) 103 | parser.add_argument('-dec_dim', type=int, default=1024) 104 | 105 | parser.add_argument('-dim_word', type=int, default=512) 106 | parser.add_argument('-dim_word_src', type=int, default=128) 107 | 108 | parser.add_argument('-batch_size', type=int, default=64, help="") 109 | parser.add_argument('-valid_batch_size', type=int, default=64, help="") 110 | 111 | parser.add_argument('-dropout_gru', type=int, default=0, help="") 112 | parser.add_argument('-dropout_softmax', type=int, default=0, help="") 113 | 114 | parser.add_argument('-maxlen', type=int, default=450, help="") 115 | parser.add_argument('-maxlen_trg', type=int, default=500, help="") 116 | parser.add_argument('-maxlen_sample', type=int, default=500, help="") 117 | 118 | parser.add_argument('-re_load', action="store_true", default=False) 119 | parser.add_argument('-re_load_old_setting', action="store_true", default=False) 120 | parser.add_argument('-quit_immediately', action="store_true", default=False, help="if true, will not proceed training, only print the size of the model.") 121 | 122 | parser.add_argument('-max_epochs', type=int, default=1000000000000, help="") 123 | parser.add_argument('-patience', type=int, default=-1, help="") 124 | parser.add_argument('-learning_rate', type=float, default=0.0001, help="") 125 | 126 | parser.add_argument('-n_words_src', type=int, default=304, help="298 for FI-EN") 127 | parser.add_argument('-n_words', type=int, default=302, help="292 for FI-EN") 128 | 129 | parser.add_argument('-optimizer', type=str, default="adam", help="") 130 | parser.add_argument('-decay_c', type=int, default=0, help="") 131 | parser.add_argument('-clip_c', type=int, default=1, help="") 132 | 133 | 
parser.add_argument('-saveFreq', type=int, default=5000, help="") 134 | parser.add_argument('-sampleFreq', type=int, default=5000, help="") 135 | parser.add_argument('-dispFreq', type=int, default=1000, help="") 136 | parser.add_argument('-validFreq', type=int, default=5000, help="") 137 | parser.add_argument('-pbatchFreq', type=int, default=5000, help="") 138 | parser.add_argument('-sort_size', type=int, default=20, help="") 139 | 140 | parser.add_argument('-source_word_level', type=int, default=0, help="") 141 | parser.add_argument('-target_word_level', type=int, default=0, help="") 142 | 143 | args = parser.parse_args() 144 | 145 | if args.translate == "fi_en": 146 | args.n_words_src = 304 147 | args.n_words = 302 148 | 149 | if args.translate not in "de_en cs_en fi_en ru_en".split(): 150 | raise Exception('1') 151 | 152 | args.model_name = "bi-char2char" 153 | 154 | args.conv_width = [ int(x) for x in args.conv_width.split("-") ] 155 | args.conv_nkernels = [ int(x) for x in args.conv_nkernels.split("-") ] 156 | 157 | args.model_path = "/misc/kcgscratch1/ChoGroup/jasonlee/dl4mt-c2c/models/" # change accordingly 158 | args.data_path = "/misc/kcgscratch1/ChoGroup/jasonlee/temp_data/wmt15/" # change accordingly 159 | args.model_path = args.model_path + args.translate + "/" 160 | 161 | print "Model path:", args.model_path 162 | 163 | print args 164 | main(0, args) 165 | -------------------------------------------------------------------------------- /char2char/train_multi_char2char.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | import string 5 | from collections import OrderedDict 6 | from wmt_path_iso9 import * 7 | from char_base import * 8 | from nmt_many import train 9 | from conv_tools import * 10 | from prepare_data import * 11 | 12 | def main(job_id, args): 13 | save_file_name = args.model_name 14 | langs = "de_en cs_en fi_en ru_en".split() 15 | source_dataset = [] 16 | target_dataset = [] 17 | valid_source_dataset = [] 18 | valid_target_dataset = [] 19 | 20 | for lang in langs: 21 | source_dataset.append(args.data_path + wmts[lang]['train'][0][0]) 22 | target_dataset.append(args.data_path + wmts[lang]['train'][0][1]) 23 | valid_source_dataset.append(args.data_path + wmts[lang]['dev'][0][0]) 24 | valid_target_dataset.append(args.data_path + wmts[lang]['dev'][0][1]) 25 | 26 | source_dictionary = args.data_path + wmts[args.translate]['dic'][0][0] 27 | target_dictionary = args.data_path + wmts[args.translate]['dic'][0][1] 28 | 29 | print args.model_path, save_file_name 30 | print source_dataset 31 | print target_dataset 32 | print valid_source_dataset 33 | print valid_target_dataset 34 | print source_dictionary 35 | print target_dictionary 36 | validerr = train( 37 | highway=args.highway, 38 | 39 | max_epochs=args.max_epochs, 40 | patience=args.patience, 41 | 42 | dim_word_src=args.dim_word_src, 43 | dim_word=args.dim_word, 44 | 45 | conv_width=args.conv_width, 46 | conv_nkernels=args.conv_nkernels, 47 | 48 | pool_window=args.pool_window, 49 | pool_stride=args.pool_stride, 50 | 51 | model_path=args.model_path, 52 | save_file_name=save_file_name, 53 | re_load=args.re_load, 54 | re_load_old_setting=args.re_load_old_setting, 55 | 56 | enc_dim=args.enc_dim, 57 | dec_dim=args.dec_dim, 58 | 59 | n_words_src=args.n_words_src, 60 | n_words=args.n_words, 61 | decay_c=args.decay_c, 62 | lrate=args.learning_rate, 63 | optimizer=args.optimizer, 64 | maxlen=args.maxlen, 65 | maxlen_trg=args.maxlen_trg, 66 | 
maxlen_sample=args.maxlen_sample, 67 | batch_size=args.train_batch_size, 68 | valid_batch_size=args.valid_batch_size, 69 | sort_size=args.sort_size, 70 | validFreq=args.validFreq, 71 | dispFreq=args.dispFreq, 72 | saveFreq=args.saveFreq, 73 | sampleFreq=args.sampleFreq, 74 | pbatchFreq=args.pbatchFreq, 75 | clip_c=args.clip_c, 76 | 77 | datasets=[source_dataset, target_dataset], 78 | valid_datasets=[[s,t] for s,t in zip(valid_source_dataset, valid_target_dataset)], 79 | dictionaries=[source_dictionary, target_dictionary], 80 | 81 | dropout_gru=args.dropout_gru, 82 | dropout_softmax=args.dropout_softmax, 83 | source_word_level=args.source_word_level, 84 | target_word_level=args.target_word_level, 85 | save_every_saveFreq=1, 86 | use_bpe=0, 87 | quit_immediately=args.quit_immediately, 88 | init_params=init_params, 89 | build_model=build_model, 90 | build_sampler=build_sampler, 91 | gen_sample=gen_sample, 92 | prepare_data=prepare_data, 93 | ) 94 | return validerr 95 | 96 | if __name__ == '__main__': 97 | 98 | import sys, time 99 | 100 | parser = argparse.ArgumentParser() 101 | parser.add_argument('-translate', type=str, default="many_en") 102 | parser.add_argument('-highway', type=int, default=4) 103 | 104 | parser.add_argument('-conv_width', type=str, default="1-2-3-4-5-6-7-8") 105 | parser.add_argument('-conv_nkernels', type=str, default="200-250-300-300-400-400-400-400") 106 | 107 | parser.add_argument('-pool_window', type=int, default=5) 108 | parser.add_argument('-pool_stride', type=int, default=5) 109 | 110 | parser.add_argument('-enc_dim', type=int, default=512) 111 | parser.add_argument('-dec_dim', type=int, default=1024) 112 | 113 | parser.add_argument('-dim_word', type=int, default=512) 114 | parser.add_argument('-dim_word_src', type=int, default=128) 115 | 116 | parser.add_argument('-dropout_gru', type=int, default=0, help="") 117 | parser.add_argument('-dropout_softmax', type=int, default=0, help="") 118 | 119 | parser.add_argument('-maxlen', type=int, default=400, help="") 120 | parser.add_argument('-maxlen_trg', type=int, default=500, help="") 121 | parser.add_argument('-maxlen_sample', type=int, default=500, help="") 122 | 123 | parser.add_argument('-train_batch_size', type=str,) 124 | parser.add_argument('-valid_batch_size', type=int, default=60, help="") 125 | parser.add_argument('-batch_size', type=int, default=60, help="") 126 | 127 | parser.add_argument('-re_load', action="store_true", default=False) 128 | parser.add_argument('-re_load_old_setting', action="store_true", default=False) 129 | parser.add_argument('-quit_immediately', action="store_true", default=False) 130 | 131 | parser.add_argument('-max_epochs', type=int, default=1000000000000, help="") 132 | parser.add_argument('-patience', type=int, default=-1, help="") 133 | parser.add_argument('-learning_rate', type=float, default=0.0001, help="") 134 | 135 | parser.add_argument('-n_words_src', type=int, default=404, help="") 136 | parser.add_argument('-n_words', type=int, default=402, help="") 137 | 138 | parser.add_argument('-optimizer', type=str, default="adam", help="") 139 | parser.add_argument('-decay_c', type=int, default=0, help="") 140 | parser.add_argument('-clip_c', type=int, default=1, help="") 141 | 142 | parser.add_argument('-saveFreq', type=int, default=5000, help="") 143 | parser.add_argument('-sampleFreq', type=int, default=5000, help="") 144 | parser.add_argument('-dispFreq', type=int, default=1000, help="") 145 | parser.add_argument('-validFreq', type=int, default=5000, help="") 146 | 
parser.add_argument('-pbatchFreq', type=int, default=5000, help="") 147 | parser.add_argument('-sort_size', type=int, default=20, help="") 148 | 149 | parser.add_argument('-source_word_level', type=int, default=0, help="") 150 | parser.add_argument('-target_word_level', type=int, default=0, help="") 151 | 152 | args = parser.parse_args() 153 | 154 | if args.translate != "many_en": 155 | raise Exception('1') 156 | 157 | args.train_batch_size = [ 14, 37, 6, 7 ] 158 | 159 | args.model_name = "multi-char2char" 160 | 161 | args.conv_width = [ int(x) for x in args.conv_width.split("-") ] 162 | args.conv_nkernels = [ int(x) for x in args.conv_nkernels.split("-") ] 163 | 164 | args.model_path = "/misc/kcgscratch1/ChoGroup/jasonlee/dl4mt-c2c/models/" # change accordingly 165 | args.data_path = "/misc/kcgscratch1/ChoGroup/jasonlee/temp_data/multi-wmt15/" # change accordingly 166 | args.model_path = args.model_path + args.translate + "/" 167 | 168 | print "Model path:", args.model_path 169 | 170 | print args 171 | main(0, args) 172 | -------------------------------------------------------------------------------- /char2char/wmt_path.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # Paths to training / valid / test corpus & dictionaries 3 | # For the bilingual models 4 | 5 | deen={ 6 | "dic": [ 7 | ["deen/train/all_de-en.de.tok.304.pkl", 8 | "deen/train/all_de-en.en.tok.300.pkl",], 9 | 10 | ["deen/train/all_de-en.de.tok.bpe.word.pkl"], 11 | ], 12 | 13 | "train": [ 14 | ["deen/train/all_de-en.de.tok.shuf", 15 | "deen/train/all_de-en.en.tok.shuf",], 16 | 17 | ["deen/train/all_de-en.de.tok.bpe.shuf", 18 | "deen/train/all_de-en.en.tok.bpe.shuf",], 19 | ], 20 | 21 | "dev": [ 22 | ["deen/dev/newstest2013.de.tok", 23 | "deen/dev/newstest2013.en.tok",], 24 | 25 | ["deen/dev/newstest2013.de.tok.bpe", 26 | "deen/dev/newstest2013.en.tok.bpe",], 27 | ], 28 | 29 | "test1" :[ 30 | ["deen/test/newstest2014-deen-ref.de.tok", 31 | "deen/test/newstest2014-deen-src.en.tok",], 32 | 33 | ["deen/test/newstest2014-deen-ref.de.tok.bpe", 34 | "deen/test/newstest2014-deen-src.en.tok.bpe",], 35 | ], 36 | 37 | "test2":[ 38 | ["deen/test/newstest2015-deen-ref.de.tok", 39 | "deen/test/newstest2015-deen-src.en.tok",], 40 | 41 | ["deen/test/newstest2015-deen-ref.de.tok.bpe", 42 | "deen/test/newstest2015-deen-src.en.tok.bpe",], 43 | ], 44 | } 45 | 46 | csen={ 47 | 48 | "dic":[ 49 | ["csen/train/all_cs-en.cs.tok.304.pkl", 50 | "csen/train/all_cs-en.en.tok.300.pkl",], 51 | 52 | ["csen/train/all_cs-en.cs.tok.bpe.word.pkl"], 53 | ], 54 | 55 | "train":[ 56 | ["csen/train/all_cs-en.cs.tok", 57 | "csen/train/all_cs-en.en.tok",], 58 | 59 | ["csen/train/all_cs-en.cs.tok.bpe", 60 | "csen/train/all_cs-en.en.tok.bpe",], 61 | ], 62 | 63 | "dev": [ 64 | ["csen/dev/newstest2013-ref.cs.tok", 65 | "csen/dev/newstest2013-src.en.tok",], 66 | 67 | ["csen/dev/newstest2013-ref.cs.tok.bpe", 68 | "csen/dev/newstest2013-src.en.tok.bpe",], 69 | ], 70 | 71 | "test1":[ 72 | ["csen/test/newstest2014-csen-ref.cs.tok", 73 | "csen/test/newstest2014-csen-src.en.tok",], 74 | 75 | ["csen/test/newstest2014-csen-ref.cs.tok.bpe", 76 | "csen/test/newstest2014-csen-src.en.tok.bpe",], 77 | ], 78 | 79 | "test2":[ 80 | ["csen/test/newstest2015-csen-ref.cs.tok", 81 | "csen/test/newstest2015-csen-src.en.tok",], 82 | 83 | ["csen/test/newstest2015-csen-ref.cs.tok.bpe", 84 | "csen/test/newstest2015-csen-src.en.tok.bpe",], 85 | ] 86 | } 87 | 88 | fien={ 89 | "dic":[ 90 | ["fien/train/all_fi-en.fi.tok.304.pkl", 91 | 
"fien/train/all_fi-en.en.tok.300.pkl",], 92 | 93 | ["fien/train/all_fi-en.fi.tok.bpe.word.pkl"], 94 | ], 95 | 96 | "train":[ 97 | ["fien/train/all_fi-en.fi.tok", 98 | "fien/train/all_fi-en.en.tok",], 99 | 100 | ["fien/train/all_fi-en.fi.tok.bpe", 101 | "fien/train/all_fi-en.en.tok.bpe",], 102 | ], 103 | 104 | "dev":[ 105 | ["fien/dev/newsdev2015-enfi-ref.fi.tok", 106 | "fien/dev/newsdev2015-enfi-src.en.tok",], 107 | 108 | ["fien/dev/newsdev2015-enfi-ref.fi.tok.bpe", 109 | "fien/dev/newsdev2015-enfi-src.en.tok.bpe",], 110 | ], 111 | 112 | "test1":[ 113 | ["fien/test/newstest2015-fien-ref.fi.tok", 114 | "fien/test/newstest2015-fien-src.en.tok",], 115 | 116 | ["fien/test/newstest2015-fien-ref.fi.tok.bpe", 117 | "fien/test/newstest2015-fien-src.en.tok.bpe",], 118 | ], 119 | } 120 | 121 | ruen={ 122 | 123 | "dic":[ 124 | ["ruen/train/all_ru-en.ru.tok.304.pkl", 125 | "ruen/train/all_ru-en.en.tok.300.pkl",], 126 | 127 | ["ruen/train/all_ru-en.ru.tok.bpe.word.pkl"], 128 | ], 129 | 130 | "train":[ 131 | ["ruen/train/all_ru-en.ru.tok", 132 | "ruen/train/all_ru-en.en.tok",], 133 | 134 | ["ruen/train/all_ru-en.ru.tok.bpe", 135 | "ruen/train/all_ru-en.en.tok.bpe",], 136 | ], 137 | 138 | "dev":[ 139 | ["ruen/dev/newstest2013-ref.ru.tok", 140 | "ruen/dev/newstest2013-src.en.tok",], 141 | 142 | ["ruen/dev/newstest2013-ref.ru.tok.bpe", 143 | "ruen/dev/newstest2013-src.en.tok.bpe",], 144 | ], 145 | 146 | "test1":[ 147 | ["ruen/test/newstest2014-ruen-ref.ru.tok", 148 | "ruen/test/newstest2014-ruen-src.en.tok",], 149 | 150 | ["ruen/test/newstest2014-ruen-ref.ru.tok.bpe", 151 | "ruen/test/newstest2014-ruen-src.en.tok.bpe",], 152 | ], 153 | 154 | "test2":[ 155 | ["ruen/test/newstest2015-ruen-ref.ru.tok", 156 | "ruen/test/newstest2015-ruen-src.en.tok",], 157 | 158 | ["ruen/test/newstest2015-ruen-ref.ru.tok.bpe", 159 | "ruen/test/newstest2015-ruen-src.en.tok.bpe",], 160 | ] 161 | } 162 | 163 | manyen = { 164 | "dic":[ 165 | ["char-source-for-dic.300.pkl", 166 | "char-target-for-dic.300.pkl"], 167 | 168 | ["bpe-source-for-dic.word.pkl"] 169 | ] 170 | } 171 | 172 | wmts = dict() 173 | wmts["de_en"] = deen 174 | wmts["cs_en"] = csen 175 | wmts["fi_en"] = fien 176 | wmts["ru_en"] = ruen 177 | wmts["many_en"] = manyen 178 | -------------------------------------------------------------------------------- /char2char/wmt_path_iso9.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # Paths to training / valid / test corpus & dictionaries 3 | # For the multilingual models (russian characters converted to latin using iso9) 4 | 5 | deen={ 6 | "dic": [ 7 | ["deen/train/all_de-en.de.tok.shuf.iso9.304.pkl", 8 | "dic/target.402.pkl"], 9 | 10 | ["deen/train/all_de-en.de.tok.shuf.iso9.bpe.24111.word.pkl"], 11 | ], 12 | 13 | "train": [ 14 | ["deen/train/all_de-en.de.tok.shuf.iso9", 15 | "deen/train/all_de-en.en.tok.shuf.iso9",], 16 | 17 | ["deen/train/all_de-en.de.tok.shuf.iso9.bpe.50000", 18 | "deen/train/all_de-en.de.tok.shuf.iso9.bpe.20000"], 19 | ], 20 | 21 | "dev": [ 22 | ["deen/dev/newstest2013.de.tok.iso9", 23 | "deen/dev/newstest2013.en.tok.iso9",], 24 | 25 | ["deen/dev/newstest2013.de.tok.iso9.bpe.50000", 26 | "deen/dev/newstest2013.de.tok.iso9.bpe.20000"], 27 | ], 28 | 29 | "test1" :[ 30 | ["deen/test/newstest2014-deen-ref.de.tok.iso9", 31 | "deen/test/newstest2014-deen-src.en.tok.iso9",], 32 | 33 | ["deen/test/newstest2014-deen-ref.de.tok.iso9.bpe.50000", 34 | "deen/test/newstest2014-deen-ref.de.tok.iso9.bpe.20000"], 35 | ], 36 | 37 | "test2":[ 38 | 
["deen/test/newstest2015-deen-src.de.tok.iso9", 39 | "deen/test/newstest2015-deen-src.en.tok.iso9",], 40 | 41 | ["deen/test/newstest2015-deen-src.de.tok.iso9.bpe.50000", 42 | "deen/test/newstest2015-deen-src.de.tok.iso9.bpe.20000"], 43 | ], 44 | } 45 | 46 | csen={ 47 | "dic": [ 48 | ["csen/train/all_cs-en.cs.tok.iso9.304.pkl", 49 | "dic/target.402.pkl"], 50 | 51 | ["csen/train/all_cs-en.cs.tok.iso9.bpe.21697.word.pkl"], 52 | ], 53 | 54 | "train":[ 55 | ["csen/train/all_cs-en.cs.tok.iso9", 56 | "csen/train/all_cs-en.en.tok.iso9",], 57 | 58 | ["csen/train/all_cs-en.cs.tok.iso9.bpe.50000", 59 | "csen/train/all_cs-en.cs.tok.iso9.bpe.20000"], 60 | ], 61 | 62 | "dev": [ 63 | ["csen/dev/newstest2013-ref.cs.tok.iso9", 64 | "csen/dev/newstest2013-src.en.tok.iso9",], 65 | 66 | ["csen/dev/newstest2013-ref.cs.tok.iso9.bpe.50000", 67 | "csen/dev/newstest2013-ref.cs.tok.iso9.bpe.20000"], 68 | ], 69 | 70 | "test1":[ 71 | ["csen/test/newstest2014-csen-ref.cs.tok.iso9", 72 | "csen/test/newstest2014-csen-src.en.tok.iso9",], 73 | 74 | ["csen/test/newstest2014-csen-ref.cs.tok.iso9.bpe.50000", 75 | "csen/test/newstest2014-csen-ref.cs.tok.iso9.bpe.20000"], 76 | ], 77 | 78 | "test2":[ 79 | ["csen/test/newstest2015-csen-ref.cs.tok.iso9", 80 | "csen/test/newstest2015-csen-src.en.tok.iso9",], 81 | 82 | ["csen/test/newstest2015-csen-ref.cs.tok.iso9.bpe.50000", 83 | "csen/test/newstest2015-csen-ref.cs.tok.iso9.bpe.20000"], 84 | ] 85 | } 86 | 87 | fien={ 88 | "dic": [ 89 | ["fien/train/all_fi-en.fi.tok.shuf.iso9.269.pkl", 90 | "dic/target.402.pkl"], 91 | 92 | ["fien/train/all_fi-en.fi.tok.shuf.iso9.bpe.20747.word.pkl"], 93 | ], 94 | 95 | "train":[ 96 | ["fien/train/all_fi-en.fi.tok.shuf.iso9", 97 | "fien/train/all_fi-en.en.tok.shuf.iso9",], 98 | 99 | ["fien/train/all_fi-en.fi.tok.shuf.iso9.bpe.50000", 100 | "fien/train/all_fi-en.fi.tok.shuf.iso9.bpe.20000"], 101 | ], 102 | 103 | "dev":[ 104 | ["fien/dev/newsdev2015-enfi-ref.fi.tok.iso9", 105 | "fien/dev/newsdev2015-enfi-src.en.tok.iso9",], 106 | 107 | ["fien/dev/newsdev2015-enfi-ref.fi.tok.iso9.bpe.50000", 108 | "fien/dev/newsdev2015-enfi-ref.fi.tok.iso9.bpe.20000"], 109 | ], 110 | 111 | "test1":[ 112 | ["fien/test/newstest2015-fien-ref.fi.tok.iso9", 113 | "fien/test/newstest2015-fien-src.en.tok.iso9",], 114 | 115 | ["fien/test/newstest2015-fien-ref.fi.tok.iso9.bpe.50000", 116 | "fien/test/newstest2015-fien-ref.fi.tok.iso9.bpe.20000"], 117 | ], 118 | } 119 | 120 | ruen={ 121 | "dic": [ 122 | ["ruen/train/all_ru-en.ru.tok.iso9.304.pkl", 123 | "dic/target.402.pkl"], 124 | 125 | ["ruen/train/all_ru-en.ru.tok.iso9.bpe.21995.word.pkl"], 126 | ], 127 | 128 | "train":[ 129 | ["ruen/train/all_ru-en.ru.tok.iso9", 130 | "ruen/train/all_ru-en.en.tok.iso9",], 131 | 132 | ["ruen/train/all_ru-en.ru.tok.iso9.bpe.50000", 133 | "ruen/train/all_ru-en.ru.tok.iso9.bpe.20000"], 134 | ], 135 | 136 | "dev":[ 137 | ["ruen/dev/newstest2013-ref.ru.tok.iso9", 138 | "ruen/dev/newstest2013-src.en.tok.iso9",], 139 | 140 | ["ruen/dev/newstest2013-ref.ru.tok.iso9.bpe.50000", 141 | "ruen/dev/newstest2013-ref.ru.tok.iso9.bpe.20000"], 142 | ], 143 | 144 | "test1":[ 145 | ["ruen/test/newstest2014-ruen-ref.ru.tok.iso9", 146 | "ruen/test/newstest2014-ruen-src.en.tok.iso9",], 147 | 148 | ["ruen/test/newstest2014-ruen-ref.ru.tok.iso9.bpe.50000", 149 | "ruen/test/newstest2014-ruen-ref.ru.tok.iso9.bpe.20000"], 150 | ], 151 | 152 | "test2":[ 153 | ["ruen/test/newstest2015-ruen-ref.ru.tok.iso9", 154 | "ruen/test/newstest2015-ruen-src.en.tok.iso9",], 155 | 156 | 
["ruen/test/newstest2015-ruen-ref.ru.tok.iso9.bpe.50000", 157 | "ruen/test/newstest2015-ruen-ref.ru.tok.iso9.bpe.20000"], 158 | ] 159 | } 160 | 161 | manyen = { 162 | "dic":[ 163 | ["dic/source.404.pkl", 164 | "dic/target.402.pkl"], 165 | 166 | ["dic/bpe-source-for-dic.word.pkl"] 167 | ] 168 | } 169 | 170 | wmts = dict() 171 | wmts["de_en"] = deen 172 | wmts["cs_en"] = csen 173 | wmts["fi_en"] = fien 174 | wmts["ru_en"] = ruen 175 | wmts["many_en"] = manyen 176 | -------------------------------------------------------------------------------- /preprocess/build_dictionary_char.py: -------------------------------------------------------------------------------- 1 | import cPickle as pkl 2 | import fileinput 3 | import numpy 4 | import sys 5 | import codecs 6 | 7 | from collections import OrderedDict 8 | 9 | def main(filename, short_list, src): 10 | # Build character dictionaries 11 | print 'Processing', filename 12 | word_freqs = OrderedDict() 13 | 14 | with open(filename, 'r') as f: 15 | 16 | for number, line in enumerate(f): 17 | 18 | if number % 20000 == 0: 19 | print 'line', number 20 | 21 | words_in = line.strip() 22 | words_in = list(words_in.decode('utf8')) 23 | 24 | for w in words_in: 25 | if w not in word_freqs: 26 | word_freqs[w] = 0 27 | word_freqs[w] += 1 28 | 29 | print 'count finished' 30 | 31 | words = word_freqs.keys() 32 | freqs = word_freqs.values() 33 | 34 | sorted_idx = numpy.argsort(freqs) 35 | sorted_words = [words[ii] for ii in sorted_idx[::-1]] 36 | 37 | worddict = OrderedDict() 38 | if src: 39 | # 0 -> ZERO 40 | # 1 -> UNK 41 | # 2 -> SOS 42 | # 3 -> EOS 43 | tokens = "ZERO UNK SOS EOS".split() 44 | else: 45 | tokens = "EOS UNK".split() 46 | print tokens 47 | 48 | for ii, aa in enumerate(tokens): 49 | worddict[aa] = ii 50 | print worddict 51 | 52 | if short_list is not None: 53 | for ii in xrange(min(short_list, len(sorted_words))): 54 | worddict[sorted_words[ii]] = ii + len(tokens) 55 | # NOTE : sorted_words 56 | print 'dict finished' 57 | 58 | else: 59 | for ii, ww in enumerate(sorted_words): 60 | worddict[ww] = ii + len(tokens) 61 | 62 | print 'start dump' 63 | with open('%s.%d.pkl' % (filename, short_list+len(tokens)), 'wb') as f: 64 | pkl.dump(worddict, f) 65 | 66 | f.close() 67 | print 'Done' 68 | print len(worddict) 69 | -------------------------------------------------------------------------------- /preprocess/build_dictionary_word.py: -------------------------------------------------------------------------------- 1 | import cPickle as pkl 2 | import fileinput 3 | import numpy 4 | import sys 5 | import codecs 6 | 7 | from collections import OrderedDict 8 | 9 | def main(): 10 | for filename in sys.argv[1:]: 11 | print 'Processing', filename 12 | word_freqs = OrderedDict() 13 | 14 | with open(filename, 'r') as f: 15 | 16 | for number, line in enumerate(f): 17 | 18 | if number % 20000 == 0: 19 | print 'line', number 20 | 21 | words_in = line.strip().split(' ') 22 | for w in words_in: 23 | if w not in word_freqs: 24 | word_freqs[w] = 0 25 | word_freqs[w] += 1 26 | 27 | words = word_freqs.keys() 28 | freqs = word_freqs.values() 29 | 30 | sorted_idx = numpy.argsort(freqs) 31 | sorted_words = [words[ii] for ii in sorted_idx[::-1]] 32 | 33 | worddict = OrderedDict() 34 | worddict['eos'] = 0 35 | worddict['UNK'] = 1 36 | 37 | for ii, ww in enumerate(sorted_words): 38 | worddict[ww] = ii + 2 39 | 40 | with open('%s.word.pkl' % filename, 'wb') as f: 41 | pkl.dump(worddict, f) 42 | 43 | f.close() 44 | print 'Done' 45 | print len(worddict) 46 | 47 | if __name__ == 
'__main__': 48 | main() 49 | -------------------------------------------------------------------------------- /preprocess/clean_tags.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import re 3 | 4 | from_file = sys.argv[1] 5 | to_file = sys.argv[2] 6 | to_file_out = open(to_file, "w") 7 | 8 | regex = "<.*>" 9 | 10 | tag_match = re.compile(regex) 11 | matched_lines = [] 12 | 13 | with open(from_file) as from_file: 14 | content = from_file.readlines() 15 | for line in content: 16 | if (tag_match.match(line)): 17 | pass 18 | else: 19 | matched_lines.append(line) 20 | 21 | matched_lines = "".join(matched_lines) 22 | to_file_out.write(matched_lines) 23 | to_file_out.close() 24 | 25 | -------------------------------------------------------------------------------- /preprocess/fix_appo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # {1} is the directory name 3 | 4 | 5 | for f in ${1}/*.xml 6 | do 7 | cat $f | grep "" | sed "s/’/'/g" | sed "s/“/\"/g" | sed "s/”/\"/g" > ${f}.fixed 8 | done 9 | 10 | -------------------------------------------------------------------------------- /preprocess/iso.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | from collections import OrderedDict 4 | import re 5 | import sys 6 | reload(sys) 7 | sys.setdefaultencoding("utf-8") 8 | 9 | f = open("preprocess/iso9", 'rb') 10 | lines = [line for line in f] 11 | bigru = lines[::4] 12 | smallru = lines[1::4] 13 | bigen = lines[2::4] 14 | smallen = lines[3::4] 15 | iso = OrderedDict() 16 | 17 | for br, sr, be, se in zip(bigru, smallru, bigen, smallen): 18 | iso[br.replace("\n", "")] = be.replace("\n", "") 19 | iso[sr.replace("\n", "")] = se.replace("\n", "") 20 | 21 | def rep(a): 22 | #aa = a.decode('utf-8') 23 | aa = a 24 | for k,v in iso.iteritems(): 25 | aa = aa.replace(k,v) 26 | #aa = aa.replace(k.decode('utf-8'),v.decode('utf-8')) 27 | #return aa.encode('utf-8') 28 | return aa 29 | 30 | if __name__ == '__main__': 31 | filename = sys.argv[1] 32 | rr = open(filename, 'rb') 33 | txt = rr.read() 34 | txt = rep(txt) 35 | ww = open(filename+".iso9", "w") 36 | ww.write(txt) 37 | rr.close() 38 | ww.close() 39 | -------------------------------------------------------------------------------- /preprocess/iso9: -------------------------------------------------------------------------------- 1 | А 2 | а 3 | A 4 | a 5 | Б 6 | б 7 | B 8 | b 9 | В 10 | в 11 | V 12 | v 13 | Г 14 | г 15 | G 16 | g 17 | Д 18 | д 19 | D 20 | d 21 | Е 22 | е 23 | E 24 | e 25 | Ё 26 | ё 27 | Ë 28 | ë 29 | Ж 30 | ж 31 | Ž 32 | ž 33 | З 34 | з 35 | Z 36 | z 37 | И 38 | и 39 | I 40 | i 41 | Й 42 | й 43 | J 44 | j 45 | К 46 | к 47 | K 48 | k 49 | Л 50 | л 51 | L 52 | l 53 | М 54 | м 55 | M 56 | m 57 | Н 58 | н 59 | N 60 | n 61 | О 62 | о 63 | O 64 | o 65 | П 66 | п 67 | P 68 | p 69 | Р 70 | р 71 | R 72 | r 73 | С 74 | с 75 | S 76 | s 77 | Т 78 | т 79 | T 80 | t 81 | У 82 | у 83 | U 84 | u 85 | Ф 86 | ф 87 | F 88 | f 89 | Х 90 | х 91 | H 92 | h 93 | Ц 94 | ц 95 | C 96 | c 97 | Ч 98 | ч 99 | Č 100 | č 101 | Ш 102 | ш 103 | Š 104 | š 105 | Щ 106 | щ 107 | Ŝ 108 | ŝ 109 | Ъ 110 | ъ 111 | ʺ 112 | ʺ 113 | Ы 114 | ы 115 | Y 116 | y 117 | Ь 118 | ь 119 | ʹ 120 | ʹ 121 | Э 122 | э 123 | È 124 | è 125 | Ю 126 | ю 127 | Û 128 | û 129 | Я 130 | я 131 | Â 132 | â 133 | -------------------------------------------------------------------------------- /preprocess/merge.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | SRC=$1 5 | TRG=$2 6 | 7 | FSRC=all_${1}-${2}.${1} 8 | FTRG=all_${1}-${2}.${2} 9 | 10 | echo "" > $FSRC 11 | for F in *${1}-${2}.${1} 12 | do 13 | if [ "$F" = "$FSRC" ]; then 14 | echo "pass" 15 | else 16 | cat $F >> $FSRC 17 | fi 18 | done 19 | 20 | 21 | echo "" > $FTRG 22 | for F in *${1}-${2}.${2} 23 | do 24 | if [ "$F" = "$FTRG" ]; then 25 | echo "pass" 26 | else 27 | cat $F >> $FTRG 28 | fi 29 | done 30 | -------------------------------------------------------------------------------- /preprocess/multi-bleu.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 5 | 6 | # $Id$ 7 | use warnings; 8 | use strict; 9 | 10 | my $lowercase = 0; 11 | if ($ARGV[0] eq "-lc") { 12 | $lowercase = 1; 13 | shift; 14 | } 15 | 16 | my $stem = $ARGV[0]; 17 | if (!defined $stem) { 18 | print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n"; 19 | print STDERR "Reads the references from reference or reference0, reference1, ...\n"; 20 | exit(1); 21 | } 22 | 23 | $stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0"; 24 | 25 | my @REF; 26 | my $ref=0; 27 | while(-e "$stem$ref") { 28 | &add_to_ref("$stem$ref",\@REF); 29 | $ref++; 30 | } 31 | &add_to_ref($stem,\@REF) if -e $stem; 32 | die("ERROR: could not find reference file $stem") unless scalar @REF; 33 | 34 | sub add_to_ref { 35 | my ($file,$REF) = @_; 36 | my $s=0; 37 | open(REF,$file) or die "Can't read $file"; 38 | while() { 39 | chop; 40 | push @{$$REF[$s++]}, $_; 41 | } 42 | close(REF); 43 | } 44 | 45 | my(@CORRECT,@TOTAL,$length_translation,$length_reference); 46 | my $s=0; 47 | while() { 48 | chop; 49 | $_ = lc if $lowercase; 50 | my @WORD = split; 51 | my %REF_NGRAM = (); 52 | my $length_translation_this_sentence = scalar(@WORD); 53 | my ($closest_diff,$closest_length) = (9999,9999); 54 | foreach my $reference (@{$REF[$s]}) { 55 | # print "$s $_ <=> $reference\n"; 56 | $reference = lc($reference) if $lowercase; 57 | my @WORD = split(' ',$reference); 58 | my $length = scalar(@WORD); 59 | my $diff = abs($length_translation_this_sentence-$length); 60 | if ($diff < $closest_diff) { 61 | $closest_diff = $diff; 62 | $closest_length = $length; 63 | # print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n"; 64 | } elsif ($diff == $closest_diff) { 65 | $closest_length = $length if $length < $closest_length; 66 | # from two references with the same closeness to me 67 | # take the *shorter* into account, not the "first" one. 68 | } 69 | for(my $n=1;$n<=4;$n++) { 70 | my %REF_NGRAM_N = (); 71 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 72 | my $ngram = "$n"; 73 | for(my $w=0;$w<$n;$w++) { 74 | $ngram .= " ".$WORD[$start+$w]; 75 | } 76 | $REF_NGRAM_N{$ngram}++; 77 | } 78 | foreach my $ngram (keys %REF_NGRAM_N) { 79 | if (!defined($REF_NGRAM{$ngram}) || 80 | $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) { 81 | $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram}; 82 | # print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}
\n"; 83 | } 84 | } 85 | } 86 | } 87 | $length_translation += $length_translation_this_sentence; 88 | $length_reference += $closest_length; 89 | for(my $n=1;$n<=4;$n++) { 90 | my %T_NGRAM = (); 91 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 92 | my $ngram = "$n"; 93 | for(my $w=0;$w<$n;$w++) { 94 | $ngram .= " ".$WORD[$start+$w]; 95 | } 96 | $T_NGRAM{$ngram}++; 97 | } 98 | foreach my $ngram (keys %T_NGRAM) { 99 | $ngram =~ /^(\d+) /; 100 | my $n = $1; 101 | # my $corr = 0; 102 | # print "$i e $ngram $T_NGRAM{$ngram}
\n"; 103 | $TOTAL[$n] += $T_NGRAM{$ngram}; 104 | if (defined($REF_NGRAM{$ngram})) { 105 | if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) { 106 | $CORRECT[$n] += $T_NGRAM{$ngram}; 107 | # $corr = $T_NGRAM{$ngram}; 108 | # print "$i e correct1 $T_NGRAM{$ngram}
\n"; 109 | } 110 | else { 111 | $CORRECT[$n] += $REF_NGRAM{$ngram}; 112 | # $corr = $REF_NGRAM{$ngram}; 113 | # print "$i e correct2 $REF_NGRAM{$ngram}
\n"; 114 | } 115 | } 116 | # $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram}; 117 | # print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n" 118 | } 119 | } 120 | $s++; 121 | } 122 | my $brevity_penalty = 1; 123 | my $bleu = 0; 124 | 125 | my @bleu=(); 126 | 127 | for(my $n=1;$n<=4;$n++) { 128 | if (defined ($TOTAL[$n]) && defined ($CORRECT[$n]) && $TOTAL[$n] > 0){ 129 | $bleu[$n]=($TOTAL[$n]>0)?$CORRECT[$n]/$TOTAL[$n]:0; 130 | # print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n"; 131 | }else{ 132 | $bleu[$n]=0; 133 | } 134 | } 135 | 136 | if ($length_reference==0){ 137 | printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n"; 138 | exit(1); 139 | } 140 | 141 | if ($length_translation<$length_reference) { 142 | $brevity_penalty = exp(1-$length_reference/$length_translation); 143 | } 144 | $bleu = $brevity_penalty * exp((my_log( $bleu[1] ) + 145 | my_log( $bleu[2] ) + 146 | my_log( $bleu[3] ) + 147 | my_log( $bleu[4] ) ) / 4) ; 148 | printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n", 149 | 100*$bleu, 150 | 100*$bleu[1], 151 | 100*$bleu[2], 152 | 100*$bleu[3], 153 | 100*$bleu[4], 154 | $brevity_penalty, 155 | $length_translation / $length_reference, 156 | $length_translation, 157 | $length_reference; 158 | 159 | sub my_log { 160 | return -9999999999 unless $_[0]; 161 | return log($_[0]); 162 | } 163 | -------------------------------------------------------------------------------- /preprocess/nonbreaking_prefixes/README.txt: -------------------------------------------------------------------------------- 1 | The language suffix can be found here: 2 | 3 | http://www.loc.gov/standards/iso639-2/php/code_list.php 4 | 5 | This code includes data from Daniel Naber's Language Tools (czech abbreviations). 6 | This code includes data from czech wiktionary (also czech abbreviations). 7 | 8 | 9 | -------------------------------------------------------------------------------- /preprocess/nonbreaking_prefixes/nonbreaking_prefix.ca: -------------------------------------------------------------------------------- 1 | Dr 2 | Dra 3 | pàg 4 | p 5 | c 6 | av 7 | Sr 8 | Sra 9 | adm 10 | esq 11 | Prof 12 | S.A 13 | S.L 14 | p.e 15 | ptes 16 | Sta 17 | St 18 | pl 19 | màx 20 | cast 21 | dir 22 | nre 23 | fra 24 | admdora 25 | Emm 26 | Excma 27 | espf 28 | dc 29 | admdor 30 | tel 31 | angl 32 | aprox 33 | ca 34 | dept 35 | dj 36 | dl 37 | dt 38 | ds 39 | dg 40 | dv 41 | ed 42 | entl 43 | al 44 | i.e 45 | maj 46 | smin 47 | n 48 | núm 49 | pta 50 | A 51 | B 52 | C 53 | D 54 | E 55 | F 56 | G 57 | H 58 | I 59 | J 60 | K 61 | L 62 | M 63 | N 64 | O 65 | P 66 | Q 67 | R 68 | S 69 | T 70 | U 71 | V 72 | W 73 | X 74 | Y 75 | Z 76 | -------------------------------------------------------------------------------- /preprocess/nonbreaking_prefixes/nonbreaking_prefix.cs: -------------------------------------------------------------------------------- 1 | Bc 2 | BcA 3 | Ing 4 | Ing.arch 5 | MUDr 6 | MVDr 7 | MgA 8 | Mgr 9 | JUDr 10 | PhDr 11 | RNDr 12 | PharmDr 13 | ThLic 14 | ThDr 15 | Ph.D 16 | Th.D 17 | prof 18 | doc 19 | CSc 20 | DrSc 21 | dr. h. 
c 22 | PaedDr 23 | Dr 24 | PhMr 25 | DiS 26 | abt 27 | ad 28 | a.i 29 | aj 30 | angl 31 | anon 32 | apod 33 | atd 34 | atp 35 | aut 36 | bd 37 | biogr 38 | b.m 39 | b.p 40 | b.r 41 | cca 42 | cit 43 | cizojaz 44 | c.k 45 | col 46 | čes 47 | čín 48 | čj 49 | ed 50 | facs 51 | fasc 52 | fol 53 | fot 54 | franc 55 | h.c 56 | hist 57 | hl 58 | hrsg 59 | ibid 60 | il 61 | ind 62 | inv.č 63 | jap 64 | jhdt 65 | jv 66 | koed 67 | kol 68 | korej 69 | kl 70 | krit 71 | lat 72 | lit 73 | m.a 74 | maď 75 | mj 76 | mp 77 | násl 78 | např 79 | nepubl 80 | něm 81 | no 82 | nr 83 | n.s 84 | okr 85 | odd 86 | odp 87 | obr 88 | opr 89 | orig 90 | phil 91 | pl 92 | pokrač 93 | pol 94 | port 95 | pozn 96 | př.kr 97 | př.n.l 98 | přel 99 | přeprac 100 | příl 101 | pseud 102 | pt 103 | red 104 | repr 105 | resp 106 | revid 107 | rkp 108 | roč 109 | roz 110 | rozš 111 | samost 112 | sect 113 | sest 114 | seš 115 | sign 116 | sl 117 | srv 118 | stol 119 | sv 120 | šk 121 | šk.ro 122 | špan 123 | tab 124 | t.č 125 | tis 126 | tj 127 | tř 128 | tzv 129 | univ 130 | uspoř 131 | vol 132 | vl.jm 133 | vs 134 | vyd 135 | vyobr 136 | zal 137 | zejm 138 | zkr 139 | zprac 140 | zvl 141 | n.p 142 | např 143 | než 144 | MUDr 145 | abl 146 | absol 147 | adj 148 | adv 149 | ak 150 | ak. sl 151 | akt 152 | alch 153 | amer 154 | anat 155 | angl 156 | anglosas 157 | arab 158 | arch 159 | archit 160 | arg 161 | astr 162 | astrol 163 | att 164 | bás 165 | belg 166 | bibl 167 | biol 168 | boh 169 | bot 170 | bulh 171 | círk 172 | csl 173 | č 174 | čas 175 | čes 176 | dat 177 | děj 178 | dep 179 | dět 180 | dial 181 | dór 182 | dopr 183 | dosl 184 | ekon 185 | epic 186 | etnonym 187 | eufem 188 | f 189 | fam 190 | fem 191 | fil 192 | film 193 | form 194 | fot 195 | fr 196 | fut 197 | fyz 198 | gen 199 | geogr 200 | geol 201 | geom 202 | germ 203 | gram 204 | hebr 205 | herald 206 | hist 207 | hl 208 | hovor 209 | hud 210 | hut 211 | chcsl 212 | chem 213 | ie 214 | imp 215 | impf 216 | ind 217 | indoevr 218 | inf 219 | instr 220 | interj 221 | ión 222 | iron 223 | it 224 | kanad 225 | katalán 226 | klas 227 | kniž 228 | komp 229 | konj 230 | 231 | konkr 232 | kř 233 | kuch 234 | lat 235 | lék 236 | les 237 | lid 238 | lit 239 | liturg 240 | lok 241 | log 242 | m 243 | mat 244 | meteor 245 | metr 246 | mod 247 | ms 248 | mysl 249 | n 250 | náb 251 | námoř 252 | neklas 253 | něm 254 | nesklon 255 | nom 256 | ob 257 | obch 258 | obyč 259 | ojed 260 | opt 261 | part 262 | pas 263 | pejor 264 | pers 265 | pf 266 | pl 267 | plpf 268 | 269 | práv 270 | prep 271 | předl 272 | přivl 273 | r 274 | rcsl 275 | refl 276 | reg 277 | rkp 278 | ř 279 | řec 280 | s 281 | samohl 282 | sg 283 | sl 284 | souhl 285 | spec 286 | srov 287 | stfr 288 | střv 289 | stsl 290 | subj 291 | subst 292 | superl 293 | sv 294 | sz 295 | táz 296 | tech 297 | telev 298 | teol 299 | trans 300 | typogr 301 | var 302 | vedl 303 | verb 304 | vl. jm 305 | voj 306 | vok 307 | vůb 308 | vulg 309 | výtv 310 | vztaž 311 | zahr 312 | zájm 313 | zast 314 | zejm 315 | 316 | zeměd 317 | zkr 318 | zř 319 | mj 320 | dl 321 | atp 322 | sport 323 | Mgr 324 | horn 325 | MVDr 326 | JUDr 327 | RSDr 328 | Bc 329 | PhDr 330 | ThDr 331 | Ing 332 | aj 333 | apod 334 | PharmDr 335 | pomn 336 | ev 337 | slang 338 | nprap 339 | odp 340 | dop 341 | pol 342 | st 343 | stol 344 | p. n. l 345 | před n. l 346 | n. l 347 | př. Kr 348 | po Kr 349 | př. n. l 350 | odd 351 | RNDr 352 | tzv 353 | atd 354 | tzn 355 | resp 356 | tj 357 | p 358 | br 359 | č. j 360 | čj 361 | č. p 362 | čp 363 | a. 
s 364 | s. r. o 365 | spol. s r. o 366 | p. o 367 | s. p 368 | v. o. s 369 | k. s 370 | o. p. s 371 | o. s 372 | v. r 373 | v z 374 | ml 375 | vč 376 | kr 377 | mld 378 | hod 379 | popř 380 | ap 381 | event 382 | rus 383 | slov 384 | rum 385 | švýc 386 | P. T 387 | zvl 388 | hor 389 | dol 390 | S.O.S -------------------------------------------------------------------------------- /preprocess/nonbreaking_prefixes/nonbreaking_prefix.de: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | #no german words end in single lower-case letters, so we throw those in too. 7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | 61 | #Roman Numerals. A dot after one of these is not a sentence break in German. 62 | I 63 | II 64 | III 65 | IV 66 | V 67 | VI 68 | VII 69 | VIII 70 | IX 71 | X 72 | XI 73 | XII 74 | XIII 75 | XIV 76 | XV 77 | XVI 78 | XVII 79 | XVIII 80 | XIX 81 | XX 82 | i 83 | ii 84 | iii 85 | iv 86 | v 87 | vi 88 | vii 89 | viii 90 | ix 91 | x 92 | xi 93 | xii 94 | xiii 95 | xiv 96 | xv 97 | xvi 98 | xvii 99 | xviii 100 | xix 101 | xx 102 | 103 | #Titles and Honorifics 104 | Adj 105 | Adm 106 | Adv 107 | Asst 108 | Bart 109 | Bldg 110 | Brig 111 | Bros 112 | Capt 113 | Cmdr 114 | Col 115 | Comdr 116 | Con 117 | Corp 118 | Cpl 119 | DR 120 | Dr 121 | Ens 122 | Gen 123 | Gov 124 | Hon 125 | Hosp 126 | Insp 127 | Lt 128 | MM 129 | MR 130 | MRS 131 | MS 132 | Maj 133 | Messrs 134 | Mlle 135 | Mme 136 | Mr 137 | Mrs 138 | Ms 139 | Msgr 140 | Op 141 | Ord 142 | Pfc 143 | Ph 144 | Prof 145 | Pvt 146 | Rep 147 | Reps 148 | Res 149 | Rev 150 | Rt 151 | Sen 152 | Sens 153 | Sfc 154 | Sgt 155 | Sr 156 | St 157 | Supt 158 | Surg 159 | 160 | #Misc symbols 161 | Mio 162 | Mrd 163 | bzw 164 | v 165 | vs 166 | usw 167 | d.h 168 | z.B 169 | u.a 170 | etc 171 | Mrd 172 | MwSt 173 | ggf 174 | d.J 175 | D.h 176 | m.E 177 | vgl 178 | I.F 179 | z.T 180 | sogen 181 | ff 182 | u.E 183 | g.U 184 | g.g.A 185 | c.-à-d 186 | Buchst 187 | u.s.w 188 | sog 189 | u.ä 190 | Std 191 | evtl 192 | Zt 193 | Chr 194 | u.U 195 | o.ä 196 | Ltd 197 | b.A 198 | z.Zt 199 | spp 200 | sen 201 | SA 202 | k.o 203 | jun 204 | i.H.v 205 | dgl 206 | dergl 207 | Co 208 | zzt 209 | usf 210 | s.p.a 211 | Dkr 212 | Corp 213 | bzgl 214 | BSE 215 | 216 | #Number indicators 217 | # add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it 218 | No 219 | Nos 220 | Art 221 | Nr 222 | pp 223 | ca 224 | Ca 225 | 226 | #Ordinals are done with . in German - "1." 
= "1st" in English 227 | 1 228 | 2 229 | 3 230 | 4 231 | 5 232 | 6 233 | 7 234 | 8 235 | 9 236 | 10 237 | 11 238 | 12 239 | 13 240 | 14 241 | 15 242 | 16 243 | 17 244 | 18 245 | 19 246 | 20 247 | 21 248 | 22 249 | 23 250 | 24 251 | 25 252 | 26 253 | 27 254 | 28 255 | 29 256 | 30 257 | 31 258 | 32 259 | 33 260 | 34 261 | 35 262 | 36 263 | 37 264 | 38 265 | 39 266 | 40 267 | 41 268 | 42 269 | 43 270 | 44 271 | 45 272 | 46 273 | 47 274 | 48 275 | 49 276 | 50 277 | 51 278 | 52 279 | 53 280 | 54 281 | 55 282 | 56 283 | 57 284 | 58 285 | 59 286 | 60 287 | 61 288 | 62 289 | 63 290 | 64 291 | 65 292 | 66 293 | 67 294 | 68 295 | 69 296 | 70 297 | 71 298 | 72 299 | 73 300 | 74 301 | 75 302 | 76 303 | 77 304 | 78 305 | 79 306 | 80 307 | 81 308 | 82 309 | 83 310 | 84 311 | 85 312 | 86 313 | 87 314 | 88 315 | 89 316 | 90 317 | 91 318 | 92 319 | 93 320 | 94 321 | 95 322 | 96 323 | 97 324 | 98 325 | 99 326 | -------------------------------------------------------------------------------- /preprocess/nonbreaking_prefixes/nonbreaking_prefix.en: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 34 | Adj 35 | Adm 36 | Adv 37 | Asst 38 | Bart 39 | Bldg 40 | Brig 41 | Bros 42 | Capt 43 | Cmdr 44 | Col 45 | Comdr 46 | Con 47 | Corp 48 | Cpl 49 | DR 50 | Dr 51 | Drs 52 | Ens 53 | Gen 54 | Gov 55 | Hon 56 | Hr 57 | Hosp 58 | Insp 59 | Lt 60 | MM 61 | MR 62 | MRS 63 | MS 64 | Maj 65 | Messrs 66 | Mlle 67 | Mme 68 | Mr 69 | Mrs 70 | Ms 71 | Msgr 72 | Op 73 | Ord 74 | Pfc 75 | Ph 76 | Prof 77 | Pvt 78 | Rep 79 | Reps 80 | Res 81 | Rev 82 | Rt 83 | Sen 84 | Sens 85 | Sfc 86 | Sgt 87 | Sr 88 | St 89 | Supt 90 | Surg 91 | 92 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 93 | v 94 | vs 95 | i.e 96 | rev 97 | e.g 98 | 99 | #Numbers only. These should only induce breaks when followed by a numeric sequence 100 | # add NUMERIC_ONLY after the word for this function 101 | #This case is mostly for the english "No." which can either be a sentence of its own, or 102 | #if followed by a number, a non-breaking prefix 103 | No #NUMERIC_ONLY# 104 | Nos 105 | Art #NUMERIC_ONLY# 106 | Nr 107 | pp #NUMERIC_ONLY# 108 | 109 | #month abbreviations 110 | Jan 111 | Feb 112 | Mar 113 | Apr 114 | #May is a full word 115 | Jun 116 | Jul 117 | Aug 118 | Sep 119 | Oct 120 | Nov 121 | Dec 122 | -------------------------------------------------------------------------------- /preprocess/nonbreaking_prefixes/nonbreaking_prefix.es: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 
3 | 4 | #any single upper case letter followed by a period is not a sentence ender 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | # Period-final abbreviation list from http://www.ctspanish.com/words/abbreviations.htm 34 | 35 | A.C 36 | Apdo 37 | Av 38 | Bco 39 | CC.AA 40 | Da 41 | Dep 42 | Dn 43 | Dr 44 | Dra 45 | EE.UU 46 | Excmo 47 | FF.CC 48 | Fil 49 | Gral 50 | J.C 51 | Let 52 | Lic 53 | N.B 54 | P.D 55 | P.V.P 56 | Prof 57 | Pts 58 | Rte 59 | S.A 60 | S.A.R 61 | S.E 62 | S.L 63 | S.R.C 64 | Sr 65 | Sra 66 | Srta 67 | Sta 68 | Sto 69 | T.V.E 70 | Tel 71 | Ud 72 | Uds 73 | V.B 74 | V.E 75 | Vd 76 | Vds 77 | a/c 78 | adj 79 | admón 80 | afmo 81 | apdo 82 | av 83 | c 84 | c.f 85 | c.g 86 | cap 87 | cm 88 | cta 89 | dcha 90 | doc 91 | ej 92 | entlo 93 | esq 94 | etc 95 | f.c 96 | gr 97 | grs 98 | izq 99 | kg 100 | km 101 | mg 102 | mm 103 | núm 104 | núm 105 | p 106 | p.a 107 | p.ej 108 | ptas 109 | pág 110 | págs 111 | pág 112 | págs 113 | q.e.g.e 114 | q.e.s.m 115 | s 116 | s.s.s 117 | vid 118 | vol 119 | -------------------------------------------------------------------------------- /preprocess/nonbreaking_prefixes/nonbreaking_prefix.fi: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT 2 | #indicate an end-of-sentence marker. Special cases are included for prefixes 3 | #that ONLY appear before 0-9 numbers. 4 | 5 | #This list is compiled from omorfi database 6 | #by Tommi A Pirinen. 7 | 8 | 9 | #any single upper case letter followed by a period is not a sentence ender 10 | A 11 | B 12 | C 13 | D 14 | E 15 | F 16 | G 17 | H 18 | I 19 | J 20 | K 21 | L 22 | M 23 | N 24 | O 25 | P 26 | Q 27 | R 28 | S 29 | T 30 | U 31 | V 32 | W 33 | X 34 | Y 35 | Z 36 | Å 37 | Ä 38 | Ö 39 | 40 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 41 | alik 42 | alil 43 | amir 44 | apul 45 | apul.prof 46 | arkkit 47 | ass 48 | assist 49 | dipl 50 | dipl.arkkit 51 | dipl.ekon 52 | dipl.ins 53 | dipl.kielenk 54 | dipl.kirjeenv 55 | dipl.kosm 56 | dipl.urk 57 | dos 58 | erikoiseläinl 59 | erikoishammasl 60 | erikoisl 61 | erikoist 62 | ev.luutn 63 | evp 64 | fil 65 | ft 66 | hallinton 67 | hallintot 68 | hammaslääket 69 | jatk 70 | jääk 71 | kansaned 72 | kapt 73 | kapt.luutn 74 | kenr 75 | kenr.luutn 76 | kenr.maj 77 | kers 78 | kirjeenv 79 | kom 80 | kom.kapt 81 | komm 82 | konst 83 | korpr 84 | luutn 85 | maist 86 | maj 87 | Mr 88 | Mrs 89 | Ms 90 | M.Sc 91 | neuv 92 | nimim 93 | Ph.D 94 | prof 95 | puh.joht 96 | pääll 97 | res 98 | san 99 | siht 100 | suom 101 | sähköp 102 | säv 103 | toht 104 | toim 105 | toim.apul 106 | toim.joht 107 | toim.siht 108 | tuom 109 | ups 110 | vänr 111 | vääp 112 | ye.ups 113 | ylik 114 | ylil 115 | ylim 116 | ylimatr 117 | yliop 118 | yliopp 119 | ylip 120 | yliv 121 | 122 | #misc - odd period-ending items that NEVER indicate breaks (p.m. 
does NOT fall 123 | #into this category - it sometimes ends a sentence) 124 | e.g 125 | ent 126 | esim 127 | huom 128 | i.e 129 | ilm 130 | l 131 | mm 132 | myöh 133 | nk 134 | nyk 135 | par 136 | po 137 | t 138 | v 139 | -------------------------------------------------------------------------------- /preprocess/nonbreaking_prefixes/nonbreaking_prefix.fr: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | # 4 | #any single upper case letter followed by a period is not a sentence ender 5 | #usually upper case letters are initials in a name 6 | #no French words end in single lower-case letters, so we throw those in too? 7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | # Period-final abbreviation list for French 61 | A.C.N 62 | A.M 63 | art 64 | ann 65 | apr 66 | av 67 | auj 68 | lib 69 | B.P 70 | boul 71 | ca 72 | c.-à-d 73 | cf 74 | ch.-l 75 | chap 76 | contr 77 | C.P.I 78 | C.Q.F.D 79 | C.N 80 | C.N.S 81 | C.S 82 | dir 83 | éd 84 | e.g 85 | env 86 | al 87 | etc 88 | E.V 89 | ex 90 | fasc 91 | fém 92 | fig 93 | fr 94 | hab 95 | ibid 96 | id 97 | i.e 98 | inf 99 | LL.AA 100 | LL.AA.II 101 | LL.AA.RR 102 | LL.AA.SS 103 | L.D 104 | LL.EE 105 | LL.MM 106 | LL.MM.II.RR 107 | loc.cit 108 | masc 109 | MM 110 | ms 111 | N.B 112 | N.D.A 113 | N.D.L.R 114 | N.D.T 115 | n/réf 116 | NN.SS 117 | N.S 118 | N.D 119 | N.P.A.I 120 | p.c.c 121 | pl 122 | pp 123 | p.ex 124 | p.j 125 | P.S 126 | R.A.S 127 | R.-V 128 | R.P 129 | R.I.P 130 | SS 131 | S.S 132 | S.A 133 | S.A.I 134 | S.A.R 135 | S.A.S 136 | S.E 137 | sec 138 | sect 139 | sing 140 | S.M 141 | S.M.I.R 142 | sq 143 | sqq 144 | suiv 145 | sup 146 | suppl 147 | tél 148 | T.S.V.P 149 | vb 150 | vol 151 | vs 152 | X.O 153 | Z.I 154 | -------------------------------------------------------------------------------- /preprocess/nonbreaking_prefixes/nonbreaking_prefix.hu: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | Á 33 | É 34 | Í 35 | Ó 36 | Ö 37 | Ő 38 | Ú 39 | Ü 40 | Ű 41 | 42 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 43 | Dr 44 | dr 45 | kb 46 | Kb 47 | vö 48 | Vö 49 | pl 50 | Pl 51 | ca 52 | Ca 53 | min 54 | Min 55 | max 56 | Max 57 | ún 58 | Ún 59 | prof 60 | Prof 61 | de 62 | De 63 | du 64 | Du 65 | Szt 66 | St 67 | 68 | #Numbers only. 
These should only induce breaks when followed by a numeric sequence 69 | # add NUMERIC_ONLY after the word for this function 70 | #This case is mostly for the english "No." which can either be a sentence of its own, or 71 | #if followed by a number, a non-breaking prefix 72 | 73 | # Month name abbreviations 74 | jan #NUMERIC_ONLY# 75 | Jan #NUMERIC_ONLY# 76 | Feb #NUMERIC_ONLY# 77 | feb #NUMERIC_ONLY# 78 | márc #NUMERIC_ONLY# 79 | Márc #NUMERIC_ONLY# 80 | ápr #NUMERIC_ONLY# 81 | Ápr #NUMERIC_ONLY# 82 | máj #NUMERIC_ONLY# 83 | Máj #NUMERIC_ONLY# 84 | jún #NUMERIC_ONLY# 85 | Jún #NUMERIC_ONLY# 86 | Júl #NUMERIC_ONLY# 87 | júl #NUMERIC_ONLY# 88 | aug #NUMERIC_ONLY# 89 | Aug #NUMERIC_ONLY# 90 | Szept #NUMERIC_ONLY# 91 | szept #NUMERIC_ONLY# 92 | okt #NUMERIC_ONLY# 93 | Okt #NUMERIC_ONLY# 94 | nov #NUMERIC_ONLY# 95 | Nov #NUMERIC_ONLY# 96 | dec #NUMERIC_ONLY# 97 | Dec #NUMERIC_ONLY# 98 | 99 | # Other abbreviations 100 | tel #NUMERIC_ONLY# 101 | Tel #NUMERIC_ONLY# 102 | Fax #NUMERIC_ONLY# 103 | fax #NUMERIC_ONLY# 104 | -------------------------------------------------------------------------------- /preprocess/nonbreaking_prefixes/nonbreaking_prefix.is: -------------------------------------------------------------------------------- 1 | no #NUMERIC_ONLY# 2 | No #NUMERIC_ONLY# 3 | nr #NUMERIC_ONLY# 4 | Nr #NUMERIC_ONLY# 5 | nR #NUMERIC_ONLY# 6 | NR #NUMERIC_ONLY# 7 | a 8 | b 9 | c 10 | d 11 | e 12 | f 13 | g 14 | h 15 | i 16 | j 17 | k 18 | l 19 | m 20 | n 21 | o 22 | p 23 | q 24 | r 25 | s 26 | t 27 | u 28 | v 29 | w 30 | x 31 | y 32 | z 33 | ^ 34 | í 35 | á 36 | ó 37 | æ 38 | A 39 | B 40 | C 41 | D 42 | E 43 | F 44 | G 45 | H 46 | I 47 | J 48 | K 49 | L 50 | M 51 | N 52 | O 53 | P 54 | Q 55 | R 56 | S 57 | T 58 | U 59 | V 60 | W 61 | X 62 | Y 63 | Z 64 | ab.fn 65 | a.fn 66 | afs 67 | al 68 | alm 69 | alg 70 | andh 71 | ath 72 | aths 73 | atr 74 | ao 75 | au 76 | aukaf 77 | áfn 78 | áhrl.s 79 | áhrs 80 | ákv.gr 81 | ákv 82 | bh 83 | bls 84 | dr 85 | e.Kr 86 | et 87 | ef 88 | efn 89 | ennfr 90 | eink 91 | end 92 | e.st 93 | erl 94 | fél 95 | fskj 96 | fh 97 | f.hl 98 | físl 99 | fl 100 | fn 101 | fo 102 | forl 103 | frb 104 | frl 105 | frh 106 | frt 107 | fsl 108 | fsh 109 | fs 110 | fsk 111 | fst 112 | f.Kr 113 | ft 114 | fv 115 | fyrrn 116 | fyrrv 117 | germ 118 | gm 119 | gr 120 | hdl 121 | hdr 122 | hf 123 | hl 124 | hlsk 125 | hljsk 126 | hljv 127 | hljóðv 128 | hr 129 | hv 130 | hvk 131 | holl 132 | Hos 133 | höf 134 | hk 135 | hrl 136 | ísl 137 | kaf 138 | kap 139 | Khöfn 140 | kk 141 | kg 142 | kk 143 | km 144 | kl 145 | klst 146 | kr 147 | kt 148 | kgúrsk 149 | kvk 150 | leturbr 151 | lh 152 | lh.nt 153 | lh.þt 154 | lo 155 | ltr 156 | mlja 157 | mljó 158 | millj 159 | mm 160 | mms 161 | m.fl 162 | miðm 163 | mgr 164 | mst 165 | mín 166 | nf 167 | nh 168 | nhm 169 | nl 170 | nk 171 | nmgr 172 | no 173 | núv 174 | nt 175 | o.áfr 176 | o.m.fl 177 | ohf 178 | o.fl 179 | o.s.frv 180 | ófn 181 | ób 182 | óákv.gr 183 | óákv 184 | pfn 185 | PR 186 | pr 187 | Ritstj 188 | Rvík 189 | Rvk 190 | samb 191 | samhlj 192 | samn 193 | samn 194 | sbr 195 | sek 196 | sérn 197 | sf 198 | sfn 199 | sh 200 | sfn 201 | sh 202 | s.hl 203 | sk 204 | skv 205 | sl 206 | sn 207 | so 208 | ss.us 209 | s.st 210 | samþ 211 | sbr 212 | shlj 213 | sign 214 | skál 215 | st 216 | st.s 217 | stk 218 | sþ 219 | teg 220 | tbl 221 | tfn 222 | tl 223 | tvíhlj 224 | tvt 225 | till 226 | to 227 | umr 228 | uh 229 | us 230 | uppl 231 | útg 232 | vb 233 | Vf 234 | vh 235 | vkf 236 | Vl 237 | vl 238 | vlf 239 | vmf 240 | 8vo 241 | 
vsk 242 | vth 243 | þt 244 | þf 245 | þjs 246 | þgf 247 | þlt 248 | þolm 249 | þm 250 | þml 251 | þýð 252 | -------------------------------------------------------------------------------- /preprocess/nonbreaking_prefixes/nonbreaking_prefix.it: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 34 | Adj 35 | Adm 36 | Adv 37 | Amn 38 | Arch 39 | Asst 40 | Avv 41 | Bart 42 | Bcc 43 | Bldg 44 | Brig 45 | Bros 46 | C.A.P 47 | C.P 48 | Capt 49 | Cc 50 | Cmdr 51 | Co 52 | Col 53 | Comdr 54 | Con 55 | Corp 56 | Cpl 57 | DR 58 | Dott 59 | Dr 60 | Drs 61 | Egr 62 | Ens 63 | Gen 64 | Geom 65 | Gov 66 | Hon 67 | Hosp 68 | Hr 69 | Id 70 | Ing 71 | Insp 72 | Lt 73 | MM 74 | MR 75 | MRS 76 | MS 77 | Maj 78 | Messrs 79 | Mlle 80 | Mme 81 | Mo 82 | Mons 83 | Mr 84 | Mrs 85 | Ms 86 | Msgr 87 | N.B 88 | Op 89 | Ord 90 | P.S 91 | P.T 92 | Pfc 93 | Ph 94 | Prof 95 | Pvt 96 | RP 97 | RSVP 98 | Rag 99 | Rep 100 | Reps 101 | Res 102 | Rev 103 | Rif 104 | Rt 105 | S.A 106 | S.B.F 107 | S.P.M 108 | S.p.A 109 | S.r.l 110 | Sen 111 | Sens 112 | Sfc 113 | Sgt 114 | Sig 115 | Sigg 116 | Soc 117 | Spett 118 | Sr 119 | St 120 | Supt 121 | Surg 122 | V.P 123 | 124 | # other 125 | a.c 126 | acc 127 | all 128 | banc 129 | c.a 130 | c.c.p 131 | c.m 132 | c.p 133 | c.s 134 | c.v 135 | corr 136 | dott 137 | e.p.c 138 | ecc 139 | es 140 | fatt 141 | gg 142 | int 143 | lett 144 | ogg 145 | on 146 | p.c 147 | p.c.c 148 | p.es 149 | p.f 150 | p.r 151 | p.v 152 | post 153 | pp 154 | racc 155 | ric 156 | s.n.c 157 | seg 158 | sgg 159 | ss 160 | tel 161 | u.s 162 | v.r 163 | v.s 164 | 165 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 166 | v 167 | vs 168 | i.e 169 | rev 170 | e.g 171 | 172 | #Numbers only. These should only induce breaks when followed by a numeric sequence 173 | # add NUMERIC_ONLY after the word for this function 174 | #This case is mostly for the english "No." which can either be a sentence of its own, or 175 | #if followed by a number, a non-breaking prefix 176 | No #NUMERIC_ONLY# 177 | Nos 178 | Art #NUMERIC_ONLY# 179 | Nr 180 | pp #NUMERIC_ONLY# 181 | -------------------------------------------------------------------------------- /preprocess/nonbreaking_prefixes/nonbreaking_prefix.lv: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 
3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | Ā 8 | B 9 | C 10 | Č 11 | D 12 | E 13 | Ē 14 | F 15 | G 16 | Ģ 17 | H 18 | I 19 | Ī 20 | J 21 | K 22 | Ķ 23 | L 24 | Ļ 25 | M 26 | N 27 | Ņ 28 | O 29 | P 30 | Q 31 | R 32 | S 33 | Š 34 | T 35 | U 36 | Ū 37 | V 38 | W 39 | X 40 | Y 41 | Z 42 | Ž 43 | 44 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 45 | dr 46 | Dr 47 | med 48 | prof 49 | Prof 50 | inž 51 | Inž 52 | ist.loc 53 | Ist.loc 54 | kor.loc 55 | Kor.loc 56 | v.i 57 | vietn 58 | Vietn 59 | 60 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 61 | a.l 62 | t.p 63 | pārb 64 | Pārb 65 | vec 66 | Vec 67 | inv 68 | Inv 69 | sk 70 | Sk 71 | spec 72 | Spec 73 | vienk 74 | Vienk 75 | virz 76 | Virz 77 | māksl 78 | Māksl 79 | mūz 80 | Mūz 81 | akad 82 | Akad 83 | soc 84 | Soc 85 | galv 86 | Galv 87 | vad 88 | Vad 89 | sertif 90 | Sertif 91 | folkl 92 | Folkl 93 | hum 94 | Hum 95 | 96 | #Numbers only. These should only induce breaks when followed by a numeric sequence 97 | # add NUMERIC_ONLY after the word for this function 98 | #This case is mostly for the english "No." which can either be a sentence of its own, or 99 | #if followed by a number, a non-breaking prefix 100 | Nr #NUMERIC_ONLY# 101 | -------------------------------------------------------------------------------- /preprocess/nonbreaking_prefixes/nonbreaking_prefix.nl: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | #Sources: http://nl.wikipedia.org/wiki/Lijst_van_afkortingen 4 | # http://nl.wikipedia.org/wiki/Aanspreekvorm 5 | # http://nl.wikipedia.org/wiki/Titulatuur_in_het_Nederlands_hoger_onderwijs 6 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 7 | #usually upper case letters are initials in a name 8 | A 9 | B 10 | C 11 | D 12 | E 13 | F 14 | G 15 | H 16 | I 17 | J 18 | K 19 | L 20 | M 21 | N 22 | O 23 | P 24 | Q 25 | R 26 | S 27 | T 28 | U 29 | V 30 | W 31 | X 32 | Y 33 | Z 34 | 35 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 36 | bacc 37 | bc 38 | bgen 39 | c.i 40 | dhr 41 | dr 42 | dr.h.c 43 | drs 44 | drs 45 | ds 46 | eint 47 | fa 48 | Fa 49 | fam 50 | gen 51 | genm 52 | ing 53 | ir 54 | jhr 55 | jkvr 56 | jr 57 | kand 58 | kol 59 | lgen 60 | lkol 61 | Lt 62 | maj 63 | Mej 64 | mevr 65 | Mme 66 | mr 67 | mr 68 | Mw 69 | o.b.s 70 | plv 71 | prof 72 | ritm 73 | tint 74 | Vz 75 | Z.D 76 | Z.D.H 77 | Z.E 78 | Z.Em 79 | Z.H 80 | Z.K.H 81 | Z.K.M 82 | Z.M 83 | z.v 84 | 85 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 86 | #we seem to have a lot of these in dutch i.e.: i.p.v - in plaats van (in stead of) never ends a sentence 87 | a.g.v 88 | bijv 89 | bijz 90 | bv 91 | d.w.z 92 | e.c 93 | e.g 94 | e.k 95 | ev 96 | i.p.v 97 | i.s.m 98 | i.t.t 99 | i.v.m 100 | m.a.w 101 | m.b.t 102 | m.b.v 103 | m.h.o 104 | m.i 105 | m.i.v 106 | v.w.t 107 | 108 | #Numbers only. 
These should only induce breaks when followed by a numeric sequence 109 | # add NUMERIC_ONLY after the word for this function 110 | #This case is mostly for the english "No." which can either be a sentence of its own, or 111 | #if followed by a number, a non-breaking prefix 112 | Nr #NUMERIC_ONLY# 113 | Nrs 114 | nrs 115 | nr #NUMERIC_ONLY# 116 | -------------------------------------------------------------------------------- /preprocess/nonbreaking_prefixes/nonbreaking_prefix.pl: -------------------------------------------------------------------------------- 1 | adw 2 | afr 3 | akad 4 | al 5 | Al 6 | am 7 | amer 8 | arch 9 | art 10 | Art 11 | artyst 12 | astr 13 | austr 14 | bałt 15 | bdb 16 | bł 17 | bm 18 | br 19 | bryg 20 | bryt 21 | centr 22 | ces 23 | chem 24 | chiń 25 | chir 26 | c.k 27 | c.o 28 | cyg 29 | cyw 30 | cyt 31 | czes 32 | czw 33 | cd 34 | Cd 35 | czyt 36 | ćw 37 | ćwicz 38 | daw 39 | dcn 40 | dekl 41 | demokr 42 | det 43 | diec 44 | dł 45 | dn 46 | dot 47 | dol 48 | dop 49 | dost 50 | dosł 51 | h.c 52 | ds 53 | dst 54 | duszp 55 | dypl 56 | egz 57 | ekol 58 | ekon 59 | elektr 60 | em 61 | ew 62 | fab 63 | farm 64 | fot 65 | fr 66 | gat 67 | gastr 68 | geogr 69 | geol 70 | gimn 71 | głęb 72 | gm 73 | godz 74 | górn 75 | gosp 76 | gr 77 | gram 78 | hist 79 | hiszp 80 | hr 81 | Hr 82 | hot 83 | id 84 | in 85 | im 86 | iron 87 | jn 88 | kard 89 | kat 90 | katol 91 | k.k 92 | kk 93 | kol 94 | kl 95 | k.p.a 96 | kpc 97 | k.p.c 98 | kpt 99 | kr 100 | k.r 101 | krak 102 | k.r.o 103 | kryt 104 | kult 105 | laic 106 | łac 107 | niem 108 | woj 109 | nb 110 | np 111 | Nb 112 | Np 113 | pol 114 | pow 115 | m.in 116 | pt 117 | ps 118 | Pt 119 | Ps 120 | cdn 121 | jw 122 | ryc 123 | rys 124 | Ryc 125 | Rys 126 | tj 127 | tzw 128 | Tzw 129 | tzn 130 | zob 131 | ang 132 | ub 133 | ul 134 | pw 135 | pn 136 | pl 137 | al 138 | k 139 | n 140 | nr #NUMERIC_ONLY# 141 | Nr #NUMERIC_ONLY# 142 | ww 143 | wł 144 | ur 145 | zm 146 | żyd 147 | żarg 148 | żyw 149 | wył 150 | bp 151 | bp 152 | wyst 153 | tow 154 | Tow 155 | o 156 | sp 157 | Sp 158 | st 159 | spółdz 160 | Spółdz 161 | społ 162 | spółgł 163 | stoł 164 | stow 165 | Stoł 166 | Stow 167 | zn 168 | zew 169 | zewn 170 | zdr 171 | zazw 172 | zast 173 | zaw 174 | zał 175 | zal 176 | zam 177 | zak 178 | zakł 179 | zagr 180 | zach 181 | adw 182 | Adw 183 | lek 184 | Lek 185 | med 186 | mec 187 | Mec 188 | doc 189 | Doc 190 | dyw 191 | dyr 192 | Dyw 193 | Dyr 194 | inż 195 | Inż 196 | mgr 197 | Mgr 198 | dh 199 | dr 200 | Dh 201 | Dr 202 | p 203 | P 204 | red 205 | Red 206 | prof 207 | prok 208 | Prof 209 | Prok 210 | hab 211 | płk 212 | Płk 213 | nadkom 214 | Nadkom 215 | podkom 216 | Podkom 217 | ks 218 | Ks 219 | gen 220 | Gen 221 | por 222 | Por 223 | reż 224 | Reż 225 | przyp 226 | Przyp 227 | śp 228 | św 229 | śW 230 | Śp 231 | Św 232 | ŚW 233 | szer 234 | Szer 235 | pkt #NUMERIC_ONLY# 236 | str #NUMERIC_ONLY# 237 | tab #NUMERIC_ONLY# 238 | Tab #NUMERIC_ONLY# 239 | tel 240 | ust #NUMERIC_ONLY# 241 | par #NUMERIC_ONLY# 242 | poz 243 | pok 244 | oo 245 | oO 246 | Oo 247 | OO 248 | r #NUMERIC_ONLY# 249 | l #NUMERIC_ONLY# 250 | s #NUMERIC_ONLY# 251 | najśw 252 | Najśw 253 | A 254 | B 255 | C 256 | D 257 | E 258 | F 259 | G 260 | H 261 | I 262 | J 263 | K 264 | L 265 | M 266 | N 267 | O 268 | P 269 | Q 270 | R 271 | S 272 | T 273 | U 274 | V 275 | W 276 | X 277 | Y 278 | Z 279 | Ś 280 | Ć 281 | Ż 282 | Ź 283 | Dz 284 | -------------------------------------------------------------------------------- 
/preprocess/nonbreaking_prefixes/nonbreaking_prefix.pt: -------------------------------------------------------------------------------- 1 | #File adapted for PT by H. Leal Fontes from the EN & DE versions published with moses-2009-04-13. Last update: 10.11.2009. 2 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 3 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 4 | 5 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 6 | #usually upper case letters are initials in a name 7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | 61 | #Roman Numerals. A dot after one of these is not a sentence break in Portuguese. 62 | I 63 | II 64 | III 65 | IV 66 | V 67 | VI 68 | VII 69 | VIII 70 | IX 71 | X 72 | XI 73 | XII 74 | XIII 75 | XIV 76 | XV 77 | XVI 78 | XVII 79 | XVIII 80 | XIX 81 | XX 82 | i 83 | ii 84 | iii 85 | iv 86 | v 87 | vi 88 | vii 89 | viii 90 | ix 91 | x 92 | xi 93 | xii 94 | xiii 95 | xiv 96 | xv 97 | xvi 98 | xvii 99 | xviii 100 | xix 101 | xx 102 | 103 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 104 | Adj 105 | Adm 106 | Adv 107 | Art 108 | Ca 109 | Capt 110 | Cmdr 111 | Col 112 | Comdr 113 | Con 114 | Corp 115 | Cpl 116 | DR 117 | DRA 118 | Dr 119 | Dra 120 | Dras 121 | Drs 122 | Eng 123 | Enga 124 | Engas 125 | Engos 126 | Ex 127 | Exo 128 | Exmo 129 | Fig 130 | Gen 131 | Hosp 132 | Insp 133 | Lda 134 | MM 135 | MR 136 | MRS 137 | MS 138 | Maj 139 | Mrs 140 | Ms 141 | Msgr 142 | Op 143 | Ord 144 | Pfc 145 | Ph 146 | Prof 147 | Pvt 148 | Rep 149 | Reps 150 | Res 151 | Rev 152 | Rt 153 | Sen 154 | Sens 155 | Sfc 156 | Sgt 157 | Sr 158 | Sra 159 | Sras 160 | Srs 161 | Sto 162 | Supt 163 | Surg 164 | adj 165 | adm 166 | adv 167 | art 168 | cit 169 | col 170 | con 171 | corp 172 | cpl 173 | dr 174 | dra 175 | dras 176 | drs 177 | eng 178 | enga 179 | engas 180 | engos 181 | ex 182 | exo 183 | exmo 184 | fig 185 | op 186 | prof 187 | sr 188 | sra 189 | sras 190 | srs 191 | sto 192 | 193 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 194 | v 195 | vs 196 | i.e 197 | rev 198 | e.g 199 | 200 | #Numbers only. These should only induce breaks when followed by a numeric sequence 201 | # add NUMERIC_ONLY after the word for this function 202 | #This case is mostly for the english "No." 
which can either be a sentence of its own, or 203 | #if followed by a number, a non-breaking prefix 204 | No #NUMERIC_ONLY# 205 | Nos 206 | Art #NUMERIC_ONLY# 207 | Nr 208 | p #NUMERIC_ONLY# 209 | pp #NUMERIC_ONLY# 210 | 211 | -------------------------------------------------------------------------------- /preprocess/nonbreaking_prefixes/nonbreaking_prefix.ro: -------------------------------------------------------------------------------- 1 | A 2 | B 3 | C 4 | D 5 | E 6 | F 7 | G 8 | H 9 | I 10 | J 11 | K 12 | L 13 | M 14 | N 15 | O 16 | P 17 | Q 18 | R 19 | S 20 | T 21 | U 22 | V 23 | W 24 | X 25 | Y 26 | Z 27 | dpdv 28 | etc 29 | șamd 30 | M.Ap.N 31 | dl 32 | Dl 33 | d-na 34 | D-na 35 | dvs 36 | Dvs 37 | pt 38 | Pt 39 | -------------------------------------------------------------------------------- /preprocess/nonbreaking_prefixes/nonbreaking_prefix.ru: -------------------------------------------------------------------------------- 1 | # added Cyrillic uppercase letters [А-Я] 2 | # removed 000D carriage return (this is not removed by chomp in tokenizer.perl, and prevents recognition of the prefixes) 3 | # edited by Kate Young (nspaceanalysis@earthlink.net) 21 May 2013 4 | А 5 | Б 6 | В 7 | Г 8 | Д 9 | Е 10 | Ж 11 | З 12 | И 13 | Й 14 | К 15 | Л 16 | М 17 | Н 18 | О 19 | П 20 | Р 21 | С 22 | Т 23 | У 24 | Ф 25 | Х 26 | Ц 27 | Ч 28 | Ш 29 | Щ 30 | Ъ 31 | Ы 32 | Ь 33 | Э 34 | Ю 35 | Я 36 | A 37 | B 38 | C 39 | D 40 | E 41 | F 42 | G 43 | H 44 | I 45 | J 46 | K 47 | L 48 | M 49 | N 50 | O 51 | P 52 | Q 53 | R 54 | S 55 | T 56 | U 57 | V 58 | W 59 | X 60 | Y 61 | Z 62 | 0гг 63 | 1гг 64 | 2гг 65 | 3гг 66 | 4гг 67 | 5гг 68 | 6гг 69 | 7гг 70 | 8гг 71 | 9гг 72 | 0г 73 | 1г 74 | 2г 75 | 3г 76 | 4г 77 | 5г 78 | 6г 79 | 7г 80 | 8г 81 | 9г 82 | Xвв 83 | Vвв 84 | Iвв 85 | Lвв 86 | Mвв 87 | Cвв 88 | Xв 89 | Vв 90 | Iв 91 | Lв 92 | Mв 93 | Cв 94 | 0м 95 | 1м 96 | 2м 97 | 3м 98 | 4м 99 | 5м 100 | 6м 101 | 7м 102 | 8м 103 | 9м 104 | 0мм 105 | 1мм 106 | 2мм 107 | 3мм 108 | 4мм 109 | 5мм 110 | 6мм 111 | 7мм 112 | 8мм 113 | 9мм 114 | 0см 115 | 1см 116 | 2см 117 | 3см 118 | 4см 119 | 5см 120 | 6см 121 | 7см 122 | 8см 123 | 9см 124 | 0дм 125 | 1дм 126 | 2дм 127 | 3дм 128 | 4дм 129 | 5дм 130 | 6дм 131 | 7дм 132 | 8дм 133 | 9дм 134 | 0л 135 | 1л 136 | 2л 137 | 3л 138 | 4л 139 | 5л 140 | 6л 141 | 7л 142 | 8л 143 | 9л 144 | 0км 145 | 1км 146 | 2км 147 | 3км 148 | 4км 149 | 5км 150 | 6км 151 | 7км 152 | 8км 153 | 9км 154 | 0га 155 | 1га 156 | 2га 157 | 3га 158 | 4га 159 | 5га 160 | 6га 161 | 7га 162 | 8га 163 | 9га 164 | 0кг 165 | 1кг 166 | 2кг 167 | 3кг 168 | 4кг 169 | 5кг 170 | 6кг 171 | 7кг 172 | 8кг 173 | 9кг 174 | 0т 175 | 1т 176 | 2т 177 | 3т 178 | 4т 179 | 5т 180 | 6т 181 | 7т 182 | 8т 183 | 9т 184 | 0г 185 | 1г 186 | 2г 187 | 3г 188 | 4г 189 | 5г 190 | 6г 191 | 7г 192 | 8г 193 | 9г 194 | 0мг 195 | 1мг 196 | 2мг 197 | 3мг 198 | 4мг 199 | 5мг 200 | 6мг 201 | 7мг 202 | 8мг 203 | 9мг 204 | бульв 205 | в 206 | вв 207 | г 208 | га 209 | гг 210 | гл 211 | гос 212 | д 213 | дм 214 | доп 215 | др 216 | е 217 | ед 218 | ед 219 | зам 220 | и 221 | инд 222 | исп 223 | Исп 224 | к 225 | кап 226 | кг 227 | кв 228 | кл 229 | км 230 | кол 231 | комн 232 | коп 233 | куб 234 | л 235 | лиц 236 | лл 237 | м 238 | макс 239 | мг 240 | мин 241 | мл 242 | млн 243 | млрд 244 | мм 245 | н 246 | наб 247 | нач 248 | неуд 249 | ном 250 | о 251 | обл 252 | обр 253 | общ 254 | ок 255 | ост 256 | отл 257 | п 258 | пер 259 | перераб 260 | пл 261 | пос 262 | пр 263 | просп 264 | проф 265 | р 266 | ред 267 | руб 268 | с 269 | сб 270 | св 
271 | см 272 | соч 273 | ср 274 | ст 275 | стр 276 | т 277 | тел 278 | Тел 279 | тех 280 | тт 281 | туп 282 | тыс 283 | уд 284 | ул 285 | уч 286 | физ 287 | х 288 | хор 289 | ч 290 | чел 291 | шт 292 | экз 293 | э 294 | -------------------------------------------------------------------------------- /preprocess/nonbreaking_prefixes/nonbreaking_prefix.sk: -------------------------------------------------------------------------------- 1 | Bc 2 | Mgr 3 | RNDr 4 | PharmDr 5 | PhDr 6 | JUDr 7 | PaedDr 8 | ThDr 9 | Ing 10 | MUDr 11 | MDDr 12 | MVDr 13 | Dr 14 | ThLic 15 | PhD 16 | ArtD 17 | ThDr 18 | Dr 19 | DrSc 20 | CSs 21 | prof 22 | obr 23 | Obr 24 | Č 25 | č 26 | absol 27 | adj 28 | admin 29 | adr 30 | Adr 31 | adv 32 | advok 33 | afr 34 | ak 35 | akad 36 | akc 37 | akuz 38 | et 39 | al 40 | alch 41 | amer 42 | anat 43 | angl 44 | Angl 45 | anglosas 46 | anorg 47 | ap 48 | apod 49 | arch 50 | archeol 51 | archit 52 | arg 53 | art 54 | astr 55 | astrol 56 | astron 57 | atp 58 | atď 59 | austr 60 | Austr 61 | aut 62 | belg 63 | Belg 64 | bibl 65 | Bibl 66 | biol 67 | bot 68 | bud 69 | bás 70 | býv 71 | cest 72 | chem 73 | cirk 74 | csl 75 | čs 76 | Čs 77 | dat 78 | dep 79 | det 80 | dial 81 | diaľ 82 | dipl 83 | distrib 84 | dokl 85 | dosl 86 | dopr 87 | dram 88 | duš 89 | dv 90 | dvojčl 91 | dór 92 | ekol 93 | ekon 94 | el 95 | elektr 96 | elektrotech 97 | energet 98 | epic 99 | est 100 | etc 101 | etonym 102 | eufem 103 | európ 104 | Európ 105 | ev 106 | evid 107 | expr 108 | fa 109 | fam 110 | farm 111 | fem 112 | feud 113 | fil 114 | filat 115 | filoz 116 | fi 117 | fon 118 | form 119 | fot 120 | fr 121 | Fr 122 | franc 123 | Franc 124 | fraz 125 | fut 126 | fyz 127 | fyziol 128 | garb 129 | gen 130 | genet 131 | genpor 132 | geod 133 | geogr 134 | geol 135 | geom 136 | germ 137 | gr 138 | Gr 139 | gréc 140 | Gréc 141 | gréckokat 142 | hebr 143 | herald 144 | hist 145 | hlav 146 | hosp 147 | hromad 148 | hud 149 | hypok 150 | ident 151 | i.e 152 | ident 153 | imp 154 | impf 155 | indoeur 156 | inf 157 | inform 158 | instr 159 | int 160 | interj 161 | inšt 162 | inštr 163 | iron 164 | jap 165 | Jap 166 | jaz 167 | jedn 168 | juhoamer 169 | juhových 170 | juhozáp 171 | juž 172 | kanad 173 | Kanad 174 | kanc 175 | kapit 176 | kpt 177 | kart 178 | katastr 179 | knih 180 | kniž 181 | komp 182 | konj 183 | konkr 184 | kozmet 185 | krajč 186 | kresť 187 | kt 188 | kuch 189 | lat 190 | latinskoamer 191 | lek 192 | lex 193 | lingv 194 | lit 195 | litur 196 | log 197 | lok 198 | max 199 | Max 200 | maď 201 | Maď 202 | medzinár 203 | mest 204 | metr 205 | mil 206 | Mil 207 | min 208 | Min 209 | miner 210 | ml 211 | mld 212 | mn 213 | mod 214 | mytol 215 | napr 216 | nar 217 | Nar 218 | nasl 219 | nedok 220 | neg 221 | negat 222 | neklas 223 | nem 224 | Nem 225 | neodb 226 | neos 227 | neskl 228 | nesklon 229 | nespis 230 | nespráv 231 | neved 232 | než 233 | niekt 234 | niž 235 | nom 236 | náb 237 | nákl 238 | námor 239 | nár 240 | obch 241 | obj 242 | obv 243 | obyč 244 | obč 245 | občian 246 | odb 247 | odd 248 | ods 249 | ojed 250 | okr 251 | Okr 252 | opt 253 | opyt 254 | org 255 | os 256 | osob 257 | ot 258 | ovoc 259 | par 260 | part 261 | pejor 262 | pers 263 | pf 264 | Pf 265 | P.f 266 | p.f 267 | pl 268 | Plk 269 | pod 270 | podst 271 | pokl 272 | polit 273 | politol 274 | polygr 275 | pomn 276 | popl 277 | por 278 | porad 279 | porov 280 | posch 281 | potrav 282 | použ 283 | poz 284 | pozit 285 | poľ 286 | poľno 287 | poľnohosp 288 | poľov 289 | pošt 290 | pož 291 | prac 292 | predl 293 | 
pren 294 | prep 295 | preuk 296 | priezv 297 | Priezv 298 | privl 299 | prof 300 | práv 301 | príd 302 | príj 303 | prík 304 | príp 305 | prír 306 | prísl 307 | príslov 308 | príč 309 | psych 310 | publ 311 | pís 312 | písm 313 | pôv 314 | refl 315 | reg 316 | rep 317 | resp 318 | rozk 319 | rozlič 320 | rozpráv 321 | roč 322 | Roč 323 | ryb 324 | rádiotech 325 | rím 326 | samohl 327 | semest 328 | sev 329 | severoamer 330 | severových 331 | severozáp 332 | sg 333 | skr 334 | skup 335 | sl 336 | Sloven 337 | soc 338 | soch 339 | sociol 340 | sp 341 | spol 342 | Spol 343 | spoloč 344 | spoluhl 345 | správ 346 | spôs 347 | st 348 | star 349 | starogréc 350 | starorím 351 | s.r.o 352 | stol 353 | stor 354 | str 355 | stredoamer 356 | stredoškol 357 | subj 358 | subst 359 | superl 360 | sv 361 | sz 362 | súkr 363 | súp 364 | súvzť 365 | tal 366 | Tal 367 | tech 368 | tel 369 | Tel 370 | telef 371 | teles 372 | telev 373 | teol 374 | trans 375 | turist 376 | tuzem 377 | typogr 378 | tzn 379 | tzv 380 | ukaz 381 | ul 382 | Ul 383 | umel 384 | univ 385 | ust 386 | ved 387 | vedľ 388 | verb 389 | veter 390 | vin 391 | viď 392 | vl 393 | vod 394 | vodohosp 395 | pnl 396 | vulg 397 | vyj 398 | vys 399 | vysokoškol 400 | vzťaž 401 | vôb 402 | vých 403 | výd 404 | výrob 405 | výsk 406 | výsl 407 | výtv 408 | výtvar 409 | význ 410 | včel 411 | vš 412 | všeob 413 | zahr 414 | zar 415 | zariad 416 | zast 417 | zastar 418 | zastaráv 419 | zb 420 | zdravot 421 | združ 422 | zjemn 423 | zlat 424 | zn 425 | Zn 426 | zool 427 | zr 428 | zried 429 | zv 430 | záhr 431 | zák 432 | zákl 433 | zám 434 | záp 435 | západoeur 436 | zázn 437 | územ 438 | účt 439 | čast 440 | čes 441 | Čes 442 | čl 443 | čísl 444 | živ 445 | pr 446 | fak 447 | Kr 448 | p.n.l 449 | A 450 | B 451 | C 452 | D 453 | E 454 | F 455 | G 456 | H 457 | I 458 | J 459 | K 460 | L 461 | M 462 | N 463 | O 464 | P 465 | Q 466 | R 467 | S 468 | T 469 | U 470 | V 471 | W 472 | X 473 | Y 474 | Z 475 | -------------------------------------------------------------------------------- /preprocess/nonbreaking_prefixes/nonbreaking_prefix.sl: -------------------------------------------------------------------------------- 1 | dr 2 | Dr 3 | itd 4 | itn 5 | št #NUMERIC_ONLY# 6 | Št #NUMERIC_ONLY# 7 | d 8 | jan 9 | Jan 10 | feb 11 | Feb 12 | mar 13 | Mar 14 | apr 15 | Apr 16 | jun 17 | Jun 18 | jul 19 | Jul 20 | avg 21 | Avg 22 | sept 23 | Sept 24 | sep 25 | Sep 26 | okt 27 | Okt 28 | nov 29 | Nov 30 | dec 31 | Dec 32 | tj 33 | Tj 34 | npr 35 | Npr 36 | sl 37 | Sl 38 | op 39 | Op 40 | gl 41 | Gl 42 | oz 43 | Oz 44 | prev 45 | dipl 46 | ing 47 | prim 48 | Prim 49 | cf 50 | Cf 51 | gl 52 | Gl 53 | A 54 | B 55 | C 56 | D 57 | E 58 | F 59 | G 60 | H 61 | I 62 | J 63 | K 64 | L 65 | M 66 | N 67 | O 68 | P 69 | Q 70 | R 71 | S 72 | T 73 | U 74 | V 75 | W 76 | X 77 | Y 78 | Z 79 | -------------------------------------------------------------------------------- /preprocess/nonbreaking_prefixes/nonbreaking_prefix.sv: -------------------------------------------------------------------------------- 1 | #single upper case letter are usually initials 2 | A 3 | B 4 | C 5 | D 6 | E 7 | F 8 | G 9 | H 10 | I 11 | J 12 | K 13 | L 14 | M 15 | N 16 | O 17 | P 18 | Q 19 | R 20 | S 21 | T 22 | U 23 | V 24 | W 25 | X 26 | Y 27 | Z 28 | #misc abbreviations 29 | AB 30 | G 31 | VG 32 | dvs 33 | etc 34 | from 35 | iaf 36 | jfr 37 | kl 38 | kr 39 | mao 40 | mfl 41 | mm 42 | osv 43 | pga 44 | tex 45 | tom 46 | vs 47 | 
-------------------------------------------------------------------------------- /preprocess/nonbreaking_prefixes/nonbreaking_prefix.ta: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | அ 7 | ஆ 8 | இ 9 | ஈ 10 | உ 11 | ஊ 12 | எ 13 | ஏ 14 | ஐ 15 | ஒ 16 | ஓ 17 | ஔ 18 | ஃ 19 | க 20 | கா 21 | கி 22 | கீ 23 | கு 24 | கூ 25 | கெ 26 | கே 27 | கை 28 | கொ 29 | கோ 30 | கௌ 31 | க் 32 | ச 33 | சா 34 | சி 35 | சீ 36 | சு 37 | சூ 38 | செ 39 | சே 40 | சை 41 | சொ 42 | சோ 43 | சௌ 44 | ச் 45 | ட 46 | டா 47 | டி 48 | டீ 49 | டு 50 | டூ 51 | டெ 52 | டே 53 | டை 54 | டொ 55 | டோ 56 | டௌ 57 | ட் 58 | த 59 | தா 60 | தி 61 | தீ 62 | து 63 | தூ 64 | தெ 65 | தே 66 | தை 67 | தொ 68 | தோ 69 | தௌ 70 | த் 71 | ப 72 | பா 73 | பி 74 | பீ 75 | பு 76 | பூ 77 | பெ 78 | பே 79 | பை 80 | பொ 81 | போ 82 | பௌ 83 | ப் 84 | ற 85 | றா 86 | றி 87 | றீ 88 | று 89 | றூ 90 | றெ 91 | றே 92 | றை 93 | றொ 94 | றோ 95 | றௌ 96 | ற் 97 | ய 98 | யா 99 | யி 100 | யீ 101 | யு 102 | யூ 103 | யெ 104 | யே 105 | யை 106 | யொ 107 | யோ 108 | யௌ 109 | ய் 110 | ர 111 | ரா 112 | ரி 113 | ரீ 114 | ரு 115 | ரூ 116 | ரெ 117 | ரே 118 | ரை 119 | ரொ 120 | ரோ 121 | ரௌ 122 | ர் 123 | ல 124 | லா 125 | லி 126 | லீ 127 | லு 128 | லூ 129 | லெ 130 | லே 131 | லை 132 | லொ 133 | லோ 134 | லௌ 135 | ல் 136 | வ 137 | வா 138 | வி 139 | வீ 140 | வு 141 | வூ 142 | வெ 143 | வே 144 | வை 145 | வொ 146 | வோ 147 | வௌ 148 | வ் 149 | ள 150 | ளா 151 | ளி 152 | ளீ 153 | ளு 154 | ளூ 155 | ளெ 156 | ளே 157 | ளை 158 | ளொ 159 | ளோ 160 | ளௌ 161 | ள் 162 | ழ 163 | ழா 164 | ழி 165 | ழீ 166 | ழு 167 | ழூ 168 | ழெ 169 | ழே 170 | ழை 171 | ழொ 172 | ழோ 173 | ழௌ 174 | ழ் 175 | ங 176 | ஙா 177 | ஙி 178 | ஙீ 179 | ஙு 180 | ஙூ 181 | ஙெ 182 | ஙே 183 | ஙை 184 | ஙொ 185 | ஙோ 186 | ஙௌ 187 | ங் 188 | ஞ 189 | ஞா 190 | ஞி 191 | ஞீ 192 | ஞு 193 | ஞூ 194 | ஞெ 195 | ஞே 196 | ஞை 197 | ஞொ 198 | ஞோ 199 | ஞௌ 200 | ஞ் 201 | ண 202 | ணா 203 | ணி 204 | ணீ 205 | ணு 206 | ணூ 207 | ணெ 208 | ணே 209 | ணை 210 | ணொ 211 | ணோ 212 | ணௌ 213 | ண் 214 | ந 215 | நா 216 | நி 217 | நீ 218 | நு 219 | நூ 220 | நெ 221 | நே 222 | நை 223 | நொ 224 | நோ 225 | நௌ 226 | ந் 227 | ம 228 | மா 229 | மி 230 | மீ 231 | மு 232 | மூ 233 | மெ 234 | மே 235 | மை 236 | மொ 237 | மோ 238 | மௌ 239 | ம் 240 | ன 241 | னா 242 | னி 243 | னீ 244 | னு 245 | னூ 246 | னெ 247 | னே 248 | னை 249 | னொ 250 | னோ 251 | னௌ 252 | ன் 253 | 254 | 255 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 256 | திரு 257 | திருமதி 258 | வண 259 | கௌரவ 260 | 261 | 262 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 263 | உ.ம் 264 | #கா.ம் 265 | #எ.ம் 266 | 267 | 268 | #Numbers only. These should only induce breaks when followed by a numeric sequence 269 | # add NUMERIC_ONLY after the word for this function 270 | #This case is mostly for the english "No." 
which can either be a sentence of its own, or 271 | #if followed by a number, a non-breaking prefix 272 | No #NUMERIC_ONLY# 273 | Nos 274 | Art #NUMERIC_ONLY# 275 | Nr 276 | pp #NUMERIC_ONLY# 277 | -------------------------------------------------------------------------------- /preprocess/normalize-punctuation.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | use strict; 4 | 5 | my ($language) = @ARGV; 6 | 7 | while(<STDIN>) { 8 | s/\r//g; 9 | # remove extra spaces 10 | s/\(/ \(/g; 11 | s/\)/\) /g; s/ +/ /g; 12 | s/\) ([\.\!\:\?\;\,])/\)$1/g; 13 | s/\( /\(/g; 14 | s/ \)/\)/g; 15 | s/(\d) \%/$1\%/g; 16 | s/ :/:/g; 17 | s/ ;/;/g; 18 | # normalize unicode punctuation 19 | s/„/\"/g; 20 | s/“/\"/g; 21 | s/”/\"/g; 22 | s/–/-/g; 23 | s/—/ - /g; s/ +/ /g; 24 | s/´/\'/g; 25 | s/([a-z])‘([a-z])/$1\'$2/gi; 26 | s/([a-z])’([a-z])/$1\'$2/gi; 27 | s/‘/\"/g; 28 | s/‚/\"/g; 29 | s/’/\"/g; 30 | s/''/\"/g; 31 | s/´´/\"/g; 32 | s/…/.../g; 33 | # French quotes 34 | s/ « / \"/g; 35 | s/« /\"/g; 36 | s/«/\"/g; 37 | s/ » /\" /g; 38 | s/ »/\"/g; 39 | s/»/\"/g; 40 | # handle pseudo-spaces 41 | s/ \%/\%/g; 42 | s/nº /nº /g; 43 | s/ :/:/g; 44 | s/ ºC/ ºC/g; 45 | s/ cm/ cm/g; 46 | s/ \?/\?/g; 47 | s/ \!/\!/g; 48 | s/ ;/;/g; 49 | s/, /, /g; s/ +/ /g; 50 | 51 | # English "quotation," followed by comma, style 52 | if ($language eq "en") { 53 | s/\"([,\.]+)/$1\"/g; 54 | } 55 | # Czech is confused 56 | elsif ($language eq "cs" || $language eq "cz") { 57 | } 58 | # German/Spanish/French "quotation", followed by comma, style 59 | else { 60 | s/,\"/\",/g; 61 | s/(\.+)\"(\s*[^<])/\"$1$2/g; # don't fix period at end of sentence 62 | } 63 | 64 | print STDERR $_ if //; 65 | 66 | if ($language eq "de" || $language eq "es" || $language eq "cz" || $language eq "cs" || $language eq "fr") { 67 | s/(\d) (\d)/$1,$2/g; 68 | } 69 | else { 70 | s/(\d) (\d)/$1.$2/g; 71 | } 72 | print $_; 73 | } 74 | -------------------------------------------------------------------------------- /preprocess/preprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # source language (example: fr) 4 | S=$1 5 | # target language (example: en) 6 | T=$2 7 | 8 | # path to dl4mt/data 9 | P1=$3 10 | 11 | # path to subword NMT scripts (can be downloaded from https://github.com/rsennrich/subword-nmt) 12 | P2=$4 13 | 14 | ## merge all parallel corpora 15 | #./merge.sh $1 $2 16 | 17 | perl $P1/normalize-punctuation.perl -l ${S} < all_${S}-${T}.${S} > all_${S}-${T}.${S}.norm # do this for validation and test 18 | perl $P1/normalize-punctuation.perl -l ${T} < all_${S}-${T}.${T} > all_${S}-${T}.${T}.norm # do this for validation and test 19 | 20 | # tokenize 21 | perl $P1/tokenizer_apos.perl -threads 5 -l $S < all_${S}-${T}.${S}.norm > all_${S}-${T}.${S}.tok # do this for validation and test 22 | perl $P1/tokenizer_apos.perl -threads 5 -l $T < all_${S}-${T}.${T}.norm > all_${S}-${T}.${T}.tok # do this for validation and test 23 | 24 | # BPE 25 | if [ ! -f "../${S}.bpe" ]; then 26 | python $P2/learn_bpe.py -s 20000 < all_${S}-${T}.${S}.tok > ../${S}.bpe 27 | fi 28 | if [ ! 
-f "../${T}.bpe" ]; then 29 | python $P2/learn_bpe.py -s 20000 < all_${S}-${T}.${T}.tok > ../${T}.bpe 30 | fi 31 | 32 | python $P2/apply_bpe.py -c ../${S}.bpe < all_${S}-${T}.${S}.tok > all_${S}-${T}.${S}.tok.bpe # do this for validation and test 33 | python $P2/apply_bpe.py -c ../${T}.bpe < all_${S}-${T}.${T}.tok > all_${S}-${T}.${T}.tok.bpe # do this for validation and test 34 | 35 | # shuffle 36 | python $P1/shuffle.py all_${S}-${T}.${S}.tok.bpe all_${S}-${T}.${T}.tok.bpe all_${S}-${T}.${S}.tok all_${S}-${T}.${T}.tok 37 | 38 | # build dictionary 39 | #python $P1/build_dictionary.py all_${S}-${T}.${S}.tok & 40 | #python $P1/build_dictionary.py all_${S}-${T}.${T}.tok & 41 | #python $P1/build_dictionary_word.py all_${S}-${T}.${S}.tok.bpe & 42 | #python $P1/build_dictionary_word.py all_${S}-${T}.${T}.tok.bpe & 43 | -------------------------------------------------------------------------------- /translate/translate_bpe2char.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | import os 4 | import time 5 | 6 | reload(sys) 7 | sys.setdefaultencoding('utf-8') 8 | 9 | sys.path.insert(0, "/misc/kcgscratch1/ChoGroup/jasonlee/dl4mt-c2c/bpe2char") # change appropriately 10 | 11 | import numpy 12 | import cPickle as pkl 13 | from mixer import * 14 | 15 | def translate_model(jobqueue, resultqueue, model, options, k, normalize, build_sampler, gen_sample, init_params, model_id, silent): 16 | 17 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 18 | trng = RandomStreams(1234) 19 | 20 | # allocate model parameters 21 | params = init_params(options) 22 | 23 | # load model parameters and set theano shared variables 24 | params = load_params(model, params) 25 | tparams = init_tparams(params) 26 | 27 | # word index 28 | use_noise = theano.shared(numpy.float32(0.)) 29 | f_init, f_next = build_sampler(tparams, options, trng, use_noise) 30 | 31 | def _translate(seq): 32 | use_noise.set_value(0.) 
33 | # sample given an input sequence and obtain scores 34 | # NOTE : if seq length too small, do something about it 35 | sample, score = gen_sample(tparams, f_init, f_next, 36 | numpy.array(seq).reshape([len(seq), 1]), 37 | options, trng=trng, k=k, maxlen=500, 38 | stochastic=False, argmax=False) 39 | 40 | # normalize scores according to sequence lengths 41 | if normalize: 42 | lengths = numpy.array([len(s) for s in sample]) 43 | score = score / lengths 44 | sidx = numpy.argmin(score) 45 | return sample[sidx] 46 | 47 | while jobqueue: 48 | req = jobqueue.pop(0) 49 | 50 | idx, x = req[0], req[1] 51 | if not silent: 52 | print "sentence", idx, model_id 53 | seq = _translate(x) 54 | 55 | resultqueue.append((idx, seq)) 56 | return 57 | 58 | def main(model, dictionary, dictionary_target, source_file, saveto, k=6, 59 | normalize=False, encoder_chr_level=False, 60 | decoder_chr_level=False, utf8=False, 61 | model_id=None, silent=False,): 62 | 63 | from char_base import (build_sampler, gen_sample, init_params) 64 | 65 | # load model model_options 66 | pkl_file = model.split('.')[0] + '.pkl' 67 | with open(pkl_file, 'rb') as f: 68 | options = pkl.load(f) 69 | 70 | # load source dictionary and invert 71 | with open(dictionary, 'rb') as f: 72 | word_dict = pkl.load(f) 73 | word_idict = dict() 74 | for kk, vv in word_dict.iteritems(): 75 | word_idict[vv] = kk 76 | #word_idict[0] = '' 77 | #word_idict[1] = 'UNK' 78 | 79 | # load target dictionary and invert 80 | with open(dictionary_target, 'rb') as f: 81 | word_dict_trg = pkl.load(f) 82 | word_idict_trg = dict() 83 | for kk, vv in word_dict_trg.iteritems(): 84 | word_idict_trg[vv] = kk 85 | #word_idict_trg[0] = '' 86 | #word_idict_trg[1] = 'UNK' 87 | 88 | # create input and output queues for processes 89 | jobqueue = [] 90 | resultqueue = [] 91 | 92 | # utility function 93 | def _seqs2words(caps): 94 | capsw = [] 95 | for cc in caps: 96 | ww = [] 97 | for w in cc: 98 | if w == 0: 99 | break 100 | if utf8: 101 | ww.append(word_idict_trg[w].encode('utf-8')) 102 | else: 103 | ww.append(word_idict_trg[w]) 104 | if decoder_chr_level: 105 | capsw.append(''.join(ww)) 106 | else: 107 | capsw.append(' '.join(ww)) 108 | return capsw 109 | 110 | def _send_jobs(fname): 111 | with open(fname, 'r') as f: 112 | for idx, line in enumerate(f): 113 | # idx : 0 ... len-1 114 | 115 | if encoder_chr_level: 116 | words = list(line.decode('utf-8').strip()) 117 | else: 118 | words = line.strip().split() 119 | 120 | x = map(lambda w: word_dict[w] if w in word_dict else 1, words) 121 | x = map(lambda ii: ii if ii < options['n_words_src'] else 1, x) 122 | x += [0] 123 | jobqueue.append((idx, x)) 124 | return idx+1 125 | 126 | def _retrieve_jobs(n_samples, silent): 127 | trans = [None] * n_samples 128 | 129 | for idx in xrange(n_samples): 130 | resp = resultqueue.pop(0) 131 | trans[resp[0]] = resp[1] 132 | if numpy.mod(idx, 10) == 0: 133 | if not silent: 134 | print 'Sample ', (idx+1), '/', n_samples, ' Done' 135 | return trans 136 | 137 | print 'Translating ', source_file, '...' 138 | print 'source dic ', dictionary, '...' 139 | print 'target dic ', dictionary_target, '...' 
140 | n_samples = _send_jobs(source_file) 141 | print "jobs sent" 142 | 143 | translate_model(jobqueue, resultqueue, model, options, k, normalize, build_sampler, gen_sample, init_params, model_id, silent) 144 | trans = _seqs2words(_retrieve_jobs(n_samples, silent)) 145 | print "translations retrieved" 146 | 147 | with open(saveto, 'w') as f: 148 | print >>f, u'\n'.join(trans).encode('utf-8') 149 | 150 | print "Done", saveto 151 | 152 | if __name__ == "__main__": 153 | parser = argparse.ArgumentParser() 154 | parser.add_argument('-k', type=int, default=20) # beam width 155 | parser.add_argument('-n', action="store_true", default=True) # normalize scores for different hypothesis based on their length (to penalize shorter hypotheses, longer hypotheses are already penalized by the BLEU measure, which is precision of sorts). 156 | parser.add_argument('-enc_c', action="store_true", default=False) # is encoder character-level? 157 | parser.add_argument('-dec_c', action="store_true", default=True) # is decoder character-level? 158 | parser.add_argument('-utf8', action="store_true", default=True) 159 | parser.add_argument('-many', action="store_true", default=False) # multilingual model? 160 | parser.add_argument('-model', type=str) # absolute path to a model (.npz file) 161 | parser.add_argument('-translate', type=str, help="de_en / cs_en / fi_en / ru_en") # which language? 162 | parser.add_argument('-saveto', type=str) # absolute path where the translation should be saved 163 | parser.add_argument('-which', type=str, help="dev / test1 / test2", default="dev") # if you wish to translate any of development / test1 / test2 file from WMT15, simply specify which one here 164 | parser.add_argument('-source', type=str, default="") # if you wish to provide your own file to be translated, provide an absolute path to the file to be translated 165 | parser.add_argument('-silent', action="store_true", default=False) # suppress progress messages 166 | 167 | args = parser.parse_args() 168 | 169 | which_wmt = None 170 | if args.many: 171 | which_wmt = "multi-wmt15" 172 | else: 173 | which_wmt = "wmt15" 174 | data_path = "/misc/kcgscratch1/ChoGroup/jasonlee/temp_data/%s/" % which_wmt # change appropriately 175 | 176 | if args.which not in "dev test1 test2".split(): 177 | raise Exception('1') 178 | 179 | if args.translate not in ["de_en", "cs_en", "fi_en", "ru_en"]: 180 | raise Exception('1') 181 | 182 | if args.translate == "fi_en" and args.which == "test2": 183 | raise Exception('1') 184 | 185 | if args.many: 186 | from wmt_path_iso9 import * 187 | 188 | dictionary = wmts["many_en"]["dic"][1][0] 189 | dictionary_target = wmts["many_en"]["dic"][0][1] 190 | source = wmts[args.translate][args.which][1][0] 191 | 192 | else: 193 | from wmt_path import * 194 | 195 | aa = args.translate.split("_") 196 | lang = aa[0] 197 | en = aa[1] 198 | 199 | dictionary = "%s%s/train/all_%s-%s.%s.tok.bpe.word.pkl" % (lang, en, lang, en, lang) 200 | dictionary_target = "%s%s/train/all_%s-%s.%s.tok.300.pkl" % (lang, en, lang, en, en) 201 | source = wmts[args.translate][args.which][1][0] 202 | 203 | # /work/yl1363/bpe2char/de_en/deen_bpe2char_two_layer_gru_decoder_adam.grads.355000.npz 204 | model_id = args.model.split('/')[-1] 205 | 206 | dictionary = data_path + dictionary 207 | dictionary_target = data_path + dictionary_target 208 | source = data_path + source 209 | 210 | if args.source != "": 211 | source = args.source 212 | 213 | print "src dict:", dictionary 214 | print "trg dict:", dictionary_target 215 | print "source:", source 
216 | 217 | print "dest :", args.saveto 218 | 219 | print args 220 | 221 | time1 = time.time() 222 | main(args.model, dictionary, dictionary_target, source, 223 | args.saveto, k=args.k, normalize=args.n, encoder_chr_level=args.enc_c, 224 | decoder_chr_level=args.dec_c, 225 | utf8=args.utf8, 226 | model_id = model_id, 227 | silent=args.silent, 228 | ) 229 | time2 = time.time() 230 | duration = (time2-time1)/float(60) 231 | print("Translation took %.2f minutes" % duration) 232 | -------------------------------------------------------------------------------- /translate/translate_char2char.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | import os 4 | import time 5 | 6 | reload(sys) 7 | sys.setdefaultencoding('utf-8') 8 | 9 | sys.path.insert(0, "/misc/kcgscratch1/ChoGroup/jasonlee/dl4mt-c2c/char2char") # change appropriately 10 | 11 | import numpy 12 | import cPickle as pkl 13 | from mixer import * 14 | 15 | def translate_model(jobqueue, resultqueue, model, options, k, normalize, build_sampler, gen_sample, init_params, model_id, silent): 16 | 17 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 18 | trng = RandomStreams(1234) 19 | 20 | # allocate model parameters 21 | params = init_params(options) 22 | 23 | # load model parameters and set theano shared variables 24 | params = load_params(model, params) 25 | tparams = init_tparams(params) 26 | 27 | # word index 28 | use_noise = theano.shared(numpy.float32(0.)) 29 | f_init, f_next = build_sampler(tparams, options, trng, use_noise) 30 | 31 | def _translate(seq): 32 | use_noise.set_value(0.) 33 | # sample given an input sequence and obtain scores 34 | # NOTE : if seq length too small, do something about it 35 | sample, score = gen_sample(tparams, f_init, f_next, 36 | numpy.array(seq).reshape([len(seq), 1]), 37 | options, trng=trng, k=k, maxlen=500, 38 | stochastic=False, argmax=False) 39 | 40 | # normalize scores according to sequence lengths 41 | if normalize: 42 | lengths = numpy.array([len(s) for s in sample]) 43 | score = score / lengths 44 | sidx = numpy.argmin(score) 45 | return sample[sidx] 46 | 47 | while jobqueue: 48 | req = jobqueue.pop(0) 49 | 50 | idx, x = req[0], req[1] 51 | if not silent: 52 | print "sentence", idx, model_id 53 | seq = _translate(x) 54 | 55 | resultqueue.append((idx, seq)) 56 | return 57 | 58 | def main(model, dictionary, dictionary_target, source_file, saveto, k=5, 59 | normalize=False, encoder_chr_level=False, 60 | decoder_chr_level=False, utf8=False, 61 | model_id=None, silent=False): 62 | 63 | from char_base import (build_sampler, gen_sample, init_params) 64 | 65 | # load model model_options 66 | # /misc/kcgscratch1/ChoGroup/jasonlee/dl4mt-cdec/models/one-multiscale-conv-two-hw-lngru-1234567-100-150-200-200-200-200-200-66-one.pkl 67 | pkl_file = model.split('.')[0] + '.pkl' 68 | with open(pkl_file, 'rb') as f: 69 | options = pkl.load(f) 70 | 71 | # load source dictionary and invert 72 | with open(dictionary, 'rb') as f: 73 | word_dict = pkl.load(f) 74 | word_idict = dict() 75 | for kk, vv in word_dict.iteritems(): 76 | word_idict[vv] = kk 77 | #word_idict[0] = 'ZERO' 78 | #word_idict[1] = 'UNK' 79 | 80 | # load target dictionary and invert 81 | with open(dictionary_target, 'rb') as f: 82 | word_dict_trg = pkl.load(f) 83 | word_idict_trg = dict() 84 | for kk, vv in word_dict_trg.iteritems(): 85 | word_idict_trg[vv] = kk 86 | #word_idict_trg[0] = 'ZERO' 87 | #word_idict_trg[1] = 'UNK' 88 | 89 | # create input and 
output queues for processes 90 | jobqueue = [] 91 | resultqueue = [] 92 | 93 | # utility function 94 | def _seqs2words(caps): 95 | capsw = [] 96 | for cc in caps: 97 | ww = [] 98 | for w in cc: 99 | if w == 0: 100 | break 101 | if utf8: 102 | ww.append(word_idict_trg[w].encode('utf-8')) 103 | else: 104 | ww.append(word_idict_trg[w]) 105 | if decoder_chr_level: 106 | capsw.append(''.join(ww)) 107 | else: 108 | capsw.append(' '.join(ww)) 109 | return capsw 110 | 111 | def _send_jobs(fname): 112 | with open(fname, 'r') as f: 113 | for idx, line in enumerate(f): 114 | # idx : 0 ... len-1 115 | pool_window = options['pool_stride'] 116 | 117 | if encoder_chr_level: 118 | words = list(line.decode('utf-8').strip()) 119 | else: 120 | words = line.strip().split() 121 | 122 | x = map(lambda w: word_dict[w] if w in word_dict else 1, words) 123 | x = map(lambda ii: ii if ii < options['n_words_src'] else 1, x) 124 | x = [2] + x + [3] 125 | 126 | # len : 77, pool_window 10 -> 3 127 | # len : 80, pool_window 10 -> 0 128 | #rem = pool_window - ( len(x) % pool_window ) 129 | #if rem < pool_window: 130 | # x += [0]*rem 131 | 132 | while len(x) % pool_window != 0: 133 | x += [0] 134 | 135 | x = [0]*pool_window + x + [0]*pool_window 136 | 137 | jobqueue.append((idx, x)) 138 | 139 | return idx+1 140 | 141 | def _retrieve_jobs(n_samples, silent): 142 | trans = [None] * n_samples 143 | 144 | for idx in xrange(n_samples): 145 | resp = resultqueue.pop(0) 146 | trans[resp[0]] = resp[1] 147 | if numpy.mod(idx, 10) == 0: 148 | if not silent: 149 | print 'Sample ', (idx+1), '/', n_samples, ' Done', model_id 150 | return trans 151 | 152 | print 'Translating ', source_file, '...' 153 | n_samples = _send_jobs(source_file) 154 | print "jobs sent" 155 | 156 | translate_model(jobqueue, resultqueue, model, options, k, normalize, build_sampler, gen_sample, init_params, model_id, silent) 157 | trans = _seqs2words(_retrieve_jobs(n_samples, silent)) 158 | print "translations retrieved" 159 | 160 | with open(saveto, 'w') as f: 161 | print >>f, u'\n'.join(trans).encode('utf-8') 162 | 163 | print "Done", saveto 164 | 165 | if __name__ == "__main__": 166 | parser = argparse.ArgumentParser() 167 | parser.add_argument('-k', type=int, default=20) # beam width 168 | parser.add_argument('-n', action="store_true", default=True) # normalize scores for different hypothesis based on their length (to penalize shorter hypotheses, longer hypotheses are already penalized by the BLEU measure, which is precision of sorts). 169 | parser.add_argument('-enc_c', action="store_true", default=True) # is encoder character-level? 170 | parser.add_argument('-dec_c', action="store_true", default=True) # is decoder character-level? 171 | parser.add_argument('-utf8', action="store_true", default=True) 172 | parser.add_argument('-many', action="store_true", default=False) # multilingual model? 173 | parser.add_argument('-model', type=str) # absolute path to a model (.npz file) 174 | parser.add_argument('-translate', type=str, help="de_en / cs_en / fi_en / ru_en") # which language? 
175 | parser.add_argument('-saveto', type=str, ) # absolute path where the translation should be saved 176 | parser.add_argument('-which', type=str, help="dev / test1 / test2", default="dev") # if you wish to translate any of development / test1 / test2 file from WMT15, simply specify which one here 177 | parser.add_argument('-source', type=str, default="") # if you wish to provide your own file to be translated, provide an absolute path to the file to be translated 178 | parser.add_argument('-silent', action="store_true", default=False) # suppress progress messages 179 | 180 | args = parser.parse_args() 181 | 182 | which_wmt = None 183 | if args.many: 184 | which_wmt = "multi-wmt15" 185 | else: 186 | which_wmt = "wmt15" 187 | 188 | data_path = "/misc/kcgscratch1/ChoGroup/jasonlee/temp_data/%s/" % which_wmt # change appropriately 189 | 190 | if args.which not in "dev test1 test2".split(): 191 | raise Exception('1') 192 | 193 | if args.translate not in ["de_en", "cs_en", "fi_en", "ru_en"]: 194 | raise Exception('1') 195 | 196 | if args.translate == "fi_en" and args.which == "test2": 197 | raise Exception('1') 198 | 199 | if args.many: 200 | from wmt_path_iso9 import * 201 | 202 | dictionary = wmts['many_en']['dic'][0][0] 203 | dictionary_target = wmts['many_en']['dic'][0][1] 204 | source = wmts[args.translate][args.which][0][0] 205 | 206 | else: 207 | from wmt_path import * 208 | 209 | aa = args.translate.split("_") 210 | lang = aa[0] 211 | en = aa[1] 212 | 213 | dictionary = "%s%s/train/all_%s-%s.%s.tok.304.pkl" % (lang, en, lang, en, lang) 214 | dictionary_target = "%s%s/train/all_%s-%s.%s.tok.300.pkl" % (lang, en, lang, en, en) 215 | source = wmts[args.translate][args.which][0][0] 216 | 217 | char_base = args.model.split("/")[-1] 218 | 219 | dictionary = data_path + dictionary 220 | dictionary_target = data_path + dictionary_target 221 | source = data_path + source 222 | 223 | if args.source != "": 224 | source = args.source 225 | 226 | print "src dict:", dictionary 227 | print "trg dict:", dictionary_target 228 | print "source:", source 229 | print "dest :", args.saveto 230 | 231 | print args 232 | 233 | time1 = time.time() 234 | main(args.model, dictionary, dictionary_target, source, 235 | args.saveto, k=args.k, normalize=args.n, encoder_chr_level=args.enc_c, 236 | decoder_chr_level=args.dec_c, 237 | utf8=args.utf8, 238 | model_id=char_base, 239 | silent=args.silent, 240 | ) 241 | time2 = time.time() 242 | duration = (time2-time1)/float(60) 243 | print("Translation took %.2f minutes" % duration) 244 | --------------------------------------------------------------------------------
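The two decoding scripts above share the same command-line interface, so a single usage sketch covers both. Everything below is illustrative: the model and output paths are placeholders, the hard-coded `sys.path.insert(...)` and `data_path` entries near the top of each script still need to be edited (as their `# change appropriately` comments indicate), and the training-time options pickle (looked up as `model.split('.')[0] + '.pkl'`) must be present alongside the `.npz` weights.

```bash
# Decode the WMT'15 DE-EN dev set with a bilingual char2char model
# (paths are placeholders -- substitute your own model / output locations).
$ python translate/translate_char2char.py \
    -model /path/to/models/deen_bichar2char.npz \
    -translate de_en -which dev -k 20 \
    -saveto /path/to/output/deen_dev.char2char.out

# Same interface for bpe2char models; add -many for the multilingual models,
# or -source /path/to/your/tokenized.txt to translate an arbitrary input file
# instead of a WMT dev/test set.
$ python translate/translate_bpe2char.py \
    -model /path/to/models/deen_bibpe2char.npz \
    -translate de_en -which test1 \
    -saveto /path/to/output/deen_test1.bpe2char.out
```

Beam width is set with `-k`, `-n` normalizes hypothesis scores by length (both scripts default it to on), and `-silent` suppresses the per-sentence progress messages.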