├── LICENSE
├── README.md
├── bpe2char
│   ├── char_base.py
│   ├── char_base_multi_b2c.py
│   ├── data_iterator.py
│   ├── many_data_iterator.py
│   ├── mixer.py
│   ├── nmt.py
│   ├── nmt_many.py
│   ├── print_batch.py
│   ├── train_bi_bpe2char.py
│   ├── train_multi_bpe2char.py
│   ├── wmt15_manyen_bpe2char_adam.txt
│   └── wmt_path.py
├── char2char
│   ├── char_base.py
│   ├── conv_tools.py
│   ├── data_iterator.py
│   ├── many_data_iterator.py
│   ├── mixer.py
│   ├── nmt.py
│   ├── nmt_many.py
│   ├── prepare_data.py
│   ├── print_batch.py
│   ├── train_bi_char2char.py
│   ├── train_multi_char2char.py
│   ├── wmt_path.py
│   └── wmt_path_iso9.py
├── preprocess
│   ├── build_dictionary_char.py
│   ├── build_dictionary_word.py
│   ├── clean_tags.py
│   ├── fix_appo.sh
│   ├── iso.py
│   ├── iso9
│   ├── merge.sh
│   ├── multi-bleu.perl
│   ├── nonbreaking_prefixes
│   │   ├── README.txt
│   │   ├── nonbreaking_prefix.ca
│   │   ├── nonbreaking_prefix.cs
│   │   ├── nonbreaking_prefix.de
│   │   ├── nonbreaking_prefix.el
│   │   ├── nonbreaking_prefix.en
│   │   ├── nonbreaking_prefix.es
│   │   ├── nonbreaking_prefix.fi
│   │   ├── nonbreaking_prefix.fr
│   │   ├── nonbreaking_prefix.hu
│   │   ├── nonbreaking_prefix.is
│   │   ├── nonbreaking_prefix.it
│   │   ├── nonbreaking_prefix.lv
│   │   ├── nonbreaking_prefix.nl
│   │   ├── nonbreaking_prefix.pl
│   │   ├── nonbreaking_prefix.pt
│   │   ├── nonbreaking_prefix.ro
│   │   ├── nonbreaking_prefix.ru
│   │   ├── nonbreaking_prefix.sk
│   │   ├── nonbreaking_prefix.sl
│   │   ├── nonbreaking_prefix.sv
│   │   └── nonbreaking_prefix.ta
│   ├── normalize-punctuation.perl
│   ├── preprocess.sh
│   ├── tokenizer.perl
│   └── tokenizer_apos.perl
└── translate
    ├── translate_bpe2char.py
    └── translate_char2char.py

/LICENSE:
--------------------------------------------------------------------------------
Copyright (c) 2016, Jason Lee and New York University (Kyunghyun Cho)
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of dl4mt-c2c nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Fully Character-Level Neural Machine Translation
==================================

Theano implementation of the models described in the paper [Fully Character-Level Neural Machine Translation without Explicit Segmentation](https://arxiv.org/abs/1610.03017 "Fully Character-Level Neural Machine Translation without Explicit Segmentation").

We present code for training and decoding four different models:

1. bilingual bpe2char (from [Chung et al., 2016](https://arxiv.org/abs/1603.06147))
2. bilingual char2char
3. multilingual bpe2char
4. multilingual char2char

Dependencies
------------------
### Python
* Theano
* Numpy
* NLTK

### GPU
* CUDA (we recommend the latest version; version 8.0 was used in all our experiments)

### Related code
* For preprocessing and evaluation, we used scripts from [MOSES](https://github.com/moses-smt/mosesdecoder "MOSES").
* This code is based on [Subword-NMT](http://arxiv.org/abs/1508.07909 "Subword-NMT") and [dl4mt-cdec](https://github.com/nyu-dl/dl4mt-cdec "dl4mt-cdec").

Downloading Datasets & Pre-trained Models
------------------
The original WMT'15 corpora can be downloaded from [here](http://www.statmt.org/wmt15/translation-task.html). The preprocessed corpora used in our experiments are available below.
* WMT'15 preprocessed corpora
  * [Standard version (for bilingual models, 3.5GB)](https://drive.google.com/open?id=0BxmEQ91VZAPQam5pc2ltQ1BBTTQ)
  * [Cyrillic converted to Latin (for multilingual models, 2.6GB)](https://drive.google.com/open?id=0BxmEQ91VZAPQS0oxTDJINng5b1k)

The pre-trained top-performing models are available below.
* [Pre-trained models (6.0GB)](https://drive.google.com/open?id=0BxmEQ91VZAPQcGx4VGI2N3dMNEE): **Tarball updated** on Nov 21st 2016. The CS-EN bi-char2char model in the previous tarball was not the best-performing model.

Training Details
------------------
### Using GPUs
Do the following before executing `train*.py`:
```bash
$ export THEANO_FLAGS=device=gpu,floatX=float32
```
If your GPU has enough memory, enabling `cnmem` may speed up training:
```bash
$ export THEANO_FLAGS=device=gpu,floatX=float32,lib.cnmem=0.95,allow_gc=False
```

On a pre-2016 Titan X GPU with 12GB of RAM, our bpe2char models were trained with `cnmem`, while our char2char models (both bilingual and multilingual) were trained without it (due to lack of RAM).

### Training models
Before executing the following, modify `train*.py` so that it points to the directory containing the WMT'15 corpora.

#### Bilingual bpe2char
```bash
$ python bpe2char/train_bi_bpe2char.py -translate <LANGUAGE_PAIR>
```
where `<LANGUAGE_PAIR>` is one of `de_en`, `cs_en`, `fi_en` or `ru_en`.
#### Bilingual char2char
```bash
$ python char2char/train_bi_char2char.py -translate <LANGUAGE_PAIR>
```
#### Multilingual bpe2char
```bash
$ python bpe2char/train_multi_bpe2char.py
```
#### Multilingual char2char
```bash
$ python char2char/train_multi_char2char.py
```
#### Checkpoint
To resume training a model from a checkpoint, simply append `-re_load` and `-re_load_old_setting` to the commands above. Make sure the checkpoint resides in the correct directory (`.../dl4mt-c2c/models`).
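For example, a minimal sketch of resuming the bilingual DE-EN bpe2char run from its latest checkpoint, assuming the save and data paths inside `train_bi_bpe2char.py` have already been edited for your machine:
```bash
$ python bpe2char/train_bi_bpe2char.py -translate de_en -re_load -re_load_old_setting
```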
### Using Custom Datasets
To train models on your own dataset (rather than the WMT'15 corpora), you first need to build your vocabulary with `preprocess/build_dictionary_char.py` (for char2char models) or `preprocess/build_dictionary_word.py` (for bpe2char models). For the bpe2char model, you additionally need to learn your BPE segmentation rules on the source corpus using the Subword-NMT repository (see below).

Decoding
------------------

### Decoding WMT'15 validation / test files
Before executing the following, modify `translate*.py` so that it points to the directory containing the WMT'15 corpora.

```bash
$ export THEANO_FLAGS=device=gpu,floatX=float32,lib.cnmem=0.95,allow_gc=False
$ python translate/translate_bpe2char.py -model <PATH_TO_MODEL> -translate <LANGUAGE_PAIR> -saveto <OUTPUT_FILE> -which <VALID/TEST_SET> # for bpe2char models
$ python translate/translate_char2char.py -model <PATH_TO_MODEL> -translate <LANGUAGE_PAIR> -saveto <OUTPUT_FILE> -which <VALID/TEST_SET> # for char2char models
```

When choosing which pre-trained model to pass to `-model`, make sure to pick one with `.grads` in its name, e.g. `.grads.123000.npz`. The `.grads` models are the best-performing checkpoints, and you should decode from those.

### Decoding an arbitrary file
Remove `-which <VALID/TEST_SET>` and append `-source <PATH_TO_SOURCE_FILE>`.

If you decode your own source file, make sure it is:

1. properly tokenized (using `preprocess/preprocess.sh`);
2. BPE-tokenized, for bpe2char models;
3. converted from Cyrillic to Latin, for multilingual models.

### Decoding multilingual models
Append `-many` (and, of course, provide a path to a multilingual model for `-model`).

Evaluation
------------------
We use the `multi-bleu.perl` script from MOSES to compute BLEU scores. The reference translations can be found in `.../wmt15`.
```bash
perl preprocess/multi-bleu.perl reference.txt < model_output.txt
```

Extra
-----------------
### Extracting & applying BPE rules

Clone the Subword-NMT repository:
```bash
git clone https://github.com/rsennrich/subword-nmt
```

Use the following commands (more details in [Subword-NMT](https://github.com/rsennrich/subword-nmt)):
```bash
./learn_bpe.py -s {num_operations} < {train_file} > {codes_file}
./apply_bpe.py -c {codes_file} < {test_file}
```

### Converting Cyrillic to Latin

```bash
$ python preprocess/iso.py russian_source.txt
```
This will produce an output file at `russian_source.txt.iso9`.
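Putting the pieces above together, here is a hypothetical end-to-end sketch for decoding your own Russian source file with a bilingual bpe2char model. All file names, the BPE codes file and the checkpoint name are placeholders; the flags are the ones described above, and the preprocessed corpora use 50,000 BPE merge operations (the `.bpe.50000` suffix).
```bash
# learn BPE codes on the tokenized training source (50,000 merge operations)
$ ./subword-nmt/learn_bpe.py -s 50000 < train.ru.tok > ru.codes.50000
# apply the codes to the tokenized file you want to translate
$ ./subword-nmt/apply_bpe.py -c ru.codes.50000 < my_input.ru.tok > my_input.ru.tok.bpe
# decode with a pre-trained bilingual bpe2char checkpoint (note the .grads file)
$ python translate/translate_bpe2char.py -model models/ru_en/bi-bpe2char.grads.123000.npz \
    -translate ru_en -saveto my_output.en -source my_input.ru.tok.bpe
```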
130 | 131 | Citation 132 | ------------------ 133 | 134 | ``` 135 | @article{Lee:16, 136 | author = {Jason Lee and Kyunghyun Cho and Thomas Hofmann}, 137 | title = {Fully Character-Level Neural Machine Translation without Explicit Segmentation}, 138 | year = {2016}, 139 | journal = {arXiv preprint arXiv:1610.03017}, 140 | } 141 | ``` 142 | -------------------------------------------------------------------------------- /bpe2char/char_base.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Build a simple neural language model using GRU units 3 | ''' 4 | import theano 5 | from theano import tensor 6 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 7 | 8 | import cPickle 9 | import numpy 10 | import copy 11 | 12 | import os 13 | import warnings 14 | import sys 15 | import time 16 | 17 | from collections import OrderedDict 18 | from mixer import * 19 | 20 | 21 | def init_params(options): 22 | params = OrderedDict() 23 | 24 | print "source dictionary size: %d" % options['n_words_src'] 25 | # embedding 26 | params['Wemb'] = norm_weight(options['n_words_src'], options['dim_word_src']) 27 | params['Wemb_dec'] = norm_weight(options['n_words'], options['dim_word']) 28 | 29 | # encoder 30 | params = get_layer('gru')[0](options, params, 31 | prefix='encoder', 32 | nin=options['dim_word_src'], 33 | dim=options['enc_dim']) 34 | params = get_layer('gru')[0](options, params, 35 | prefix='encoderr', 36 | nin=options['dim_word_src'], 37 | dim=options['enc_dim']) 38 | ctxdim = 2 * options['enc_dim'] 39 | 40 | # init_state of decoder 41 | params = get_layer('ff')[0](options, params, 42 | prefix='ff_init_state_char', 43 | nin=ctxdim, 44 | nout=options['dec_dim']) 45 | params = get_layer('ff')[0](options, params, 46 | prefix='ff_init_state_word', 47 | nin=ctxdim, 48 | nout=options['dec_dim']) 49 | 50 | print "target dictionary size: %d" % options['n_words'] 51 | # decoder 52 | params = get_layer('two_layer_gru_decoder')[0](options, params, 53 | prefix='decoder', 54 | nin=options['dim_word'], 55 | dim_char=options['dec_dim'], 56 | dim_word=options['dec_dim'], 57 | dimctx=ctxdim) 58 | 59 | # readout 60 | params = get_layer('fff')[0](options, params, prefix='ff_logit_rnn', 61 | nin1=options['dec_dim'], nin2=options['dec_dim'], 62 | nout=options['dim_word'], ortho=False) 63 | params = get_layer('ff')[0](options, params, prefix='ff_logit_prev', 64 | nin=options['dim_word'], 65 | nout=options['dim_word'], 66 | ortho=False) 67 | params = get_layer('ff')[0](options, params, prefix='ff_logit_ctx', 68 | nin=ctxdim, 69 | nout=options['dim_word'], 70 | ortho=False) 71 | params = get_layer('ff')[0](options, params, prefix='ff_logit', 72 | nin=options['dim_word'], 73 | nout=options['n_words']) 74 | 75 | return params 76 | 77 | 78 | def build_model(tparams, options): 79 | opt_ret = OrderedDict() 80 | 81 | trng = RandomStreams(numpy.random.RandomState(numpy.random.randint(1024)).randint(numpy.iinfo(numpy.int32).max)) 82 | use_noise = theano.shared(numpy.float32(0.)) 83 | 84 | # description string: #words x #samples 85 | x = tensor.matrix('x', dtype='int64') 86 | x_mask = tensor.matrix('x_mask', dtype='float32') 87 | y = tensor.matrix('y', dtype='int64') 88 | y_mask = tensor.matrix('y_mask', dtype='float32') 89 | x.tag.test_value = numpy.zeros((5, 63), dtype='int64') 90 | x_mask.tag.test_value = numpy.ones((5, 63), dtype='float32') 91 | y.tag.test_value = numpy.zeros((7, 63), dtype='int64') 92 | y_mask.tag.test_value = numpy.ones((7, 63), dtype='float32') 93 | 
94 | xr = x[::-1] 95 | xr_mask = x_mask[::-1] 96 | 97 | n_samples = x.shape[1] 98 | n_timesteps = x.shape[0] 99 | n_timesteps_trg = y.shape[0] 100 | 101 | # word embedding for forward RNN (source) 102 | emb = tparams['Wemb'][x.flatten()] 103 | emb = emb.reshape([n_timesteps, n_samples, options['dim_word_src']]) 104 | 105 | # word embedding for backward RNN (source) 106 | embr = tparams['Wemb'][xr.flatten()] 107 | embr = embr.reshape([n_timesteps, n_samples, options['dim_word_src']]) 108 | 109 | # pass through gru layer, recurrence here 110 | proj = get_layer('gru')[1](tparams, emb, options, 111 | prefix='encoder', mask=x_mask) 112 | projr = get_layer('gru')[1](tparams, embr, options, 113 | prefix='encoderr', mask=xr_mask) 114 | 115 | # context 116 | ctx = concatenate([proj, projr[::-1]], axis=proj.ndim-1) 117 | 118 | # context mean 119 | ctx_mean = (ctx * x_mask[:, :, None]).sum(0) / x_mask.sum(0)[:, None] 120 | 121 | # initial decoder state 122 | init_state_char = get_layer('ff')[1](tparams, ctx_mean, options, 123 | prefix='ff_init_state_char', activ='tanh') 124 | init_state_word = get_layer('ff')[1](tparams, ctx_mean, options, 125 | prefix='ff_init_state_word', activ='tanh') 126 | 127 | # word embedding and shifting for targets 128 | yemb = tparams['Wemb_dec'][y.flatten()] 129 | yemb = yemb.reshape([n_timesteps_trg, n_samples, options['dim_word']]) 130 | yemb_shited = tensor.zeros_like(yemb) 131 | yemb_shited = tensor.set_subtensor(yemb_shited[1:], yemb[:-1]) 132 | yemb = yemb_shited 133 | 134 | char_h, word_h, ctxs, alphas = \ 135 | get_layer('two_layer_gru_decoder')[1](tparams, yemb, options, 136 | prefix='decoder', 137 | mask=y_mask, 138 | context=ctx, 139 | context_mask=x_mask, 140 | one_step=False, 141 | init_state_char=init_state_char, 142 | init_state_word=init_state_word) 143 | 144 | opt_ret['dec_alphas'] = alphas 145 | 146 | # compute word probabilities 147 | logit_rnn = get_layer('fff')[1](tparams, char_h, word_h, options, 148 | prefix='ff_logit_rnn', activ='linear') 149 | logit_prev = get_layer('ff')[1](tparams, yemb, options, 150 | prefix='ff_logit_prev', activ='linear') 151 | logit_ctx = get_layer('ff')[1](tparams, ctxs, options, 152 | prefix='ff_logit_ctx', activ='linear') 153 | logit = tensor.tanh(logit_rnn + logit_prev + logit_ctx) 154 | 155 | if options['use_dropout']: 156 | print 'Using dropout' 157 | logit = dropout_layer(logit, use_noise, trng) 158 | 159 | logit = get_layer('ff')[1](tparams, logit, options, 160 | prefix='ff_logit', activ='linear') 161 | logit_shp = logit.shape 162 | probs = tensor.nnet.softmax(logit.reshape([logit_shp[0]*logit_shp[1], logit_shp[2]])) 163 | 164 | # cost 165 | y_flat = y.flatten() 166 | y_flat_idx = tensor.arange(y_flat.shape[0]) * options['n_words'] + y_flat 167 | cost = -tensor.log(probs.flatten()[y_flat_idx]) 168 | cost = cost.reshape([y.shape[0], y.shape[1]]) 169 | cost = (cost * y_mask).sum(0) 170 | 171 | return trng, use_noise, x, x_mask, y, y_mask, opt_ret, cost 172 | 173 | 174 | def build_sampler(tparams, options, trng, use_noise): 175 | x = tensor.matrix('x', dtype='int64') 176 | xr = x[::-1] 177 | 178 | n_timesteps = x.shape[0] 179 | n_samples = x.shape[1] 180 | 181 | emb = tparams['Wemb'][x.flatten()] 182 | emb = emb.reshape([n_timesteps, n_samples, options['dim_word_src']]) 183 | embr = tparams['Wemb'][xr.flatten()] 184 | embr = embr.reshape([n_timesteps, n_samples, options['dim_word_src']]) 185 | 186 | proj = get_layer('gru')[1](tparams, emb, options, prefix='encoder') 187 | projr = get_layer('gru')[1](tparams, embr, 
options, prefix='encoderr') 188 | 189 | ctx = concatenate([proj, projr[::-1]], axis=proj.ndim-1) 190 | ctx_mean = ctx.mean(0) 191 | 192 | init_state_char = get_layer('ff')[1](tparams, ctx_mean, options, 193 | prefix='ff_init_state_char', activ='tanh') 194 | init_state_word = get_layer('ff')[1](tparams, ctx_mean, options, 195 | prefix='ff_init_state_word', activ='tanh') 196 | 197 | print 'Building f_init...', 198 | outs = [init_state_char, init_state_word, ctx] 199 | f_init = theano.function([x], outs, name='f_init', profile=profile) 200 | print 'Done' 201 | 202 | y = tensor.vector('y_sampler', dtype='int64') 203 | init_state_char = tensor.matrix('init_state_char', dtype='float32') 204 | init_state_word = tensor.matrix('init_state_word', dtype='float32') 205 | 206 | # if it's the first word, emb should be all zero and it is indicated by -1 207 | yemb = tensor.switch(y[:, None] < 0, 208 | tensor.alloc(0., 1, tparams['Wemb_dec'].shape[1]), 209 | tparams['Wemb_dec'][y]) 210 | 211 | next_state_char, next_state_word, next_ctx, next_alpha = \ 212 | get_layer('two_layer_gru_decoder')[1](tparams, yemb, options, 213 | prefix='decoder', 214 | context=ctx, 215 | mask=None, 216 | one_step=True, 217 | init_state_char=init_state_char, 218 | init_state_word=init_state_word) 219 | 220 | logit_rnn = get_layer('fff')[1](tparams, 221 | next_state_char, 222 | next_state_word, 223 | options, 224 | prefix='ff_logit_rnn', 225 | activ='linear') 226 | logit_prev = get_layer('ff')[1](tparams, 227 | yemb, 228 | options, 229 | prefix='ff_logit_prev', 230 | activ='linear') 231 | logit_ctx = get_layer('ff')[1](tparams, 232 | next_ctx, 233 | options, 234 | prefix='ff_logit_ctx', 235 | activ='linear') 236 | logit = tensor.tanh(logit_rnn + logit_prev + logit_ctx) 237 | 238 | if options['use_dropout']: 239 | print 'Sampling for dropoutted model' 240 | logit = dropout_layer(logit, use_noise, trng) 241 | 242 | logit = get_layer('ff')[1](tparams, logit, options, 243 | prefix='ff_logit', 244 | activ='linear') 245 | next_probs = tensor.nnet.softmax(logit) 246 | next_sample = trng.multinomial(pvals=next_probs).argmax(1) 247 | 248 | # next word probability 249 | print 'Building f_next...', 250 | inps = [y, ctx, init_state_char, init_state_word] 251 | outs = [next_probs, next_sample, next_state_char, next_state_word] 252 | f_next = theano.function(inps, outs, name='f_next', profile=profile) 253 | print 'Done' 254 | 255 | return f_init, f_next 256 | 257 | 258 | def gen_sample(tparams, f_init, f_next, x, options, trng=None, 259 | k=1, maxlen=500, stochastic=True, argmax=False): 260 | 261 | # k is the beam size we have 262 | if k > 1: 263 | assert not stochastic, \ 264 | 'Beam search does not support stochastic sampling' 265 | 266 | sample = [] 267 | sample_score = [] 268 | if stochastic: 269 | sample_score = 0 270 | 271 | live_k = 1 272 | dead_k = 0 273 | 274 | hyp_samples = [[]] * live_k 275 | hyp_scores = numpy.zeros(live_k).astype('float32') 276 | hyp_states = [] 277 | 278 | # get initial state of decoder rnn and encoder context 279 | ret = f_init(x) 280 | next_state_char, next_state_word, ctx0 = ret[0], ret[1], ret[2] 281 | next_w = -1 * numpy.ones((1,)).astype('int64') # bos indicator 282 | 283 | for ii in xrange(maxlen): 284 | ctx = numpy.tile(ctx0, [live_k, 1]) 285 | inps = [next_w, ctx, next_state_char, next_state_word] 286 | ret = f_next(*inps) 287 | next_p, next_w, next_state_char, next_state_word = ret[0], ret[1], ret[2], ret[3] 288 | if stochastic: 289 | if argmax: 290 | nw = next_p[0].argmax() 291 | else: 292 | nw = 
next_w[0] 293 | sample.append(nw) 294 | sample_score += next_p[0, nw] 295 | if nw == 0: 296 | break 297 | else: 298 | cand_scores = hyp_scores[:, None] - numpy.log(next_p) 299 | cand_flat = cand_scores.flatten() 300 | ranks_flat = cand_flat.argsort()[:(k-dead_k)] 301 | 302 | voc_size = next_p.shape[1] 303 | trans_indices = ranks_flat / voc_size 304 | word_indices = ranks_flat % voc_size 305 | costs = cand_flat[ranks_flat] 306 | 307 | new_hyp_samples = [] 308 | new_hyp_scores = numpy.zeros(k-dead_k).astype('float32') 309 | new_hyp_states_char = [] 310 | new_hyp_states_word = [] 311 | 312 | for idx, [ti, wi] in enumerate(zip(trans_indices, word_indices)): 313 | new_hyp_samples.append(hyp_samples[ti]+[wi]) 314 | new_hyp_scores[idx] = copy.copy(costs[idx]) 315 | new_hyp_states_char.append(copy.copy(next_state_char[ti])) 316 | new_hyp_states_word.append(copy.copy(next_state_word[ti])) 317 | 318 | # check the finished samples 319 | new_live_k = 0 320 | hyp_samples = [] 321 | hyp_scores = [] 322 | hyp_states_char = [] 323 | hyp_states_word = [] 324 | 325 | for idx in xrange(len(new_hyp_samples)): 326 | if new_hyp_samples[idx][-1] == 0: 327 | sample.append(new_hyp_samples[idx]) 328 | sample_score.append(new_hyp_scores[idx]) 329 | dead_k += 1 330 | else: 331 | new_live_k += 1 332 | hyp_samples.append(new_hyp_samples[idx]) 333 | hyp_scores.append(new_hyp_scores[idx]) 334 | hyp_states_char.append(new_hyp_states_char[idx]) 335 | hyp_states_word.append(new_hyp_states_word[idx]) 336 | hyp_scores = numpy.array(hyp_scores) 337 | live_k = new_live_k 338 | 339 | if new_live_k < 1: 340 | break 341 | if dead_k >= k: 342 | break 343 | 344 | next_w = numpy.array([w[-1] for w in hyp_samples]) 345 | next_state_char = numpy.array(hyp_states_char) 346 | next_state_word = numpy.array(hyp_states_word) 347 | 348 | if not stochastic: 349 | # dump every remaining one 350 | if live_k > 0: 351 | for idx in xrange(live_k): 352 | sample.append(hyp_samples[idx]) 353 | sample_score.append(hyp_scores[idx]) 354 | 355 | return sample, sample_score 356 | -------------------------------------------------------------------------------- /bpe2char/char_base_multi_b2c.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Build a simple neural language model using GRU units 3 | ''' 4 | import theano 5 | from theano import tensor 6 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 7 | 8 | import cPickle 9 | import numpy 10 | import copy 11 | 12 | import os 13 | import warnings 14 | import sys 15 | import time 16 | 17 | from collections import OrderedDict 18 | from mixer import * 19 | 20 | def init_params(options): 21 | params = OrderedDict() 22 | 23 | print "source dictionary size: %d" % options['n_words_src'] 24 | # embedding 25 | params['Wemb'] = norm_weight(options['n_words_src'], options['dim_word_src']) 26 | params['Wemb_dec'] = norm_weight(options['n_words'], options['dim_word']) 27 | 28 | # encoder 29 | params = get_layer('gru')[0](options, params, 30 | prefix='encoder', 31 | nin=options['dim_word_src'], 32 | dim=options['enc_dim']) 33 | params = get_layer('gru')[0](options, params, 34 | prefix='encoderr', 35 | nin=options['dim_word_src'], 36 | dim=options['enc_dim']) 37 | ctxdim = 2 * options['enc_dim'] 38 | 39 | # init_state of decoder 40 | params = get_layer('ff')[0](options, params, 41 | prefix='ff_init_state_char', 42 | nin=ctxdim, 43 | nout=options['dec_dim']) 44 | params = get_layer('ff')[0](options, params, 45 | prefix='ff_init_state_word', 46 | nin=ctxdim, 
47 | nout=options['dec_dim']) 48 | 49 | print "target dictionary size: %d" % options['n_words'] 50 | # decoder 51 | params = get_layer('two_layer_gru_decoder')[0](options, params, 52 | prefix='decoder', 53 | nin=options['dim_word'], 54 | dim_char=options['dec_dim'], 55 | dim_word=options['dec_dim'], 56 | dimctx=ctxdim) 57 | 58 | # readout 59 | params = get_layer('fff')[0](options, params, prefix='ff_logit_rnn', 60 | nin1=options['dec_dim'], nin2=options['dec_dim'], 61 | nout=options['dim_word'], ortho=False) 62 | params = get_layer('ff')[0](options, params, prefix='ff_logit_prev', 63 | nin=options['dim_word'], 64 | nout=options['dim_word'], 65 | ortho=False) 66 | params = get_layer('ff')[0](options, params, prefix='ff_logit_ctx', 67 | nin=ctxdim, 68 | nout=options['dim_word'], 69 | ortho=False) 70 | params = get_layer('ff')[0](options, params, prefix='ff_logit', 71 | nin=options['dim_word'], 72 | nout=options['n_words']) 73 | 74 | return params 75 | 76 | 77 | def build_model(tparams, options): 78 | opt_ret = OrderedDict() 79 | 80 | trng = RandomStreams(numpy.random.RandomState(numpy.random.randint(1024)).randint(numpy.iinfo(numpy.int32).max)) 81 | use_noise = theano.shared(numpy.float32(0.)) 82 | 83 | # description string: #words x #samples 84 | x = tensor.matrix('x', dtype='int64') 85 | x_mask = tensor.matrix('x_mask', dtype='float32') 86 | y = tensor.matrix('y', dtype='int64') 87 | y_mask = tensor.matrix('y_mask', dtype='float32') 88 | x.tag.test_value = numpy.zeros((5, 63), dtype='int64') 89 | x_mask.tag.test_value = numpy.ones((5, 63), dtype='float32') 90 | y.tag.test_value = numpy.zeros((7, 63), dtype='int64') 91 | y_mask.tag.test_value = numpy.ones((7, 63), dtype='float32') 92 | 93 | xr = x[::-1] 94 | xr_mask = x_mask[::-1] 95 | 96 | n_samples = x.shape[1] 97 | n_timesteps = x.shape[0] 98 | n_timesteps_trg = y.shape[0] 99 | 100 | # word embedding for forward RNN (source) 101 | emb = tparams['Wemb'][x.flatten()] 102 | emb = emb.reshape([n_timesteps, n_samples, options['dim_word_src']]) 103 | 104 | # word embedding for backward RNN (source) 105 | embr = tparams['Wemb'][xr.flatten()] 106 | embr = embr.reshape([n_timesteps, n_samples, options['dim_word_src']]) 107 | 108 | # pass through gru layer, recurrence here 109 | proj = get_layer('gru')[1](tparams, emb, options, 110 | prefix='encoder', mask=x_mask) 111 | projr = get_layer('gru')[1](tparams, embr, options, 112 | prefix='encoderr', mask=xr_mask) 113 | 114 | # context 115 | ctx = concatenate([proj, projr[::-1]], axis=proj.ndim-1) 116 | 117 | # context mean 118 | ctx_mean = (ctx * x_mask[:, :, None]).sum(0) / x_mask.sum(0)[:, None] 119 | 120 | # initial decoder state 121 | init_state_char = get_layer('ff')[1](tparams, ctx_mean, options, 122 | prefix='ff_init_state_char', activ='tanh') 123 | init_state_word = get_layer('ff')[1](tparams, ctx_mean, options, 124 | prefix='ff_init_state_word', activ='tanh') 125 | 126 | # word embedding and shifting for targets 127 | yemb = tparams['Wemb_dec'][y.flatten()] 128 | yemb = yemb.reshape([n_timesteps_trg, n_samples, options['dim_word']]) 129 | yemb_shited = tensor.zeros_like(yemb) 130 | yemb_shited = tensor.set_subtensor(yemb_shited[1:], yemb[:-1]) 131 | yemb = yemb_shited 132 | 133 | char_h, word_h, ctxs, alphas = \ 134 | get_layer('two_layer_gru_decoder')[1](tparams, yemb, options, 135 | prefix='decoder', 136 | mask=y_mask, 137 | context=ctx, 138 | context_mask=x_mask, 139 | one_step=False, 140 | init_state_char=init_state_char, 141 | init_state_word=init_state_word) 142 | 143 | 
opt_ret['dec_alphas'] = alphas 144 | 145 | # compute word probabilities 146 | logit_rnn = get_layer('fff')[1](tparams, char_h, word_h, options, 147 | prefix='ff_logit_rnn', activ='linear') 148 | logit_prev = get_layer('ff')[1](tparams, yemb, options, 149 | prefix='ff_logit_prev', activ='linear') 150 | logit_ctx = get_layer('ff')[1](tparams, ctxs, options, 151 | prefix='ff_logit_ctx', activ='linear') 152 | logit = tensor.tanh(logit_rnn + logit_prev + logit_ctx) 153 | 154 | if options['use_dropout']: 155 | print 'Using dropout' 156 | logit = dropout_layer(logit, use_noise, trng) 157 | 158 | logit = get_layer('ff')[1](tparams, logit, options, 159 | prefix='ff_logit', activ='linear') 160 | logit_shp = logit.shape 161 | probs = tensor.nnet.softmax(logit.reshape([logit_shp[0]*logit_shp[1], logit_shp[2]])) 162 | 163 | # cost 164 | y_flat = y.flatten() 165 | y_flat_idx = tensor.arange(y_flat.shape[0]) * options['n_words'] + y_flat 166 | cost = -tensor.log(probs.flatten()[y_flat_idx]) 167 | cost = cost.reshape([y.shape[0], y.shape[1]]) 168 | cost = (cost * y_mask).sum(0) 169 | 170 | return trng, use_noise, x, x_mask, y, y_mask, opt_ret, cost 171 | 172 | def build_sampler(tparams, options, trng, use_noise): 173 | x = tensor.matrix('x', dtype='int64') 174 | xr = x[::-1] 175 | 176 | n_timesteps = x.shape[0] 177 | n_samples = x.shape[1] 178 | 179 | emb = tparams['Wemb'][x.flatten()] 180 | emb = emb.reshape([n_timesteps, n_samples, options['dim_word_src']]) 181 | embr = tparams['Wemb'][xr.flatten()] 182 | embr = embr.reshape([n_timesteps, n_samples, options['dim_word_src']]) 183 | 184 | proj = get_layer('gru')[1](tparams, emb, options, prefix='encoder') 185 | projr = get_layer('gru')[1](tparams, embr, options, prefix='encoderr') 186 | 187 | ctx = concatenate([proj, projr[::-1]], axis=proj.ndim-1) 188 | ctx_mean = ctx.mean(0) 189 | 190 | init_state_char = get_layer('ff')[1](tparams, ctx_mean, options, 191 | prefix='ff_init_state_char', activ='tanh') 192 | init_state_word = get_layer('ff')[1](tparams, ctx_mean, options, 193 | prefix='ff_init_state_word', activ='tanh') 194 | 195 | print 'Building f_init...', 196 | outs = [init_state_char, init_state_word, ctx] 197 | f_init = theano.function([x], outs, name='f_init', profile=profile) 198 | print 'Done' 199 | 200 | y = tensor.vector('y_sampler', dtype='int64') 201 | init_state_char = tensor.matrix('init_state_char', dtype='float32') 202 | init_state_word = tensor.matrix('init_state_word', dtype='float32') 203 | 204 | # if it's the first word, emb should be all zero and it is indicated by -1 205 | yemb = tensor.switch(y[:, None] < 0, 206 | tensor.alloc(0., 1, tparams['Wemb_dec'].shape[1]), 207 | tparams['Wemb_dec'][y]) 208 | 209 | next_state_char, next_state_word, next_ctx, next_alpha = \ 210 | get_layer('two_layer_gru_decoder')[1](tparams, yemb, options, 211 | prefix='decoder', 212 | context=ctx, 213 | mask=None, 214 | one_step=True, 215 | init_state_char=init_state_char, 216 | init_state_word=init_state_word) 217 | 218 | logit_rnn = get_layer('fff')[1](tparams, 219 | next_state_char, 220 | next_state_word, 221 | options, 222 | prefix='ff_logit_rnn', 223 | activ='linear') 224 | logit_prev = get_layer('ff')[1](tparams, 225 | yemb, 226 | options, 227 | prefix='ff_logit_prev', 228 | activ='linear') 229 | logit_ctx = get_layer('ff')[1](tparams, 230 | next_ctx, 231 | options, 232 | prefix='ff_logit_ctx', 233 | activ='linear') 234 | logit = tensor.tanh(logit_rnn + logit_prev + logit_ctx) 235 | 236 | if options['use_dropout']: 237 | print 'Sampling for dropoutted 
model' 238 | logit = dropout_layer(logit, use_noise, trng) 239 | 240 | logit = get_layer('ff')[1](tparams, logit, options, 241 | prefix='ff_logit', 242 | activ='linear') 243 | next_probs = tensor.nnet.softmax(logit) 244 | next_sample = trng.multinomial(pvals=next_probs).argmax(1) 245 | 246 | # next word probability 247 | print 'Building f_next...', 248 | inps = [y, ctx, init_state_char, init_state_word] 249 | outs = [next_probs, next_sample, next_state_char, next_state_word] 250 | f_next = theano.function(inps, outs, name='f_next', profile=profile) 251 | print 'Done' 252 | 253 | return f_init, f_next 254 | 255 | 256 | def gen_sample(tparams, f_init, f_next, x, options, trng=None, 257 | k=1, maxlen=500, stochastic=True, argmax=False): 258 | 259 | # k is the beam size we have 260 | if k > 1: 261 | assert not stochastic, \ 262 | 'Beam search does not support stochastic sampling' 263 | 264 | sample = [] 265 | sample_score = [] 266 | if stochastic: 267 | sample_score = 0 268 | 269 | live_k = 1 270 | dead_k = 0 271 | 272 | hyp_samples = [[]] * live_k 273 | hyp_scores = numpy.zeros(live_k).astype('float32') 274 | hyp_states = [] 275 | 276 | # get initial state of decoder rnn and encoder context 277 | ret = f_init(x) 278 | next_state_char, next_state_word, ctx0 = ret[0], ret[1], ret[2] 279 | next_w = -1 * numpy.ones((1,)).astype('int64') # bos indicator 280 | 281 | for ii in xrange(maxlen): 282 | ctx = numpy.tile(ctx0, [live_k, 1]) 283 | inps = [next_w, ctx, next_state_char, next_state_word] 284 | ret = f_next(*inps) 285 | next_p, next_w, next_state_char, next_state_word = ret[0], ret[1], ret[2], ret[3] 286 | if stochastic: 287 | if argmax: 288 | nw = next_p[0].argmax() 289 | else: 290 | nw = next_w[0] 291 | sample.append(nw) 292 | sample_score += next_p[0, nw] 293 | if nw == 0: 294 | break 295 | else: 296 | cand_scores = hyp_scores[:, None] - numpy.log(next_p) 297 | cand_flat = cand_scores.flatten() 298 | ranks_flat = cand_flat.argsort()[:(k-dead_k)] 299 | 300 | voc_size = next_p.shape[1] 301 | trans_indices = ranks_flat / voc_size 302 | word_indices = ranks_flat % voc_size 303 | costs = cand_flat[ranks_flat] 304 | 305 | new_hyp_samples = [] 306 | new_hyp_scores = numpy.zeros(k-dead_k).astype('float32') 307 | new_hyp_states_char = [] 308 | new_hyp_states_word = [] 309 | 310 | for idx, [ti, wi] in enumerate(zip(trans_indices, word_indices)): 311 | new_hyp_samples.append(hyp_samples[ti]+[wi]) 312 | new_hyp_scores[idx] = copy.copy(costs[idx]) 313 | new_hyp_states_char.append(copy.copy(next_state_char[ti])) 314 | new_hyp_states_word.append(copy.copy(next_state_word[ti])) 315 | 316 | # check the finished samples 317 | new_live_k = 0 318 | hyp_samples = [] 319 | hyp_scores = [] 320 | hyp_states_char = [] 321 | hyp_states_word = [] 322 | 323 | for idx in xrange(len(new_hyp_samples)): 324 | if new_hyp_samples[idx][-1] == 0: 325 | sample.append(new_hyp_samples[idx]) 326 | sample_score.append(new_hyp_scores[idx]) 327 | dead_k += 1 328 | else: 329 | new_live_k += 1 330 | hyp_samples.append(new_hyp_samples[idx]) 331 | hyp_scores.append(new_hyp_scores[idx]) 332 | hyp_states_char.append(new_hyp_states_char[idx]) 333 | hyp_states_word.append(new_hyp_states_word[idx]) 334 | hyp_scores = numpy.array(hyp_scores) 335 | live_k = new_live_k 336 | 337 | if new_live_k < 1: 338 | break 339 | if dead_k >= k: 340 | break 341 | 342 | next_w = numpy.array([w[-1] for w in hyp_samples]) 343 | next_state_char = numpy.array(hyp_states_char) 344 | next_state_word = numpy.array(hyp_states_word) 345 | 346 | if not stochastic: 347 
| # dump every remaining one 348 | if live_k > 0: 349 | for idx in xrange(live_k): 350 | sample.append(hyp_samples[idx]) 351 | sample_score.append(hyp_scores[idx]) 352 | 353 | return sample, sample_score 354 | -------------------------------------------------------------------------------- /bpe2char/data_iterator.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import numpy 3 | import os 4 | import random 5 | 6 | import cPickle 7 | import gzip 8 | import codecs 9 | 10 | from tempfile import mkstemp 11 | 12 | 13 | def fopen(filename, mode='r'): 14 | if filename.endswith('.gz'): 15 | return gzip.open(filename, mode) 16 | return open(filename, mode) 17 | 18 | 19 | class TextIterator: 20 | """Simple Bitext iterator.""" 21 | def __init__(self, 22 | source, source_dict, 23 | target=None, target_dict=None, 24 | source_word_level=0, 25 | target_word_level=0, 26 | batch_size=128, 27 | job_id=0, 28 | sort_size=20, 29 | n_words_source=-1, 30 | n_words_target=-1, 31 | shuffle_per_epoch=False): 32 | self.source_file = source 33 | self.target_file = target 34 | self.source = fopen(source, 'r') 35 | with open(source_dict, 'rb') as f: 36 | self.source_dict = cPickle.load(f) 37 | if target is not None: 38 | self.target = fopen(target, 'r') 39 | if target_dict is not None: 40 | with open(target_dict, 'rb') as f: 41 | self.target_dict = cPickle.load(f) 42 | else: 43 | self.target = None 44 | 45 | self.source_word_level = source_word_level 46 | self.target_word_level = target_word_level 47 | self.batch_size = batch_size 48 | 49 | self.n_words_source = n_words_source 50 | self.n_words_target = n_words_target 51 | self.shuffle_per_epoch = shuffle_per_epoch 52 | 53 | self.source_buffer = [] 54 | self.target_buffer = [] 55 | self.k = batch_size * sort_size 56 | 57 | self.end_of_data = False 58 | self.job_id = job_id 59 | 60 | def __iter__(self): 61 | return self 62 | 63 | def reset(self): 64 | if self.shuffle_per_epoch: 65 | # close current files 66 | self.source.close() 67 | if self.target is None: 68 | self.shuffle([self.source_file]) 69 | self.source = fopen(self.source_file + '.reshuf_%d' % self.job_id, 'r') 70 | else: 71 | self.target.close() 72 | # shuffle *original* source files, 73 | self.shuffle([self.source_file, self.target_file]) 74 | # open newly 're-shuffled' file as input 75 | self.source = fopen(self.source_file + '.reshuf_%d' % self.job_id, 'r') 76 | self.target = fopen(self.target_file + '.reshuf_%d' % self.job_id, 'r') 77 | else: 78 | self.source.seek(0) 79 | if self.target is not None: 80 | self.target.seek(0) 81 | 82 | @staticmethod 83 | def shuffle(files): 84 | tf_os, tpath = mkstemp() 85 | tf = open(tpath, 'w') 86 | fds = [open(ff) for ff in files] 87 | for l in fds[0]: 88 | lines = [l.strip()] + [ff.readline().strip() for ff in fds[1:]] 89 | print >>tf, "|||".join(lines) 90 | [ff.close() for ff in fds] 91 | tf.close() 92 | tf = open(tpath, 'r') 93 | lines = tf.readlines() 94 | random.shuffle(lines) 95 | fds = [open(ff+'.reshuf','w') for ff in files] 96 | for l in lines: 97 | s = l.strip().split('|||') 98 | for ii, fd in enumerate(fds): 99 | print >>fd, s[ii] 100 | [ff.close() for ff in fds] 101 | os.remove(tpath) 102 | return 103 | 104 | def next(self): 105 | if self.end_of_data: 106 | self.end_of_data = False 107 | self.reset() 108 | raise StopIteration 109 | 110 | source = [] 111 | target = [] 112 | 113 | # fill buffer, if it's empty 114 | if self.target is not None: 115 | assert len(self.source_buffer) == len(self.target_buffer), 
'Buffer size mismatch!' 116 | 117 | if len(self.source_buffer) == 0: 118 | for k_ in xrange(self.k): 119 | ss = self.source.readline() 120 | 121 | if ss == "": 122 | break 123 | 124 | if self.source_word_level: 125 | ss = ss.strip().split() 126 | else: 127 | ss = ss.strip() 128 | ss = list(ss.decode('utf8')) 129 | 130 | self.source_buffer.append(ss) 131 | 132 | if self.target is not None: 133 | tt = self.target.readline() 134 | 135 | if tt == "": 136 | break 137 | 138 | if self.target_word_level: 139 | tt = tt.strip().split() 140 | else: 141 | tt = tt.strip() 142 | tt = list(tt.decode('utf8')) 143 | 144 | self.target_buffer.append(tt) 145 | 146 | if self.target is not None: 147 | # sort by target buffer 148 | tlen = numpy.array([len(t) for t in self.target_buffer]) 149 | tidx = tlen.argsort() 150 | _sbuf = [self.source_buffer[i] for i in tidx] 151 | _tbuf = [self.target_buffer[i] for i in tidx] 152 | self.target_buffer = _tbuf 153 | else: 154 | slen = numpy.array([len(s) for s in self.source_buffer]) 155 | sidx = slen.argsort() 156 | _sbuf = [self.source_buffer[i] for i in sidx] 157 | 158 | self.source_buffer = _sbuf 159 | 160 | if self.target is not None: 161 | if len(self.source_buffer) == 0 or len(self.target_buffer) == 0: 162 | self.end_of_data = False 163 | self.reset() 164 | raise StopIteration 165 | elif len(self.source_buffer) == 0: 166 | self.end_of_data = False 167 | self.reset() 168 | raise StopIteration 169 | 170 | try: 171 | # actual work here 172 | while True: 173 | # read from source file and map to word index 174 | try: 175 | ss_ = self.source_buffer.pop() 176 | except IndexError: 177 | break 178 | ss = [self.source_dict[w] if w in self.source_dict else 1 for w in ss_] 179 | if self.n_words_source > 0: 180 | ss = [w if w < self.n_words_source else 1 for w in ss] 181 | source.append(ss) 182 | if self.target is not None: 183 | # read from target file and map to word index 184 | tt_ = self.target_buffer.pop() 185 | tt = [self.target_dict[w] if w in self.target_dict else 1 for w in tt_] 186 | if self.n_words_target > 0: 187 | tt = [w if w < self.n_words_target else 1 for w in tt] 188 | target.append(tt) 189 | 190 | if len(source) >= self.batch_size: 191 | break 192 | except IOError: 193 | self.end_of_data = True 194 | 195 | if self.target is not None: 196 | if len(source) <= 0 or len(target) <= 0: 197 | self.end_of_data = False 198 | self.reset() 199 | raise StopIteration 200 | return source, target 201 | else: 202 | if len(source) <= 0: 203 | self.end_of_data = False 204 | self.reset() 205 | raise StopIteration 206 | return source 207 | -------------------------------------------------------------------------------- /bpe2char/many_data_iterator.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import numpy 3 | import os 4 | import random 5 | 6 | import cPickle 7 | import gzip 8 | import codecs 9 | 10 | from tempfile import mkstemp 11 | 12 | random.seed(1029381209) 13 | 14 | def fopen(filename, mode='r'): 15 | if filename.endswith('.gz'): 16 | return gzip.open(filename, mode) 17 | return open(filename, mode) 18 | 19 | class MultiTextIterator: 20 | """Simple Bitext iterator.""" 21 | def __init__(self, 22 | source, source_dict, 23 | target=None, target_dict=None, 24 | source_word_level=0, 25 | target_word_level=0, 26 | batch_size=[128,1,2,3], 27 | job_id=0, 28 | sort_size=20, 29 | n_words_source=302, 30 | n_words_target=302, 31 | shuffle_per_epoch=False): 32 | 33 | self.source_files = source 34 | self.target_files = target 35 
| 36 | self.sources = [fopen(s, 'r') for s in source] 37 | with open(source_dict, 'rb') as f: 38 | self.source_dict = cPickle.load(f) 39 | # one source dictionary 40 | 41 | self.targets = [fopen(t, 'r') for t in target] 42 | with open(target_dict, 'rb') as f: 43 | self.target_dict = cPickle.load(f) 44 | # one target dictionary 45 | 46 | self.source_word_level = source_word_level 47 | self.target_word_level = target_word_level 48 | self.batch_sizes = batch_size 49 | # list 50 | 51 | self.n_words_source = n_words_source 52 | self.n_words_target = n_words_target 53 | self.shuffle_per_epoch = shuffle_per_epoch 54 | 55 | self.source_buffers = [[],[],[],[]] 56 | self.target_buffers = [[],[],[],[]] 57 | self.k = [bs * sort_size for bs in batch_size] 58 | # at once, fetch 20 items 59 | # we're good for 20 updates 60 | 61 | self.end_of_data = False 62 | self.job_id = job_id 63 | 64 | def __iter__(self): 65 | return self 66 | 67 | def reset(self): 68 | if self.shuffle_per_epoch: 69 | raise Exception("hi") 70 | # close current files 71 | for s in self.sources: 72 | s.close() 73 | 74 | if self.targets is None: 75 | self.shuffle([self.source_file]) 76 | self.source = fopen(self.source_file + '.reshuf_%d' % self.job_id, 'r') 77 | else: 78 | for t in self.targets: 79 | t.close() 80 | 81 | # shuffle *original* source files, 82 | self.shuffle([self.source_file, self.target_file]) 83 | # open newly 're-shuffled' file as input 84 | self.source = fopen(self.source_file + '.reshuf_%d' % self.job_id, 'r') 85 | self.target = fopen(self.target_file + '.reshuf_%d' % self.job_id, 'r') 86 | else: 87 | for idx in xrange(4): 88 | self.sources[idx].seek(0) 89 | self.targets[idx].seek(0) 90 | 91 | @staticmethod 92 | def shuffle(files): 93 | tf_os, tpath = mkstemp() 94 | tf = open(tpath, 'w') 95 | fds = [open(ff) for ff in files] 96 | for l in fds[0]: 97 | lines = [l.strip()] + [ff.readline().strip() for ff in fds[1:]] 98 | print >>tf, "|||".join(lines) 99 | [ff.close() for ff in fds] 100 | tf.close() 101 | tf = open(tpath, 'r') 102 | lines = tf.readlines() 103 | random.shuffle(lines) 104 | fds = [open(ff+'.reshuf','w') for ff in files] 105 | for l in lines: 106 | s = l.strip().split('|||') 107 | for ii, fd in enumerate(fds): 108 | print >>fd, s[ii] 109 | [ff.close() for ff in fds] 110 | os.remove(tpath) 111 | return 112 | 113 | def next(self): 114 | # if end_of_data reaches, stop for loop 115 | if self.end_of_data: 116 | self.end_of_data = False 117 | self.reset() 118 | raise StopIteration 119 | 120 | sources = [[],[],[],[]] 121 | targets = [[],[],[],[]] 122 | # NOTE : this is the data to be used for "this" round of updates 123 | 124 | # fill buffer, if it's empty 125 | for idx in xrange(4): 126 | assert len(self.source_buffers[idx]) == len(self.target_buffers[idx]), 'Buffer size mismatch!' 127 | 128 | for idx in xrange(4): 129 | # NOTE : in buffer: don't put the whole dataset in... only for 'k' many updates 130 | # after 'k' updates, self.source_buffers[idx] will be empty, in which case we will put new things in 131 | 132 | #if len(self.source_buffers[idx]) == 0: 133 | if len(self.source_buffers[idx]) < self.batch_sizes[idx]: 134 | # NOTE : change this to : if less than one out... 
135 | for k_ in xrange(self.k[idx]): 136 | 137 | ss = self.sources[idx].readline() 138 | # NOTE: self.sources is where we keep the RAW data 139 | if ss == "": 140 | break 141 | if self.source_word_level: 142 | ss = ss.strip().split() 143 | else: 144 | ss = ss.strip() 145 | ss = list(ss.decode('utf8')) 146 | self.source_buffers[idx].append(ss) 147 | 148 | tt = self.targets[idx].readline() 149 | if tt == "": 150 | break 151 | if self.target_word_level: 152 | tt = tt.strip().split() 153 | else: 154 | tt = tt.strip() 155 | tt = list(tt.decode('utf8')) 156 | self.target_buffers[idx].append(tt) 157 | 158 | tlen = numpy.array([len(t) for t in self.target_buffers[idx]]) 159 | tidx = tlen.argsort() 160 | _sbuf = [self.source_buffers[idx][i] for i in tidx] 161 | _tbuf = [self.target_buffers[idx][i] for i in tidx] 162 | self.target_buffers[idx] = _tbuf 163 | self.source_buffers[idx] = _sbuf 164 | 165 | stop = False 166 | for idx in xrange(4): 167 | if len(self.source_buffers[idx]) < self.batch_sizes[idx]: 168 | stop = True 169 | 170 | if stop: 171 | self.end_of_data = False 172 | self.reset() 173 | raise StopIteration 174 | 175 | try: 176 | # actual work here 177 | for idx in xrange(4): 178 | while True: 179 | # read from source file and map to word index 180 | try: 181 | ss_ = self.source_buffers[idx].pop() 182 | except IndexError: 183 | # NOTE : just because source_buffers is empty, doesn't mean file scanned 184 | # we do add partial batches. We proceed until len(source_buffers) = 0 185 | break 186 | 187 | ss = [self.source_dict[w] if w in self.source_dict else 1 for w in ss_] 188 | if self.n_words_source > 0: 189 | ss = [w if w < self.n_words_source else 1 for w in ss] 190 | sources[idx].append(ss) 191 | 192 | tt_ = self.target_buffers[idx].pop() 193 | tt = [self.target_dict[w] if w in self.target_dict else 1 for w in tt_] 194 | if self.n_words_target > 0: 195 | tt = [w if w < self.n_words_target else 1 for w in tt] 196 | targets[idx].append(tt) 197 | 198 | if len(sources[idx]) >= self.batch_sizes[idx]: 199 | break 200 | 201 | except IOError: 202 | self.end_of_data = True 203 | 204 | source = sources[0] + sources[1] + sources[2] + sources[3] 205 | target = targets[0] + targets[1] + targets[2] + targets[3] 206 | 207 | # NOTE : just add anything, if still nothing, reset 208 | min_batch_size = numpy.sum(self.batch_sizes) 209 | # NOTE : this CANT BE ZERO!!!! 
bc buffer not multiple of things 210 | if len(source) < min_batch_size or len(target) < min_batch_size: 211 | self.end_of_data = False 212 | self.reset() 213 | raise StopIteration 214 | 215 | return source, target 216 | -------------------------------------------------------------------------------- /bpe2char/print_batch.py: -------------------------------------------------------------------------------- 1 | import pprint 2 | import numpy as np 3 | 4 | def pbatch(source, dic): 5 | ss = np.transpose(source) 6 | for line in ss[:10]: 7 | for word in line: 8 | a = dic[word] 9 | b = a 10 | if a == "eos": 11 | b = "_" 12 | elif a == "UNK": 13 | b = "|" 14 | print b, 15 | print " " 16 | print "" 17 | -------------------------------------------------------------------------------- /bpe2char/train_bi_bpe2char.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import sys 4 | from collections import OrderedDict 5 | from nmt import train 6 | from wmt_path import wmts 7 | from char_base import * 8 | 9 | layers = {'ff': ('param_init_fflayer', 'fflayer'), 10 | 'fff': ('param_init_ffflayer', 'ffflayer'), 11 | 'gru': ('param_init_gru', 'gru_layer'), 12 | 'two_layer_gru_decoder': ('param_init_two_layer_gru_decoder', 13 | 'two_layer_gru_decoder'), 14 | } 15 | 16 | def main(job_id, params): 17 | save_file_name = args.model_name 18 | source_dataset = args.data_path + wmts[args.translate]['train'][1][0] 19 | target_dataset = args.data_path + wmts[args.translate]['train'][0][1] 20 | valid_source_dataset = args.data_path + wmts[args.translate]['dev'][1][0] 21 | valid_target_dataset = args.data_path + wmts[args.translate]['dev'][0][1] 22 | source_dictionary = args.data_path + wmts[args.translate]['dic'][1][0] 23 | target_dictionary = args.data_path + wmts[args.translate]['dic'][0][1] 24 | 25 | print args.save_path, save_file_name 26 | print source_dataset 27 | print target_dataset 28 | print valid_source_dataset 29 | print valid_target_dataset 30 | print source_dictionary 31 | print target_dictionary 32 | print params, params.save_path, save_file_name 33 | 34 | validerr = train( 35 | max_epochs=args.max_epochs, 36 | patience=args.patience, 37 | 38 | dim_word_src=args.dim_word_src, 39 | dim_word=args.dim_word, 40 | 41 | save_path=args.save_path, 42 | save_file_name=save_file_name, 43 | re_load=args.re_load, 44 | re_load_old_setting=args.re_load_old_setting, 45 | 46 | enc_dim=args.enc_dim, 47 | dec_dim=args.dec_dim, 48 | 49 | n_words_src=args.n_words_src, 50 | n_words=args.n_words, 51 | decay_c=args.decay_c, 52 | lrate=args.learning_rate, 53 | optimizer=args.optimizer, 54 | maxlen=args.maxlen, 55 | maxlen_trg=args.maxlen_trg, 56 | maxlen_sample=args.maxlen_sample, 57 | batch_size=args.batch_size, 58 | valid_batch_size=args.valid_batch_size, 59 | sort_size=args.sort_size, 60 | validFreq=args.validFreq, 61 | dispFreq=args.dispFreq, 62 | saveFreq=args.saveFreq, 63 | sampleFreq=args.sampleFreq, 64 | pbatchFreq=args.pbatchFreq, 65 | clip_c=args.clip_c, 66 | 67 | datasets=[source_dataset, target_dataset], 68 | valid_datasets=[valid_source_dataset, valid_target_dataset], 69 | dictionaries=[source_dictionary, target_dictionary], 70 | 71 | use_dropout=args.use_dropout, 72 | source_word_level=args.source_word_level, 73 | target_word_level=args.target_word_level, 74 | save_every_saveFreq=1, 75 | use_bpe=1, 76 | gru=args.gru, 77 | 78 | quit_immediately=args.quit_immediately, 79 | init_params=init_params, 80 | build_model=build_model, 81 | 
build_sampler=build_sampler, 82 | gen_sample=gen_sample, 83 | ) 84 | return validerr 85 | 86 | if __name__ == '__main__': 87 | 88 | parser = argparse.ArgumentParser() 89 | parser.add_argument('-model_name', type=str, help="", default="bi-bpe2char") 90 | parser.add_argument('-translate', type=str, default="de_en", help="de_en / cs_en / fi_en / ru_en") 91 | 92 | parser.add_argument('-enc_dim', type=int, default=512, help="") 93 | parser.add_argument('-dec_dim', type=int, default=1024, help="") 94 | 95 | parser.add_argument('-dim_word', type=int, default=512, help="") 96 | parser.add_argument('-dim_word_src', type=int, default=512, help="") 97 | 98 | parser.add_argument('-batch_size', type=int, default=128, help="") 99 | parser.add_argument('-valid_batch_size', type=int, default=128, help="") 100 | 101 | parser.add_argument('-maxlen', type=int, default=50, help="") 102 | parser.add_argument('-maxlen_trg', type=int, default=500, help="") 103 | parser.add_argument('-maxlen_sample', type=int, default=500, help="") 104 | 105 | parser.add_argument('-re_load', action="store_true", default=False) 106 | parser.add_argument('-re_load_old_setting', action="store_true", default=False) 107 | parser.add_argument('-quit_immediately', action="store_true", default=False) 108 | 109 | parser.add_argument('-use_dropout', action="store_true", default=False) 110 | 111 | parser.add_argument('-max_epochs', type=int, default=1000000000000, help="") 112 | parser.add_argument('-patience', type=int, default=-1, help="") 113 | parser.add_argument('-learning_rate', type=float, default=0.0001, help="") 114 | 115 | parser.add_argument('-n_words_src', type=int, default=302, help="298 for FI") 116 | parser.add_argument('-n_words', type=int, default=302, help="292 for FI") 117 | 118 | parser.add_argument('-optimizer', type=str, default="adam", help="") 119 | parser.add_argument('-decay_c', type=int, default=0, help="") 120 | parser.add_argument('-clip_c', type=int, default=1, help="") 121 | 122 | parser.add_argument('-gru', type=str, default="gru", help="gru/lngru") 123 | 124 | parser.add_argument('-saveFreq', type=int, default=5000, help="") 125 | parser.add_argument('-sampleFreq', type=int, default=5000, help="") 126 | parser.add_argument('-dispFreq', type=int, default=1000, help="") 127 | parser.add_argument('-validFreq', type=int, default=5000, help="") 128 | parser.add_argument('-pbatchFreq', type=int, default=5000, help="") 129 | parser.add_argument('-sort_size', type=int, default=20, help="") 130 | 131 | parser.add_argument('-source_word_level', type=int, default=1, help="") 132 | parser.add_argument('-target_word_level', type=int, default=0, help="") 133 | 134 | args = parser.parse_args() 135 | 136 | n_words_dic = {'de_en': [24254, 302], 'cs_en': [21816, 302], 'fi_en':[20783, 292], 'ru_en':[22106, 302]} 137 | 138 | args.n_words_src = n_words_dic[args.translate][0] 139 | args.n_words= n_words_dic[args.translate][1] 140 | 141 | args.save_path = "/misc/kcgscratch1/ChoGroup/jasonlee/dl4mt-c2c/models/" # change accordingly 142 | args.data_path = "/misc/kcgscratch1/ChoGroup/jasonlee/temp_data/wmt15/" # change accordingly 143 | args.save_path = args.save_path + args.translate + "/" 144 | 145 | main(0, args) 146 | -------------------------------------------------------------------------------- /bpe2char/train_multi_bpe2char.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | import string 5 | import math 6 | import numpy 7 | from 
char_base_multi_b2c import * 8 | from nmt_many import train 9 | 10 | from collections import OrderedDict 11 | 12 | def main(job_id, params, args): 13 | print args 14 | save_file_name = args.model_name 15 | source_dataset = [args.data_path + path + tr for path, tr in zip(params['train_data_path'], params['source_dataset'])] 16 | target_dataset = [args.data_path + path + tr for path, tr in zip(params['train_data_path'], params['target_dataset'])] 17 | 18 | valid_source_dataset = [args.data_path + path + tr for path, tr in zip(params['dev_data_path'], params['valid_source_dataset'])] 19 | valid_target_dataset = [args.data_path + path + tr for path, tr in zip(params['dev_data_path'], params['valid_target_dataset'])] 20 | 21 | source_dictionary = args.dic_path + args.source_dictionary 22 | target_dictionary = args.dic_path + args.target_dictionary 23 | 24 | args.save_path = args.save_path + args.translate + "/" 25 | 26 | print params, args.save_path, save_file_name 27 | validerr = train( 28 | max_epochs=int(params['max_epochs']), 29 | patience=int(params['patience']), 30 | 31 | dim_word=args.dim_word, 32 | dim_word_src=args.dim_word_src, 33 | 34 | save_path=args.save_path, 35 | save_file_name=save_file_name, 36 | re_load=args.re_load, 37 | re_load_old_setting=args.re_load_old_setting, 38 | 39 | enc_dim=args.enc_dim, 40 | dec_dim=args.dec_dim, 41 | 42 | n_words=args.n_words, 43 | n_words_src=args.n_words_src, 44 | decay_c=float(params['decay_c']), 45 | lrate=float(params['learning_rate']), 46 | optimizer=params['optimizer'], 47 | maxlen=args.maxlen, 48 | maxlen_trg=args.maxlen_trg, 49 | maxlen_sample=args.maxlen_sample, 50 | batch_size=args.train_batch_size, 51 | valid_batch_size=args.valid_batch_size, 52 | sort_size=args.sort_size, 53 | validFreq=args.validFreq, 54 | dispFreq=args.dispFreq, 55 | saveFreq=args.saveFreq, 56 | sampleFreq=args.sampleFreq, 57 | pbatchFreq=args.pbatchFreq, 58 | clip_c=int(params['clip_c']), 59 | 60 | datasets=[source_dataset, target_dataset], 61 | valid_datasets=[[s,t] for s,t in zip(valid_source_dataset, valid_target_dataset)], 62 | dictionaries=[source_dictionary, target_dictionary], 63 | 64 | use_dropout=int(params['use_dropout']), 65 | source_word_level=int(params['source_word_level']), 66 | target_word_level=int(params['target_word_level']), 67 | save_every_saveFreq=1, 68 | use_bpe=0, 69 | init_params=init_params, 70 | build_model=build_model, 71 | build_sampler=build_sampler, 72 | gen_sample=gen_sample, 73 | ) 74 | return validerr 75 | 76 | if __name__ == '__main__': 77 | 78 | import sys, time 79 | 80 | parser = argparse.ArgumentParser() 81 | parser.add_argument('-model_name', type=str, help="", default="multi-bpe2char") 82 | parser.add_argument('-translate', type=str, default="many_en") 83 | 84 | parser.add_argument('-enc_dim', type=int, default=512, help="") 85 | parser.add_argument('-dec_dim', type=int, default=1024, help="") 86 | 87 | parser.add_argument('-dim_word', type=int, default=512, help="") 88 | parser.add_argument('-dim_word_src', type=int, default=512, help="") 89 | 90 | parser.add_argument('-n_words', type=int, default=402, help="") 91 | parser.add_argument('-n_words_src', type=int, default=54541, help="") 92 | 93 | parser.add_argument('-source_dictionary', type=str, default="bpe-source-for-dic.word.pkl", help="") 94 | parser.add_argument('-target_dictionary', type=str, default="target.402.pkl", help="") 95 | 96 | parser.add_argument('-saveFreq', type=int, default=5000, help="") 97 | parser.add_argument('-sampleFreq', type=int, default=5000, 
help="") 98 | parser.add_argument('-dispFreq', type=int, default=1000, help="") 99 | parser.add_argument('-validFreq', type=int, default=5000, help="") 100 | parser.add_argument('-pbatchFreq', type=int, default=-1, help="") 101 | parser.add_argument('-sort_size', type=int, default=20, help="") 102 | 103 | parser.add_argument('-maxlen', type=int, default=50, help="") 104 | parser.add_argument('-maxlen_trg', type=int, default=500, help="") 105 | parser.add_argument('-maxlen_sample', type=int, default=500, help="") 106 | 107 | parser.add_argument('-train_batch_size', type=str, default="4535523/12122376/1926115/2326893", help="") 108 | parser.add_argument('-valid_batch_size', type=int, default=60, help="") 109 | parser.add_argument('-batch_size', type=int, default=60, help="") 110 | 111 | parser.add_argument('-re_load', action="store_true", default=False) 112 | parser.add_argument('-re_load_old_setting', action="store_true", default=False) 113 | 114 | args = parser.parse_args() 115 | 116 | args.train_batch_size = [ int(x) for x in args.train_batch_size.split("/") ] 117 | 118 | train_batch_sum = numpy.sum(args.train_batch_size) 119 | 120 | args.train_batch_size = [ int(numpy.ceil(args.batch_size * x / float(train_batch_sum))) for x in args.train_batch_size ] 121 | args.train_batch_size = [ 14, 37, 6, 7 ] 122 | 123 | args.save_path = "/misc/kcgscratch1/ChoGroup/jasonlee/dl4mt-c2c/models/" # change accordingly 124 | args.data_path = "/misc/kcgscratch1/ChoGroup/jasonlee/temp_data/multi-wmt15/" # change accordingly 125 | args.dic_path = "/misc/kcgscratch1/ChoGroup/jasonlee/temp_data/multi-wmt15/dic/" # change accordingly 126 | 127 | config_file_name = '/misc/kcgscratch1/ChoGroup/jasonlee/dl4mt-c2c/bpe2char/wmt15_manyen_bpe2char_adam.txt' # change accordingly 128 | 129 | f = open(config_file_name, 'r') 130 | lines = f.readlines() 131 | params = OrderedDict() 132 | 133 | for line in lines: 134 | line = line.split('\n')[0] 135 | param_list = line.split(' ') 136 | 137 | if len(param_list) < 2: 138 | continue 139 | elif len(param_list) == 2: 140 | param_name = param_list[0] 141 | param_value = param_list[1] 142 | params[param_name] = param_value 143 | else: 144 | param_name = param_list[0] 145 | param_value = param_list[1:] 146 | params[param_name] = param_value 147 | 148 | main(0, params, args) 149 | -------------------------------------------------------------------------------- /bpe2char/wmt15_manyen_bpe2char_adam.txt: -------------------------------------------------------------------------------- 1 | train_data_path deen/train/ csen/train/ fien/train/ ruen/train/ 2 | dev_data_path deen/dev/ csen/dev/ fien/dev/ ruen/dev/ 3 | 4 | max_epochs 1000000000000 5 | patience -1 6 | learning_rate 0.0001 7 | 8 | optimizer adam 9 | decay_c 0 10 | use_dropout 0 11 | clip_c 1 12 | 13 | source_word_level 1 14 | target_word_level 0 15 | 16 | source_dataset all_de-en.de.tok.shuf.iso9.bpe.50000 all_cs-en.cs.tok.iso9.bpe.50000 all_fi-en.fi.tok.shuf.iso9.bpe.50000 all_ru-en.ru.tok.iso9.bpe.50000 17 | target_dataset all_de-en.en.tok.shuf.iso9 all_cs-en.en.tok.iso9 all_fi-en.en.tok.shuf.iso9 all_ru-en.en.tok.iso9 18 | 19 | valid_source_dataset newstest2013.de.tok.iso9.bpe.50000 newstest2013-ref.cs.tok.iso9.bpe.50000 newsdev2015-enfi-ref.fi.tok.iso9.bpe.50000 newstest2013-ref.ru.tok.iso9.bpe.50000 20 | valid_target_dataset newstest2013.en.tok.iso9 newstest2013-src.en.tok.iso9 newsdev2015-enfi-src.en.tok.iso9 newstest2013-src.en.tok.iso9 21 | -------------------------------------------------------------------------------- 
/bpe2char/wmt_path.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | deen={ 4 | "dic": [ 5 | ["deen/train/all_de-en.de.tok.304.pkl", 6 | "deen/train/all_de-en.en.tok.300.pkl",], 7 | 8 | ["deen/train/all_de-en.de.tok.bpe.word.pkl"], 9 | ], 10 | 11 | "train": [ 12 | ["deen/train/all_de-en.de.tok.shuf", 13 | "deen/train/all_de-en.en.tok.shuf",], 14 | 15 | ["deen/train/all_de-en.de.tok.bpe.shuf", 16 | "deen/train/all_de-en.en.tok.bpe.shuf",], 17 | ], 18 | 19 | "dev": [ 20 | ["deen/dev/newstest2013.de.tok", 21 | "deen/dev/newstest2013.en.tok",], 22 | 23 | ["deen/dev/newstest2013.de.tok.bpe", 24 | "deen/dev/newstest2013.en.tok.bpe",], 25 | ], 26 | 27 | "test1" :[ 28 | ["deen/test/newstest2014-deen-ref.de.tok", 29 | "deen/test/newstest2014-deen-src.en.tok",], 30 | 31 | ["deen/test/newstest2014-deen-ref.de.tok.bpe", 32 | "deen/test/newstest2014-deen-src.en.tok.bpe",], 33 | ], 34 | 35 | "test2":[ 36 | ["deen/test/newstest2015-deen-ref.de.tok", 37 | "deen/test/newstest2015-deen-src.en.tok",], 38 | 39 | ["deen/test/newstest2015-deen-ref.de.tok.bpe", 40 | "deen/test/newstest2015-deen-src.en.tok.bpe",], 41 | ], 42 | } 43 | 44 | csen={ 45 | 46 | "dic":[ 47 | ["csen/train/all_cs-en.cs.tok.304.pkl", 48 | "csen/train/all_cs-en.en.tok.300.pkl",], 49 | 50 | ["csen/train/all_cs-en.cs.tok.bpe.word.pkl"], 51 | ], 52 | 53 | "train":[ 54 | ["csen/train/all_cs-en.cs.tok", 55 | "csen/train/all_cs-en.en.tok",], 56 | 57 | ["csen/train/all_cs-en.cs.tok.bpe", 58 | "csen/train/all_cs-en.en.tok.bpe",], 59 | ], 60 | 61 | "dev": [ 62 | ["csen/dev/newstest2013-ref.cs.tok", 63 | "csen/dev/newstest2013-src.en.tok",], 64 | 65 | ["csen/dev/newstest2013-ref.cs.tok.bpe", 66 | "csen/dev/newstest2013-src.en.tok.bpe",], 67 | ], 68 | 69 | "test1":[ 70 | ["csen/test/newstest2014-csen-ref.cs.tok", 71 | "csen/test/newstest2014-csen-src.en.tok",], 72 | 73 | ["csen/test/newstest2014-csen-ref.cs.tok.bpe", 74 | "csen/test/newstest2014-csen-src.en.tok.bpe",], 75 | ], 76 | 77 | "test2":[ 78 | ["csen/test/newstest2015-csen-ref.cs.tok", 79 | "csen/test/newstest2015-csen-src.en.tok",], 80 | 81 | ["csen/test/newstest2015-csen-ref.cs.tok.bpe", 82 | "csen/test/newstest2015-csen-src.en.tok.bpe",], 83 | ] 84 | } 85 | 86 | fien={ 87 | "dic":[ 88 | ["fien/train/all_fi-en.fi.tok.304.pkl", 89 | "fien/train/all_fi-en.en.tok.300.pkl",], 90 | 91 | ["fien/train/all_fi-en.fi.tok.bpe.word.pkl"], 92 | ], 93 | 94 | "train":[ 95 | ["fien/train/all_fi-en.fi.tok", 96 | "fien/train/all_fi-en.en.tok",], 97 | 98 | ["fien/train/all_fi-en.fi.tok.bpe", 99 | "fien/train/all_fi-en.en.tok.bpe",], 100 | ], 101 | 102 | "dev":[ 103 | ["fien/dev/newsdev2015-enfi-ref.fi.tok", 104 | "fien/dev/newsdev2015-enfi-src.en.tok",], 105 | 106 | ["fien/dev/newsdev2015-enfi-ref.fi.tok.bpe", 107 | "fien/dev/newsdev2015-enfi-src.en.tok.bpe",], 108 | ], 109 | 110 | "test1":[ 111 | ["fien/test/newstest2015-fien-ref.fi.tok", 112 | "fien/test/newstest2015-fien-src.en.tok",], 113 | 114 | ["fien/test/newstest2015-fien-ref.fi.tok.bpe", 115 | "fien/test/newstest2015-fien-src.en.tok.bpe",], 116 | ], 117 | } 118 | 119 | ruen={ 120 | 121 | "dic":[ 122 | ["ruen/train/all_ru-en.ru.tok.304.pkl", 123 | "ruen/train/all_ru-en.en.tok.300.pkl",], 124 | 125 | ["ruen/train/all_ru-en.ru.tok.bpe.word.pkl"], 126 | ], 127 | 128 | "train":[ 129 | ["ruen/train/all_ru-en.ru.tok", 130 | "ruen/train/all_ru-en.en.tok",], 131 | 132 | ["ruen/train/all_ru-en.ru.tok.bpe", 133 | "ruen/train/all_ru-en.en.tok.bpe",], 134 | ], 135 | 136 | "dev":[ 137 | 
["ruen/dev/newstest2013-ref.ru.tok", 138 | "ruen/dev/newstest2013-src.en.tok",], 139 | 140 | ["ruen/dev/newstest2013-ref.ru.tok.bpe", 141 | "ruen/dev/newstest2013-src.en.tok.bpe",], 142 | ], 143 | 144 | "test1":[ 145 | ["ruen/test/newstest2014-ruen-ref.ru.tok", 146 | "ruen/test/newstest2014-ruen-src.en.tok",], 147 | 148 | ["ruen/test/newstest2014-ruen-ref.ru.tok.bpe", 149 | "ruen/test/newstest2014-ruen-src.en.tok.bpe",], 150 | ], 151 | 152 | "test2":[ 153 | ["ruen/test/newstest2015-ruen-ref.ru.tok", 154 | "ruen/test/newstest2015-ruen-src.en.tok",], 155 | 156 | ["ruen/test/newstest2015-ruen-ref.ru.tok.bpe", 157 | "ruen/test/newstest2015-ruen-src.en.tok.bpe",], 158 | ] 159 | } 160 | 161 | manyen = { 162 | "dic":[ 163 | ["char-source-for-dic.300.pkl", 164 | "char-target-for-dic.300.pkl"], 165 | 166 | ["bpe-source-for-dic.word.pkl"] 167 | ] 168 | } 169 | 170 | wmts = dict() 171 | wmts["de_en"] = deen 172 | wmts["cs_en"] = csen 173 | wmts["fi_en"] = fien 174 | wmts["ru_en"] = ruen 175 | wmts["many_en"] = manyen 176 | -------------------------------------------------------------------------------- /char2char/char_base.py: -------------------------------------------------------------------------------- 1 | import theano 2 | from theano import tensor 3 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 4 | 5 | import cPickle 6 | import numpy 7 | import copy 8 | 9 | import os 10 | import warnings 11 | import sys 12 | import time 13 | 14 | from collections import OrderedDict 15 | from mixer import * 16 | 17 | def init_params(options): 18 | params = OrderedDict() 19 | 20 | print "new char_base initialise..." 21 | print "source dictionary size: %d" % options['n_words_src'] 22 | # embedding 23 | params['Wemb'] = norm_weight(options['n_words_src'], options['dim_word_src']) 24 | params['Wemb_dec'] = norm_weight(options['n_words'], options['dim_word']) 25 | 26 | params = get_layer('multi_scale_conv_encoder')[0](options, params, prefix='multi_scale_conv_enc1', dim=options['dim_word_src'], width=options['conv_width'], nkernels=options['conv_nkernels']) 27 | 28 | for ii in xrange(options['highway']): 29 | params = get_layer('hw')[0](options, params, prefix="hw_network{}".format(ii+1), dim=numpy.sum(options['conv_nkernels'])) 30 | 31 | params = get_layer('gru')[0](options, params, 32 | prefix='encoder', 33 | nin=numpy.sum(options['conv_nkernels']), 34 | dim=options['enc_dim']) 35 | params = get_layer('gru')[0](options, params, 36 | prefix='encoderr', 37 | nin=numpy.sum(options['conv_nkernels']), 38 | dim=options['enc_dim']) 39 | ctxdim = 2 * options['enc_dim'] 40 | 41 | params = get_layer('ff')[0](options, params, 42 | prefix='ff_init_state_char', 43 | nin=ctxdim, 44 | nout=options['dec_dim']) 45 | params = get_layer('ff')[0](options, params, 46 | prefix='ff_init_state_word', 47 | nin=ctxdim, 48 | nout=options['dec_dim']) 49 | 50 | print "target dictionary size: %d" % options['n_words'] 51 | # decoder 52 | params = get_layer('two_layer_gru_decoder')[0](options, params, 53 | prefix='decoder', 54 | nin=options['dim_word'], 55 | dim_char=options['dec_dim'], 56 | dim_word=options['dec_dim'], 57 | dimctx=ctxdim) 58 | 59 | # readout 60 | params = get_layer('fff')[0](options, params, prefix='ff_logit_rnn', 61 | nin1=options['dec_dim'], nin2=options['dec_dim'], 62 | nout=options['dim_word'], ortho=False) 63 | params = get_layer('ff')[0](options, params, prefix='ff_logit_prev', 64 | nin=options['dim_word'], 65 | nout=options['dim_word'], 66 | ortho=False) 67 | params = get_layer('ff')[0](options, 
params, prefix='ff_logit_ctx', 68 | nin=ctxdim, 69 | nout=options['dim_word'], 70 | ortho=False) 71 | params = get_layer('ff')[0](options, params, prefix='ff_logit', 72 | nin=options['dim_word'], 73 | nout=options['n_words']) 74 | 75 | return params 76 | 77 | 78 | def build_model(tparams, options): 79 | opt_ret = OrderedDict() 80 | 81 | trng = RandomStreams(numpy.random.RandomState(numpy.random.randint(1024)).randint(numpy.iinfo(numpy.int32).max)) 82 | use_noise = theano.shared(numpy.float32(0.)) 83 | 84 | # description string: #words x #samples 85 | x = tensor.matrix('x', dtype='int64') 86 | x_mask = tensor.matrix('x_mask', dtype='float32') 87 | 88 | y = tensor.matrix('y', dtype='int64') 89 | y_mask = tensor.matrix('y_mask', dtype='float32') 90 | 91 | xr_mask = x_mask[::-1] 92 | 93 | n_samples = x.shape[1] 94 | n_timesteps = x.shape[0] 95 | n_timesteps_trg = y.shape[0] 96 | 97 | # word embedding for forward RNN (source) 98 | emb = tparams['Wemb'][x.flatten()] 99 | emb = emb.reshape([n_timesteps, n_samples, options['dim_word_src']]) 100 | # emb.shape = (maxlen_x_pad + 2*pool_stride, n_samples, dim_word_src) 101 | 102 | conv_out = get_layer('multi_scale_conv_encoder')[1](tparams, emb, options, prefix='multi_scale_conv_enc1', width=options['conv_width'], nkernels=options['conv_nkernels'], pool_window=options['pool_window'], pool_stride=options['pool_stride']) 103 | # conv_out.shape = (maxlen_x_pad/pool_stride, n_samples, sum(nkernels)) 104 | 105 | hw_in = conv_out.reshape([conv_out.shape[0] * conv_out.shape[1], conv_out.shape[2]]) 106 | for ii in xrange(options['highway']): 107 | hw_in = get_layer('hw')[1](tparams, hw_in, options, prefix="hw_network{}".format(ii+1)) 108 | hw_out = hw_in.reshape([conv_out.shape[0], conv_out.shape[1], conv_out.shape[2]]) 109 | # hw_out.shape = (maxlen_x_pad/pool_stride, n_samples, sum(nkernels)) 110 | 111 | if options['dropout_gru']: 112 | print "Dropout before GRUs." 
113 | hw_out = hw_out * trng.binomial(hw_out.shape, p=0.5, n=1, dtype=hw_out.dtype) * 2.0 114 | 115 | # pass through gru layer, recurrence here 116 | proj = get_layer('gru')[1](tparams, hw_out, options, prefix='encoder', mask=x_mask) 117 | projr = get_layer('gru')[1](tparams, hw_out[::-1], options, prefix='encoderr', mask=xr_mask) 118 | 119 | # context 120 | ctx = concatenate([proj, projr[::-1]], axis=proj.ndim-1) 121 | 122 | # context mean 123 | ctx_mean = (ctx * x_mask[:, :, None]).sum(0) / x_mask.sum(0)[:, None] 124 | 125 | # initial decoder state 126 | init_state_char = get_layer('ff')[1](tparams, ctx_mean, options, 127 | prefix='ff_init_state_char', activ='tanh') 128 | init_state_word = get_layer('ff')[1](tparams, ctx_mean, options, 129 | prefix='ff_init_state_word', activ='tanh') 130 | 131 | # word embedding and shifting for targets 132 | yemb = tparams['Wemb_dec'][y.flatten()] 133 | yemb = yemb.reshape([n_timesteps_trg, n_samples, options['dim_word']]) 134 | yemb_shited = tensor.zeros_like(yemb) 135 | yemb_shited = tensor.set_subtensor(yemb_shited[1:], yemb[:-1]) 136 | yemb = yemb_shited 137 | 138 | char_h, word_h, ctxs, alphas = \ 139 | get_layer('two_layer_gru_decoder')[1](tparams, yemb, options, 140 | prefix='decoder', 141 | mask=y_mask, 142 | context=ctx, 143 | context_mask=x_mask, 144 | one_step=False, 145 | init_state_char=init_state_char, 146 | init_state_word=init_state_word) 147 | 148 | opt_ret['dec_alphas'] = alphas 149 | 150 | # compute word probabilities 151 | logit_rnn = get_layer('fff')[1](tparams, char_h, word_h, options, 152 | prefix='ff_logit_rnn', activ='linear') 153 | logit_prev = get_layer('ff')[1](tparams, yemb, options, 154 | prefix='ff_logit_prev', activ='linear') 155 | logit_ctx = get_layer('ff')[1](tparams, ctxs, options, 156 | prefix='ff_logit_ctx', activ='linear') 157 | logit = tensor.tanh(logit_rnn + logit_prev + logit_ctx) 158 | 159 | if options['dropout_softmax']: 160 | print "Dropout before Softmax" 161 | logit = logit * trng.binomial(logit.shape, p=0.5, n=1, dtype=logit.dtype) * 2.0 162 | 163 | logit = get_layer('ff')[1](tparams, logit, options, 164 | prefix='ff_logit', activ='linear') 165 | logit_shp = logit.shape 166 | probs = tensor.nnet.softmax(logit.reshape([logit_shp[0]*logit_shp[1], logit_shp[2]])) 167 | 168 | # cost 169 | y_flat = y.flatten() 170 | y_flat_idx = tensor.arange(y_flat.shape[0]) * options['n_words'] + y_flat 171 | cost = -tensor.log(probs.flatten()[y_flat_idx]) 172 | cost = cost.reshape([y.shape[0], y.shape[1]]) 173 | cost = (cost * y_mask).sum(0) 174 | 175 | return trng, use_noise, x, x_mask, y, y_mask, opt_ret, cost 176 | 177 | def build_sampler(tparams, options, trng, use_noise): 178 | 179 | x = tensor.matrix('x', dtype='int64') 180 | 181 | n_timesteps = x.shape[0] 182 | n_samples = x.shape[1] 183 | 184 | emb = tparams['Wemb'][x.flatten()] 185 | emb = emb.reshape([n_timesteps, n_samples, options['dim_word_src']]) 186 | 187 | conv_out = get_layer('multi_scale_conv_encoder')[1](tparams, emb, options, prefix='multi_scale_conv_enc1', width=options['conv_width'], nkernels=options['conv_nkernels'], pool_window=options['pool_window'], pool_stride=options['pool_stride']) 188 | 189 | hw_in = conv_out.reshape([conv_out.shape[0] * conv_out.shape[1], conv_out.shape[2]]) 190 | for ii in xrange(options['highway']): 191 | hw_in = get_layer('hw')[1](tparams, hw_in, options, prefix="hw_network{}".format(ii+1)) 192 | hw_out = hw_in.reshape([conv_out.shape[0], conv_out.shape[1], conv_out.shape[2]]) 193 | 194 | # pass through gru layer, recurrence 
here 195 | proj = get_layer('gru')[1](tparams, hw_out, options, prefix='encoder') 196 | projr = get_layer('gru')[1](tparams, hw_out[::-1], options, prefix='encoderr') 197 | 198 | ctx = concatenate([proj, projr[::-1]], axis=proj.ndim-1) 199 | ctx_mean = ctx.mean(0) 200 | 201 | init_state_char = get_layer('ff')[1](tparams, ctx_mean, options, 202 | prefix='ff_init_state_char', activ='tanh') 203 | init_state_word = get_layer('ff')[1](tparams, ctx_mean, options, 204 | prefix='ff_init_state_word', activ='tanh') 205 | 206 | print 'Building f_init...', 207 | outs = [init_state_char, init_state_word, ctx] 208 | f_init = theano.function([x], outs, name='f_init', profile=profile) 209 | print 'Done' 210 | 211 | y = tensor.vector('y_sampler', dtype='int64') 212 | init_state_char = tensor.matrix('init_state_char', dtype='float32') 213 | init_state_word = tensor.matrix('init_state_word', dtype='float32') 214 | 215 | # if it's the first word, emb should be all zero and it is indicated by -1 216 | yemb = tensor.switch(y[:, None] < 0, 217 | tensor.alloc(0., 1, tparams['Wemb_dec'].shape[1]), 218 | tparams['Wemb_dec'][y]) 219 | 220 | next_state_char, next_state_word, next_ctx, next_alpha = \ 221 | get_layer('two_layer_gru_decoder')[1](tparams, yemb, options, 222 | prefix='decoder', 223 | context=ctx, 224 | mask=None, 225 | one_step=True, 226 | init_state_char=init_state_char, 227 | init_state_word=init_state_word) 228 | 229 | logit_rnn = get_layer('fff')[1](tparams, 230 | next_state_char, 231 | next_state_word, 232 | options, 233 | prefix='ff_logit_rnn', 234 | activ='linear') 235 | # dec_dim, dec_dim => dim_word 236 | logit_prev = get_layer('ff')[1](tparams, 237 | yemb, 238 | options, 239 | prefix='ff_logit_prev', 240 | activ='linear') 241 | # dim_word => dim_word 242 | logit_ctx = get_layer('ff')[1](tparams, 243 | next_ctx, 244 | options, 245 | prefix='ff_logit_ctx', 246 | activ='linear') 247 | # ctx_dim => dim_word 248 | logit = tensor.tanh(logit_rnn + logit_prev + logit_ctx) 249 | 250 | logit = get_layer('ff')[1](tparams, logit, options, 251 | prefix='ff_logit', 252 | activ='linear') 253 | next_probs = tensor.nnet.softmax(logit) 254 | next_sample = trng.multinomial(pvals=next_probs).argmax(1) 255 | 256 | # next word probability 257 | print 'Building f_next...', 258 | inps = [y, ctx, init_state_char, init_state_word] 259 | outs = [next_probs, next_sample, next_state_char, next_state_word] 260 | f_next = theano.function(inps, outs, name='f_next', profile=profile) 261 | print 'Done' 262 | 263 | return f_init, f_next 264 | 265 | def gen_sample(tparams, f_init, f_next, x, options, trng=None, 266 | k=1, maxlen=500, stochastic=True, argmax=False): 267 | 268 | # k is the beam size we have 269 | if k > 1: 270 | assert not stochastic, \ 271 | 'Beam search does not support stochastic sampling' 272 | 273 | sample = [] 274 | sample_score = [] 275 | if stochastic: 276 | sample_score = 0 277 | 278 | live_k = 1 279 | dead_k = 0 280 | 281 | hyp_samples = [[]] * live_k 282 | hyp_scores = numpy.zeros(live_k).astype('float32') 283 | hyp_states = [] 284 | 285 | # get initial state of decoder rnn and encoder context 286 | ret = f_init(x) 287 | 288 | next_state_char, next_state_word, ctx0 = ret[0], ret[1], ret[2] 289 | next_w = -1 * numpy.ones((1,)).astype('int64') # bos indicator 290 | 291 | for ii in xrange(maxlen): 292 | ctx = numpy.tile(ctx0, [live_k, 1]) 293 | inps = [next_w, ctx, next_state_char, next_state_word] 294 | 295 | ret = f_next(*inps) 296 | 297 | next_p, next_w, next_state_char, next_state_word = ret[0], ret[1], 
ret[2], ret[3] 298 | 299 | # FALSE while decoding 300 | if stochastic: 301 | if argmax: 302 | nw = next_p[0].argmax() 303 | else: 304 | nw = next_w[0] 305 | sample.append(nw) 306 | sample_score += next_p[0, nw] 307 | if nw == 0: 308 | break 309 | else: 310 | cand_scores = hyp_scores[:, None] - numpy.log(next_p) 311 | cand_flat = cand_scores.flatten() 312 | 313 | ranks_flat = cand_flat.argsort()[:(k-dead_k)] 314 | # k: beam width 315 | # dead_k : initially 0, increments 1 by 1 316 | 317 | voc_size = next_p.shape[1] 318 | trans_indices = ranks_flat / voc_size 319 | word_indices = ranks_flat % voc_size 320 | costs = cand_flat[ranks_flat] 321 | # here, basically sort cand_flat 322 | 323 | new_hyp_samples = [] 324 | # k : beam width 325 | new_hyp_scores = numpy.zeros(k-dead_k).astype('float32') 326 | new_hyp_states_char = [] 327 | new_hyp_states_word = [] 328 | 329 | for idx, [ti, wi] in enumerate(zip(trans_indices, word_indices)): 330 | new_hyp_samples.append(hyp_samples[ti]+[wi]) 331 | new_hyp_scores[idx] = copy.copy(costs[idx]) 332 | new_hyp_states_char.append(copy.copy(next_state_char[ti])) 333 | new_hyp_states_word.append(copy.copy(next_state_word[ti])) 334 | 335 | # check the finished samples 336 | new_live_k = 0 337 | hyp_samples = [] 338 | hyp_scores = [] 339 | hyp_states_char = [] 340 | hyp_states_word = [] 341 | 342 | for idx in xrange(len(new_hyp_samples)): 343 | if new_hyp_samples[idx][-1] == 0: 344 | sample.append(new_hyp_samples[idx]) 345 | sample_score.append(new_hyp_scores[idx]) 346 | dead_k += 1 347 | else: 348 | new_live_k += 1 349 | hyp_samples.append(new_hyp_samples[idx]) 350 | hyp_scores.append(new_hyp_scores[idx]) 351 | hyp_states_char.append(new_hyp_states_char[idx]) 352 | hyp_states_word.append(new_hyp_states_word[idx]) 353 | hyp_scores = numpy.array(hyp_scores) 354 | live_k = new_live_k 355 | 356 | if new_live_k < 1: 357 | break 358 | if dead_k >= k: 359 | break 360 | 361 | next_w = numpy.array([w[-1] for w in hyp_samples]) 362 | next_state_char = numpy.array(hyp_states_char) 363 | next_state_word = numpy.array(hyp_states_word) 364 | 365 | if not stochastic: 366 | # dump every remaining one 367 | if live_k > 0: 368 | for idx in xrange(live_k): 369 | sample.append(hyp_samples[idx]) 370 | sample_score.append(hyp_scores[idx]) 371 | 372 | return sample, sample_score 373 | -------------------------------------------------------------------------------- /char2char/conv_tools.py: -------------------------------------------------------------------------------- 1 | import theano 2 | from theano import tensor 3 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 4 | 5 | import cPickle 6 | import numpy 7 | import copy 8 | 9 | import os 10 | import warnings 11 | import sys 12 | import time 13 | 14 | def conv_mask_pool(x_mask, pool_stride): 15 | # x_mask.shape = (maxlen_x_pad, n_samples) 16 | maxlen_x_pad, n_samples = x_mask.shape[0], x_mask.shape[1] 17 | maxlen_pooled = maxlen_x_pad / pool_stride 18 | 19 | x_m = numpy.zeros((maxlen_pooled, n_samples)).astype('float32') 20 | 21 | for idx in range(n_samples): 22 | x_sum = numpy.sum(x_mask[:,idx]) 23 | x_num = numpy.ceil( x_sum / float(pool_stride)) 24 | x_num = int(x_num) 25 | x_m[:x_num, idx] = 1.0 26 | 27 | return x_m 28 | -------------------------------------------------------------------------------- /char2char/data_iterator.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import numpy 3 | import os 4 | import random 5 | 6 | import cPickle 7 | import gzip 
8 | import codecs 9 | 10 | from tempfile import mkstemp 11 | 12 | random.seed(1029381209) 13 | 14 | def fopen(filename, mode='r'): 15 | if filename.endswith('.gz'): 16 | return gzip.open(filename, mode) 17 | return open(filename, mode) 18 | 19 | class TextIterator: 20 | """Simple Bitext iterator.""" 21 | def __init__(self, 22 | source, source_dict, 23 | target=None, target_dict=None, 24 | source_word_level=0, 25 | target_word_level=0, 26 | batch_size=128, 27 | job_id=0, 28 | sort_size=20, 29 | n_words_source=-1, 30 | n_words_target=-1, 31 | shuffle_per_epoch=False): 32 | self.source_file = source 33 | self.target_file = target 34 | self.source = fopen(source, 'r') 35 | with open(source_dict, 'rb') as f: 36 | self.source_dict = cPickle.load(f) 37 | if target is not None: 38 | self.target = fopen(target, 'r') 39 | if target_dict is not None: 40 | with open(target_dict, 'rb') as f: 41 | self.target_dict = cPickle.load(f) 42 | else: 43 | self.target = None 44 | 45 | self.source_word_level = source_word_level 46 | self.target_word_level = target_word_level 47 | self.batch_size = batch_size 48 | 49 | self.n_words_source = n_words_source 50 | self.n_words_target = n_words_target 51 | self.shuffle_per_epoch = shuffle_per_epoch 52 | 53 | self.source_buffer = [] 54 | self.target_buffer = [] 55 | self.k = batch_size * sort_size 56 | 57 | self.end_of_data = False 58 | self.job_id = job_id 59 | 60 | def __iter__(self): 61 | return self 62 | 63 | def reset(self): 64 | if self.shuffle_per_epoch: 65 | # close current files 66 | self.source.close() 67 | if self.target is None: 68 | self.shuffle([self.source_file]) 69 | self.source = fopen(self.source_file + '.reshuf_%d' % self.job_id, 'r') 70 | else: 71 | self.target.close() 72 | # shuffle *original* source files, 73 | self.shuffle([self.source_file, self.target_file]) 74 | # open newly 're-shuffled' file as input 75 | self.source = fopen(self.source_file + '.reshuf_%d' % self.job_id, 'r') 76 | self.target = fopen(self.target_file + '.reshuf_%d' % self.job_id, 'r') 77 | else: 78 | self.source.seek(0) 79 | if self.target is not None: 80 | self.target.seek(0) 81 | 82 | @staticmethod 83 | def shuffle(files): 84 | tf_os, tpath = mkstemp() 85 | tf = open(tpath, 'w') 86 | fds = [open(ff) for ff in files] 87 | for l in fds[0]: 88 | lines = [l.strip()] + [ff.readline().strip() for ff in fds[1:]] 89 | print >>tf, "|||".join(lines) 90 | [ff.close() for ff in fds] 91 | tf.close() 92 | tf = open(tpath, 'r') 93 | lines = tf.readlines() 94 | random.shuffle(lines) 95 | fds = [open(ff+'.reshuf','w') for ff in files] 96 | for l in lines: 97 | s = l.strip().split('|||') 98 | for ii, fd in enumerate(fds): 99 | print >>fd, s[ii] 100 | [ff.close() for ff in fds] 101 | os.remove(tpath) 102 | return 103 | 104 | def next(self): 105 | if self.end_of_data: 106 | self.end_of_data = False 107 | self.reset() 108 | raise StopIteration 109 | 110 | source = [] 111 | target = [] 112 | 113 | # fill buffer, if it's empty 114 | if self.target is not None: 115 | assert len(self.source_buffer) == len(self.target_buffer), 'Buffer size mismatch!' 
116 | 117 | if len(self.source_buffer) == 0: 118 | for k_ in xrange(self.k): 119 | #rand_idx = random.randint(0,len(self.source_buffer)) 120 | 121 | ss = self.source.readline() 122 | 123 | if ss == "": 124 | break 125 | 126 | if self.source_word_level: 127 | ss = ss.strip().split() 128 | else: 129 | ss = ss.strip() 130 | ss = list(ss.decode('utf8')) 131 | 132 | #self.source_buffer.insert(rand_idx, ss) 133 | self.source_buffer.append(ss) 134 | 135 | if self.target is not None: 136 | tt = self.target.readline() 137 | 138 | if tt == "": 139 | break 140 | 141 | if self.target_word_level: 142 | tt = tt.strip().split() 143 | else: 144 | tt = tt.strip() 145 | tt = list(tt.decode('utf8')) 146 | 147 | #self.target_buffer.insert(rand_idx, tt) 148 | self.target_buffer.append(tt) 149 | 150 | if self.target is not None: 151 | # sort by target buffer 152 | tlen = numpy.array([len(t) for t in self.target_buffer]) 153 | tidx = tlen.argsort() 154 | _sbuf = [self.source_buffer[i] for i in tidx] 155 | _tbuf = [self.target_buffer[i] for i in tidx] 156 | self.target_buffer = _tbuf 157 | else: 158 | slen = numpy.array([len(s) for s in self.source_buffer]) 159 | sidx = slen.argsort() 160 | _sbuf = [self.source_buffer[i] for i in sidx] 161 | 162 | self.source_buffer = _sbuf 163 | 164 | if self.target is not None: 165 | if len(self.source_buffer) == 0 or len(self.target_buffer) == 0: 166 | self.end_of_data = False 167 | self.reset() 168 | raise StopIteration 169 | elif len(self.source_buffer) == 0: 170 | self.end_of_data = False 171 | self.reset() 172 | raise StopIteration 173 | 174 | try: 175 | # actual work here 176 | while True: 177 | # read from source file and map to word index 178 | try: 179 | ss_ = self.source_buffer.pop() 180 | except IndexError: 181 | break 182 | ss = [self.source_dict[w] if w in self.source_dict else 1 for w in ss_] 183 | if self.n_words_source > 0: 184 | ss = [w if w < self.n_words_source else 1 for w in ss] 185 | 186 | # NOTE : prepending and appending with SOS and EOS symbols 187 | # see preprocess/build_dictionary_char.py to see why 2 and 3. 
188 | ss = [2] + ss + [3] 189 | source.append(ss) 190 | 191 | if self.target is not None: 192 | # read from target file and map to word index 193 | tt_ = self.target_buffer.pop() 194 | tt = [self.target_dict[w] if w in self.target_dict else 1 for w in tt_] 195 | if self.n_words_target > 0: 196 | tt = [w if w < self.n_words_target else 1 for w in tt] 197 | target.append(tt) 198 | 199 | if len(source) >= self.batch_size: 200 | break 201 | except IOError: 202 | self.end_of_data = True 203 | 204 | if self.target is not None: 205 | if len(source) <= 0 or len(target) <= 0: 206 | self.end_of_data = False 207 | self.reset() 208 | raise StopIteration 209 | return source, target 210 | else: 211 | if len(source) <= 0: 212 | self.end_of_data = False 213 | self.reset() 214 | raise StopIteration 215 | return source 216 | -------------------------------------------------------------------------------- /char2char/many_data_iterator.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import numpy 3 | import os 4 | import random 5 | 6 | import cPickle 7 | import gzip 8 | import codecs 9 | 10 | from tempfile import mkstemp 11 | 12 | random.seed(1029381209) 13 | 14 | def fopen(filename, mode='r'): 15 | if filename.endswith('.gz'): 16 | return gzip.open(filename, mode) 17 | return open(filename, mode) 18 | 19 | class MultiTextIterator: 20 | """Simple Bitext iterator.""" 21 | def __init__(self, 22 | source, source_dict, 23 | target=None, target_dict=None, 24 | source_word_level=0, 25 | target_word_level=0, 26 | batch_size=[128,1,2,3], 27 | job_id=0, 28 | sort_size=20, 29 | n_words_source=302, 30 | n_words_target=302, 31 | shuffle_per_epoch=False): 32 | 33 | self.source_files = source 34 | self.target_files = target 35 | 36 | self.sources = [fopen(s, 'r') for s in source] 37 | with open(source_dict, 'rb') as f: 38 | self.source_dict = cPickle.load(f) 39 | # one source dictionary 40 | 41 | self.targets = [fopen(t, 'r') for t in target] 42 | with open(target_dict, 'rb') as f: 43 | self.target_dict = cPickle.load(f) 44 | # one target dictionary 45 | 46 | self.source_word_level = source_word_level 47 | self.target_word_level = target_word_level 48 | self.batch_sizes = batch_size 49 | # list 50 | 51 | self.n_words_source = n_words_source 52 | self.n_words_target = n_words_target 53 | self.shuffle_per_epoch = shuffle_per_epoch 54 | 55 | self.source_buffers = [[],[],[],[]] 56 | self.target_buffers = [[],[],[],[]] 57 | self.k = [bs * sort_size for bs in batch_size] 58 | # at once, fetch 20 items 59 | # we're good for 20 updates 60 | 61 | self.end_of_data = False 62 | self.job_id = job_id 63 | 64 | def __iter__(self): 65 | return self 66 | 67 | def reset(self): 68 | if self.shuffle_per_epoch: 69 | raise Exception("hi") 70 | # close current files 71 | for s in self.sources: 72 | s.close() 73 | 74 | if self.targets is None: 75 | self.shuffle([self.source_file]) 76 | self.source = fopen(self.source_file + '.reshuf_%d' % self.job_id, 'r') 77 | else: 78 | for t in self.targets: 79 | t.close() 80 | 81 | # shuffle *original* source files, 82 | self.shuffle([self.source_file, self.target_file]) 83 | # open newly 're-shuffled' file as input 84 | self.source = fopen(self.source_file + '.reshuf_%d' % self.job_id, 'r') 85 | self.target = fopen(self.target_file + '.reshuf_%d' % self.job_id, 'r') 86 | else: 87 | for idx in xrange(4): 88 | self.sources[idx].seek(0) 89 | self.targets[idx].seek(0) 90 | 91 | @staticmethod 92 | def shuffle(files): 93 | tf_os, tpath = mkstemp() 94 | tf = 
open(tpath, 'w') 95 | fds = [open(ff) for ff in files] 96 | for l in fds[0]: 97 | lines = [l.strip()] + [ff.readline().strip() for ff in fds[1:]] 98 | print >>tf, "|||".join(lines) 99 | [ff.close() for ff in fds] 100 | tf.close() 101 | tf = open(tpath, 'r') 102 | lines = tf.readlines() 103 | random.shuffle(lines) 104 | fds = [open(ff+'.reshuf','w') for ff in files] 105 | for l in lines: 106 | s = l.strip().split('|||') 107 | for ii, fd in enumerate(fds): 108 | print >>fd, s[ii] 109 | [ff.close() for ff in fds] 110 | os.remove(tpath) 111 | return 112 | 113 | def next(self): 114 | # if end_of_data reaches, stop for loop 115 | if self.end_of_data: 116 | self.end_of_data = False 117 | self.reset() 118 | raise StopIteration 119 | 120 | sources = [[],[],[],[]] 121 | targets = [[],[],[],[]] 122 | # NOTE : this is the data to be used for "this" round of updates 123 | 124 | # fill buffer, if it's empty 125 | for idx in xrange(4): 126 | assert len(self.source_buffers[idx]) == len(self.target_buffers[idx]), 'Buffer size mismatch!' 127 | 128 | for idx in xrange(4): 129 | 130 | # NOTE : in buffer: don't put the whole dataset in... only for 'k' many updates 131 | # after 'k' updates, self.source_buffers[idx] will be empty, in which case we will put new things in 132 | 133 | #if len(self.source_buffers[idx]) == 0: 134 | if len(self.source_buffers[idx]) < self.batch_sizes[idx]: 135 | for k_ in xrange(self.k[idx]): 136 | 137 | ss = self.sources[idx].readline() 138 | # NOTE: self.sources is where we keep the RAW data 139 | if ss == "": 140 | break 141 | if self.source_word_level: 142 | ss = ss.strip().split() 143 | else: 144 | ss = ss.strip() 145 | ss = list(ss.decode('utf8')) 146 | self.source_buffers[idx].append(ss) 147 | 148 | tt = self.targets[idx].readline() 149 | if tt == "": 150 | break 151 | if self.target_word_level: 152 | tt = tt.strip().split() 153 | else: 154 | tt = tt.strip() 155 | tt = list(tt.decode('utf8')) 156 | self.target_buffers[idx].append(tt) 157 | 158 | tlen = numpy.array([len(t) for t in self.target_buffers[idx]]) 159 | tidx = tlen.argsort() 160 | _sbuf = [self.source_buffers[idx][i] for i in tidx] 161 | _tbuf = [self.target_buffers[idx][i] for i in tidx] 162 | self.target_buffers[idx] = _tbuf 163 | self.source_buffers[idx] = _sbuf 164 | 165 | stop = False 166 | for idx in xrange(4): 167 | if len(self.source_buffers[idx]) < self.batch_sizes[idx]: 168 | stop = True 169 | 170 | if stop: 171 | self.end_of_data = False 172 | self.reset() 173 | raise StopIteration 174 | 175 | try: 176 | # actual work here 177 | for idx in xrange(4): 178 | while True: 179 | # read from source file and map to word index 180 | try: 181 | ss_ = self.source_buffers[idx].pop() 182 | except IndexError: 183 | # NOTE : just because source_buffers is empty, doesn't mean file scanned 184 | # we do add partial batches. 
We proceed until len(source_buffers) = 0 185 | break 186 | 187 | ss = [self.source_dict[w] if w in self.source_dict else 1 for w in ss_] 188 | if self.n_words_source > 0: 189 | ss = [w if w < self.n_words_source else 1 for w in ss] 190 | 191 | # NOTE : prepending and appending with SOS and EOS symbols 192 | ss = [2] + ss + [3] 193 | sources[idx].append(ss) 194 | 195 | tt_ = self.target_buffers[idx].pop() 196 | tt = [self.target_dict[w] if w in self.target_dict else 1 for w in tt_] 197 | if self.n_words_target > 0: 198 | tt = [w if w < self.n_words_target else 1 for w in tt] 199 | targets[idx].append(tt) 200 | 201 | if len(sources[idx]) >= self.batch_sizes[idx]: 202 | break 203 | 204 | except IOError: 205 | self.end_of_data = True 206 | 207 | source = sources[0] + sources[1] + sources[2] + sources[3] 208 | target = targets[0] + targets[1] + targets[2] + targets[3] 209 | 210 | # NOTE : just add anything, if still nothing, reset 211 | min_batch_size = numpy.sum(self.batch_sizes) / float(1.0) 212 | if len(source) < min_batch_size or len(target) < min_batch_size: 213 | self.end_of_data = False 214 | self.reset() 215 | raise StopIteration 216 | 217 | return source, target 218 | -------------------------------------------------------------------------------- /char2char/prepare_data.py: -------------------------------------------------------------------------------- 1 | import theano 2 | from theano import tensor 3 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 4 | 5 | import cPickle 6 | import numpy 7 | import copy 8 | 9 | import os 10 | import warnings 11 | import sys 12 | import time 13 | 14 | from conv_tools import * 15 | 16 | from collections import OrderedDict 17 | from mixer import * 18 | 19 | # batch preparation for char2char models 20 | def prepare_data(seqs_x, seqs_y, pool_stride, maxlen=None, maxlen_trg=None): 21 | # x: a list of sentences 22 | lengths_x = [len(s) for s in seqs_x] 23 | lengths_y = [len(s) for s in seqs_y] 24 | 25 | if maxlen is not None: 26 | new_seqs_x = [] 27 | new_seqs_y = [] 28 | new_lengths_x = [] 29 | new_lengths_y = [] 30 | for l_x, s_x, l_y, s_y in zip(lengths_x, seqs_x, lengths_y, seqs_y): 31 | if l_x < maxlen and l_y < maxlen_trg: 32 | new_seqs_x.append(s_x) 33 | new_lengths_x.append(l_x) 34 | new_seqs_y.append(s_y) 35 | new_lengths_y.append(l_y) 36 | lengths_x = new_lengths_x 37 | seqs_x = new_seqs_x 38 | lengths_y = new_lengths_y 39 | seqs_y = new_seqs_y 40 | 41 | if len(lengths_x) < 1 or len(lengths_y) < 1: 42 | return None, None, None, None, None 43 | 44 | # n_samples is not always equal to batch_size, can be smaller! 45 | n_samples = len(seqs_x) 46 | maxlen_x = numpy.max(lengths_x) # SOS, EOS symbols are already added in data_iterator.py, hence no extra trick here. 47 | maxlen_y = numpy.max(lengths_y) + 1 # account for EOS symbol at the end of the target sentence. 48 | 49 | maxlen_x_pad = int( numpy.ceil( maxlen_x / float(pool_stride) ) * pool_stride ) 50 | # 1st round padding, such that the length is a multiple of pool_stride 51 | 52 | x = numpy.zeros((maxlen_x_pad + 2*pool_stride, n_samples)).astype('int64') 53 | # 2nd round padding at the beginning & the end for consistency, because theano's "half convolution" pads with zero-vectors by default. We want to ensure we don't pad with actual zero vectors, but rather with PAD embeddings. This is for consistency. 
For more information, consult http://deeplearning.net/software/theano/library/tensor/nnet/conv.html#theano.tensor.nnet.conv2d 54 | 55 | y = numpy.zeros((maxlen_y, n_samples)).astype('int64') 56 | x_mask = numpy.zeros((maxlen_x_pad, n_samples)).astype('float32') 57 | 58 | y_mask = numpy.zeros((maxlen_y, n_samples)).astype('float32') 59 | for idx, [s_x, s_y] in enumerate(zip(seqs_x, seqs_y)): 60 | x[ pool_stride : pool_stride + lengths_x[idx], idx] = s_x 61 | x_mask[:lengths_x[idx], idx] = 1. 62 | 63 | y[:lengths_y[idx], idx] = s_y 64 | y_mask[:lengths_y[idx]+1, idx] = 1. 65 | 66 | x_m = conv_mask_pool(x_mask, pool_stride) 67 | # x_m.shape = (maxlen_x_pad/pool_stride, n_samples) 68 | # x_m is used as masks at the GRU layer, note its length is reduced by pool_stride. 69 | 70 | return x, x_m, y, y_mask, n_samples 71 | -------------------------------------------------------------------------------- /char2char/print_batch.py: -------------------------------------------------------------------------------- 1 | import pprint 2 | import sys 3 | import numpy as np 4 | 5 | def pbatch(source, dic): 6 | ss = np.transpose(source) 7 | for line in ss[:10]: 8 | for word in line: 9 | a = dic[word] 10 | b = a 11 | 12 | if a == "SOS": 13 | b = "{" 14 | elif a == "EOS": 15 | b = "}" 16 | elif a == "ZERO": 17 | b = "_" 18 | elif a == "UNK": 19 | b = "|" 20 | 21 | sys.stdout.write(b) 22 | print " " 23 | print "" 24 | 25 | def pbatch_many(source, dic, n_x): 26 | ss = np.transpose(source) 27 | iis = [0, 20, n_x-8,n_x-1] 28 | 29 | for ii in iis: 30 | line = ss[ii] 31 | for word in line: 32 | a = dic[word] 33 | b = a 34 | 35 | if a == "SOS": 36 | b = "{" 37 | elif a == "EOS": 38 | b = "}" 39 | elif a == "ZERO": 40 | b = "_" 41 | elif a == "UNK": 42 | b = "|" 43 | 44 | sys.stdout.write(b) 45 | print " " 46 | print "" 47 | -------------------------------------------------------------------------------- /char2char/train_bi_char2char.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | import string 5 | from collections import OrderedDict 6 | from wmt_path import * 7 | from char_base import * 8 | from nmt import train 9 | from conv_tools import * 10 | from prepare_data import * 11 | 12 | def main(job_id, args): 13 | save_file_name = args.model_name 14 | source_dataset = args.data_path + wmts[args.translate]['train'][0][0] 15 | target_dataset = args.data_path + wmts[args.translate]['train'][0][1] 16 | valid_source_dataset = args.data_path + wmts[args.translate]['dev'][0][0] 17 | valid_target_dataset = args.data_path + wmts[args.translate]['dev'][0][1] 18 | source_dictionary = args.data_path + wmts[args.translate]['dic'][0][0] 19 | target_dictionary = args.data_path + wmts[args.translate]['dic'][0][1] 20 | 21 | print args.model_path, save_file_name 22 | print source_dataset 23 | print target_dataset 24 | print valid_source_dataset 25 | print valid_target_dataset 26 | print source_dictionary 27 | print target_dictionary 28 | validerr = train( 29 | highway=args.highway, 30 | 31 | max_epochs=args.max_epochs, 32 | patience=args.patience, 33 | 34 | dim_word_src=args.dim_word_src, 35 | dim_word=args.dim_word, 36 | 37 | conv_width=args.conv_width, 38 | conv_nkernels=args.conv_nkernels, 39 | 40 | pool_window=args.pool_window, 41 | pool_stride=args.pool_stride, 42 | 43 | model_path=args.model_path, 44 | save_file_name=save_file_name, 45 | re_load=args.re_load, 46 | re_load_old_setting=args.re_load_old_setting, 47 | 48 | enc_dim=args.enc_dim, 49 | 
dec_dim=args.dec_dim, 50 | 51 | n_words_src=args.n_words_src, 52 | n_words=args.n_words, 53 | decay_c=args.decay_c, 54 | lrate=args.learning_rate, 55 | optimizer=args.optimizer, 56 | maxlen=args.maxlen, 57 | maxlen_trg=args.maxlen_trg, 58 | maxlen_sample=args.maxlen_sample, 59 | batch_size=args.batch_size, 60 | valid_batch_size=args.valid_batch_size, 61 | sort_size=args.sort_size, 62 | validFreq=args.validFreq, 63 | dispFreq=args.dispFreq, 64 | saveFreq=args.saveFreq, 65 | sampleFreq=args.sampleFreq, 66 | pbatchFreq=args.pbatchFreq, 67 | clip_c=args.clip_c, 68 | 69 | datasets=[source_dataset, target_dataset], 70 | valid_datasets=[valid_source_dataset, valid_target_dataset], 71 | dictionaries=[source_dictionary, target_dictionary], 72 | 73 | dropout_gru=args.dropout_gru, 74 | dropout_softmax=args.dropout_softmax, 75 | source_word_level=args.source_word_level, 76 | target_word_level=args.target_word_level, 77 | save_every_saveFreq=1, 78 | use_bpe=0, 79 | quit_immediately=args.quit_immediately, 80 | init_params=init_params, 81 | build_model=build_model, 82 | build_sampler=build_sampler, 83 | gen_sample=gen_sample, 84 | prepare_data=prepare_data, 85 | ) 86 | return validerr 87 | 88 | if __name__ == '__main__': 89 | 90 | import sys, time 91 | 92 | parser = argparse.ArgumentParser() 93 | parser.add_argument('-translate', type=str, default="de_en", help="de_en / cs_en / fi_en / ru_en") 94 | parser.add_argument('-highway', type=int, default=4) 95 | 96 | parser.add_argument('-conv_width', type=str, default="1-2-3-4-5-6-7-8") 97 | parser.add_argument('-conv_nkernels', type=str, default="200-200-250-250-300-300-300-300") 98 | 99 | parser.add_argument('-pool_window', type=int, default=5) 100 | parser.add_argument('-pool_stride', type=int, default=5) 101 | 102 | parser.add_argument('-enc_dim', type=int, default=512) 103 | parser.add_argument('-dec_dim', type=int, default=1024) 104 | 105 | parser.add_argument('-dim_word', type=int, default=512) 106 | parser.add_argument('-dim_word_src', type=int, default=128) 107 | 108 | parser.add_argument('-batch_size', type=int, default=64, help="") 109 | parser.add_argument('-valid_batch_size', type=int, default=64, help="") 110 | 111 | parser.add_argument('-dropout_gru', type=int, default=0, help="") 112 | parser.add_argument('-dropout_softmax', type=int, default=0, help="") 113 | 114 | parser.add_argument('-maxlen', type=int, default=450, help="") 115 | parser.add_argument('-maxlen_trg', type=int, default=500, help="") 116 | parser.add_argument('-maxlen_sample', type=int, default=500, help="") 117 | 118 | parser.add_argument('-re_load', action="store_true", default=False) 119 | parser.add_argument('-re_load_old_setting', action="store_true", default=False) 120 | parser.add_argument('-quit_immediately', action="store_true", default=False, help="if true, will not proceed training, only print the size of the model.") 121 | 122 | parser.add_argument('-max_epochs', type=int, default=1000000000000, help="") 123 | parser.add_argument('-patience', type=int, default=-1, help="") 124 | parser.add_argument('-learning_rate', type=float, default=0.0001, help="") 125 | 126 | parser.add_argument('-n_words_src', type=int, default=304, help="298 for FI-EN") 127 | parser.add_argument('-n_words', type=int, default=302, help="292 for FI-EN") 128 | 129 | parser.add_argument('-optimizer', type=str, default="adam", help="") 130 | parser.add_argument('-decay_c', type=int, default=0, help="") 131 | parser.add_argument('-clip_c', type=int, default=1, help="") 132 | 133 | 
parser.add_argument('-saveFreq', type=int, default=5000, help="") 134 | parser.add_argument('-sampleFreq', type=int, default=5000, help="") 135 | parser.add_argument('-dispFreq', type=int, default=1000, help="") 136 | parser.add_argument('-validFreq', type=int, default=5000, help="") 137 | parser.add_argument('-pbatchFreq', type=int, default=5000, help="") 138 | parser.add_argument('-sort_size', type=int, default=20, help="") 139 | 140 | parser.add_argument('-source_word_level', type=int, default=0, help="") 141 | parser.add_argument('-target_word_level', type=int, default=0, help="") 142 | 143 | args = parser.parse_args() 144 | 145 | if args.translate == "fi_en": 146 | args.n_words_src = 304 147 | args.n_words = 302 148 | 149 | if args.translate not in "de_en cs_en fi_en ru_en".split(): 150 | raise Exception('1') 151 | 152 | args.model_name = "bi-char2char" 153 | 154 | args.conv_width = [ int(x) for x in args.conv_width.split("-") ] 155 | args.conv_nkernels = [ int(x) for x in args.conv_nkernels.split("-") ] 156 | 157 | args.model_path = "/misc/kcgscratch1/ChoGroup/jasonlee/dl4mt-c2c/models/" # change accordingly 158 | args.data_path = "/misc/kcgscratch1/ChoGroup/jasonlee/temp_data/wmt15/" # change accordingly 159 | args.model_path = args.model_path + args.translate + "/" 160 | 161 | print "Model path:", args.model_path 162 | 163 | print args 164 | main(0, args) 165 | -------------------------------------------------------------------------------- /char2char/train_multi_char2char.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | import string 5 | from collections import OrderedDict 6 | from wmt_path_iso9 import * 7 | from char_base import * 8 | from nmt_many import train 9 | from conv_tools import * 10 | from prepare_data import * 11 | 12 | def main(job_id, args): 13 | save_file_name = args.model_name 14 | langs = "de_en cs_en fi_en ru_en".split() 15 | source_dataset = [] 16 | target_dataset = [] 17 | valid_source_dataset = [] 18 | valid_target_dataset = [] 19 | 20 | for lang in langs: 21 | source_dataset.append(args.data_path + wmts[lang]['train'][0][0]) 22 | target_dataset.append(args.data_path + wmts[lang]['train'][0][1]) 23 | valid_source_dataset.append(args.data_path + wmts[lang]['dev'][0][0]) 24 | valid_target_dataset.append(args.data_path + wmts[lang]['dev'][0][1]) 25 | 26 | source_dictionary = args.data_path + wmts[args.translate]['dic'][0][0] 27 | target_dictionary = args.data_path + wmts[args.translate]['dic'][0][1] 28 | 29 | print args.model_path, save_file_name 30 | print source_dataset 31 | print target_dataset 32 | print valid_source_dataset 33 | print valid_target_dataset 34 | print source_dictionary 35 | print target_dictionary 36 | validerr = train( 37 | highway=args.highway, 38 | 39 | max_epochs=args.max_epochs, 40 | patience=args.patience, 41 | 42 | dim_word_src=args.dim_word_src, 43 | dim_word=args.dim_word, 44 | 45 | conv_width=args.conv_width, 46 | conv_nkernels=args.conv_nkernels, 47 | 48 | pool_window=args.pool_window, 49 | pool_stride=args.pool_stride, 50 | 51 | model_path=args.model_path, 52 | save_file_name=save_file_name, 53 | re_load=args.re_load, 54 | re_load_old_setting=args.re_load_old_setting, 55 | 56 | enc_dim=args.enc_dim, 57 | dec_dim=args.dec_dim, 58 | 59 | n_words_src=args.n_words_src, 60 | n_words=args.n_words, 61 | decay_c=args.decay_c, 62 | lrate=args.learning_rate, 63 | optimizer=args.optimizer, 64 | maxlen=args.maxlen, 65 | maxlen_trg=args.maxlen_trg, 66 | 
maxlen_sample=args.maxlen_sample, 67 | batch_size=args.train_batch_size, 68 | valid_batch_size=args.valid_batch_size, 69 | sort_size=args.sort_size, 70 | validFreq=args.validFreq, 71 | dispFreq=args.dispFreq, 72 | saveFreq=args.saveFreq, 73 | sampleFreq=args.sampleFreq, 74 | pbatchFreq=args.pbatchFreq, 75 | clip_c=args.clip_c, 76 | 77 | datasets=[source_dataset, target_dataset], 78 | valid_datasets=[[s,t] for s,t in zip(valid_source_dataset, valid_target_dataset)], 79 | dictionaries=[source_dictionary, target_dictionary], 80 | 81 | dropout_gru=args.dropout_gru, 82 | dropout_softmax=args.dropout_softmax, 83 | source_word_level=args.source_word_level, 84 | target_word_level=args.target_word_level, 85 | save_every_saveFreq=1, 86 | use_bpe=0, 87 | quit_immediately=args.quit_immediately, 88 | init_params=init_params, 89 | build_model=build_model, 90 | build_sampler=build_sampler, 91 | gen_sample=gen_sample, 92 | prepare_data=prepare_data, 93 | ) 94 | return validerr 95 | 96 | if __name__ == '__main__': 97 | 98 | import sys, time 99 | 100 | parser = argparse.ArgumentParser() 101 | parser.add_argument('-translate', type=str, default="many_en") 102 | parser.add_argument('-highway', type=int, default=4) 103 | 104 | parser.add_argument('-conv_width', type=str, default="1-2-3-4-5-6-7-8") 105 | parser.add_argument('-conv_nkernels', type=str, default="200-250-300-300-400-400-400-400") 106 | 107 | parser.add_argument('-pool_window', type=int, default=5) 108 | parser.add_argument('-pool_stride', type=int, default=5) 109 | 110 | parser.add_argument('-enc_dim', type=int, default=512) 111 | parser.add_argument('-dec_dim', type=int, default=1024) 112 | 113 | parser.add_argument('-dim_word', type=int, default=512) 114 | parser.add_argument('-dim_word_src', type=int, default=128) 115 | 116 | parser.add_argument('-dropout_gru', type=int, default=0, help="") 117 | parser.add_argument('-dropout_softmax', type=int, default=0, help="") 118 | 119 | parser.add_argument('-maxlen', type=int, default=400, help="") 120 | parser.add_argument('-maxlen_trg', type=int, default=500, help="") 121 | parser.add_argument('-maxlen_sample', type=int, default=500, help="") 122 | 123 | parser.add_argument('-train_batch_size', type=str,) 124 | parser.add_argument('-valid_batch_size', type=int, default=60, help="") 125 | parser.add_argument('-batch_size', type=int, default=60, help="") 126 | 127 | parser.add_argument('-re_load', action="store_true", default=False) 128 | parser.add_argument('-re_load_old_setting', action="store_true", default=False) 129 | parser.add_argument('-quit_immediately', action="store_true", default=False) 130 | 131 | parser.add_argument('-max_epochs', type=int, default=1000000000000, help="") 132 | parser.add_argument('-patience', type=int, default=-1, help="") 133 | parser.add_argument('-learning_rate', type=float, default=0.0001, help="") 134 | 135 | parser.add_argument('-n_words_src', type=int, default=404, help="") 136 | parser.add_argument('-n_words', type=int, default=402, help="") 137 | 138 | parser.add_argument('-optimizer', type=str, default="adam", help="") 139 | parser.add_argument('-decay_c', type=int, default=0, help="") 140 | parser.add_argument('-clip_c', type=int, default=1, help="") 141 | 142 | parser.add_argument('-saveFreq', type=int, default=5000, help="") 143 | parser.add_argument('-sampleFreq', type=int, default=5000, help="") 144 | parser.add_argument('-dispFreq', type=int, default=1000, help="") 145 | parser.add_argument('-validFreq', type=int, default=5000, help="") 146 | 
parser.add_argument('-pbatchFreq', type=int, default=5000, help="") 147 | parser.add_argument('-sort_size', type=int, default=20, help="") 148 | 149 | parser.add_argument('-source_word_level', type=int, default=0, help="") 150 | parser.add_argument('-target_word_level', type=int, default=0, help="") 151 | 152 | args = parser.parse_args() 153 | 154 | if args.translate != "many_en": 155 | raise Exception('1') 156 | 157 | args.train_batch_size = [ 14, 37, 6, 7 ] 158 | 159 | args.model_name = "multi-char2char" 160 | 161 | args.conv_width = [ int(x) for x in args.conv_width.split("-") ] 162 | args.conv_nkernels = [ int(x) for x in args.conv_nkernels.split("-") ] 163 | 164 | args.model_path = "/misc/kcgscratch1/ChoGroup/jasonlee/dl4mt-c2c/models/" # change accordingly 165 | args.data_path = "/misc/kcgscratch1/ChoGroup/jasonlee/temp_data/multi-wmt15/" # change accordingly 166 | args.model_path = args.model_path + args.translate + "/" 167 | 168 | print "Model path:", args.model_path 169 | 170 | print args 171 | main(0, args) 172 | -------------------------------------------------------------------------------- /char2char/wmt_path.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # Paths to training / valid / test corpus & dictionaries 3 | # For the bilingual models 4 | 5 | deen={ 6 | "dic": [ 7 | ["deen/train/all_de-en.de.tok.304.pkl", 8 | "deen/train/all_de-en.en.tok.300.pkl",], 9 | 10 | ["deen/train/all_de-en.de.tok.bpe.word.pkl"], 11 | ], 12 | 13 | "train": [ 14 | ["deen/train/all_de-en.de.tok.shuf", 15 | "deen/train/all_de-en.en.tok.shuf",], 16 | 17 | ["deen/train/all_de-en.de.tok.bpe.shuf", 18 | "deen/train/all_de-en.en.tok.bpe.shuf",], 19 | ], 20 | 21 | "dev": [ 22 | ["deen/dev/newstest2013.de.tok", 23 | "deen/dev/newstest2013.en.tok",], 24 | 25 | ["deen/dev/newstest2013.de.tok.bpe", 26 | "deen/dev/newstest2013.en.tok.bpe",], 27 | ], 28 | 29 | "test1" :[ 30 | ["deen/test/newstest2014-deen-ref.de.tok", 31 | "deen/test/newstest2014-deen-src.en.tok",], 32 | 33 | ["deen/test/newstest2014-deen-ref.de.tok.bpe", 34 | "deen/test/newstest2014-deen-src.en.tok.bpe",], 35 | ], 36 | 37 | "test2":[ 38 | ["deen/test/newstest2015-deen-ref.de.tok", 39 | "deen/test/newstest2015-deen-src.en.tok",], 40 | 41 | ["deen/test/newstest2015-deen-ref.de.tok.bpe", 42 | "deen/test/newstest2015-deen-src.en.tok.bpe",], 43 | ], 44 | } 45 | 46 | csen={ 47 | 48 | "dic":[ 49 | ["csen/train/all_cs-en.cs.tok.304.pkl", 50 | "csen/train/all_cs-en.en.tok.300.pkl",], 51 | 52 | ["csen/train/all_cs-en.cs.tok.bpe.word.pkl"], 53 | ], 54 | 55 | "train":[ 56 | ["csen/train/all_cs-en.cs.tok", 57 | "csen/train/all_cs-en.en.tok",], 58 | 59 | ["csen/train/all_cs-en.cs.tok.bpe", 60 | "csen/train/all_cs-en.en.tok.bpe",], 61 | ], 62 | 63 | "dev": [ 64 | ["csen/dev/newstest2013-ref.cs.tok", 65 | "csen/dev/newstest2013-src.en.tok",], 66 | 67 | ["csen/dev/newstest2013-ref.cs.tok.bpe", 68 | "csen/dev/newstest2013-src.en.tok.bpe",], 69 | ], 70 | 71 | "test1":[ 72 | ["csen/test/newstest2014-csen-ref.cs.tok", 73 | "csen/test/newstest2014-csen-src.en.tok",], 74 | 75 | ["csen/test/newstest2014-csen-ref.cs.tok.bpe", 76 | "csen/test/newstest2014-csen-src.en.tok.bpe",], 77 | ], 78 | 79 | "test2":[ 80 | ["csen/test/newstest2015-csen-ref.cs.tok", 81 | "csen/test/newstest2015-csen-src.en.tok",], 82 | 83 | ["csen/test/newstest2015-csen-ref.cs.tok.bpe", 84 | "csen/test/newstest2015-csen-src.en.tok.bpe",], 85 | ] 86 | } 87 | 88 | fien={ 89 | "dic":[ 90 | ["fien/train/all_fi-en.fi.tok.304.pkl", 91 | 
"fien/train/all_fi-en.en.tok.300.pkl",], 92 | 93 | ["fien/train/all_fi-en.fi.tok.bpe.word.pkl"], 94 | ], 95 | 96 | "train":[ 97 | ["fien/train/all_fi-en.fi.tok", 98 | "fien/train/all_fi-en.en.tok",], 99 | 100 | ["fien/train/all_fi-en.fi.tok.bpe", 101 | "fien/train/all_fi-en.en.tok.bpe",], 102 | ], 103 | 104 | "dev":[ 105 | ["fien/dev/newsdev2015-enfi-ref.fi.tok", 106 | "fien/dev/newsdev2015-enfi-src.en.tok",], 107 | 108 | ["fien/dev/newsdev2015-enfi-ref.fi.tok.bpe", 109 | "fien/dev/newsdev2015-enfi-src.en.tok.bpe",], 110 | ], 111 | 112 | "test1":[ 113 | ["fien/test/newstest2015-fien-ref.fi.tok", 114 | "fien/test/newstest2015-fien-src.en.tok",], 115 | 116 | ["fien/test/newstest2015-fien-ref.fi.tok.bpe", 117 | "fien/test/newstest2015-fien-src.en.tok.bpe",], 118 | ], 119 | } 120 | 121 | ruen={ 122 | 123 | "dic":[ 124 | ["ruen/train/all_ru-en.ru.tok.304.pkl", 125 | "ruen/train/all_ru-en.en.tok.300.pkl",], 126 | 127 | ["ruen/train/all_ru-en.ru.tok.bpe.word.pkl"], 128 | ], 129 | 130 | "train":[ 131 | ["ruen/train/all_ru-en.ru.tok", 132 | "ruen/train/all_ru-en.en.tok",], 133 | 134 | ["ruen/train/all_ru-en.ru.tok.bpe", 135 | "ruen/train/all_ru-en.en.tok.bpe",], 136 | ], 137 | 138 | "dev":[ 139 | ["ruen/dev/newstest2013-ref.ru.tok", 140 | "ruen/dev/newstest2013-src.en.tok",], 141 | 142 | ["ruen/dev/newstest2013-ref.ru.tok.bpe", 143 | "ruen/dev/newstest2013-src.en.tok.bpe",], 144 | ], 145 | 146 | "test1":[ 147 | ["ruen/test/newstest2014-ruen-ref.ru.tok", 148 | "ruen/test/newstest2014-ruen-src.en.tok",], 149 | 150 | ["ruen/test/newstest2014-ruen-ref.ru.tok.bpe", 151 | "ruen/test/newstest2014-ruen-src.en.tok.bpe",], 152 | ], 153 | 154 | "test2":[ 155 | ["ruen/test/newstest2015-ruen-ref.ru.tok", 156 | "ruen/test/newstest2015-ruen-src.en.tok",], 157 | 158 | ["ruen/test/newstest2015-ruen-ref.ru.tok.bpe", 159 | "ruen/test/newstest2015-ruen-src.en.tok.bpe",], 160 | ] 161 | } 162 | 163 | manyen = { 164 | "dic":[ 165 | ["char-source-for-dic.300.pkl", 166 | "char-target-for-dic.300.pkl"], 167 | 168 | ["bpe-source-for-dic.word.pkl"] 169 | ] 170 | } 171 | 172 | wmts = dict() 173 | wmts["de_en"] = deen 174 | wmts["cs_en"] = csen 175 | wmts["fi_en"] = fien 176 | wmts["ru_en"] = ruen 177 | wmts["many_en"] = manyen 178 | -------------------------------------------------------------------------------- /char2char/wmt_path_iso9.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # Paths to training / valid / test corpus & dictionaries 3 | # For the multilingual models (russian characters converted to latin using iso9) 4 | 5 | deen={ 6 | "dic": [ 7 | ["deen/train/all_de-en.de.tok.shuf.iso9.304.pkl", 8 | "dic/target.402.pkl"], 9 | 10 | ["deen/train/all_de-en.de.tok.shuf.iso9.bpe.24111.word.pkl"], 11 | ], 12 | 13 | "train": [ 14 | ["deen/train/all_de-en.de.tok.shuf.iso9", 15 | "deen/train/all_de-en.en.tok.shuf.iso9",], 16 | 17 | ["deen/train/all_de-en.de.tok.shuf.iso9.bpe.50000", 18 | "deen/train/all_de-en.de.tok.shuf.iso9.bpe.20000"], 19 | ], 20 | 21 | "dev": [ 22 | ["deen/dev/newstest2013.de.tok.iso9", 23 | "deen/dev/newstest2013.en.tok.iso9",], 24 | 25 | ["deen/dev/newstest2013.de.tok.iso9.bpe.50000", 26 | "deen/dev/newstest2013.de.tok.iso9.bpe.20000"], 27 | ], 28 | 29 | "test1" :[ 30 | ["deen/test/newstest2014-deen-ref.de.tok.iso9", 31 | "deen/test/newstest2014-deen-src.en.tok.iso9",], 32 | 33 | ["deen/test/newstest2014-deen-ref.de.tok.iso9.bpe.50000", 34 | "deen/test/newstest2014-deen-ref.de.tok.iso9.bpe.20000"], 35 | ], 36 | 37 | "test2":[ 38 | 
["deen/test/newstest2015-deen-src.de.tok.iso9", 39 | "deen/test/newstest2015-deen-src.en.tok.iso9",], 40 | 41 | ["deen/test/newstest2015-deen-src.de.tok.iso9.bpe.50000", 42 | "deen/test/newstest2015-deen-src.de.tok.iso9.bpe.20000"], 43 | ], 44 | } 45 | 46 | csen={ 47 | "dic": [ 48 | ["csen/train/all_cs-en.cs.tok.iso9.304.pkl", 49 | "dic/target.402.pkl"], 50 | 51 | ["csen/train/all_cs-en.cs.tok.iso9.bpe.21697.word.pkl"], 52 | ], 53 | 54 | "train":[ 55 | ["csen/train/all_cs-en.cs.tok.iso9", 56 | "csen/train/all_cs-en.en.tok.iso9",], 57 | 58 | ["csen/train/all_cs-en.cs.tok.iso9.bpe.50000", 59 | "csen/train/all_cs-en.cs.tok.iso9.bpe.20000"], 60 | ], 61 | 62 | "dev": [ 63 | ["csen/dev/newstest2013-ref.cs.tok.iso9", 64 | "csen/dev/newstest2013-src.en.tok.iso9",], 65 | 66 | ["csen/dev/newstest2013-ref.cs.tok.iso9.bpe.50000", 67 | "csen/dev/newstest2013-ref.cs.tok.iso9.bpe.20000"], 68 | ], 69 | 70 | "test1":[ 71 | ["csen/test/newstest2014-csen-ref.cs.tok.iso9", 72 | "csen/test/newstest2014-csen-src.en.tok.iso9",], 73 | 74 | ["csen/test/newstest2014-csen-ref.cs.tok.iso9.bpe.50000", 75 | "csen/test/newstest2014-csen-ref.cs.tok.iso9.bpe.20000"], 76 | ], 77 | 78 | "test2":[ 79 | ["csen/test/newstest2015-csen-ref.cs.tok.iso9", 80 | "csen/test/newstest2015-csen-src.en.tok.iso9",], 81 | 82 | ["csen/test/newstest2015-csen-ref.cs.tok.iso9.bpe.50000", 83 | "csen/test/newstest2015-csen-ref.cs.tok.iso9.bpe.20000"], 84 | ] 85 | } 86 | 87 | fien={ 88 | "dic": [ 89 | ["fien/train/all_fi-en.fi.tok.shuf.iso9.269.pkl", 90 | "dic/target.402.pkl"], 91 | 92 | ["fien/train/all_fi-en.fi.tok.shuf.iso9.bpe.20747.word.pkl"], 93 | ], 94 | 95 | "train":[ 96 | ["fien/train/all_fi-en.fi.tok.shuf.iso9", 97 | "fien/train/all_fi-en.en.tok.shuf.iso9",], 98 | 99 | ["fien/train/all_fi-en.fi.tok.shuf.iso9.bpe.50000", 100 | "fien/train/all_fi-en.fi.tok.shuf.iso9.bpe.20000"], 101 | ], 102 | 103 | "dev":[ 104 | ["fien/dev/newsdev2015-enfi-ref.fi.tok.iso9", 105 | "fien/dev/newsdev2015-enfi-src.en.tok.iso9",], 106 | 107 | ["fien/dev/newsdev2015-enfi-ref.fi.tok.iso9.bpe.50000", 108 | "fien/dev/newsdev2015-enfi-ref.fi.tok.iso9.bpe.20000"], 109 | ], 110 | 111 | "test1":[ 112 | ["fien/test/newstest2015-fien-ref.fi.tok.iso9", 113 | "fien/test/newstest2015-fien-src.en.tok.iso9",], 114 | 115 | ["fien/test/newstest2015-fien-ref.fi.tok.iso9.bpe.50000", 116 | "fien/test/newstest2015-fien-ref.fi.tok.iso9.bpe.20000"], 117 | ], 118 | } 119 | 120 | ruen={ 121 | "dic": [ 122 | ["ruen/train/all_ru-en.ru.tok.iso9.304.pkl", 123 | "dic/target.402.pkl"], 124 | 125 | ["ruen/train/all_ru-en.ru.tok.iso9.bpe.21995.word.pkl"], 126 | ], 127 | 128 | "train":[ 129 | ["ruen/train/all_ru-en.ru.tok.iso9", 130 | "ruen/train/all_ru-en.en.tok.iso9",], 131 | 132 | ["ruen/train/all_ru-en.ru.tok.iso9.bpe.50000", 133 | "ruen/train/all_ru-en.ru.tok.iso9.bpe.20000"], 134 | ], 135 | 136 | "dev":[ 137 | ["ruen/dev/newstest2013-ref.ru.tok.iso9", 138 | "ruen/dev/newstest2013-src.en.tok.iso9",], 139 | 140 | ["ruen/dev/newstest2013-ref.ru.tok.iso9.bpe.50000", 141 | "ruen/dev/newstest2013-ref.ru.tok.iso9.bpe.20000"], 142 | ], 143 | 144 | "test1":[ 145 | ["ruen/test/newstest2014-ruen-ref.ru.tok.iso9", 146 | "ruen/test/newstest2014-ruen-src.en.tok.iso9",], 147 | 148 | ["ruen/test/newstest2014-ruen-ref.ru.tok.iso9.bpe.50000", 149 | "ruen/test/newstest2014-ruen-ref.ru.tok.iso9.bpe.20000"], 150 | ], 151 | 152 | "test2":[ 153 | ["ruen/test/newstest2015-ruen-ref.ru.tok.iso9", 154 | "ruen/test/newstest2015-ruen-src.en.tok.iso9",], 155 | 156 | 
["ruen/test/newstest2015-ruen-ref.ru.tok.iso9.bpe.50000", 157 | "ruen/test/newstest2015-ruen-ref.ru.tok.iso9.bpe.20000"], 158 | ] 159 | } 160 | 161 | manyen = { 162 | "dic":[ 163 | ["dic/source.404.pkl", 164 | "dic/target.402.pkl"], 165 | 166 | ["dic/bpe-source-for-dic.word.pkl"] 167 | ] 168 | } 169 | 170 | wmts = dict() 171 | wmts["de_en"] = deen 172 | wmts["cs_en"] = csen 173 | wmts["fi_en"] = fien 174 | wmts["ru_en"] = ruen 175 | wmts["many_en"] = manyen 176 | -------------------------------------------------------------------------------- /preprocess/build_dictionary_char.py: -------------------------------------------------------------------------------- 1 | import cPickle as pkl 2 | import fileinput 3 | import numpy 4 | import sys 5 | import codecs 6 | 7 | from collections import OrderedDict 8 | 9 | def main(filename, short_list, src): 10 | # Build character dictionaries 11 | print 'Processing', filename 12 | word_freqs = OrderedDict() 13 | 14 | with open(filename, 'r') as f: 15 | 16 | for number, line in enumerate(f): 17 | 18 | if number % 20000 == 0: 19 | print 'line', number 20 | 21 | words_in = line.strip() 22 | words_in = list(words_in.decode('utf8')) 23 | 24 | for w in words_in: 25 | if w not in word_freqs: 26 | word_freqs[w] = 0 27 | word_freqs[w] += 1 28 | 29 | print 'count finished' 30 | 31 | words = word_freqs.keys() 32 | freqs = word_freqs.values() 33 | 34 | sorted_idx = numpy.argsort(freqs) 35 | sorted_words = [words[ii] for ii in sorted_idx[::-1]] 36 | 37 | worddict = OrderedDict() 38 | if src: 39 | # 0 -> ZERO 40 | # 1 -> UNK 41 | # 2 -> SOS 42 | # 3 -> EOS 43 | tokens = "ZERO UNK SOS EOS".split() 44 | else: 45 | tokens = "EOS UNK".split() 46 | print tokens 47 | 48 | for ii, aa in enumerate(tokens): 49 | worddict[aa] = ii 50 | print worddict 51 | 52 | if short_list is not None: 53 | for ii in xrange(min(short_list, len(sorted_words))): 54 | worddict[sorted_words[ii]] = ii + len(tokens) 55 | # NOTE : sorted_words 56 | print 'dict finished' 57 | 58 | else: 59 | for ii, ww in enumerate(sorted_words): 60 | worddict[ww] = ii + len(tokens) 61 | 62 | print 'start dump' 63 | with open('%s.%d.pkl' % (filename, short_list+len(tokens)), 'wb') as f: 64 | pkl.dump(worddict, f) 65 | 66 | f.close() 67 | print 'Done' 68 | print len(worddict) 69 | -------------------------------------------------------------------------------- /preprocess/build_dictionary_word.py: -------------------------------------------------------------------------------- 1 | import cPickle as pkl 2 | import fileinput 3 | import numpy 4 | import sys 5 | import codecs 6 | 7 | from collections import OrderedDict 8 | 9 | def main(): 10 | for filename in sys.argv[1:]: 11 | print 'Processing', filename 12 | word_freqs = OrderedDict() 13 | 14 | with open(filename, 'r') as f: 15 | 16 | for number, line in enumerate(f): 17 | 18 | if number % 20000 == 0: 19 | print 'line', number 20 | 21 | words_in = line.strip().split(' ') 22 | for w in words_in: 23 | if w not in word_freqs: 24 | word_freqs[w] = 0 25 | word_freqs[w] += 1 26 | 27 | words = word_freqs.keys() 28 | freqs = word_freqs.values() 29 | 30 | sorted_idx = numpy.argsort(freqs) 31 | sorted_words = [words[ii] for ii in sorted_idx[::-1]] 32 | 33 | worddict = OrderedDict() 34 | worddict['eos'] = 0 35 | worddict['UNK'] = 1 36 | 37 | for ii, ww in enumerate(sorted_words): 38 | worddict[ww] = ii + 2 39 | 40 | with open('%s.word.pkl' % filename, 'wb') as f: 41 | pkl.dump(worddict, f) 42 | 43 | f.close() 44 | print 'Done' 45 | print len(worddict) 46 | 47 | if __name__ == 
'__main__': 48 | main() 49 | -------------------------------------------------------------------------------- /preprocess/clean_tags.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import re 3 | 4 | from_file = sys.argv[1] 5 | to_file = sys.argv[2] 6 | to_file_out = open(to_file, "w") 7 | 8 | regex = "<.*>" 9 | 10 | tag_match = re.compile(regex) 11 | matched_lines = [] 12 | 13 | with open(from_file) as from_file: 14 | content = from_file.readlines() 15 | for line in content: 16 | if (tag_match.match(line)): 17 | pass 18 | else: 19 | matched_lines.append(line) 20 | 21 | matched_lines = "".join(matched_lines) 22 | to_file_out.write(matched_lines) 23 | to_file_out.close() 24 | 25 | -------------------------------------------------------------------------------- /preprocess/fix_appo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # {1} is the directory name 3 | 4 | 5 | for f in ${1}/*.xml 6 | do 7 | cat $f | grep "" | sed "s/’/'/g" | sed "s/“/\"/g" | sed "s/”/\"/g" > ${f}.fixed 8 | done 9 | 10 | -------------------------------------------------------------------------------- /preprocess/iso.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | from collections import OrderedDict 4 | import re 5 | import sys 6 | reload(sys) 7 | sys.setdefaultencoding("utf-8") 8 | 9 | f = open("preprocess/iso9", 'rb') 10 | lines = [line for line in f] 11 | bigru = lines[::4] 12 | smallru = lines[1::4] 13 | bigen = lines[2::4] 14 | smallen = lines[3::4] 15 | iso = OrderedDict() 16 | 17 | for br, sr, be, se in zip(bigru, smallru, bigen, smallen): 18 | iso[br.replace("\n", "")] = be.replace("\n", "") 19 | iso[sr.replace("\n", "")] = se.replace("\n", "") 20 | 21 | def rep(a): 22 | #aa = a.decode('utf-8') 23 | aa = a 24 | for k,v in iso.iteritems(): 25 | aa = aa.replace(k,v) 26 | #aa = aa.replace(k.decode('utf-8'),v.decode('utf-8')) 27 | #return aa.encode('utf-8') 28 | return aa 29 | 30 | if __name__ == '__main__': 31 | filename = sys.argv[1] 32 | rr = open(filename, 'rb') 33 | txt = rr.read() 34 | txt = rep(txt) 35 | ww = open(filename+".iso9", "w") 36 | ww.write(txt) 37 | rr.close() 38 | ww.close() 39 | -------------------------------------------------------------------------------- /preprocess/iso9: -------------------------------------------------------------------------------- 1 | А 2 | а 3 | A 4 | a 5 | Б 6 | б 7 | B 8 | b 9 | В 10 | в 11 | V 12 | v 13 | Г 14 | г 15 | G 16 | g 17 | Д 18 | д 19 | D 20 | d 21 | Е 22 | е 23 | E 24 | e 25 | Ё 26 | ё 27 | Ë 28 | ë 29 | Ж 30 | ж 31 | Ž 32 | ž 33 | З 34 | з 35 | Z 36 | z 37 | И 38 | и 39 | I 40 | i 41 | Й 42 | й 43 | J 44 | j 45 | К 46 | к 47 | K 48 | k 49 | Л 50 | л 51 | L 52 | l 53 | М 54 | м 55 | M 56 | m 57 | Н 58 | н 59 | N 60 | n 61 | О 62 | о 63 | O 64 | o 65 | П 66 | п 67 | P 68 | p 69 | Р 70 | р 71 | R 72 | r 73 | С 74 | с 75 | S 76 | s 77 | Т 78 | т 79 | T 80 | t 81 | У 82 | у 83 | U 84 | u 85 | Ф 86 | ф 87 | F 88 | f 89 | Х 90 | х 91 | H 92 | h 93 | Ц 94 | ц 95 | C 96 | c 97 | Ч 98 | ч 99 | Č 100 | č 101 | Ш 102 | ш 103 | Š 104 | š 105 | Щ 106 | щ 107 | Ŝ 108 | ŝ 109 | Ъ 110 | ъ 111 | ʺ 112 | ʺ 113 | Ы 114 | ы 115 | Y 116 | y 117 | Ь 118 | ь 119 | ʹ 120 | ʹ 121 | Э 122 | э 123 | È 124 | è 125 | Ю 126 | ю 127 | Û 128 | û 129 | Я 130 | я 131 | Â 132 | â 133 | -------------------------------------------------------------------------------- /preprocess/merge.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | SRC=$1 5 | TRG=$2 6 | 7 | FSRC=all_${1}-${2}.${1} 8 | FTRG=all_${1}-${2}.${2} 9 | 10 | echo "" > $FSRC 11 | for F in *${1}-${2}.${1} 12 | do 13 | if [ "$F" = "$FSRC" ]; then 14 | echo "pass" 15 | else 16 | cat $F >> $FSRC 17 | fi 18 | done 19 | 20 | 21 | echo "" > $FTRG 22 | for F in *${1}-${2}.${2} 23 | do 24 | if [ "$F" = "$FTRG" ]; then 25 | echo "pass" 26 | else 27 | cat $F >> $FTRG 28 | fi 29 | done 30 | -------------------------------------------------------------------------------- /preprocess/multi-bleu.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 5 | 6 | # $Id$ 7 | use warnings; 8 | use strict; 9 | 10 | my $lowercase = 0; 11 | if ($ARGV[0] eq "-lc") { 12 | $lowercase = 1; 13 | shift; 14 | } 15 | 16 | my $stem = $ARGV[0]; 17 | if (!defined $stem) { 18 | print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n"; 19 | print STDERR "Reads the references from reference or reference0, reference1, ...\n"; 20 | exit(1); 21 | } 22 | 23 | $stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0"; 24 | 25 | my @REF; 26 | my $ref=0; 27 | while(-e "$stem$ref") { 28 | &add_to_ref("$stem$ref",\@REF); 29 | $ref++; 30 | } 31 | &add_to_ref($stem,\@REF) if -e $stem; 32 | die("ERROR: could not find reference file $stem") unless scalar @REF; 33 | 34 | sub add_to_ref { 35 | my ($file,$REF) = @_; 36 | my $s=0; 37 | open(REF,$file) or die "Can't read $file"; 38 | while() { 39 | chop; 40 | push @{$$REF[$s++]}, $_; 41 | } 42 | close(REF); 43 | } 44 | 45 | my(@CORRECT,@TOTAL,$length_translation,$length_reference); 46 | my $s=0; 47 | while() { 48 | chop; 49 | $_ = lc if $lowercase; 50 | my @WORD = split; 51 | my %REF_NGRAM = (); 52 | my $length_translation_this_sentence = scalar(@WORD); 53 | my ($closest_diff,$closest_length) = (9999,9999); 54 | foreach my $reference (@{$REF[$s]}) { 55 | # print "$s $_ <=> $reference\n"; 56 | $reference = lc($reference) if $lowercase; 57 | my @WORD = split(' ',$reference); 58 | my $length = scalar(@WORD); 59 | my $diff = abs($length_translation_this_sentence-$length); 60 | if ($diff < $closest_diff) { 61 | $closest_diff = $diff; 62 | $closest_length = $length; 63 | # print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n"; 64 | } elsif ($diff == $closest_diff) { 65 | $closest_length = $length if $length < $closest_length; 66 | # from two references with the same closeness to me 67 | # take the *shorter* into account, not the "first" one. 68 | } 69 | for(my $n=1;$n<=4;$n++) { 70 | my %REF_NGRAM_N = (); 71 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 72 | my $ngram = "$n"; 73 | for(my $w=0;$w<$n;$w++) { 74 | $ngram .= " ".$WORD[$start+$w]; 75 | } 76 | $REF_NGRAM_N{$ngram}++; 77 | } 78 | foreach my $ngram (keys %REF_NGRAM_N) { 79 | if (!defined($REF_NGRAM{$ngram}) || 80 | $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) { 81 | $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram}; 82 | # print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}
\n"; 83 | } 84 | } 85 | } 86 | } 87 | $length_translation += $length_translation_this_sentence; 88 | $length_reference += $closest_length; 89 | for(my $n=1;$n<=4;$n++) { 90 | my %T_NGRAM = (); 91 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 92 | my $ngram = "$n"; 93 | for(my $w=0;$w<$n;$w++) { 94 | $ngram .= " ".$WORD[$start+$w]; 95 | } 96 | $T_NGRAM{$ngram}++; 97 | } 98 | foreach my $ngram (keys %T_NGRAM) { 99 | $ngram =~ /^(\d+) /; 100 | my $n = $1; 101 | # my $corr = 0; 102 | # print "$i e $ngram $T_NGRAM{$ngram}
\n"; 103 | $TOTAL[$n] += $T_NGRAM{$ngram}; 104 | if (defined($REF_NGRAM{$ngram})) { 105 | if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) { 106 | $CORRECT[$n] += $T_NGRAM{$ngram}; 107 | # $corr = $T_NGRAM{$ngram}; 108 | # print "$i e correct1 $T_NGRAM{$ngram}
\n"; 109 | } 110 | else { 111 | $CORRECT[$n] += $REF_NGRAM{$ngram}; 112 | # $corr = $REF_NGRAM{$ngram}; 113 | # print "$i e correct2 $REF_NGRAM{$ngram}
\n"; 114 | } 115 | } 116 | # $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram}; 117 | # print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n" 118 | } 119 | } 120 | $s++; 121 | } 122 | my $brevity_penalty = 1; 123 | my $bleu = 0; 124 | 125 | my @bleu=(); 126 | 127 | for(my $n=1;$n<=4;$n++) { 128 | if (defined ($TOTAL[$n]) && defined ($CORRECT[$n]) && $TOTAL[$n] > 0){ 129 | $bleu[$n]=($TOTAL[$n]>0)?$CORRECT[$n]/$TOTAL[$n]:0; 130 | # print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n"; 131 | }else{ 132 | $bleu[$n]=0; 133 | } 134 | } 135 | 136 | if ($length_reference==0){ 137 | printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n"; 138 | exit(1); 139 | } 140 | 141 | if ($length_translation<$length_reference) { 142 | $brevity_penalty = exp(1-$length_reference/$length_translation); 143 | } 144 | $bleu = $brevity_penalty * exp((my_log( $bleu[1] ) + 145 | my_log( $bleu[2] ) + 146 | my_log( $bleu[3] ) + 147 | my_log( $bleu[4] ) ) / 4) ; 148 | printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n", 149 | 100*$bleu, 150 | 100*$bleu[1], 151 | 100*$bleu[2], 152 | 100*$bleu[3], 153 | 100*$bleu[4], 154 | $brevity_penalty, 155 | $length_translation / $length_reference, 156 | $length_translation, 157 | $length_reference; 158 | 159 | sub my_log { 160 | return -9999999999 unless $_[0]; 161 | return log($_[0]); 162 | } 163 | -------------------------------------------------------------------------------- /preprocess/nonbreaking_prefixes/README.txt: -------------------------------------------------------------------------------- 1 | The language suffix can be found here: 2 | 3 | http://www.loc.gov/standards/iso639-2/php/code_list.php 4 | 5 | This code includes data from Daniel Naber's Language Tools (czech abbreviations). 6 | This code includes data from czech wiktionary (also czech abbreviations). 7 | 8 | 9 | -------------------------------------------------------------------------------- /preprocess/nonbreaking_prefixes/nonbreaking_prefix.ca: -------------------------------------------------------------------------------- 1 | Dr 2 | Dra 3 | pàg 4 | p 5 | c 6 | av 7 | Sr 8 | Sra 9 | adm 10 | esq 11 | Prof 12 | S.A 13 | S.L 14 | p.e 15 | ptes 16 | Sta 17 | St 18 | pl 19 | màx 20 | cast 21 | dir 22 | nre 23 | fra 24 | admdora 25 | Emm 26 | Excma 27 | espf 28 | dc 29 | admdor 30 | tel 31 | angl 32 | aprox 33 | ca 34 | dept 35 | dj 36 | dl 37 | dt 38 | ds 39 | dg 40 | dv 41 | ed 42 | entl 43 | al 44 | i.e 45 | maj 46 | smin 47 | n 48 | núm 49 | pta 50 | A 51 | B 52 | C 53 | D 54 | E 55 | F 56 | G 57 | H 58 | I 59 | J 60 | K 61 | L 62 | M 63 | N 64 | O 65 | P 66 | Q 67 | R 68 | S 69 | T 70 | U 71 | V 72 | W 73 | X 74 | Y 75 | Z 76 | -------------------------------------------------------------------------------- /preprocess/nonbreaking_prefixes/nonbreaking_prefix.cs: -------------------------------------------------------------------------------- 1 | Bc 2 | BcA 3 | Ing 4 | Ing.arch 5 | MUDr 6 | MVDr 7 | MgA 8 | Mgr 9 | JUDr 10 | PhDr 11 | RNDr 12 | PharmDr 13 | ThLic 14 | ThDr 15 | Ph.D 16 | Th.D 17 | prof 18 | doc 19 | CSc 20 | DrSc 21 | dr. h. 
c 22 | PaedDr 23 | Dr 24 | PhMr 25 | DiS 26 | abt 27 | ad 28 | a.i 29 | aj 30 | angl 31 | anon 32 | apod 33 | atd 34 | atp 35 | aut 36 | bd 37 | biogr 38 | b.m 39 | b.p 40 | b.r 41 | cca 42 | cit 43 | cizojaz 44 | c.k 45 | col 46 | čes 47 | čín 48 | čj 49 | ed 50 | facs 51 | fasc 52 | fol 53 | fot 54 | franc 55 | h.c 56 | hist 57 | hl 58 | hrsg 59 | ibid 60 | il 61 | ind 62 | inv.č 63 | jap 64 | jhdt 65 | jv 66 | koed 67 | kol 68 | korej 69 | kl 70 | krit 71 | lat 72 | lit 73 | m.a 74 | maď 75 | mj 76 | mp 77 | násl 78 | např 79 | nepubl 80 | něm 81 | no 82 | nr 83 | n.s 84 | okr 85 | odd 86 | odp 87 | obr 88 | opr 89 | orig 90 | phil 91 | pl 92 | pokrač 93 | pol 94 | port 95 | pozn 96 | př.kr 97 | př.n.l 98 | přel 99 | přeprac 100 | příl 101 | pseud 102 | pt 103 | red 104 | repr 105 | resp 106 | revid 107 | rkp 108 | roč 109 | roz 110 | rozš 111 | samost 112 | sect 113 | sest 114 | seš 115 | sign 116 | sl 117 | srv 118 | stol 119 | sv 120 | šk 121 | šk.ro 122 | špan 123 | tab 124 | t.č 125 | tis 126 | tj 127 | tř 128 | tzv 129 | univ 130 | uspoř 131 | vol 132 | vl.jm 133 | vs 134 | vyd 135 | vyobr 136 | zal 137 | zejm 138 | zkr 139 | zprac 140 | zvl 141 | n.p 142 | např 143 | než 144 | MUDr 145 | abl 146 | absol 147 | adj 148 | adv 149 | ak 150 | ak. sl 151 | akt 152 | alch 153 | amer 154 | anat 155 | angl 156 | anglosas 157 | arab 158 | arch 159 | archit 160 | arg 161 | astr 162 | astrol 163 | att 164 | bás 165 | belg 166 | bibl 167 | biol 168 | boh 169 | bot 170 | bulh 171 | círk 172 | csl 173 | č 174 | čas 175 | čes 176 | dat 177 | děj 178 | dep 179 | dět 180 | dial 181 | dór 182 | dopr 183 | dosl 184 | ekon 185 | epic 186 | etnonym 187 | eufem 188 | f 189 | fam 190 | fem 191 | fil 192 | film 193 | form 194 | fot 195 | fr 196 | fut 197 | fyz 198 | gen 199 | geogr 200 | geol 201 | geom 202 | germ 203 | gram 204 | hebr 205 | herald 206 | hist 207 | hl 208 | hovor 209 | hud 210 | hut 211 | chcsl 212 | chem 213 | ie 214 | imp 215 | impf 216 | ind 217 | indoevr 218 | inf 219 | instr 220 | interj 221 | ión 222 | iron 223 | it 224 | kanad 225 | katalán 226 | klas 227 | kniž 228 | komp 229 | konj 230 | 231 | konkr 232 | kř 233 | kuch 234 | lat 235 | lék 236 | les 237 | lid 238 | lit 239 | liturg 240 | lok 241 | log 242 | m 243 | mat 244 | meteor 245 | metr 246 | mod 247 | ms 248 | mysl 249 | n 250 | náb 251 | námoř 252 | neklas 253 | něm 254 | nesklon 255 | nom 256 | ob 257 | obch 258 | obyč 259 | ojed 260 | opt 261 | part 262 | pas 263 | pejor 264 | pers 265 | pf 266 | pl 267 | plpf 268 | 269 | práv 270 | prep 271 | předl 272 | přivl 273 | r 274 | rcsl 275 | refl 276 | reg 277 | rkp 278 | ř 279 | řec 280 | s 281 | samohl 282 | sg 283 | sl 284 | souhl 285 | spec 286 | srov 287 | stfr 288 | střv 289 | stsl 290 | subj 291 | subst 292 | superl 293 | sv 294 | sz 295 | táz 296 | tech 297 | telev 298 | teol 299 | trans 300 | typogr 301 | var 302 | vedl 303 | verb 304 | vl. jm 305 | voj 306 | vok 307 | vůb 308 | vulg 309 | výtv 310 | vztaž 311 | zahr 312 | zájm 313 | zast 314 | zejm 315 | 316 | zeměd 317 | zkr 318 | zř 319 | mj 320 | dl 321 | atp 322 | sport 323 | Mgr 324 | horn 325 | MVDr 326 | JUDr 327 | RSDr 328 | Bc 329 | PhDr 330 | ThDr 331 | Ing 332 | aj 333 | apod 334 | PharmDr 335 | pomn 336 | ev 337 | slang 338 | nprap 339 | odp 340 | dop 341 | pol 342 | st 343 | stol 344 | p. n. l 345 | před n. l 346 | n. l 347 | př. Kr 348 | po Kr 349 | př. n. l 350 | odd 351 | RNDr 352 | tzv 353 | atd 354 | tzn 355 | resp 356 | tj 357 | p 358 | br 359 | č. j 360 | čj 361 | č. p 362 | čp 363 | a. 
s 364 | s. r. o 365 | spol. s r. o 366 | p. o 367 | s. p 368 | v. o. s 369 | k. s 370 | o. p. s 371 | o. s 372 | v. r 373 | v z 374 | ml 375 | vč 376 | kr 377 | mld 378 | hod 379 | popř 380 | ap 381 | event 382 | rus 383 | slov 384 | rum 385 | švýc 386 | P. T 387 | zvl 388 | hor 389 | dol 390 | S.O.S -------------------------------------------------------------------------------- /preprocess/nonbreaking_prefixes/nonbreaking_prefix.de: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | #no german words end in single lower-case letters, so we throw those in too. 7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | 61 | #Roman Numerals. A dot after one of these is not a sentence break in German. 62 | I 63 | II 64 | III 65 | IV 66 | V 67 | VI 68 | VII 69 | VIII 70 | IX 71 | X 72 | XI 73 | XII 74 | XIII 75 | XIV 76 | XV 77 | XVI 78 | XVII 79 | XVIII 80 | XIX 81 | XX 82 | i 83 | ii 84 | iii 85 | iv 86 | v 87 | vi 88 | vii 89 | viii 90 | ix 91 | x 92 | xi 93 | xii 94 | xiii 95 | xiv 96 | xv 97 | xvi 98 | xvii 99 | xviii 100 | xix 101 | xx 102 | 103 | #Titles and Honorifics 104 | Adj 105 | Adm 106 | Adv 107 | Asst 108 | Bart 109 | Bldg 110 | Brig 111 | Bros 112 | Capt 113 | Cmdr 114 | Col 115 | Comdr 116 | Con 117 | Corp 118 | Cpl 119 | DR 120 | Dr 121 | Ens 122 | Gen 123 | Gov 124 | Hon 125 | Hosp 126 | Insp 127 | Lt 128 | MM 129 | MR 130 | MRS 131 | MS 132 | Maj 133 | Messrs 134 | Mlle 135 | Mme 136 | Mr 137 | Mrs 138 | Ms 139 | Msgr 140 | Op 141 | Ord 142 | Pfc 143 | Ph 144 | Prof 145 | Pvt 146 | Rep 147 | Reps 148 | Res 149 | Rev 150 | Rt 151 | Sen 152 | Sens 153 | Sfc 154 | Sgt 155 | Sr 156 | St 157 | Supt 158 | Surg 159 | 160 | #Misc symbols 161 | Mio 162 | Mrd 163 | bzw 164 | v 165 | vs 166 | usw 167 | d.h 168 | z.B 169 | u.a 170 | etc 171 | Mrd 172 | MwSt 173 | ggf 174 | d.J 175 | D.h 176 | m.E 177 | vgl 178 | I.F 179 | z.T 180 | sogen 181 | ff 182 | u.E 183 | g.U 184 | g.g.A 185 | c.-à-d 186 | Buchst 187 | u.s.w 188 | sog 189 | u.ä 190 | Std 191 | evtl 192 | Zt 193 | Chr 194 | u.U 195 | o.ä 196 | Ltd 197 | b.A 198 | z.Zt 199 | spp 200 | sen 201 | SA 202 | k.o 203 | jun 204 | i.H.v 205 | dgl 206 | dergl 207 | Co 208 | zzt 209 | usf 210 | s.p.a 211 | Dkr 212 | Corp 213 | bzgl 214 | BSE 215 | 216 | #Number indicators 217 | # add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it 218 | No 219 | Nos 220 | Art 221 | Nr 222 | pp 223 | ca 224 | Ca 225 | 226 | #Ordinals are done with . in German - "1." 
= "1st" in English 227 | 1 228 | 2 229 | 3 230 | 4 231 | 5 232 | 6 233 | 7 234 | 8 235 | 9 236 | 10 237 | 11 238 | 12 239 | 13 240 | 14 241 | 15 242 | 16 243 | 17 244 | 18 245 | 19 246 | 20 247 | 21 248 | 22 249 | 23 250 | 24 251 | 25 252 | 26 253 | 27 254 | 28 255 | 29 256 | 30 257 | 31 258 | 32 259 | 33 260 | 34 261 | 35 262 | 36 263 | 37 264 | 38 265 | 39 266 | 40 267 | 41 268 | 42 269 | 43 270 | 44 271 | 45 272 | 46 273 | 47 274 | 48 275 | 49 276 | 50 277 | 51 278 | 52 279 | 53 280 | 54 281 | 55 282 | 56 283 | 57 284 | 58 285 | 59 286 | 60 287 | 61 288 | 62 289 | 63 290 | 64 291 | 65 292 | 66 293 | 67 294 | 68 295 | 69 296 | 70 297 | 71 298 | 72 299 | 73 300 | 74 301 | 75 302 | 76 303 | 77 304 | 78 305 | 79 306 | 80 307 | 81 308 | 82 309 | 83 310 | 84 311 | 85 312 | 86 313 | 87 314 | 88 315 | 89 316 | 90 317 | 91 318 | 92 319 | 93 320 | 94 321 | 95 322 | 96 323 | 97 324 | 98 325 | 99 326 | -------------------------------------------------------------------------------- /preprocess/nonbreaking_prefixes/nonbreaking_prefix.en: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 34 | Adj 35 | Adm 36 | Adv 37 | Asst 38 | Bart 39 | Bldg 40 | Brig 41 | Bros 42 | Capt 43 | Cmdr 44 | Col 45 | Comdr 46 | Con 47 | Corp 48 | Cpl 49 | DR 50 | Dr 51 | Drs 52 | Ens 53 | Gen 54 | Gov 55 | Hon 56 | Hr 57 | Hosp 58 | Insp 59 | Lt 60 | MM 61 | MR 62 | MRS 63 | MS 64 | Maj 65 | Messrs 66 | Mlle 67 | Mme 68 | Mr 69 | Mrs 70 | Ms 71 | Msgr 72 | Op 73 | Ord 74 | Pfc 75 | Ph 76 | Prof 77 | Pvt 78 | Rep 79 | Reps 80 | Res 81 | Rev 82 | Rt 83 | Sen 84 | Sens 85 | Sfc 86 | Sgt 87 | Sr 88 | St 89 | Supt 90 | Surg 91 | 92 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 93 | v 94 | vs 95 | i.e 96 | rev 97 | e.g 98 | 99 | #Numbers only. These should only induce breaks when followed by a numeric sequence 100 | # add NUMERIC_ONLY after the word for this function 101 | #This case is mostly for the english "No." which can either be a sentence of its own, or 102 | #if followed by a number, a non-breaking prefix 103 | No #NUMERIC_ONLY# 104 | Nos 105 | Art #NUMERIC_ONLY# 106 | Nr 107 | pp #NUMERIC_ONLY# 108 | 109 | #month abbreviations 110 | Jan 111 | Feb 112 | Mar 113 | Apr 114 | #May is a full word 115 | Jun 116 | Jul 117 | Aug 118 | Sep 119 | Oct 120 | Nov 121 | Dec 122 | -------------------------------------------------------------------------------- /preprocess/nonbreaking_prefixes/nonbreaking_prefix.es: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 
3 | 4 | #any single upper case letter followed by a period is not a sentence ender 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | # Period-final abbreviation list from http://www.ctspanish.com/words/abbreviations.htm 34 | 35 | A.C 36 | Apdo 37 | Av 38 | Bco 39 | CC.AA 40 | Da 41 | Dep 42 | Dn 43 | Dr 44 | Dra 45 | EE.UU 46 | Excmo 47 | FF.CC 48 | Fil 49 | Gral 50 | J.C 51 | Let 52 | Lic 53 | N.B 54 | P.D 55 | P.V.P 56 | Prof 57 | Pts 58 | Rte 59 | S.A 60 | S.A.R 61 | S.E 62 | S.L 63 | S.R.C 64 | Sr 65 | Sra 66 | Srta 67 | Sta 68 | Sto 69 | T.V.E 70 | Tel 71 | Ud 72 | Uds 73 | V.B 74 | V.E 75 | Vd 76 | Vds 77 | a/c 78 | adj 79 | admón 80 | afmo 81 | apdo 82 | av 83 | c 84 | c.f 85 | c.g 86 | cap 87 | cm 88 | cta 89 | dcha 90 | doc 91 | ej 92 | entlo 93 | esq 94 | etc 95 | f.c 96 | gr 97 | grs 98 | izq 99 | kg 100 | km 101 | mg 102 | mm 103 | núm 104 | núm 105 | p 106 | p.a 107 | p.ej 108 | ptas 109 | pág 110 | págs 111 | pág 112 | págs 113 | q.e.g.e 114 | q.e.s.m 115 | s 116 | s.s.s 117 | vid 118 | vol 119 | -------------------------------------------------------------------------------- /preprocess/nonbreaking_prefixes/nonbreaking_prefix.fi: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT 2 | #indicate an end-of-sentence marker. Special cases are included for prefixes 3 | #that ONLY appear before 0-9 numbers. 4 | 5 | #This list is compiled from omorfi database 6 | #by Tommi A Pirinen. 7 | 8 | 9 | #any single upper case letter followed by a period is not a sentence ender 10 | A 11 | B 12 | C 13 | D 14 | E 15 | F 16 | G 17 | H 18 | I 19 | J 20 | K 21 | L 22 | M 23 | N 24 | O 25 | P 26 | Q 27 | R 28 | S 29 | T 30 | U 31 | V 32 | W 33 | X 34 | Y 35 | Z 36 | Å 37 | Ä 38 | Ö 39 | 40 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 41 | alik 42 | alil 43 | amir 44 | apul 45 | apul.prof 46 | arkkit 47 | ass 48 | assist 49 | dipl 50 | dipl.arkkit 51 | dipl.ekon 52 | dipl.ins 53 | dipl.kielenk 54 | dipl.kirjeenv 55 | dipl.kosm 56 | dipl.urk 57 | dos 58 | erikoiseläinl 59 | erikoishammasl 60 | erikoisl 61 | erikoist 62 | ev.luutn 63 | evp 64 | fil 65 | ft 66 | hallinton 67 | hallintot 68 | hammaslääket 69 | jatk 70 | jääk 71 | kansaned 72 | kapt 73 | kapt.luutn 74 | kenr 75 | kenr.luutn 76 | kenr.maj 77 | kers 78 | kirjeenv 79 | kom 80 | kom.kapt 81 | komm 82 | konst 83 | korpr 84 | luutn 85 | maist 86 | maj 87 | Mr 88 | Mrs 89 | Ms 90 | M.Sc 91 | neuv 92 | nimim 93 | Ph.D 94 | prof 95 | puh.joht 96 | pääll 97 | res 98 | san 99 | siht 100 | suom 101 | sähköp 102 | säv 103 | toht 104 | toim 105 | toim.apul 106 | toim.joht 107 | toim.siht 108 | tuom 109 | ups 110 | vänr 111 | vääp 112 | ye.ups 113 | ylik 114 | ylil 115 | ylim 116 | ylimatr 117 | yliop 118 | yliopp 119 | ylip 120 | yliv 121 | 122 | #misc - odd period-ending items that NEVER indicate breaks (p.m. 
does NOT fall 123 | #into this category - it sometimes ends a sentence) 124 | e.g 125 | ent 126 | esim 127 | huom 128 | i.e 129 | ilm 130 | l 131 | mm 132 | myöh 133 | nk 134 | nyk 135 | par 136 | po 137 | t 138 | v 139 | -------------------------------------------------------------------------------- /preprocess/nonbreaking_prefixes/nonbreaking_prefix.fr: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | # 4 | #any single upper case letter followed by a period is not a sentence ender 5 | #usually upper case letters are initials in a name 6 | #no French words end in single lower-case letters, so we throw those in too? 7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | # Period-final abbreviation list for French 61 | A.C.N 62 | A.M 63 | art 64 | ann 65 | apr 66 | av 67 | auj 68 | lib 69 | B.P 70 | boul 71 | ca 72 | c.-à-d 73 | cf 74 | ch.-l 75 | chap 76 | contr 77 | C.P.I 78 | C.Q.F.D 79 | C.N 80 | C.N.S 81 | C.S 82 | dir 83 | éd 84 | e.g 85 | env 86 | al 87 | etc 88 | E.V 89 | ex 90 | fasc 91 | fém 92 | fig 93 | fr 94 | hab 95 | ibid 96 | id 97 | i.e 98 | inf 99 | LL.AA 100 | LL.AA.II 101 | LL.AA.RR 102 | LL.AA.SS 103 | L.D 104 | LL.EE 105 | LL.MM 106 | LL.MM.II.RR 107 | loc.cit 108 | masc 109 | MM 110 | ms 111 | N.B 112 | N.D.A 113 | N.D.L.R 114 | N.D.T 115 | n/réf 116 | NN.SS 117 | N.S 118 | N.D 119 | N.P.A.I 120 | p.c.c 121 | pl 122 | pp 123 | p.ex 124 | p.j 125 | P.S 126 | R.A.S 127 | R.-V 128 | R.P 129 | R.I.P 130 | SS 131 | S.S 132 | S.A 133 | S.A.I 134 | S.A.R 135 | S.A.S 136 | S.E 137 | sec 138 | sect 139 | sing 140 | S.M 141 | S.M.I.R 142 | sq 143 | sqq 144 | suiv 145 | sup 146 | suppl 147 | tél 148 | T.S.V.P 149 | vb 150 | vol 151 | vs 152 | X.O 153 | Z.I 154 | -------------------------------------------------------------------------------- /preprocess/nonbreaking_prefixes/nonbreaking_prefix.hu: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | Á 33 | É 34 | Í 35 | Ó 36 | Ö 37 | Ő 38 | Ú 39 | Ü 40 | Ű 41 | 42 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 43 | Dr 44 | dr 45 | kb 46 | Kb 47 | vö 48 | Vö 49 | pl 50 | Pl 51 | ca 52 | Ca 53 | min 54 | Min 55 | max 56 | Max 57 | ún 58 | Ún 59 | prof 60 | Prof 61 | de 62 | De 63 | du 64 | Du 65 | Szt 66 | St 67 | 68 | #Numbers only. 
These should only induce breaks when followed by a numeric sequence 69 | # add NUMERIC_ONLY after the word for this function 70 | #This case is mostly for the english "No." which can either be a sentence of its own, or 71 | #if followed by a number, a non-breaking prefix 72 | 73 | # Month name abbreviations 74 | jan #NUMERIC_ONLY# 75 | Jan #NUMERIC_ONLY# 76 | Feb #NUMERIC_ONLY# 77 | feb #NUMERIC_ONLY# 78 | márc #NUMERIC_ONLY# 79 | Márc #NUMERIC_ONLY# 80 | ápr #NUMERIC_ONLY# 81 | Ápr #NUMERIC_ONLY# 82 | máj #NUMERIC_ONLY# 83 | Máj #NUMERIC_ONLY# 84 | jún #NUMERIC_ONLY# 85 | Jún #NUMERIC_ONLY# 86 | Júl #NUMERIC_ONLY# 87 | júl #NUMERIC_ONLY# 88 | aug #NUMERIC_ONLY# 89 | Aug #NUMERIC_ONLY# 90 | Szept #NUMERIC_ONLY# 91 | szept #NUMERIC_ONLY# 92 | okt #NUMERIC_ONLY# 93 | Okt #NUMERIC_ONLY# 94 | nov #NUMERIC_ONLY# 95 | Nov #NUMERIC_ONLY# 96 | dec #NUMERIC_ONLY# 97 | Dec #NUMERIC_ONLY# 98 | 99 | # Other abbreviations 100 | tel #NUMERIC_ONLY# 101 | Tel #NUMERIC_ONLY# 102 | Fax #NUMERIC_ONLY# 103 | fax #NUMERIC_ONLY# 104 | -------------------------------------------------------------------------------- /preprocess/nonbreaking_prefixes/nonbreaking_prefix.is: -------------------------------------------------------------------------------- 1 | no #NUMERIC_ONLY# 2 | No #NUMERIC_ONLY# 3 | nr #NUMERIC_ONLY# 4 | Nr #NUMERIC_ONLY# 5 | nR #NUMERIC_ONLY# 6 | NR #NUMERIC_ONLY# 7 | a 8 | b 9 | c 10 | d 11 | e 12 | f 13 | g 14 | h 15 | i 16 | j 17 | k 18 | l 19 | m 20 | n 21 | o 22 | p 23 | q 24 | r 25 | s 26 | t 27 | u 28 | v 29 | w 30 | x 31 | y 32 | z 33 | ^ 34 | í 35 | á 36 | ó 37 | æ 38 | A 39 | B 40 | C 41 | D 42 | E 43 | F 44 | G 45 | H 46 | I 47 | J 48 | K 49 | L 50 | M 51 | N 52 | O 53 | P 54 | Q 55 | R 56 | S 57 | T 58 | U 59 | V 60 | W 61 | X 62 | Y 63 | Z 64 | ab.fn 65 | a.fn 66 | afs 67 | al 68 | alm 69 | alg 70 | andh 71 | ath 72 | aths 73 | atr 74 | ao 75 | au 76 | aukaf 77 | áfn 78 | áhrl.s 79 | áhrs 80 | ákv.gr 81 | ákv 82 | bh 83 | bls 84 | dr 85 | e.Kr 86 | et 87 | ef 88 | efn 89 | ennfr 90 | eink 91 | end 92 | e.st 93 | erl 94 | fél 95 | fskj 96 | fh 97 | f.hl 98 | físl 99 | fl 100 | fn 101 | fo 102 | forl 103 | frb 104 | frl 105 | frh 106 | frt 107 | fsl 108 | fsh 109 | fs 110 | fsk 111 | fst 112 | f.Kr 113 | ft 114 | fv 115 | fyrrn 116 | fyrrv 117 | germ 118 | gm 119 | gr 120 | hdl 121 | hdr 122 | hf 123 | hl 124 | hlsk 125 | hljsk 126 | hljv 127 | hljóðv 128 | hr 129 | hv 130 | hvk 131 | holl 132 | Hos 133 | höf 134 | hk 135 | hrl 136 | ísl 137 | kaf 138 | kap 139 | Khöfn 140 | kk 141 | kg 142 | kk 143 | km 144 | kl 145 | klst 146 | kr 147 | kt 148 | kgúrsk 149 | kvk 150 | leturbr 151 | lh 152 | lh.nt 153 | lh.þt 154 | lo 155 | ltr 156 | mlja 157 | mljó 158 | millj 159 | mm 160 | mms 161 | m.fl 162 | miðm 163 | mgr 164 | mst 165 | mín 166 | nf 167 | nh 168 | nhm 169 | nl 170 | nk 171 | nmgr 172 | no 173 | núv 174 | nt 175 | o.áfr 176 | o.m.fl 177 | ohf 178 | o.fl 179 | o.s.frv 180 | ófn 181 | ób 182 | óákv.gr 183 | óákv 184 | pfn 185 | PR 186 | pr 187 | Ritstj 188 | Rvík 189 | Rvk 190 | samb 191 | samhlj 192 | samn 193 | samn 194 | sbr 195 | sek 196 | sérn 197 | sf 198 | sfn 199 | sh 200 | sfn 201 | sh 202 | s.hl 203 | sk 204 | skv 205 | sl 206 | sn 207 | so 208 | ss.us 209 | s.st 210 | samþ 211 | sbr 212 | shlj 213 | sign 214 | skál 215 | st 216 | st.s 217 | stk 218 | sþ 219 | teg 220 | tbl 221 | tfn 222 | tl 223 | tvíhlj 224 | tvt 225 | till 226 | to 227 | umr 228 | uh 229 | us 230 | uppl 231 | útg 232 | vb 233 | Vf 234 | vh 235 | vkf 236 | Vl 237 | vl 238 | vlf 239 | vmf 240 | 8vo 241 | 
vsk 242 | vth 243 | þt 244 | þf 245 | þjs 246 | þgf 247 | þlt 248 | þolm 249 | þm 250 | þml 251 | þýð 252 | -------------------------------------------------------------------------------- /preprocess/nonbreaking_prefixes/nonbreaking_prefix.it: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 34 | Adj 35 | Adm 36 | Adv 37 | Amn 38 | Arch 39 | Asst 40 | Avv 41 | Bart 42 | Bcc 43 | Bldg 44 | Brig 45 | Bros 46 | C.A.P 47 | C.P 48 | Capt 49 | Cc 50 | Cmdr 51 | Co 52 | Col 53 | Comdr 54 | Con 55 | Corp 56 | Cpl 57 | DR 58 | Dott 59 | Dr 60 | Drs 61 | Egr 62 | Ens 63 | Gen 64 | Geom 65 | Gov 66 | Hon 67 | Hosp 68 | Hr 69 | Id 70 | Ing 71 | Insp 72 | Lt 73 | MM 74 | MR 75 | MRS 76 | MS 77 | Maj 78 | Messrs 79 | Mlle 80 | Mme 81 | Mo 82 | Mons 83 | Mr 84 | Mrs 85 | Ms 86 | Msgr 87 | N.B 88 | Op 89 | Ord 90 | P.S 91 | P.T 92 | Pfc 93 | Ph 94 | Prof 95 | Pvt 96 | RP 97 | RSVP 98 | Rag 99 | Rep 100 | Reps 101 | Res 102 | Rev 103 | Rif 104 | Rt 105 | S.A 106 | S.B.F 107 | S.P.M 108 | S.p.A 109 | S.r.l 110 | Sen 111 | Sens 112 | Sfc 113 | Sgt 114 | Sig 115 | Sigg 116 | Soc 117 | Spett 118 | Sr 119 | St 120 | Supt 121 | Surg 122 | V.P 123 | 124 | # other 125 | a.c 126 | acc 127 | all 128 | banc 129 | c.a 130 | c.c.p 131 | c.m 132 | c.p 133 | c.s 134 | c.v 135 | corr 136 | dott 137 | e.p.c 138 | ecc 139 | es 140 | fatt 141 | gg 142 | int 143 | lett 144 | ogg 145 | on 146 | p.c 147 | p.c.c 148 | p.es 149 | p.f 150 | p.r 151 | p.v 152 | post 153 | pp 154 | racc 155 | ric 156 | s.n.c 157 | seg 158 | sgg 159 | ss 160 | tel 161 | u.s 162 | v.r 163 | v.s 164 | 165 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 166 | v 167 | vs 168 | i.e 169 | rev 170 | e.g 171 | 172 | #Numbers only. These should only induce breaks when followed by a numeric sequence 173 | # add NUMERIC_ONLY after the word for this function 174 | #This case is mostly for the english "No." which can either be a sentence of its own, or 175 | #if followed by a number, a non-breaking prefix 176 | No #NUMERIC_ONLY# 177 | Nos 178 | Art #NUMERIC_ONLY# 179 | Nr 180 | pp #NUMERIC_ONLY# 181 | -------------------------------------------------------------------------------- /preprocess/nonbreaking_prefixes/nonbreaking_prefix.lv: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 
3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | Ā 8 | B 9 | C 10 | Č 11 | D 12 | E 13 | Ē 14 | F 15 | G 16 | Ģ 17 | H 18 | I 19 | Ī 20 | J 21 | K 22 | Ķ 23 | L 24 | Ļ 25 | M 26 | N 27 | Ņ 28 | O 29 | P 30 | Q 31 | R 32 | S 33 | Š 34 | T 35 | U 36 | Ū 37 | V 38 | W 39 | X 40 | Y 41 | Z 42 | Ž 43 | 44 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 45 | dr 46 | Dr 47 | med 48 | prof 49 | Prof 50 | inž 51 | Inž 52 | ist.loc 53 | Ist.loc 54 | kor.loc 55 | Kor.loc 56 | v.i 57 | vietn 58 | Vietn 59 | 60 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 61 | a.l 62 | t.p 63 | pārb 64 | Pārb 65 | vec 66 | Vec 67 | inv 68 | Inv 69 | sk 70 | Sk 71 | spec 72 | Spec 73 | vienk 74 | Vienk 75 | virz 76 | Virz 77 | māksl 78 | Māksl 79 | mūz 80 | Mūz 81 | akad 82 | Akad 83 | soc 84 | Soc 85 | galv 86 | Galv 87 | vad 88 | Vad 89 | sertif 90 | Sertif 91 | folkl 92 | Folkl 93 | hum 94 | Hum 95 | 96 | #Numbers only. These should only induce breaks when followed by a numeric sequence 97 | # add NUMERIC_ONLY after the word for this function 98 | #This case is mostly for the english "No." which can either be a sentence of its own, or 99 | #if followed by a number, a non-breaking prefix 100 | Nr #NUMERIC_ONLY# 101 | -------------------------------------------------------------------------------- /preprocess/nonbreaking_prefixes/nonbreaking_prefix.nl: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | #Sources: http://nl.wikipedia.org/wiki/Lijst_van_afkortingen 4 | # http://nl.wikipedia.org/wiki/Aanspreekvorm 5 | # http://nl.wikipedia.org/wiki/Titulatuur_in_het_Nederlands_hoger_onderwijs 6 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 7 | #usually upper case letters are initials in a name 8 | A 9 | B 10 | C 11 | D 12 | E 13 | F 14 | G 15 | H 16 | I 17 | J 18 | K 19 | L 20 | M 21 | N 22 | O 23 | P 24 | Q 25 | R 26 | S 27 | T 28 | U 29 | V 30 | W 31 | X 32 | Y 33 | Z 34 | 35 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 36 | bacc 37 | bc 38 | bgen 39 | c.i 40 | dhr 41 | dr 42 | dr.h.c 43 | drs 44 | drs 45 | ds 46 | eint 47 | fa 48 | Fa 49 | fam 50 | gen 51 | genm 52 | ing 53 | ir 54 | jhr 55 | jkvr 56 | jr 57 | kand 58 | kol 59 | lgen 60 | lkol 61 | Lt 62 | maj 63 | Mej 64 | mevr 65 | Mme 66 | mr 67 | mr 68 | Mw 69 | o.b.s 70 | plv 71 | prof 72 | ritm 73 | tint 74 | Vz 75 | Z.D 76 | Z.D.H 77 | Z.E 78 | Z.Em 79 | Z.H 80 | Z.K.H 81 | Z.K.M 82 | Z.M 83 | z.v 84 | 85 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 86 | #we seem to have a lot of these in dutch i.e.: i.p.v - in plaats van (in stead of) never ends a sentence 87 | a.g.v 88 | bijv 89 | bijz 90 | bv 91 | d.w.z 92 | e.c 93 | e.g 94 | e.k 95 | ev 96 | i.p.v 97 | i.s.m 98 | i.t.t 99 | i.v.m 100 | m.a.w 101 | m.b.t 102 | m.b.v 103 | m.h.o 104 | m.i 105 | m.i.v 106 | v.w.t 107 | 108 | #Numbers only. 
These should only induce breaks when followed by a numeric sequence 109 | # add NUMERIC_ONLY after the word for this function 110 | #This case is mostly for the english "No." which can either be a sentence of its own, or 111 | #if followed by a number, a non-breaking prefix 112 | Nr #NUMERIC_ONLY# 113 | Nrs 114 | nrs 115 | nr #NUMERIC_ONLY# 116 | -------------------------------------------------------------------------------- /preprocess/nonbreaking_prefixes/nonbreaking_prefix.pl: -------------------------------------------------------------------------------- 1 | adw 2 | afr 3 | akad 4 | al 5 | Al 6 | am 7 | amer 8 | arch 9 | art 10 | Art 11 | artyst 12 | astr 13 | austr 14 | bałt 15 | bdb 16 | bł 17 | bm 18 | br 19 | bryg 20 | bryt 21 | centr 22 | ces 23 | chem 24 | chiń 25 | chir 26 | c.k 27 | c.o 28 | cyg 29 | cyw 30 | cyt 31 | czes 32 | czw 33 | cd 34 | Cd 35 | czyt 36 | ćw 37 | ćwicz 38 | daw 39 | dcn 40 | dekl 41 | demokr 42 | det 43 | diec 44 | dł 45 | dn 46 | dot 47 | dol 48 | dop 49 | dost 50 | dosł 51 | h.c 52 | ds 53 | dst 54 | duszp 55 | dypl 56 | egz 57 | ekol 58 | ekon 59 | elektr 60 | em 61 | ew 62 | fab 63 | farm 64 | fot 65 | fr 66 | gat 67 | gastr 68 | geogr 69 | geol 70 | gimn 71 | głęb 72 | gm 73 | godz 74 | górn 75 | gosp 76 | gr 77 | gram 78 | hist 79 | hiszp 80 | hr 81 | Hr 82 | hot 83 | id 84 | in 85 | im 86 | iron 87 | jn 88 | kard 89 | kat 90 | katol 91 | k.k 92 | kk 93 | kol 94 | kl 95 | k.p.a 96 | kpc 97 | k.p.c 98 | kpt 99 | kr 100 | k.r 101 | krak 102 | k.r.o 103 | kryt 104 | kult 105 | laic 106 | łac 107 | niem 108 | woj 109 | nb 110 | np 111 | Nb 112 | Np 113 | pol 114 | pow 115 | m.in 116 | pt 117 | ps 118 | Pt 119 | Ps 120 | cdn 121 | jw 122 | ryc 123 | rys 124 | Ryc 125 | Rys 126 | tj 127 | tzw 128 | Tzw 129 | tzn 130 | zob 131 | ang 132 | ub 133 | ul 134 | pw 135 | pn 136 | pl 137 | al 138 | k 139 | n 140 | nr #NUMERIC_ONLY# 141 | Nr #NUMERIC_ONLY# 142 | ww 143 | wł 144 | ur 145 | zm 146 | żyd 147 | żarg 148 | żyw 149 | wył 150 | bp 151 | bp 152 | wyst 153 | tow 154 | Tow 155 | o 156 | sp 157 | Sp 158 | st 159 | spółdz 160 | Spółdz 161 | społ 162 | spółgł 163 | stoł 164 | stow 165 | Stoł 166 | Stow 167 | zn 168 | zew 169 | zewn 170 | zdr 171 | zazw 172 | zast 173 | zaw 174 | zał 175 | zal 176 | zam 177 | zak 178 | zakł 179 | zagr 180 | zach 181 | adw 182 | Adw 183 | lek 184 | Lek 185 | med 186 | mec 187 | Mec 188 | doc 189 | Doc 190 | dyw 191 | dyr 192 | Dyw 193 | Dyr 194 | inż 195 | Inż 196 | mgr 197 | Mgr 198 | dh 199 | dr 200 | Dh 201 | Dr 202 | p 203 | P 204 | red 205 | Red 206 | prof 207 | prok 208 | Prof 209 | Prok 210 | hab 211 | płk 212 | Płk 213 | nadkom 214 | Nadkom 215 | podkom 216 | Podkom 217 | ks 218 | Ks 219 | gen 220 | Gen 221 | por 222 | Por 223 | reż 224 | Reż 225 | przyp 226 | Przyp 227 | śp 228 | św 229 | śW 230 | Śp 231 | Św 232 | ŚW 233 | szer 234 | Szer 235 | pkt #NUMERIC_ONLY# 236 | str #NUMERIC_ONLY# 237 | tab #NUMERIC_ONLY# 238 | Tab #NUMERIC_ONLY# 239 | tel 240 | ust #NUMERIC_ONLY# 241 | par #NUMERIC_ONLY# 242 | poz 243 | pok 244 | oo 245 | oO 246 | Oo 247 | OO 248 | r #NUMERIC_ONLY# 249 | l #NUMERIC_ONLY# 250 | s #NUMERIC_ONLY# 251 | najśw 252 | Najśw 253 | A 254 | B 255 | C 256 | D 257 | E 258 | F 259 | G 260 | H 261 | I 262 | J 263 | K 264 | L 265 | M 266 | N 267 | O 268 | P 269 | Q 270 | R 271 | S 272 | T 273 | U 274 | V 275 | W 276 | X 277 | Y 278 | Z 279 | Ś 280 | Ć 281 | Ż 282 | Ź 283 | Dz 284 | -------------------------------------------------------------------------------- 
/preprocess/nonbreaking_prefixes/nonbreaking_prefix.pt: -------------------------------------------------------------------------------- 1 | #File adapted for PT by H. Leal Fontes from the EN & DE versions published with moses-2009-04-13. Last update: 10.11.2009. 2 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 3 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 4 | 5 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 6 | #usually upper case letters are initials in a name 7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | 61 | #Roman Numerals. A dot after one of these is not a sentence break in Portuguese. 62 | I 63 | II 64 | III 65 | IV 66 | V 67 | VI 68 | VII 69 | VIII 70 | IX 71 | X 72 | XI 73 | XII 74 | XIII 75 | XIV 76 | XV 77 | XVI 78 | XVII 79 | XVIII 80 | XIX 81 | XX 82 | i 83 | ii 84 | iii 85 | iv 86 | v 87 | vi 88 | vii 89 | viii 90 | ix 91 | x 92 | xi 93 | xii 94 | xiii 95 | xiv 96 | xv 97 | xvi 98 | xvii 99 | xviii 100 | xix 101 | xx 102 | 103 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 104 | Adj 105 | Adm 106 | Adv 107 | Art 108 | Ca 109 | Capt 110 | Cmdr 111 | Col 112 | Comdr 113 | Con 114 | Corp 115 | Cpl 116 | DR 117 | DRA 118 | Dr 119 | Dra 120 | Dras 121 | Drs 122 | Eng 123 | Enga 124 | Engas 125 | Engos 126 | Ex 127 | Exo 128 | Exmo 129 | Fig 130 | Gen 131 | Hosp 132 | Insp 133 | Lda 134 | MM 135 | MR 136 | MRS 137 | MS 138 | Maj 139 | Mrs 140 | Ms 141 | Msgr 142 | Op 143 | Ord 144 | Pfc 145 | Ph 146 | Prof 147 | Pvt 148 | Rep 149 | Reps 150 | Res 151 | Rev 152 | Rt 153 | Sen 154 | Sens 155 | Sfc 156 | Sgt 157 | Sr 158 | Sra 159 | Sras 160 | Srs 161 | Sto 162 | Supt 163 | Surg 164 | adj 165 | adm 166 | adv 167 | art 168 | cit 169 | col 170 | con 171 | corp 172 | cpl 173 | dr 174 | dra 175 | dras 176 | drs 177 | eng 178 | enga 179 | engas 180 | engos 181 | ex 182 | exo 183 | exmo 184 | fig 185 | op 186 | prof 187 | sr 188 | sra 189 | sras 190 | srs 191 | sto 192 | 193 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 194 | v 195 | vs 196 | i.e 197 | rev 198 | e.g 199 | 200 | #Numbers only. These should only induce breaks when followed by a numeric sequence 201 | # add NUMERIC_ONLY after the word for this function 202 | #This case is mostly for the english "No." 
which can either be a sentence of its own, or 203 | #if followed by a number, a non-breaking prefix 204 | No #NUMERIC_ONLY# 205 | Nos 206 | Art #NUMERIC_ONLY# 207 | Nr 208 | p #NUMERIC_ONLY# 209 | pp #NUMERIC_ONLY# 210 | 211 | -------------------------------------------------------------------------------- /preprocess/nonbreaking_prefixes/nonbreaking_prefix.ro: -------------------------------------------------------------------------------- 1 | A 2 | B 3 | C 4 | D 5 | E 6 | F 7 | G 8 | H 9 | I 10 | J 11 | K 12 | L 13 | M 14 | N 15 | O 16 | P 17 | Q 18 | R 19 | S 20 | T 21 | U 22 | V 23 | W 24 | X 25 | Y 26 | Z 27 | dpdv 28 | etc 29 | șamd 30 | M.Ap.N 31 | dl 32 | Dl 33 | d-na 34 | D-na 35 | dvs 36 | Dvs 37 | pt 38 | Pt 39 | -------------------------------------------------------------------------------- /preprocess/nonbreaking_prefixes/nonbreaking_prefix.ru: -------------------------------------------------------------------------------- 1 | # added Cyrillic uppercase letters [А-Я] 2 | # removed 000D carriage return (this is not removed by chomp in tokenizer.perl, and prevents recognition of the prefixes) 3 | # edited by Kate Young (nspaceanalysis@earthlink.net) 21 May 2013 4 | А 5 | Б 6 | В 7 | Г 8 | Д 9 | Е 10 | Ж 11 | З 12 | И 13 | Й 14 | К 15 | Л 16 | М 17 | Н 18 | О 19 | П 20 | Р 21 | С 22 | Т 23 | У 24 | Ф 25 | Х 26 | Ц 27 | Ч 28 | Ш 29 | Щ 30 | Ъ 31 | Ы 32 | Ь 33 | Э 34 | Ю 35 | Я 36 | A 37 | B 38 | C 39 | D 40 | E 41 | F 42 | G 43 | H 44 | I 45 | J 46 | K 47 | L 48 | M 49 | N 50 | O 51 | P 52 | Q 53 | R 54 | S 55 | T 56 | U 57 | V 58 | W 59 | X 60 | Y 61 | Z 62 | 0гг 63 | 1гг 64 | 2гг 65 | 3гг 66 | 4гг 67 | 5гг 68 | 6гг 69 | 7гг 70 | 8гг 71 | 9гг 72 | 0г 73 | 1г 74 | 2г 75 | 3г 76 | 4г 77 | 5г 78 | 6г 79 | 7г 80 | 8г 81 | 9г 82 | Xвв 83 | Vвв 84 | Iвв 85 | Lвв 86 | Mвв 87 | Cвв 88 | Xв 89 | Vв 90 | Iв 91 | Lв 92 | Mв 93 | Cв 94 | 0м 95 | 1м 96 | 2м 97 | 3м 98 | 4м 99 | 5м 100 | 6м 101 | 7м 102 | 8м 103 | 9м 104 | 0мм 105 | 1мм 106 | 2мм 107 | 3мм 108 | 4мм 109 | 5мм 110 | 6мм 111 | 7мм 112 | 8мм 113 | 9мм 114 | 0см 115 | 1см 116 | 2см 117 | 3см 118 | 4см 119 | 5см 120 | 6см 121 | 7см 122 | 8см 123 | 9см 124 | 0дм 125 | 1дм 126 | 2дм 127 | 3дм 128 | 4дм 129 | 5дм 130 | 6дм 131 | 7дм 132 | 8дм 133 | 9дм 134 | 0л 135 | 1л 136 | 2л 137 | 3л 138 | 4л 139 | 5л 140 | 6л 141 | 7л 142 | 8л 143 | 9л 144 | 0км 145 | 1км 146 | 2км 147 | 3км 148 | 4км 149 | 5км 150 | 6км 151 | 7км 152 | 8км 153 | 9км 154 | 0га 155 | 1га 156 | 2га 157 | 3га 158 | 4га 159 | 5га 160 | 6га 161 | 7га 162 | 8га 163 | 9га 164 | 0кг 165 | 1кг 166 | 2кг 167 | 3кг 168 | 4кг 169 | 5кг 170 | 6кг 171 | 7кг 172 | 8кг 173 | 9кг 174 | 0т 175 | 1т 176 | 2т 177 | 3т 178 | 4т 179 | 5т 180 | 6т 181 | 7т 182 | 8т 183 | 9т 184 | 0г 185 | 1г 186 | 2г 187 | 3г 188 | 4г 189 | 5г 190 | 6г 191 | 7г 192 | 8г 193 | 9г 194 | 0мг 195 | 1мг 196 | 2мг 197 | 3мг 198 | 4мг 199 | 5мг 200 | 6мг 201 | 7мг 202 | 8мг 203 | 9мг 204 | бульв 205 | в 206 | вв 207 | г 208 | га 209 | гг 210 | гл 211 | гос 212 | д 213 | дм 214 | доп 215 | др 216 | е 217 | ед 218 | ед 219 | зам 220 | и 221 | инд 222 | исп 223 | Исп 224 | к 225 | кап 226 | кг 227 | кв 228 | кл 229 | км 230 | кол 231 | комн 232 | коп 233 | куб 234 | л 235 | лиц 236 | лл 237 | м 238 | макс 239 | мг 240 | мин 241 | мл 242 | млн 243 | млрд 244 | мм 245 | н 246 | наб 247 | нач 248 | неуд 249 | ном 250 | о 251 | обл 252 | обр 253 | общ 254 | ок 255 | ост 256 | отл 257 | п 258 | пер 259 | перераб 260 | пл 261 | пос 262 | пр 263 | просп 264 | проф 265 | р 266 | ред 267 | руб 268 | с 269 | сб 270 | св 
271 | см 272 | соч 273 | ср 274 | ст 275 | стр 276 | т 277 | тел 278 | Тел 279 | тех 280 | тт 281 | туп 282 | тыс 283 | уд 284 | ул 285 | уч 286 | физ 287 | х 288 | хор 289 | ч 290 | чел 291 | шт 292 | экз 293 | э 294 | -------------------------------------------------------------------------------- /preprocess/nonbreaking_prefixes/nonbreaking_prefix.sk: -------------------------------------------------------------------------------- 1 | Bc 2 | Mgr 3 | RNDr 4 | PharmDr 5 | PhDr 6 | JUDr 7 | PaedDr 8 | ThDr 9 | Ing 10 | MUDr 11 | MDDr 12 | MVDr 13 | Dr 14 | ThLic 15 | PhD 16 | ArtD 17 | ThDr 18 | Dr 19 | DrSc 20 | CSs 21 | prof 22 | obr 23 | Obr 24 | Č 25 | č 26 | absol 27 | adj 28 | admin 29 | adr 30 | Adr 31 | adv 32 | advok 33 | afr 34 | ak 35 | akad 36 | akc 37 | akuz 38 | et 39 | al 40 | alch 41 | amer 42 | anat 43 | angl 44 | Angl 45 | anglosas 46 | anorg 47 | ap 48 | apod 49 | arch 50 | archeol 51 | archit 52 | arg 53 | art 54 | astr 55 | astrol 56 | astron 57 | atp 58 | atď 59 | austr 60 | Austr 61 | aut 62 | belg 63 | Belg 64 | bibl 65 | Bibl 66 | biol 67 | bot 68 | bud 69 | bás 70 | býv 71 | cest 72 | chem 73 | cirk 74 | csl 75 | čs 76 | Čs 77 | dat 78 | dep 79 | det 80 | dial 81 | diaľ 82 | dipl 83 | distrib 84 | dokl 85 | dosl 86 | dopr 87 | dram 88 | duš 89 | dv 90 | dvojčl 91 | dór 92 | ekol 93 | ekon 94 | el 95 | elektr 96 | elektrotech 97 | energet 98 | epic 99 | est 100 | etc 101 | etonym 102 | eufem 103 | európ 104 | Európ 105 | ev 106 | evid 107 | expr 108 | fa 109 | fam 110 | farm 111 | fem 112 | feud 113 | fil 114 | filat 115 | filoz 116 | fi 117 | fon 118 | form 119 | fot 120 | fr 121 | Fr 122 | franc 123 | Franc 124 | fraz 125 | fut 126 | fyz 127 | fyziol 128 | garb 129 | gen 130 | genet 131 | genpor 132 | geod 133 | geogr 134 | geol 135 | geom 136 | germ 137 | gr 138 | Gr 139 | gréc 140 | Gréc 141 | gréckokat 142 | hebr 143 | herald 144 | hist 145 | hlav 146 | hosp 147 | hromad 148 | hud 149 | hypok 150 | ident 151 | i.e 152 | ident 153 | imp 154 | impf 155 | indoeur 156 | inf 157 | inform 158 | instr 159 | int 160 | interj 161 | inšt 162 | inštr 163 | iron 164 | jap 165 | Jap 166 | jaz 167 | jedn 168 | juhoamer 169 | juhových 170 | juhozáp 171 | juž 172 | kanad 173 | Kanad 174 | kanc 175 | kapit 176 | kpt 177 | kart 178 | katastr 179 | knih 180 | kniž 181 | komp 182 | konj 183 | konkr 184 | kozmet 185 | krajč 186 | kresť 187 | kt 188 | kuch 189 | lat 190 | latinskoamer 191 | lek 192 | lex 193 | lingv 194 | lit 195 | litur 196 | log 197 | lok 198 | max 199 | Max 200 | maď 201 | Maď 202 | medzinár 203 | mest 204 | metr 205 | mil 206 | Mil 207 | min 208 | Min 209 | miner 210 | ml 211 | mld 212 | mn 213 | mod 214 | mytol 215 | napr 216 | nar 217 | Nar 218 | nasl 219 | nedok 220 | neg 221 | negat 222 | neklas 223 | nem 224 | Nem 225 | neodb 226 | neos 227 | neskl 228 | nesklon 229 | nespis 230 | nespráv 231 | neved 232 | než 233 | niekt 234 | niž 235 | nom 236 | náb 237 | nákl 238 | námor 239 | nár 240 | obch 241 | obj 242 | obv 243 | obyč 244 | obč 245 | občian 246 | odb 247 | odd 248 | ods 249 | ojed 250 | okr 251 | Okr 252 | opt 253 | opyt 254 | org 255 | os 256 | osob 257 | ot 258 | ovoc 259 | par 260 | part 261 | pejor 262 | pers 263 | pf 264 | Pf 265 | P.f 266 | p.f 267 | pl 268 | Plk 269 | pod 270 | podst 271 | pokl 272 | polit 273 | politol 274 | polygr 275 | pomn 276 | popl 277 | por 278 | porad 279 | porov 280 | posch 281 | potrav 282 | použ 283 | poz 284 | pozit 285 | poľ 286 | poľno 287 | poľnohosp 288 | poľov 289 | pošt 290 | pož 291 | prac 292 | predl 293 | 
pren 294 | prep 295 | preuk 296 | priezv 297 | Priezv 298 | privl 299 | prof 300 | práv 301 | príd 302 | príj 303 | prík 304 | príp 305 | prír 306 | prísl 307 | príslov 308 | príč 309 | psych 310 | publ 311 | pís 312 | písm 313 | pôv 314 | refl 315 | reg 316 | rep 317 | resp 318 | rozk 319 | rozlič 320 | rozpráv 321 | roč 322 | Roč 323 | ryb 324 | rádiotech 325 | rím 326 | samohl 327 | semest 328 | sev 329 | severoamer 330 | severových 331 | severozáp 332 | sg 333 | skr 334 | skup 335 | sl 336 | Sloven 337 | soc 338 | soch 339 | sociol 340 | sp 341 | spol 342 | Spol 343 | spoloč 344 | spoluhl 345 | správ 346 | spôs 347 | st 348 | star 349 | starogréc 350 | starorím 351 | s.r.o 352 | stol 353 | stor 354 | str 355 | stredoamer 356 | stredoškol 357 | subj 358 | subst 359 | superl 360 | sv 361 | sz 362 | súkr 363 | súp 364 | súvzť 365 | tal 366 | Tal 367 | tech 368 | tel 369 | Tel 370 | telef 371 | teles 372 | telev 373 | teol 374 | trans 375 | turist 376 | tuzem 377 | typogr 378 | tzn 379 | tzv 380 | ukaz 381 | ul 382 | Ul 383 | umel 384 | univ 385 | ust 386 | ved 387 | vedľ 388 | verb 389 | veter 390 | vin 391 | viď 392 | vl 393 | vod 394 | vodohosp 395 | pnl 396 | vulg 397 | vyj 398 | vys 399 | vysokoškol 400 | vzťaž 401 | vôb 402 | vých 403 | výd 404 | výrob 405 | výsk 406 | výsl 407 | výtv 408 | výtvar 409 | význ 410 | včel 411 | vš 412 | všeob 413 | zahr 414 | zar 415 | zariad 416 | zast 417 | zastar 418 | zastaráv 419 | zb 420 | zdravot 421 | združ 422 | zjemn 423 | zlat 424 | zn 425 | Zn 426 | zool 427 | zr 428 | zried 429 | zv 430 | záhr 431 | zák 432 | zákl 433 | zám 434 | záp 435 | západoeur 436 | zázn 437 | územ 438 | účt 439 | čast 440 | čes 441 | Čes 442 | čl 443 | čísl 444 | živ 445 | pr 446 | fak 447 | Kr 448 | p.n.l 449 | A 450 | B 451 | C 452 | D 453 | E 454 | F 455 | G 456 | H 457 | I 458 | J 459 | K 460 | L 461 | M 462 | N 463 | O 464 | P 465 | Q 466 | R 467 | S 468 | T 469 | U 470 | V 471 | W 472 | X 473 | Y 474 | Z 475 | -------------------------------------------------------------------------------- /preprocess/nonbreaking_prefixes/nonbreaking_prefix.sl: -------------------------------------------------------------------------------- 1 | dr 2 | Dr 3 | itd 4 | itn 5 | št #NUMERIC_ONLY# 6 | Št #NUMERIC_ONLY# 7 | d 8 | jan 9 | Jan 10 | feb 11 | Feb 12 | mar 13 | Mar 14 | apr 15 | Apr 16 | jun 17 | Jun 18 | jul 19 | Jul 20 | avg 21 | Avg 22 | sept 23 | Sept 24 | sep 25 | Sep 26 | okt 27 | Okt 28 | nov 29 | Nov 30 | dec 31 | Dec 32 | tj 33 | Tj 34 | npr 35 | Npr 36 | sl 37 | Sl 38 | op 39 | Op 40 | gl 41 | Gl 42 | oz 43 | Oz 44 | prev 45 | dipl 46 | ing 47 | prim 48 | Prim 49 | cf 50 | Cf 51 | gl 52 | Gl 53 | A 54 | B 55 | C 56 | D 57 | E 58 | F 59 | G 60 | H 61 | I 62 | J 63 | K 64 | L 65 | M 66 | N 67 | O 68 | P 69 | Q 70 | R 71 | S 72 | T 73 | U 74 | V 75 | W 76 | X 77 | Y 78 | Z 79 | -------------------------------------------------------------------------------- /preprocess/nonbreaking_prefixes/nonbreaking_prefix.sv: -------------------------------------------------------------------------------- 1 | #single upper case letter are usually initials 2 | A 3 | B 4 | C 5 | D 6 | E 7 | F 8 | G 9 | H 10 | I 11 | J 12 | K 13 | L 14 | M 15 | N 16 | O 17 | P 18 | Q 19 | R 20 | S 21 | T 22 | U 23 | V 24 | W 25 | X 26 | Y 27 | Z 28 | #misc abbreviations 29 | AB 30 | G 31 | VG 32 | dvs 33 | etc 34 | from 35 | iaf 36 | jfr 37 | kl 38 | kr 39 | mao 40 | mfl 41 | mm 42 | osv 43 | pga 44 | tex 45 | tom 46 | vs 47 | 
-------------------------------------------------------------------------------- /preprocess/nonbreaking_prefixes/nonbreaking_prefix.ta: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | அ 7 | ஆ 8 | இ 9 | ஈ 10 | உ 11 | ஊ 12 | எ 13 | ஏ 14 | ஐ 15 | ஒ 16 | ஓ 17 | ஔ 18 | ஃ 19 | க 20 | கா 21 | கி 22 | கீ 23 | கு 24 | கூ 25 | கெ 26 | கே 27 | கை 28 | கொ 29 | கோ 30 | கௌ 31 | க் 32 | ச 33 | சா 34 | சி 35 | சீ 36 | சு 37 | சூ 38 | செ 39 | சே 40 | சை 41 | சொ 42 | சோ 43 | சௌ 44 | ச் 45 | ட 46 | டா 47 | டி 48 | டீ 49 | டு 50 | டூ 51 | டெ 52 | டே 53 | டை 54 | டொ 55 | டோ 56 | டௌ 57 | ட் 58 | த 59 | தா 60 | தி 61 | தீ 62 | து 63 | தூ 64 | தெ 65 | தே 66 | தை 67 | தொ 68 | தோ 69 | தௌ 70 | த் 71 | ப 72 | பா 73 | பி 74 | பீ 75 | பு 76 | பூ 77 | பெ 78 | பே 79 | பை 80 | பொ 81 | போ 82 | பௌ 83 | ப் 84 | ற 85 | றா 86 | றி 87 | றீ 88 | று 89 | றூ 90 | றெ 91 | றே 92 | றை 93 | றொ 94 | றோ 95 | றௌ 96 | ற் 97 | ய 98 | யா 99 | யி 100 | யீ 101 | யு 102 | யூ 103 | யெ 104 | யே 105 | யை 106 | யொ 107 | யோ 108 | யௌ 109 | ய் 110 | ர 111 | ரா 112 | ரி 113 | ரீ 114 | ரு 115 | ரூ 116 | ரெ 117 | ரே 118 | ரை 119 | ரொ 120 | ரோ 121 | ரௌ 122 | ர் 123 | ல 124 | லா 125 | லி 126 | லீ 127 | லு 128 | லூ 129 | லெ 130 | லே 131 | லை 132 | லொ 133 | லோ 134 | லௌ 135 | ல் 136 | வ 137 | வா 138 | வி 139 | வீ 140 | வு 141 | வூ 142 | வெ 143 | வே 144 | வை 145 | வொ 146 | வோ 147 | வௌ 148 | வ் 149 | ள 150 | ளா 151 | ளி 152 | ளீ 153 | ளு 154 | ளூ 155 | ளெ 156 | ளே 157 | ளை 158 | ளொ 159 | ளோ 160 | ளௌ 161 | ள் 162 | ழ 163 | ழா 164 | ழி 165 | ழீ 166 | ழு 167 | ழூ 168 | ழெ 169 | ழே 170 | ழை 171 | ழொ 172 | ழோ 173 | ழௌ 174 | ழ் 175 | ங 176 | ஙா 177 | ஙி 178 | ஙீ 179 | ஙு 180 | ஙூ 181 | ஙெ 182 | ஙே 183 | ஙை 184 | ஙொ 185 | ஙோ 186 | ஙௌ 187 | ங் 188 | ஞ 189 | ஞா 190 | ஞி 191 | ஞீ 192 | ஞு 193 | ஞூ 194 | ஞெ 195 | ஞே 196 | ஞை 197 | ஞொ 198 | ஞோ 199 | ஞௌ 200 | ஞ் 201 | ண 202 | ணா 203 | ணி 204 | ணீ 205 | ணு 206 | ணூ 207 | ணெ 208 | ணே 209 | ணை 210 | ணொ 211 | ணோ 212 | ணௌ 213 | ண் 214 | ந 215 | நா 216 | நி 217 | நீ 218 | நு 219 | நூ 220 | நெ 221 | நே 222 | நை 223 | நொ 224 | நோ 225 | நௌ 226 | ந் 227 | ம 228 | மா 229 | மி 230 | மீ 231 | மு 232 | மூ 233 | மெ 234 | மே 235 | மை 236 | மொ 237 | மோ 238 | மௌ 239 | ம் 240 | ன 241 | னா 242 | னி 243 | னீ 244 | னு 245 | னூ 246 | னெ 247 | னே 248 | னை 249 | னொ 250 | னோ 251 | னௌ 252 | ன் 253 | 254 | 255 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 256 | திரு 257 | திருமதி 258 | வண 259 | கௌரவ 260 | 261 | 262 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 263 | உ.ம் 264 | #கா.ம் 265 | #எ.ம் 266 | 267 | 268 | #Numbers only. These should only induce breaks when followed by a numeric sequence 269 | # add NUMERIC_ONLY after the word for this function 270 | #This case is mostly for the english "No." 
which can either be a sentence of its own, or 271 | #if followed by a number, a non-breaking prefix 272 | No #NUMERIC_ONLY# 273 | Nos 274 | Art #NUMERIC_ONLY# 275 | Nr 276 | pp #NUMERIC_ONLY# 277 | -------------------------------------------------------------------------------- /preprocess/normalize-punctuation.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | use strict; 4 | 5 | my ($language) = @ARGV; 6 | 7 | while(<STDIN>) { 8 | s/\r//g; 9 | # remove extra spaces 10 | s/\(/ \(/g; 11 | s/\)/\) /g; s/ +/ /g; 12 | s/\) ([\.\!\:\?\;\,])/\)$1/g; 13 | s/\( /\(/g; 14 | s/ \)/\)/g; 15 | s/(\d) \%/$1\%/g; 16 | s/ :/:/g; 17 | s/ ;/;/g; 18 | # normalize unicode punctuation 19 | s/„/\"/g; 20 | s/“/\"/g; 21 | s/”/\"/g; 22 | s/–/-/g; 23 | s/—/ - /g; s/ +/ /g; 24 | s/´/\'/g; 25 | s/([a-z])‘([a-z])/$1\'$2/gi; 26 | s/([a-z])’([a-z])/$1\'$2/gi; 27 | s/‘/\"/g; 28 | s/‚/\"/g; 29 | s/’/\"/g; 30 | s/''/\"/g; 31 | s/´´/\"/g; 32 | s/…/.../g; 33 | # French quotes 34 | s/ « / \"/g; 35 | s/« /\"/g; 36 | s/«/\"/g; 37 | s/ » /\" /g; 38 | s/ »/\"/g; 39 | s/»/\"/g; 40 | # handle pseudo-spaces 41 | s/ \%/\%/g; 42 | s/nº /nº /g; 43 | s/ :/:/g; 44 | s/ ºC/ ºC/g; 45 | s/ cm/ cm/g; 46 | s/ \?/\?/g; 47 | s/ \!/\!/g; 48 | s/ ;/;/g; 49 | s/, /, /g; s/ +/ /g; 50 | 51 | # English "quotation," followed by comma, style 52 | if ($language eq "en") { 53 | s/\"([,\.]+)/$1\"/g; 54 | } 55 | # Czech is confused 56 | elsif ($language eq "cs" || $language eq "cz") { 57 | } 58 | # German/Spanish/French "quotation", followed by comma, style 59 | else { 60 | s/,\"/\",/g; 61 | s/(\.+)\"(\s*[^<])/\"$1$2/g; # don't fix period at end of sentence 62 | } 63 | 64 | print STDERR $_ if //; 65 | 66 | if ($language eq "de" || $language eq "es" || $language eq "cz" || $language eq "cs" || $language eq "fr") { 67 | s/(\d) (\d)/$1,$2/g; 68 | } 69 | else { 70 | s/(\d) (\d)/$1.$2/g; 71 | } 72 | print $_; 73 | } 74 | -------------------------------------------------------------------------------- /preprocess/preprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # source language (example: fr) 4 | S=$1 5 | # target language (example: en) 6 | T=$2 7 | 8 | # path to dl4mt/data 9 | P1=$3 10 | 11 | # path to subword NMT scripts (can be downloaded from https://github.com/rsennrich/subword-nmt) 12 | P2=$4 13 | 14 | ## merge all parallel corpora 15 | #./merge.sh $1 $2 16 | 17 | perl $P1/normalize-punctuation.perl -l ${S} < all_${S}-${T}.${S} > all_${S}-${T}.${S}.norm # do this for validation and test 18 | perl $P1/normalize-punctuation.perl -l ${T} < all_${S}-${T}.${T} > all_${S}-${T}.${T}.norm # do this for validation and test 19 | 20 | # tokenize 21 | perl $P1/tokenizer_apos.perl -threads 5 -l $S < all_${S}-${T}.${S}.norm > all_${S}-${T}.${S}.tok # do this for validation and test 22 | perl $P1/tokenizer_apos.perl -threads 5 -l $T < all_${S}-${T}.${T}.norm > all_${S}-${T}.${T}.tok # do this for validation and test 23 | 24 | # BPE 25 | if [ ! -f "../${S}.bpe" ]; then 26 | python $P2/learn_bpe.py -s 20000 < all_${S}-${T}.${S}.tok > ../${S}.bpe 27 | fi 28 | if [ ! 
-f "../${T}.bpe" ]; then 29 | python $P2/learn_bpe.py -s 20000 < all_${S}-${T}.${T}.tok > ../${T}.bpe 30 | fi 31 | 32 | python $P2/apply_bpe.py -c ../${S}.bpe < all_${S}-${T}.${S}.tok > all_${S}-${T}.${S}.tok.bpe # do this for validation and test 33 | python $P2/apply_bpe.py -c ../${T}.bpe < all_${S}-${T}.${T}.tok > all_${S}-${T}.${T}.tok.bpe # do this for validation and test 34 | 35 | # shuffle 36 | python $P1/shuffle.py all_${S}-${T}.${S}.tok.bpe all_${S}-${T}.${T}.tok.bpe all_${S}-${T}.${S}.tok all_${S}-${T}.${T}.tok 37 | 38 | # build dictionary 39 | #python $P1/build_dictionary.py all_${S}-${T}.${S}.tok & 40 | #python $P1/build_dictionary.py all_${S}-${T}.${T}.tok & 41 | #python $P1/build_dictionary_word.py all_${S}-${T}.${S}.tok.bpe & 42 | #python $P1/build_dictionary_word.py all_${S}-${T}.${T}.tok.bpe & 43 | -------------------------------------------------------------------------------- /translate/translate_bpe2char.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | import os 4 | import time 5 | 6 | reload(sys) 7 | sys.setdefaultencoding('utf-8') 8 | 9 | sys.path.insert(0, "/misc/kcgscratch1/ChoGroup/jasonlee/dl4mt-c2c/bpe2char") # change appropriately 10 | 11 | import numpy 12 | import cPickle as pkl 13 | from mixer import * 14 | 15 | def translate_model(jobqueue, resultqueue, model, options, k, normalize, build_sampler, gen_sample, init_params, model_id, silent): 16 | 17 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 18 | trng = RandomStreams(1234) 19 | 20 | # allocate model parameters 21 | params = init_params(options) 22 | 23 | # load model parameters and set theano shared variables 24 | params = load_params(model, params) 25 | tparams = init_tparams(params) 26 | 27 | # word index 28 | use_noise = theano.shared(numpy.float32(0.)) 29 | f_init, f_next = build_sampler(tparams, options, trng, use_noise) 30 | 31 | def _translate(seq): 32 | use_noise.set_value(0.) 
33 | # sample given an input sequence and obtain scores 34 | # NOTE : if seq length too small, do something about it 35 | sample, score = gen_sample(tparams, f_init, f_next, 36 | numpy.array(seq).reshape([len(seq), 1]), 37 | options, trng=trng, k=k, maxlen=500, 38 | stochastic=False, argmax=False) 39 | 40 | # normalize scores according to sequence lengths 41 | if normalize: 42 | lengths = numpy.array([len(s) for s in sample]) 43 | score = score / lengths 44 | sidx = numpy.argmin(score) 45 | return sample[sidx] 46 | 47 | while jobqueue: 48 | req = jobqueue.pop(0) 49 | 50 | idx, x = req[0], req[1] 51 | if not silent: 52 | print "sentence", idx, model_id 53 | seq = _translate(x) 54 | 55 | resultqueue.append((idx, seq)) 56 | return 57 | 58 | def main(model, dictionary, dictionary_target, source_file, saveto, k=6, 59 | normalize=False, encoder_chr_level=False, 60 | decoder_chr_level=False, utf8=False, 61 | model_id=None, silent=False,): 62 | 63 | from char_base import (build_sampler, gen_sample, init_params) 64 | 65 | # load model model_options 66 | pkl_file = model.split('.')[0] + '.pkl' 67 | with open(pkl_file, 'rb') as f: 68 | options = pkl.load(f) 69 | 70 | # load source dictionary and invert 71 | with open(dictionary, 'rb') as f: 72 | word_dict = pkl.load(f) 73 | word_idict = dict() 74 | for kk, vv in word_dict.iteritems(): 75 | word_idict[vv] = kk 76 | #word_idict[0] = '' 77 | #word_idict[1] = 'UNK' 78 | 79 | # load target dictionary and invert 80 | with open(dictionary_target, 'rb') as f: 81 | word_dict_trg = pkl.load(f) 82 | word_idict_trg = dict() 83 | for kk, vv in word_dict_trg.iteritems(): 84 | word_idict_trg[vv] = kk 85 | #word_idict_trg[0] = '' 86 | #word_idict_trg[1] = 'UNK' 87 | 88 | # create input and output queues for processes 89 | jobqueue = [] 90 | resultqueue = [] 91 | 92 | # utility function 93 | def _seqs2words(caps): 94 | capsw = [] 95 | for cc in caps: 96 | ww = [] 97 | for w in cc: 98 | if w == 0: 99 | break 100 | if utf8: 101 | ww.append(word_idict_trg[w].encode('utf-8')) 102 | else: 103 | ww.append(word_idict_trg[w]) 104 | if decoder_chr_level: 105 | capsw.append(''.join(ww)) 106 | else: 107 | capsw.append(' '.join(ww)) 108 | return capsw 109 | 110 | def _send_jobs(fname): 111 | with open(fname, 'r') as f: 112 | for idx, line in enumerate(f): 113 | # idx : 0 ... len-1 114 | 115 | if encoder_chr_level: 116 | words = list(line.decode('utf-8').strip()) 117 | else: 118 | words = line.strip().split() 119 | 120 | x = map(lambda w: word_dict[w] if w in word_dict else 1, words) 121 | x = map(lambda ii: ii if ii < options['n_words_src'] else 1, x) 122 | x += [0] 123 | jobqueue.append((idx, x)) 124 | return idx+1 125 | 126 | def _retrieve_jobs(n_samples, silent): 127 | trans = [None] * n_samples 128 | 129 | for idx in xrange(n_samples): 130 | resp = resultqueue.pop(0) 131 | trans[resp[0]] = resp[1] 132 | if numpy.mod(idx, 10) == 0: 133 | if not silent: 134 | print 'Sample ', (idx+1), '/', n_samples, ' Done' 135 | return trans 136 | 137 | print 'Translating ', source_file, '...' 138 | print 'source dic ', dictionary, '...' 139 | print 'target dic ', dictionary_target, '...' 
140 | n_samples = _send_jobs(source_file) 141 | print "jobs sent" 142 | 143 | translate_model(jobqueue, resultqueue, model, options, k, normalize, build_sampler, gen_sample, init_params, model_id, silent) 144 | trans = _seqs2words(_retrieve_jobs(n_samples, silent)) 145 | print "translations retrieved" 146 | 147 | with open(saveto, 'w') as f: 148 | print >>f, u'\n'.join(trans).encode('utf-8') 149 | 150 | print "Done", saveto 151 | 152 | if __name__ == "__main__": 153 | parser = argparse.ArgumentParser() 154 | parser.add_argument('-k', type=int, default=20) # beam width 155 | parser.add_argument('-n', action="store_true", default=True) # normalize scores for different hypothesis based on their length (to penalize shorter hypotheses, longer hypotheses are already penalized by the BLEU measure, which is precision of sorts). 156 | parser.add_argument('-enc_c', action="store_true", default=False) # is encoder character-level? 157 | parser.add_argument('-dec_c', action="store_true", default=True) # is decoder character-level? 158 | parser.add_argument('-utf8', action="store_true", default=True) 159 | parser.add_argument('-many', action="store_true", default=False) # multilingual model? 160 | parser.add_argument('-model', type=str) # absolute path to a model (.npz file) 161 | parser.add_argument('-translate', type=str, help="de_en / cs_en / fi_en / ru_en") # which language? 162 | parser.add_argument('-saveto', type=str) # absolute path where the translation should be saved 163 | parser.add_argument('-which', type=str, help="dev / test1 / test2", default="dev") # if you wish to translate any of development / test1 / test2 file from WMT15, simply specify which one here 164 | parser.add_argument('-source', type=str, default="") # if you wish to provide your own file to be translated, provide an absolute path to the file to be translated 165 | parser.add_argument('-silent', action="store_true", default=False) # suppress progress messages 166 | 167 | args = parser.parse_args() 168 | 169 | which_wmt = None 170 | if args.many: 171 | which_wmt = "multi-wmt15" 172 | else: 173 | which_wmt = "wmt15" 174 | data_path = "/misc/kcgscratch1/ChoGroup/jasonlee/temp_data/%s/" % which_wmt # change appropriately 175 | 176 | if args.which not in "dev test1 test2".split(): 177 | raise Exception('1') 178 | 179 | if args.translate not in ["de_en", "cs_en", "fi_en", "ru_en"]: 180 | raise Exception('1') 181 | 182 | if args.translate == "fi_en" and args.which == "test2": 183 | raise Exception('1') 184 | 185 | if args.many: 186 | from wmt_path_iso9 import * 187 | 188 | dictionary = wmts["many_en"]["dic"][1][0] 189 | dictionary_target = wmts["many_en"]["dic"][0][1] 190 | source = wmts[args.translate][args.which][1][0] 191 | 192 | else: 193 | from wmt_path import * 194 | 195 | aa = args.translate.split("_") 196 | lang = aa[0] 197 | en = aa[1] 198 | 199 | dictionary = "%s%s/train/all_%s-%s.%s.tok.bpe.word.pkl" % (lang, en, lang, en, lang) 200 | dictionary_target = "%s%s/train/all_%s-%s.%s.tok.300.pkl" % (lang, en, lang, en, en) 201 | source = wmts[args.translate][args.which][1][0] 202 | 203 | # /work/yl1363/bpe2char/de_en/deen_bpe2char_two_layer_gru_decoder_adam.grads.355000.npz 204 | model_id = args.model.split('/')[-1] 205 | 206 | dictionary = data_path + dictionary 207 | dictionary_target = data_path + dictionary_target 208 | source = data_path + source 209 | 210 | if args.source != "": 211 | source = args.source 212 | 213 | print "src dict:", dictionary 214 | print "trg dict:", dictionary_target 215 | print "source:", source 
216 | 217 | print "dest :", args.saveto 218 | 219 | print args 220 | 221 | time1 = time.time() 222 | main(args.model, dictionary, dictionary_target, source, 223 | args.saveto, k=args.k, normalize=args.n, encoder_chr_level=args.enc_c, 224 | decoder_chr_level=args.dec_c, 225 | utf8=args.utf8, 226 | model_id = model_id, 227 | silent=args.silent, 228 | ) 229 | time2 = time.time() 230 | duration = (time2-time1)/float(60) 231 | print("Translation took %.2f minutes" % duration) 232 | -------------------------------------------------------------------------------- /translate/translate_char2char.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | import os 4 | import time 5 | 6 | reload(sys) 7 | sys.setdefaultencoding('utf-8') 8 | 9 | sys.path.insert(0, "/misc/kcgscratch1/ChoGroup/jasonlee/dl4mt-c2c/char2char") # change appropriately 10 | 11 | import numpy 12 | import cPickle as pkl 13 | from mixer import * 14 | 15 | def translate_model(jobqueue, resultqueue, model, options, k, normalize, build_sampler, gen_sample, init_params, model_id, silent): 16 | 17 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 18 | trng = RandomStreams(1234) 19 | 20 | # allocate model parameters 21 | params = init_params(options) 22 | 23 | # load model parameters and set theano shared variables 24 | params = load_params(model, params) 25 | tparams = init_tparams(params) 26 | 27 | # word index 28 | use_noise = theano.shared(numpy.float32(0.)) 29 | f_init, f_next = build_sampler(tparams, options, trng, use_noise) 30 | 31 | def _translate(seq): 32 | use_noise.set_value(0.) 33 | # sample given an input sequence and obtain scores 34 | # NOTE : if seq length too small, do something about it 35 | sample, score = gen_sample(tparams, f_init, f_next, 36 | numpy.array(seq).reshape([len(seq), 1]), 37 | options, trng=trng, k=k, maxlen=500, 38 | stochastic=False, argmax=False) 39 | 40 | # normalize scores according to sequence lengths 41 | if normalize: 42 | lengths = numpy.array([len(s) for s in sample]) 43 | score = score / lengths 44 | sidx = numpy.argmin(score) 45 | return sample[sidx] 46 | 47 | while jobqueue: 48 | req = jobqueue.pop(0) 49 | 50 | idx, x = req[0], req[1] 51 | if not silent: 52 | print "sentence", idx, model_id 53 | seq = _translate(x) 54 | 55 | resultqueue.append((idx, seq)) 56 | return 57 | 58 | def main(model, dictionary, dictionary_target, source_file, saveto, k=5, 59 | normalize=False, encoder_chr_level=False, 60 | decoder_chr_level=False, utf8=False, 61 | model_id=None, silent=False): 62 | 63 | from char_base import (build_sampler, gen_sample, init_params) 64 | 65 | # load model model_options 66 | # /misc/kcgscratch1/ChoGroup/jasonlee/dl4mt-cdec/models/one-multiscale-conv-two-hw-lngru-1234567-100-150-200-200-200-200-200-66-one.pkl 67 | pkl_file = model.split('.')[0] + '.pkl' 68 | with open(pkl_file, 'rb') as f: 69 | options = pkl.load(f) 70 | 71 | # load source dictionary and invert 72 | with open(dictionary, 'rb') as f: 73 | word_dict = pkl.load(f) 74 | word_idict = dict() 75 | for kk, vv in word_dict.iteritems(): 76 | word_idict[vv] = kk 77 | #word_idict[0] = 'ZERO' 78 | #word_idict[1] = 'UNK' 79 | 80 | # load target dictionary and invert 81 | with open(dictionary_target, 'rb') as f: 82 | word_dict_trg = pkl.load(f) 83 | word_idict_trg = dict() 84 | for kk, vv in word_dict_trg.iteritems(): 85 | word_idict_trg[vv] = kk 86 | #word_idict_trg[0] = 'ZERO' 87 | #word_idict_trg[1] = 'UNK' 88 | 89 | # create input and 
output queues for processes 90 | jobqueue = [] 91 | resultqueue = [] 92 | 93 | # utility function 94 | def _seqs2words(caps): 95 | capsw = [] 96 | for cc in caps: 97 | ww = [] 98 | for w in cc: 99 | if w == 0: 100 | break 101 | if utf8: 102 | ww.append(word_idict_trg[w].encode('utf-8')) 103 | else: 104 | ww.append(word_idict_trg[w]) 105 | if decoder_chr_level: 106 | capsw.append(''.join(ww)) 107 | else: 108 | capsw.append(' '.join(ww)) 109 | return capsw 110 | 111 | def _send_jobs(fname): 112 | with open(fname, 'r') as f: 113 | for idx, line in enumerate(f): 114 | # idx : 0 ... len-1 115 | pool_window = options['pool_stride'] 116 | 117 | if encoder_chr_level: 118 | words = list(line.decode('utf-8').strip()) 119 | else: 120 | words = line.strip().split() 121 | 122 | x = map(lambda w: word_dict[w] if w in word_dict else 1, words) 123 | x = map(lambda ii: ii if ii < options['n_words_src'] else 1, x) 124 | x = [2] + x + [3] 125 | 126 | # len : 77, pool_window 10 -> 3 127 | # len : 80, pool_window 10 -> 0 128 | #rem = pool_window - ( len(x) % pool_window ) 129 | #if rem < pool_window: 130 | # x += [0]*rem 131 | 132 | while len(x) % pool_window != 0: 133 | x += [0] 134 | 135 | x = [0]*pool_window + x + [0]*pool_window 136 | 137 | jobqueue.append((idx, x)) 138 | 139 | return idx+1 140 | 141 | def _retrieve_jobs(n_samples, silent): 142 | trans = [None] * n_samples 143 | 144 | for idx in xrange(n_samples): 145 | resp = resultqueue.pop(0) 146 | trans[resp[0]] = resp[1] 147 | if numpy.mod(idx, 10) == 0: 148 | if not silent: 149 | print 'Sample ', (idx+1), '/', n_samples, ' Done', model_id 150 | return trans 151 | 152 | print 'Translating ', source_file, '...' 153 | n_samples = _send_jobs(source_file) 154 | print "jobs sent" 155 | 156 | translate_model(jobqueue, resultqueue, model, options, k, normalize, build_sampler, gen_sample, init_params, model_id, silent) 157 | trans = _seqs2words(_retrieve_jobs(n_samples, silent)) 158 | print "translations retrieved" 159 | 160 | with open(saveto, 'w') as f: 161 | print >>f, u'\n'.join(trans).encode('utf-8') 162 | 163 | print "Done", saveto 164 | 165 | if __name__ == "__main__": 166 | parser = argparse.ArgumentParser() 167 | parser.add_argument('-k', type=int, default=20) # beam width 168 | parser.add_argument('-n', action="store_true", default=True) # normalize scores for different hypothesis based on their length (to penalize shorter hypotheses, longer hypotheses are already penalized by the BLEU measure, which is precision of sorts). 169 | parser.add_argument('-enc_c', action="store_true", default=True) # is encoder character-level? 170 | parser.add_argument('-dec_c', action="store_true", default=True) # is decoder character-level? 171 | parser.add_argument('-utf8', action="store_true", default=True) 172 | parser.add_argument('-many', action="store_true", default=False) # multilingual model? 173 | parser.add_argument('-model', type=str) # absolute path to a model (.npz file) 174 | parser.add_argument('-translate', type=str, help="de_en / cs_en / fi_en / ru_en") # which language? 
175 | parser.add_argument('-saveto', type=str, ) # absolute path where the translation should be saved 176 | parser.add_argument('-which', type=str, help="dev / test1 / test2", default="dev") # if you wish to translate any of development / test1 / test2 file from WMT15, simply specify which one here 177 | parser.add_argument('-source', type=str, default="") # if you wish to provide your own file to be translated, provide an absolute path to the file to be translated 178 | parser.add_argument('-silent', action="store_true", default=False) # suppress progress messages 179 | 180 | args = parser.parse_args() 181 | 182 | which_wmt = None 183 | if args.many: 184 | which_wmt = "multi-wmt15" 185 | else: 186 | which_wmt = "wmt15" 187 | 188 | data_path = "/misc/kcgscratch1/ChoGroup/jasonlee/temp_data/%s/" % which_wmt # change appropriately 189 | 190 | if args.which not in "dev test1 test2".split(): 191 | raise Exception('1') 192 | 193 | if args.translate not in ["de_en", "cs_en", "fi_en", "ru_en"]: 194 | raise Exception('1') 195 | 196 | if args.translate == "fi_en" and args.which == "test2": 197 | raise Exception('1') 198 | 199 | if args.many: 200 | from wmt_path_iso9 import * 201 | 202 | dictionary = wmts['many_en']['dic'][0][0] 203 | dictionary_target = wmts['many_en']['dic'][0][1] 204 | source = wmts[args.translate][args.which][0][0] 205 | 206 | else: 207 | from wmt_path import * 208 | 209 | aa = args.translate.split("_") 210 | lang = aa[0] 211 | en = aa[1] 212 | 213 | dictionary = "%s%s/train/all_%s-%s.%s.tok.304.pkl" % (lang, en, lang, en, lang) 214 | dictionary_target = "%s%s/train/all_%s-%s.%s.tok.300.pkl" % (lang, en, lang, en, en) 215 | source = wmts[args.translate][args.which][0][0] 216 | 217 | char_base = args.model.split("/")[-1] 218 | 219 | dictionary = data_path + dictionary 220 | dictionary_target = data_path + dictionary_target 221 | source = data_path + source 222 | 223 | if args.source != "": 224 | source = args.source 225 | 226 | print "src dict:", dictionary 227 | print "trg dict:", dictionary_target 228 | print "source:", source 229 | print "dest :", args.saveto 230 | 231 | print args 232 | 233 | time1 = time.time() 234 | main(args.model, dictionary, dictionary_target, source, 235 | args.saveto, k=args.k, normalize=args.n, encoder_chr_level=args.enc_c, 236 | decoder_chr_level=args.dec_c, 237 | utf8=args.utf8, 238 | model_id=char_base, 239 | silent=args.silent, 240 | ) 241 | time2 = time.time() 242 | duration = (time2-time1)/float(60) 243 | print("Translation took %.2f minutes" % duration) 244 | --------------------------------------------------------------------------------
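The two decoding scripts above share the same command-line interface, so a single usage sketch covers both. Everything below is illustrative: the model and output paths are placeholders, the hard-coded `sys.path.insert(...)` and `data_path` entries near the top of each script still need to be edited (as their `# change appropriately` comments indicate), and the training-time options pickle (looked up as `model.split('.')[0] + '.pkl'`) must be present alongside the `.npz` weights.

```bash
# Decode the WMT'15 DE-EN dev set with a bilingual char2char model
# (paths are placeholders -- substitute your own model / output locations).
$ python translate/translate_char2char.py \
    -model /path/to/models/deen_bichar2char.npz \
    -translate de_en -which dev -k 20 \
    -saveto /path/to/output/deen_dev.char2char.out

# Same interface for bpe2char models; add -many for the multilingual models,
# or -source /path/to/your/tokenized.txt to translate an arbitrary input file
# instead of a WMT dev/test set.
$ python translate/translate_bpe2char.py \
    -model /path/to/models/deen_bibpe2char.npz \
    -translate de_en -which test1 \
    -saveto /path/to/output/deen_test1.bpe2char.out
```

Beam width is set with `-k`, `-n` normalizes hypothesis scores by length (both scripts default it to on), and `-silent` suppresses the per-sentence progress messages.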