├── .idea
│   ├── inspectionProfiles
│   │   └── Project_Default.xml
│   └── vcs.xml
├── LICENSE
├── README.md
├── __init__.py
├── actors.py
├── bleu.py
├── bleu.pyc
├── config.py
├── config.pyc
├── data
│   ├── README.md
│   ├── build_dictionary.py
│   ├── download_files.py
│   ├── length.py
│   ├── merge.sh
│   ├── multi-bleu.perl
│   ├── nonbreaking_prefixes
│   │   ├── README.txt
│   │   ├── nonbreaking_prefix.ca
│   │   ├── nonbreaking_prefix.cs
│   │   ├── nonbreaking_prefix.de
│   │   ├── nonbreaking_prefix.el
│   │   ├── nonbreaking_prefix.en
│   │   ├── nonbreaking_prefix.es
│   │   ├── nonbreaking_prefix.fi
│   │   ├── nonbreaking_prefix.fr
│   │   ├── nonbreaking_prefix.hu
│   │   ├── nonbreaking_prefix.is
│   │   ├── nonbreaking_prefix.it
│   │   ├── nonbreaking_prefix.lv
│   │   ├── nonbreaking_prefix.nl
│   │   ├── nonbreaking_prefix.pl
│   │   ├── nonbreaking_prefix.pt
│   │   ├── nonbreaking_prefix.ro
│   │   ├── nonbreaking_prefix.ru
│   │   ├── nonbreaking_prefix.sk
│   │   ├── nonbreaking_prefix.sl
│   │   ├── nonbreaking_prefix.sv
│   │   └── nonbreaking_prefix.ta
│   ├── preprocess.sh
│   ├── scan_example.py
│   ├── setup_cluster_env.sh
│   ├── setup_local_env.sh
│   ├── shuffle.py
│   ├── strip_sgml.py
│   ├── tokenize_all.sh
│   ├── tokenizer.perl
│   └── translate.sh
├── data_iterator.py
├── data_iterator.pyc
├── insepection.py
├── insepection.pyc
├── itchat.pkl
├── layers.py
├── layers.pyc
├── mteval.sh
├── nmt_uni.py
├── nmt_uni.pyc
├── optimizer.py
├── optimizer.pyc
├── plot_heatmap.ipynb
├── policy.py
├── policy.pyc
├── pretrain_uni.py
├── reward.py
├── reward.pyc
├── run_eval.sh
├── run_train.sh
├── show_progress.ipynb
├── simultrans_beam.py
├── simultrans_eval.py
├── simultrans_model.py
├── simultrans_model.pyc
├── simultrans_model_clean.py
├── simultrans_model_clean.pyc
├── simultrans_train.py
├── translate_uni.py
├── translate_uni.sh
├── utils.py
├── utils.pyc
└── utils
    └── msyh.ttf

/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 | 
3 | Copyright (c) 2016, New York University (Kyunghyun Cho) and Jiatao Gu
4 | All rights reserved.
5 | 
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 | 
9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 | 
16 | * Neither the name of the copyright holder nor the names of its
17 |   contributors may be used to endorse or promote products derived from
18 |   this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Learning to Translate in Real-time with Neural Machine Translation
2 | ===================================
3 | Translation in Real-time, a.k.a., Simultaneous Translation.
4 | 
5 | This code is the Theano implementation of the EACL 2017 paper [Learning to Translate in Real-time with Neural Machine Translation](https://arxiv.org/abs/1610.00388). It is based on the dl4mt-tutorial (https://github.com/nyu-dl/dl4mt-tutorial).
6 | 
7 | Dependencies:
8 | ----------------------
9 | ### Python 2.7
10 | * Theano 0.8.2 (cuda 8.0, cudnn v5)
11 | * seaborn, pandas (for drawing the heatmap)
12 | * NLTK 3.2.1
13 | 
14 | ### Preprocessing
15 | The preprocessing and evaluation scripts are from [MOSES](https://github.com/moses-smt/mosesdecoder).
16 | 
17 | Dataset:
18 | ----------------------
19 | We used the WMT'15 corpora as the training set, both for pretraining the NMT model and for training the simultaneous NMT model.
20 | The original WMT'15 corpora can be downloaded from [here](http://www.statmt.org/wmt15/translation-task.html).
21 | In the preprocessed corpora used in our experiments, both the source and target sides are encoded with byte-pair encoding (http://arxiv.org/abs/1508.07909, https://github.com/rsennrich/subword-nmt).
22 | 
23 | Pretraining:
24 | ----------------------
25 | Before training the agent for simultaneous translation, the underlying translation model requires pretraining.
26 | In our experiments, we pretrained single-layer unidirectional NMT models on the RU-EN and DE-EN corpora, in both directions.
27 | 
28 | * We provide the preprocessed datasets and the pretrained models: (https://drive.google.com/drive/folders/0B0miOG3ks5c1SVljM1Q5SURibU0?usp=sharing)
29 | 
30 | ### Pretrain your own model:
31 | Follow the instructions and set up the configuration in `config.py` (`pretrain_config`), then execute:
32 | ```bash
33 | $ export THEANO_FLAGS=device=gpu,floatX=float32
34 | $ python pretrain_uni.py
35 | ```
36 | It normally takes 1-2 weeks to train a unidirectional NMT model on the WMT'15 corpora.
37 | 
38 | ### Evaluate the BLEU score for a pre-trained NMT model
39 | TBA.
40 | 
41 | Simultaneous Translation:
42 | ----------------------
43 | ### Training an Agent
44 | Follow the instructions and set up the configuration in `config.py` (`rl_config`), then execute:
45 | ```bash
46 | $ export THEANO_FLAGS=device=gpu,floatX=float32
47 | $ python simultrans_train.py
48 | ```
49 | ### Monitoring
50 | TBA.
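(Until then, the `show_progress.ipynb` notebook in the repository root can presumably be used to inspect training progress.)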
51 | 
52 | ### Visualization
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nyu-dl/dl4mt-simul-trans/392ff3148e944be6fbc475d5285441807902e2e0/__init__.py
--------------------------------------------------------------------------------
/actors.py:
--------------------------------------------------------------------------------
1 | """
2 | Deterministic Actor Functions:
3 | """
4 | from layers import *
5 | 
6 | TINY = 1e-7
7 | 
8 | # -------------------------------------------------------------------------#
9 | # Noise
10 | def ou_noise(trng, x, mu=0., theta=0.15, sigma=0.01):
11 |     dx = theta * (mu - x) + sigma * trng.normal(x.shape)
12 |     return x + dx
13 | 
14 | 
15 | def gaussian_noise(trng, x, mu=0, sigma=0.01):
       # NB: unlike ou_noise, this returns only the noise sample, not x + noise.
16 |     dx = mu + sigma * trng.normal(x.shape)
17 |     return dx
18 | 
19 | 
20 | # -------------------------------------------------------------------------#
21 | # Actors:
22 | actors = dict()
23 | actors['dumb'] = ('param_init_dumb', 'dumb_actor')
24 | actors['const'] = ('param_init_constant', 'constant_actor')
25 | actors['ff'] = ('param_init_ff', 'ff_actor')
26 | actors['gru'] = ('param_init_gru', 'gru_actor', 'gru_actor_hard')
27 | actors['gru2'] = ('param_init_gru2', 'gru_actor2')
28 | actors['gg'] = ('param_init_gg', 'gg_actor')
29 | 
30 | 
31 | def get_actor(name):
32 |     fns = actors[name]
33 |     return tuple([eval(f) for f in fns])
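# Usage sketch (illustrative names, not from the original source): each registry
# entry above maps to (param_init_fn, actor_fn[, hard_actor_fn]), resolved here
# by eval, e.g.
#   init_fn, actor_fn = get_actor('ff')
#   params = init_fn(options, prefix='pi')
#   action, hidden = actor_fn(tparams, options, h1, ctx=ctx, act=act, prefix='pi')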
34 | 
35 | 
36 | def _p(pp, name):
37 |     return '%s_%s' % (pp, name)
38 | 
39 | 
40 | # -------------------------------------------------------------------------#
41 | # Dumb Actors:
42 | def param_init_dumb(options, prefix='db', nin=None, nout=None):
43 |     params = OrderedDict()
44 |     if nin is None:
45 |         nin = options['dim'] + options['ctxdim']
46 | 
47 |     if nout is None:
48 |         nout = options['dim']
49 | 
50 |     return params
51 | 
52 | 
53 | def dumb_actor(tparams, options, h1, ctx=None, act=None, prefix='db'):
54 | 
55 |     action = tensor.zeros_like(h1)
56 |     hidden = act
57 |     return action, hidden
58 | 
59 | 
60 | # Constant Actors:
61 | def param_init_constant(options, prefix='ct', nin=None, nout=None):
62 |     params = OrderedDict()
63 |     if nin is None:
64 |         nin = options['dim'] + options['ctxdim']
65 | 
66 |     if nout is None:
67 |         nout = options['dim']
68 | 
69 |     params[_p(prefix, 'a')] = numpy.zeros((nout,)).astype('float32')
70 |     return params
71 | 
72 | 
73 | def constant_actor(tparams, options, h1, ctx=None, act=None, prefix='ct'):
74 |     action = tensor.zeros_like(h1)
75 |     if action.ndim == 2:
76 |         action += tparams[_p(prefix, 'a')][None, :]
77 |     elif action.ndim == 3:
78 |         action += tparams[_p(prefix, 'a')][None, None, :]
79 |     else:
80 |         action += tparams[_p(prefix, 'a')]
81 | 
82 |     hidden = act
83 |     return action, hidden
84 | 
85 | 
86 | # Feedforward Actors:
87 | def param_init_ff(options, prefix='ff', nin=None, nout=None, nhid=None):
88 | 
89 |     params = OrderedDict()
90 | 
91 |     if nin is None:
92 |         nin = options['dim'] + options['ctxdim']
93 | 
94 |     if nout is None:
95 |         nout = options['dim']
96 | 
97 |     if nhid is None:
98 |         nhid = options['act_hdim']
99 | 
100 |     params = get_layer('ff')[0](options, params, prefix=prefix + '_in',
101 |                                 nin=nin, nout=nhid, scale=0.001)
102 | 
103 |     params = get_layer('ff')[0](options, params, prefix=prefix + '_out',
104 |                                 nin=nhid, nout=nout, scale=0.001)
105 | 
106 |     return params
107 | 
108 | 
109 | def ff_actor(tparams, options, h1, ctx=None, act=None, prefix='ff'):
110 | 
111 |     hidden = get_layer('ff')[1](tparams, concatenate([h1, ctx], axis=1),
112 |                                 options, prefix=prefix + '_in', activ='tanh')
113 |     action = get_layer('ff')[1](tparams, hidden,
114 |                                 options, prefix=prefix + '_out', activ='tanh')
115 | 
116 |     return action, hidden
117 | 
118 | 
119 | # Recurrent Actors:
120 | def param_init_gru(options, prefix='ff', nin=None, nout=None, nhid=None):
121 | 
122 |     params = OrderedDict()
123 | 
124 |     if nin is None:
125 |         nin = 2 * options['dim'] + options['ctxdim']
126 | 
127 |     if nout is None:
128 |         nout = options['dim']
129 | 
130 |     if nhid is None:
131 |         nhid = options['act_hdim']
132 | 
133 |     # params = get_layer('lngru')[0](options, params, prefix=prefix + '_in',
134 |     #                                nin=nin, dim=nhid, scale=0.001)
135 |     params = get_layer('gru')[0](options, params, prefix=prefix + '_in',
136 |                                  nin=nin, dim=nhid, scale=0.001)
137 | 
138 |     params = get_layer('ff')[0](options, params, prefix=prefix + '_out',
139 |                                 nin=nhid, nout=nout, scale=0.001)
140 | 
141 |     return params
142 | 
143 | 
144 | def gru_actor(tparams, options, h1, ctx=None, act=None, prefix='ff'):
145 | 
146 |     pre_state, pre_action = act[:, :options['act_hdim']], act[:, options['act_hdim']:]
147 |     # hidden = get_layer('lngru')[1](tparams, concatenate([h1, ctx, pre_action], axis=1),
148 |     #                                options, prefix=prefix + '_in',
149 |     #                                one_step=True, _init_state=pre_state)[0]
150 |     hidden = get_layer('gru')[1](tparams, concatenate([h1, ctx, pre_action], axis=1),
151 |                                  options, prefix=prefix + '_in',
152 |                                  one_step=True, _init_state=pre_state)[0]
153 | 
154 |     action = get_layer('ff')[1](tparams, hidden,
155 |                                 options, prefix=prefix + '_out', activ='tanh')
156 |     cur_act = concatenate([hidden, action], axis=1)
157 |     return action, cur_act
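# Note on the recurrent actors: `act` packs the actor's recurrent state and its
# previous action into a single vector, act = [hidden (act_hdim dims), action],
# which is why gru_actor / gru_actor_hard / gg_actor slice it at options['act_hdim'].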
158 | 
159 | 
160 | # Recurrent Actor2
161 | def param_init_gru2(options, prefix='ff', nin=None, nout=None, nhid=None):
162 | 
163 |     params = OrderedDict()
164 | 
165 |     if nin is None:
166 |         nin = options['dim']
167 | 
168 |     if nout is None:
169 |         nout = options['dim']
170 | 
171 |     if nhid is None:
172 |         nhid = options['act_hdim']
173 | 
174 |     # params = get_layer('lngru')[0](options, params, prefix=prefix + '_in',
175 |     #                                nin=nin, dim=nhid, scale=0.001)
176 |     params = get_layer('gru')[0](options, params, prefix=prefix + '_in',
177 |                                  nin=nin, dim=nhid, scale=0.001)
178 | 
179 |     params = get_layer('ff')[0](options, params, prefix=prefix + '_out',
180 |                                 nin=nhid, nout=nout, scale=0.001)
181 | 
182 |     return params
183 | 
184 | 
185 | def gru_actor2(tparams, options, h1, act=None, prefix='ff'):
186 | 
187 |     # hidden = get_layer('lngru')[1](tparams, concatenate([h1, ctx, pre_action], axis=1),
188 |     #                                options, prefix=prefix + '_in',
189 |     #                                one_step=True, _init_state=pre_state)[0]
190 |     hidden = get_layer('gru')[1](tparams, h1,
191 |                                  options, prefix=prefix + '_in',
192 |                                  one_step=True, _init_state=act)[0]
193 | 
194 |     action = get_layer('ff')[1](tparams, hidden,
195 |                                 options, prefix=prefix + '_out', activ='tanh')
196 |     return action, hidden
197 | 
198 | 
199 | def gru_actor_hard(tparams, options, h1, ctx=None, act=None, prefix='ff', bound=0.1):
200 | 
201 |     pre_state, pre_action = act[:, :options['act_hdim']], act[:, options['act_hdim']:]
202 |     # hidden = get_layer('lngru')[2](tparams, concatenate([h1, ctx, pre_action], axis=1),
203 |     #                                options, prefix=prefix + '_in',
204 |     #                                one_step=True, _init_state=pre_state)[0]
205 |     hidden = get_layer('gru')[1](tparams, concatenate([h1, ctx, pre_action], axis=1),
206 |                                  options, prefix=prefix + '_in',
207 |                                  one_step=True, _init_state=pre_state)[0]
208 | 
209 |     action = get_layer('ff')[1](tparams, hidden,
210 |                                 options, prefix=prefix + '_out', activ='tanh')
211 | 
212 |     a_norm = tensor.sqrt(tensor.sum(action ** 2, axis=-1, keepdims=True))
213 |     action = tensor.switch(a_norm > bound, action / a_norm * bound, action)  # add a hard bound on the action norm
214 | 
215 |     cur_act = concatenate([hidden, action], axis=1)
216 |     return action, cur_act
217 | 
218 | 
219 | # Gated Recurrent Actors:
220 | def param_init_gg(options, prefix='ff', nin=None, nout=None, nhid=None):
221 | 
222 |     params = OrderedDict()
223 | 
224 |     if nin is None:
225 |         nin = 2 * options['dim'] + options['ctxdim']
226 | 
227 |     if nout is None:
228 |         nout = options['dim']
229 | 
230 |     if nhid is None:
231 |         nhid = options['act_hdim']
232 | 
233 |     # params = get_layer('lngru')[0](options, params, prefix=prefix + '_in',
234 |     #                                nin=nin, dim=nhid, scale=0.001)
235 |     params = get_layer('gru')[0](options, params, prefix=prefix + '_in',
236 |                                  nin=nin, dim=nhid, scale=0.001)
237 | 
238 |     params = get_layer('ff')[0](options, params, prefix=prefix + '_out',
239 |                                 nin=nhid, nout=nout, scale=0.001)
240 | 
241 |     # params = get_layer('ff')[0](options, params, prefix=prefix + '_gate',
242 |     #                             nin=nhid + nout, nout=1)
243 |     # params = get_layer('ff')[0](options, params, prefix=prefix + '_gate',
244 |     #                             nin=nin + nout, nout=1)
245 |     params = get_layer('ff')[0](options, params, prefix=prefix + '_gate',
246 |                                 nin=nin + nout, nout=nout)
247 | 
248 |     return params
249 | 
250 | 
251 | def gg_actor(tparams, options, h1, ctx=None, act=None, prefix='ff'):
252 | 
253 |     pre_state, pre_action = act[:, :options['act_hdim']], act[:, options['act_hdim']:]
254 |     # hidden = get_layer('lngru')[1](tparams, concatenate([h1, ctx, pre_action], axis=1),
255 |     #                                options, prefix=prefix + '_in',
256 |     #                                one_step=True, _init_state=pre_state)[0]
257 |     hidden = get_layer('gru')[1](tparams, concatenate([h1, ctx, pre_action], axis=1),
258 |                                  options, prefix=prefix + '_in',
259 |                                  one_step=True, _init_state=pre_state)[0]
260 | 
261 |     output = get_layer('ff')[1](tparams, hidden,
262 |                                 options, prefix=prefix + '_out', activ='tanh')
263 |     # gate = get_layer('ff')[1](tparams, concatenate([hidden, output], axis=1), options, prefix=prefix + '_gate', activ='sigmoid')[:, 0]
264 |     # gate = get_layer('ff')[1](tparams, concatenate([h1, ctx, pre_action, output], axis=1), options, prefix=prefix + '_gate', activ='sigmoid')[:, 0]
265 |     # action = output * gate[:, None]
266 |     gate = get_layer('ff')[1](tparams, concatenate([h1, ctx, pre_action, output], axis=1), options, prefix=prefix + '_gate', activ='sigmoid')
267 |     action = output * gate
268 |     cur_act = concatenate([hidden, action], axis=1)
269 |     return action, cur_act
270 | 
271 | 
272 | 
273 | 
274 | 
--------------------------------------------------------------------------------
/bleu.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nyu-dl/dl4mt-simul-trans/392ff3148e944be6fbc475d5285441807902e2e0/bleu.pyc
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
1 | """
2 | Configuration for Simultaneous Neural Machine Translation
3 | """
4 | from collections import OrderedDict
5 | 
6 | # data_home = '/home/thoma/scratch/un16/'
7 | # model_home = '/home/thoma/scratch/simul/'
8 | # data_home = '/mnt/scratch/un16/'
9 | # model_home = '/mnt/scratch/simul/'
10 | 
11 | data_home = '/misc/kcgscratch1/ChoGroup/thoma_data/simul_trans/un16/'
12 | model_home = '/misc/kcgscratch1/ChoGroup/thoma_data/simul_trans/'
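# NOTE: these are the authors' cluster paths (the commented alternatives above
# are earlier machines); point data_home / model_home at your own copies of the
# preprocessed corpora and saved models before running the training scripts.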
13 | 
14 | 
15 | def pretrain_config():
16 | 
17 |     """Configuration for pretraining the underlying NMT model."""
18 | 
19 |     config = dict()
20 | 
21 |     # training set (source, target)
22 |     config['datasets'] = [data_home + 'train.un16.en-zh.zh.c0.tok.clean.bpe20k.np',
23 |                           data_home + 'train.un16.en-zh.en.c0.tok.clean.bpe20k.np']
24 | 
25 |     # validation set (source, target)
26 |     config['valid_datasets'] = [data_home + 'devset.un16.en-zh.zh.c0.tok.bpe20k.np',
27 |                                 data_home + 'devset.un16.en-zh.en.c0.tok.bpe20k.np']
28 | 
29 |     # vocabulary (source, target)
30 |     config['dictionaries'] = [data_home + 'train.un16.en-zh.zh.c0.tok.clean.bpe20k.vocab.pkl',
31 |                               data_home + 'train.un16.en-zh.en.c0.tok.clean.bpe20k.vocab.pkl']
32 | 
33 |     # save the model to
34 |     config['saveto'] = data_home + 'pretraining/model_un16_bpe2k_uni_zh-en.npz'
35 |     config['reload_'] = True
36 | 
37 |     # model details
38 |     config['dim_word'] = 512
39 |     config['dim'] = 1028
40 |     config['n_words'] = 20000
41 |     config['n_words_src'] = 20000
42 | 
43 |     # learning details
44 |     config['decay_c'] = 0
45 |     config['clip_c'] = 1.
46 |     config['use_dropout'] = False
47 |     config['lrate'] = 0.0001
48 |     config['optimizer'] = 'adadelta'
49 |     config['patience'] = 1000
50 |     config['maxlen'] = 50
51 |     config['batch_size'] = 32
52 |     config['valid_batch_size'] = 64
53 |     config['validFreq'] = 1000
54 |     config['dispFreq'] = 50
55 |     config['saveFreq'] = 1000
56 |     config['sampleFreq'] = 99
57 | 
58 |     return config
59 | 
60 | 
61 | def rl_config():
62 |     """Configuration for training the agent with the REINFORCE algorithm."""
63 | 
64 |     config = OrderedDict()  # general configuration
65 | 
66 |     # work-space
67 |     config['workspace'] = model_home
68 | 
69 |     # training set (source, target); or leave it as None -- the agent will use the corpus saved with the model
70 |     config['datasets'] = [data_home + 'train.un16.en-zh.en.c0.tok.clean.bpe20k.np',
71 |                           data_home + 'train.un16.en-zh.zh.c0.tok.clean.bpe20k.np']
72 | 
73 |     # validation set (source, target); or leave it as None -- the agent will use the corpus saved with the model
74 |     config['valid_datasets'] = [data_home + 'devset.un16.en-zh.en.c0.tok.bpe20k.np',
75 |                                 data_home + 'devset.un16.en-zh.zh.c0.tok.bpe20k.np']
76 | 
77 |     # vocabulary (source, target); or leave it as None -- the agent will use the dictionary saved with the model
78 |     config['dictionaries'] = [data_home + 'train.un16.en-zh.en.c0.tok.clean.bpe20k.vocab.pkl',
79 |                               data_home + 'train.un16.en-zh.zh.c0.tok.clean.bpe20k.vocab.pkl']
80 | 
81 |     # pretrained model
82 |     config['model'] = model_home + '.pretrained/model_un16_bpe2k_uni_en-zh.npz'
83 |     config['option'] = model_home + '.pretrained/model_un16_bpe2k_uni_en-zh.npz.pkl'
84 | 
85 |     # critical training parameters.
86 |     config['sample'] = 10
87 |     config['batchsize'] = 10
88 |     config['rl_maxlen'] = 100
89 |     config['target_ap'] = 0.8   # 0.75 # target delay if using AP as reward.
90 |     config['target_cw'] = 8     # if cw > 0, use cw mode
91 | 
92 |     # under construction
93 |     config['forget'] = False
94 | 
95 |     # learning rate
96 |     config['lr_policy'] = 0.0002
97 |     config['lr_model'] = 0.00002
98 | 
99 |     # policy parameters
100 |     config['prop'] = 0.5         # leave it at its default
101 |     config['recurrent'] = True   # use a recurrent agent
102 |     config['layernorm'] = False  # layer normalization for the GRU agent.
103 |     config['updater'] = 'REINFORCE'  # 'TRPO' does not work well.
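    # (For orientation -- this note is an assumption about the general algorithm,
    #  not from the original source: REINFORCE estimates the policy gradient with
    #  the likelihood-ratio trick, grad J = E[(R - baseline) * grad log pi(a|s)],
    #  averaged over the `sample` rollouts drawn per sentence above; the reward R
    #  trades translation quality against delay, shaped by target_ap / target_cw.)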
104 |     config['act_mask'] = True    # leave it at its default
105 | 
106 |     # old model parameters (probably unused; leave them at their defaults)
107 |     config['step'] = 1
108 |     config['peek'] = 1
109 |     config['s0'] = 1
110 |     config['gamma'] = 1
111 |     config['Rtype'] = 10
112 |     config['maxsrc'] = 10
113 |     config['pre'] = False
114 |     config['coverage'] = False
115 |     config['upper'] = False
116 | 
117 |     config['finetune'] = True
118 |     config['train_gt'] = False   # when training with the ground truth, fix the random agent?
119 |     config['full_att'] = True
120 |     config['predict'] = True
121 | 
122 |     return config
123 | 
124 | 
125 | 
126 | 
127 | 
128 | 
129 | 
130 | 
131 | 
132 | 
133 | 
--------------------------------------------------------------------------------
/config.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nyu-dl/dl4mt-simul-trans/392ff3148e944be6fbc475d5285441807902e2e0/config.pyc
--------------------------------------------------------------------------------
/data/README.md:
--------------------------------------------------------------------------------
1 | Data pre-processing related scripts and utilities.
2 | 
3 | #### Setup
4 | The easiest way to set up your environment:
5 | 
6 | ```bash
7 | $ cd ~; mkdir codes; cd codes
8 | $ git clone https://github.com/nyu-dl/dl4mt-tutorial
9 | $ cd dl4mt-tutorial/data
10 | $ ./setup_local_env.sh
11 | ```
12 | 
13 | which will first clone this repository under `~/codes/dl4mt-tutorial`
14 | and then call the `setup_local_env.sh` script to retrieve the example data
15 | and preprocess it.
16 | 
17 | #### Pre-processing
18 | The following steps are executed by `setup_local_env.sh`:
19 | 1. Clone the `dl4mt-tutorial` repository (if not already cloned)
20 | 2. Download `europarl-v7.fr-en` (training) and `newstest2011` (development)
21 | 3. Preprocess the training and development sets
22 |    * Tokenize using the Moses tokenizer
23 |    * Shuffle the training set for SGD
24 |    * Build source and target dictionaries
25 | 
26 | #### Pre-processing with subword units
27 | If you want to use subword units (e.g. [Byte Pair Encoding](https://github.com/rsennrich/subword-nmt)) for source and target tokens, simply call:
28 | ```bash
29 | $ ./setup_local_env.sh -b
30 | ```
31 | which replaces the third step above and executes the following steps:
32 | 1. Clone the `dl4mt-tutorial` repository (if not already cloned)
33 | 2. Download `europarl-v7.fr-en` (training) and `newstest2011` (development)
34 | 3. Preprocess the training and development sets (`preprocess.sh`)
35 |    * Tokenize the source and target side of all bitext
36 |    * Learn BPE codes for both the source and target side using the training sets
37 |    * Encode the source and target side using the learned codes
38 |    * Shuffle the training set for SGD
39 |    * Build source and target dictionaries
40 | 
41 | In case you want to preprocess your own data using BPE, you can use the `preprocess.sh` script directly, or run the individual tools by hand (a rough sketch follows below).
42 | 
43 | For the usage and more details, please check the comments in the scripts.
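
As a rough sketch of the manual route (the corpus file names here are hypothetical; `tokenizer.perl` and `build_dictionary.py` ship in this directory):

```bash
# tokenize both sides with the Moses tokenizer (reads stdin, writes stdout)
$ perl tokenizer.perl -l en < corpus.en > corpus.tok.en
$ perl tokenizer.perl -l fr < corpus.fr > corpus.tok.fr
# build the vocabulary pickles expected by the training scripts
$ python build_dictionary.py corpus.tok.en corpus.tok.fr
```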
44 | 
--------------------------------------------------------------------------------
/data/build_dictionary.py:
--------------------------------------------------------------------------------
1 | import numpy
2 | import cPickle as pkl
3 | 
4 | import sys
5 | import fileinput
6 | 
7 | from collections import OrderedDict
8 | 
9 | def main():
10 |     for filename in sys.argv[1:]:
11 |         print 'Processing', filename
12 |         word_freqs = OrderedDict()
13 |         with open(filename, 'r') as f:
14 |             for line in f:
15 |                 words_in = line.strip().split(' ')
16 |                 for w in words_in:
17 |                     if w not in word_freqs:
18 |                         word_freqs[w] = 0
19 |                     word_freqs[w] += 1
20 |         words = word_freqs.keys()
21 |         freqs = word_freqs.values()
22 | 
23 |         sorted_idx = numpy.argsort(freqs)
24 |         sorted_words = [words[ii] for ii in sorted_idx[::-1]]
25 | 
26 |         worddict = OrderedDict()
27 |         worddict['eos'] = 0
28 |         worddict['UNK'] = 1
29 |         for ii, ww in enumerate(sorted_words):
30 |             worddict[ww] = ii+2
31 | 
32 |         with open('%s.pkl'%filename, 'wb') as f:
33 |             pkl.dump(worddict, f)
34 | 
35 |         print 'Done'
36 | 
37 | if __name__ == '__main__':
38 |     main()
39 | 
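# Usage: python build_dictionary.py corpus.tok.en corpus.tok.fr
# For each input file this writes <file>.pkl, an OrderedDict mapping
# 'eos' -> 0, 'UNK' -> 1, and every other word to 2, 3, ... in order of
# decreasing corpus frequency.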
--------------------------------------------------------------------------------
/data/download_files.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | 
3 | import argparse
4 | import logging
5 | import os
6 | import tarfile
7 | import urllib2
8 | 
9 | TRAIN_DATA_URL = 'http://www.statmt.org/europarl/v7/fr-en.tgz'
10 | VALID_DATA_URL = 'http://matrix.statmt.org/test_sets/newstest2011.tgz'
11 | 
12 | parser = argparse.ArgumentParser(
13 |     description="""
14 | This script downloads parallel corpora given source and target language
15 | indicators. Adapted from,
16 | https://github.com/orhanf/blocks-examples/tree/master/machine_translation
17 | """, formatter_class=argparse.RawTextHelpFormatter)
18 | parser.add_argument("-s", "--source", type=str, help="Source language",
19 |                     default="fr")
20 | parser.add_argument("-t", "--target", type=str, help="Target language",
21 |                     default="en")
22 | parser.add_argument("--source-dev", type=str, default="newstest2011.fr",
23 |                     help="Source language dev filename")
24 | parser.add_argument("--target-dev", type=str, default="newstest2011.en",
25 |                     help="Target language dev filename")
26 | parser.add_argument("--outdir", type=str, default=".",
27 |                     help="Output directory")
28 | 
29 | 
30 | def download_and_write_file(url, file_name):
31 |     logger.info("Downloading [{}]".format(url))
32 |     if not os.path.exists(file_name):
33 |         path = os.path.dirname(file_name)
34 |         if not os.path.exists(path):
35 |             os.makedirs(path)
36 |         u = urllib2.urlopen(url)
37 |         f = open(file_name, 'wb')
38 |         meta = u.info()
39 |         file_size = int(meta.getheaders("Content-Length")[0])
40 |         logger.info("...saving to: %s Bytes: %s" % (file_name, file_size))
41 |         file_size_dl = 0
42 |         block_sz = 8192
43 |         while True:
44 |             buffer = u.read(block_sz)
45 |             if not buffer:
46 |                 break
47 |             file_size_dl += len(buffer)
48 |             f.write(buffer)
49 |             status = r"%10d [%3.2f%%]" % \
50 |                 (file_size_dl, file_size_dl * 100. / file_size)
51 |             status = status + chr(8)*(len(status)+1)
52 |             print status,
53 |         f.close()
54 |     else:
55 |         logger.info("...file exists [{}]".format(file_name))
56 | 
57 | 
58 | def extract_tar_file_to(file_to_extract, extract_into, names_to_look):
59 |     extracted_filenames = []
60 |     try:
61 |         logger.info("Extracting file [{}] into [{}]"
62 |                     .format(file_to_extract, extract_into))
63 |         tar = tarfile.open(file_to_extract, 'r')
64 |         src_trg_files = [ff for ff in tar.getnames()
65 |                          if any([ff.find(nn) > -1 for nn in names_to_look])]
66 |         if not len(src_trg_files):
67 |             raise ValueError("[{}] pair does not exist in the archive!"
68 |                              .format(src_trg_files))
69 |         for item in tar:
70 |             # extract only the source-target pair
71 |             if item.name in src_trg_files:
72 |                 file_path = os.path.join(extract_into, item.path)
73 |                 if not os.path.exists(file_path):
74 |                     logger.info("...extracting [{}] into [{}]"
75 |                                 .format(item.name, file_path))
76 |                     tar.extract(item, extract_into)
77 |                 else:
78 |                     logger.info("...file exists [{}]".format(file_path))
79 |                 extracted_filenames.append(
80 |                     os.path.join(extract_into, item.path))
81 |     except Exception as e:
82 |         logger.error("{}".format(str(e)))
83 |     return extracted_filenames
84 | 
85 | 
86 | def main():
87 |     train_data_file = os.path.join(args.outdir, 'train_data.tgz')
88 |     valid_data_file = os.path.join(args.outdir, 'valid_data.tgz')
89 | 
90 |     # Download europarl v7 and extract it
91 |     download_and_write_file(TRAIN_DATA_URL, train_data_file)
92 |     extract_tar_file_to(
93 |         train_data_file, os.path.dirname(train_data_file),
94 |         ["{}-{}".format(args.source, args.target)])
95 | 
96 |     # Download the development set and extract it
97 |     download_and_write_file(VALID_DATA_URL, valid_data_file)
98 |     extract_tar_file_to(
99 |         valid_data_file, os.path.dirname(valid_data_file),
100 |         [args.source_dev, args.target_dev])
101 | 
102 | 
103 | if __name__ == "__main__":
104 | 
105 |     logging.basicConfig(level=logging.INFO)
106 |     logger = logging.getLogger('prepare_data')
107 | 
108 |     args = parser.parse_args()
109 |     main()
110 | 
--------------------------------------------------------------------------------
/data/length.py:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | import numpy
4 | import sys
5 | 
6 | for name in sys.argv[1:]:
7 |     lens = []
8 |     with open(name, 'r') as f:
9 |         for ll in f:
10 |             lens.append(len(ll.strip().split(' ')))
11 |     print name, ' max ', numpy.max(lens), ' min ', numpy.min(lens), ' average ', numpy.mean(lens)
12 | 
13 | 
14 | 
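# Usage: python length.py file1 [file2 ...]
# Prints the max / min / average sentence length (in whitespace-separated
# tokens) of each input file.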
--------------------------------------------------------------------------------
/data/merge.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # This script merges all the bitext files in the current directory.
3 | # Source side files are concatenated into all_[src]-[trg].[src]
4 | # Target side files are concatenated into all_[src]-[trg].[trg]
5 | 
6 | if [ "$#" -ne 3 ]; then
7 |     echo ""
8 |     echo "Usage: $0 src trg path_to_data"
9 |     echo ""
10 |     exit 1
11 | fi
12 | 
13 | SRC=$1
14 | TRG=$2
15 | 
16 | DATA_DIR=$3
17 | 
18 | FSRC=${DATA_DIR}/all_${1}-${2}.${1}
19 | FTRG=${DATA_DIR}/all_${1}-${2}.${2}
20 | 
21 | echo "" > $FSRC
22 | for F in ${DATA_DIR}/*${1}-${2}.${1}
23 | do
24 |     if [ "$F" = "$FSRC" ]; then
25 |         echo "pass"
26 |     else
27 |         cat $F >> $FSRC
28 |     fi
29 | done
30 | 
31 | 
32 | echo "" > $FTRG
33 | for F in ${DATA_DIR}/*${1}-${2}.${2}
34 | do
35 |     if [ "$F" = "$FTRG" ]; then
36 |         echo "pass"
37 |     else
38 |         cat $F >> $FTRG
39 |     fi
40 | done
41 | 
--------------------------------------------------------------------------------
/data/multi-bleu.perl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | #
3 | # This file is part of moses. Its use is licensed under the GNU Lesser General
4 | # Public License version 2.1 or, at your option, any later version.
5 | 
6 | # $Id$
7 | use warnings;
8 | use strict;
9 | 
10 | my $lowercase = 0;
11 | if ($ARGV[0] eq "-lc") {
12 |     $lowercase = 1;
13 |     shift;
14 | }
15 | 
16 | my $stem = $ARGV[0];
17 | if (!defined $stem) {
18 |     print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n";
19 |     print STDERR "Reads the references from reference or reference0, reference1, ...\n";
20 |     exit(1);
21 | }
22 | 
23 | $stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0";
24 | 
25 | my @REF;
26 | my $ref=0;
27 | while(-e "$stem$ref") {
28 |     &add_to_ref("$stem$ref",\@REF);
29 |     $ref++;
30 | }
31 | &add_to_ref($stem,\@REF) if -e $stem;
32 | die("ERROR: could not find reference file $stem") unless scalar @REF;
33 | 
34 | sub add_to_ref {
35 |     my ($file,$REF) = @_;
36 |     my $s=0;
37 |     open(REF,$file) or die "Can't read $file";
38 |     while(<REF>) {
39 |         chop;
40 |         push @{$$REF[$s++]}, $_;
41 |     }
42 |     close(REF);
43 | }
44 | 
45 | my(@CORRECT,@TOTAL,$length_translation,$length_reference);
46 | my $s=0;
47 | while(<STDIN>) {
48 |     chop;
49 |     $_ = lc if $lowercase;
50 |     my @WORD = split;
51 |     my %REF_NGRAM = ();
52 |     my $length_translation_this_sentence = scalar(@WORD);
53 |     my ($closest_diff,$closest_length) = (9999,9999);
54 |     foreach my $reference (@{$REF[$s]}) {
55 |         # print "$s $_ <=> $reference\n";
56 |         $reference = lc($reference) if $lowercase;
57 |         my @WORD = split(' ',$reference);
58 |         my $length = scalar(@WORD);
59 |         my $diff = abs($length_translation_this_sentence-$length);
60 |         if ($diff < $closest_diff) {
61 |             $closest_diff = $diff;
62 |             $closest_length = $length;
63 |             # print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n";
64 |         } elsif ($diff == $closest_diff) {
65 |             $closest_length = $length if $length < $closest_length;
66 |             # from two references with the same closeness to me
67 |             # take the *shorter* into account, not the "first" one.
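            # (This closest-length bookkeeping feeds BLEU's brevity penalty:
            #  the effective reference length is the one closest to the
            #  hypothesis length, with ties resolved toward the shorter one.)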
68 |         }
69 |         for(my $n=1;$n<=4;$n++) {
70 |             my %REF_NGRAM_N = ();
71 |             for(my $start=0;$start<=$#WORD-($n-1);$start++) {
72 |                 my $ngram = "$n";
73 |                 for(my $w=0;$w<$n;$w++) {
74 |                     $ngram .= " ".$WORD[$start+$w];
75 |                 }
76 |                 $REF_NGRAM_N{$ngram}++;
77 |             }
78 |             foreach my $ngram (keys %REF_NGRAM_N) {
79 |                 if (!defined($REF_NGRAM{$ngram}) ||
80 |                     $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) {
81 |                     $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram};
82 |                     # print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}<BR>\n";
83 |                 }
84 |             }
85 |         }
86 |     }
87 |     $length_translation += $length_translation_this_sentence;
88 |     $length_reference += $closest_length;
89 |     for(my $n=1;$n<=4;$n++) {
90 |         my %T_NGRAM = ();
91 |         for(my $start=0;$start<=$#WORD-($n-1);$start++) {
92 |             my $ngram = "$n";
93 |             for(my $w=0;$w<$n;$w++) {
94 |                 $ngram .= " ".$WORD[$start+$w];
95 |             }
96 |             $T_NGRAM{$ngram}++;
97 |         }
98 |         foreach my $ngram (keys %T_NGRAM) {
99 |             $ngram =~ /^(\d+) /;
100 |             my $n = $1;
101 |             # my $corr = 0;
102 |             # print "$i e $ngram $T_NGRAM{$ngram}<BR>\n";
103 |             $TOTAL[$n] += $T_NGRAM{$ngram};
104 |             if (defined($REF_NGRAM{$ngram})) {
105 |                 if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) {
106 |                     $CORRECT[$n] += $T_NGRAM{$ngram};
107 |                     # $corr = $T_NGRAM{$ngram};
108 |                     # print "$i e correct1 $T_NGRAM{$ngram}<BR>\n";
109 |                 }
110 |                 else {
111 |                     $CORRECT[$n] += $REF_NGRAM{$ngram};
112 |                     # $corr = $REF_NGRAM{$ngram};
113 |                     # print "$i e correct2 $REF_NGRAM{$ngram}<BR>\n";
114 |                 }
115 |             }
116 |             # $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram};
117 |             # print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n"
118 |         }
119 |     }
120 |     $s++;
121 | }
122 | my $brevity_penalty = 1;
123 | my $bleu = 0;
124 | 
125 | my @bleu=();
126 | 
127 | for(my $n=1;$n<=4;$n++) {
128 |     if (defined ($TOTAL[$n])){
129 |         $bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0;
130 |         # print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n";
131 |     }else{
132 |         $bleu[$n]=0;
133 |     }
134 | }
135 | 
136 | if ($length_reference==0){
137 |     printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n";
138 |     exit(1);
139 | }
140 | 
141 | if ($length_translation<$length_reference) {
142 |     $brevity_penalty = exp(1-$length_reference/$length_translation);
143 | }
144 | $bleu = $brevity_penalty * exp((my_log( $bleu[1] ) +
145 |                                 my_log( $bleu[2] ) +
146 |                                 my_log( $bleu[3] ) +
147 |                                 my_log( $bleu[4] ) ) / 4) ;
148 | printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n",
149 |     100*$bleu,
150 |     100*$bleu[1],
151 |     100*$bleu[2],
152 |     100*$bleu[3],
153 |     100*$bleu[4],
154 |     $brevity_penalty,
155 |     $length_translation / $length_reference,
156 |     $length_translation,
157 |     $length_reference;
158 | 
159 | sub my_log {
160 |     return -9999999999 unless $_[0];
161 |     return log($_[0]);
162 | }
163 | 
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/README.txt:
--------------------------------------------------------------------------------
1 | The language suffix can be found here:
2 | 
3 | http://www.loc.gov/standards/iso639-2/php/code_list.php
4 | 
5 | This code includes data from Daniel Naber's Language Tools (Czech abbreviations).
6 | This code includes data from Czech Wiktionary (also Czech abbreviations).
7 | 
8 | 
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.ca:
--------------------------------------------------------------------------------
1 | Dr 2 | Dra 3 | pàg 4 | p 5 | c 6 | av 7 | Sr 8 | Sra 9 | adm 10 | esq 11 | Prof 12 | S.A 13 | S.L 14 | p.e 15 | ptes 16 | Sta 17 | St 18 | pl 19 | màx 20 | cast 21 | dir 22 | nre 23 | fra 24 | admdora 25 | Emm 26 | Excma 27 | espf 28 | dc 29 | admdor 30 | tel 31 | angl 32 | aprox 33 | ca 34 | dept 35 | dj 36 | dl 37 | dt 38 | ds 39 | dg 40 | dv 41 | ed 42 | entl 43 | al 44 | i.e 45 | maj 46 | smin 47 | n 48 | núm 49 | pta 50 | A 51 | B 52 | C 53 | D 54 | E 55 | F 56 | G 57 | H 58 | I 59 | J 60 | K 61 | L 62 | M 63 | N 64 | O 65 | P 66 | Q 67 | R 68 | S 69 | T 70 | U 71 | V 72 | W 73 | X 74 | Y 75 | Z 76 |
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.cs:
--------------------------------------------------------------------------------
1 | Bc 2 | BcA 3 | Ing 4 | Ing.arch 5 | MUDr 6 | MVDr 7 | MgA 8 | Mgr 9 | JUDr 10 | PhDr 11 | RNDr 12 | PharmDr 13 | ThLic 14 | ThDr 15 | Ph.D 16 | Th.D 17 | prof 18 | doc 19 | CSc 20 | DrSc 21 | dr. h.
c 22 | PaedDr 23 | Dr 24 | PhMr 25 | DiS 26 | abt 27 | ad 28 | a.i 29 | aj 30 | angl 31 | anon 32 | apod 33 | atd 34 | atp 35 | aut 36 | bd 37 | biogr 38 | b.m 39 | b.p 40 | b.r 41 | cca 42 | cit 43 | cizojaz 44 | c.k 45 | col 46 | čes 47 | čín 48 | čj 49 | ed 50 | facs 51 | fasc 52 | fol 53 | fot 54 | franc 55 | h.c 56 | hist 57 | hl 58 | hrsg 59 | ibid 60 | il 61 | ind 62 | inv.č 63 | jap 64 | jhdt 65 | jv 66 | koed 67 | kol 68 | korej 69 | kl 70 | krit 71 | lat 72 | lit 73 | m.a 74 | maď 75 | mj 76 | mp 77 | násl 78 | např 79 | nepubl 80 | něm 81 | no 82 | nr 83 | n.s 84 | okr 85 | odd 86 | odp 87 | obr 88 | opr 89 | orig 90 | phil 91 | pl 92 | pokrač 93 | pol 94 | port 95 | pozn 96 | př.kr 97 | př.n.l 98 | přel 99 | přeprac 100 | příl 101 | pseud 102 | pt 103 | red 104 | repr 105 | resp 106 | revid 107 | rkp 108 | roč 109 | roz 110 | rozš 111 | samost 112 | sect 113 | sest 114 | seš 115 | sign 116 | sl 117 | srv 118 | stol 119 | sv 120 | šk 121 | šk.ro 122 | špan 123 | tab 124 | t.č 125 | tis 126 | tj 127 | tř 128 | tzv 129 | univ 130 | uspoř 131 | vol 132 | vl.jm 133 | vs 134 | vyd 135 | vyobr 136 | zal 137 | zejm 138 | zkr 139 | zprac 140 | zvl 141 | n.p 142 | např 143 | než 144 | MUDr 145 | abl 146 | absol 147 | adj 148 | adv 149 | ak 150 | ak. sl 151 | akt 152 | alch 153 | amer 154 | anat 155 | angl 156 | anglosas 157 | arab 158 | arch 159 | archit 160 | arg 161 | astr 162 | astrol 163 | att 164 | bás 165 | belg 166 | bibl 167 | biol 168 | boh 169 | bot 170 | bulh 171 | círk 172 | csl 173 | č 174 | čas 175 | čes 176 | dat 177 | děj 178 | dep 179 | dět 180 | dial 181 | dór 182 | dopr 183 | dosl 184 | ekon 185 | epic 186 | etnonym 187 | eufem 188 | f 189 | fam 190 | fem 191 | fil 192 | film 193 | form 194 | fot 195 | fr 196 | fut 197 | fyz 198 | gen 199 | geogr 200 | geol 201 | geom 202 | germ 203 | gram 204 | hebr 205 | herald 206 | hist 207 | hl 208 | hovor 209 | hud 210 | hut 211 | chcsl 212 | chem 213 | ie 214 | imp 215 | impf 216 | ind 217 | indoevr 218 | inf 219 | instr 220 | interj 221 | ión 222 | iron 223 | it 224 | kanad 225 | katalán 226 | klas 227 | kniž 228 | komp 229 | konj 230 | 231 | konkr 232 | kř 233 | kuch 234 | lat 235 | lék 236 | les 237 | lid 238 | lit 239 | liturg 240 | lok 241 | log 242 | m 243 | mat 244 | meteor 245 | metr 246 | mod 247 | ms 248 | mysl 249 | n 250 | náb 251 | námoř 252 | neklas 253 | něm 254 | nesklon 255 | nom 256 | ob 257 | obch 258 | obyč 259 | ojed 260 | opt 261 | part 262 | pas 263 | pejor 264 | pers 265 | pf 266 | pl 267 | plpf 268 | 269 | práv 270 | prep 271 | předl 272 | přivl 273 | r 274 | rcsl 275 | refl 276 | reg 277 | rkp 278 | ř 279 | řec 280 | s 281 | samohl 282 | sg 283 | sl 284 | souhl 285 | spec 286 | srov 287 | stfr 288 | střv 289 | stsl 290 | subj 291 | subst 292 | superl 293 | sv 294 | sz 295 | táz 296 | tech 297 | telev 298 | teol 299 | trans 300 | typogr 301 | var 302 | vedl 303 | verb 304 | vl. jm 305 | voj 306 | vok 307 | vůb 308 | vulg 309 | výtv 310 | vztaž 311 | zahr 312 | zájm 313 | zast 314 | zejm 315 | 316 | zeměd 317 | zkr 318 | zř 319 | mj 320 | dl 321 | atp 322 | sport 323 | Mgr 324 | horn 325 | MVDr 326 | JUDr 327 | RSDr 328 | Bc 329 | PhDr 330 | ThDr 331 | Ing 332 | aj 333 | apod 334 | PharmDr 335 | pomn 336 | ev 337 | slang 338 | nprap 339 | odp 340 | dop 341 | pol 342 | st 343 | stol 344 | p. n. l 345 | před n. l 346 | n. l 347 | př. Kr 348 | po Kr 349 | př. n. l 350 | odd 351 | RNDr 352 | tzv 353 | atd 354 | tzn 355 | resp 356 | tj 357 | p 358 | br 359 | č. j 360 | čj 361 | č. p 362 | čp 363 | a. 
s 364 | s. r. o 365 | spol. s r. o 366 | p. o 367 | s. p 368 | v. o. s 369 | k. s 370 | o. p. s 371 | o. s 372 | v. r 373 | v z 374 | ml 375 | vč 376 | kr 377 | mld 378 | hod 379 | popř 380 | ap 381 | event 382 | rus 383 | slov 384 | rum 385 | švýc 386 | P. T 387 | zvl 388 | hor 389 | dol 390 | S.O.S -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.de: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | #no german words end in single lower-case letters, so we throw those in too. 7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | 61 | #Roman Numerals. A dot after one of these is not a sentence break in German. 62 | I 63 | II 64 | III 65 | IV 66 | V 67 | VI 68 | VII 69 | VIII 70 | IX 71 | X 72 | XI 73 | XII 74 | XIII 75 | XIV 76 | XV 77 | XVI 78 | XVII 79 | XVIII 80 | XIX 81 | XX 82 | i 83 | ii 84 | iii 85 | iv 86 | v 87 | vi 88 | vii 89 | viii 90 | ix 91 | x 92 | xi 93 | xii 94 | xiii 95 | xiv 96 | xv 97 | xvi 98 | xvii 99 | xviii 100 | xix 101 | xx 102 | 103 | #Titles and Honorifics 104 | Adj 105 | Adm 106 | Adv 107 | Asst 108 | Bart 109 | Bldg 110 | Brig 111 | Bros 112 | Capt 113 | Cmdr 114 | Col 115 | Comdr 116 | Con 117 | Corp 118 | Cpl 119 | DR 120 | Dr 121 | Ens 122 | Gen 123 | Gov 124 | Hon 125 | Hosp 126 | Insp 127 | Lt 128 | MM 129 | MR 130 | MRS 131 | MS 132 | Maj 133 | Messrs 134 | Mlle 135 | Mme 136 | Mr 137 | Mrs 138 | Ms 139 | Msgr 140 | Op 141 | Ord 142 | Pfc 143 | Ph 144 | Prof 145 | Pvt 146 | Rep 147 | Reps 148 | Res 149 | Rev 150 | Rt 151 | Sen 152 | Sens 153 | Sfc 154 | Sgt 155 | Sr 156 | St 157 | Supt 158 | Surg 159 | 160 | #Misc symbols 161 | Mio 162 | Mrd 163 | bzw 164 | v 165 | vs 166 | usw 167 | d.h 168 | z.B 169 | u.a 170 | etc 171 | Mrd 172 | MwSt 173 | ggf 174 | d.J 175 | D.h 176 | m.E 177 | vgl 178 | I.F 179 | z.T 180 | sogen 181 | ff 182 | u.E 183 | g.U 184 | g.g.A 185 | c.-à-d 186 | Buchst 187 | u.s.w 188 | sog 189 | u.ä 190 | Std 191 | evtl 192 | Zt 193 | Chr 194 | u.U 195 | o.ä 196 | Ltd 197 | b.A 198 | z.Zt 199 | spp 200 | sen 201 | SA 202 | k.o 203 | jun 204 | i.H.v 205 | dgl 206 | dergl 207 | Co 208 | zzt 209 | usf 210 | s.p.a 211 | Dkr 212 | Corp 213 | bzgl 214 | BSE 215 | 216 | #Number indicators 217 | # add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it 218 | No 219 | Nos 220 | Art 221 | Nr 222 | pp 223 | ca 224 | Ca 225 | 226 | #Ordinals are done with . in German - "1." 
= "1st" in English 227 | 1 228 | 2 229 | 3 230 | 4 231 | 5 232 | 6 233 | 7 234 | 8 235 | 9 236 | 10 237 | 11 238 | 12 239 | 13 240 | 14 241 | 15 242 | 16 243 | 17 244 | 18 245 | 19 246 | 20 247 | 21 248 | 22 249 | 23 250 | 24 251 | 25 252 | 26 253 | 27 254 | 28 255 | 29 256 | 30 257 | 31 258 | 32 259 | 33 260 | 34 261 | 35 262 | 36 263 | 37 264 | 38 265 | 39 266 | 40 267 | 41 268 | 42 269 | 43 270 | 44 271 | 45 272 | 46 273 | 47 274 | 48 275 | 49 276 | 50 277 | 51 278 | 52 279 | 53 280 | 54 281 | 55 282 | 56 283 | 57 284 | 58 285 | 59 286 | 60 287 | 61 288 | 62 289 | 63 290 | 64 291 | 65 292 | 66 293 | 67 294 | 68 295 | 69 296 | 70 297 | 71 298 | 72 299 | 73 300 | 74 301 | 75 302 | 76 303 | 77 304 | 78 305 | 79 306 | 80 307 | 81 308 | 82 309 | 83 310 | 84 311 | 85 312 | 86 313 | 87 314 | 88 315 | 89 316 | 90 317 | 91 318 | 92 319 | 93 320 | 94 321 | 95 322 | 96 323 | 97 324 | 98 325 | 99 326 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.el: -------------------------------------------------------------------------------- 1 | # Sigle letters in upper-case are usually abbreviations of names 2 | Α 3 | Β 4 | Γ 5 | Δ 6 | Ε 7 | Ζ 8 | Η 9 | Θ 10 | Ι 11 | Κ 12 | Λ 13 | Μ 14 | Ν 15 | Ξ 16 | Ο 17 | Π 18 | Ρ 19 | Σ 20 | Τ 21 | Υ 22 | Φ 23 | Χ 24 | Ψ 25 | Ω 26 | 27 | # Includes abbreviations for the Greek language compiled from various sources (Greek grammar books, Greek language related web content). 28 | Άθαν 29 | Έγχρ 30 | Έκθ 31 | Έσδ 32 | Έφ 33 | Όμ 34 | Α΄Έσδρ 35 | Α΄Έσδ 36 | Α΄Βασ 37 | Α΄Θεσ 38 | Α΄Ιω 39 | Α΄Κορινθ 40 | Α΄Κορ 41 | Α΄Μακκ 42 | Α΄Μακ 43 | Α΄Πέτρ 44 | Α΄Πέτ 45 | Α΄Παραλ 46 | Α΄Πε 47 | Α΄Σαμ 48 | Α΄Τιμ 49 | Α΄Χρον 50 | Α΄Χρ 51 | Α.Β.Α 52 | Α.Β 53 | Α.Ε 54 | Α.Κ.Τ.Ο 55 | Αέθλ 56 | Αέτ 57 | Αίλ.Δ 58 | Αίλ.Τακτ 59 | Αίσ 60 | Αββακ 61 | Αβυδ 62 | Αβ 63 | Αγάκλ 64 | Αγάπ 65 | Αγάπ.Αμαρτ.Σ 66 | Αγάπ.Γεωπ 67 | Αγαθάγγ 68 | Αγαθήμ 69 | Αγαθιν 70 | Αγαθοκλ 71 | Αγαθρχ 72 | Αγαθ 73 | Αγαθ.Ιστ 74 | Αγαλλ 75 | Αγαπητ 76 | Αγγ 77 | Αγησ 78 | Αγλ 79 | Αγορ.Κ 80 | Αγρο.Κωδ 81 | Αγρ.Εξ 82 | Αγρ.Κ 83 | Αγ.Γρ 84 | Αδριαν 85 | Αδρ 86 | Αετ 87 | Αθάν 88 | Αθήν 89 | Αθήν.Επιγρ 90 | Αθήν.Επιτ 91 | Αθήν.Ιατρ 92 | Αθήν.Μηχ 93 | Αθανάσ 94 | Αθαν 95 | Αθηνί 96 | Αθηναγ 97 | Αθηνόδ 98 | Αθ 99 | Αθ.Αρχ 100 | Αιλ 101 | Αιλ.Επιστ 102 | Αιλ.ΖΙ 103 | Αιλ.ΠΙ 104 | Αιλ.απ 105 | Αιμιλ 106 | Αιν.Γαζ 107 | Αιν.Τακτ 108 | Αισχίν 109 | Αισχίν.Επιστ 110 | Αισχ 111 | Αισχ.Αγαμ 112 | Αισχ.Αγ 113 | Αισχ.Αλ 114 | Αισχ.Ελεγ 115 | Αισχ.Επτ.Θ 116 | Αισχ.Ευμ 117 | Αισχ.Ικέτ 118 | Αισχ.Ικ 119 | Αισχ.Περσ 120 | Αισχ.Προμ.Δεσμ 121 | Αισχ.Πρ 122 | Αισχ.Χοηφ 123 | Αισχ.Χο 124 | Αισχ.απ 125 | ΑιτΕ 126 | Αιτ 127 | Αλκ 128 | Αλχιας 129 | Αμ.Π.Ο 130 | Αμβ 131 | Αμμών 132 | Αμ. 
133 | Αν.Πειθ.Συμβ.Δικ 134 | Ανακρ 135 | Ανακ 136 | Αναμν.Τόμ 137 | Αναπλ 138 | Ανδ 139 | Ανθλγος 140 | Ανθστης 141 | Αντισθ 142 | Ανχης 143 | Αν 144 | Αποκ 145 | Απρ 146 | Απόδ 147 | Απόφ 148 | Απόφ.Νομ 149 | Απ 150 | Απ.Δαπ 151 | Απ.Διατ 152 | Απ.Επιστ 153 | Αριθ 154 | Αριστοτ 155 | Αριστοφ 156 | Αριστοφ.Όρν 157 | Αριστοφ.Αχ 158 | Αριστοφ.Βάτρ 159 | Αριστοφ.Ειρ 160 | Αριστοφ.Εκκλ 161 | Αριστοφ.Θεσμ 162 | Αριστοφ.Ιππ 163 | Αριστοφ.Λυσ 164 | Αριστοφ.Νεφ 165 | Αριστοφ.Πλ 166 | Αριστοφ.Σφ 167 | Αριστ 168 | Αριστ.Αθ.Πολ 169 | Αριστ.Αισθ 170 | Αριστ.Αν.Πρ 171 | Αριστ.Ζ.Ι 172 | Αριστ.Ηθ.Ευδ 173 | Αριστ.Ηθ.Νικ 174 | Αριστ.Κατ 175 | Αριστ.Μετ 176 | Αριστ.Πολ 177 | Αριστ.Φυσιογν 178 | Αριστ.Φυσ 179 | Αριστ.Ψυχ 180 | Αριστ.Ρητ 181 | Αρμεν 182 | Αρμ 183 | Αρχ.Εκ.Καν.Δ 184 | Αρχ.Ευβ.Μελ 185 | Αρχ.Ιδ.Δ 186 | Αρχ.Νομ 187 | Αρχ.Ν 188 | Αρχ.Π.Ε 189 | Αρ 190 | Αρ.Φορ.Μητρ 191 | Ασμ 192 | Ασμ.ασμ 193 | Αστ.Δ 194 | Αστ.Χρον 195 | Ασ 196 | Ατομ.Γνωμ 197 | Αυγ 198 | Αφρ 199 | Αχ.Νομ 200 | Α 201 | Α.Εγχ.Π 202 | Α.Κ.΄Υδρας 203 | Β΄Έσδρ 204 | Β΄Έσδ 205 | Β΄Βασ 206 | Β΄Θεσ 207 | Β΄Ιω 208 | Β΄Κορινθ 209 | Β΄Κορ 210 | Β΄Μακκ 211 | Β΄Μακ 212 | Β΄Πέτρ 213 | Β΄Πέτ 214 | Β΄Πέ 215 | Β΄Παραλ 216 | Β΄Σαμ 217 | Β΄Τιμ 218 | Β΄Χρον 219 | Β΄Χρ 220 | Β.Ι.Π.Ε 221 | Β.Κ.Τ 222 | Β.Κ.Ψ.Β 223 | Β.Μ 224 | Β.Ο.Α.Κ 225 | Β.Ο.Α 226 | Β.Ο.Δ 227 | Βίβλ 228 | Βαρ 229 | ΒεΘ 230 | Βι.Περ 231 | Βιπερ 232 | Βιργ 233 | Βλγ 234 | Βούλ 235 | Βρ 236 | Γ΄Βασ 237 | Γ΄Μακκ 238 | ΓΕΝμλ 239 | Γέν 240 | Γαλ 241 | Γεν 242 | Γλ 243 | Γν.Ν.Σ.Κρ 244 | Γνωμ 245 | Γν 246 | Γράμμ 247 | Γρηγ.Ναζ 248 | Γρηγ.Νύσ 249 | Γ Νοσ 250 | Γ' Ογκολ 251 | Γ.Ν 252 | Δ΄Βασ 253 | Δ.Β 254 | Δ.Δίκη 255 | Δ.Δίκ 256 | Δ.Ε.Σ 257 | Δ.Ε.Φ.Α 258 | Δ.Ε.Φ 259 | Δ.Εργ.Ν 260 | Δαμ 261 | Δαμ.μνημ.έργ 262 | Δαν 263 | Δασ.Κ 264 | Δεκ 265 | Δελτ.Δικ.Ε.Τ.Ε 266 | Δελτ.Νομ 267 | Δελτ.Συνδ.Α.Ε 268 | Δερμ 269 | Δευτ 270 | Δεύτ 271 | Δημοσθ 272 | Δημόκρ 273 | Δι.Δικ 274 | Διάτ 275 | Διαιτ.Απ 276 | Διαιτ 277 | Διαρκ.Στρατ 278 | Δικ 279 | Διοίκ.Πρωτ 280 | ΔιοικΔνη 281 | Διοικ.Εφ 282 | Διον.Αρ 283 | Διόρθ.Λαθ 284 | Δ.κ.Π 285 | Δνη 286 | Δν 287 | Δογμ.Όρος 288 | Δρ 289 | Δ.τ.Α 290 | Δτ 291 | ΔωδΝομ 292 | Δ.Περ 293 | Δ.Στρ 294 | ΕΔΠολ 295 | ΕΕυρΚ 296 | ΕΙΣ 297 | ΕΝαυτΔ 298 | ΕΣΑμΕΑ 299 | ΕΣΘ 300 | ΕΣυγκΔ 301 | ΕΤρΑξΧρΔ 302 | Ε.Φ.Ε.Τ 303 | Ε.Φ.Ι 304 | Ε.Φ.Ο.Επ.Α 305 | Εβδ 306 | Εβρ 307 | Εγκύκλ.Επιστ 308 | Εγκ 309 | Εε.Αιγ 310 | Εθν.Κ.Τ 311 | Εθν 312 | Ειδ.Δικ.Αγ.Κακ 313 | Εικ 314 | Ειρ.Αθ 315 | Ειρην.Αθ 316 | Ειρην 317 | Έλεγχ 318 | Ειρ 319 | Εισ.Α.Π 320 | Εισ.Ε 321 | Εισ.Ν.Α.Κ 322 | Εισ.Ν.Κ.Πολ.Δ 323 | Εισ.Πρωτ 324 | Εισηγ.Έκθ 325 | Εισ 326 | Εκκλ 327 | Εκκ 328 | Εκ 329 | Ελλ.Δνη 330 | Εν.Ε 331 | Εξ 332 | Επ.Αν 333 | Επ.Εργ.Δ 334 | Επ.Εφ 335 | Επ.Κυπ.Δ 336 | Επ.Μεσ.Αρχ 337 | Επ.Νομ 338 | Επίκτ 339 | Επίκ 340 | Επι.Δ.Ε 341 | Επιθ.Ναυτ.Δικ 342 | Επικ 343 | Επισκ.Ε.Δ 344 | Επισκ.Εμπ.Δικ 345 | Επιστ.Επετ.Αρμ 346 | Επιστ.Επετ 347 | Επιστ.Ιερ 348 | Επιτρ.Προστ.Συνδ.Στελ 349 | Επιφάν 350 | Επτ.Εφ 351 | Επ.Ιρ 352 | Επ.Ι 353 | Εργ.Ασφ.Νομ 354 | Ερμ.Α.Κ 355 | Ερμη.Σ 356 | Εσθ 357 | Εσπερ 358 | Ετρ.Δ 359 | Ευκλ 360 | Ευρ.Δ.Δ.Α 361 | Ευρ.Σ.Δ.Α 362 | Ευρ.ΣτΕ 363 | Ευρατόμ 364 | Ευρ.Άλκ 365 | Ευρ.Ανδρομ 366 | Ευρ.Βάκχ 367 | Ευρ.Εκ 368 | Ευρ.Ελ 369 | Ευρ.Ηλ 370 | Ευρ.Ηρακ 371 | Ευρ.Ηρ 372 | Ευρ.Ηρ.Μαιν 373 | Ευρ.Ικέτ 374 | Ευρ.Ιππόλ 375 | Ευρ.Ιφ.Α 376 | Ευρ.Ιφ.Τ 377 | Ευρ.Ι.Τ 378 | Ευρ.Κύκλ 379 | Ευρ.Μήδ 380 | Ευρ.Ορ 381 | Ευρ.Ρήσ 382 | Ευρ.Τρωάδ 383 | Ευρ.Φοίν 384 | Εφ.Αθ 385 | Εφ.Εν 386 | Εφ.Επ 387 | Εφ.Θρ 388 | Εφ.Θ 389 | Εφ.Ι 390 | Εφ.Κερ 391 | Εφ.Κρ 392 | Εφ.Λ 393 | Εφ.Ν 394 | Εφ.Πατ 395 | Εφ.Πειρ 396 | 
Εφαρμ.Δ.Δ 397 | Εφαρμ 398 | Εφεσ 399 | Εφημ 400 | Εφ 401 | Ζαχ 402 | Ζιγ 403 | Ζυ 404 | Ζχ 405 | ΗΕ.Δ 406 | Ημερ 407 | Ηράκλ 408 | Ηροδ 409 | Ησίοδ 410 | Ησ 411 | Η.Ε.Γ 412 | ΘΗΣ 413 | ΘΡ 414 | Θαλ 415 | Θεοδ 416 | Θεοφ 417 | Θεσ 418 | Θεόδ.Μοψ 419 | Θεόκρ 420 | Θεόφιλ 421 | Θουκ 422 | Θρ 423 | Θρ.Ε 424 | Θρ.Ιερ 425 | Θρ.Ιρ 426 | Ιακ 427 | Ιαν 428 | Ιβ 429 | Ιδθ 430 | Ιδ 431 | Ιεζ 432 | Ιερ 433 | Ιζ 434 | Ιησ 435 | Ιησ.Ν 436 | Ικ 437 | Ιλ 438 | Ιν 439 | Ιουδ 440 | Ιουστ 441 | Ιούδα 442 | Ιούλ 443 | Ιούν 444 | Ιπποκρ 445 | Ιππόλ 446 | Ιρ 447 | Ισίδ.Πηλ 448 | Ισοκρ 449 | Ισ.Ν 450 | Ιωβ 451 | Ιωλ 452 | Ιων 453 | Ιω 454 | ΚΟΣ 455 | ΚΟ.ΜΕ.ΚΟΝ 456 | ΚΠοινΔ 457 | ΚΠολΔ 458 | ΚαΒ 459 | Καλ 460 | Καλ.Τέχν 461 | ΚανΒ 462 | Καν.Διαδ 463 | Κατάργ 464 | Κλ 465 | ΚοινΔ 466 | Κολσ 467 | Κολ 468 | Κον 469 | Κορ 470 | Κος 471 | ΚριτΕπιθ 472 | ΚριτΕ 473 | Κριτ 474 | Κρ 475 | ΚτΒ 476 | ΚτΕ 477 | ΚτΠ 478 | Κυβ 479 | Κυπρ 480 | Κύριλ.Αλεξ 481 | Κύριλ.Ιερ 482 | Λεβ 483 | Λεξ.Σουίδα 484 | Λευϊτ 485 | Λευ 486 | Λκ 487 | Λογ 488 | ΛουκΑμ 489 | Λουκιαν 490 | Λουκ.Έρωτ 491 | Λουκ.Ενάλ.Διάλ 492 | Λουκ.Ερμ 493 | Λουκ.Εταιρ.Διάλ 494 | Λουκ.Ε.Δ 495 | Λουκ.Θε.Δ 496 | Λουκ.Ικ. 497 | Λουκ.Ιππ 498 | Λουκ.Λεξιφ 499 | Λουκ.Μεν 500 | Λουκ.Μισθ.Συν 501 | Λουκ.Ορχ 502 | Λουκ.Περ 503 | Λουκ.Συρ 504 | Λουκ.Τοξ 505 | Λουκ.Τυρ 506 | Λουκ.Φιλοψ 507 | Λουκ.Φιλ 508 | Λουκ.Χάρ 509 | Λουκ. 510 | Λουκ.Αλ 511 | Λοχ 512 | Λυδ 513 | Λυκ 514 | Λυσ 515 | Λωζ 516 | Λ1 517 | Λ2 518 | ΜΟΕφ 519 | Μάρκ 520 | Μέν 521 | Μαλ 522 | Ματθ 523 | Μα 524 | Μιχ 525 | Μκ 526 | Μλ 527 | Μμ 528 | Μον.Δ.Π 529 | Μον.Πρωτ 530 | Μον 531 | Μρ 532 | Μτ 533 | Μχ 534 | Μ.Βασ 535 | Μ.Πλ 536 | ΝΑ 537 | Ναυτ.Χρον 538 | Να 539 | Νδικ 540 | Νεεμ 541 | Νε 542 | Νικ 543 | ΝκΦ 544 | Νμ 545 | ΝοΒ 546 | Νομ.Δελτ.Τρ.Ελ 547 | Νομ.Δελτ 548 | Νομ.Σ.Κ 549 | Νομ.Χρ 550 | Νομ 551 | Νομ.Διεύθ 552 | Νοσ 553 | Ντ 554 | Νόσων 555 | Ν1 556 | Ν2 557 | Ν3 558 | Ν4 559 | Νtot 560 | Ξενοφ 561 | Ξεν 562 | Ξεν.Ανάβ 563 | Ξεν.Απολ 564 | Ξεν.Απομν 565 | Ξεν.Απομ 566 | Ξεν.Ελλ 567 | Ξεν.Ιέρ 568 | Ξεν.Ιππαρχ 569 | Ξεν.Ιππ 570 | Ξεν.Κυρ.Αν 571 | Ξεν.Κύρ.Παιδ 572 | Ξεν.Κ.Π 573 | Ξεν.Λακ.Πολ 574 | Ξεν.Οικ 575 | Ξεν.Προσ 576 | Ξεν.Συμπόσ 577 | Ξεν.Συμπ 578 | Ο΄ 579 | Οβδ 580 | Οβ 581 | ΟικΕ 582 | Οικ 583 | Οικ.Πατρ 584 | Οικ.Σύν.Βατ 585 | Ολομ 586 | Ολ 587 | Ολ.Α.Π 588 | Ομ.Ιλ 589 | Ομ.Οδ 590 | ΟπΤοιχ 591 | Οράτ 592 | Ορθ 593 | ΠΡΟ.ΠΟ 594 | Πίνδ 595 | Πίνδ.Ι 596 | Πίνδ.Νεμ 597 | Πίνδ.Ν 598 | Πίνδ.Ολ 599 | Πίνδ.Παθ 600 | Πίνδ.Πυθ 601 | Πίνδ.Π 602 | ΠαγΝμλγ 603 | Παν 604 | Παρμ 605 | Παροιμ 606 | Παρ 607 | Παυσ 608 | Πειθ.Συμβ 609 | ΠειρΝ 610 | Πελ 611 | ΠεντΣτρ 612 | Πεντ 613 | Πεντ.Εφ 614 | ΠερΔικ 615 | Περ.Γεν.Νοσ 616 | Πετ 617 | Πλάτ 618 | Πλάτ.Αλκ 619 | Πλάτ.Αντ 620 | Πλάτ.Αξίοχ 621 | Πλάτ.Απόλ 622 | Πλάτ.Γοργ 623 | Πλάτ.Ευθ 624 | Πλάτ.Θεαίτ 625 | Πλάτ.Κρατ 626 | Πλάτ.Κριτ 627 | Πλάτ.Λύσ 628 | Πλάτ.Μεν 629 | Πλάτ.Νόμ 630 | Πλάτ.Πολιτ 631 | Πλάτ.Πολ 632 | Πλάτ.Πρωτ 633 | Πλάτ.Σοφ. 
634 | Πλάτ.Συμπ 635 | Πλάτ.Τίμ 636 | Πλάτ.Φαίδρ 637 | Πλάτ.Φιλ 638 | Πλημ 639 | Πλούτ 640 | Πλούτ.Άρατ 641 | Πλούτ.Αιμ 642 | Πλούτ.Αλέξ 643 | Πλούτ.Αλκ 644 | Πλούτ.Αντ 645 | Πλούτ.Αρτ 646 | Πλούτ.Ηθ 647 | Πλούτ.Θεμ 648 | Πλούτ.Κάμ 649 | Πλούτ.Καίσ 650 | Πλούτ.Κικ 651 | Πλούτ.Κράσ 652 | Πλούτ.Κ 653 | Πλούτ.Λυκ 654 | Πλούτ.Μάρκ 655 | Πλούτ.Μάρ 656 | Πλούτ.Περ 657 | Πλούτ.Ρωμ 658 | Πλούτ.Σύλλ 659 | Πλούτ.Φλαμ 660 | Πλ 661 | Ποιν.Δικ 662 | Ποιν.Δ 663 | Ποιν.Ν 664 | Ποιν.Χρον 665 | Ποιν.Χρ 666 | Πολ.Δ 667 | Πολ.Πρωτ 668 | Πολ 669 | Πολ.Μηχ 670 | Πολ.Μ 671 | Πρακτ.Αναθ 672 | Πρακτ.Ολ 673 | Πραξ 674 | Πρμ 675 | Πρξ 676 | Πρωτ 677 | Πρ 678 | Πρ.Αν 679 | Πρ.Λογ 680 | Πταισμ 681 | Πυρ.Καλ 682 | Πόλη 683 | Π.Δ 684 | Π.Δ.Άσμ 685 | ΡΜ.Ε 686 | Ρθ 687 | Ρμ 688 | Ρωμ 689 | ΣΠλημ 690 | Σαπφ 691 | Σειρ 692 | Σολ 693 | Σοφ 694 | Σοφ.Αντιγ 695 | Σοφ.Αντ 696 | Σοφ.Αποσ 697 | Σοφ.Απ 698 | Σοφ.Ηλέκ 699 | Σοφ.Ηλ 700 | Σοφ.Οιδ.Κολ 701 | Σοφ.Οιδ.Τύρ 702 | Σοφ.Ο.Τ 703 | Σοφ.Σειρ 704 | Σοφ.Σολ 705 | Σοφ.Τραχ 706 | Σοφ.Φιλοκτ 707 | Σρ 708 | Σ.τ.Ε 709 | Σ.τ.Π 710 | Στρ.Π.Κ 711 | Στ.Ευρ 712 | Συζήτ 713 | Συλλ.Νομολ 714 | Συλ.Νομ 715 | ΣυμβΕπιθ 716 | Συμπ.Ν 717 | Συνθ.Αμ 718 | Συνθ.Ε.Ε 719 | Συνθ.Ε.Κ 720 | Συνθ.Ν 721 | Σφν 722 | Σφ 723 | Σφ.Σλ 724 | Σχ.Πολ.Δ 725 | Σχ.Συντ.Ε 726 | Σωσ 727 | Σύντ 728 | Σ.Πληρ 729 | ΤΘ 730 | ΤΣ.Δ 731 | Τίτ 732 | Τβ 733 | Τελ.Ενημ 734 | Τελ.Κ 735 | Τερτυλ 736 | Τιμ 737 | Τοπ.Α 738 | Τρ.Ο 739 | Τριμ 740 | Τριμ.Πλ 741 | Τρ.Πλημ 742 | Τρ.Π.Δ 743 | Τ.τ.Ε 744 | Ττ 745 | Τωβ 746 | Υγ 747 | Υπερ 748 | Υπ 749 | Υ.Γ 750 | Φιλήμ 751 | Φιλιπ 752 | Φιλ 753 | Φλμ 754 | Φλ 755 | Φορ.Β 756 | Φορ.Δ.Ε 757 | Φορ.Δνη 758 | Φορ.Δ 759 | Φορ.Επ 760 | Φώτ 761 | Χρ.Ι.Δ 762 | Χρ.Ιδ.Δ 763 | Χρ.Ο 764 | Χρυσ 765 | Ψήφ 766 | Ψαλμ 767 | Ψαλ 768 | Ψλ 769 | Ωριγ 770 | Ωσ 771 | Ω.Ρ.Λ 772 | άγν 773 | άγν.ετυμολ 774 | άγ 775 | άκλ 776 | άνθρ 777 | άπ 778 | άρθρ 779 | άρν 780 | άρ 781 | άτ 782 | άψ 783 | ά 784 | έκδ 785 | έκφρ 786 | έμψ 787 | ένθ.αν 788 | έτ 789 | έ.α 790 | ίδ 791 | αβεστ 792 | αβησσ 793 | αγγλ 794 | αγγ 795 | αδημ 796 | αεροναυτ 797 | αερον 798 | αεροπ 799 | αθλητ 800 | αθλ 801 | αθροιστ 802 | αιγυπτ 803 | αιγ 804 | αιτιολ 805 | αιτ 806 | αι 807 | ακαδ 808 | ακκαδ 809 | αλβ 810 | αλλ 811 | αλφαβητ 812 | αμα 813 | αμερικ 814 | αμερ 815 | αμετάβ 816 | αμτβ 817 | αμφιβ 818 | αμφισβ 819 | αμφ 820 | αμ 821 | ανάλ 822 | ανάπτ 823 | ανάτ 824 | αναβ 825 | αναδαν 826 | αναδιπλασ 827 | αναδιπλ 828 | αναδρ 829 | αναλ 830 | αναν 831 | ανασυλλ 832 | ανατολ 833 | ανατομ 834 | ανατυπ 835 | ανατ 836 | αναφορ 837 | αναφ 838 | ανα.ε 839 | ανδρων 840 | ανθρωπολ 841 | ανθρωπ 842 | ανθ 843 | ανομ 844 | αντίτ 845 | αντδ 846 | αντιγρ 847 | αντιθ 848 | αντικ 849 | αντιμετάθ 850 | αντων 851 | αντ 852 | ανωτ 853 | ανόργ 854 | ανών 855 | αορ 856 | απαρέμφ 857 | απαρφ 858 | απαρχ 859 | απαρ 860 | απλολ 861 | απλοπ 862 | αποβ 863 | αποηχηροπ 864 | αποθ 865 | αποκρυφ 866 | αποφ 867 | απρμφ 868 | απρφ 869 | απρόσ 870 | απόδ 871 | απόλ 872 | απόσπ 873 | απόφ 874 | αραβοτουρκ 875 | αραβ 876 | αραμ 877 | αρβαν 878 | αργκ 879 | αριθμτ 880 | αριθμ 881 | αριθ 882 | αρκτικόλ 883 | αρκ 884 | αρμεν 885 | αρμ 886 | αρνητ 887 | αρσ 888 | αρχαιολ 889 | αρχιτεκτ 890 | αρχιτ 891 | αρχκ 892 | αρχ 893 | αρωμουν 894 | αρωμ 895 | αρ 896 | αρ.μετρ 897 | αρ.φ 898 | ασσυρ 899 | αστρολ 900 | αστροναυτ 901 | αστρον 902 | αττ 903 | αυστραλ 904 | αυτοπ 905 | αυτ 906 | αφγαν 907 | αφηρ 908 | αφομ 909 | αφρικ 910 | αχώρ 911 | αόρ 912 | α.α 913 | α/α 914 | α0 915 | βαθμ 916 | βαθ 917 | βαπτ 918 | βασκ 919 | βεβαιωτ 920 | βεβ 921 | βεδ 922 | βενετ 923 | βεν 924 | 
βερβερ 925 | βιβλγρ 926 | βιολ 927 | βιομ 928 | βιοχημ 929 | βιοχ 930 | βλάχ 931 | βλ 932 | βλ.λ 933 | βοταν 934 | βοτ 935 | βουλγαρ 936 | βουλγ 937 | βούλ 938 | βραζιλ 939 | βρετον 940 | βόρ 941 | γαλλ 942 | γενικότ 943 | γενοβ 944 | γεν 945 | γερμαν 946 | γερμ 947 | γεωγρ 948 | γεωλ 949 | γεωμετρ 950 | γεωμ 951 | γεωπ 952 | γεωργ 953 | γλυπτ 954 | γλωσσολ 955 | γλωσσ 956 | γλ 957 | γνμδ 958 | γνμ 959 | γνωμ 960 | γοτθ 961 | γραμμ 962 | γραμ 963 | γρμ 964 | γρ 965 | γυμν 966 | δίδες 967 | δίκ 968 | δίφθ 969 | δαν 970 | δεικτ 971 | δεκατ 972 | δηλ 973 | δημογρ 974 | δημοτ 975 | δημώδ 976 | δημ 977 | διάγρ 978 | διάκρ 979 | διάλεξ 980 | διάλ 981 | διάσπ 982 | διαλεκτ 983 | διατρ 984 | διαφ 985 | διαχ 986 | διδα 987 | διεθν 988 | διεθ 989 | δικον 990 | διστ 991 | δισύλλ 992 | δισ 993 | διφθογγοπ 994 | δογμ 995 | δολ 996 | δοτ 997 | δρμ 998 | δρχ 999 | δρ(α) 1000 | δωρ 1001 | δ 1002 | εβρ 1003 | εγκλπ 1004 | εδ 1005 | εθνολ 1006 | εθν 1007 | ειδικότ 1008 | ειδ 1009 | ειδ.β 1010 | εικ 1011 | ειρ 1012 | εισ 1013 | εκατοστμ 1014 | εκατοστ 1015 | εκατστ.2 1016 | εκατστ.3 1017 | εκατ 1018 | εκδ 1019 | εκκλησ 1020 | εκκλ 1021 | εκ 1022 | ελλην 1023 | ελλ 1024 | ελνστ 1025 | ελπ 1026 | εμβ 1027 | εμφ 1028 | εναλλ 1029 | ενδ 1030 | ενεργ 1031 | ενεστ 1032 | ενικ 1033 | ενν 1034 | εν 1035 | εξέλ 1036 | εξακολ 1037 | εξομάλ 1038 | εξ 1039 | εο 1040 | επέκτ 1041 | επίδρ 1042 | επίθ 1043 | επίρρ 1044 | επίσ 1045 | επαγγελμ 1046 | επανάλ 1047 | επανέκδ 1048 | επιθ 1049 | επικ 1050 | επιμ 1051 | επιρρ 1052 | επιστ 1053 | επιτατ 1054 | επιφ 1055 | επών 1056 | επ 1057 | εργ 1058 | ερμ 1059 | ερρινοπ 1060 | ερωτ 1061 | ετρουσκ 1062 | ετυμ 1063 | ετ 1064 | ευφ 1065 | ευχετ 1066 | εφ 1067 | εύχρ 1068 | ε.α 1069 | ε/υ 1070 | ε0 1071 | ζωγρ 1072 | ζωολ 1073 | ηθικ 1074 | ηθ 1075 | ηλεκτρολ 1076 | ηλεκτρον 1077 | ηλεκτρ 1078 | ημίτ 1079 | ημίφ 1080 | ημιφ 1081 | ηχηροπ 1082 | ηχηρ 1083 | ηχομιμ 1084 | ηχ 1085 | η 1086 | θέατρ 1087 | θεολ 1088 | θετ 1089 | θηλ 1090 | θρακ 1091 | θρησκειολ 1092 | θρησκ 1093 | θ 1094 | ιαπων 1095 | ιατρ 1096 | ιδιωμ 1097 | ιδ 1098 | ινδ 1099 | ιραν 1100 | ισπαν 1101 | ιστορ 1102 | ιστ 1103 | ισχυροπ 1104 | ιταλ 1105 | ιχθυολ 1106 | ιων 1107 | κάτ 1108 | καθ 1109 | κακοσ 1110 | καν 1111 | καρ 1112 | κατάλ 1113 | κατατ 1114 | κατωτ 1115 | κατ 1116 | κα 1117 | κελτ 1118 | κεφ 1119 | κινεζ 1120 | κινημ 1121 | κλητ 1122 | κλιτ 1123 | κλπ 1124 | κλ 1125 | κν 1126 | κοινωνιολ 1127 | κοινων 1128 | κοπτ 1129 | κουτσοβλαχ 1130 | κουτσοβλ 1131 | κπ 1132 | κρ.γν 1133 | κτγ 1134 | κτην 1135 | κτητ 1136 | κτλ 1137 | κτ 1138 | κυριολ 1139 | κυρ 1140 | κύρ 1141 | κ 1142 | κ.ά 1143 | κ.ά.π 1144 | κ.α 1145 | κ.εξ 1146 | κ.επ 1147 | κ.ε 1148 | κ.λπ 1149 | κ.λ.π 1150 | κ.ού.κ 1151 | κ.ο.κ 1152 | κ.τ.λ 1153 | κ.τ.τ 1154 | κ.τ.ό 1155 | λέξ 1156 | λαογρ 1157 | λαπ 1158 | λατιν 1159 | λατ 1160 | λαϊκότρ 1161 | λαϊκ 1162 | λετ 1163 | λιθ 1164 | λογιστ 1165 | λογοτ 1166 | λογ 1167 | λουβ 1168 | λυδ 1169 | λόγ 1170 | λ 1171 | λ.χ 1172 | μέλλ 1173 | μέσ 1174 | μαθημ 1175 | μαθ 1176 | μαιευτ 1177 | μαλαισ 1178 | μαλτ 1179 | μαμμων 1180 | μεγεθ 1181 | μεε 1182 | μειωτ 1183 | μελ 1184 | μεξ 1185 | μεσν 1186 | μεσογ 1187 | μεσοπαθ 1188 | μεσοφ 1189 | μετάθ 1190 | μεταβτ 1191 | μεταβ 1192 | μετακ 1193 | μεταπλ 1194 | μεταπτωτ 1195 | μεταρ 1196 | μεταφορ 1197 | μετβ 1198 | μετεπιθ 1199 | μετεπιρρ 1200 | μετεωρολ 1201 | μετεωρ 1202 | μετον 1203 | μετουσ 1204 | μετοχ 1205 | μετρ 1206 | μετ 1207 | μητρων 1208 | μηχανολ 1209 | μηχ 1210 | μικροβιολ 1211 | μογγολ 1212 | μορφολ 1213 | μουσ 1214 | μπενελούξ 1215 | μσνλατ 
1216 | μσν 1217 | μτβ 1218 | μτγν 1219 | μτγ 1220 | μτφρδ 1221 | μτφρ 1222 | μτφ 1223 | μτχ 1224 | μυθ 1225 | μυκην 1226 | μυκ 1227 | μφ 1228 | μ 1229 | μ.ε 1230 | μ.μ 1231 | μ.π.ε 1232 | μ.π.π 1233 | μ0 1234 | ναυτ 1235 | νεοελλ 1236 | νεολατιν 1237 | νεολατ 1238 | νεολ 1239 | νεότ 1240 | νλατ 1241 | νομ 1242 | νορβ 1243 | νοσ 1244 | νότ 1245 | ν 1246 | ξ.λ 1247 | οικοδ 1248 | οικολ 1249 | οικον 1250 | οικ 1251 | ολλανδ 1252 | ολλ 1253 | ομηρ 1254 | ομόρρ 1255 | ονομ 1256 | ον 1257 | οπτ 1258 | ορθογρ 1259 | ορθ 1260 | οριστ 1261 | ορυκτολ 1262 | ορυκτ 1263 | ορ 1264 | οσετ 1265 | οσκ 1266 | ουαλ 1267 | ουγγρ 1268 | ουδ 1269 | ουσιαστικοπ 1270 | ουσιαστ 1271 | ουσ 1272 | πίν 1273 | παθητ 1274 | παθολ 1275 | παθ 1276 | παιδ 1277 | παλαιοντ 1278 | παλαιότ 1279 | παλ 1280 | παππων 1281 | παράγρ 1282 | παράγ 1283 | παράλλ 1284 | παράλ 1285 | παραγ 1286 | παρακ 1287 | παραλ 1288 | παραπ 1289 | παρατ 1290 | παρβ 1291 | παρετυμ 1292 | παροξ 1293 | παρων 1294 | παρωχ 1295 | παρ 1296 | παρ.φρ 1297 | πατριδων 1298 | πατρων 1299 | πβ 1300 | περιθ 1301 | περιλ 1302 | περιφρ 1303 | περσ 1304 | περ 1305 | πιθ 1306 | πληθ 1307 | πληροφ 1308 | ποδ 1309 | ποιητ 1310 | πολιτ 1311 | πολλαπλ 1312 | πολ 1313 | πορτογαλ 1314 | πορτ 1315 | ποσ 1316 | πρακριτ 1317 | πρβλ 1318 | πρβ 1319 | πργ 1320 | πρκμ 1321 | πρκ 1322 | πρλ 1323 | προέλ 1324 | προβηγκ 1325 | προελλ 1326 | προηγ 1327 | προθεμ 1328 | προπαραλ 1329 | προπαροξ 1330 | προπερισπ 1331 | προσαρμ 1332 | προσηγορ 1333 | προσταχτ 1334 | προστ 1335 | προσφών 1336 | προσ 1337 | προτακτ 1338 | προτ.Εισ 1339 | προφ 1340 | προχωρ 1341 | πρτ 1342 | πρόθ 1343 | πρόσθ 1344 | πρόσ 1345 | πρότ 1346 | πρ 1347 | πρ.Εφ 1348 | πτ 1349 | πυ 1350 | π 1351 | π.Χ 1352 | π.μ 1353 | π.χ 1354 | ρήμ 1355 | ρίζ 1356 | ρηματ 1357 | ρητορ 1358 | ριν 1359 | ρουμ 1360 | ρωμ 1361 | ρωσ 1362 | ρ 1363 | σανσκρ 1364 | σαξ 1365 | σελ 1366 | σερβοκρ 1367 | σερβ 1368 | σημασιολ 1369 | σημδ 1370 | σημειολ 1371 | σημερ 1372 | σημιτ 1373 | σημ 1374 | σκανδ 1375 | σκυθ 1376 | σκωπτ 1377 | σλαβ 1378 | σλοβ 1379 | σουηδ 1380 | σουμερ 1381 | σουπ 1382 | σπάν 1383 | σπανιότ 1384 | σπ 1385 | σσ 1386 | στατ 1387 | στερ 1388 | στιγμ 1389 | στιχ 1390 | στρέμ 1391 | στρατιωτ 1392 | στρατ 1393 | στ 1394 | συγγ 1395 | συγκρ 1396 | συγκ 1397 | συμπερ 1398 | συμπλεκτ 1399 | συμπλ 1400 | συμπροφ 1401 | συμφυρ 1402 | συμφ 1403 | συνήθ 1404 | συνίζ 1405 | συναίρ 1406 | συναισθ 1407 | συνδετ 1408 | συνδ 1409 | συνεκδ 1410 | συνηρ 1411 | συνθετ 1412 | συνθ 1413 | συνοπτ 1414 | συντελ 1415 | συντομογρ 1416 | συντ 1417 | συν 1418 | συρ 1419 | σχημ 1420 | σχ 1421 | σύγκρ 1422 | σύμπλ 1423 | σύμφ 1424 | σύνδ 1425 | σύνθ 1426 | σύντμ 1427 | σύντ 1428 | σ 1429 | σ.π 1430 | σ/β 1431 | τακτ 1432 | τελ 1433 | τετρ 1434 | τετρ.μ 1435 | τεχνλ 1436 | τεχνολ 1437 | τεχν 1438 | τεύχ 1439 | τηλεπικ 1440 | τηλεόρ 1441 | τιμ 1442 | τιμ.τομ 1443 | τοΣ 1444 | τον 1445 | τοπογρ 1446 | τοπων 1447 | τοπ 1448 | τοσκ 1449 | τουρκ 1450 | τοχ 1451 | τριτοπρόσ 1452 | τροποπ 1453 | τροπ 1454 | τσεχ 1455 | τσιγγ 1456 | ττ 1457 | τυπ 1458 | τόμ 1459 | τόνν 1460 | τ 1461 | τ.μ 1462 | τ.χλμ 1463 | υβρ 1464 | υπερθ 1465 | υπερσ 1466 | υπερ 1467 | υπεύθ 1468 | υποθ 1469 | υποκορ 1470 | υποκ 1471 | υποσημ 1472 | υποτ 1473 | υποφ 1474 | υποχωρ 1475 | υπόλ 1476 | υπόχρ 1477 | υπ 1478 | υστλατ 1479 | υψόμ 1480 | υψ 1481 | φάκ 1482 | φαρμακολ 1483 | φαρμ 1484 | φιλολ 1485 | φιλοσ 1486 | φιλοτ 1487 | φινλ 1488 | φοινικ 1489 | φράγκ 1490 | φρανκον 1491 | φριζ 1492 | φρ 1493 | φυλλ 1494 | φυσιολ 1495 | φυσ 1496 | φωνηεντ 1497 | φωνητ 1498 | φωνολ 
1499 | φων 1500 | φωτογρ 1501 | φ 1502 | φ.τ.μ 1503 | χαμιτ 1504 | χαρτόσ 1505 | χαρτ 1506 | χασμ 1507 | χαϊδ 1508 | χγφ 1509 | χειλ 1510 | χεττ 1511 | χημ 1512 | χιλ 1513 | χλγρ 1514 | χλγ 1515 | χλμ 1516 | χλμ.2 1517 | χλμ.3 1518 | χλσγρ 1519 | χλστγρ 1520 | χλστμ 1521 | χλστμ.2 1522 | χλστμ.3 1523 | χλ 1524 | χργρ 1525 | χρημ 1526 | χρον 1527 | χρ 1528 | χφ 1529 | χ.ε 1530 | χ.κ 1531 | χ.ο 1532 | χ.σ 1533 | χ.τ 1534 | χ.χ 1535 | ψευδ 1536 | ψυχαν 1537 | ψυχιατρ 1538 | ψυχολ 1539 | ψυχ 1540 | ωκεαν 1541 | όμ 1542 | όν 1543 | όπ.παρ 1544 | όπ.π 1545 | ό.π 1546 | ύψ 1547 | 1Βσ 1548 | 1Εσ 1549 | 1Θσ 1550 | 1Ιν 1551 | 1Κρ 1552 | 1Μκ 1553 | 1Πρ 1554 | 1Πτ 1555 | 1Τμ 1556 | 2Βσ 1557 | 2Εσ 1558 | 2Θσ 1559 | 2Ιν 1560 | 2Κρ 1561 | 2Μκ 1562 | 2Πρ 1563 | 2Πτ 1564 | 2Τμ 1565 | 3Βσ 1566 | 3Ιν 1567 | 3Μκ 1568 | 4Βσ 1569 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.en: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 34 | Adj 35 | Adm 36 | Adv 37 | Asst 38 | Bart 39 | Bldg 40 | Brig 41 | Bros 42 | Capt 43 | Cmdr 44 | Col 45 | Comdr 46 | Con 47 | Corp 48 | Cpl 49 | DR 50 | Dr 51 | Drs 52 | Ens 53 | Gen 54 | Gov 55 | Hon 56 | Hr 57 | Hosp 58 | Insp 59 | Lt 60 | MM 61 | MR 62 | MRS 63 | MS 64 | Maj 65 | Messrs 66 | Mlle 67 | Mme 68 | Mr 69 | Mrs 70 | Ms 71 | Msgr 72 | Op 73 | Ord 74 | Pfc 75 | Ph 76 | Prof 77 | Pvt 78 | Rep 79 | Reps 80 | Res 81 | Rev 82 | Rt 83 | Sen 84 | Sens 85 | Sfc 86 | Sgt 87 | Sr 88 | St 89 | Supt 90 | Surg 91 | 92 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 93 | v 94 | vs 95 | i.e 96 | rev 97 | e.g 98 | 99 | #Numbers only. These should only induce breaks when followed by a numeric sequence 100 | # add NUMERIC_ONLY after the word for this function 101 | #This case is mostly for the english "No." which can either be a sentence of its own, or 102 | #if followed by a number, a non-breaking prefix 103 | No #NUMERIC_ONLY# 104 | Nos 105 | Art #NUMERIC_ONLY# 106 | Nr 107 | pp #NUMERIC_ONLY# 108 | 109 | #month abbreviations 110 | Jan 111 | Feb 112 | Mar 113 | Apr 114 | #May is a full word 115 | Jun 116 | Jul 117 | Aug 118 | Sep 119 | Oct 120 | Nov 121 | Dec 122 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.es: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 
3 | 4 | #any single upper case letter followed by a period is not a sentence ender 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | # Period-final abbreviation list from http://www.ctspanish.com/words/abbreviations.htm 34 | 35 | A.C 36 | Apdo 37 | Av 38 | Bco 39 | CC.AA 40 | Da 41 | Dep 42 | Dn 43 | Dr 44 | Dra 45 | EE.UU 46 | Excmo 47 | FF.CC 48 | Fil 49 | Gral 50 | J.C 51 | Let 52 | Lic 53 | N.B 54 | P.D 55 | P.V.P 56 | Prof 57 | Pts 58 | Rte 59 | S.A 60 | S.A.R 61 | S.E 62 | S.L 63 | S.R.C 64 | Sr 65 | Sra 66 | Srta 67 | Sta 68 | Sto 69 | T.V.E 70 | Tel 71 | Ud 72 | Uds 73 | V.B 74 | V.E 75 | Vd 76 | Vds 77 | a/c 78 | adj 79 | admón 80 | afmo 81 | apdo 82 | av 83 | c 84 | c.f 85 | c.g 86 | cap 87 | cm 88 | cta 89 | dcha 90 | doc 91 | ej 92 | entlo 93 | esq 94 | etc 95 | f.c 96 | gr 97 | grs 98 | izq 99 | kg 100 | km 101 | mg 102 | mm 103 | núm 104 | núm 105 | p 106 | p.a 107 | p.ej 108 | ptas 109 | pág 110 | págs 111 | pág 112 | págs 113 | q.e.g.e 114 | q.e.s.m 115 | s 116 | s.s.s 117 | vid 118 | vol 119 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.fi: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT 2 | #indicate an end-of-sentence marker. Special cases are included for prefixes 3 | #that ONLY appear before 0-9 numbers. 4 | 5 | #This list is compiled from omorfi database 6 | #by Tommi A Pirinen. 7 | 8 | 9 | #any single upper case letter followed by a period is not a sentence ender 10 | A 11 | B 12 | C 13 | D 14 | E 15 | F 16 | G 17 | H 18 | I 19 | J 20 | K 21 | L 22 | M 23 | N 24 | O 25 | P 26 | Q 27 | R 28 | S 29 | T 30 | U 31 | V 32 | W 33 | X 34 | Y 35 | Z 36 | Å 37 | Ä 38 | Ö 39 | 40 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 41 | alik 42 | alil 43 | amir 44 | apul 45 | apul.prof 46 | arkkit 47 | ass 48 | assist 49 | dipl 50 | dipl.arkkit 51 | dipl.ekon 52 | dipl.ins 53 | dipl.kielenk 54 | dipl.kirjeenv 55 | dipl.kosm 56 | dipl.urk 57 | dos 58 | erikoiseläinl 59 | erikoishammasl 60 | erikoisl 61 | erikoist 62 | ev.luutn 63 | evp 64 | fil 65 | ft 66 | hallinton 67 | hallintot 68 | hammaslääket 69 | jatk 70 | jääk 71 | kansaned 72 | kapt 73 | kapt.luutn 74 | kenr 75 | kenr.luutn 76 | kenr.maj 77 | kers 78 | kirjeenv 79 | kom 80 | kom.kapt 81 | komm 82 | konst 83 | korpr 84 | luutn 85 | maist 86 | maj 87 | Mr 88 | Mrs 89 | Ms 90 | M.Sc 91 | neuv 92 | nimim 93 | Ph.D 94 | prof 95 | puh.joht 96 | pääll 97 | res 98 | san 99 | siht 100 | suom 101 | sähköp 102 | säv 103 | toht 104 | toim 105 | toim.apul 106 | toim.joht 107 | toim.siht 108 | tuom 109 | ups 110 | vänr 111 | vääp 112 | ye.ups 113 | ylik 114 | ylil 115 | ylim 116 | ylimatr 117 | yliop 118 | yliopp 119 | ylip 120 | yliv 121 | 122 | #misc - odd period-ending items that NEVER indicate breaks (p.m. 
does NOT fall 123 | #into this category - it sometimes ends a sentence) 124 | e.g 125 | ent 126 | esim 127 | huom 128 | i.e 129 | ilm 130 | l 131 | mm 132 | myöh 133 | nk 134 | nyk 135 | par 136 | po 137 | t 138 | v 139 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.fr: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | # 4 | #any single upper case letter followed by a period is not a sentence ender 5 | #usually upper case letters are initials in a name 6 | #no French words end in single lower-case letters, so we throw those in too? 7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | # Period-final abbreviation list for French 61 | A.C.N 62 | A.M 63 | art 64 | ann 65 | apr 66 | av 67 | auj 68 | lib 69 | B.P 70 | boul 71 | ca 72 | c.-à-d 73 | cf 74 | ch.-l 75 | chap 76 | contr 77 | C.P.I 78 | C.Q.F.D 79 | C.N 80 | C.N.S 81 | C.S 82 | dir 83 | éd 84 | e.g 85 | env 86 | al 87 | etc 88 | E.V 89 | ex 90 | fasc 91 | fém 92 | fig 93 | fr 94 | hab 95 | ibid 96 | id 97 | i.e 98 | inf 99 | LL.AA 100 | LL.AA.II 101 | LL.AA.RR 102 | LL.AA.SS 103 | L.D 104 | LL.EE 105 | LL.MM 106 | LL.MM.II.RR 107 | loc.cit 108 | masc 109 | MM 110 | ms 111 | N.B 112 | N.D.A 113 | N.D.L.R 114 | N.D.T 115 | n/réf 116 | NN.SS 117 | N.S 118 | N.D 119 | N.P.A.I 120 | p.c.c 121 | pl 122 | pp 123 | p.ex 124 | p.j 125 | P.S 126 | R.A.S 127 | R.-V 128 | R.P 129 | R.I.P 130 | SS 131 | S.S 132 | S.A 133 | S.A.I 134 | S.A.R 135 | S.A.S 136 | S.E 137 | sec 138 | sect 139 | sing 140 | S.M 141 | S.M.I.R 142 | sq 143 | sqq 144 | suiv 145 | sup 146 | suppl 147 | tél 148 | T.S.V.P 149 | vb 150 | vol 151 | vs 152 | X.O 153 | Z.I 154 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.hu: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | Á 33 | É 34 | Í 35 | Ó 36 | Ö 37 | Ő 38 | Ú 39 | Ü 40 | Ű 41 | 42 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 43 | Dr 44 | dr 45 | kb 46 | Kb 47 | vö 48 | Vö 49 | pl 50 | Pl 51 | ca 52 | Ca 53 | min 54 | Min 55 | max 56 | Max 57 | ún 58 | Ún 59 | prof 60 | Prof 61 | de 62 | De 63 | du 64 | Du 65 | Szt 66 | St 67 | 68 | #Numbers only. 
These should only induce breaks when followed by a numeric sequence 69 | # add NUMERIC_ONLY after the word for this function 70 | #This case is mostly for the english "No." which can either be a sentence of its own, or 71 | #if followed by a number, a non-breaking prefix 72 | 73 | # Month name abbreviations 74 | jan #NUMERIC_ONLY# 75 | Jan #NUMERIC_ONLY# 76 | Feb #NUMERIC_ONLY# 77 | feb #NUMERIC_ONLY# 78 | márc #NUMERIC_ONLY# 79 | Márc #NUMERIC_ONLY# 80 | ápr #NUMERIC_ONLY# 81 | Ápr #NUMERIC_ONLY# 82 | máj #NUMERIC_ONLY# 83 | Máj #NUMERIC_ONLY# 84 | jún #NUMERIC_ONLY# 85 | Jún #NUMERIC_ONLY# 86 | Júl #NUMERIC_ONLY# 87 | júl #NUMERIC_ONLY# 88 | aug #NUMERIC_ONLY# 89 | Aug #NUMERIC_ONLY# 90 | Szept #NUMERIC_ONLY# 91 | szept #NUMERIC_ONLY# 92 | okt #NUMERIC_ONLY# 93 | Okt #NUMERIC_ONLY# 94 | nov #NUMERIC_ONLY# 95 | Nov #NUMERIC_ONLY# 96 | dec #NUMERIC_ONLY# 97 | Dec #NUMERIC_ONLY# 98 | 99 | # Other abbreviations 100 | tel #NUMERIC_ONLY# 101 | Tel #NUMERIC_ONLY# 102 | Fax #NUMERIC_ONLY# 103 | fax #NUMERIC_ONLY# 104 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.is: -------------------------------------------------------------------------------- 1 | no #NUMERIC_ONLY# 2 | No #NUMERIC_ONLY# 3 | nr #NUMERIC_ONLY# 4 | Nr #NUMERIC_ONLY# 5 | nR #NUMERIC_ONLY# 6 | NR #NUMERIC_ONLY# 7 | a 8 | b 9 | c 10 | d 11 | e 12 | f 13 | g 14 | h 15 | i 16 | j 17 | k 18 | l 19 | m 20 | n 21 | o 22 | p 23 | q 24 | r 25 | s 26 | t 27 | u 28 | v 29 | w 30 | x 31 | y 32 | z 33 | ^ 34 | í 35 | á 36 | ó 37 | æ 38 | A 39 | B 40 | C 41 | D 42 | E 43 | F 44 | G 45 | H 46 | I 47 | J 48 | K 49 | L 50 | M 51 | N 52 | O 53 | P 54 | Q 55 | R 56 | S 57 | T 58 | U 59 | V 60 | W 61 | X 62 | Y 63 | Z 64 | ab.fn 65 | a.fn 66 | afs 67 | al 68 | alm 69 | alg 70 | andh 71 | ath 72 | aths 73 | atr 74 | ao 75 | au 76 | aukaf 77 | áfn 78 | áhrl.s 79 | áhrs 80 | ákv.gr 81 | ákv 82 | bh 83 | bls 84 | dr 85 | e.Kr 86 | et 87 | ef 88 | efn 89 | ennfr 90 | eink 91 | end 92 | e.st 93 | erl 94 | fél 95 | fskj 96 | fh 97 | f.hl 98 | físl 99 | fl 100 | fn 101 | fo 102 | forl 103 | frb 104 | frl 105 | frh 106 | frt 107 | fsl 108 | fsh 109 | fs 110 | fsk 111 | fst 112 | f.Kr 113 | ft 114 | fv 115 | fyrrn 116 | fyrrv 117 | germ 118 | gm 119 | gr 120 | hdl 121 | hdr 122 | hf 123 | hl 124 | hlsk 125 | hljsk 126 | hljv 127 | hljóðv 128 | hr 129 | hv 130 | hvk 131 | holl 132 | Hos 133 | höf 134 | hk 135 | hrl 136 | ísl 137 | kaf 138 | kap 139 | Khöfn 140 | kk 141 | kg 142 | kk 143 | km 144 | kl 145 | klst 146 | kr 147 | kt 148 | kgúrsk 149 | kvk 150 | leturbr 151 | lh 152 | lh.nt 153 | lh.þt 154 | lo 155 | ltr 156 | mlja 157 | mljó 158 | millj 159 | mm 160 | mms 161 | m.fl 162 | miðm 163 | mgr 164 | mst 165 | mín 166 | nf 167 | nh 168 | nhm 169 | nl 170 | nk 171 | nmgr 172 | no 173 | núv 174 | nt 175 | o.áfr 176 | o.m.fl 177 | ohf 178 | o.fl 179 | o.s.frv 180 | ófn 181 | ób 182 | óákv.gr 183 | óákv 184 | pfn 185 | PR 186 | pr 187 | Ritstj 188 | Rvík 189 | Rvk 190 | samb 191 | samhlj 192 | samn 193 | samn 194 | sbr 195 | sek 196 | sérn 197 | sf 198 | sfn 199 | sh 200 | sfn 201 | sh 202 | s.hl 203 | sk 204 | skv 205 | sl 206 | sn 207 | so 208 | ss.us 209 | s.st 210 | samþ 211 | sbr 212 | shlj 213 | sign 214 | skál 215 | st 216 | st.s 217 | stk 218 | sþ 219 | teg 220 | tbl 221 | tfn 222 | tl 223 | tvíhlj 224 | tvt 225 | till 226 | to 227 | umr 228 | uh 229 | us 230 | uppl 231 | útg 232 | vb 233 | Vf 234 | vh 235 | vkf 236 | Vl 237 | vl 238 | vlf 239 | vmf 240 | 8vo 241 | vsk 
242 | vth 243 | þt 244 | þf 245 | þjs 246 | þgf 247 | þlt 248 | þolm 249 | þm 250 | þml 251 | þýð 252 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.it: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 34 | Adj 35 | Adm 36 | Adv 37 | Amn 38 | Arch 39 | Asst 40 | Avv 41 | Bart 42 | Bcc 43 | Bldg 44 | Brig 45 | Bros 46 | C.A.P 47 | C.P 48 | Capt 49 | Cc 50 | Cmdr 51 | Co 52 | Col 53 | Comdr 54 | Con 55 | Corp 56 | Cpl 57 | DR 58 | Dott 59 | Dr 60 | Drs 61 | Egr 62 | Ens 63 | Gen 64 | Geom 65 | Gov 66 | Hon 67 | Hosp 68 | Hr 69 | Id 70 | Ing 71 | Insp 72 | Lt 73 | MM 74 | MR 75 | MRS 76 | MS 77 | Maj 78 | Messrs 79 | Mlle 80 | Mme 81 | Mo 82 | Mons 83 | Mr 84 | Mrs 85 | Ms 86 | Msgr 87 | N.B 88 | Op 89 | Ord 90 | P.S 91 | P.T 92 | Pfc 93 | Ph 94 | Prof 95 | Pvt 96 | RP 97 | RSVP 98 | Rag 99 | Rep 100 | Reps 101 | Res 102 | Rev 103 | Rif 104 | Rt 105 | S.A 106 | S.B.F 107 | S.P.M 108 | S.p.A 109 | S.r.l 110 | Sen 111 | Sens 112 | Sfc 113 | Sgt 114 | Sig 115 | Sigg 116 | Soc 117 | Spett 118 | Sr 119 | St 120 | Supt 121 | Surg 122 | V.P 123 | 124 | # other 125 | a.c 126 | acc 127 | all 128 | banc 129 | c.a 130 | c.c.p 131 | c.m 132 | c.p 133 | c.s 134 | c.v 135 | corr 136 | dott 137 | e.p.c 138 | ecc 139 | es 140 | fatt 141 | gg 142 | int 143 | lett 144 | ogg 145 | on 146 | p.c 147 | p.c.c 148 | p.es 149 | p.f 150 | p.r 151 | p.v 152 | post 153 | pp 154 | racc 155 | ric 156 | s.n.c 157 | seg 158 | sgg 159 | ss 160 | tel 161 | u.s 162 | v.r 163 | v.s 164 | 165 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 166 | v 167 | vs 168 | i.e 169 | rev 170 | e.g 171 | 172 | #Numbers only. These should only induce breaks when followed by a numeric sequence 173 | # add NUMERIC_ONLY after the word for this function 174 | #This case is mostly for the english "No." which can either be a sentence of its own, or 175 | #if followed by a number, a non-breaking prefix 176 | No #NUMERIC_ONLY# 177 | Nos 178 | Art #NUMERIC_ONLY# 179 | Nr 180 | pp #NUMERIC_ONLY# 181 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.lv: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 
3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | Ā 8 | B 9 | C 10 | Č 11 | D 12 | E 13 | Ē 14 | F 15 | G 16 | Ģ 17 | H 18 | I 19 | Ī 20 | J 21 | K 22 | Ķ 23 | L 24 | Ļ 25 | M 26 | N 27 | Ņ 28 | O 29 | P 30 | Q 31 | R 32 | S 33 | Š 34 | T 35 | U 36 | Ū 37 | V 38 | W 39 | X 40 | Y 41 | Z 42 | Ž 43 | 44 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 45 | dr 46 | Dr 47 | med 48 | prof 49 | Prof 50 | inž 51 | Inž 52 | ist.loc 53 | Ist.loc 54 | kor.loc 55 | Kor.loc 56 | v.i 57 | vietn 58 | Vietn 59 | 60 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 61 | a.l 62 | t.p 63 | pārb 64 | Pārb 65 | vec 66 | Vec 67 | inv 68 | Inv 69 | sk 70 | Sk 71 | spec 72 | Spec 73 | vienk 74 | Vienk 75 | virz 76 | Virz 77 | māksl 78 | Māksl 79 | mūz 80 | Mūz 81 | akad 82 | Akad 83 | soc 84 | Soc 85 | galv 86 | Galv 87 | vad 88 | Vad 89 | sertif 90 | Sertif 91 | folkl 92 | Folkl 93 | hum 94 | Hum 95 | 96 | #Numbers only. These should only induce breaks when followed by a numeric sequence 97 | # add NUMERIC_ONLY after the word for this function 98 | #This case is mostly for the english "No." which can either be a sentence of its own, or 99 | #if followed by a number, a non-breaking prefix 100 | Nr #NUMERIC_ONLY# 101 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.nl: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | #Sources: http://nl.wikipedia.org/wiki/Lijst_van_afkortingen 4 | # http://nl.wikipedia.org/wiki/Aanspreekvorm 5 | # http://nl.wikipedia.org/wiki/Titulatuur_in_het_Nederlands_hoger_onderwijs 6 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 7 | #usually upper case letters are initials in a name 8 | A 9 | B 10 | C 11 | D 12 | E 13 | F 14 | G 15 | H 16 | I 17 | J 18 | K 19 | L 20 | M 21 | N 22 | O 23 | P 24 | Q 25 | R 26 | S 27 | T 28 | U 29 | V 30 | W 31 | X 32 | Y 33 | Z 34 | 35 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 36 | bacc 37 | bc 38 | bgen 39 | c.i 40 | dhr 41 | dr 42 | dr.h.c 43 | drs 44 | drs 45 | ds 46 | eint 47 | fa 48 | Fa 49 | fam 50 | gen 51 | genm 52 | ing 53 | ir 54 | jhr 55 | jkvr 56 | jr 57 | kand 58 | kol 59 | lgen 60 | lkol 61 | Lt 62 | maj 63 | Mej 64 | mevr 65 | Mme 66 | mr 67 | mr 68 | Mw 69 | o.b.s 70 | plv 71 | prof 72 | ritm 73 | tint 74 | Vz 75 | Z.D 76 | Z.D.H 77 | Z.E 78 | Z.Em 79 | Z.H 80 | Z.K.H 81 | Z.K.M 82 | Z.M 83 | z.v 84 | 85 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 86 | #we seem to have a lot of these in dutch i.e.: i.p.v - in plaats van (in stead of) never ends a sentence 87 | a.g.v 88 | bijv 89 | bijz 90 | bv 91 | d.w.z 92 | e.c 93 | e.g 94 | e.k 95 | ev 96 | i.p.v 97 | i.s.m 98 | i.t.t 99 | i.v.m 100 | m.a.w 101 | m.b.t 102 | m.b.v 103 | m.h.o 104 | m.i 105 | m.i.v 106 | v.w.t 107 | 108 | #Numbers only. 
These should only induce breaks when followed by a numeric sequence 109 | # add NUMERIC_ONLY after the word for this function 110 | #This case is mostly for the english "No." which can either be a sentence of its own, or 111 | #if followed by a number, a non-breaking prefix 112 | Nr #NUMERIC_ONLY# 113 | Nrs 114 | nrs 115 | nr #NUMERIC_ONLY# 116 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.pl: -------------------------------------------------------------------------------- 1 | adw 2 | afr 3 | akad 4 | al 5 | Al 6 | am 7 | amer 8 | arch 9 | art 10 | Art 11 | artyst 12 | astr 13 | austr 14 | bałt 15 | bdb 16 | bł 17 | bm 18 | br 19 | bryg 20 | bryt 21 | centr 22 | ces 23 | chem 24 | chiń 25 | chir 26 | c.k 27 | c.o 28 | cyg 29 | cyw 30 | cyt 31 | czes 32 | czw 33 | cd 34 | Cd 35 | czyt 36 | ćw 37 | ćwicz 38 | daw 39 | dcn 40 | dekl 41 | demokr 42 | det 43 | diec 44 | dł 45 | dn 46 | dot 47 | dol 48 | dop 49 | dost 50 | dosł 51 | h.c 52 | ds 53 | dst 54 | duszp 55 | dypl 56 | egz 57 | ekol 58 | ekon 59 | elektr 60 | em 61 | ew 62 | fab 63 | farm 64 | fot 65 | fr 66 | gat 67 | gastr 68 | geogr 69 | geol 70 | gimn 71 | głęb 72 | gm 73 | godz 74 | górn 75 | gosp 76 | gr 77 | gram 78 | hist 79 | hiszp 80 | hr 81 | Hr 82 | hot 83 | id 84 | in 85 | im 86 | iron 87 | jn 88 | kard 89 | kat 90 | katol 91 | k.k 92 | kk 93 | kol 94 | kl 95 | k.p.a 96 | kpc 97 | k.p.c 98 | kpt 99 | kr 100 | k.r 101 | krak 102 | k.r.o 103 | kryt 104 | kult 105 | laic 106 | łac 107 | niem 108 | woj 109 | nb 110 | np 111 | Nb 112 | Np 113 | pol 114 | pow 115 | m.in 116 | pt 117 | ps 118 | Pt 119 | Ps 120 | cdn 121 | jw 122 | ryc 123 | rys 124 | Ryc 125 | Rys 126 | tj 127 | tzw 128 | Tzw 129 | tzn 130 | zob 131 | ang 132 | ub 133 | ul 134 | pw 135 | pn 136 | pl 137 | al 138 | k 139 | n 140 | nr #NUMERIC_ONLY# 141 | Nr #NUMERIC_ONLY# 142 | ww 143 | wł 144 | ur 145 | zm 146 | żyd 147 | żarg 148 | żyw 149 | wył 150 | bp 151 | bp 152 | wyst 153 | tow 154 | Tow 155 | o 156 | sp 157 | Sp 158 | st 159 | spółdz 160 | Spółdz 161 | społ 162 | spółgł 163 | stoł 164 | stow 165 | Stoł 166 | Stow 167 | zn 168 | zew 169 | zewn 170 | zdr 171 | zazw 172 | zast 173 | zaw 174 | zał 175 | zal 176 | zam 177 | zak 178 | zakł 179 | zagr 180 | zach 181 | adw 182 | Adw 183 | lek 184 | Lek 185 | med 186 | mec 187 | Mec 188 | doc 189 | Doc 190 | dyw 191 | dyr 192 | Dyw 193 | Dyr 194 | inż 195 | Inż 196 | mgr 197 | Mgr 198 | dh 199 | dr 200 | Dh 201 | Dr 202 | p 203 | P 204 | red 205 | Red 206 | prof 207 | prok 208 | Prof 209 | Prok 210 | hab 211 | płk 212 | Płk 213 | nadkom 214 | Nadkom 215 | podkom 216 | Podkom 217 | ks 218 | Ks 219 | gen 220 | Gen 221 | por 222 | Por 223 | reż 224 | Reż 225 | przyp 226 | Przyp 227 | śp 228 | św 229 | śW 230 | Śp 231 | Św 232 | ŚW 233 | szer 234 | Szer 235 | pkt #NUMERIC_ONLY# 236 | str #NUMERIC_ONLY# 237 | tab #NUMERIC_ONLY# 238 | Tab #NUMERIC_ONLY# 239 | tel 240 | ust #NUMERIC_ONLY# 241 | par #NUMERIC_ONLY# 242 | poz 243 | pok 244 | oo 245 | oO 246 | Oo 247 | OO 248 | r #NUMERIC_ONLY# 249 | l #NUMERIC_ONLY# 250 | s #NUMERIC_ONLY# 251 | najśw 252 | Najśw 253 | A 254 | B 255 | C 256 | D 257 | E 258 | F 259 | G 260 | H 261 | I 262 | J 263 | K 264 | L 265 | M 266 | N 267 | O 268 | P 269 | Q 270 | R 271 | S 272 | T 273 | U 274 | V 275 | W 276 | X 277 | Y 278 | Z 279 | Ś 280 | Ć 281 | Ż 282 | Ź 283 | Dz 284 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.pt: 
-------------------------------------------------------------------------------- 1 | #File adapted for PT by H. Leal Fontes from the EN & DE versions published with moses-2009-04-13. Last update: 10.11.2009. 2 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 3 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 4 | 5 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 6 | #usually upper case letters are initials in a name 7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | 61 | #Roman Numerals. A dot after one of these is not a sentence break in Portuguese. 62 | I 63 | II 64 | III 65 | IV 66 | V 67 | VI 68 | VII 69 | VIII 70 | IX 71 | X 72 | XI 73 | XII 74 | XIII 75 | XIV 76 | XV 77 | XVI 78 | XVII 79 | XVIII 80 | XIX 81 | XX 82 | i 83 | ii 84 | iii 85 | iv 86 | v 87 | vi 88 | vii 89 | viii 90 | ix 91 | x 92 | xi 93 | xii 94 | xiii 95 | xiv 96 | xv 97 | xvi 98 | xvii 99 | xviii 100 | xix 101 | xx 102 | 103 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 104 | Adj 105 | Adm 106 | Adv 107 | Art 108 | Ca 109 | Capt 110 | Cmdr 111 | Col 112 | Comdr 113 | Con 114 | Corp 115 | Cpl 116 | DR 117 | DRA 118 | Dr 119 | Dra 120 | Dras 121 | Drs 122 | Eng 123 | Enga 124 | Engas 125 | Engos 126 | Ex 127 | Exo 128 | Exmo 129 | Fig 130 | Gen 131 | Hosp 132 | Insp 133 | Lda 134 | MM 135 | MR 136 | MRS 137 | MS 138 | Maj 139 | Mrs 140 | Ms 141 | Msgr 142 | Op 143 | Ord 144 | Pfc 145 | Ph 146 | Prof 147 | Pvt 148 | Rep 149 | Reps 150 | Res 151 | Rev 152 | Rt 153 | Sen 154 | Sens 155 | Sfc 156 | Sgt 157 | Sr 158 | Sra 159 | Sras 160 | Srs 161 | Sto 162 | Supt 163 | Surg 164 | adj 165 | adm 166 | adv 167 | art 168 | cit 169 | col 170 | con 171 | corp 172 | cpl 173 | dr 174 | dra 175 | dras 176 | drs 177 | eng 178 | enga 179 | engas 180 | engos 181 | ex 182 | exo 183 | exmo 184 | fig 185 | op 186 | prof 187 | sr 188 | sra 189 | sras 190 | srs 191 | sto 192 | 193 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 194 | v 195 | vs 196 | i.e 197 | rev 198 | e.g 199 | 200 | #Numbers only. These should only induce breaks when followed by a numeric sequence 201 | # add NUMERIC_ONLY after the word for this function 202 | #This case is mostly for the english "No." 
which can either be a sentence of its own, or 203 | #if followed by a number, a non-breaking prefix 204 | No #NUMERIC_ONLY# 205 | Nos 206 | Art #NUMERIC_ONLY# 207 | Nr 208 | p #NUMERIC_ONLY# 209 | pp #NUMERIC_ONLY# 210 | 211 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.ro: -------------------------------------------------------------------------------- 1 | A 2 | B 3 | C 4 | D 5 | E 6 | F 7 | G 8 | H 9 | I 10 | J 11 | K 12 | L 13 | M 14 | N 15 | O 16 | P 17 | Q 18 | R 19 | S 20 | T 21 | U 22 | V 23 | W 24 | X 25 | Y 26 | Z 27 | dpdv 28 | etc 29 | șamd 30 | M.Ap.N 31 | dl 32 | Dl 33 | d-na 34 | D-na 35 | dvs 36 | Dvs 37 | pt 38 | Pt 39 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.ru: -------------------------------------------------------------------------------- 1 | # added Cyrillic uppercase letters [А-Я] 2 | # removed 000D carriage return (this is not removed by chomp in tokenizer.perl, and prevents recognition of the prefixes) 3 | # edited by Kate Young (nspaceanalysis@earthlink.net) 21 May 2013 4 | А 5 | Б 6 | В 7 | Г 8 | Д 9 | Е 10 | Ж 11 | З 12 | И 13 | Й 14 | К 15 | Л 16 | М 17 | Н 18 | О 19 | П 20 | Р 21 | С 22 | Т 23 | У 24 | Ф 25 | Х 26 | Ц 27 | Ч 28 | Ш 29 | Щ 30 | Ъ 31 | Ы 32 | Ь 33 | Э 34 | Ю 35 | Я 36 | A 37 | B 38 | C 39 | D 40 | E 41 | F 42 | G 43 | H 44 | I 45 | J 46 | K 47 | L 48 | M 49 | N 50 | O 51 | P 52 | Q 53 | R 54 | S 55 | T 56 | U 57 | V 58 | W 59 | X 60 | Y 61 | Z 62 | 0гг 63 | 1гг 64 | 2гг 65 | 3гг 66 | 4гг 67 | 5гг 68 | 6гг 69 | 7гг 70 | 8гг 71 | 9гг 72 | 0г 73 | 1г 74 | 2г 75 | 3г 76 | 4г 77 | 5г 78 | 6г 79 | 7г 80 | 8г 81 | 9г 82 | Xвв 83 | Vвв 84 | Iвв 85 | Lвв 86 | Mвв 87 | Cвв 88 | Xв 89 | Vв 90 | Iв 91 | Lв 92 | Mв 93 | Cв 94 | 0м 95 | 1м 96 | 2м 97 | 3м 98 | 4м 99 | 5м 100 | 6м 101 | 7м 102 | 8м 103 | 9м 104 | 0мм 105 | 1мм 106 | 2мм 107 | 3мм 108 | 4мм 109 | 5мм 110 | 6мм 111 | 7мм 112 | 8мм 113 | 9мм 114 | 0см 115 | 1см 116 | 2см 117 | 3см 118 | 4см 119 | 5см 120 | 6см 121 | 7см 122 | 8см 123 | 9см 124 | 0дм 125 | 1дм 126 | 2дм 127 | 3дм 128 | 4дм 129 | 5дм 130 | 6дм 131 | 7дм 132 | 8дм 133 | 9дм 134 | 0л 135 | 1л 136 | 2л 137 | 3л 138 | 4л 139 | 5л 140 | 6л 141 | 7л 142 | 8л 143 | 9л 144 | 0км 145 | 1км 146 | 2км 147 | 3км 148 | 4км 149 | 5км 150 | 6км 151 | 7км 152 | 8км 153 | 9км 154 | 0га 155 | 1га 156 | 2га 157 | 3га 158 | 4га 159 | 5га 160 | 6га 161 | 7га 162 | 8га 163 | 9га 164 | 0кг 165 | 1кг 166 | 2кг 167 | 3кг 168 | 4кг 169 | 5кг 170 | 6кг 171 | 7кг 172 | 8кг 173 | 9кг 174 | 0т 175 | 1т 176 | 2т 177 | 3т 178 | 4т 179 | 5т 180 | 6т 181 | 7т 182 | 8т 183 | 9т 184 | 0г 185 | 1г 186 | 2г 187 | 3г 188 | 4г 189 | 5г 190 | 6г 191 | 7г 192 | 8г 193 | 9г 194 | 0мг 195 | 1мг 196 | 2мг 197 | 3мг 198 | 4мг 199 | 5мг 200 | 6мг 201 | 7мг 202 | 8мг 203 | 9мг 204 | бульв 205 | в 206 | вв 207 | г 208 | га 209 | гг 210 | гл 211 | гос 212 | д 213 | дм 214 | доп 215 | др 216 | е 217 | ед 218 | ед 219 | зам 220 | и 221 | инд 222 | исп 223 | Исп 224 | к 225 | кап 226 | кг 227 | кв 228 | кл 229 | км 230 | кол 231 | комн 232 | коп 233 | куб 234 | л 235 | лиц 236 | лл 237 | м 238 | макс 239 | мг 240 | мин 241 | мл 242 | млн 243 | млрд 244 | мм 245 | н 246 | наб 247 | нач 248 | неуд 249 | ном 250 | о 251 | обл 252 | обр 253 | общ 254 | ок 255 | ост 256 | отл 257 | п 258 | пер 259 | перераб 260 | пл 261 | пос 262 | пр 263 | просп 264 | проф 265 | р 266 | ред 267 | руб 268 | с 269 | сб 270 | св 271 | см 
272 | соч 273 | ср 274 | ст 275 | стр 276 | т 277 | тел 278 | Тел 279 | тех 280 | тт 281 | туп 282 | тыс 283 | уд 284 | ул 285 | уч 286 | физ 287 | х 288 | хор 289 | ч 290 | чел 291 | шт 292 | экз 293 | э 294 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.sk: -------------------------------------------------------------------------------- 1 | Bc 2 | Mgr 3 | RNDr 4 | PharmDr 5 | PhDr 6 | JUDr 7 | PaedDr 8 | ThDr 9 | Ing 10 | MUDr 11 | MDDr 12 | MVDr 13 | Dr 14 | ThLic 15 | PhD 16 | ArtD 17 | ThDr 18 | Dr 19 | DrSc 20 | CSs 21 | prof 22 | obr 23 | Obr 24 | Č 25 | č 26 | absol 27 | adj 28 | admin 29 | adr 30 | Adr 31 | adv 32 | advok 33 | afr 34 | ak 35 | akad 36 | akc 37 | akuz 38 | et 39 | al 40 | alch 41 | amer 42 | anat 43 | angl 44 | Angl 45 | anglosas 46 | anorg 47 | ap 48 | apod 49 | arch 50 | archeol 51 | archit 52 | arg 53 | art 54 | astr 55 | astrol 56 | astron 57 | atp 58 | atď 59 | austr 60 | Austr 61 | aut 62 | belg 63 | Belg 64 | bibl 65 | Bibl 66 | biol 67 | bot 68 | bud 69 | bás 70 | býv 71 | cest 72 | chem 73 | cirk 74 | csl 75 | čs 76 | Čs 77 | dat 78 | dep 79 | det 80 | dial 81 | diaľ 82 | dipl 83 | distrib 84 | dokl 85 | dosl 86 | dopr 87 | dram 88 | duš 89 | dv 90 | dvojčl 91 | dór 92 | ekol 93 | ekon 94 | el 95 | elektr 96 | elektrotech 97 | energet 98 | epic 99 | est 100 | etc 101 | etonym 102 | eufem 103 | európ 104 | Európ 105 | ev 106 | evid 107 | expr 108 | fa 109 | fam 110 | farm 111 | fem 112 | feud 113 | fil 114 | filat 115 | filoz 116 | fi 117 | fon 118 | form 119 | fot 120 | fr 121 | Fr 122 | franc 123 | Franc 124 | fraz 125 | fut 126 | fyz 127 | fyziol 128 | garb 129 | gen 130 | genet 131 | genpor 132 | geod 133 | geogr 134 | geol 135 | geom 136 | germ 137 | gr 138 | Gr 139 | gréc 140 | Gréc 141 | gréckokat 142 | hebr 143 | herald 144 | hist 145 | hlav 146 | hosp 147 | hromad 148 | hud 149 | hypok 150 | ident 151 | i.e 152 | ident 153 | imp 154 | impf 155 | indoeur 156 | inf 157 | inform 158 | instr 159 | int 160 | interj 161 | inšt 162 | inštr 163 | iron 164 | jap 165 | Jap 166 | jaz 167 | jedn 168 | juhoamer 169 | juhových 170 | juhozáp 171 | juž 172 | kanad 173 | Kanad 174 | kanc 175 | kapit 176 | kpt 177 | kart 178 | katastr 179 | knih 180 | kniž 181 | komp 182 | konj 183 | konkr 184 | kozmet 185 | krajč 186 | kresť 187 | kt 188 | kuch 189 | lat 190 | latinskoamer 191 | lek 192 | lex 193 | lingv 194 | lit 195 | litur 196 | log 197 | lok 198 | max 199 | Max 200 | maď 201 | Maď 202 | medzinár 203 | mest 204 | metr 205 | mil 206 | Mil 207 | min 208 | Min 209 | miner 210 | ml 211 | mld 212 | mn 213 | mod 214 | mytol 215 | napr 216 | nar 217 | Nar 218 | nasl 219 | nedok 220 | neg 221 | negat 222 | neklas 223 | nem 224 | Nem 225 | neodb 226 | neos 227 | neskl 228 | nesklon 229 | nespis 230 | nespráv 231 | neved 232 | než 233 | niekt 234 | niž 235 | nom 236 | náb 237 | nákl 238 | námor 239 | nár 240 | obch 241 | obj 242 | obv 243 | obyč 244 | obč 245 | občian 246 | odb 247 | odd 248 | ods 249 | ojed 250 | okr 251 | Okr 252 | opt 253 | opyt 254 | org 255 | os 256 | osob 257 | ot 258 | ovoc 259 | par 260 | part 261 | pejor 262 | pers 263 | pf 264 | Pf 265 | P.f 266 | p.f 267 | pl 268 | Plk 269 | pod 270 | podst 271 | pokl 272 | polit 273 | politol 274 | polygr 275 | pomn 276 | popl 277 | por 278 | porad 279 | porov 280 | posch 281 | potrav 282 | použ 283 | poz 284 | pozit 285 | poľ 286 | poľno 287 | poľnohosp 288 | poľov 289 | pošt 290 | pož 291 | prac 292 | predl 293 | pren 294 | 
prep 295 | preuk 296 | priezv 297 | Priezv 298 | privl 299 | prof 300 | práv 301 | príd 302 | príj 303 | prík 304 | príp 305 | prír 306 | prísl 307 | príslov 308 | príč 309 | psych 310 | publ 311 | pís 312 | písm 313 | pôv 314 | refl 315 | reg 316 | rep 317 | resp 318 | rozk 319 | rozlič 320 | rozpráv 321 | roč 322 | Roč 323 | ryb 324 | rádiotech 325 | rím 326 | samohl 327 | semest 328 | sev 329 | severoamer 330 | severových 331 | severozáp 332 | sg 333 | skr 334 | skup 335 | sl 336 | Sloven 337 | soc 338 | soch 339 | sociol 340 | sp 341 | spol 342 | Spol 343 | spoloč 344 | spoluhl 345 | správ 346 | spôs 347 | st 348 | star 349 | starogréc 350 | starorím 351 | s.r.o 352 | stol 353 | stor 354 | str 355 | stredoamer 356 | stredoškol 357 | subj 358 | subst 359 | superl 360 | sv 361 | sz 362 | súkr 363 | súp 364 | súvzť 365 | tal 366 | Tal 367 | tech 368 | tel 369 | Tel 370 | telef 371 | teles 372 | telev 373 | teol 374 | trans 375 | turist 376 | tuzem 377 | typogr 378 | tzn 379 | tzv 380 | ukaz 381 | ul 382 | Ul 383 | umel 384 | univ 385 | ust 386 | ved 387 | vedľ 388 | verb 389 | veter 390 | vin 391 | viď 392 | vl 393 | vod 394 | vodohosp 395 | pnl 396 | vulg 397 | vyj 398 | vys 399 | vysokoškol 400 | vzťaž 401 | vôb 402 | vých 403 | výd 404 | výrob 405 | výsk 406 | výsl 407 | výtv 408 | výtvar 409 | význ 410 | včel 411 | vš 412 | všeob 413 | zahr 414 | zar 415 | zariad 416 | zast 417 | zastar 418 | zastaráv 419 | zb 420 | zdravot 421 | združ 422 | zjemn 423 | zlat 424 | zn 425 | Zn 426 | zool 427 | zr 428 | zried 429 | zv 430 | záhr 431 | zák 432 | zákl 433 | zám 434 | záp 435 | západoeur 436 | zázn 437 | územ 438 | účt 439 | čast 440 | čes 441 | Čes 442 | čl 443 | čísl 444 | živ 445 | pr 446 | fak 447 | Kr 448 | p.n.l 449 | A 450 | B 451 | C 452 | D 453 | E 454 | F 455 | G 456 | H 457 | I 458 | J 459 | K 460 | L 461 | M 462 | N 463 | O 464 | P 465 | Q 466 | R 467 | S 468 | T 469 | U 470 | V 471 | W 472 | X 473 | Y 474 | Z 475 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.sl: -------------------------------------------------------------------------------- 1 | dr 2 | Dr 3 | itd 4 | itn 5 | št #NUMERIC_ONLY# 6 | Št #NUMERIC_ONLY# 7 | d 8 | jan 9 | Jan 10 | feb 11 | Feb 12 | mar 13 | Mar 14 | apr 15 | Apr 16 | jun 17 | Jun 18 | jul 19 | Jul 20 | avg 21 | Avg 22 | sept 23 | Sept 24 | sep 25 | Sep 26 | okt 27 | Okt 28 | nov 29 | Nov 30 | dec 31 | Dec 32 | tj 33 | Tj 34 | npr 35 | Npr 36 | sl 37 | Sl 38 | op 39 | Op 40 | gl 41 | Gl 42 | oz 43 | Oz 44 | prev 45 | dipl 46 | ing 47 | prim 48 | Prim 49 | cf 50 | Cf 51 | gl 52 | Gl 53 | A 54 | B 55 | C 56 | D 57 | E 58 | F 59 | G 60 | H 61 | I 62 | J 63 | K 64 | L 65 | M 66 | N 67 | O 68 | P 69 | Q 70 | R 71 | S 72 | T 73 | U 74 | V 75 | W 76 | X 77 | Y 78 | Z 79 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.sv: -------------------------------------------------------------------------------- 1 | #single upper case letter are usually initials 2 | A 3 | B 4 | C 5 | D 6 | E 7 | F 8 | G 9 | H 10 | I 11 | J 12 | K 13 | L 14 | M 15 | N 16 | O 17 | P 18 | Q 19 | R 20 | S 21 | T 22 | U 23 | V 24 | W 25 | X 26 | Y 27 | Z 28 | #misc abbreviations 29 | AB 30 | G 31 | VG 32 | dvs 33 | etc 34 | from 35 | iaf 36 | jfr 37 | kl 38 | kr 39 | mao 40 | mfl 41 | mm 42 | osv 43 | pga 44 | tex 45 | tom 46 | vs 47 | -------------------------------------------------------------------------------- 
/data/nonbreaking_prefixes/nonbreaking_prefix.ta: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | அ 7 | ஆ 8 | இ 9 | ஈ 10 | உ 11 | ஊ 12 | எ 13 | ஏ 14 | ஐ 15 | ஒ 16 | ஓ 17 | ஔ 18 | ஃ 19 | க 20 | கா 21 | கி 22 | கீ 23 | கு 24 | கூ 25 | கெ 26 | கே 27 | கை 28 | கொ 29 | கோ 30 | கௌ 31 | க் 32 | ச 33 | சா 34 | சி 35 | சீ 36 | சு 37 | சூ 38 | செ 39 | சே 40 | சை 41 | சொ 42 | சோ 43 | சௌ 44 | ச் 45 | ட 46 | டா 47 | டி 48 | டீ 49 | டு 50 | டூ 51 | டெ 52 | டே 53 | டை 54 | டொ 55 | டோ 56 | டௌ 57 | ட் 58 | த 59 | தா 60 | தி 61 | தீ 62 | து 63 | தூ 64 | தெ 65 | தே 66 | தை 67 | தொ 68 | தோ 69 | தௌ 70 | த் 71 | ப 72 | பா 73 | பி 74 | பீ 75 | பு 76 | பூ 77 | பெ 78 | பே 79 | பை 80 | பொ 81 | போ 82 | பௌ 83 | ப் 84 | ற 85 | றா 86 | றி 87 | றீ 88 | று 89 | றூ 90 | றெ 91 | றே 92 | றை 93 | றொ 94 | றோ 95 | றௌ 96 | ற் 97 | ய 98 | யா 99 | யி 100 | யீ 101 | யு 102 | யூ 103 | யெ 104 | யே 105 | யை 106 | யொ 107 | யோ 108 | யௌ 109 | ய் 110 | ர 111 | ரா 112 | ரி 113 | ரீ 114 | ரு 115 | ரூ 116 | ரெ 117 | ரே 118 | ரை 119 | ரொ 120 | ரோ 121 | ரௌ 122 | ர் 123 | ல 124 | லா 125 | லி 126 | லீ 127 | லு 128 | லூ 129 | லெ 130 | லே 131 | லை 132 | லொ 133 | லோ 134 | லௌ 135 | ல் 136 | வ 137 | வா 138 | வி 139 | வீ 140 | வு 141 | வூ 142 | வெ 143 | வே 144 | வை 145 | வொ 146 | வோ 147 | வௌ 148 | வ் 149 | ள 150 | ளா 151 | ளி 152 | ளீ 153 | ளு 154 | ளூ 155 | ளெ 156 | ளே 157 | ளை 158 | ளொ 159 | ளோ 160 | ளௌ 161 | ள் 162 | ழ 163 | ழா 164 | ழி 165 | ழீ 166 | ழு 167 | ழூ 168 | ழெ 169 | ழே 170 | ழை 171 | ழொ 172 | ழோ 173 | ழௌ 174 | ழ் 175 | ங 176 | ஙா 177 | ஙி 178 | ஙீ 179 | ஙு 180 | ஙூ 181 | ஙெ 182 | ஙே 183 | ஙை 184 | ஙொ 185 | ஙோ 186 | ஙௌ 187 | ங் 188 | ஞ 189 | ஞா 190 | ஞி 191 | ஞீ 192 | ஞு 193 | ஞூ 194 | ஞெ 195 | ஞே 196 | ஞை 197 | ஞொ 198 | ஞோ 199 | ஞௌ 200 | ஞ் 201 | ண 202 | ணா 203 | ணி 204 | ணீ 205 | ணு 206 | ணூ 207 | ணெ 208 | ணே 209 | ணை 210 | ணொ 211 | ணோ 212 | ணௌ 213 | ண் 214 | ந 215 | நா 216 | நி 217 | நீ 218 | நு 219 | நூ 220 | நெ 221 | நே 222 | நை 223 | நொ 224 | நோ 225 | நௌ 226 | ந் 227 | ம 228 | மா 229 | மி 230 | மீ 231 | மு 232 | மூ 233 | மெ 234 | மே 235 | மை 236 | மொ 237 | மோ 238 | மௌ 239 | ம் 240 | ன 241 | னா 242 | னி 243 | னீ 244 | னு 245 | னூ 246 | னெ 247 | னே 248 | னை 249 | னொ 250 | னோ 251 | னௌ 252 | ன் 253 | 254 | 255 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 256 | திரு 257 | திருமதி 258 | வண 259 | கௌரவ 260 | 261 | 262 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 263 | உ.ம் 264 | #கா.ம் 265 | #எ.ம் 266 | 267 | 268 | #Numbers only. These should only induce breaks when followed by a numeric sequence 269 | # add NUMERIC_ONLY after the word for this function 270 | #This case is mostly for the english "No." 
which can either be a sentence of its own, or 271 | #if followed by a number, a non-breaking prefix 272 | No #NUMERIC_ONLY# 273 | Nos 274 | Art #NUMERIC_ONLY# 275 | Nr 276 | pp #NUMERIC_ONLY# 277 | -------------------------------------------------------------------------------- /data/preprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script preprocesses bitext with Byte Pair Encoding for NMT. 3 | # Executes the following steps: 4 | # 1. Tokenize source and target side of bitext 5 | # 2. Learn BPE-codes for both source and target side 6 | # 3. Encode source and target side using the codes learned 7 | # 4. Shuffle bitext for SGD 8 | # 5. Build source and target dictionaries 9 | 10 | if [ "$#" -ne 4 ]; then 11 | echo "" 12 | echo "Usage: $0 src trg path_to_data path_to_subword" 13 | echo "" 14 | exit 1 15 | fi 16 | 17 | # number of merge ops (codes) for bpe 18 | SRC_CODE_SIZE=20000 19 | TRG_CODE_SIZE=20000 20 | 21 | # source language (example: fr) 22 | S=$1 23 | # target language (example: en) 24 | T=$2 25 | 26 | # path to dl4mt/data 27 | P1=$3 28 | 29 | # path to subword NMT scripts (can be downloaded from https://github.com/rsennrich/subword-nmt) 30 | P2=$4 31 | 32 | 33 | # merge all parallel corpora 34 | ./merge.sh $1 $2 $3 35 | 36 | # tokenize training and validation data 37 | perl $P1/tokenizer.perl -threads 5 -l $S < ${P1}/all_${S}-${T}.${S} > ${P1}/all_${S}-${T}.${S}.tok 38 | perl $P1/tokenizer.perl -threads 5 -l $T < ${P1}/all_${S}-${T}.${T} > ${P1}/all_${S}-${T}.${T}.tok 39 | perl $P1/tokenizer.perl -threads 5 -l $S < ${P1}/test2011/newstest2011.${S} > ${P1}/newstest2011.${S}.tok 40 | perl $P1/tokenizer.perl -threads 5 -l $T < ${P1}/test2011/newstest2011.${T} > ${P1}/newstest2011.${T}.tok 41 | 42 | # BPE (use the merge-op sizes configured above) 43 | if [ ! -f "${S}.bpe" ]; then 44 | python $P2/learn_bpe.py -s ${SRC_CODE_SIZE} < all_${S}-${T}.${S}.tok > ${S}.bpe 45 | fi 46 | if [ ! -f "${T}.bpe" ]; then 47 | python $P2/learn_bpe.py -s ${TRG_CODE_SIZE} < all_${S}-${T}.${T}.tok > ${T}.bpe 48 | fi 49 | 50 | # utility function to encode a file with bpe 51 | encode () { 52 | if [ !
-f "$3" ]; then 53 | python $P2/apply_bpe.py -c $1 < $2 > $3 54 | else 55 | echo "$3 exists, pass" 56 | fi 57 | } 58 | 59 | # apply bpe to training data 60 | encode ${S}.bpe ${P1}/all_${S}-${T}.${S}.tok ${P1}/all_${S}-${T}.${S}.tok.bpe 61 | encode ${T}.bpe ${P1}/all_${S}-${T}.${T}.tok ${P1}/all_${S}-${T}.${T}.tok.bpe 62 | encode ${S}.bpe ${P1}/newstest2011.${S}.tok ${P1}/newstest2011.${S}.tok.bpe 63 | encode ${T}.bpe ${P1}/newstest2011.${T}.tok ${P1}/newstest2011.${T}.tok.bpe 64 | 65 | # shuffle 66 | python $P1/shuffle.py all_${S}-${T}.${S}.tok.bpe all_${S}-${T}.${T}.tok.bpe 67 | 68 | # build dictionary 69 | python $P1/build_dictionary.py all_${S}-${T}.${S}.tok.bpe 70 | python $P1/build_dictionary.py all_${S}-${T}.${T}.tok.bpe 71 | 72 | -------------------------------------------------------------------------------- /data/scan_example.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import theano 3 | 4 | from theano import tensor 5 | 6 | 7 | # some numbers 8 | n_steps = 10 9 | n_samples = 5 10 | dim = 10 11 | input_dim = 20 12 | output_dim = 2 13 | 14 | 15 | # one step function that will be used by scan 16 | def oneStep(x_t, h_tm1, W_x, W_h, W_o): 17 | 18 | h_t = tensor.tanh(tensor.dot(x_t, W_x) + 19 | tensor.dot(h_tm1, W_h)) 20 | o_t = tensor.dot(h_t, W_o) 21 | 22 | return h_t, o_t 23 | 24 | # spawn theano tensor variable, our symbolic input 25 | # a 3D tensor (n_steps, n_samples, dim) 26 | x = tensor.tensor3(dtype='float32') 27 | 28 | # initial state of our rnn 29 | init_state = tensor.alloc(0., n_samples, dim) 30 | 31 | # create parameters that we will use, 32 | # note that, parameters are theano shared variables 33 | 34 | # parameters for input to hidden states 35 | W_x_ = numpy.random.randn(input_dim, dim).astype('float32') 36 | W_x = theano.shared(W_x_) 37 | 38 | # parameters for hidden state transition 39 | W_h_ = numpy.random.randn(dim, dim).astype('float32') 40 | W_h = theano.shared(W_h_) 41 | 42 | # parameters from hidden state to output 43 | W_o_ = numpy.random.randn(dim, output_dim).astype('float32') 44 | W_o = theano.shared(W_o_) 45 | 46 | # scan function 47 | ([h_vals, o_vals], updates) = theano.scan( 48 | fn=oneStep, 49 | sequences=[x], 50 | outputs_info=[init_state, None], 51 | non_sequences=[W_x, W_h, W_o], 52 | n_steps=n_steps, 53 | strict=True) 54 | 55 | # let us now compile a function to get the output 56 | f = theano.function([x], [h_vals, o_vals]) 57 | 58 | # now we will call the compiled function with actual input 59 | actual_input = numpy.random.randn( 60 | n_steps, n_samples, input_dim).astype('float32') 61 | h_vals_, o_vals_ = f(actual_input) 62 | 63 | # print the shapes 64 | print 'shape of input :', actual_input.shape 65 | print 'shape of h_vals:', h_vals_.shape 66 | print 'shape of o_vals:', o_vals_.shape 67 | -------------------------------------------------------------------------------- /data/setup_cluster_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script sets up development and data environments for 3 | # fionn cluster, copy under your home directory and run. 
4 | 5 | # this file is for the dependencies 6 | LOCAL_INSTALL_FILE=/ichec/work/dl4mt_data/local_install.tgz 7 | 8 | # code directory for cloned repositories 9 | CODE_DIR=${HOME}/codes/dl4mt-material 10 | 11 | # code repository 12 | CODE_CENTRAL=https://github.com/kyunghyuncho/dl4mt-material 13 | 14 | # reference files directory 15 | REF_DATA_DIR=/ichec/work/dl4mt_data/nec_files 16 | 17 | # our input files will reside here 18 | DATA_DIR=${HOME}/data 19 | 20 | # our trained models will be saved here 21 | MODELS_DIR=${HOME}/models 22 | 23 | # theano repository 24 | THEANO_GIT=https://github.com/Theano/Theano.git 25 | 26 | # theano install dir 27 | THEANO_DIR=${HOME}/repo/Theano 28 | 29 | # move to home directory 30 | cd 31 | 32 | # copy dependency file to your local machine and extract 33 | echo "Copying and extracting dependency file" 34 | rsync --bwlimit=20000 -Pavz ${LOCAL_INSTALL_FILE} ${HOME} 35 | tar zxvf ${HOME}/local_install.tgz 36 | 37 | # clone the repository from github into code directory 38 | echo "Cloning lab repository" 39 | if [ ! -d "${CODE_DIR}" ]; then 40 | mkdir -p ${CODE_DIR} 41 | fi 42 | git clone ${CODE_CENTRAL} ${CODE_DIR} 43 | 44 | # copy corpora, dictionaries etc for training and dev 45 | echo "Copying data" 46 | if [ ! -d "${DATA_DIR}" ]; then 47 | mkdir -p ${DATA_DIR} 48 | fi 49 | rsync --bwlimit=20000 -Pavz ${REF_DATA_DIR}/all.* ${DATA_DIR} 50 | rsync --bwlimit=20000 -Pavz ${REF_DATA_DIR}/news* ${DATA_DIR} 51 | 52 | # create model output directory if it does not exist 53 | if [ ! -d "${MODELS_DIR}" ]; then 54 | mkdir -p ${MODELS_DIR} 55 | fi 56 | 57 | # clone and install Theano 58 | echo "Cloning/installing Theano" 59 | mkdir -p ${THEANO_DIR} 60 | git clone ${THEANO_GIT} ${THEANO_DIR} 61 | cd ${THEANO_DIR} 62 | python setup.py install --user 63 | 64 | # check if theano is working 65 | python -c "import theano;print 'theano available!'" 66 | 67 | -------------------------------------------------------------------------------- /data/setup_local_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script sets up development and data environments for 3 | # a local machine, copy under your home directory and run. 4 | # Note that Theano is NOT installed by this script. 5 | # To use Byte Pair Encoding, simply pass the -b argument. 6 | 7 | BPE=false 8 | 9 | while getopts ':b' flag; do 10 | case "${flag}" in 11 | b) BPE=true 12 | echo "Using Byte Pair Encoding" ;; 13 | *) 14 | echo "" 15 | echo "Usage: $0 [-b]" 16 | echo "" 17 | exit 1 ;; 18 | esac 19 | done 20 | 21 | 22 | # code directory for cloned repositories 23 | CODE_DIR=${HOME}/git/dl4mt-tutorial 24 | 25 | # code repository 26 | CODE_CENTRAL=https://github.com/kyunghyuncho/dl4mt-tutorial 27 | 28 | # our input files will reside here 29 | DATA_DIR=${CODE_DIR}/data 30 | 31 | # our trained models will be saved here 32 | MODELS_DIR=${HOME}/models 33 | 34 | 35 | # clone the repository from github into code directory 36 | if [ ! -d "${CODE_DIR}" ]; then 37 | echo "Cloning central ..."
38 | mkdir -p ${CODE_DIR} 39 | git clone ${CODE_CENTRAL} ${CODE_DIR} 40 | fi 41 | 42 | # download the europarl v7 and validation sets and extract 43 | python ${CODE_DIR}/data/download_files.py \ 44 | -s='fr' -t='en' \ 45 | --source-dev=newstest2011.fr \ 46 | --target-dev=newstest2011.en \ 47 | --outdir=${DATA_DIR} 48 | 49 | if [ "$BPE" = true ] ; then 50 | 51 | BPE_DIR=${HOME}/codes/subword-nmt 52 | BPE_CENTRAL=https://github.com/rsennrich/subword-nmt 53 | 54 | # clone subword-nmt repository 55 | if [ ! -d "${BPE_DIR}" ]; then 56 | echo "Cloning BPE central ..." 57 | mkdir -p ${BPE_DIR} 58 | git clone ${BPE_CENTRAL} ${BPE_DIR} 59 | fi 60 | 61 | # follow the preprocessing pipeline for BPE 62 | ./preprocess.sh 'fr' 'en' ${DATA_DIR} ${BPE_DIR} 63 | 64 | else 65 | 66 | # tokenize corresponding files 67 | perl ${CODE_DIR}/data/tokenizer.perl -l 'fr' < ${DATA_DIR}/test2011/newstest2011.fr > ${DATA_DIR}/newstest2011.fr.tok 68 | perl ${CODE_DIR}/data/tokenizer.perl -l 'en' < ${DATA_DIR}/test2011/newstest2011.en > ${DATA_DIR}/newstest2011.en.tok 69 | perl ${CODE_DIR}/data/tokenizer.perl -l 'fr' < ${DATA_DIR}/europarl-v7.fr-en.fr > ${DATA_DIR}/europarl-v7.fr-en.fr.tok 70 | perl ${CODE_DIR}/data/tokenizer.perl -l 'en' < ${DATA_DIR}/europarl-v7.fr-en.en > ${DATA_DIR}/europarl-v7.fr-en.en.tok 71 | 72 | # extract dictionaries 73 | python ${CODE_DIR}/data/build_dictionary.py ${DATA_DIR}/europarl-v7.fr-en.fr.tok 74 | python ${CODE_DIR}/data/build_dictionary.py ${DATA_DIR}/europarl-v7.fr-en.en.tok 75 | 76 | # shuffle training data 77 | python ${CODE_DIR}/data/shuffle.py ${DATA_DIR}/europarl-v7.fr-en.en.tok ${DATA_DIR}/europarl-v7.fr-en.fr.tok 78 | fi 79 | 80 | # create model output directory if it does not exist 81 | if [ ! -d "${MODELS_DIR}" ]; then 82 | mkdir -p ${MODELS_DIR} 83 | fi 84 | 85 | # check if theano is working 86 | python -c "import theano;print 'theano available!'" 87 | -------------------------------------------------------------------------------- /data/shuffle.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import random 4 | 5 | from tempfile import mkstemp 6 | from subprocess import call 7 | 8 | 9 | 10 | def main(files): 11 | 12 | tf_os, tpath = mkstemp() 13 | tf = open(tpath, 'w') 14 | 15 | fds = [open(ff) for ff in files] 16 | 17 | for l in fds[0]: 18 | lines = [l.strip()] + [ff.readline().strip() for ff in fds[1:]] 19 | print >>tf, "|||".join(lines) 20 | 21 | [ff.close() for ff in fds] 22 | tf.close() 23 | 24 | tf = open(tpath, 'r') 25 | lines = tf.readlines() 26 | random.shuffle(lines) 27 | 28 | fds = [open(ff+'.shuf','w') for ff in files] 29 | 30 | for l in lines: 31 | s = l.strip().split('|||') 32 | for ii, fd in enumerate(fds): 33 | print >>fd, s[ii] 34 | 35 | [ff.close() for ff in fds] 36 | 37 | os.remove(tpath) 38 | 39 | if __name__ == '__main__': 40 | main(sys.argv[1:]) 41 | 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /data/strip_sgml.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import re 3 | 4 | 5 | def main(): 6 | fin = sys.stdin 7 | fout = sys.stdout 8 | for l in fin: 9 | line = l.strip() 10 | text = re.sub('<[^<]+>', "", line).strip() 11 | if len(text) == 0: 12 | continue 13 | print >>fout, text 14 | 15 | 16 | if __name__ == "__main__": 17 | main() 18 | 19 | -------------------------------------------------------------------------------- /data/tokenize_all.sh:
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for F in `ls ./training/* | grep -v pkl | grep -v tok` 4 | do 5 | echo "perl ./tokenizer.perl -l ${F:(-2)} < $F > $F.tok" 6 | perl ./tokenizer.perl -l ${F:(-2)} < $F > $F.tok 7 | done 8 | 9 | for F in `ls ./dev/*.?? | grep -v tok` 10 | do 11 | echo "perl ./tokenizer.perl -l ${F:(-2)} < $F > $F.tok" 12 | perl ./tokenizer.perl -l ${F:(-2)} < $F > $F.tok 13 | done 14 | -------------------------------------------------------------------------------- /data/tokenizer.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 5 | 6 | use warnings; 7 | 8 | # Sample Tokenizer 9 | ### Version 1.1 10 | # written by Pidong Wang, based on the code written by Josh Schroeder and Philipp Koehn 11 | # Version 1.1 updates: 12 | # (1) add multithreading option "-threads NUM_THREADS" (default is 1); 13 | # (2) add a timing option "-time" to calculate the average speed of this tokenizer; 14 | # (3) add an option "-lines NUM_SENTENCES_PER_THREAD" to set the number of lines for each thread (default is 2000), and this option controls the memory amount needed: the larger this number is, the larger memory is required (the higher tokenization speed); 15 | ### Version 1.0 16 | # $Id: tokenizer.perl 915 2009-08-10 08:15:49Z philipp $ 17 | # written by Josh Schroeder, based on code by Philipp Koehn 18 | 19 | binmode(STDIN, ":utf8"); 20 | binmode(STDOUT, ":utf8"); 21 | 22 | use warnings; 23 | use FindBin qw($RealBin); 24 | use strict; 25 | use Time::HiRes; 26 | 27 | if (eval {require Thread;1;}) { 28 | #module loaded 29 | Thread->import(); 30 | } 31 | 32 | my $mydir = "$RealBin/nonbreaking_prefixes"; 33 | 34 | my %NONBREAKING_PREFIX = (); 35 | my @protected_patterns = (); 36 | my $protected_patterns_file = ""; 37 | my $language = "en"; 38 | my $QUIET = 0; 39 | my $HELP = 0; 40 | my $AGGRESSIVE = 0; 41 | my $SKIP_XML = 0; 42 | my $TIMING = 0; 43 | my $NUM_THREADS = 1; 44 | my $NUM_SENTENCES_PER_THREAD = 2000; 45 | my $PENN = 0; 46 | my $NO_ESCAPING = 0; 47 | while (@ARGV) 48 | { 49 | $_ = shift; 50 | /^-b$/ && ($| = 1, next); 51 | /^-l$/ && ($language = shift, next); 52 | /^-q$/ && ($QUIET = 1, next); 53 | /^-h$/ && ($HELP = 1, next); 54 | /^-x$/ && ($SKIP_XML = 1, next); 55 | /^-a$/ && ($AGGRESSIVE = 1, next); 56 | /^-time$/ && ($TIMING = 1, next); 57 | # Option to add list of regexps to be protected 58 | /^-protected/ && ($protected_patterns_file = shift, next); 59 | /^-threads$/ && ($NUM_THREADS = int(shift), next); 60 | /^-lines$/ && ($NUM_SENTENCES_PER_THREAD = int(shift), next); 61 | /^-penn$/ && ($PENN = 1, next); 62 | /^-no-escape/ && ($NO_ESCAPING = 1, next); 63 | } 64 | 65 | # for time calculation 66 | my $start_time; 67 | if ($TIMING) 68 | { 69 | $start_time = [ Time::HiRes::gettimeofday( ) ]; 70 | } 71 | 72 | # print help message 73 | if ($HELP) 74 | { 75 | print "Usage ./tokenizer.perl (-l [en|de|...]) (-threads 4) < textfile > tokenizedfile\n"; 76 | print "Options:\n"; 77 | print " -q ... quiet.\n"; 78 | print " -a ... aggressive hyphen splitting.\n"; 79 | print " -b ... disable Perl buffering.\n"; 80 | print " -time ... enable processing time calculation.\n"; 81 | print " -penn ... use Penn treebank-like tokenization.\n"; 82 | print " -protected FILE ... 
specify file with patterns to be protected in tokenisation.\n";
83 | print " -no-escape ... don't perform HTML escaping on apostrophe, quotes, etc.\n";
84 | exit;
85 | }
86 | 
87 | if (!$QUIET)
88 | {
89 | print STDERR "Tokenizer Version 1.1\n";
90 | print STDERR "Language: $language\n";
91 | print STDERR "Number of threads: $NUM_THREADS\n";
92 | }
93 | 
94 | # load the language-specific non-breaking prefix info from files in the directory nonbreaking_prefixes
95 | load_prefixes($language,\%NONBREAKING_PREFIX);
96 | 
97 | if (scalar(%NONBREAKING_PREFIX) eq 0)
98 | {
99 | print STDERR "Warning: No known abbreviations for language '$language'\n";
100 | }
101 | 
102 | # Load protected patterns
103 | if ($protected_patterns_file)
104 | {
105 | open(PP,$protected_patterns_file) || die "Unable to open $protected_patterns_file";
106 | while(<PP>) {
107 | chomp;
108 | push @protected_patterns, $_;
109 | }
110 | }
111 | 
112 | my @batch_sentences = ();
113 | my @thread_list = ();
114 | my $count_sentences = 0;
115 | 
116 | if ($NUM_THREADS > 1)
117 | {# multi-threading tokenization
118 | while(<STDIN>)
119 | {
120 | $count_sentences = $count_sentences + 1;
121 | push(@batch_sentences, $_);
122 | if (scalar(@batch_sentences)>=($NUM_SENTENCES_PER_THREAD*$NUM_THREADS))
123 | {
124 | # assign each thread work
125 | for (my $i=0; $i<$NUM_THREADS; $i++)
126 | {
127 | my $start_index = $i*$NUM_SENTENCES_PER_THREAD;
128 | my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1;
129 | my @subbatch_sentences = @batch_sentences[$start_index..$end_index];
130 | my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences;
131 | push(@thread_list, $new_thread);
132 | }
133 | foreach (@thread_list)
134 | {
135 | my $tokenized_list = $_->join;
136 | foreach (@$tokenized_list)
137 | {
138 | print $_;
139 | }
140 | }
141 | # reset for the new run
142 | @thread_list = ();
143 | @batch_sentences = ();
144 | }
145 | }
146 | # the last batch
147 | if (scalar(@batch_sentences)>0)
148 | {
149 | # assign each thread work
150 | for (my $i=0; $i<$NUM_THREADS; $i++)
151 | {
152 | my $start_index = $i*$NUM_SENTENCES_PER_THREAD;
153 | if ($start_index >= scalar(@batch_sentences))
154 | {
155 | last;
156 | }
157 | my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1;
158 | if ($end_index >= scalar(@batch_sentences))
159 | {
160 | $end_index = scalar(@batch_sentences)-1;
161 | }
162 | my @subbatch_sentences = @batch_sentences[$start_index..$end_index];
163 | my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences;
164 | push(@thread_list, $new_thread);
165 | }
166 | foreach (@thread_list)
167 | {
168 | my $tokenized_list = $_->join;
169 | foreach (@$tokenized_list)
170 | {
171 | print $_;
172 | }
173 | }
174 | }
175 | }
176 | else
177 | {# single thread only
178 | while(<STDIN>)
179 | {
180 | if (($SKIP_XML && /^<.+>$/) || /^\s*$/)
181 | {
182 | #don't try to tokenize XML/HTML tag lines
183 | print $_;
184 | }
185 | else
186 | {
187 | print &tokenize($_);
188 | }
189 | }
190 | }
191 | 
192 | if ($TIMING)
193 | {
194 | my $duration = Time::HiRes::tv_interval( $start_time );
195 | print STDERR ("TOTAL EXECUTION TIME: ".$duration."\n");
196 | print STDERR ("TOKENIZATION SPEED: ".($duration/$count_sentences*1000)."
milliseconds/line\n");
197 | }
198 | 
199 | #####################################################################################
200 | # subroutines afterward
201 | 
202 | # tokenize a batch of texts saved in an array
203 | # input: an array containing a batch of texts
204 | # return: another array containing a batch of tokenized texts for the input array
205 | sub tokenize_batch
206 | {
207 | my(@text_list) = @_;
208 | my(@tokenized_list) = ();
209 | foreach (@text_list)
210 | {
211 | if (($SKIP_XML && /^<.+>$/) || /^\s*$/)
212 | {
213 | #don't try to tokenize XML/HTML tag lines
214 | push(@tokenized_list, $_);
215 | }
216 | else
217 | {
218 | push(@tokenized_list, &tokenize($_));
219 | }
220 | }
221 | return \@tokenized_list;
222 | }
223 | 
224 | # the actual tokenize function which tokenizes one input string
225 | # input: one string
226 | # return: the tokenized string for the input string
227 | sub tokenize
228 | {
229 | my($text) = @_;
230 | 
231 | if ($PENN) {
232 | return tokenize_penn($text);
233 | }
234 | 
235 | chomp($text);
236 | $text = " $text ";
237 | 
238 | # remove ASCII junk
239 | $text =~ s/\s+/ /g;
240 | $text =~ s/[\000-\037]//g;
241 | 
242 | # Find protected patterns
243 | my @protected = ();
244 | foreach my $protected_pattern (@protected_patterns) {
245 | my $t = $text;
246 | while ($t =~ /($protected_pattern)(.*)$/) {
247 | push @protected, $1;
248 | $t = $2;
249 | }
250 | }
251 | 
252 | for (my $i = 0; $i < scalar(@protected); ++$i) {
253 | my $subst = sprintf("THISISPROTECTED%.3d", $i);
254 | $text =~ s,\Q$protected[$i], $subst ,g;
255 | }
256 | $text =~ s/ +/ /g;
257 | $text =~ s/^ //g;
258 | $text =~ s/ $//g;
259 | 
260 | # separate out all "other" special characters
261 | $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
262 | 
263 | # aggressive hyphen splitting
264 | if ($AGGRESSIVE)
265 | {
266 | $text =~ s/([\p{IsAlnum}])\-(?=[\p{IsAlnum}])/$1 \@-\@ /g;
267 | }
268 | 
269 | #multi-dots stay together
270 | $text =~ s/\.([\.]+)/ DOTMULTI$1/g;
271 | while($text =~ /DOTMULTI\./)
272 | {
273 | $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g;
274 | $text =~ s/DOTMULTI\./DOTDOTMULTI/g;
275 | }
276 | 
277 | # separate out "," except if within numbers (5,300)
278 | #$text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
279 | 
280 | # separate out "," except if within numbers (5,300)
281 | # previous "global" application skips some: A,B,C,D,E > A , B,C , D,E
282 | # first application uses up B so rule can't see B,C
283 | # two-step version here may create extra spaces but these are removed later
284 | # will also space digit,letter or letter,digit forms (redundant with next section)
285 | $text =~ s/([^\p{IsN}])[,]/$1 , /g;
286 | $text =~ s/[,]([^\p{IsN}])/ , $1/g;
287 | 
288 | # separate , pre and post number
289 | #$text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
290 | #$text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
291 | 
292 | # turn ` into '
293 | #$text =~ s/\`/\'/g;
294 | 
295 | #turn '' into "
296 | #$text =~ s/\'\'/ \" /g;
297 | 
298 | if ($language eq "en")
299 | {
300 | #split contractions right
301 | $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
302 | $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g;
303 | $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
304 | $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g;
305 | #special case for "1990's"
306 | $text =~ s/([\p{IsN}])[']([s])/$1 '$2/g;
307 | }
308 | elsif (($language eq "fr") or ($language eq "it"))
309 | {
310 | #split contractions left
311 | $text =~
s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
312 | $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
313 | $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
314 | $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g;
315 | }
316 | else
317 | {
318 | $text =~ s/\'/ \' /g;
319 | }
320 | 
321 | #word token method
322 | my @words = split(/\s/,$text);
323 | $text = "";
324 | for (my $i=0;$i<(scalar(@words));$i++)
325 | {
326 | my $word = $words[$i];
327 | if ( $word =~ /^(\S+)\.$/)
328 | {
329 | my $pre = $1;
330 | if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/)))
331 | {
332 | #no change
333 | }
334 | elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/)))
335 | {
336 | #no change
337 | }
338 | else
339 | {
340 | $word = $pre." .";
341 | }
342 | }
343 | $text .= $word." ";
344 | }
345 | 
346 | # clean up extraneous spaces
347 | $text =~ s/ +/ /g;
348 | $text =~ s/^ //g;
349 | $text =~ s/ $//g;
350 | 
351 | # .' at end of sentence is missed
352 | $text =~ s/\.\' ?$/ . ' /;
353 | # restore protected
354 | for (my $i = 0; $i < scalar(@protected); ++$i) {
355 | my $subst = sprintf("THISISPROTECTED%.3d", $i);
356 | $text =~ s/$subst/$protected[$i]/g;
357 | }
358 | # restore multi-dots
359 | while($text =~ /DOTDOTMULTI/)
360 | {
361 | $text =~ s/DOTDOTMULTI/DOTMULTI./g;
362 | }
363 | $text =~ s/DOTMULTI/./g;
364 | # escape special chars
365 | if (!$NO_ESCAPING)
366 | {
367 | $text =~ s/\&/\&amp;/g; # escape escape
368 | $text =~ s/\|/\&#124;/g; # factor separator
369 | $text =~ s/\</\&lt;/g; # xml
370 | $text =~ s/\>/\&gt;/g; # xml
371 | $text =~ s/\'/\&apos;/g; # xml
372 | $text =~ s/\"/\&quot;/g; # xml
373 | $text =~ s/\[/\&#91;/g; # syntax non-terminal
374 | $text =~ s/\]/\&#93;/g; # syntax non-terminal
375 | }
376 | 
377 | #ensure final line break
378 | $text .= "\n" unless $text =~ /\n$/;
379 | 
380 | return $text;
381 | }
382 | 
383 | sub tokenize_penn
384 | {
385 | # Improved compatibility with Penn Treebank tokenization. Useful if
386 | # the text is to later be parsed with a PTB-trained parser.
387 | #
388 | # Adapted from Robert MacIntyre's sed script:
389 | # http://www.cis.upenn.edu/~treebank/tokenizer.sed
390 | 
391 | my($text) = @_;
392 | chomp($text);
393 | 
394 | # remove ASCII junk
395 | $text =~ s/\s+/ /g;
396 | $text =~ s/[\000-\037]//g;
397 | 
398 | # attempt to get correct directional quotes
399 | $text =~ s/^``/`` /g;
400 | $text =~ s/^"/`` /g;
401 | $text =~ s/^`([^`])/` $1/g;
402 | $text =~ s/^'/` /g;
403 | $text =~ s/([ ([{<])"/$1 `` /g;
404 | $text =~ s/([ ([{<])``/$1 `` /g;
405 | $text =~ s/([ ([{<])`([^`])/$1 ` $2/g;
406 | $text =~ s/([ ([{<])'/$1 ` /g;
407 | # close quotes handled at end
408 | 
409 | $text =~ s=\.\.\.= _ELLIPSIS_ =g;
410 | 
411 | # separate out "," except if within numbers (5,300)
412 | $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
413 | # separate , pre and post number
414 | $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
415 | $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
416 | 
417 | #$text =~ s=([;:@#\$%&\p{IsSc}])= $1 =g;
418 | $text =~ s=([;:@#\$%&\p{IsSc}\p{IsSo}])= $1 =g;
419 | 
420 | # Separate out intra-token slashes. PTB tokenization doesn't do this, so
421 | # the tokens should be merged prior to parsing with a PTB-trained parser
422 | # (see syntax-hyphen-splitting.perl).
423 | $text =~ s/([\p{IsAlnum}])\/([\p{IsAlnum}])/$1 \@\/\@ $2/g;
424 | 
425 | # Assume sentence tokenization has been done first, so split FINAL periods
426 | # only.
427 | $text =~ s=([^.])([.])([\]\)}>"']*) ?$=$1 $2$3 =g;
428 | # however, we may as well split ALL question marks and exclamation points,
429 | # since they shouldn't have the abbrev.-marker ambiguity problem
430 | $text =~ s=([?!])= $1 =g;
431 | 
432 | # parentheses, brackets, etc.
433 | $text =~ s=([\]\[\(\){}<>])= $1 =g;
434 | $text =~ s/\(/-LRB-/g;
435 | $text =~ s/\)/-RRB-/g;
436 | $text =~ s/\[/-LSB-/g;
437 | $text =~ s/\]/-RSB-/g;
438 | $text =~ s/{/-LCB-/g;
439 | $text =~ s/}/-RCB-/g;
440 | 
441 | $text =~ s=--= -- =g;
442 | 
443 | # First off, add a space to the beginning and end of each line, to reduce
444 | # necessary number of regexps.
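# (The two padding substitutions below add a space at both ends of the line,
#  so the quote/contraction rules that follow can anchor on plain " x " forms
#  instead of needing separate ^ and $ variants of every pattern.)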
445 | $text =~ s=$= =;
446 | $text =~ s=^= =;
447 | 
448 | $text =~ s="= '' =g;
449 | # possessive or close-single-quote
450 | $text =~ s=([^'])' =$1 ' =g;
451 | # as in it's, I'm, we'd
452 | $text =~ s='([sSmMdD]) = '$1 =g;
453 | $text =~ s='ll = 'll =g;
454 | $text =~ s='re = 're =g;
455 | $text =~ s='ve = 've =g;
456 | $text =~ s=n't = n't =g;
457 | $text =~ s='LL = 'LL =g;
458 | $text =~ s='RE = 'RE =g;
459 | $text =~ s='VE = 'VE =g;
460 | $text =~ s=N'T = N'T =g;
461 | 
462 | $text =~ s= ([Cc])annot = $1an not =g;
463 | $text =~ s= ([Dd])'ye = $1' ye =g;
464 | $text =~ s= ([Gg])imme = $1im me =g;
465 | $text =~ s= ([Gg])onna = $1on na =g;
466 | $text =~ s= ([Gg])otta = $1ot ta =g;
467 | $text =~ s= ([Ll])emme = $1em me =g;
468 | $text =~ s= ([Mm])ore'n = $1ore 'n =g;
469 | $text =~ s= '([Tt])is = '$1 is =g;
470 | $text =~ s= '([Tt])was = '$1 was =g;
471 | $text =~ s= ([Ww])anna = $1an na =g;
472 | 
473 | #word token method
474 | my @words = split(/\s/,$text);
475 | $text = "";
476 | for (my $i=0;$i<(scalar(@words));$i++)
477 | {
478 | my $word = $words[$i];
479 | if ( $word =~ /^(\S+)\.$/)
480 | {
481 | my $pre = $1;
482 | if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/)))
483 | {
484 | #no change
485 | }
486 | elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/)))
487 | {
488 | #no change
489 | }
490 | else
491 | {
492 | $word = $pre." .";
493 | }
494 | }
495 | $text .= $word." ";
496 | }
497 | 
498 | # restore ellipses
499 | $text =~ s=_ELLIPSIS_=\.\.\.=g;
500 | 
501 | # clean out extra spaces
502 | $text =~ s=  *= =g;
503 | $text =~ s=^ *==g;
504 | $text =~ s= *$==g;
505 | 
506 | # escape special chars
507 | $text =~ s/\&/\&amp;/g; # escape escape
508 | $text =~ s/\|/\&#124;/g; # factor separator
509 | $text =~ s/\</\&lt;/g; # xml
510 | $text =~ s/\>/\&gt;/g; # xml
511 | $text =~ s/\'/\&apos;/g; # xml
512 | $text =~ s/\"/\&quot;/g; # xml
513 | $text =~ s/\[/\&#91;/g; # syntax non-terminal
514 | $text =~ s/\]/\&#93;/g; # syntax non-terminal
515 | 
516 | #ensure final line break
517 | $text .= "\n" unless $text =~ /\n$/;
518 | 
519 | return $text;
520 | }
521 | 
522 | sub load_prefixes
523 | {
524 | my ($language, $PREFIX_REF) = @_;
525 | 
526 | my $prefixfile = "$mydir/nonbreaking_prefix.$language";
527 | 
528 | #default back to English if we don't have a language-specific prefix file
529 | if (!(-e $prefixfile))
530 | {
531 | $prefixfile = "$mydir/nonbreaking_prefix.en";
532 | print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
533 | die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
534 | }
535 | 
536 | if (-e "$prefixfile")
537 | {
538 | open(PREFIX, "<:utf8", "$prefixfile");
539 | while (<PREFIX>)
540 | {
541 | my $item = $_;
542 | chomp($item);
543 | if (($item) && (substr($item,0,1) ne "#"))
544 | {
545 | if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/)
546 | {
547 | $PREFIX_REF->{$1} = 2;
548 | }
549 | else
550 | {
551 | $PREFIX_REF->{$item} = 1;
552 | }
553 | }
554 | }
555 | close(PREFIX);
556 | }
557 | }
558 | 
--------------------------------------------------------------------------------
/data/translate.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | model=".pretrained/model_wmt15_bpe2k_uni_en-ru.npz"
4 | dict="/misc/kcgscratch1/ChoGroup/junyoung_exp/wmt15/ruen/train/all_ru-en.en.tok.bpe.word.pkl"
5 | dict_rev="/misc/kcgscratch1/ChoGroup/junyoung_exp/wmt15/ruen/train/all_ru-en.ru.tok.bpe.word.pkl"
6 | source="/misc/kcgscratch1/ChoGroup/junyoung_exp/wmt15/ruen/dev/newstest2013-src.en.tok.bpe"
7 | saveto=".translate/standard.trans"
8 | 
9 | python translate_uni.py $model $dict $dict_rev $source $saveto
--------------------------------------------------------------------------------
/data_iterator.py:
--------------------------------------------------------------------------------
1 | import numpy
2 | 
3 | import cPickle as pkl
4 | import gzip
5 | 
6 | 
7 | def fopen(filename, mode='r'):
8 | if filename.endswith('.gz'):
9 | return gzip.open(filename, mode)
10 | return
open(filename, mode)
11 | 
12 | 
13 | class TextIterator:
14 | """Simple Bitext iterator."""
15 | def __init__(self, source, target,
16 | source_dict, target_dict,
17 | batch_size=128,
18 | maxlen=100,
19 | n_words_source=-1,
20 | n_words_target=-1,
21 | cache=10,
22 | eos=False):
23 | 
24 | self.source = fopen(source, 'r')
25 | self.target = fopen(target, 'r')
26 | 
27 | print 'scan the dataset.'
28 | for si, _ in enumerate(self.source):
29 | pass
30 | for ti, _ in enumerate(self.target):
31 | pass
32 | 
33 | self.source.close()
34 | self.target.close()
35 | 
36 | assert si == ti, 'the source and target files must have the same number of lines'
37 | print 'scanned {} lines'.format(si)
38 | 
39 | self.source = fopen(source, 'r')
40 | self.target = fopen(target, 'r')
41 | 
42 | with open(source_dict, 'rb') as f:
43 | self.source_dict = pkl.load(f)
44 | with open(target_dict, 'rb') as f:
45 | self.target_dict = pkl.load(f)
46 | 
47 | self.num = si
48 | self.batch_size = batch_size
49 | self.maxlen = maxlen
50 | 
51 | self.n_words_source = n_words_source
52 | self.n_words_target = n_words_target
53 | 
54 | self.source_buffer = []
55 | self.target_buffer = []
56 | self.k = batch_size * cache  # number of sentence pairs buffered and length-sorted at a time
57 | 
58 | self.end_of_data = False
59 | 
60 | 
61 | 
62 | 
63 | def __iter__(self):
64 | return self
65 | 
66 | def reset(self):
67 | self.source.seek(0)
68 | self.target.seek(0)
69 | 
70 | def next(self):
71 | if self.end_of_data:
72 | self.end_of_data = False
73 | self.reset()
74 | raise StopIteration
75 | 
76 | source = []
77 | target = []
78 | 
79 | # fill buffer, if it's empty
80 | assert len(self.source_buffer) == len(self.target_buffer), 'Buffer size mismatch!'
81 | 
82 | if len(self.source_buffer) == 0:
83 | for k_ in xrange(self.k):
84 | ss = self.source.readline()
85 | if ss == "":
86 | break
87 | tt = self.target.readline()
88 | if tt == "":
89 | break
90 | 
91 | self.source_buffer.append(ss.strip().split())
92 | self.target_buffer.append(tt.strip().split())
93 | 
94 | # sort by target buffer
95 | tlen = numpy.array([len(t) for t in self.target_buffer])
96 | tidx = tlen.argsort()
97 | 
98 | _sbuf = [self.source_buffer[i] for i in tidx]
99 | _tbuf = [self.target_buffer[i] for i in tidx]
100 | 
101 | self.source_buffer = _sbuf
102 | self.target_buffer = _tbuf
103 | 
104 | if len(self.source_buffer) == 0 or len(self.target_buffer) == 0:
105 | self.end_of_data = False
106 | self.reset()
107 | raise StopIteration
108 | 
109 | try:
110 | 
111 | # actual work here
112 | while True:
113 | 
114 | # read from source file and map to word index
115 | try:
116 | ss = self.source_buffer.pop()
117 | except IndexError:
118 | break
119 | ss = [self.source_dict[w] if w in self.source_dict else 1
120 | for w in ss]
121 | if self.n_words_source > 0:
122 | ss = [w if w < self.n_words_source else 1 for w in ss]
123 | 
124 | # read from target file and map to word index
125 | tt = self.target_buffer.pop()
126 | tt = [self.target_dict[w] if w in self.target_dict else 1
127 | for w in tt]
128 | if self.n_words_target > 0:
129 | tt = [w if w < self.n_words_target else 1 for w in tt]
130 | 
131 | if len(ss) > self.maxlen and len(tt) > self.maxlen:
132 | continue
133 | 
134 | source.append(ss)
135 | target.append(tt)
136 | 
137 | if len(source) >= self.batch_size or \
138 | len(target) >= self.batch_size:
139 | break
140 | except IOError:
141 | self.end_of_data = True
142 | 
143 | if len(source) <= 0 or len(target) <= 0:
144 | self.end_of_data = False
145 | self.reset()
146 | raise StopIteration
147 | 
148 | return source, target
149 | 
150 | 
151 | def
iterate(fname, word_dict, n_words):
152 | with open(fname, 'r') as f:
153 | for line in f:
154 | words = line.strip().split()
155 | x = map(lambda w: word_dict[w] if w in word_dict else 1, words)
156 | x = map(lambda ii: ii if ii < n_words else 1, x)
157 | x += [0]  # append EOS (index 0)
158 | yield x
159 | 
160 | 
161 | def check_length(fname):
162 | f = open(fname, 'r')
163 | count = 0
164 | for _ in f:
165 | count += 1
166 | f.close()
167 | return count
--------------------------------------------------------------------------------
/data_iterator.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nyu-dl/dl4mt-simul-trans/392ff3148e944be6fbc475d5285441807902e2e0/data_iterator.pyc
--------------------------------------------------------------------------------
/insepection.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import matplotlib
3 | # matplotlib.use('agg')
4 | import copy
5 | import numpy
6 | import os
7 | import seaborn as sns
8 | import pandas as pd
9 | sns.set(context="paper", font="monospace", style='whitegrid')
10 | from matplotlib import pyplot as plot
11 | from matplotlib import rc
12 | 
13 | rc('font',**{'family':'Verdana', 'weight': 'normal'})
14 | rc('font', size=8)
15 | rc('text', usetex=True)
16 | rc('text.latex',unicode=True)
17 | # each call to rc('text.latex', preamble=...) replaces the previous preamble,
18 | # so the packages have to be passed together in a single string
19 | rc('text.latex',preamble='\usepackage[utf8]{inputenc}'
20 |                          '\usepackage[russian,ngerman]{babel}')
21 | 
22 | matplotlib.rcParams['ytick.labelsize'] = 11
23 | matplotlib.rcParams['xtick.labelsize'] = 11
24 | 
25 | def heatmap(sources, refs, trans, actions, idx, atten=None, savefig=True, name='test', info=None, show=False):
26 | source = [s.strip() for s in sources[idx].decode('utf8').replace('@@', '--').split()] + ['||']
27 | target = ['*'] + [s.strip() for s in trans[idx].decode('utf8').replace('@@', '--').split()] + ['||']
28 | action = actions[idx]
29 | 
30 | 
31 | if atten:
32 | attention = numpy.array(atten[idx])
33 | 
34 | def track(acts, data, annote):
35 | x, y = 0, 0
36 | for a in acts:
37 | x += a
38 | y += 1 - a
39 | # print a, x, y, target[x].encode('utf8')
40 | data[y, x] = 1
41 | annote[y, x] = 'W' if a == 0 else 'C'
42 | 
43 | return data, annote
44 | # print target
45 | 
46 | data = numpy.zeros((len(source), len(target)))
47 | annote = numpy.chararray(data.shape, itemsize=8)
48 | annote[:] = ''
49 | data, annote = track(action, data, annote)
50 | data[0, 0] = 1
51 | annote[0, 0] = 'S'
52 | if atten:
53 | data[:-1, 1:] += attention.T
54 | 
55 | d = pd.DataFrame(data=data, columns=target, index=source)
56 | # p = sns.diverging_palette(220, 10, as_cmap=True)
57 | f, ax = plot.subplots(figsize=(11, 11))
58 | f.set_canvas(plot.gcf().canvas)
59 | g = sns.heatmap(d, ax=ax, annot=annote, fmt='s')
60 | g.xaxis.tick_top()
61 | 
62 | plot.xticks(rotation=90)
63 | plot.yticks(rotation=0)
64 | # plot.show()
65 | if savefig:
66 | if not os.path.exists('.images/C_{}'.format(name)):
67 | os.mkdir('.images/C_{}'.format(name))
68 | 
69 | filename = 'Idx={}||'.format(info['index'])
70 | for w in info:
71 | if w is not 'index':
72 | filename += '.{}={:.2f}'.format(w, float(info[w]))
73 | 
74 | print 'saving...'
75 | f.savefig('.images/C_{}'.format(name) + '/{}'.format(filename) + '.pdf', dpi=100)
76 | if show:
77 | plot.show()
78 | 
79 | print 'plotting done.'
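# close the figure after saving: heatmaps are rendered in batches, and keeping
# every figure open would steadily grow matplotlib's memory use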
80 | plot.close() 81 | 82 | def heatmap2(sources, refs, trans, actions, idx, atten=None, full_atten=None, savefig=True, name='test', info=None, show=False): 83 | source = ['*'] + [s.strip() for s in sources[idx].decode('utf8').replace('@@', '--').split()] + ['||'] 84 | target = ['*'] + [s.strip() for s in trans[idx].decode('utf8').replace('@@', '--').split()] + ['||'] + ['*'] 85 | action = actions[idx] 86 | 87 | flag = 0 88 | if atten: 89 | attention = numpy.array(atten[idx]) 90 | else: 91 | attention = None 92 | 93 | if full_atten: 94 | fullatten = numpy.array(full_atten[idx]) 95 | else: 96 | fullatten = None 97 | 98 | def track(acts, data, annote): 99 | x, y, z = 0, 0, 0 100 | for a in acts: 101 | x += (a == 1) 102 | y += (a == 0) 103 | z += (a == 2) 104 | 105 | # data[y + 1, x] = 1 106 | # data[z, x + 1] = 1 107 | # annote[y, x] = 'W' if a == 0 else 'C' 108 | 109 | return data, annote 110 | # print target 111 | 112 | data = numpy.zeros((len(source), len(target))) 113 | annote = numpy.chararray(data.shape, itemsize=8) 114 | annote[:] = '' 115 | data, annote = track(action, data, annote) 116 | data[1, 0] = 1 117 | 118 | def draw(data_t, ax, attention=None): 119 | 120 | data = copy.copy(data_t) 121 | data[1:-1, 1:-1] += attention.T 122 | d = pd.DataFrame(data=data, columns=target, index=source) 123 | # p = sns.diverging_palette(220, 10, as_cmap=True) 124 | g = sns.heatmap(d, mask=(data==0), square=True, cbar=False, linewidths=0.1, ax=ax, annot=annote, fmt='s') 125 | g.xaxis.tick_top() 126 | 127 | for tick in ax.get_xticklabels(): 128 | tick.set_rotation(90) 129 | for tick in ax.get_yticklabels(): 130 | tick.set_rotation(0) 131 | 132 | ax.grid(True) 133 | f, [ax1, ax2] = plot.subplots(1, 2, figsize=(22, 11)) 134 | f.set_canvas(plot.gcf().canvas) 135 | 136 | draw(data, ax1, attention) 137 | # plot.xticks(rotation=90) 138 | # plot.yticks(rotation=0) 139 | # plot.grid() 140 | 141 | draw(data, ax2, fullatten) 142 | # plot.xticks(rotation=90) 143 | # plot.yticks(rotation=0) 144 | # plot.grid() 145 | 146 | 147 | if savefig: 148 | if not os.path.exists('.images/M_{}'.format(name)): 149 | os.mkdir('.images/M_{}'.format(name)) 150 | 151 | filename = 'Idx={}||'.format(info['index']) 152 | for w in info: 153 | if w is not 'index': 154 | filename += '.{}={:.2f}'.format(w, float(info[w])) 155 | 156 | # print 'saving...' 157 | plot.savefig('.images/M_{}'.format(name) + '/{}'.format(filename) + '.pdf', dpi=100) 158 | 159 | if show: 160 | plot.show() 161 | 162 | # print 'plotting done.' 163 | plot.close() 164 | 165 | 166 | 167 | 168 | 169 | 170 | def visualize(sources, refs, trans, aligns, idx, savefig=True, name='test', info=None): 171 | 172 | colors = ['b', 'g'] 173 | 174 | fig = plot.figure(figsize=(20, 2)) 175 | ax = plot.gca() 176 | 177 | # plot.hold('on') 178 | 179 | plot.xlim([0., 10.]) 180 | 181 | scolors = [] 182 | caidx = 0 183 | coloridx = 0 184 | for sidx in xrange(len([s_.replace('@@', '--').strip() for s_ in sources[idx].split()] + [''])): 185 | if caidx >= len(numpy.unique(aligns[idx])) or sidx >= numpy.unique(aligns[idx])[caidx]: 186 | caidx = caidx + 1 187 | coloridx = 1 - coloridx 188 | scolors.append(colors[coloridx]) 189 | 190 | tcolors = [] 191 | lastidx = -1 192 | coloridx = 1 193 | for tt in aligns[idx]: 194 | if tt != lastidx: 195 | lastidx = tt 196 | coloridx = 1 - coloridx 197 | tcolors.append(colors[coloridx]) 198 | 199 | x, y = 0., 1. 
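# lay the source words out along y = 1.0 and the translation along y = 0.95,
# recording each word's bounding box so the alignment lines drawn below can
# connect a source word to the span of target words it produced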
200 | s_pos = [(x, y)]
201 | for ii, ss in enumerate([s_.replace('@@', '--').strip() for s_ in sources[idx].split()] + ['']):
202 | 
203 | ss = ss.replace('%', '\%')  # escape '%' so usetex does not treat it as a comment
204 | xx = plot.text(x, y, ss)
205 | xx.set_bbox(dict(color=scolors[ii], alpha=0.1, edgecolor=scolors[ii]))
206 | xx._renderer = fig.canvas.get_renderer()
207 | wext = xx.get_window_extent()
208 | bbox = ax.transData.inverted().transform(wext)
209 | x = bbox[1, 0] + 0.
210 | s_pos.append((x, y))
211 | s_pos.append((bbox[1, 0], y))
212 | 
213 | x, y = 0., .95
214 | t_pos = []
215 | for ii, ss in enumerate([s_.decode('utf8').replace('@@', '--') for s_ in trans[idx].split()]):
216 | 
217 | ss = ss.replace('%', '\%')  # escape '%' so usetex does not treat it as a comment
218 | xx = plot.text(x, y, ss)
219 | xx._renderer = fig.canvas.get_renderer()
220 | wext = xx.get_window_extent()
221 | bbox = ax.transData.inverted().transform(wext)
222 | t_pos.append((bbox[0, 0], bbox[0, 1] + 0.03))
223 | x = bbox[1, 0] + 0.
224 | t_pos.append((bbox[1, 0], bbox[0, 1] + 0.03))
225 | 
226 | lasttidx = 0
227 | lastidx = -1
228 | for tidx, sidx in enumerate(aligns[idx]):
229 | if lastidx != sidx:
230 | lastidx = sidx
231 | lasttidx = tidx
232 | sidx = numpy.minimum(sidx, len(s_pos) - 1)
233 | plot.arrow(s_pos[sidx][0], s_pos[sidx][1],
234 | t_pos[tidx][0] - s_pos[sidx][0],
235 | t_pos[tidx][1] - s_pos[sidx][1],
236 | head_width=0., head_length=0.,
237 | fc=tcolors[tidx], ec=tcolors[tidx],
238 | linestyle='dotted', width=0.0001)
239 | for tt in xrange(tidx, len(aligns[idx])):
240 | if aligns[idx][tt] != sidx:
241 | plot.arrow(s_pos[sidx][0], s_pos[sidx][1],
242 | t_pos[tt][0] - s_pos[sidx][0],
243 | t_pos[tt][1] - s_pos[sidx][1],
244 | head_width=0., head_length=0.,
245 | fc=tcolors[tidx], ec=tcolors[tidx],
246 | linestyle='dotted', width=0.0001)
247 | plot.fill_between([t_pos[tidx][0], s_pos[sidx][0], t_pos[tt][0]],
248 | [t_pos[tidx][1], s_pos[sidx][1], t_pos[tt][1]],
249 | facecolor=tcolors[tidx], alpha=0.1)
250 | break
251 | plot.arrow(s_pos[sidx][0], s_pos[sidx][1],
252 | t_pos[-1][0] - s_pos[sidx][0],
253 | t_pos[-1][1] - s_pos[sidx][1],
254 | head_width=0., head_length=0.,
255 | fc=tcolors[-1], ec=tcolors[-1],
256 | linestyle='dotted', width=0.0001)
257 | plot.fill_between([t_pos[lasttidx][0], s_pos[sidx][0], t_pos[-1][0]],
258 | [t_pos[lasttidx][1], s_pos[sidx][1], t_pos[-1][1]],
259 | facecolor=tcolors[tidx], alpha=0.1)
260 | 
261 | # plot.hold('off')
262 | 
263 | plot.axis('off')
264 | plot.ylim([0.95, 1.01])
265 | plot.tight_layout()
266 | 
267 | if savefig:
268 | if not os.path.exists('.images/{}'.format(name)):
269 | os.mkdir('.images/{}'.format(name))
270 | 
271 | filename = 'Idx={}||'.format(info['index'])
272 | for w in info:
273 | if w is not 'index':
274 | filename += '.{}={:.2f}'.format(w, float(info[w]))
275 | 
276 | plot.savefig('.images/{}'.format(name) + '/{}'.format(filename) + '.pdf', dpi=300)
277 | 
278 | print 'plotting done.'
279 | plot.close() 280 | # plot.show() 281 | 282 | 283 | if __name__ == "__main__": 284 | 285 | sources = ['I cannot understand .'] 286 | targets = ['Ich verstehe nicht .'] 287 | actions = [[0, 0, 1, 1, 2, 0, 1, 2, 2, 0, 1]] 288 | heatmap2(sources, targets, targets, actions, 0, savefig=False, show=True) 289 | -------------------------------------------------------------------------------- /insepection.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nyu-dl/dl4mt-simul-trans/392ff3148e944be6fbc475d5285441807902e2e0/insepection.pyc -------------------------------------------------------------------------------- /itchat.pkl: -------------------------------------------------------------------------------- 1 | (dp0 2 | S'cookies' 3 | p1 4 | (dp2 5 | S'webwx_data_ticket' 6 | p3 7 | S'gSdzU7D7VCmK6kLYm/REsyf8' 8 | p4 9 | sS'wxuin' 10 | p5 11 | S'1059617351' 12 | p6 13 | sS'webwxuvid' 14 | p7 15 | S'7e7876624bedc284f3184c1f6790bab3fb32382982efcab8e8e342feffde08084d7eacd58ec0dd5b5934762d984ea238' 16 | p8 17 | sS'webwx_auth_ticket' 18 | p9 19 | S'CIsBEIHz4qwOGoAB6e/2gBxNrbvLSYvZ6sEa7pcl65diPjyZ2lDKbWoj6R1hg1cyC3eMtluSIcwockeE1rFtthBYz0fgcSK9CKijLujMJxe+V9SAtUxLxdZDUdN/QHJgDAa6zTkGYu+lwz7sXk6T0LmWCSzbGohUtURcm9PybIL/9mUkTldZR3Y8S0Q=' 20 | p10 21 | sS'wxloadtime' 22 | p11 23 | S'1488957301_expired' 24 | p12 25 | sS'wxpluginkey' 26 | p13 27 | S'1488934262' 28 | p14 29 | sS'wxsid' 30 | p15 31 | S'f7PxyPc1Cfip7gPz' 32 | p16 33 | sS'mm_lang' 34 | p17 35 | S'zh_CN' 36 | p18 37 | ssS'version' 38 | p19 39 | S'1.2.27' 40 | p20 41 | sS'storage' 42 | p21 43 | (dp22 44 | S'userName' 45 | p23 46 | V@81d220b3ed273e9d1a9d0bd871cc198f5999208926fda248aa8006daa5ffffb0 47 | p24 48 | sS'lastInputUserName' 49 | p25 50 | NsS'memberList' 51 | p26 52 | (lp27 53 | (dp28 54 | S'UserName' 55 | p29 56 | g24 57 | sS'City' 58 | p30 59 | S'' 60 | p31 61 | sS'DisplayName' 62 | p32 63 | g31 64 | sS'UniFriend' 65 | p33 66 | I0 67 | sS'OwnerUin' 68 | p34 69 | I0 70 | sS'MemberList' 71 | p35 72 | (lp36 73 | sS'PYQuanPin' 74 | p37 75 | V 76 | p38 77 | sS'RemarkPYInitial' 78 | p39 79 | g38 80 | sS'Uin' 81 | p40 82 | I1059617351 83 | sS'AppAccountFlag' 84 | p41 85 | I0 86 | sS'VerifyFlag' 87 | p42 88 | I0 89 | sS'Province' 90 | p43 91 | g31 92 | sS'KeyWord' 93 | p44 94 | g31 95 | sS'RemarkName' 96 | p45 97 | g38 98 | sS'PYInitial' 99 | p46 100 | g38 101 | sS'ChatRoomId' 102 | p47 103 | I0 104 | sS'HideInputBarFlag' 105 | p48 106 | I0 107 | sVHeadImgFlag 108 | p49 109 | I1 110 | sS'EncryChatRoomId' 111 | p50 112 | g31 113 | sS'AttrStatus' 114 | p51 115 | I0 116 | sS'SnsFlag' 117 | p52 118 | I0 119 | sS'MemberCount' 120 | p53 121 | I0 122 | sVWebWxPluginSwitch 123 | p54 124 | I0 125 | sS'Alias' 126 | p55 127 | g31 128 | sS'Signature' 129 | p56 130 | g38 131 | sS'ContactFlag' 132 | p57 133 | I0 134 | sS'NickName' 135 | p58 136 | VCoral 137 | p59 138 | sS'RemarkPYQuanPin' 139 | p60 140 | g38 141 | sS'HeadImgUrl' 142 | p61 143 | V/cgi-bin/mmwebwx-bin/webwxgeticon?seq=1782193440&username=@81d220b3ed273e9d1a9d0bd871cc198f5999208926fda248aa8006daa5ffffb0&skey=@crypt_4c00d0e1_bf264d5f643dcc22a8b1701b72f216ca 144 | p62 145 | sS'Sex' 146 | p63 147 | I0 148 | sS'StarFriend' 149 | p64 150 | I0 151 | sS'Statues' 152 | p65 153 | I0 154 | sa(dp66 155 | VUserName 156 | p67 157 | V@f3ca6485604bf7a0b3518d4930e5cbdee824941efc1905597b5e6ddad15d3658 158 | p68 159 | sVCity 160 | p69 161 | V\u4e2d\u897f\u533a 162 | p70 163 | sVDisplayName 164 | p71 165 | g38 166 | sVUniFriend 167 | p72 168 | 
I0 169 | sVMemberList 170 | p73 171 | (lp74 172 | sVPYQuanPin 173 | p75 174 | g38 175 | sVRemarkPYInitial 176 | p76 177 | g38 178 | sVSex 179 | p77 180 | I1 181 | sVAppAccountFlag 182 | p78 183 | I0 184 | sVVerifyFlag 185 | p79 186 | I0 187 | sVProvince 188 | p80 189 | V\u9999\u6e2f 190 | p81 191 | sVKeyWord 192 | p82 193 | g38 194 | sVRemarkName 195 | p83 196 | g38 197 | sVPYInitial 198 | p84 199 | g38 200 | sVIsOwner 201 | p85 202 | I0 203 | sVChatRoomId 204 | p86 205 | I0 206 | sVHideInputBarFlag 207 | p87 208 | I0 209 | sVEncryChatRoomId 210 | p88 211 | g38 212 | sVAttrStatus 213 | p89 214 | I37847143 215 | sVSnsFlag 216 | p90 217 | I49 218 | sVMemberCount 219 | p91 220 | I0 221 | sVOwnerUin 222 | p92 223 | I0 224 | sVAlias 225 | p93 226 | VThoma_Gu 227 | p94 228 | sVSignature 229 | p95 230 | V\u80f8\u304c\u75db\u3044\u3001\u65e5\u3005\u5f37\u307e\u308b 231 | p96 232 | sVContactFlag 233 | p97 234 | I3 235 | sVNickName 236 | p98 237 | V\u30de\u30eb\u30c1\u30d1\u30b9 238 | p99 239 | sVRemarkPYQuanPin 240 | p100 241 | g38 242 | sVHeadImgUrl 243 | p101 244 | V/cgi-bin/mmwebwx-bin/webwxgeticon?seq=647880064&username=@f3ca6485604bf7a0b3518d4930e5cbdee824941efc1905597b5e6ddad15d3658&skey=@crypt_4c00d0e1_bf264d5f643dcc22a8b1701b72f216ca 245 | p102 246 | sVUin 247 | p103 248 | Vwxid_xgph596ajfxh12 249 | p104 250 | sVStarFriend 251 | p105 252 | I0 253 | sVStatues 254 | p106 255 | I0 256 | sasS'chatroomList' 257 | p107 258 | (lp108 259 | sS'nickName' 260 | p109 261 | g59 262 | sS'mpList' 263 | p110 264 | (lp111 265 | ssS'loginInfo' 266 | p112 267 | (dp113 268 | S'SyncKey' 269 | p114 270 | (dp115 271 | VCount 272 | p116 273 | I9 274 | sVList 275 | p117 276 | (lp118 277 | (dp119 278 | VVal 279 | p120 280 | I647880085 281 | sVKey 282 | p121 283 | I1 284 | sa(dp122 285 | VVal 286 | p123 287 | I647880092 288 | sVKey 289 | p124 290 | I2 291 | sa(dp125 292 | VVal 293 | p126 294 | I647880064 295 | sVKey 296 | p127 297 | I3 298 | sa(dp128 299 | VVal 300 | p129 301 | I647880012 302 | sVKey 303 | p130 304 | I11 305 | sa(dp131 306 | VVal 307 | p132 308 | I647880012 309 | sVKey 310 | p133 311 | I13 312 | sa(dp134 313 | VVal 314 | p135 315 | I1488959281 316 | sVKey 317 | p136 318 | I201 319 | sa(dp137 320 | VVal 321 | p138 322 | I1488934262 323 | sVKey 324 | p139 325 | I1000 326 | sa(dp140 327 | VVal 328 | p141 329 | I1488934292 330 | sVKey 331 | p142 332 | I1001 333 | sa(dp143 334 | VVal 335 | p144 336 | I1488859979 337 | sVKey 338 | p145 339 | I1003 340 | sassS'syncUrl' 341 | p146 342 | S'https://webpush.web.wechat.com/cgi-bin/mmwebwx-bin' 343 | p147 344 | sS'skey' 345 | p148 346 | V@crypt_4c00d0e1_bf264d5f643dcc22a8b1701b72f216ca 347 | p149 348 | sS'wxuin' 349 | p150 350 | V1059617351 351 | p151 352 | sS'synckey' 353 | p152 354 | S'1_647880085|2_647880092|3_647880064|11_647880012|13_647880012|201_1488959281|1000_1488934262|1001_1488934292|1003_1488859979' 355 | p153 356 | sS'url' 357 | p154 358 | Vhttps://web.wechat.com/cgi-bin/mmwebwx-bin 359 | p155 360 | sS'pass_ticket' 361 | p156 362 | V35aSttODnTZMiou7%2BxJ9v9C087xjTCfcWoYWANyF03knm19w4vbp6dnQGT1FCf14 363 | p157 364 | sS'wxsid' 365 | p158 366 | Vf7PxyPc1Cfip7gPz 367 | p159 368 | sS'User' 369 | p160 370 | g28 371 | sS'InviteStartCount' 372 | p161 373 | I40 374 | sS'fileUrl' 375 | p162 376 | S'https://file.web.wechat.com/cgi-bin/mmwebwx-bin' 377 | p163 378 | sS'BaseRequest' 379 | p164 380 | (dp165 381 | S'Sid' 382 | p166 383 | g159 384 | sS'Skey' 385 | p167 386 | g149 387 | sS'DeviceID' 388 | p168 389 | g157 390 | sg40 391 | g151 392 | ssS'deviceid' 393 | 
p169 394 | S'e275584183735061' 395 | p170 396 | ss. -------------------------------------------------------------------------------- /layers.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nyu-dl/dl4mt-simul-trans/392ff3148e944be6fbc475d5285441807902e2e0/layers.pyc -------------------------------------------------------------------------------- /mteval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ref=" /misc/kcgscratch1/ChoGroup/junyoung_exp/wmt15/ruen/dev/newstest2013-ref.ru.tok" 4 | # sed -i 's/@@ //g' $1 5 | 6 | DIR="/work/jg5223/work/SimulTrans/.translate/" 7 | 8 | ./data/multi-bleu.perl $DIR/ref.txt < $DIR/test.txt 9 | -------------------------------------------------------------------------------- /nmt_uni.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nyu-dl/dl4mt-simul-trans/392ff3148e944be6fbc475d5285441807902e2e0/nmt_uni.pyc -------------------------------------------------------------------------------- /optimizer.py: -------------------------------------------------------------------------------- 1 | import theano 2 | import theano.tensor as tensor 3 | import numpy 4 | 5 | from layers import * 6 | profile = False 7 | 8 | # optimizers 9 | # name(hyperp, tparams, grads, inputs (list), cost) = f_grad_shared, f_update 10 | 11 | 12 | # gradient clipping 13 | def grad_clip(grad): 14 | clip_c = 1. 15 | if clip_c > 0.: 16 | g2 = 0. 17 | for g in grad: 18 | g2 += (g ** 2).sum() 19 | new_grads = [] 20 | for g in grad: 21 | new_grads.append(tensor.switch(g2 > (clip_c ** 2), g / tensor.sqrt(g2) * clip_c, g)) 22 | grad = new_grads 23 | return grad 24 | 25 | 26 | def adam(lr, tparams, grads, inp, cost): 27 | gshared = [theano.shared(p.get_value() * 0., 28 | name='%s_grad' % k) 29 | for k, p in tparams.iteritems()] 30 | gsup = [(gs, g) for gs, g in zip(gshared, grads)] 31 | 32 | f_grad_shared = theano.function(inp, cost, updates=gsup, profile=profile, on_unused_input='ignore') 33 | 34 | lr0 = lr # 0.0002 35 | b1 = 0.1 36 | b2 = 0.001 37 | e = 1e-8 38 | 39 | updates = [] 40 | 41 | i = theano.shared(numpy.float32(0.)) 42 | i_t = i + 1. 43 | fix1 = 1. - b1**(i_t) 44 | fix2 = 1. - b2**(i_t) 45 | lr_t = lr0 * (tensor.sqrt(fix2) / fix1) 46 | 47 | for p, g in zip(tparams.values(), gshared): 48 | m = theano.shared(p.get_value() * 0.) 49 | v = theano.shared(p.get_value() * 0.) 50 | m_t = (b1 * g) + ((1. - b1) * m) 51 | v_t = (b2 * tensor.sqr(g)) + ((1. 
- b2) * v) 52 | g_t = m_t / (tensor.sqrt(v_t) + e) 53 | p_t = p - (lr_t * g_t) 54 | updates.append((m, m_t)) 55 | updates.append((v, v_t)) 56 | updates.append((p, p_t)) 57 | updates.append((i, i_t)) 58 | 59 | print 'build optimizer with Adam' 60 | f_update = theano.function([lr], [], updates=updates, 61 | on_unused_input='ignore', profile=profile) 62 | 63 | return f_grad_shared, f_update 64 | 65 | 66 | def adadelta(lr, tparams, grads, inp, cost): 67 | zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), 68 | name='%s_grad' % k) 69 | for k, p in tparams.iteritems()] 70 | running_up2 = [theano.shared(p.get_value() * numpy.float32(0.), 71 | name='%s_rup2' % k) 72 | for k, p in tparams.iteritems()] 73 | running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.), 74 | name='%s_rgrad2' % k) 75 | for k, p in tparams.iteritems()] 76 | 77 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 78 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 79 | for rg2, g in zip(running_grads2, grads)] 80 | 81 | f_grad_shared = theano.function(inp, cost, updates=zgup+rg2up, 82 | profile=profile) 83 | 84 | updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg 85 | for zg, ru2, rg2 in zip(zipped_grads, running_up2, 86 | running_grads2)] 87 | ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2)) 88 | for ru2, ud in zip(running_up2, updir)] 89 | param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)] 90 | 91 | f_update = theano.function([lr], [], updates=ru2up+param_up, 92 | on_unused_input='ignore', profile=profile) 93 | 94 | print 'build optimizer with Adadelta' 95 | return f_grad_shared, f_update 96 | 97 | 98 | def rmsprop(lr, tparams, grads, inp, cost): 99 | zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), 100 | name='%s_grad' % k) 101 | for k, p in tparams.iteritems()] 102 | running_grads = [theano.shared(p.get_value() * numpy.float32(0.), 103 | name='%s_rgrad' % k) 104 | for k, p in tparams.iteritems()] 105 | running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.), 106 | name='%s_rgrad2' % k) 107 | for k, p in tparams.iteritems()] 108 | 109 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 110 | rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)] 111 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 112 | for rg2, g in zip(running_grads2, grads)] 113 | 114 | f_grad_shared = theano.function(inp, cost, updates=zgup+rgup+rg2up, 115 | profile=profile) 116 | 117 | updir = [theano.shared(p.get_value() * numpy.float32(0.), 118 | name='%s_updir' % k) 119 | for k, p in tparams.iteritems()] 120 | updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4)) 121 | for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads, 122 | running_grads2)] 123 | param_up = [(p, p + udn[1]) 124 | for p, udn in zip(itemlist(tparams), updir_new)] 125 | f_update = theano.function([lr], [], updates=updir_new+param_up, 126 | on_unused_input='ignore', profile=profile) 127 | 128 | print 'build optimizer with Rmsprop' 129 | return f_grad_shared, f_update 130 | 131 | 132 | def sgd(lr, tparams, grads, x, mask, y, cost): 133 | gshared = [theano.shared(p.get_value() * 0., 134 | name='%s_grad' % k) 135 | for k, p in tparams.iteritems()] 136 | gsup = [(gs, g) for gs, g in zip(gshared, grads)] 137 | 138 | f_grad_shared = theano.function([x, mask, y], cost, updates=gsup, 139 | profile=profile) 140 | 141 | pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)] 142 | f_update = theano.function([lr], [], updates=pup, profile=profile) 
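# (All four optimizers in this file share the same two-step interface:
#    f_grad_shared(*inputs) -> cost   # forward/backward pass; gradients are buffered in shared variables
#    f_update(lrate)                  # apply the buffered update to the parameters
#  so a minimal training step is a sketch like
#    cost = f_grad_shared(x, mask, y)
#    f_update(lrate)
#  where the exact inputs depend on the graph the caller built.)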
143 | 
144 | print 'build optimizer with SGD'
145 | return f_grad_shared, f_update
146 | 
147 | 
--------------------------------------------------------------------------------
/optimizer.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nyu-dl/dl4mt-simul-trans/392ff3148e944be6fbc475d5285441807902e2e0/optimizer.pyc
--------------------------------------------------------------------------------
/policy.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nyu-dl/dl4mt-simul-trans/392ff3148e944be6fbc475d5285441807902e2e0/policy.pyc
--------------------------------------------------------------------------------
/pretrain_uni.py:
--------------------------------------------------------------------------------
1 | from nmt_uni import train
2 | from config import pretrain_config
3 | 
4 | 
5 | def main(job_id, params):
6 | print 'pretraining settings:'
7 | for c, v in sorted(params.items(), key=lambda a:a[0]):
8 | print '{}: {}'.format(c, v)
9 | 
10 | validerr = train(**params)
11 | return validerr
12 | 
13 | if __name__ == '__main__':
14 | main(0, pretrain_config())
15 | 
16 | 
17 | 
--------------------------------------------------------------------------------
/reward.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nyu-dl/dl4mt-simul-trans/392ff3148e944be6fbc475d5285441807902e2e0/reward.pyc
--------------------------------------------------------------------------------
/run_eval.sh:
--------------------------------------------------------------------------------
1 | THEANO_FLAGS=device=gpu2 python simultrans_eval.py --sample 1 --batchsize 1 --target 10 --sinit 1 --gamma 1 --recurrent True --Rtype 10 --coverage True
2 | 
--------------------------------------------------------------------------------
/run_train.sh:
--------------------------------------------------------------------------------
1 | export THEANO_FLAGS=device=gpu1,floatX=float32
2 | python simultrans_train.py
3 | 
4 | 
--------------------------------------------------------------------------------
/simultrans_eval.py:
--------------------------------------------------------------------------------
1 | """
2 | Simultaneous Machine Translation: Training with Policy Gradient
3 | 
4 | """
5 | import argparse
6 | import os
7 | import cPickle as pkl
8 | 
9 | from bleu import *
10 | from nmt_uni import *
11 | from policy import Controller as Policy
12 | from utils import Progbar, Monitor
13 | 
14 | from simultrans_beam import simultaneous_decoding
15 | from simultrans_model import _seqs2words, _bpe2words, _action2delay, PIPE, _padding
16 | 
17 | import time
18 | 
19 | numpy.random.seed(19920206)
20 | timer = time.time
21 | 
22 | WORK = '/misc/kcgscratch1/ChoGroup/thoma_exp/SimulTrans/'
23 | EXP = WORK
24 | 
25 | # check hidden folders
26 | def check_env():
27 | import os
28 | paths = ['.policy', '.pretrained', '.log', '.config', '.images', '.translate']
29 | for p in paths:
30 | p = WORK + p
31 | if not os.path.exists(p):
32 | os.mkdir(p)
33 | # run training function:: >>>
34 | def run_simultrans(model,
35 | options_file=None,
36 | config=None,
37 | policy=None,
38 | id=None,
39 | remote=False):
40 | # check environments
41 | check_env()
42 | if id is not None:
43 | fcon = WORK + '.config/{}.conf'.format(id)
44 | if os.path.exists(fcon):
45 | print 'load config files'
46 | policy, config = pkl.load(open(fcon, 'r'))
47 | 
48 | # 
============================================================================== # 49 | # load model model_options 50 | # ============================================================================== # 51 | _model = model 52 | model = WORK + '.pretrained/{}'.format(model) 53 | 54 | if options_file is not None: 55 | with open(options_file, 'rb') as f: 56 | options = pkl.load(f) 57 | else: 58 | with open('%s.pkl' % model, 'rb') as f: 59 | options = pkl.load(f) 60 | 61 | print 'load options...' 62 | for w, p in sorted(options.items(), key=lambda x: x[0]): 63 | print '{}: {}'.format(w, p) 64 | 65 | # load detail settings from option file: 66 | dictionary, dictionary_target = options['dictionaries'] 67 | 68 | def _iter(fname): 69 | with open(fname, 'r') as f: 70 | for line in f: 71 | words = line.strip().split() 72 | x = map(lambda w: word_dict[w] if w in word_dict else 1, words) 73 | x = map(lambda ii: ii if ii < options['n_words'] else 1, x) 74 | x += [0] 75 | yield x 76 | 77 | def _check_length(fname): 78 | f = open(fname, 'r') 79 | count = 0 80 | for _ in f: 81 | count += 1 82 | f.close() 83 | 84 | return count 85 | 86 | # load source dictionary and invert 87 | with open(dictionary, 'rb') as f: 88 | word_dict = pkl.load(f) 89 | word_idict = dict() 90 | for kk, vv in word_dict.iteritems(): 91 | word_idict[vv] = kk 92 | word_idict[0] = '' 93 | word_idict[1] = 'UNK' 94 | 95 | # load target dictionary and invert 96 | with open(dictionary_target, 'rb') as f: 97 | word_dict_trg = pkl.load(f) 98 | word_idict_trg = dict() 99 | for kk, vv in word_dict_trg.iteritems(): 100 | word_idict_trg[vv] = kk 101 | word_idict_trg[0] = '' 102 | word_idict_trg[1] = 'UNK' 103 | 104 | ## use additional input for the policy network 105 | options['pre'] = config['pre'] 106 | 107 | # ================================================================================= # 108 | # Build a Simultaneous Translator 109 | # ================================================================================= # 110 | 111 | # allocate model parameters 112 | params = init_params(options) 113 | params = load_params(model, params) 114 | tparams = init_tparams(params) 115 | 116 | # print 'build the model for computing cost (full source sentence).' 
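# build the training graph of the pretrained NMT model; note that f_cost is
# passed on to the simultaneous decoder below, presumably so translations can
# be scored when computing rewards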
117 | trng, use_noise, \
118 | _x, _x_mask, _y, _y_mask, \
119 | opt_ret, \
120 | cost, f_cost = build_model(tparams, options)
121 | print 'done'
122 | 
123 | # functions for sampler
124 | f_sim_ctx, f_sim_init, f_sim_next = build_simultaneous_sampler(tparams, options, trng)
125 | 
126 | # function for finetune
127 | if config['finetune'] != 'nope':
128 | f_fine_init, f_fine_cost, f_fine_update = build_fine(tparams, options,
129 | fullmodel=True if config['finetune'] == 'full'
130 | else False)
131 | 
132 | def _translate(src, trg, train=False, samples=config['sample'], greedy=False):
133 | ret = simultaneous_decoding(
134 | f_sim_ctx, f_sim_init,
135 | f_sim_next, f_cost,
136 | _policy,
137 | src, trg, word_idict_trg,
138 | step=config['step'], peek=config['peek'], sidx=config['s0'],
139 | n_samples=samples,
140 | reward_config={'target': config['target'],
141 | 'gamma': config['gamma'],
142 | 'Rtype': config['Rtype'],
143 | 'maxsrc': config['maxsrc'],
144 | 'greedy': greedy,
145 | 'upper': config['upper']},
146 | train=train,
147 | use_forget=config['forget'],
148 | use_newinput=config['pre'],
149 | use_coverage=config['coverage'],
150 | on_groundtruth=0 if config['finetune'] == 'nope' else 10)
151 | 
152 | # `ret` packs the samples, scores, actions, rewards and bookkeeping
153 | # statistics collected by the simultaneous decoder
154 | 
155 | 
156 | return ret
157 | 
158 | # if not train:
159 | # sample, score, actions, R, tracks, attentions = ret
160 | # return sample, score, actions, R, tracks
161 | # else:
162 | # sample, score, actions, R, info, pipe_t = ret
163 | # return sample, score, actions, R, info, pipe_t
164 | 
165 | # check the ID:
166 | policy['base'] = _model
167 | _policy = Policy(trng, options, policy, config,
168 | n_in=options['readout_dim'] + 1 if config['coverage'] else options['readout_dim'],
169 | n_out=3 if config['forget'] else 2,
170 | recurrent=policy['recurrent'], id=id)
171 | 
172 | # make the dataset ready for training & validation
173 | # train_ = options['datasets'][0]
174 | # train_num = _check_length
175 | trainIter = TextIterator(options['datasets'][0], options['datasets'][1],
176 | options['dictionaries'][0], options['dictionaries'][1],
177 | n_words_source=options['n_words_src'], n_words_target=options['n_words'],
178 | batch_size=config['batchsize'],
179 | maxlen=options['maxlen'])
180 | 
181 | train_num = trainIter.num
182 | 
183 | validIter = TextIterator(options['valid_datasets'][0], options['valid_datasets'][1],
184 | options['dictionaries'][0], options['dictionaries'][1],
185 | n_words_source=options['n_words_src'], n_words_target=options['n_words'],
186 | batch_size=1, cache=1,
187 | maxlen=1000000)
188 | 
189 | valid_num = validIter.num
190 | 
191 | valid_ = options['valid_datasets'][0]
192 | valid_num = _check_length(valid_)
193 | print 'training set {} lines / validation set {} lines'.format(train_num, valid_num)
194 | print 'use the reward function {}'.format(chr(config['Rtype'] + 65))
195 | 
196 | # ================================================================================= #
197 | # Main Loop: Run
198 | # ================================================================================= #
199 | print 'Start Simultaneous Translator...'
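# main loop: every valid_freq batches the current policy is evaluated greedily
# on the whole validation set, every sample_freq batches one training sample is
# printed for inspection, and every save_freq batches the policy and its
# history are checkpointed (the frequency constants are set just below)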
200 | probar = Progbar(train_num / config['batchsize'], with_history=False)
201 | monitor = None
202 | if remote:
203 | monitor = Monitor(root='http://localhost:9000')
204 | 
205 | # freqs
206 | save_freq = 200
207 | sample_freq = 10
208 | valid_freq = 200
209 | valid_size = 200
210 | display_freq = 50
211 | finetune_freq = 5
212 | 
213 | history, last_it = _policy.load()
214 | action_space = ['W', 'C', 'F']
215 | Log_avg = {}
216 | time0 = timer()
217 | pipe = PIPE(['x', 'x_mask', 'y', 'y_mask', 'c_mask'])
218 | 
219 | for it, (srcs, trgs) in enumerate(trainIter):  # one minibatch of sentence pairs per iteration
220 | if it < last_it:  # skip the batches that were already processed before resuming
221 | continue
222 | 
223 | # for validation
224 | # run the decoder over the whole validation set
225 | reference = []
226 | system = []
227 | 
228 | reference2 = []
229 | system2 = []
230 | 
231 | if it % valid_freq == 0:
232 | print 'start validation'
233 | 
234 | collections = [[], [], [], [], []]
235 | probar_v = Progbar(valid_num / 64 + 1)
236 | for ij, (srcs, trgs) in enumerate(validIter):
237 | 
238 | # new_srcs, new_trgs = [], []
239 | 
240 | # for src, trg in zip(srcs, trgs):
241 | # if len(src) < config['s0']:
242 | # continue # ignore when the source sentence is less than sidx. we don't use the policy\
243 | # else:
244 | # new_srcs += [src]
245 | # new_trgs += [trg]
246 | 
247 | # if len(new_srcs) == 0:
248 | # continue
249 | # srcs, trgs = new_srcs, new_trgs
250 | 
251 | statistics = _translate(srcs, trgs, train=False, samples=1, greedy=True)
252 | 
253 | quality, delay, reward = zip(*statistics['track'])
254 | reference += statistics['Ref']
255 | system += statistics['Sys']
256 | 
257 | # print ' '.join(reference[-1][0])
258 | # print ' '.join(system[-1])
259 | 
260 | 
261 | # compute the average consecutive waiting length
262 | def _consecutive(action):
263 | waits = []
264 | temp = 0
265 | for a in action:
266 | if a == 0:
267 | temp += 1
268 | elif temp > 0:
269 | waits += [temp]
270 | temp = 0
271 | 
272 | if temp > 0:
273 | waits += [temp]
274 | 
275 | mean = numpy.mean(waits)
276 | gec = numpy.max(waits) # numpy.prod(waits) ** (1./len(waits))
277 | return mean, gec
278 | 
279 | def _max_length(action):
280 | _cur = 0
281 | _end = 0
282 | _max = 0
283 | for it, a in enumerate(action):
284 | if a == 0:
285 | _cur += 1
286 | elif a == 2:
287 | _end += 1
288 | 
289 | temp = _cur - _end
290 | if temp > _max:
291 | _max = temp
292 | return _max
293 | 
294 | maxlen = [_max_length(action) for action in statistics['action']]
295 | means, gecs = zip(*(_consecutive(action) for action in statistics['action']))
296 | 
297 | collections[0] += quality
298 | collections[1] += delay
299 | collections[2] += means
300 | collections[3] += gecs
301 | collections[4] += maxlen
302 | 
303 | values = [('quality', numpy.mean(quality)), ('delay', numpy.mean(delay)),
304 | ('wait_mean', numpy.mean(means)), ('wait_max', numpy.mean(gecs)),
305 | ('max_len', numpy.mean(maxlen))]
306 | probar_v.update(ij + 1, values=values)
307 | 
308 | 
309 | validIter.reset()
310 | valid_bleu, valid_delay, valid_wait, valid_wait_gec, valid_mx = [numpy.mean(a) for a in collections]
311 | print 'Iter = {}: AVG BLEU = {}, DELAY = {}, WAIT(MEAN) = {}, WAIT(MAX) = {}, MaxLen={}'.format(
312 | it, valid_bleu, valid_delay, valid_wait, valid_wait_gec, valid_mx)
313 | 
314 | print 'Compute the Corpus BLEU={} (greedy)'.format(corpus_bleu(reference, system))
315 | 
316 | with open(WORK + '.translate/test.txt', 'w') as fout:
317 | for sys in system:
318 | fout.write('{}\n'.format(' '.join(sys)))
319 | 
320 | with open(WORK +
'.translate/ref.txt', 'w') as fout:
321 | for ref in reference:
322 | fout.write('{}\n'.format(' '.join(ref[0])))
323 | 
324 | 
325 | 
326 | if config['upper']:
327 | print 'done'
328 | import sys; sys.exit(-1)
329 | 
330 | 
331 | # filter the training batch before the policy update
332 | new_srcs, new_trgs = [], []
333 | for src, trg in zip(srcs, trgs):
334 | if len(src) <= config['s0']:
335 | continue  # skip source sentences shorter than sidx, for which the policy is not used
336 | else:
337 | new_srcs += [src]
338 | new_trgs += [trg]
339 | 
340 | if len(new_srcs) == 0:
341 | continue
342 | 
343 | srcs, trgs = new_srcs, new_trgs
344 | try:
345 | statistics, info, pipe_t = _translate(srcs, trgs, train=True)
346 | except Exception:
347 | print 'translation failed (e.g. an empty sentence); skipping this batch.'
348 | continue
349 | 
350 | 
351 | # samples, scores, actions, rewards, info, pipe_t = _translate(srcs, trgs, train=True)
352 | # print pipe_t
353 | 
354 | 
355 | if config['finetune'] != 'nope':
356 | 
357 | for idx, act in enumerate(pipe_t['action']):
358 | _start = 0
359 | _end = 0
360 | _mask = [0 for _ in srcs[0]]
361 | _cmask = []
362 | 
363 | pipe.messages['x'] += srcs
364 | pipe.messages['y'] += [pipe_t['sample'][idx]]
365 | 
366 | for a in act:
367 | # print _start, _end
368 | if a == 0:
369 | _mask[_start] = 1
370 | _start += 1
371 | elif a == 2:
372 | _mask[_end] = 0
373 | _end += 1
374 | else:
375 | _cmask.append(_mask)
376 | # print numpy.asarray(_cmask).shape
377 | 
378 | pipe.messages['c_mask'].append(_cmask)
379 | 
380 | if it % finetune_freq == (finetune_freq - 1):
381 | num = len(pipe.messages['x'])
382 | max_x = max([len(v) for v in pipe.messages['x']])
383 | max_y = max([len(v) for v in pipe.messages['y']])
384 | 
385 | xx, xx_mask = _padding(pipe.messages['x'], shape=(max_x, num), return_mask=True, dtype='int64')
386 | yy, yy_mask = _padding(pipe.messages['y'], shape=(max_y, num), return_mask=True, dtype='int64')
387 | cc_mask = _padding(pipe.messages['c_mask'], shape=(max_y, num, max_x)).transpose([0, 2, 1])
388 | 
389 | # fine-tune the EncDec of translation
390 | if config['finetune'] == 'full':
391 | cost = f_fine_cost(xx, xx_mask, yy, yy_mask, cc_mask)
392 | elif config['finetune'] == 'decoder':
393 | cost = f_fine_cost(xx, xx_mask, yy, yy_mask, cc_mask)
394 | else:
395 | raise NotImplementedError
396 | 
397 | print '\nIter={} || cost = {}'.format(it, cost[0])
398 | f_fine_update(0.00001)
399 | pipe.reset()
400 | 
401 | if it % sample_freq == 0:
402 | 
403 | print '\nModel:{} has been trained for {} hours'.format(_policy.id, (timer() - time0) / 3600.)
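# print the current source/target pair plus a sampled translation with its
# action sequence, rendered via action_space (W=WAIT, C=COMMIT, F=FORGET)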
404 | print 'source: ', _bpe2words(_seqs2words([srcs[0]], word_idict))[0] 405 | print 'target: ', _bpe2words(_seqs2words([trgs[0]], word_idict_trg))[0] 406 | 407 | # obtain the translation results 408 | samples = _bpe2words(_seqs2words(statistics['sample'], word_idict_trg)) 409 | 410 | # obtain the delay (normalized) 411 | # delays = _action2delay(srcs[0], statistics['action']) 412 | 413 | c = 0 414 | for j in xrange(len(samples)): 415 | 416 | if statistics['secs'][j][0] == 0: 417 | if c < 5: 418 | c += 1 419 | 420 | print '---ID: {}'.format(_policy.id) 421 | print 'sample: ', samples[j] 422 | # print 'action: ', ','.join( 423 | # ['{}({})'.format(action_space[t], f) 424 | # for t, f in 425 | # zip(statistics['action'][j], statistics['forgotten'][j])]) 426 | 427 | print 'action: ', ','.join( 428 | ['{}'.format(action_space[t]) 429 | for t in statistics['action'][j]]) 430 | 431 | print 'quality:', statistics['track'][j][0] 432 | print 'delay:', statistics['track'][j][1] 433 | # print 'score:', statistics['score'][j] 434 | break 435 | 436 | values = [(w, info[w]) for w in info] 437 | probar.update(it + 1, values=values) 438 | 439 | 440 | # NaN detector 441 | for w in info: 442 | if numpy.isnan(info[w]) or numpy.isinf(info[w]): 443 | raise RuntimeError, 'NaN/INF is detected!! {} : ID={}'.format(w, id) 444 | 445 | # remote display 446 | if remote: 447 | logs = {'R': info['R'], 'Q': info['Q'], 448 | 'D': info['D'], 'P': float(info['P'])} 449 | # print logs 450 | for w in logs: 451 | Log_avg[w] = Log_avg.get(w, 0) + logs[w] 452 | 453 | if it % display_freq == (display_freq - 1): 454 | for w in Log_avg: 455 | Log_avg[w] /= display_freq 456 | 457 | monitor.display(it + 1, Log_avg) 458 | Log_avg = dict() 459 | 460 | # save the history & model 461 | history += [info] 462 | if it % save_freq == 0: 463 | _policy.save(history, it) 464 | 465 | 466 | if __name__ == "__main__": 467 | parser = argparse.ArgumentParser() 468 | parser.add_argument('-s', '--step', type=int, default=1) 469 | parser.add_argument('-k', '--peek', type=int, default=1) 470 | parser.add_argument('-i', '--sinit', type=int, default=1) 471 | parser.add_argument('-n', '--sample', type=int, default=20) 472 | parser.add_argument('-b', '--batchsize', type=int, default=10) 473 | parser.add_argument('-c', action="store_true", default=False) 474 | parser.add_argument('-o', type=str, default=None) 475 | 476 | parser.add_argument('--updater', type=str, default='REINFORCE') 477 | parser.add_argument('--recurrent', default=False) 478 | parser.add_argument('--layernorm', default=False) 479 | parser.add_argument('--upper', default=False) 480 | parser.add_argument('--target', type=float, default=0.5) 481 | parser.add_argument('--gamma', type=float, default=10) 482 | parser.add_argument('--prop', type=float, default=0.5) # only useful for random policy 483 | parser.add_argument('--Rtype', type=int, default=0) # 0, 1, 2, 3 484 | parser.add_argument('--forget', default=False) 485 | parser.add_argument('--maxsrc', type=float, default=10) 486 | parser.add_argument('--pre', default=False) 487 | parser.add_argument('--coverage', default=False) 488 | parser.add_argument('--finetune', type=str, default='nope') 489 | parser.add_argument('--id', type=str, default=None) 490 | # parser.add_argument('-m', '--model', type=str, 491 | # default='model_wmt15_bpe2k_uni_en-de.npz') 492 | parser.add_argument('-m', '--model', type=str, 493 | default='model_wmt15_bpe2k_uni_en-ru.npz') 494 | parser.add_argument('--remote', default=False) 495 | args = parser.parse_args() 496 

    policy = OrderedDict()
    policy['prop'] = args.prop
    policy['recurrent'] = args.recurrent
    policy['layernorm'] = args.layernorm
    policy['updater'] = args.updater
    policy['act_mask'] = True

    config = OrderedDict()
    config['step'] = args.step
    config['peek'] = args.peek
    config['s0'] = args.sinit
    config['sample'] = args.sample
    config['batchsize'] = args.batchsize
    config['target'] = args.target
    config['gamma'] = args.gamma
    config['Rtype'] = args.Rtype
    config['forget'] = args.forget
    config['maxsrc'] = args.maxsrc
    config['pre'] = args.pre
    config['coverage'] = args.coverage
    config['upper'] = False

    config['finetune'] = args.finetune

    run_simultrans(args.model,
                   options_file=args.o,
                   config=config,
                   policy=policy,
                   id=args.id,
                   remote=args.remote)

--------------------------------------------------------------------------------
/simultrans_model.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nyu-dl/dl4mt-simul-trans/392ff3148e944be6fbc475d5285441807902e2e0/simultrans_model.pyc
--------------------------------------------------------------------------------
/simultrans_model_clean.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nyu-dl/dl4mt-simul-trans/392ff3148e944be6fbc475d5285441807902e2e0/simultrans_model_clean.pyc
--------------------------------------------------------------------------------
/simultrans_train.py:
--------------------------------------------------------------------------------
"""
Simultaneous Machine Translation: Training with Policy Gradient

"""
import argparse
import os
import cPickle as pkl

from bleu import *
from nmt_uni import *
from policy import Controller as Policy
from utils import Progbar, Monitor
from data_iterator import check_length, iterate

from simultrans_model_clean import simultaneous_decoding
from simultrans_model_clean import _seqs2words, _bpe2words, _padding
from actors import get_actor
import time

numpy.random.seed(19920206)
timer = time.time


# run the training function:
def run_simultrans(model,
                   options_file=None,
                   config=None,
                   id=None,
                   remote=False):

    WORK = config['workspace']

    # check the hidden folders
    paths = ['.policy', '.pretrained', '.log', '.config', '.images', '.translate']
    for p in paths:
        p = WORK + p
        if not os.path.exists(p):
            os.mkdir(p)

    if id is not None:
        fcon = WORK + '.config/{}.conf'.format(id)
        if os.path.exists(fcon):
            print 'load the config file'
            policy, config = pkl.load(open(fcon, 'r'))

    # ============================================================================== #
    # load the model options
    # ============================================================================== #
    _model = model.split('/')[-1]

    if options_file is not None:
        with open(options_file, 'rb') as f:
            options = pkl.load(f)
    else:
        with open('%s.pkl' % model, 'rb') as f:
            options = pkl.load(f)

    print 'merge configuration into options'
    for w in config:
        # if (w in options) and (config[w] is not None):
        options[w] = config[w]
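    # NOTE: the assignment above is unconditional, so every key in `config`
    # silently overwrites the value saved with the pretrained model; the
    # rl_config / command-line settings therefore always take precedence
    # over the options stored in the .pkl file.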

    print 'load options...'
    for w, p in sorted(options.items(), key=lambda x: x[0]):
        print '{}: {}'.format(w, p)

    # load detailed settings from the option file:
    dictionary, dictionary_target = options['dictionaries']

    # load source dictionary and invert
    with open(dictionary, 'rb') as f:
        word_dict = pkl.load(f)
    word_idict = dict()
    for kk, vv in word_dict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # load target dictionary and invert
    with open(dictionary_target, 'rb') as f:
        word_dict_trg = pkl.load(f)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    options['pre'] = config['pre']

    # ========================================================================= #
    # Build a Simultaneous Translator
    # ========================================================================= #

    # allocate model parameters
    params = init_params(options)
    params = load_params(model, params)
    tparams = init_tparams(params)

    # print 'build the model for computing cost (full source sentence).'
    trng, use_noise, \
        _x, _x_mask, _y, _y_mask, \
        opt_ret, \
        cost, f_cost = build_model(tparams, options)
    print 'done'

    # functions for the sampler
    f_sim_ctx, f_sim_init, f_sim_next = build_simultaneous_sampler(tparams, options, trng)

    # function to finetune the underlying model
    if options['finetune']:
        ff_init, ff_cost, ff_update = build_simultaneous_model(tparams, options, rl=True)
        funcs = [f_sim_ctx, f_sim_init, f_sim_next, f_cost, ff_init, ff_cost, ff_update]
    else:
        funcs = [f_sim_ctx, f_sim_init, f_sim_next, f_cost]

    # build a res-predictor
    if options['predict']:
        params_act = get_actor('gru')[0](options, prefix='pdt',
                                         nin=options['dim'])
        pass

    # check the ID:
    options['base'] = _model
    agent = Policy(trng, options,
                   n_in=options['readout_dim'] + 1 if options['coverage'] else options['readout_dim'],
                   n_out=3 if config['forget'] else 2,
                   recurrent=options['recurrent'], id=id)

    # make the dataset ready for training & validation
    trainIter = TextIterator(options['datasets'][0], options['datasets'][1],
                             options['dictionaries'][0], options['dictionaries'][1],
                             n_words_source=options['n_words_src'], n_words_target=options['n_words'],
                             batch_size=config['batchsize'],
                             maxlen=options['maxlen'])

    train_num = trainIter.num

    validIter = TextIterator(options['valid_datasets'][0], options['valid_datasets'][1],
                             options['dictionaries'][0], options['dictionaries'][1],
                             n_words_source=options['n_words_src'], n_words_target=options['n_words'],
                             batch_size=20, cache=10,
                             maxlen=1000000)

    valid_num = validIter.num
    print 'training set {} lines / validation set {} lines'.format(train_num, valid_num)
    print 'use the reward function {}'.format(chr(config['Rtype'] + 65))

    # ========================================================================== #
    # Main Loop: Run
    # ========================================================================== #
    print 'Start Simultaneous Translator...'
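    # ---------------------------------------------------------------------- #
    # Sketch of the policy-gradient recipe the main loop below follows.
    # This is illustrative pseudo-code only -- the names are hypothetical,
    # and the real update lives in policy.py / simultrans_model_clean.py:
    #
    #     actions, log_probs = sample_from(agent, partial_source)
    #     reward = quality(actions) - penalty(delay(actions))  # see --target, --gamma
    #     advantage = reward - baseline
    #     loss = -(advantage * log_probs).sum()                # REINFORCE
    #     update(agent, grad(loss))
    # ---------------------------------------------------------------------- #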
    monitor = None
    if remote:
        monitor = Monitor(root='http://localhost:9000')

    # freqs
    save_freq = 200
    sample_freq = 10
    valid_freq = 200
    valid_size = 200
    display_freq = 50
    finetune_freq = 5

    history, last_it = agent.load()
    action_space = ['W', 'C', 'F']
    Log_avg = {}
    time0 = timer()

    pipe = OrderedDict()
    for key in ['x', 'x_mask', 'y', 'y_mask', 'c_mask']:
        pipe[key] = []

    def _translate(src, trg, samples=None, train=False,
                   greedy=False, show=False, full=False):
        time0 = time.time()
        if full:
            options1 = copy.copy(options)
            options1['upper'] = True
        else:
            options1 = options

        ret = simultaneous_decoding(
            funcs, agent, options1,
            src, trg, word_idict_trg,
            samples, greedy, train)

        if show:
            info = ret[1]
            values = [(w, float(info[w])) for w in info if w != 'advantages']
            print ' , '.join(['{}={:.3f}'.format(k, f) for k, f in values]),
            print '...{}s'.format(time.time() - time0)

        return ret

    for it, (srcs, trgs) in enumerate(trainIter):    # only one batch each iteration
        if it < last_it:    # skip over the lines already scanned.
            continue

        # validation: translate the whole validation set greedily
        reference = []
        system = []

        if it % valid_freq == (valid_freq - 1):
            print 'start validation'

            collections = [[], [], [], [], []]
            probar_v = Progbar(valid_num / 20 + 1)
            for ij, (srcs, trgs) in enumerate(validIter):

                statistics = _translate(srcs, trgs, samples=1, train=False, greedy=True)

                quality, delay, reward = zip(*statistics['track'])
                reference += statistics['Ref']
                system += statistics['Sys']

                # compute the average consecutive waiting length
                def _consecutive(action):
                    waits = []
                    temp = 0
                    for a in action:
                        if a == 0:
                            temp += 1
                        elif temp > 0:
                            waits += [temp]
                            temp = 0

                    if temp > 0:
                        waits += [temp]

                    mean = numpy.mean(waits)
                    gec = numpy.max(waits)    # numpy.prod(waits) ** (1. / len(waits))
                    return mean, gec

                def _max_length(action):
                    _cur = 0
                    _end = 0
                    _max = 0
                    for it, a in enumerate(action):
                        if a == 0:
                            _cur += 1
                        elif a == 2:
                            _end += 1

                        temp = _cur - _end
                        if temp > _max:
                            _max = temp
                    return _max

                maxlen = [_max_length(action) for action in statistics['action']]
                means, gecs = zip(*(_consecutive(action) for action in statistics['action']))

                collections[0] += quality
                collections[1] += delay
                collections[2] += means
                collections[3] += gecs
                collections[4] += maxlen

                values = [('quality', numpy.mean(quality)), ('delay', numpy.mean(delay)),
                          ('wait_mean', numpy.mean(means)), ('wait_max', numpy.mean(gecs)),
                          ('max_len', numpy.mean(maxlen))]
                probar_v.update(ij + 1, values=values)

            validIter.reset()
            valid_bleu, valid_delay, valid_wait, valid_wait_gec, valid_mx = [numpy.mean(a) for a in collections]
            print 'Iter = {}: AVG BLEU = {}, DELAY = {}, WAIT(MEAN) = {}, WAIT(MAX) = {}, MaxLen={}'.format(
                it, valid_bleu, valid_delay, valid_wait, valid_wait_gec, valid_mx)

            print 'Compute the Corpus BLEU={} (greedy)'.format(corpus_bleu(reference, system))

            with open(WORK + '.translate/test.txt', 'w') as fout:
                for hyp in system:
                    fout.write('{}\n'.format(' '.join(hyp)))

            with open(WORK + '.translate/ref.txt', 'w') as fout:
                for ref in reference:
                    fout.write('{}\n'.format(' '.join(ref[0])))

            history += [collections]
            print 'done'

            if options['upper']:
                print 'done'
                import sys; sys.exit(-1)

        # training set: drop source sentences that are too short
        new_srcs, new_trgs = [], []
        for src, trg in zip(srcs, trgs):
            if len(src) <= options['s0']:
                continue    # ignore sentences no longer than the initial read s0.
            else:
                new_srcs += [src]
                new_trgs += [trg]

        if len(new_srcs) == 0:
            continue

        srcs, trgs = new_srcs, new_trgs
        statistics, info = _translate(srcs, trgs, train=True, show=True)

        if it % sample_freq == 0:

            # obtain the translation results
            samples = _bpe2words(
                _seqs2words(statistics['sample'], word_idict_trg,
                            statistics['action'], 1))
            sources = _bpe2words(
                _seqs2words(statistics['SWord'], word_idict,
                            statistics['action'], 0))
            targets = _bpe2words(
                _seqs2words(statistics['TWord'], word_idict_trg))

            # obtain the delay (normalized)
            # delays = _action2delay(srcs[0], statistics['action'])

            c = 0
            for j in xrange(len(samples)):

                if statistics['seq_info'][j][0] == 0:
                    if c < (config['sample'] / 2.):
                        c += 1
                        continue

                    print '--Iter: {}'.format(it)
                    print 'source: ', sources[j]
                    print 'sample: ', samples[j]
                    print 'target: ', targets[j]
                    print 'quality:', statistics['track'][j][0]
                    print 'delay:', statistics['track'][j][1]
                    print 'reward:', statistics['track'][j][2]
                    break

        # NaN detector
        # for w in info:
        #     if numpy.isnan(info[w]) or numpy.isinf(info[w]):
        #         raise RuntimeError('NaN/INF is detected!! {} : ID={}'.format(w, id))

        # remote display
        if remote:
            logs = {'R': info['R'], 'Q': info['Q'],
                    'D': info['D'], 'P': float(info['P'])}
            if 'a_cost' in info:
                logs['A'] = info['a_cost']

            print logs
            for w in logs:
                Log_avg[w] = Log_avg.get(w, 0) + logs[w]

            if it % display_freq == (display_freq - 1):
                for w in Log_avg:
                    Log_avg[w] /= display_freq

                monitor.display(it + 1, Log_avg)
                Log_avg = dict()

        # save the history & model
        history += [info]
        if it % save_freq == 0:
            agent.save(history, it)


if __name__ == "__main__":
    from config import rl_config
    config = rl_config()

    run_simultrans(config['model'],
                   options_file=config['option'],
                   config=config,
                   id=None,
                   remote=False)

--------------------------------------------------------------------------------
/translate_uni.py:
--------------------------------------------------------------------------------
'''
Translates a source file using a translation model.
'''
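# Decoding scheme implemented below (as read from translate_model): each
# worker draws `kp` candidates per source sentence -- the first pass uses
# sigma=-1. (presumably the un-perturbed, greedy case) and later passes
# perturb sampling with the given sigma -- then rescores all candidates
# with f_log_probs and keeps the one with the lowest (optionally
# length-normalized) cost.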
import theano
import argparse

import numpy
import cPickle as pkl

from nmt_uni import (build_model, build_sampler, gen_sample, load_params,
                     init_params, init_tparams, prepare_data)

from multiprocessing import Process, Queue


def translate_model(queue, rqueue, pid, model, options, k, normalize, kp, sigma):

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    trng = RandomStreams(1234)

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, options)
    inps = [x, x_mask, y, y_mask]

    f_log_probs = theano.function(inps, cost)

    # word index
    f_init, f_next = build_sampler(tparams, options, trng)

    def _translate(idx, seq):
        all_samples = []
        all_scores = []
        all_c = []
        for kidx in xrange(kp):
            if kidx == 0:
                ss = -1.
            else:
                ss = sigma
            # sample given an input sequence and obtain scores
            sample, score, c = gen_sample(tparams, f_init, f_next,
                                          numpy.array(seq).reshape([len(seq), 1]),
                                          options, trng=trng, k=1, maxlen=200,
                                          stochastic=True, argmax=True, sigma=ss)

            # normalize scores according to sequence lengths
            if normalize:
                lengths = numpy.array([len(s) for s in sample])
                score = score / lengths
            # print idx, score
            sidx = numpy.argmin(score)
            all_samples.append(sample[sidx])
            all_scores.append(score[sidx])
            all_c.append(c[0])

        source_list = [seq] * kp
        x, x_mask, y, y_mask = prepare_data(source_list, all_samples, maxlen=None)
        all_scores = f_log_probs(x, x_mask, y, y_mask)
        if normalize:
            lengths = numpy.array([len(s) for s in all_samples])
            all_scores = all_scores / lengths

        print idx, all_scores
        sidx = numpy.argmin(all_scores)
        return all_samples[sidx], all_c[sidx]

    while True:
        req = queue.get()
        if req is None:
            break

        idx, x = req[0], req[1]
        print pid, '-', idx
        seq = _translate(idx, x)

        rqueue.put((idx, seq))

    return


def main(model, dictionary, dictionary_target, source_file, saveto, k=5,
         normalize=False, n_process=5, chr_level=False,
         options_file=None, sigma=-1., kp=1):

    # load the model options
    if options_file is not None:
        with open(options_file, 'rb') as f:
            options = pkl.load(f)
    else:
        with open('%s.pkl' % model, 'rb') as f:
            options = pkl.load(f)

    # load source dictionary and invert
    with open(dictionary, 'rb') as f:
        word_dict = pkl.load(f)
    word_idict = dict()
    for kk, vv in word_dict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # load target dictionary and invert
    with open(dictionary_target, 'rb') as f:
        word_dict_trg = pkl.load(f)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    # create input and output queues for processes
    queue = Queue()
    rqueue = Queue()
    processes = [None] * n_process
    for midx in xrange(n_process):
        processes[midx] = Process(
            target=translate_model,
            args=(queue, rqueue, midx, model, options, k, normalize, kp, sigma))
        processes[midx].start()

    # utility function
    def _seqs2words(caps):
        capsw = []
        for cc in caps:
            ww = []
            for w in cc:
                if w == 0:
                    break
                ww.append(word_idict_trg[w])
            capsw.append(' '.join(ww))
        return capsw

    def _send_jobs(fname):
        with open(fname, 'r') as f:
            for idx, line in enumerate(f):
                if chr_level:
                    words = list(line.decode('utf-8').strip())
                else:
                    words = line.strip().split()
                x = map(lambda w: word_dict[w] if w in word_dict else 1, words)
                x = map(lambda ii: ii if ii < options['n_words'] else 1, x)
                x += [0]
                queue.put((idx, x))
        return idx + 1

    def _finish_processes():
        for midx in xrange(n_process):
            queue.put(None)

    def _retrieve_jobs(n_samples):
        trans = [None] * n_samples
        c = [None] * n_samples
        for idx in xrange(n_samples):
            resp = rqueue.get()
            trans[resp[0]] = resp[1][0]
            c[resp[0]] = resp[1][1]
            if numpy.mod(idx, 10) == 0:
                print 'Sample ', (idx + 1), '/', n_samples, ' Done'

        return trans, c

    print 'Translating ', source_file, '...'
    n_samples = _send_jobs(source_file)
    trans, c = _retrieve_jobs(n_samples)
    trans = _seqs2words(trans)
    _finish_processes()
    with open(saveto, 'w') as f:
        print >>f, '\n'.join(trans)
        print >>f, '{}\n'.format(c)
    print 'Done'


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-k', type=int, default=5)
    parser.add_argument('-kp', type=int, default=1)
    parser.add_argument('-p', type=int, default=5)
    parser.add_argument('-n', action="store_true", default=False)
    parser.add_argument('-c', action="store_true", default=False)
    parser.add_argument('-o', type=str, default=None)
    parser.add_argument('-s', type=float, default=-1.)
    parser.add_argument('model', type=str)
    parser.add_argument('dictionary', type=str)
    parser.add_argument('dictionary_target', type=str)
    parser.add_argument('source', type=str)
    parser.add_argument('saveto', type=str)

    args = parser.parse_args()

    main(args.model, args.dictionary, args.dictionary_target, args.source,
         args.saveto, k=args.k, normalize=args.n, n_process=args.p,
         chr_level=args.c, options_file=args.o, kp=args.kp, sigma=args.s)

--------------------------------------------------------------------------------
/translate_uni.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
dataset=dev
model="/work/jg5223/work/SimulTrans/.pretrained/model_wmt15_bpe2k_uni_en-ru.npz"
dict="/scratch/jg5223/data/wmt15/ruen/train/all_ru-en.en.tok.bpe.word.pkl"
dict_rev="/scratch/jg5223/data/wmt15/ruen/train/all_ru-en.ru.tok.bpe.word.pkl"
source="/scratch/jg5223/data/wmt15/ruen/${dataset}/newstest2013-src.en.tok.bpe"
saveto="./enrugreedy.out"
reference="/scratch/jg5223/data/wmt15/ruen/${dataset}/newstest2013-src.ru.tok"

# pyenv local anaconda-2.4.0
THEANO_FLAGS="floatX=float32, device=cpu" python translate_uni.py -p 8 -k 1 $model $dict $dict_rev $source $saveto

./data/multi-bleu.perl $reference < $saveto

--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
"""
Helper functions for translation: a remote training monitor and a progress bar.
"""
import numpy as np
import time
import sys
import json


class Monitor(object):
    def __init__(self, root='http://localhost:9000'):
        self.root = root

    def display(self, batch, logs={}):
        import requests
        send = {}
        send['epoch'] = batch
        for k, v in logs.items():
            send[k] = v

        try:
            requests.post(self.root + '/publish/epoch/end/',
                          {'data': json.dumps(send)})
        except Exception:
            print('Warning: could not reach RemoteMonitor '
                  'root server at ' + str(self.root))


class Progbar(object):
    def __init__(self, target, width=30, verbose=1, with_history=True):
        '''
        @param target: total number of steps expected
        '''
        self.width = width
        self.target = target
        self.sum_values = {}
        self.unique_values = []
        self.start = time.time()
        self.total_width = 0
        self.seen_so_far = 0
        self.verbose = verbose
        self.with_history = with_history

    def update(self, current, values=[]):
        '''
        @param current: index of current step
        @param values: list of tuples (name, value_for_last_step).
            The progress bar will display averages for these values.
        '''
        if not self.with_history:
            self.sum_values = {}
            self.unique_values = []

        for k, v in values:
            if k not in self.sum_values:
                self.sum_values[k] = [v * (current - self.seen_so_far), current - self.seen_so_far]
                self.unique_values.append(k)
            else:
                self.sum_values[k][0] += v * (current - self.seen_so_far)
                self.sum_values[k][1] += (current - self.seen_so_far)
        self.seen_so_far = current

        now = time.time()
        if self.verbose == 1:
            prev_total_width = self.total_width
            sys.stdout.write("\b" * prev_total_width)
            sys.stdout.write("\r")

            numdigits = int(np.floor(np.log10(self.target))) + 1
            barstr = '%%%dd/%%%dd [' % (numdigits, numdigits)
            bar = barstr % (current, self.target)
            prog = float(current) / self.target
            prog_width = int(self.width * prog)
            if prog_width > 0:
                bar += ('.' * (prog_width - 1))
                if current < self.target:
                    bar += '(-w-)'
                else:
                    bar += '(-v-)!!'
            bar += ('~' * (self.width - prog_width))
            bar += ']'
            sys.stdout.write(bar)
            self.total_width = len(bar)

            if current:
                time_per_unit = (now - self.start) / current
            else:
                time_per_unit = 0
            eta = time_per_unit * (self.target - current)
            info = ''
            if current < self.target:
                info += ' - ETA: %ds' % eta
            else:
                info += ' - %ds' % (now - self.start)
            for k in self.unique_values:
                if k == 'perplexity' or k == 'PPL':
                    info += ' - %s: %.4f' % (k, np.exp(self.sum_values[k][0] / max(1, self.sum_values[k][1])))
                else:
                    info += ' - %s: %.4f' % (k, self.sum_values[k][0] / max(1, self.sum_values[k][1]))

            self.total_width += len(info)
            if prev_total_width > self.total_width:
                info += ((prev_total_width - self.total_width) * " ")

            sys.stdout.write(info)
            sys.stdout.flush()

            if current >= self.target:
                sys.stdout.write("\n")

        if self.verbose == 2:
            if current >= self.target:
                info = '%ds' % (now - self.start)
                for k in self.unique_values:
                    info += ' - %s: %.4f' % (k, self.sum_values[k][0] / max(1, self.sum_values[k][1]))
                sys.stdout.write(info + "\n")

    def add(self, n, values=[]):
        self.update(self.seen_so_far + n, values)

    def clear(self):
        self.sum_values = {}
        self.unique_values = []
        self.total_width = 0
        self.seen_so_far = 0
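
# A minimal, self-contained demo of Progbar (illustrative only; the metric
# names and values below are made up). Reported values are running averages
# over the steps seen so far; anything named 'perplexity' or 'PPL' is
# exponentiated before it is displayed:
if __name__ == '__main__':
    demo = Progbar(target=50)
    for step in xrange(50):
        time.sleep(0.02)    # stand-in for real work
        demo.update(step + 1, values=[('R', 0.31), ('PPL', 4.2)])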
--------------------------------------------------------------------------------
/utils.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nyu-dl/dl4mt-simul-trans/392ff3148e944be6fbc475d5285441807902e2e0/utils.pyc
--------------------------------------------------------------------------------
/utils/msyh.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nyu-dl/dl4mt-simul-trans/392ff3148e944be6fbc475d5285441807902e2e0/utils/msyh.ttf
--------------------------------------------------------------------------------