├── .idea
├── inspectionProfiles
│ └── Project_Default.xml
└── vcs.xml
├── LICENSE
├── README.md
├── __init__.py
├── actors.py
├── bleu.py
├── bleu.pyc
├── config.py
├── config.pyc
├── data
├── README.md
├── build_dictionary.py
├── download_files.py
├── length.py
├── merge.sh
├── multi-bleu.perl
├── nonbreaking_prefixes
│ ├── README.txt
│ ├── nonbreaking_prefix.ca
│ ├── nonbreaking_prefix.cs
│ ├── nonbreaking_prefix.de
│ ├── nonbreaking_prefix.el
│ ├── nonbreaking_prefix.en
│ ├── nonbreaking_prefix.es
│ ├── nonbreaking_prefix.fi
│ ├── nonbreaking_prefix.fr
│ ├── nonbreaking_prefix.hu
│ ├── nonbreaking_prefix.is
│ ├── nonbreaking_prefix.it
│ ├── nonbreaking_prefix.lv
│ ├── nonbreaking_prefix.nl
│ ├── nonbreaking_prefix.pl
│ ├── nonbreaking_prefix.pt
│ ├── nonbreaking_prefix.ro
│ ├── nonbreaking_prefix.ru
│ ├── nonbreaking_prefix.sk
│ ├── nonbreaking_prefix.sl
│ ├── nonbreaking_prefix.sv
│ └── nonbreaking_prefix.ta
├── preprocess.sh
├── scan_example.py
├── setup_cluster_env.sh
├── setup_local_env.sh
├── shuffle.py
├── strip_sgml.py
├── tokenize_all.sh
├── tokenizer.perl
└── translate.sh
├── data_iterator.py
├── data_iterator.pyc
├── insepection.py
├── insepection.pyc
├── itchat.pkl
├── layers.py
├── layers.pyc
├── mteval.sh
├── nmt_uni.py
├── nmt_uni.pyc
├── optimizer.py
├── optimizer.pyc
├── plot_heatmap.ipynb
├── policy.py
├── policy.pyc
├── pretrain_uni.py
├── reward.py
├── reward.pyc
├── run_eval.sh
├── run_train.sh
├── show_progress.ipynb
├── simultrans_beam.py
├── simultrans_eval.py
├── simultrans_model.py
├── simultrans_model.pyc
├── simultrans_model_clean.py
├── simultrans_model_clean.pyc
├── simultrans_train.py
├── translate_uni.py
├── translate_uni.sh
├── utils.py
├── utils.pyc
└── utils
└── msyh.ttf
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2016, New York University (Kyunghyun Cho) and Jiatao Gu
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | * Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | * Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Learning to Translate in Real-time with Neural Machine Translation
2 | ===================================
3 | Translation in real time, a.k.a. simultaneous translation.
4 |
5 | This code is the Theano implementation of the EACL2017 paper [Learning to Translate in Real-time with Neural Machine Translation](https://arxiv.org/abs/1610.00388). It is based on the dl4mt-tutorial (https://github.com/nyu-dl/dl4mt-tutorial).
6 |
7 | Dependencies:
8 | ----------------------
9 | ### Python 2.7
10 | * Theano 0.8.2 (cuda 8.0, cudnn v5)
11 | * seaborn, pandas (for drawing the heatmap)
12 | * NLTK 3.2.1
13 |
14 | ### Preprocessing
15 | The preprocessing and evaluation scripts are from [MOSES](https://github.com/moses-smt/mosesdecoder).
16 |
17 | Dataset:
18 | ----------------------
19 | We used the WMT'15 corpora as our training set for both pretraining the NMT model and training the Simultaneous NMT model.
20 | The original WMT'15 corpora can be downloaded from [here](http://www.statmt.org/wmt15/translation-task.html).
21 | For the preprocessed corpora used in our experiments, both the source and target datasets are preprocessed using byte-pair encoding (http://arxiv.org/abs/1508.07909, https://github.com/rsennrich/subword-nmt).
22 |
23 | Pretraining:
24 | ----------------------
25 | Before training the agent for simultaneous translation, the underlying translation model requires pretraining.
26 | In our experiments, we pretrained single-layer unidirectional NMT models on the RU-EN and DE-EN corpora, in both translation directions.
27 |
28 | * We provided the preprocessed dataset and the pretrained models: (https://drive.google.com/drive/folders/0B0miOG3ks5c1SVljM1Q5SURibU0?usp=sharing)
29 |
30 | ### Pretrain your own model:
31 | Follow the instructions, set up the configuration in `config.py (pretrain_config)`, and then execute:
32 | ```bash
33 | $ export THEANO_FLAGS=device=gpu,floatX=float32
34 | $ python pretrain_uni.py
35 | ```
36 | It normally takes 1~2 weeks to train a unidirectional NMT model on the WMT'15 corpora.
37 |
38 | ### Evaluate the BLEU score for a pre-trained NMT model
39 | TBA.
40 |
41 | Simultaneous Translation:
42 | ----------------------
43 | ### Training an Agent
44 | Follow the instructions, set up the configuration in `config.py (rl_config)`, and then execute:
45 | ```bash
46 | $ export THEANO_FLAGS=device=gpu,floatX=float32
47 | $ python simultrans_train.py
48 | ```
49 | ### Monitoring
50 | TBA.
51 |
52 | ### Visualization
53 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nyu-dl/dl4mt-simul-trans/392ff3148e944be6fbc475d5285441807902e2e0/__init__.py
--------------------------------------------------------------------------------
/actors.py:
--------------------------------------------------------------------------------
1 | """
2 | Deterministic Actor Functions:
3 | """
4 | from layers import *
5 |
# Small positive constant. It is not referenced in the visible part of this
# module -- presumably a numerical-stability epsilon (TODO confirm intended use).
TINY = 1e-7
7 |
8 | # -------------------------------------------------------------------------#
9 | # Noise
def ou_noise(trng, x, mu=0., theta=0.15, sigma=0.01):
    """Return `x` after one mean-reverting noisy step.

    The step is the sum of a pull towards `mu`, `theta * (mu - x)`, and a
    random term, `sigma * trng.normal(x.shape)`.

    :param trng: object exposing `normal(shape)` (random stream).
    :param x: current value; also supplies the noise shape.
    :param mu: value the step pulls towards.
    :param theta: strength of the pull.
    :param sigma: scale of the random term.
    """
    pull = theta * (mu - x)
    jitter = sigma * trng.normal(x.shape)
    return x + (pull + jitter)
13 |
14 |
def gaussian_noise(trng, x, mu=0, sigma=0.01):
    """Return `mu + sigma * trng.normal(x.shape)`.

    `x` is only used for its shape; the result is the noise itself, not
    `x` plus noise.
    """
    sample = trng.normal(x.shape)
    return mu + sigma * sample
18 |
19 |
20 | # -------------------------------------------------------------------------#
21 | # Actors:
# Registry: actor name -> names of the functions implementing it.
# The first entry is the parameter initialiser, the rest are actor functions;
# get_actor() turns these names into the actual callables.
actors = {
    'dumb': ('param_init_dumb', 'dumb_actor'),
    'const': ('param_init_constant', 'constant_actor'),
    'ff': ('param_init_ff', 'ff_actor'),
    'gru': ('param_init_gru', 'gru_actor', 'gru_actor_hard'),
    'gru2': ('param_init_gru2', 'gru_actor2'),
    'gg': ('param_init_gg', 'gg_actor'),
}
29 |
30 |
def get_actor(name):
    """Resolve a registered actor to its functions.

    Looks up `name` in the module-level `actors` registry and returns a tuple
    with the module-level objects its entries name, in registry order.

    The names are resolved with a plain `globals()` lookup instead of
    `eval()`, so a registry entry can only ever refer to an existing
    module-level name and is never executed as an expression.

    :raises KeyError: if `name` is not registered, or if a registered
        function name is not defined in this module.
    """
    module_scope = globals()
    return tuple(module_scope[fn_name] for fn_name in actors[name])
34 |
35 |
36 | def _p(pp, name):
37 | return '%s_%s' % (pp, name)
38 |
39 |
40 | # -------------------------------------------------------------------------#
41 | # Dump Actors:
def param_init_dumb(options, prefix='db', nin=None, nout=None):
    """Return an empty parameter dictionary for the 'dumb' actor.

    The default sizes are still resolved from `options` (so a missing
    'dim'/'ctxdim' key raises KeyError when the size is not given), but
    neither they nor `prefix` end up in the result.
    """
    nin = options['dim'] + options['ctxdim'] if nin is None else nin
    nout = options['dim'] if nout is None else nout
    return OrderedDict()
51 |
52 |
def dumb_actor(tparams, options, h1, ctx=None, act=None, prefix='db'):
    """Emit an all-zero action shaped like `h1`.

    `act` is handed back unchanged as the second return value; `tparams`,
    `options`, `ctx` and `prefix` are not used.

    :returns: `(action, hidden)` with `action = zeros_like(h1)` and
        `hidden = act`.
    """
    return tensor.zeros_like(h1), act
58 |
59 |
60 | # constant Actors:
def param_init_constant(options, prefix='ct', nin=None, nout=None):
    """Create the parameters of the constant actor.

    The only parameter is a float32 zero vector of length `nout`, stored
    under the key `<prefix>_a`. `nout` defaults to `options['dim']`; `nin`
    defaults to `options['dim'] + options['ctxdim']` but is not used further.
    """
    nin = options['dim'] + options['ctxdim'] if nin is None else nin
    nout = options['dim'] if nout is None else nout

    bias = numpy.zeros((nout,)).astype('float32')
    return OrderedDict([(_p(prefix, 'a'), bias)])
71 |
72 |
def constant_actor(tparams, options, h1, ctx=None, act=None, prefix='ct'):
    """Emit the learned constant vector `<prefix>_a` as the action.

    The action starts as zeros shaped like `h1`; the vector is then added
    with leading broadcast axes inserted according to the action's `ndim`
    (one for 2-D, two for 3-D, none otherwise). `act` is returned unchanged
    as the hidden output.
    """
    const = tparams[_p(prefix, 'a')]
    action = tensor.zeros_like(h1)

    rank = action.ndim
    if rank == 2:
        action += const[None, :]
    elif rank == 3:
        action += const[None, None, :]
    else:
        action += const

    return action, act
84 |
85 |
86 | # Feedforward Actors:
def param_init_ff(options, prefix='ff', nin=None, nout=None, nhid=None):
    """Create the parameters of the feed-forward actor.

    Two 'ff' layers are initialised through `get_layer('ff')[0]`, both with
    `scale=0.001`: `<prefix>_in` (nin -> nhid) and `<prefix>_out`
    (nhid -> nout).

    Defaults: `nin = dim + ctxdim`, `nout = dim`, `nhid = act_hdim`, all read
    from `options`.
    """
    nin = options['dim'] + options['ctxdim'] if nin is None else nin
    nout = options['dim'] if nout is None else nout
    nhid = options['act_hdim'] if nhid is None else nhid

    init_ff = get_layer('ff')[0]
    params = OrderedDict()
    params = init_ff(options, params, prefix=prefix + '_in',
                     nin=nin, nout=nhid, scale=0.001)
    params = init_ff(options, params, prefix=prefix + '_out',
                     nin=nhid, nout=nout, scale=0.001)
    return params
107 |
108 |
def ff_actor(tparams, options, h1, ctx=None, act=None, prefix='ff'):
    """Feed-forward actor: two stacked 'ff' layers called with activ='tanh'.

    `h1` and `ctx` are concatenated along axis 1 and fed to `<prefix>_in`;
    its output goes through `<prefix>_out` to give the action. `act` is not
    used.

    :returns: `(action, hidden)`, where `hidden` is the first layer's output.
    """
    ff_layer = get_layer('ff')[1]

    joint_input = concatenate([h1, ctx], axis=1)
    hidden = ff_layer(tparams, joint_input,
                      options, prefix=prefix + '_in', activ='tanh')
    action = ff_layer(tparams, hidden,
                      options, prefix=prefix + '_out', activ='tanh')
    return action, hidden
117 |
118 |
119 | # Recurrent Actors:
def param_init_gru(options, prefix='ff', nin=None, nout=None, nhid=None):
    """Create the parameters of the recurrent (GRU) actor.

    Initialises a 'gru' layer `<prefix>_in` (nin -> nhid) followed by an 'ff'
    layer `<prefix>_out` (nhid -> nout), both with `scale=0.001`.

    Defaults: `nin = 2 * dim + ctxdim`, `nout = dim`, `nhid = act_hdim`, all
    read from `options`.
    """
    nin = 2 * options['dim'] + options['ctxdim'] if nin is None else nin
    nout = options['dim'] if nout is None else nout
    nhid = options['act_hdim'] if nhid is None else nhid

    params = OrderedDict()

    # Recurrent core ('lngru' was the commented-out alternative to 'gru').
    init_gru = get_layer('gru')[0]
    params = init_gru(options, params, prefix=prefix + '_in',
                      nin=nin, dim=nhid, scale=0.001)

    # Read-out from the recurrent state to the action.
    init_ff = get_layer('ff')[0]
    params = init_ff(options, params, prefix=prefix + '_out',
                     nin=nhid, nout=nout, scale=0.001)
    return params
142 |
143 |
def gru_actor(tparams, options, h1, ctx=None, act=None, prefix='ff'):
    """Run one step of the recurrent (GRU) actor.

    `act` is split along axis 1 at `options['act_hdim']` into the previous
    recurrent state and the previous action. The GRU `<prefix>_in` takes the
    concatenation `[h1, ctx, previous action]` with that state as
    `_init_state`; the 'ff' layer `<prefix>_out` (activ='tanh') maps the new
    state to the action.

    :returns: `(action, cur_act)`, where `cur_act` is `[hidden, action]`
        concatenated along axis 1, i.e. the `act` for the next step.
    """
    split_at = options['act_hdim']
    prev_state = act[:, :split_at]
    prev_action = act[:, split_at:]

    # ('lngru' was the commented-out alternative to 'gru' here.)
    gru_step = get_layer('gru')[1]
    gru_input = concatenate([h1, ctx, prev_action], axis=1)
    hidden = gru_step(tparams, gru_input,
                      options, prefix=prefix + '_in',
                      one_step=True, _init_state=prev_state)[0]

    readout = get_layer('ff')[1]
    action = readout(tparams, hidden,
                     options, prefix=prefix + '_out', activ='tanh')

    return action, concatenate([hidden, action], axis=1)
158 |
159 |
160 | # Recurrent Actor2
def param_init_gru2(options, prefix='ff', nin=None, nout=None, nhid=None):
    """Create the parameters of the second recurrent actor variant.

    Same layout as `param_init_gru` -- a 'gru' layer `<prefix>_in` plus an
    'ff' layer `<prefix>_out`, both with `scale=0.001` -- except that `nin`
    defaults to `options['dim']` only.

    Other defaults: `nout = dim`, `nhid = act_hdim`, read from `options`.
    """
    nin = options['dim'] if nin is None else nin
    nout = options['dim'] if nout is None else nout
    nhid = options['act_hdim'] if nhid is None else nhid

    params = OrderedDict()

    # Recurrent core ('lngru' was the commented-out alternative to 'gru').
    init_gru = get_layer('gru')[0]
    params = init_gru(options, params, prefix=prefix + '_in',
                      nin=nin, dim=nhid, scale=0.001)

    # Read-out from the recurrent state to the action.
    init_ff = get_layer('ff')[0]
    params = init_ff(options, params, prefix=prefix + '_out',
                     nin=nhid, nout=nout, scale=0.001)
    return params
183 |
184 |
def gru_actor2(tparams, options, h1, act=None, prefix='ff'):
    """Run one step of the second recurrent actor variant.

    The GRU `<prefix>_in` receives `h1` alone as input and `act` as its
    `_init_state`; the 'ff' layer `<prefix>_out` (activ='tanh') maps the new
    state to the action.

    Note that, unlike the other actors in this module, this function takes
    no `ctx` argument.

    :returns: `(action, hidden)`, where `hidden` is the new recurrent state.
    """
    # ('lngru' was the commented-out alternative to 'gru' here.)
    gru_step = get_layer('gru')[1]
    hidden = gru_step(tparams, h1,
                      options, prefix=prefix + '_in',
                      one_step=True, _init_state=act)[0]

    readout = get_layer('ff')[1]
    action = readout(tparams, hidden,
                     options, prefix=prefix + '_out', activ='tanh')
    return action, hidden
197 |
198 |
def gru_actor_hard(tparams, options, h1, ctx=None, act=None, prefix='ff', bound=0.1):
    """Recurrent actor step whose action norm is hard-limited to `bound`.

    Computes the action exactly like `gru_actor` (same layers and prefixes),
    then measures its L2 norm along the last axis. Wherever that norm exceeds
    `bound`, the action is rescaled as `action / norm * bound`; elsewhere it
    is left untouched.

    :returns: `(action, cur_act)`, where `cur_act` is `[hidden, action]`
        concatenated along axis 1 (using the limited action).
    """
    split_at = options['act_hdim']
    prev_state = act[:, :split_at]
    prev_action = act[:, split_at:]

    # ('lngru' was the commented-out alternative to 'gru' here.)
    gru_step = get_layer('gru')[1]
    gru_input = concatenate([h1, ctx, prev_action], axis=1)
    hidden = gru_step(tparams, gru_input,
                      options, prefix=prefix + '_in',
                      one_step=True, _init_state=prev_state)[0]

    readout = get_layer('ff')[1]
    action = readout(tparams, hidden,
                     options, prefix=prefix + '_out', activ='tanh')

    # Hard boundary on the action: rescale only where the norm is too large.
    squared = tensor.sum(action ** 2, axis=-1, keepdims=True)
    a_norm = tensor.sqrt(squared)
    rescaled = action / a_norm * bound
    action = tensor.switch(a_norm > bound, rescaled, action)

    return action, concatenate([hidden, action], axis=1)
217 |
218 |
219 | # Recurrent Actors:
def param_init_gg(options, prefix='ff', nin=None, nout=None, nhid=None):
    """Create the parameters of the gated recurrent actor.

    Three layers are initialised, in this order:

    * 'gru' `<prefix>_in`   (nin -> nhid, scale=0.001)
    * 'ff'  `<prefix>_out`  (nhid -> nout, scale=0.001)
    * 'ff'  `<prefix>_gate` (nin + nout -> nout, default scale)

    Defaults: `nin = 2 * dim + ctxdim`, `nout = dim`, `nhid = act_hdim`, all
    read from `options`.
    """
    nin = 2 * options['dim'] + options['ctxdim'] if nin is None else nin
    nout = options['dim'] if nout is None else nout
    nhid = options['act_hdim'] if nhid is None else nhid

    params = OrderedDict()

    # Recurrent core ('lngru' was the commented-out alternative to 'gru').
    init_gru = get_layer('gru')[0]
    params = init_gru(options, params, prefix=prefix + '_in',
                      nin=nin, dim=nhid, scale=0.001)

    init_ff = get_layer('ff')[0]
    params = init_ff(options, params, prefix=prefix + '_out',
                     nin=nhid, nout=nout, scale=0.001)

    # Gate with one unit per action dimension (scalar-gate variants with
    # nout=1 were the commented-out alternatives).
    params = init_ff(options, params, prefix=prefix + '_gate',
                     nin=nin + nout, nout=nout)
    return params
249 |
250 |
def gg_actor(tparams, options, h1, ctx=None, act=None, prefix='ff'):
    """Run one step of the gated recurrent actor.

    As in `gru_actor`, `act` is split at `options['act_hdim']` into previous
    state and previous action, and the GRU `<prefix>_in` followed by the 'ff'
    layer `<prefix>_out` (activ='tanh') produce a candidate output. The 'ff'
    layer `<prefix>_gate` (activ='sigmoid') is then applied to
    `[h1, ctx, previous action, output]`, and the action is the elementwise
    product `output * gate`.

    :returns: `(action, cur_act)`, where `cur_act` is `[hidden, action]`
        concatenated along axis 1.
    """
    split_at = options['act_hdim']
    prev_state = act[:, :split_at]
    prev_action = act[:, split_at:]

    # ('lngru' was the commented-out alternative to 'gru' here.)
    gru_step = get_layer('gru')[1]
    gru_input = concatenate([h1, ctx, prev_action], axis=1)
    hidden = gru_step(tparams, gru_input,
                      options, prefix=prefix + '_in',
                      one_step=True, _init_state=prev_state)[0]

    ff_layer = get_layer('ff')[1]
    output = ff_layer(tparams, hidden,
                      options, prefix=prefix + '_out', activ='tanh')

    # Per-dimension gate (scalar-gate variants were the commented-out alternatives).
    gate_input = concatenate([h1, ctx, prev_action, output], axis=1)
    gate = ff_layer(tparams, gate_input, options, prefix=prefix + '_gate', activ='sigmoid')
    action = output * gate

    return action, concatenate([hidden, action], axis=1)
270 |
271 |
272 |
273 |
274 |
--------------------------------------------------------------------------------
/bleu.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nyu-dl/dl4mt-simul-trans/392ff3148e944be6fbc475d5285441807902e2e0/bleu.pyc
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
1 | """
2 | Configuration for Simultaneous Neural Machine Translation
3 | """
4 | from collections import OrderedDict
5 |
6 | # data_home = '/home/thoma/scratch/un16/'
7 | # model_home = '/home/thoma/scratch/simul/'
8 | # data_home = '/mnt/scratch/un16/'
9 | # model_home = '/mnt/scratch/simul/'
10 |
# Directory prefix for the dataset / vocabulary paths built in
# pretrain_config() and rl_config(). Machine-specific: edit before running.
data_home = '/misc/kcgscratch1/ChoGroup/thoma_data/simul_trans/un16/'
# Directory prefix used by rl_config() for the workspace and for the
# pretrained model / option files. Machine-specific: edit before running.
model_home = '/misc/kcgscratch1/ChoGroup/thoma_data/simul_trans/'
13 |
14 |
def pretrain_config():
    """Configuration for pretraining the underlying NMT model.

    :returns: a fresh plain `dict` on every call; all file paths are built
        from the module-level `data_home`.
    """
    return {
        # training set (source, target)
        'datasets': [
            data_home + 'train.un16.en-zh.zh.c0.tok.clean.bpe20k.np',
            data_home + 'train.un16.en-zh.en.c0.tok.clean.bpe20k.np',
        ],

        # validation set (source, target)
        'valid_datasets': [
            data_home + 'devset.un16.en-zh.zh.c0.tok.bpe20k.np',
            data_home + 'devset.un16.en-zh.en.c0.tok.bpe20k.np',
        ],

        # vocabulary (source, target)
        'dictionaries': [
            data_home + 'train.un16.en-zh.zh.c0.tok.clean.bpe20k.vocab.pkl',
            data_home + 'train.un16.en-zh.en.c0.tok.clean.bpe20k.vocab.pkl',
        ],

        # save the model to
        'saveto': data_home + 'pretraining/model_un16_bpe2k_uni_zh-en.npz',
        'reload_': True,

        # model details
        'dim_word': 512,
        'dim': 1028,
        'n_words': 20000,
        'n_words_src': 20000,

        # learning details
        'decay_c': 0,
        'clip_c': 1.,
        'use_dropout': False,
        'lrate': 0.0001,
        'optimizer': 'adadelta',
        'patience': 1000,
        'maxlen': 50,
        'batch_size': 32,
        'valid_batch_size': 64,
        'validFreq': 1000,
        'dispFreq': 50,
        'saveFreq': 1000,
        'sampleFreq': 99,
    }
59 |
60 |
def rl_config():
    """Configuration for training the agent using the REINFORCE algorithm.

    :returns: a fresh `OrderedDict` on every call, with keys in the order
        listed below; paths are built from the module-level `data_home`
        and `model_home`.
    """
    entries = [
        # work-space
        ('workspace', model_home),

        # training set (source, target); or leave it None, agent will use the same corpus saved in the model
        ('datasets', [data_home + 'train.un16.en-zh.en.c0.tok.clean.bpe20k.np',
                      data_home + 'train.un16.en-zh.zh.c0.tok.clean.bpe20k.np']),

        # validation set (source, target); or leave it None, agent will use the same corpus saved in the model
        ('valid_datasets', [data_home + 'devset.un16.en-zh.en.c0.tok.bpe20k.np',
                            data_home + 'devset.un16.en-zh.zh.c0.tok.bpe20k.np']),

        # vocabulary (source, target); or leave it None, agent will use the same dictionary saved in the model
        ('dictionaries', [data_home + 'train.un16.en-zh.en.c0.tok.clean.bpe20k.vocab.pkl',
                          data_home + 'train.un16.en-zh.zh.c0.tok.clean.bpe20k.vocab.pkl']),

        # pretrained model
        ('model', model_home + '.pretrained/model_un16_bpe2k_uni_en-zh.npz'),
        ('option', model_home + '.pretrained/model_un16_bpe2k_uni_en-zh.npz.pkl'),

        # critical training parameters.
        ('sample', 10),
        ('batchsize', 10),
        ('rl_maxlen', 100),
        ('target_ap', 0.8),   # 0.75 # target delay if using AP as reward.
        ('target_cw', 8),     # if cw > 0 use cw mode

        # under-construction
        ('forget', False),

        # learning rate
        ('lr_policy', 0.0002),
        ('lr_model', 0.00002),

        # policy parameters
        ('prop', 0.5),              # leave it default
        ('recurrent', True),        # use a recurrent agent
        ('layernorm', False),       # layer normalization for the GRU agent.
        ('updater', 'REINFORCE'),   # 'TRPO' not work well.
        ('act_mask', True),         # leave it default

        # old model parameters (maybe useless, leave them default)
        ('step', 1),
        ('peek', 1),
        ('s0', 1),
        ('gamma', 1),
        ('Rtype', 10),
        ('maxsrc', 10),
        ('pre', False),
        ('coverage', False),
        ('upper', False),

        ('finetune', True),
        ('train_gt', False),   # when training with GT, fix the random agent??
        ('full_att', True),
        ('predict', True),
    ]
    return OrderedDict(entries)  # general configuration
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 |
--------------------------------------------------------------------------------
/config.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nyu-dl/dl4mt-simul-trans/392ff3148e944be6fbc475d5285441807902e2e0/config.pyc
--------------------------------------------------------------------------------
/data/README.md:
--------------------------------------------------------------------------------
1 | Data pre-processing related scripts and utilities.
2 |
3 | #### Setup
4 | Easiest way to setup your environment:
5 |
6 | ```bash
7 | $ cd ~; mkdir codes; cd codes
8 | $ git clone https://github.com/nyu-dl/dl4mt-tutorial
9 | $ cd dl4mt-tutorial/data
10 | $ ./setup_local_env.sh
11 | ```
12 |
13 | which will first clone this repository under `~/codes/dl4mt-tutorial`
14 | and then call the `setup_local_env.sh` script to retrieve example data
15 | and preprocess it.
16 |
17 | #### Pre-processing
18 | The following steps are executed by `setup_local_env.sh`:
19 | 1. Clone `dl4mt-tutorial` repository (if not cloned already)
20 | 2. Download `europarl-v7.fr-en` (training) and `newstest2011` (development)
21 | 3. Preprocess training and development sets
22 | * Tokenize using moses tokenizer
23 | * Shuffle training set for SGD
24 | * Build source and target dictionaries
25 |
26 | #### Pre-processing with subword-units
27 | If you want to use subword-units (eg. [Byte Pair Encoding](https://github.com/rsennrich/subword-nmt)) for source and target tokens, simply call:
28 | ```bash
29 | $ ./setup_local_env.sh -b
30 | ```
31 | which will replace the third step above, and execute the following steps:
32 | 1. Clone `dl4mt-tutorial` repository (if not cloned already)
33 | 2. Download `europarl-v7.fr-en` (training) and `newstest2011` (development)
34 | 3. Preprocess training and development sets (`preprocess.sh`)
35 | * Tokenize source and target side of all bitext
36 | * Learn BPE-codes for both source and target side using training sets
37 | * Encode source and target side using the learned codes
38 | * Shuffle training set for SGD
39 | * Build source and target dictionaries
40 |
41 | In case you want to preprocess your own data using BPE, you can use the `preprocess.sh` script directly.
42 |
43 | For the usage and more details, please check the comments in the scripts.
44 |
--------------------------------------------------------------------------------
/data/build_dictionary.py:
--------------------------------------------------------------------------------
1 | import numpy
2 | import cPickle as pkl
3 |
4 | import sys
5 | import fileinput
6 |
7 | from collections import OrderedDict
8 |
9 | def main():
10 | for filename in sys.argv[1:]:
11 | print 'Processing', filename
12 | word_freqs = OrderedDict()
13 | with open(filename, 'r') as f:
14 | for line in f:
15 | words_in = line.strip().split(' ')
16 | for w in words_in:
17 | if w not in word_freqs:
18 | word_freqs[w] = 0
19 | word_freqs[w] += 1
20 | words = word_freqs.keys()
21 | freqs = word_freqs.values()
22 |
23 | sorted_idx = numpy.argsort(freqs)
24 | sorted_words = [words[ii] for ii in sorted_idx[::-1]]
25 |
26 | worddict = OrderedDict()
27 | worddict['eos'] = 0
28 | worddict['UNK'] = 1
29 | for ii, ww in enumerate(sorted_words):
30 | worddict[ww] = ii+2
31 |
32 | with open('%s.pkl'%filename, 'wb') as f:
33 | pkl.dump(worddict, f)
34 |
35 | print 'Done'
36 |
37 | if __name__ == '__main__':
38 | main()
39 |
--------------------------------------------------------------------------------
/data/download_files.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | import argparse
4 | import logging
5 | import os
6 | import tarfile
7 | import urllib2
8 |
# Archive downloaded by main() as the training data (saved as train_data.tgz).
TRAIN_DATA_URL = 'http://www.statmt.org/europarl/v7/fr-en.tgz'
# Archive downloaded by main() as the development data (saved as valid_data.tgz).
VALID_DATA_URL = 'http://matrix.statmt.org/test_sets/newstest2011.tgz'

# Command-line interface. The parsed result is stored in the global `args`
# by the __main__ guard and read by main().
parser = argparse.ArgumentParser(
    description="""
This script donwloads parallel corpora given source and target pair language
indicators. Adapted from,
https://github.com/orhanf/blocks-examples/tree/master/machine_translation
""", formatter_class=argparse.RawTextHelpFormatter)
# Language codes; main() joins them as "<source>-<target>" to pick the
# training files out of the archive.
parser.add_argument("-s", "--source", type=str, help="Source language",
                    default="fr")
parser.add_argument("-t", "--target", type=str, help="Target language",
                    default="en")
# Names looked for inside the development-set archive.
parser.add_argument("--source-dev", type=str, default="newstest2011.fr",
                    help="Source language dev filename")
parser.add_argument("--target-dev", type=str, default="newstest2011.en",
                    help="Target language dev filename")
# Directory receiving both the downloaded archives and the extracted files.
parser.add_argument("--outdir", type=str, default=".",
                    help="Output directory")
28 |
29 |
30 | def download_and_write_file(url, file_name):
31 | logger.info("Downloading [{}]".format(url))
32 | if not os.path.exists(file_name):
33 | path = os.path.dirname(file_name)
34 | if not os.path.exists(path):
35 | os.makedirs(path)
36 | u = urllib2.urlopen(url)
37 | f = open(file_name, 'wb')
38 | meta = u.info()
39 | file_size = int(meta.getheaders("Content-Length")[0])
40 | logger.info("...saving to: %s Bytes: %s" % (file_name, file_size))
41 | file_size_dl = 0
42 | block_sz = 8192
43 | while True:
44 | buffer = u.read(block_sz)
45 | if not buffer:
46 | break
47 | file_size_dl += len(buffer)
48 | f.write(buffer)
49 | status = r"%10d [%3.2f%%]" % \
50 | (file_size_dl, file_size_dl * 100. / file_size)
51 | status = status + chr(8)*(len(status)+1)
52 | print status,
53 | f.close()
54 | else:
55 | logger.info("...file exists [{}]".format(file_name))
56 |
57 |
def extract_tar_file_to(file_to_extract, extract_into, names_to_look):
    """Extract the archive members whose name contains any of `names_to_look`.

    Members whose target path already exists under `extract_into` are not
    extracted again but are still reported.

    Compared with the original implementation, the archive is now closed when
    done (also on errors), and the "does not exist" error reports the names
    that were searched for instead of the (always empty) list of matches.

    Any exception -- including "nothing matched" -- is logged, not raised;
    the function then returns whatever was collected so far.

    :param file_to_extract: path of the tar archive.
    :param extract_into: directory to extract into.
    :param names_to_look: substrings selecting the members to extract.
    :returns: list of `extract_into`-joined paths of the selected members.
    """
    extracted_filenames = []
    try:
        logger.info("Extracting file [{}] into [{}]"
                    .format(file_to_extract, extract_into))
        with tarfile.open(file_to_extract, 'r') as tar:
            src_trg_files = [ff for ff in tar.getnames()
                             if any(ff.find(nn) > -1 for nn in names_to_look)]
            if not src_trg_files:
                raise ValueError("[{}] pair does not exist in the archive!"
                                 .format(names_to_look))
            for item in tar:
                # extract only source-target pair
                if item.name not in src_trg_files:
                    continue
                file_path = os.path.join(extract_into, item.path)
                if not os.path.exists(file_path):
                    logger.info("...extracting [{}] into [{}]"
                                .format(item.name, file_path))
                    # NOTE(review): member paths are taken from the archive
                    # as-is; only use this on archives from trusted sources.
                    tar.extract(item, extract_into)
                else:
                    logger.info("...file exists [{}]".format(file_path))
                extracted_filenames.append(file_path)
    except Exception as e:
        # Deliberate best-effort: report the problem and return what we have.
        logger.error("{}".format(str(e)))
    return extracted_filenames
84 |
85 |
def main():
    """Download and unpack the training archive, then the development archive.

    Each archive is saved under `args.outdir` and the selected members are
    extracted into the directory holding the archive.
    """
    jobs = [
        # Download europarl v7 and extract it
        (TRAIN_DATA_URL, 'train_data.tgz',
         ["{}-{}".format(args.source, args.target)]),
        # Download development set and extract it
        (VALID_DATA_URL, 'valid_data.tgz',
         [args.source_dev, args.target_dev]),
    ]
    for url, archive_name, wanted in jobs:
        archive = os.path.join(args.outdir, archive_name)
        download_and_write_file(url, archive)
        extract_tar_file_to(archive, os.path.dirname(archive), wanted)


if __name__ == "__main__":

    # `logger` and `args` are module globals read by the functions above,
    # so they must exist before main() runs.
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger('prepare_data')

    args = parser.parse_args()
    main()
110 |
--------------------------------------------------------------------------------
/data/length.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | import numpy
4 | import sys
5 |
6 | for name in sys.argv[1:]:
7 | lens = []
8 | with open(name, 'r') as f:
9 | for ll in f:
10 | lens.append(len(ll.strip().split(' ')))
11 | print name, ' max ', numpy.max(lens), ' min ', numpy.min(lens), ' average ', numpy.mean(lens)
12 |
13 |
14 |
15 |
--------------------------------------------------------------------------------
/data/merge.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# This script merges all the bitext files found in the given data directory.
# Source side files (*[src]-[trg].[src]) are concatenated into all_[src]-[trg].[src]
# Target side files (*[src]-[trg].[trg]) are concatenated into all_[src]-[trg].[trg]
#
# Usage: merge.sh src trg path_to_data
#
# All expansions are quoted so that a data directory containing spaces or
# glob characters is handled correctly.

if [ "$#" -ne 3 ]; then
    echo ""
    echo "Usage: $0 src trg path_to_data"
    echo ""
    exit 1
fi

SRC=$1
TRG=$2

DATA_DIR=$3

FSRC="${DATA_DIR}/all_${SRC}-${TRG}.${SRC}"
FTRG="${DATA_DIR}/all_${SRC}-${TRG}.${TRG}"

# (Re)create the merged source file. NOTE: `echo ""` leaves one empty first
# line in the output; the target side below does the same, so the two merged
# files stay line-aligned.
echo "" > "$FSRC"
for F in "${DATA_DIR}"/*"${SRC}-${TRG}.${SRC}"
do
    # The output file matches the glob as well; never append it to itself.
    if [ "$F" = "$FSRC" ]; then
        echo "pass"
    else
        cat "$F" >> "$FSRC"
    fi
done


# Same procedure for the target side.
echo "" > "$FTRG"
for F in "${DATA_DIR}"/*"${SRC}-${TRG}.${TRG}"
do
    if [ "$F" = "$FTRG" ]; then
        echo "pass"
    else
        cat "$F" >> "$FTRG"
    fi
done
41 |
--------------------------------------------------------------------------------
/data/multi-bleu.perl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | #
3 | # This file is part of moses. Its use is licensed under the GNU Lesser General
4 | # Public License version 2.1 or, at your option, any later version.
5 |
6 | # $Id$
7 | use warnings;
8 | use strict;
9 |
# Optional leading "-lc" switch: turn on lowercasing and drop the switch
# from @ARGV so the reference stem becomes the first argument.
my $lowercase = 0;
if ($ARGV[0] eq "-lc") {
  $lowercase = 1;
  shift;
}

# First remaining argument: path (or path stem) of the reference file(s).
my $stem = $ARGV[0];
if (!defined $stem) {
  print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n";
  print STDERR "Reads the references from reference or reference0, reference1, ...\n";
  exit(1);
}

# If neither "$stem" nor "${stem}0" exists but "${stem}.ref0" does,
# switch to the "<stem>.ref<N>" naming scheme.
$stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0";

# @REF is filled by add_to_ref: one entry per sentence, each holding that
# sentence's line from every reference file loaded.
my @REF;
my $ref=0;
# Load the numbered references stem0, stem1, ... for as long as they exist.
while(-e "$stem$ref") {
    &add_to_ref("$stem$ref",\@REF);
    $ref++;
}
# Also load the un-numbered reference file, if present.
&add_to_ref($stem,\@REF) if -e $stem;
die("ERROR: could not find reference file $stem") unless scalar @REF;
33 |
# add_to_ref($file, \@REF)
#
# Reads $file line by line and appends line number s (0-based) to the list
# stored at $REF->[s], so that after several calls each sentence slot holds
# one line per reference file. Dies if the file cannot be opened.
#
# Changes with respect to the original text:
#   * the read loop is a proper readline loop again (the condition had been
#     garbled into "while([");
#   * chomp instead of chop, so a final line without a trailing newline does
#     not lose its last character;
#   * three-argument open with a lexical handle, so characters in $file are
#     never interpreted as an open mode.
sub add_to_ref {
    my ($file,$REF) = @_;
    my $s=0;
    open(my $fh, '<', $file) or die "Can't read $file";
    while(<$fh>) {
        chomp;
        push @{$$REF[$s++]}, $_;
    }
    close($fh);
}
44 |
45 | my(@CORRECT,@TOTAL); my($length_translation,$length_reference)=(0,0);  # per-order clipped matches / hypothesis n-gram totals; running token counts
46 | my $s=0;  # index of the current hypothesis line, used to look up its references in @REF
47 | while(<STDIN>) {
48 |     chomp;  # strip only the line terminator; chop would eat a real character on an unterminated last line
49 |     $_ = lc if $lowercase;
50 |     my @WORD = split;
51 |     my %REF_NGRAM = ();  # "n w1 .. wn" => highest count of that n-gram in any single reference of this sentence
52 |     my $length_translation_this_sentence = scalar(@WORD);
53 |     my ($closest_diff,$closest_length) = (9999,9999);  # sentinels, replaced by the first reference examined
54 |     foreach my $reference (@{$REF[$s]}) {
55 | #       print "$s $_ <=> $reference\n";
56 |         $reference = lc($reference) if $lowercase;
57 |         my @WORD = split(' ',$reference);  # shadows the hypothesis @WORD inside this loop only
58 |         my $length = scalar(@WORD);
59 |         my $diff = abs($length_translation_this_sentence-$length);
60 |         if ($diff < $closest_diff) {
61 |             $closest_diff = $diff;
62 |             $closest_length = $length;
63 |             # print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n";
64 |         } elsif ($diff == $closest_diff) {
65 |             $closest_length = $length if $length < $closest_length;
66 |             # from two references with the same closeness to me
67 |             # take the *shorter* into account, not the "first" one.
68 |         }
69 |         for(my $n=1;$n<=4;$n++) {
70 |             my %REF_NGRAM_N = ();  # n-gram counts of order $n in this one reference
71 |             for(my $start=0;$start<=$#WORD-($n-1);$start++) {
72 |                 my $ngram = "$n";  # the key is prefixed with the order so all orders can share one hash
73 |                 for(my $w=0;$w<$n;$w++) {
74 |                     $ngram .= " ".$WORD[$start+$w];
75 |                 }
76 |                 $REF_NGRAM_N{$ngram}++;
77 |             }
78 |             foreach my $ngram (keys %REF_NGRAM_N) {
79 |                 if (!defined($REF_NGRAM{$ngram}) ||
80 |                     $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) {
81 |                     $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram};
82 | #                   print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}<BR>\n";
83 |                 }
84 |             }
85 |         }
86 |     }
87 |     $length_translation += $length_translation_this_sentence;
88 |     $length_reference += $closest_length;
89 |     for(my $n=1;$n<=4;$n++) {
90 |         my %T_NGRAM = ();  # n-gram counts of order $n in the hypothesis
91 |         for(my $start=0;$start<=$#WORD-($n-1);$start++) {
92 |             my $ngram = "$n";
93 |             for(my $w=0;$w<$n;$w++) {
94 |                 $ngram .= " ".$WORD[$start+$w];
95 |             }
96 |             $T_NGRAM{$ngram}++;
97 |         }
98 |         foreach my $ngram (keys %T_NGRAM) {
99 |             $ngram =~ /^(\d+) /;  # recover the order from the key prefix
100 |             my $n = $1;
101 |             # my $corr = 0;
102 | #           print "$i e $ngram $T_NGRAM{$ngram}<BR>\n";
103 |             $TOTAL[$n] += $T_NGRAM{$ngram};
104 |             if (defined($REF_NGRAM{$ngram})) {
105 |                 if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) {
106 |                     $CORRECT[$n] += $T_NGRAM{$ngram};
107 |                     # $corr = $T_NGRAM{$ngram};
108 | #                   print "$i e correct1 $T_NGRAM{$ngram}<BR>\n";
109 |                 }
110 |                 else {
111 |                     $CORRECT[$n] += $REF_NGRAM{$ngram};  # clip to the reference count
112 |                     # $corr = $REF_NGRAM{$ngram};
113 | #                   print "$i e correct2 $REF_NGRAM{$ngram}<BR>\n";
114 |                 }
115 |             }
116 |             # $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram};
117 |             # print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n"
118 |         }
119 |     }
120 |     $s++;
121 | }
122 | my $brevity_penalty = 1;
123 | my $bleu = 0;
124 | # $bleu[$n] is the clipped precision for n-grams of order $n (1..4).
125 | my @bleu=();
126 | # An order with no matches leaves $CORRECT[$n] undefined; count it as 0 instead of warning.
127 | for(my $n=1;$n<=4;$n++) {
128 |     if (defined ($TOTAL[$n])){
129 |         $bleu[$n]=($TOTAL[$n])?($CORRECT[$n]||0)/$TOTAL[$n]:0;
130 |         # print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n";
131 |     }else{
132 |         $bleu[$n]=0;
133 |     }
134 | }
135 | # No reference tokens were counted (also covers an undefined counter): report zero and exit non-zero.
136 | if (!$length_reference){
137 |     printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n";
138 |     exit(1);
139 | }
140 | # Brevity penalty; an empty hypothesis gets BP=0 instead of dying on a division by zero.
141 | if ($length_translation<$length_reference) {
142 |     $brevity_penalty = $length_translation ? exp(1-$length_reference/$length_translation) : 0;
143 | }
144 | $bleu = $brevity_penalty * exp((my_log( $bleu[1] ) +
145 |                                 my_log( $bleu[2] ) +
146 |                                 my_log( $bleu[3] ) +
147 |                                 my_log( $bleu[4] ) ) / 4) ;
148 | printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n",
149 |     100*$bleu,
150 |     100*$bleu[1],
151 |     100*$bleu[2],
152 |     100*$bleu[3],
153 |     100*$bleu[4],
154 |     $brevity_penalty,
155 |     $length_translation / $length_reference,
156 |     $length_translation,
157 |     $length_reference;
158 |
159 | sub my_log {  # natural log of the argument, or a large negative stand-in when the argument is false (0/undef)
160 |     my ($value) = @_;
161 |     return $value ? log($value) : -9999999999;
162 | }
163 |
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/README.txt:
--------------------------------------------------------------------------------
1 | The language suffix can be found here:
2 |
3 | http://www.loc.gov/standards/iso639-2/php/code_list.php
4 |
5 | This code includes data from Daniel Naber's Language Tools (Czech abbreviations).
6 | This code includes data from the Czech Wiktionary (also Czech abbreviations).
7 |
8 |
9 |
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.ca:
--------------------------------------------------------------------------------
1 | Dr
2 | Dra
3 | pàg
4 | p
5 | c
6 | av
7 | Sr
8 | Sra
9 | adm
10 | esq
11 | Prof
12 | S.A
13 | S.L
14 | p.e
15 | ptes
16 | Sta
17 | St
18 | pl
19 | màx
20 | cast
21 | dir
22 | nre
23 | fra
24 | admdora
25 | Emm
26 | Excma
27 | espf
28 | dc
29 | admdor
30 | tel
31 | angl
32 | aprox
33 | ca
34 | dept
35 | dj
36 | dl
37 | dt
38 | ds
39 | dg
40 | dv
41 | ed
42 | entl
43 | al
44 | i.e
45 | maj
46 | smin
47 | n
48 | núm
49 | pta
50 | A
51 | B
52 | C
53 | D
54 | E
55 | F
56 | G
57 | H
58 | I
59 | J
60 | K
61 | L
62 | M
63 | N
64 | O
65 | P
66 | Q
67 | R
68 | S
69 | T
70 | U
71 | V
72 | W
73 | X
74 | Y
75 | Z
76 |
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.cs:
--------------------------------------------------------------------------------
1 | Bc
2 | BcA
3 | Ing
4 | Ing.arch
5 | MUDr
6 | MVDr
7 | MgA
8 | Mgr
9 | JUDr
10 | PhDr
11 | RNDr
12 | PharmDr
13 | ThLic
14 | ThDr
15 | Ph.D
16 | Th.D
17 | prof
18 | doc
19 | CSc
20 | DrSc
21 | dr. h. c
22 | PaedDr
23 | Dr
24 | PhMr
25 | DiS
26 | abt
27 | ad
28 | a.i
29 | aj
30 | angl
31 | anon
32 | apod
33 | atd
34 | atp
35 | aut
36 | bd
37 | biogr
38 | b.m
39 | b.p
40 | b.r
41 | cca
42 | cit
43 | cizojaz
44 | c.k
45 | col
46 | čes
47 | čín
48 | čj
49 | ed
50 | facs
51 | fasc
52 | fol
53 | fot
54 | franc
55 | h.c
56 | hist
57 | hl
58 | hrsg
59 | ibid
60 | il
61 | ind
62 | inv.č
63 | jap
64 | jhdt
65 | jv
66 | koed
67 | kol
68 | korej
69 | kl
70 | krit
71 | lat
72 | lit
73 | m.a
74 | maď
75 | mj
76 | mp
77 | násl
78 | např
79 | nepubl
80 | něm
81 | no
82 | nr
83 | n.s
84 | okr
85 | odd
86 | odp
87 | obr
88 | opr
89 | orig
90 | phil
91 | pl
92 | pokrač
93 | pol
94 | port
95 | pozn
96 | př.kr
97 | př.n.l
98 | přel
99 | přeprac
100 | příl
101 | pseud
102 | pt
103 | red
104 | repr
105 | resp
106 | revid
107 | rkp
108 | roč
109 | roz
110 | rozš
111 | samost
112 | sect
113 | sest
114 | seš
115 | sign
116 | sl
117 | srv
118 | stol
119 | sv
120 | šk
121 | šk.ro
122 | špan
123 | tab
124 | t.č
125 | tis
126 | tj
127 | tř
128 | tzv
129 | univ
130 | uspoř
131 | vol
132 | vl.jm
133 | vs
134 | vyd
135 | vyobr
136 | zal
137 | zejm
138 | zkr
139 | zprac
140 | zvl
141 | n.p
142 | např
143 | než
144 | MUDr
145 | abl
146 | absol
147 | adj
148 | adv
149 | ak
150 | ak. sl
151 | akt
152 | alch
153 | amer
154 | anat
155 | angl
156 | anglosas
157 | arab
158 | arch
159 | archit
160 | arg
161 | astr
162 | astrol
163 | att
164 | bás
165 | belg
166 | bibl
167 | biol
168 | boh
169 | bot
170 | bulh
171 | círk
172 | csl
173 | č
174 | čas
175 | čes
176 | dat
177 | děj
178 | dep
179 | dět
180 | dial
181 | dór
182 | dopr
183 | dosl
184 | ekon
185 | epic
186 | etnonym
187 | eufem
188 | f
189 | fam
190 | fem
191 | fil
192 | film
193 | form
194 | fot
195 | fr
196 | fut
197 | fyz
198 | gen
199 | geogr
200 | geol
201 | geom
202 | germ
203 | gram
204 | hebr
205 | herald
206 | hist
207 | hl
208 | hovor
209 | hud
210 | hut
211 | chcsl
212 | chem
213 | ie
214 | imp
215 | impf
216 | ind
217 | indoevr
218 | inf
219 | instr
220 | interj
221 | ión
222 | iron
223 | it
224 | kanad
225 | katalán
226 | klas
227 | kniž
228 | komp
229 | konj
230 |
231 | konkr
232 | kř
233 | kuch
234 | lat
235 | lék
236 | les
237 | lid
238 | lit
239 | liturg
240 | lok
241 | log
242 | m
243 | mat
244 | meteor
245 | metr
246 | mod
247 | ms
248 | mysl
249 | n
250 | náb
251 | námoř
252 | neklas
253 | něm
254 | nesklon
255 | nom
256 | ob
257 | obch
258 | obyč
259 | ojed
260 | opt
261 | part
262 | pas
263 | pejor
264 | pers
265 | pf
266 | pl
267 | plpf
268 |
269 | práv
270 | prep
271 | předl
272 | přivl
273 | r
274 | rcsl
275 | refl
276 | reg
277 | rkp
278 | ř
279 | řec
280 | s
281 | samohl
282 | sg
283 | sl
284 | souhl
285 | spec
286 | srov
287 | stfr
288 | střv
289 | stsl
290 | subj
291 | subst
292 | superl
293 | sv
294 | sz
295 | táz
296 | tech
297 | telev
298 | teol
299 | trans
300 | typogr
301 | var
302 | vedl
303 | verb
304 | vl. jm
305 | voj
306 | vok
307 | vůb
308 | vulg
309 | výtv
310 | vztaž
311 | zahr
312 | zájm
313 | zast
314 | zejm
315 |
316 | zeměd
317 | zkr
318 | zř
319 | mj
320 | dl
321 | atp
322 | sport
323 | Mgr
324 | horn
325 | MVDr
326 | JUDr
327 | RSDr
328 | Bc
329 | PhDr
330 | ThDr
331 | Ing
332 | aj
333 | apod
334 | PharmDr
335 | pomn
336 | ev
337 | slang
338 | nprap
339 | odp
340 | dop
341 | pol
342 | st
343 | stol
344 | p. n. l
345 | před n. l
346 | n. l
347 | př. Kr
348 | po Kr
349 | př. n. l
350 | odd
351 | RNDr
352 | tzv
353 | atd
354 | tzn
355 | resp
356 | tj
357 | p
358 | br
359 | č. j
360 | čj
361 | č. p
362 | čp
363 | a. s
364 | s. r. o
365 | spol. s r. o
366 | p. o
367 | s. p
368 | v. o. s
369 | k. s
370 | o. p. s
371 | o. s
372 | v. r
373 | v z
374 | ml
375 | vč
376 | kr
377 | mld
378 | hod
379 | popř
380 | ap
381 | event
382 | rus
383 | slov
384 | rum
385 | švýc
386 | P. T
387 | zvl
388 | hor
389 | dol
390 | S.O.S
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.de:
--------------------------------------------------------------------------------
1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3 |
4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
5 | #usually upper case letters are initials in a name
6 | #no German words end in single lower-case letters, so we throw those in too.
7 | A
8 | B
9 | C
10 | D
11 | E
12 | F
13 | G
14 | H
15 | I
16 | J
17 | K
18 | L
19 | M
20 | N
21 | O
22 | P
23 | Q
24 | R
25 | S
26 | T
27 | U
28 | V
29 | W
30 | X
31 | Y
32 | Z
33 | a
34 | b
35 | c
36 | d
37 | e
38 | f
39 | g
40 | h
41 | i
42 | j
43 | k
44 | l
45 | m
46 | n
47 | o
48 | p
49 | q
50 | r
51 | s
52 | t
53 | u
54 | v
55 | w
56 | x
57 | y
58 | z
59 |
60 |
61 | #Roman Numerals. A dot after one of these is not a sentence break in German.
62 | I
63 | II
64 | III
65 | IV
66 | V
67 | VI
68 | VII
69 | VIII
70 | IX
71 | X
72 | XI
73 | XII
74 | XIII
75 | XIV
76 | XV
77 | XVI
78 | XVII
79 | XVIII
80 | XIX
81 | XX
82 | i
83 | ii
84 | iii
85 | iv
86 | v
87 | vi
88 | vii
89 | viii
90 | ix
91 | x
92 | xi
93 | xii
94 | xiii
95 | xiv
96 | xv
97 | xvi
98 | xvii
99 | xviii
100 | xix
101 | xx
102 |
103 | #Titles and Honorifics
104 | Adj
105 | Adm
106 | Adv
107 | Asst
108 | Bart
109 | Bldg
110 | Brig
111 | Bros
112 | Capt
113 | Cmdr
114 | Col
115 | Comdr
116 | Con
117 | Corp
118 | Cpl
119 | DR
120 | Dr
121 | Ens
122 | Gen
123 | Gov
124 | Hon
125 | Hosp
126 | Insp
127 | Lt
128 | MM
129 | MR
130 | MRS
131 | MS
132 | Maj
133 | Messrs
134 | Mlle
135 | Mme
136 | Mr
137 | Mrs
138 | Ms
139 | Msgr
140 | Op
141 | Ord
142 | Pfc
143 | Ph
144 | Prof
145 | Pvt
146 | Rep
147 | Reps
148 | Res
149 | Rev
150 | Rt
151 | Sen
152 | Sens
153 | Sfc
154 | Sgt
155 | Sr
156 | St
157 | Supt
158 | Surg
159 |
160 | #Misc symbols
161 | Mio
162 | Mrd
163 | bzw
164 | v
165 | vs
166 | usw
167 | d.h
168 | z.B
169 | u.a
170 | etc
171 | Mrd
172 | MwSt
173 | ggf
174 | d.J
175 | D.h
176 | m.E
177 | vgl
178 | I.F
179 | z.T
180 | sogen
181 | ff
182 | u.E
183 | g.U
184 | g.g.A
185 | c.-à-d
186 | Buchst
187 | u.s.w
188 | sog
189 | u.ä
190 | Std
191 | evtl
192 | Zt
193 | Chr
194 | u.U
195 | o.ä
196 | Ltd
197 | b.A
198 | z.Zt
199 | spp
200 | sen
201 | SA
202 | k.o
203 | jun
204 | i.H.v
205 | dgl
206 | dergl
207 | Co
208 | zzt
209 | usf
210 | s.p.a
211 | Dkr
212 | Corp
213 | bzgl
214 | BSE
215 |
216 | #Number indicators
217 | # add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it
218 | No
219 | Nos
220 | Art
221 | Nr
222 | pp
223 | ca
224 | Ca
225 |
226 | #Ordinals are done with . in German - "1." = "1st" in English
227 | 1
228 | 2
229 | 3
230 | 4
231 | 5
232 | 6
233 | 7
234 | 8
235 | 9
236 | 10
237 | 11
238 | 12
239 | 13
240 | 14
241 | 15
242 | 16
243 | 17
244 | 18
245 | 19
246 | 20
247 | 21
248 | 22
249 | 23
250 | 24
251 | 25
252 | 26
253 | 27
254 | 28
255 | 29
256 | 30
257 | 31
258 | 32
259 | 33
260 | 34
261 | 35
262 | 36
263 | 37
264 | 38
265 | 39
266 | 40
267 | 41
268 | 42
269 | 43
270 | 44
271 | 45
272 | 46
273 | 47
274 | 48
275 | 49
276 | 50
277 | 51
278 | 52
279 | 53
280 | 54
281 | 55
282 | 56
283 | 57
284 | 58
285 | 59
286 | 60
287 | 61
288 | 62
289 | 63
290 | 64
291 | 65
292 | 66
293 | 67
294 | 68
295 | 69
296 | 70
297 | 71
298 | 72
299 | 73
300 | 74
301 | 75
302 | 76
303 | 77
304 | 78
305 | 79
306 | 80
307 | 81
308 | 82
309 | 83
310 | 84
311 | 85
312 | 86
313 | 87
314 | 88
315 | 89
316 | 90
317 | 91
318 | 92
319 | 93
320 | 94
321 | 95
322 | 96
323 | 97
324 | 98
325 | 99
326 |
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.el:
--------------------------------------------------------------------------------
1 | # Single letters in upper-case are usually abbreviations of names
2 | Α
3 | Β
4 | Γ
5 | Δ
6 | Ε
7 | Ζ
8 | Η
9 | Θ
10 | Ι
11 | Κ
12 | Λ
13 | Μ
14 | Ν
15 | Ξ
16 | Ο
17 | Π
18 | Ρ
19 | Σ
20 | Τ
21 | Υ
22 | Φ
23 | Χ
24 | Ψ
25 | Ω
26 |
27 | # Includes abbreviations for the Greek language compiled from various sources (Greek grammar books, Greek language related web content).
28 | Άθαν
29 | Έγχρ
30 | Έκθ
31 | Έσδ
32 | Έφ
33 | Όμ
34 | Α΄Έσδρ
35 | Α΄Έσδ
36 | Α΄Βασ
37 | Α΄Θεσ
38 | Α΄Ιω
39 | Α΄Κορινθ
40 | Α΄Κορ
41 | Α΄Μακκ
42 | Α΄Μακ
43 | Α΄Πέτρ
44 | Α΄Πέτ
45 | Α΄Παραλ
46 | Α΄Πε
47 | Α΄Σαμ
48 | Α΄Τιμ
49 | Α΄Χρον
50 | Α΄Χρ
51 | Α.Β.Α
52 | Α.Β
53 | Α.Ε
54 | Α.Κ.Τ.Ο
55 | Αέθλ
56 | Αέτ
57 | Αίλ.Δ
58 | Αίλ.Τακτ
59 | Αίσ
60 | Αββακ
61 | Αβυδ
62 | Αβ
63 | Αγάκλ
64 | Αγάπ
65 | Αγάπ.Αμαρτ.Σ
66 | Αγάπ.Γεωπ
67 | Αγαθάγγ
68 | Αγαθήμ
69 | Αγαθιν
70 | Αγαθοκλ
71 | Αγαθρχ
72 | Αγαθ
73 | Αγαθ.Ιστ
74 | Αγαλλ
75 | Αγαπητ
76 | Αγγ
77 | Αγησ
78 | Αγλ
79 | Αγορ.Κ
80 | Αγρο.Κωδ
81 | Αγρ.Εξ
82 | Αγρ.Κ
83 | Αγ.Γρ
84 | Αδριαν
85 | Αδρ
86 | Αετ
87 | Αθάν
88 | Αθήν
89 | Αθήν.Επιγρ
90 | Αθήν.Επιτ
91 | Αθήν.Ιατρ
92 | Αθήν.Μηχ
93 | Αθανάσ
94 | Αθαν
95 | Αθηνί
96 | Αθηναγ
97 | Αθηνόδ
98 | Αθ
99 | Αθ.Αρχ
100 | Αιλ
101 | Αιλ.Επιστ
102 | Αιλ.ΖΙ
103 | Αιλ.ΠΙ
104 | Αιλ.απ
105 | Αιμιλ
106 | Αιν.Γαζ
107 | Αιν.Τακτ
108 | Αισχίν
109 | Αισχίν.Επιστ
110 | Αισχ
111 | Αισχ.Αγαμ
112 | Αισχ.Αγ
113 | Αισχ.Αλ
114 | Αισχ.Ελεγ
115 | Αισχ.Επτ.Θ
116 | Αισχ.Ευμ
117 | Αισχ.Ικέτ
118 | Αισχ.Ικ
119 | Αισχ.Περσ
120 | Αισχ.Προμ.Δεσμ
121 | Αισχ.Πρ
122 | Αισχ.Χοηφ
123 | Αισχ.Χο
124 | Αισχ.απ
125 | ΑιτΕ
126 | Αιτ
127 | Αλκ
128 | Αλχιας
129 | Αμ.Π.Ο
130 | Αμβ
131 | Αμμών
132 | Αμ.
133 | Αν.Πειθ.Συμβ.Δικ
134 | Ανακρ
135 | Ανακ
136 | Αναμν.Τόμ
137 | Αναπλ
138 | Ανδ
139 | Ανθλγος
140 | Ανθστης
141 | Αντισθ
142 | Ανχης
143 | Αν
144 | Αποκ
145 | Απρ
146 | Απόδ
147 | Απόφ
148 | Απόφ.Νομ
149 | Απ
150 | Απ.Δαπ
151 | Απ.Διατ
152 | Απ.Επιστ
153 | Αριθ
154 | Αριστοτ
155 | Αριστοφ
156 | Αριστοφ.Όρν
157 | Αριστοφ.Αχ
158 | Αριστοφ.Βάτρ
159 | Αριστοφ.Ειρ
160 | Αριστοφ.Εκκλ
161 | Αριστοφ.Θεσμ
162 | Αριστοφ.Ιππ
163 | Αριστοφ.Λυσ
164 | Αριστοφ.Νεφ
165 | Αριστοφ.Πλ
166 | Αριστοφ.Σφ
167 | Αριστ
168 | Αριστ.Αθ.Πολ
169 | Αριστ.Αισθ
170 | Αριστ.Αν.Πρ
171 | Αριστ.Ζ.Ι
172 | Αριστ.Ηθ.Ευδ
173 | Αριστ.Ηθ.Νικ
174 | Αριστ.Κατ
175 | Αριστ.Μετ
176 | Αριστ.Πολ
177 | Αριστ.Φυσιογν
178 | Αριστ.Φυσ
179 | Αριστ.Ψυχ
180 | Αριστ.Ρητ
181 | Αρμεν
182 | Αρμ
183 | Αρχ.Εκ.Καν.Δ
184 | Αρχ.Ευβ.Μελ
185 | Αρχ.Ιδ.Δ
186 | Αρχ.Νομ
187 | Αρχ.Ν
188 | Αρχ.Π.Ε
189 | Αρ
190 | Αρ.Φορ.Μητρ
191 | Ασμ
192 | Ασμ.ασμ
193 | Αστ.Δ
194 | Αστ.Χρον
195 | Ασ
196 | Ατομ.Γνωμ
197 | Αυγ
198 | Αφρ
199 | Αχ.Νομ
200 | Α
201 | Α.Εγχ.Π
202 | Α.Κ.΄Υδρας
203 | Β΄Έσδρ
204 | Β΄Έσδ
205 | Β΄Βασ
206 | Β΄Θεσ
207 | Β΄Ιω
208 | Β΄Κορινθ
209 | Β΄Κορ
210 | Β΄Μακκ
211 | Β΄Μακ
212 | Β΄Πέτρ
213 | Β΄Πέτ
214 | Β΄Πέ
215 | Β΄Παραλ
216 | Β΄Σαμ
217 | Β΄Τιμ
218 | Β΄Χρον
219 | Β΄Χρ
220 | Β.Ι.Π.Ε
221 | Β.Κ.Τ
222 | Β.Κ.Ψ.Β
223 | Β.Μ
224 | Β.Ο.Α.Κ
225 | Β.Ο.Α
226 | Β.Ο.Δ
227 | Βίβλ
228 | Βαρ
229 | ΒεΘ
230 | Βι.Περ
231 | Βιπερ
232 | Βιργ
233 | Βλγ
234 | Βούλ
235 | Βρ
236 | Γ΄Βασ
237 | Γ΄Μακκ
238 | ΓΕΝμλ
239 | Γέν
240 | Γαλ
241 | Γεν
242 | Γλ
243 | Γν.Ν.Σ.Κρ
244 | Γνωμ
245 | Γν
246 | Γράμμ
247 | Γρηγ.Ναζ
248 | Γρηγ.Νύσ
249 | Γ Νοσ
250 | Γ' Ογκολ
251 | Γ.Ν
252 | Δ΄Βασ
253 | Δ.Β
254 | Δ.Δίκη
255 | Δ.Δίκ
256 | Δ.Ε.Σ
257 | Δ.Ε.Φ.Α
258 | Δ.Ε.Φ
259 | Δ.Εργ.Ν
260 | Δαμ
261 | Δαμ.μνημ.έργ
262 | Δαν
263 | Δασ.Κ
264 | Δεκ
265 | Δελτ.Δικ.Ε.Τ.Ε
266 | Δελτ.Νομ
267 | Δελτ.Συνδ.Α.Ε
268 | Δερμ
269 | Δευτ
270 | Δεύτ
271 | Δημοσθ
272 | Δημόκρ
273 | Δι.Δικ
274 | Διάτ
275 | Διαιτ.Απ
276 | Διαιτ
277 | Διαρκ.Στρατ
278 | Δικ
279 | Διοίκ.Πρωτ
280 | ΔιοικΔνη
281 | Διοικ.Εφ
282 | Διον.Αρ
283 | Διόρθ.Λαθ
284 | Δ.κ.Π
285 | Δνη
286 | Δν
287 | Δογμ.Όρος
288 | Δρ
289 | Δ.τ.Α
290 | Δτ
291 | ΔωδΝομ
292 | Δ.Περ
293 | Δ.Στρ
294 | ΕΔΠολ
295 | ΕΕυρΚ
296 | ΕΙΣ
297 | ΕΝαυτΔ
298 | ΕΣΑμΕΑ
299 | ΕΣΘ
300 | ΕΣυγκΔ
301 | ΕΤρΑξΧρΔ
302 | Ε.Φ.Ε.Τ
303 | Ε.Φ.Ι
304 | Ε.Φ.Ο.Επ.Α
305 | Εβδ
306 | Εβρ
307 | Εγκύκλ.Επιστ
308 | Εγκ
309 | Εε.Αιγ
310 | Εθν.Κ.Τ
311 | Εθν
312 | Ειδ.Δικ.Αγ.Κακ
313 | Εικ
314 | Ειρ.Αθ
315 | Ειρην.Αθ
316 | Ειρην
317 | Έλεγχ
318 | Ειρ
319 | Εισ.Α.Π
320 | Εισ.Ε
321 | Εισ.Ν.Α.Κ
322 | Εισ.Ν.Κ.Πολ.Δ
323 | Εισ.Πρωτ
324 | Εισηγ.Έκθ
325 | Εισ
326 | Εκκλ
327 | Εκκ
328 | Εκ
329 | Ελλ.Δνη
330 | Εν.Ε
331 | Εξ
332 | Επ.Αν
333 | Επ.Εργ.Δ
334 | Επ.Εφ
335 | Επ.Κυπ.Δ
336 | Επ.Μεσ.Αρχ
337 | Επ.Νομ
338 | Επίκτ
339 | Επίκ
340 | Επι.Δ.Ε
341 | Επιθ.Ναυτ.Δικ
342 | Επικ
343 | Επισκ.Ε.Δ
344 | Επισκ.Εμπ.Δικ
345 | Επιστ.Επετ.Αρμ
346 | Επιστ.Επετ
347 | Επιστ.Ιερ
348 | Επιτρ.Προστ.Συνδ.Στελ
349 | Επιφάν
350 | Επτ.Εφ
351 | Επ.Ιρ
352 | Επ.Ι
353 | Εργ.Ασφ.Νομ
354 | Ερμ.Α.Κ
355 | Ερμη.Σ
356 | Εσθ
357 | Εσπερ
358 | Ετρ.Δ
359 | Ευκλ
360 | Ευρ.Δ.Δ.Α
361 | Ευρ.Σ.Δ.Α
362 | Ευρ.ΣτΕ
363 | Ευρατόμ
364 | Ευρ.Άλκ
365 | Ευρ.Ανδρομ
366 | Ευρ.Βάκχ
367 | Ευρ.Εκ
368 | Ευρ.Ελ
369 | Ευρ.Ηλ
370 | Ευρ.Ηρακ
371 | Ευρ.Ηρ
372 | Ευρ.Ηρ.Μαιν
373 | Ευρ.Ικέτ
374 | Ευρ.Ιππόλ
375 | Ευρ.Ιφ.Α
376 | Ευρ.Ιφ.Τ
377 | Ευρ.Ι.Τ
378 | Ευρ.Κύκλ
379 | Ευρ.Μήδ
380 | Ευρ.Ορ
381 | Ευρ.Ρήσ
382 | Ευρ.Τρωάδ
383 | Ευρ.Φοίν
384 | Εφ.Αθ
385 | Εφ.Εν
386 | Εφ.Επ
387 | Εφ.Θρ
388 | Εφ.Θ
389 | Εφ.Ι
390 | Εφ.Κερ
391 | Εφ.Κρ
392 | Εφ.Λ
393 | Εφ.Ν
394 | Εφ.Πατ
395 | Εφ.Πειρ
396 | Εφαρμ.Δ.Δ
397 | Εφαρμ
398 | Εφεσ
399 | Εφημ
400 | Εφ
401 | Ζαχ
402 | Ζιγ
403 | Ζυ
404 | Ζχ
405 | ΗΕ.Δ
406 | Ημερ
407 | Ηράκλ
408 | Ηροδ
409 | Ησίοδ
410 | Ησ
411 | Η.Ε.Γ
412 | ΘΗΣ
413 | ΘΡ
414 | Θαλ
415 | Θεοδ
416 | Θεοφ
417 | Θεσ
418 | Θεόδ.Μοψ
419 | Θεόκρ
420 | Θεόφιλ
421 | Θουκ
422 | Θρ
423 | Θρ.Ε
424 | Θρ.Ιερ
425 | Θρ.Ιρ
426 | Ιακ
427 | Ιαν
428 | Ιβ
429 | Ιδθ
430 | Ιδ
431 | Ιεζ
432 | Ιερ
433 | Ιζ
434 | Ιησ
435 | Ιησ.Ν
436 | Ικ
437 | Ιλ
438 | Ιν
439 | Ιουδ
440 | Ιουστ
441 | Ιούδα
442 | Ιούλ
443 | Ιούν
444 | Ιπποκρ
445 | Ιππόλ
446 | Ιρ
447 | Ισίδ.Πηλ
448 | Ισοκρ
449 | Ισ.Ν
450 | Ιωβ
451 | Ιωλ
452 | Ιων
453 | Ιω
454 | ΚΟΣ
455 | ΚΟ.ΜΕ.ΚΟΝ
456 | ΚΠοινΔ
457 | ΚΠολΔ
458 | ΚαΒ
459 | Καλ
460 | Καλ.Τέχν
461 | ΚανΒ
462 | Καν.Διαδ
463 | Κατάργ
464 | Κλ
465 | ΚοινΔ
466 | Κολσ
467 | Κολ
468 | Κον
469 | Κορ
470 | Κος
471 | ΚριτΕπιθ
472 | ΚριτΕ
473 | Κριτ
474 | Κρ
475 | ΚτΒ
476 | ΚτΕ
477 | ΚτΠ
478 | Κυβ
479 | Κυπρ
480 | Κύριλ.Αλεξ
481 | Κύριλ.Ιερ
482 | Λεβ
483 | Λεξ.Σουίδα
484 | Λευϊτ
485 | Λευ
486 | Λκ
487 | Λογ
488 | ΛουκΑμ
489 | Λουκιαν
490 | Λουκ.Έρωτ
491 | Λουκ.Ενάλ.Διάλ
492 | Λουκ.Ερμ
493 | Λουκ.Εταιρ.Διάλ
494 | Λουκ.Ε.Δ
495 | Λουκ.Θε.Δ
496 | Λουκ.Ικ.
497 | Λουκ.Ιππ
498 | Λουκ.Λεξιφ
499 | Λουκ.Μεν
500 | Λουκ.Μισθ.Συν
501 | Λουκ.Ορχ
502 | Λουκ.Περ
503 | Λουκ.Συρ
504 | Λουκ.Τοξ
505 | Λουκ.Τυρ
506 | Λουκ.Φιλοψ
507 | Λουκ.Φιλ
508 | Λουκ.Χάρ
509 | Λουκ.
510 | Λουκ.Αλ
511 | Λοχ
512 | Λυδ
513 | Λυκ
514 | Λυσ
515 | Λωζ
516 | Λ1
517 | Λ2
518 | ΜΟΕφ
519 | Μάρκ
520 | Μέν
521 | Μαλ
522 | Ματθ
523 | Μα
524 | Μιχ
525 | Μκ
526 | Μλ
527 | Μμ
528 | Μον.Δ.Π
529 | Μον.Πρωτ
530 | Μον
531 | Μρ
532 | Μτ
533 | Μχ
534 | Μ.Βασ
535 | Μ.Πλ
536 | ΝΑ
537 | Ναυτ.Χρον
538 | Να
539 | Νδικ
540 | Νεεμ
541 | Νε
542 | Νικ
543 | ΝκΦ
544 | Νμ
545 | ΝοΒ
546 | Νομ.Δελτ.Τρ.Ελ
547 | Νομ.Δελτ
548 | Νομ.Σ.Κ
549 | Νομ.Χρ
550 | Νομ
551 | Νομ.Διεύθ
552 | Νοσ
553 | Ντ
554 | Νόσων
555 | Ν1
556 | Ν2
557 | Ν3
558 | Ν4
559 | Νtot
560 | Ξενοφ
561 | Ξεν
562 | Ξεν.Ανάβ
563 | Ξεν.Απολ
564 | Ξεν.Απομν
565 | Ξεν.Απομ
566 | Ξεν.Ελλ
567 | Ξεν.Ιέρ
568 | Ξεν.Ιππαρχ
569 | Ξεν.Ιππ
570 | Ξεν.Κυρ.Αν
571 | Ξεν.Κύρ.Παιδ
572 | Ξεν.Κ.Π
573 | Ξεν.Λακ.Πολ
574 | Ξεν.Οικ
575 | Ξεν.Προσ
576 | Ξεν.Συμπόσ
577 | Ξεν.Συμπ
578 | Ο΄
579 | Οβδ
580 | Οβ
581 | ΟικΕ
582 | Οικ
583 | Οικ.Πατρ
584 | Οικ.Σύν.Βατ
585 | Ολομ
586 | Ολ
587 | Ολ.Α.Π
588 | Ομ.Ιλ
589 | Ομ.Οδ
590 | ΟπΤοιχ
591 | Οράτ
592 | Ορθ
593 | ΠΡΟ.ΠΟ
594 | Πίνδ
595 | Πίνδ.Ι
596 | Πίνδ.Νεμ
597 | Πίνδ.Ν
598 | Πίνδ.Ολ
599 | Πίνδ.Παθ
600 | Πίνδ.Πυθ
601 | Πίνδ.Π
602 | ΠαγΝμλγ
603 | Παν
604 | Παρμ
605 | Παροιμ
606 | Παρ
607 | Παυσ
608 | Πειθ.Συμβ
609 | ΠειρΝ
610 | Πελ
611 | ΠεντΣτρ
612 | Πεντ
613 | Πεντ.Εφ
614 | ΠερΔικ
615 | Περ.Γεν.Νοσ
616 | Πετ
617 | Πλάτ
618 | Πλάτ.Αλκ
619 | Πλάτ.Αντ
620 | Πλάτ.Αξίοχ
621 | Πλάτ.Απόλ
622 | Πλάτ.Γοργ
623 | Πλάτ.Ευθ
624 | Πλάτ.Θεαίτ
625 | Πλάτ.Κρατ
626 | Πλάτ.Κριτ
627 | Πλάτ.Λύσ
628 | Πλάτ.Μεν
629 | Πλάτ.Νόμ
630 | Πλάτ.Πολιτ
631 | Πλάτ.Πολ
632 | Πλάτ.Πρωτ
633 | Πλάτ.Σοφ.
634 | Πλάτ.Συμπ
635 | Πλάτ.Τίμ
636 | Πλάτ.Φαίδρ
637 | Πλάτ.Φιλ
638 | Πλημ
639 | Πλούτ
640 | Πλούτ.Άρατ
641 | Πλούτ.Αιμ
642 | Πλούτ.Αλέξ
643 | Πλούτ.Αλκ
644 | Πλούτ.Αντ
645 | Πλούτ.Αρτ
646 | Πλούτ.Ηθ
647 | Πλούτ.Θεμ
648 | Πλούτ.Κάμ
649 | Πλούτ.Καίσ
650 | Πλούτ.Κικ
651 | Πλούτ.Κράσ
652 | Πλούτ.Κ
653 | Πλούτ.Λυκ
654 | Πλούτ.Μάρκ
655 | Πλούτ.Μάρ
656 | Πλούτ.Περ
657 | Πλούτ.Ρωμ
658 | Πλούτ.Σύλλ
659 | Πλούτ.Φλαμ
660 | Πλ
661 | Ποιν.Δικ
662 | Ποιν.Δ
663 | Ποιν.Ν
664 | Ποιν.Χρον
665 | Ποιν.Χρ
666 | Πολ.Δ
667 | Πολ.Πρωτ
668 | Πολ
669 | Πολ.Μηχ
670 | Πολ.Μ
671 | Πρακτ.Αναθ
672 | Πρακτ.Ολ
673 | Πραξ
674 | Πρμ
675 | Πρξ
676 | Πρωτ
677 | Πρ
678 | Πρ.Αν
679 | Πρ.Λογ
680 | Πταισμ
681 | Πυρ.Καλ
682 | Πόλη
683 | Π.Δ
684 | Π.Δ.Άσμ
685 | ΡΜ.Ε
686 | Ρθ
687 | Ρμ
688 | Ρωμ
689 | ΣΠλημ
690 | Σαπφ
691 | Σειρ
692 | Σολ
693 | Σοφ
694 | Σοφ.Αντιγ
695 | Σοφ.Αντ
696 | Σοφ.Αποσ
697 | Σοφ.Απ
698 | Σοφ.Ηλέκ
699 | Σοφ.Ηλ
700 | Σοφ.Οιδ.Κολ
701 | Σοφ.Οιδ.Τύρ
702 | Σοφ.Ο.Τ
703 | Σοφ.Σειρ
704 | Σοφ.Σολ
705 | Σοφ.Τραχ
706 | Σοφ.Φιλοκτ
707 | Σρ
708 | Σ.τ.Ε
709 | Σ.τ.Π
710 | Στρ.Π.Κ
711 | Στ.Ευρ
712 | Συζήτ
713 | Συλλ.Νομολ
714 | Συλ.Νομ
715 | ΣυμβΕπιθ
716 | Συμπ.Ν
717 | Συνθ.Αμ
718 | Συνθ.Ε.Ε
719 | Συνθ.Ε.Κ
720 | Συνθ.Ν
721 | Σφν
722 | Σφ
723 | Σφ.Σλ
724 | Σχ.Πολ.Δ
725 | Σχ.Συντ.Ε
726 | Σωσ
727 | Σύντ
728 | Σ.Πληρ
729 | ΤΘ
730 | ΤΣ.Δ
731 | Τίτ
732 | Τβ
733 | Τελ.Ενημ
734 | Τελ.Κ
735 | Τερτυλ
736 | Τιμ
737 | Τοπ.Α
738 | Τρ.Ο
739 | Τριμ
740 | Τριμ.Πλ
741 | Τρ.Πλημ
742 | Τρ.Π.Δ
743 | Τ.τ.Ε
744 | Ττ
745 | Τωβ
746 | Υγ
747 | Υπερ
748 | Υπ
749 | Υ.Γ
750 | Φιλήμ
751 | Φιλιπ
752 | Φιλ
753 | Φλμ
754 | Φλ
755 | Φορ.Β
756 | Φορ.Δ.Ε
757 | Φορ.Δνη
758 | Φορ.Δ
759 | Φορ.Επ
760 | Φώτ
761 | Χρ.Ι.Δ
762 | Χρ.Ιδ.Δ
763 | Χρ.Ο
764 | Χρυσ
765 | Ψήφ
766 | Ψαλμ
767 | Ψαλ
768 | Ψλ
769 | Ωριγ
770 | Ωσ
771 | Ω.Ρ.Λ
772 | άγν
773 | άγν.ετυμολ
774 | άγ
775 | άκλ
776 | άνθρ
777 | άπ
778 | άρθρ
779 | άρν
780 | άρ
781 | άτ
782 | άψ
783 | ά
784 | έκδ
785 | έκφρ
786 | έμψ
787 | ένθ.αν
788 | έτ
789 | έ.α
790 | ίδ
791 | αβεστ
792 | αβησσ
793 | αγγλ
794 | αγγ
795 | αδημ
796 | αεροναυτ
797 | αερον
798 | αεροπ
799 | αθλητ
800 | αθλ
801 | αθροιστ
802 | αιγυπτ
803 | αιγ
804 | αιτιολ
805 | αιτ
806 | αι
807 | ακαδ
808 | ακκαδ
809 | αλβ
810 | αλλ
811 | αλφαβητ
812 | αμα
813 | αμερικ
814 | αμερ
815 | αμετάβ
816 | αμτβ
817 | αμφιβ
818 | αμφισβ
819 | αμφ
820 | αμ
821 | ανάλ
822 | ανάπτ
823 | ανάτ
824 | αναβ
825 | αναδαν
826 | αναδιπλασ
827 | αναδιπλ
828 | αναδρ
829 | αναλ
830 | αναν
831 | ανασυλλ
832 | ανατολ
833 | ανατομ
834 | ανατυπ
835 | ανατ
836 | αναφορ
837 | αναφ
838 | ανα.ε
839 | ανδρων
840 | ανθρωπολ
841 | ανθρωπ
842 | ανθ
843 | ανομ
844 | αντίτ
845 | αντδ
846 | αντιγρ
847 | αντιθ
848 | αντικ
849 | αντιμετάθ
850 | αντων
851 | αντ
852 | ανωτ
853 | ανόργ
854 | ανών
855 | αορ
856 | απαρέμφ
857 | απαρφ
858 | απαρχ
859 | απαρ
860 | απλολ
861 | απλοπ
862 | αποβ
863 | αποηχηροπ
864 | αποθ
865 | αποκρυφ
866 | αποφ
867 | απρμφ
868 | απρφ
869 | απρόσ
870 | απόδ
871 | απόλ
872 | απόσπ
873 | απόφ
874 | αραβοτουρκ
875 | αραβ
876 | αραμ
877 | αρβαν
878 | αργκ
879 | αριθμτ
880 | αριθμ
881 | αριθ
882 | αρκτικόλ
883 | αρκ
884 | αρμεν
885 | αρμ
886 | αρνητ
887 | αρσ
888 | αρχαιολ
889 | αρχιτεκτ
890 | αρχιτ
891 | αρχκ
892 | αρχ
893 | αρωμουν
894 | αρωμ
895 | αρ
896 | αρ.μετρ
897 | αρ.φ
898 | ασσυρ
899 | αστρολ
900 | αστροναυτ
901 | αστρον
902 | αττ
903 | αυστραλ
904 | αυτοπ
905 | αυτ
906 | αφγαν
907 | αφηρ
908 | αφομ
909 | αφρικ
910 | αχώρ
911 | αόρ
912 | α.α
913 | α/α
914 | α0
915 | βαθμ
916 | βαθ
917 | βαπτ
918 | βασκ
919 | βεβαιωτ
920 | βεβ
921 | βεδ
922 | βενετ
923 | βεν
924 | βερβερ
925 | βιβλγρ
926 | βιολ
927 | βιομ
928 | βιοχημ
929 | βιοχ
930 | βλάχ
931 | βλ
932 | βλ.λ
933 | βοταν
934 | βοτ
935 | βουλγαρ
936 | βουλγ
937 | βούλ
938 | βραζιλ
939 | βρετον
940 | βόρ
941 | γαλλ
942 | γενικότ
943 | γενοβ
944 | γεν
945 | γερμαν
946 | γερμ
947 | γεωγρ
948 | γεωλ
949 | γεωμετρ
950 | γεωμ
951 | γεωπ
952 | γεωργ
953 | γλυπτ
954 | γλωσσολ
955 | γλωσσ
956 | γλ
957 | γνμδ
958 | γνμ
959 | γνωμ
960 | γοτθ
961 | γραμμ
962 | γραμ
963 | γρμ
964 | γρ
965 | γυμν
966 | δίδες
967 | δίκ
968 | δίφθ
969 | δαν
970 | δεικτ
971 | δεκατ
972 | δηλ
973 | δημογρ
974 | δημοτ
975 | δημώδ
976 | δημ
977 | διάγρ
978 | διάκρ
979 | διάλεξ
980 | διάλ
981 | διάσπ
982 | διαλεκτ
983 | διατρ
984 | διαφ
985 | διαχ
986 | διδα
987 | διεθν
988 | διεθ
989 | δικον
990 | διστ
991 | δισύλλ
992 | δισ
993 | διφθογγοπ
994 | δογμ
995 | δολ
996 | δοτ
997 | δρμ
998 | δρχ
999 | δρ(α)
1000 | δωρ
1001 | δ
1002 | εβρ
1003 | εγκλπ
1004 | εδ
1005 | εθνολ
1006 | εθν
1007 | ειδικότ
1008 | ειδ
1009 | ειδ.β
1010 | εικ
1011 | ειρ
1012 | εισ
1013 | εκατοστμ
1014 | εκατοστ
1015 | εκατστ.2
1016 | εκατστ.3
1017 | εκατ
1018 | εκδ
1019 | εκκλησ
1020 | εκκλ
1021 | εκ
1022 | ελλην
1023 | ελλ
1024 | ελνστ
1025 | ελπ
1026 | εμβ
1027 | εμφ
1028 | εναλλ
1029 | ενδ
1030 | ενεργ
1031 | ενεστ
1032 | ενικ
1033 | ενν
1034 | εν
1035 | εξέλ
1036 | εξακολ
1037 | εξομάλ
1038 | εξ
1039 | εο
1040 | επέκτ
1041 | επίδρ
1042 | επίθ
1043 | επίρρ
1044 | επίσ
1045 | επαγγελμ
1046 | επανάλ
1047 | επανέκδ
1048 | επιθ
1049 | επικ
1050 | επιμ
1051 | επιρρ
1052 | επιστ
1053 | επιτατ
1054 | επιφ
1055 | επών
1056 | επ
1057 | εργ
1058 | ερμ
1059 | ερρινοπ
1060 | ερωτ
1061 | ετρουσκ
1062 | ετυμ
1063 | ετ
1064 | ευφ
1065 | ευχετ
1066 | εφ
1067 | εύχρ
1068 | ε.α
1069 | ε/υ
1070 | ε0
1071 | ζωγρ
1072 | ζωολ
1073 | ηθικ
1074 | ηθ
1075 | ηλεκτρολ
1076 | ηλεκτρον
1077 | ηλεκτρ
1078 | ημίτ
1079 | ημίφ
1080 | ημιφ
1081 | ηχηροπ
1082 | ηχηρ
1083 | ηχομιμ
1084 | ηχ
1085 | η
1086 | θέατρ
1087 | θεολ
1088 | θετ
1089 | θηλ
1090 | θρακ
1091 | θρησκειολ
1092 | θρησκ
1093 | θ
1094 | ιαπων
1095 | ιατρ
1096 | ιδιωμ
1097 | ιδ
1098 | ινδ
1099 | ιραν
1100 | ισπαν
1101 | ιστορ
1102 | ιστ
1103 | ισχυροπ
1104 | ιταλ
1105 | ιχθυολ
1106 | ιων
1107 | κάτ
1108 | καθ
1109 | κακοσ
1110 | καν
1111 | καρ
1112 | κατάλ
1113 | κατατ
1114 | κατωτ
1115 | κατ
1116 | κα
1117 | κελτ
1118 | κεφ
1119 | κινεζ
1120 | κινημ
1121 | κλητ
1122 | κλιτ
1123 | κλπ
1124 | κλ
1125 | κν
1126 | κοινωνιολ
1127 | κοινων
1128 | κοπτ
1129 | κουτσοβλαχ
1130 | κουτσοβλ
1131 | κπ
1132 | κρ.γν
1133 | κτγ
1134 | κτην
1135 | κτητ
1136 | κτλ
1137 | κτ
1138 | κυριολ
1139 | κυρ
1140 | κύρ
1141 | κ
1142 | κ.ά
1143 | κ.ά.π
1144 | κ.α
1145 | κ.εξ
1146 | κ.επ
1147 | κ.ε
1148 | κ.λπ
1149 | κ.λ.π
1150 | κ.ού.κ
1151 | κ.ο.κ
1152 | κ.τ.λ
1153 | κ.τ.τ
1154 | κ.τ.ό
1155 | λέξ
1156 | λαογρ
1157 | λαπ
1158 | λατιν
1159 | λατ
1160 | λαϊκότρ
1161 | λαϊκ
1162 | λετ
1163 | λιθ
1164 | λογιστ
1165 | λογοτ
1166 | λογ
1167 | λουβ
1168 | λυδ
1169 | λόγ
1170 | λ
1171 | λ.χ
1172 | μέλλ
1173 | μέσ
1174 | μαθημ
1175 | μαθ
1176 | μαιευτ
1177 | μαλαισ
1178 | μαλτ
1179 | μαμμων
1180 | μεγεθ
1181 | μεε
1182 | μειωτ
1183 | μελ
1184 | μεξ
1185 | μεσν
1186 | μεσογ
1187 | μεσοπαθ
1188 | μεσοφ
1189 | μετάθ
1190 | μεταβτ
1191 | μεταβ
1192 | μετακ
1193 | μεταπλ
1194 | μεταπτωτ
1195 | μεταρ
1196 | μεταφορ
1197 | μετβ
1198 | μετεπιθ
1199 | μετεπιρρ
1200 | μετεωρολ
1201 | μετεωρ
1202 | μετον
1203 | μετουσ
1204 | μετοχ
1205 | μετρ
1206 | μετ
1207 | μητρων
1208 | μηχανολ
1209 | μηχ
1210 | μικροβιολ
1211 | μογγολ
1212 | μορφολ
1213 | μουσ
1214 | μπενελούξ
1215 | μσνλατ
1216 | μσν
1217 | μτβ
1218 | μτγν
1219 | μτγ
1220 | μτφρδ
1221 | μτφρ
1222 | μτφ
1223 | μτχ
1224 | μυθ
1225 | μυκην
1226 | μυκ
1227 | μφ
1228 | μ
1229 | μ.ε
1230 | μ.μ
1231 | μ.π.ε
1232 | μ.π.π
1233 | μ0
1234 | ναυτ
1235 | νεοελλ
1236 | νεολατιν
1237 | νεολατ
1238 | νεολ
1239 | νεότ
1240 | νλατ
1241 | νομ
1242 | νορβ
1243 | νοσ
1244 | νότ
1245 | ν
1246 | ξ.λ
1247 | οικοδ
1248 | οικολ
1249 | οικον
1250 | οικ
1251 | ολλανδ
1252 | ολλ
1253 | ομηρ
1254 | ομόρρ
1255 | ονομ
1256 | ον
1257 | οπτ
1258 | ορθογρ
1259 | ορθ
1260 | οριστ
1261 | ορυκτολ
1262 | ορυκτ
1263 | ορ
1264 | οσετ
1265 | οσκ
1266 | ουαλ
1267 | ουγγρ
1268 | ουδ
1269 | ουσιαστικοπ
1270 | ουσιαστ
1271 | ουσ
1272 | πίν
1273 | παθητ
1274 | παθολ
1275 | παθ
1276 | παιδ
1277 | παλαιοντ
1278 | παλαιότ
1279 | παλ
1280 | παππων
1281 | παράγρ
1282 | παράγ
1283 | παράλλ
1284 | παράλ
1285 | παραγ
1286 | παρακ
1287 | παραλ
1288 | παραπ
1289 | παρατ
1290 | παρβ
1291 | παρετυμ
1292 | παροξ
1293 | παρων
1294 | παρωχ
1295 | παρ
1296 | παρ.φρ
1297 | πατριδων
1298 | πατρων
1299 | πβ
1300 | περιθ
1301 | περιλ
1302 | περιφρ
1303 | περσ
1304 | περ
1305 | πιθ
1306 | πληθ
1307 | πληροφ
1308 | ποδ
1309 | ποιητ
1310 | πολιτ
1311 | πολλαπλ
1312 | πολ
1313 | πορτογαλ
1314 | πορτ
1315 | ποσ
1316 | πρακριτ
1317 | πρβλ
1318 | πρβ
1319 | πργ
1320 | πρκμ
1321 | πρκ
1322 | πρλ
1323 | προέλ
1324 | προβηγκ
1325 | προελλ
1326 | προηγ
1327 | προθεμ
1328 | προπαραλ
1329 | προπαροξ
1330 | προπερισπ
1331 | προσαρμ
1332 | προσηγορ
1333 | προσταχτ
1334 | προστ
1335 | προσφών
1336 | προσ
1337 | προτακτ
1338 | προτ.Εισ
1339 | προφ
1340 | προχωρ
1341 | πρτ
1342 | πρόθ
1343 | πρόσθ
1344 | πρόσ
1345 | πρότ
1346 | πρ
1347 | πρ.Εφ
1348 | πτ
1349 | πυ
1350 | π
1351 | π.Χ
1352 | π.μ
1353 | π.χ
1354 | ρήμ
1355 | ρίζ
1356 | ρηματ
1357 | ρητορ
1358 | ριν
1359 | ρουμ
1360 | ρωμ
1361 | ρωσ
1362 | ρ
1363 | σανσκρ
1364 | σαξ
1365 | σελ
1366 | σερβοκρ
1367 | σερβ
1368 | σημασιολ
1369 | σημδ
1370 | σημειολ
1371 | σημερ
1372 | σημιτ
1373 | σημ
1374 | σκανδ
1375 | σκυθ
1376 | σκωπτ
1377 | σλαβ
1378 | σλοβ
1379 | σουηδ
1380 | σουμερ
1381 | σουπ
1382 | σπάν
1383 | σπανιότ
1384 | σπ
1385 | σσ
1386 | στατ
1387 | στερ
1388 | στιγμ
1389 | στιχ
1390 | στρέμ
1391 | στρατιωτ
1392 | στρατ
1393 | στ
1394 | συγγ
1395 | συγκρ
1396 | συγκ
1397 | συμπερ
1398 | συμπλεκτ
1399 | συμπλ
1400 | συμπροφ
1401 | συμφυρ
1402 | συμφ
1403 | συνήθ
1404 | συνίζ
1405 | συναίρ
1406 | συναισθ
1407 | συνδετ
1408 | συνδ
1409 | συνεκδ
1410 | συνηρ
1411 | συνθετ
1412 | συνθ
1413 | συνοπτ
1414 | συντελ
1415 | συντομογρ
1416 | συντ
1417 | συν
1418 | συρ
1419 | σχημ
1420 | σχ
1421 | σύγκρ
1422 | σύμπλ
1423 | σύμφ
1424 | σύνδ
1425 | σύνθ
1426 | σύντμ
1427 | σύντ
1428 | σ
1429 | σ.π
1430 | σ/β
1431 | τακτ
1432 | τελ
1433 | τετρ
1434 | τετρ.μ
1435 | τεχνλ
1436 | τεχνολ
1437 | τεχν
1438 | τεύχ
1439 | τηλεπικ
1440 | τηλεόρ
1441 | τιμ
1442 | τιμ.τομ
1443 | τοΣ
1444 | τον
1445 | τοπογρ
1446 | τοπων
1447 | τοπ
1448 | τοσκ
1449 | τουρκ
1450 | τοχ
1451 | τριτοπρόσ
1452 | τροποπ
1453 | τροπ
1454 | τσεχ
1455 | τσιγγ
1456 | ττ
1457 | τυπ
1458 | τόμ
1459 | τόνν
1460 | τ
1461 | τ.μ
1462 | τ.χλμ
1463 | υβρ
1464 | υπερθ
1465 | υπερσ
1466 | υπερ
1467 | υπεύθ
1468 | υποθ
1469 | υποκορ
1470 | υποκ
1471 | υποσημ
1472 | υποτ
1473 | υποφ
1474 | υποχωρ
1475 | υπόλ
1476 | υπόχρ
1477 | υπ
1478 | υστλατ
1479 | υψόμ
1480 | υψ
1481 | φάκ
1482 | φαρμακολ
1483 | φαρμ
1484 | φιλολ
1485 | φιλοσ
1486 | φιλοτ
1487 | φινλ
1488 | φοινικ
1489 | φράγκ
1490 | φρανκον
1491 | φριζ
1492 | φρ
1493 | φυλλ
1494 | φυσιολ
1495 | φυσ
1496 | φωνηεντ
1497 | φωνητ
1498 | φωνολ
1499 | φων
1500 | φωτογρ
1501 | φ
1502 | φ.τ.μ
1503 | χαμιτ
1504 | χαρτόσ
1505 | χαρτ
1506 | χασμ
1507 | χαϊδ
1508 | χγφ
1509 | χειλ
1510 | χεττ
1511 | χημ
1512 | χιλ
1513 | χλγρ
1514 | χλγ
1515 | χλμ
1516 | χλμ.2
1517 | χλμ.3
1518 | χλσγρ
1519 | χλστγρ
1520 | χλστμ
1521 | χλστμ.2
1522 | χλστμ.3
1523 | χλ
1524 | χργρ
1525 | χρημ
1526 | χρον
1527 | χρ
1528 | χφ
1529 | χ.ε
1530 | χ.κ
1531 | χ.ο
1532 | χ.σ
1533 | χ.τ
1534 | χ.χ
1535 | ψευδ
1536 | ψυχαν
1537 | ψυχιατρ
1538 | ψυχολ
1539 | ψυχ
1540 | ωκεαν
1541 | όμ
1542 | όν
1543 | όπ.παρ
1544 | όπ.π
1545 | ό.π
1546 | ύψ
1547 | 1Βσ
1548 | 1Εσ
1549 | 1Θσ
1550 | 1Ιν
1551 | 1Κρ
1552 | 1Μκ
1553 | 1Πρ
1554 | 1Πτ
1555 | 1Τμ
1556 | 2Βσ
1557 | 2Εσ
1558 | 2Θσ
1559 | 2Ιν
1560 | 2Κρ
1561 | 2Μκ
1562 | 2Πρ
1563 | 2Πτ
1564 | 2Τμ
1565 | 3Βσ
1566 | 3Ιν
1567 | 3Μκ
1568 | 4Βσ
1569 |
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.en:
--------------------------------------------------------------------------------
1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3 |
4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
5 | #usually upper case letters are initials in a name
6 | A
7 | B
8 | C
9 | D
10 | E
11 | F
12 | G
13 | H
14 | I
15 | J
16 | K
17 | L
18 | M
19 | N
20 | O
21 | P
22 | Q
23 | R
24 | S
25 | T
26 | U
27 | V
28 | W
29 | X
30 | Y
31 | Z
32 |
33 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
34 | Adj
35 | Adm
36 | Adv
37 | Asst
38 | Bart
39 | Bldg
40 | Brig
41 | Bros
42 | Capt
43 | Cmdr
44 | Col
45 | Comdr
46 | Con
47 | Corp
48 | Cpl
49 | DR
50 | Dr
51 | Drs
52 | Ens
53 | Gen
54 | Gov
55 | Hon
56 | Hr
57 | Hosp
58 | Insp
59 | Lt
60 | MM
61 | MR
62 | MRS
63 | MS
64 | Maj
65 | Messrs
66 | Mlle
67 | Mme
68 | Mr
69 | Mrs
70 | Ms
71 | Msgr
72 | Op
73 | Ord
74 | Pfc
75 | Ph
76 | Prof
77 | Pvt
78 | Rep
79 | Reps
80 | Res
81 | Rev
82 | Rt
83 | Sen
84 | Sens
85 | Sfc
86 | Sgt
87 | Sr
88 | St
89 | Supt
90 | Surg
91 |
92 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
93 | v
94 | vs
95 | i.e
96 | rev
97 | e.g
98 |
99 | #Numbers only. These are treated as non-breaking prefixes only when followed by a numeric sequence
100 | # add NUMERIC_ONLY after the word for this function
101 | #This case is mostly for the english "No." which can either be a sentence of its own, or
102 | #if followed by a number, a non-breaking prefix
103 | No #NUMERIC_ONLY#
104 | Nos
105 | Art #NUMERIC_ONLY#
106 | Nr
107 | pp #NUMERIC_ONLY#
108 |
109 | #month abbreviations
110 | Jan
111 | Feb
112 | Mar
113 | Apr
114 | #May is a full word
115 | Jun
116 | Jul
117 | Aug
118 | Sep
119 | Oct
120 | Nov
121 | Dec
122 |
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.es:
--------------------------------------------------------------------------------
1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3 |
4 | #any single upper case letter followed by a period is not a sentence ender
5 | #usually upper case letters are initials in a name
6 | A
7 | B
8 | C
9 | D
10 | E
11 | F
12 | G
13 | H
14 | I
15 | J
16 | K
17 | L
18 | M
19 | N
20 | O
21 | P
22 | Q
23 | R
24 | S
25 | T
26 | U
27 | V
28 | W
29 | X
30 | Y
31 | Z
32 |
33 | # Period-final abbreviation list from http://www.ctspanish.com/words/abbreviations.htm
34 |
35 | A.C
36 | Apdo
37 | Av
38 | Bco
39 | CC.AA
40 | Da
41 | Dep
42 | Dn
43 | Dr
44 | Dra
45 | EE.UU
46 | Excmo
47 | FF.CC
48 | Fil
49 | Gral
50 | J.C
51 | Let
52 | Lic
53 | N.B
54 | P.D
55 | P.V.P
56 | Prof
57 | Pts
58 | Rte
59 | S.A
60 | S.A.R
61 | S.E
62 | S.L
63 | S.R.C
64 | Sr
65 | Sra
66 | Srta
67 | Sta
68 | Sto
69 | T.V.E
70 | Tel
71 | Ud
72 | Uds
73 | V.B
74 | V.E
75 | Vd
76 | Vds
77 | a/c
78 | adj
79 | admón
80 | afmo
81 | apdo
82 | av
83 | c
84 | c.f
85 | c.g
86 | cap
87 | cm
88 | cta
89 | dcha
90 | doc
91 | ej
92 | entlo
93 | esq
94 | etc
95 | f.c
96 | gr
97 | grs
98 | izq
99 | kg
100 | km
101 | mg
102 | mm
103 | núm
104 | núm
105 | p
106 | p.a
107 | p.ej
108 | ptas
109 | pág
110 | págs
111 | pág
112 | págs
113 | q.e.g.e
114 | q.e.s.m
115 | s
116 | s.s.s
117 | vid
118 | vol
119 |
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.fi:
--------------------------------------------------------------------------------
1 | #Anything in this file, followed by a period (and an upper-case word), does NOT
2 | #indicate an end-of-sentence marker. Special cases are included for prefixes
3 | #that ONLY appear before 0-9 numbers.
4 |
5 | #This list is compiled from omorfi database
6 | #by Tommi A Pirinen.
7 |
8 |
9 | #any single upper case letter followed by a period is not a sentence ender
10 | A
11 | B
12 | C
13 | D
14 | E
15 | F
16 | G
17 | H
18 | I
19 | J
20 | K
21 | L
22 | M
23 | N
24 | O
25 | P
26 | Q
27 | R
28 | S
29 | T
30 | U
31 | V
32 | W
33 | X
34 | Y
35 | Z
36 | Å
37 | Ä
38 | Ö
39 |
40 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
41 | alik
42 | alil
43 | amir
44 | apul
45 | apul.prof
46 | arkkit
47 | ass
48 | assist
49 | dipl
50 | dipl.arkkit
51 | dipl.ekon
52 | dipl.ins
53 | dipl.kielenk
54 | dipl.kirjeenv
55 | dipl.kosm
56 | dipl.urk
57 | dos
58 | erikoiseläinl
59 | erikoishammasl
60 | erikoisl
61 | erikoist
62 | ev.luutn
63 | evp
64 | fil
65 | ft
66 | hallinton
67 | hallintot
68 | hammaslääket
69 | jatk
70 | jääk
71 | kansaned
72 | kapt
73 | kapt.luutn
74 | kenr
75 | kenr.luutn
76 | kenr.maj
77 | kers
78 | kirjeenv
79 | kom
80 | kom.kapt
81 | komm
82 | konst
83 | korpr
84 | luutn
85 | maist
86 | maj
87 | Mr
88 | Mrs
89 | Ms
90 | M.Sc
91 | neuv
92 | nimim
93 | Ph.D
94 | prof
95 | puh.joht
96 | pääll
97 | res
98 | san
99 | siht
100 | suom
101 | sähköp
102 | säv
103 | toht
104 | toim
105 | toim.apul
106 | toim.joht
107 | toim.siht
108 | tuom
109 | ups
110 | vänr
111 | vääp
112 | ye.ups
113 | ylik
114 | ylil
115 | ylim
116 | ylimatr
117 | yliop
118 | yliopp
119 | ylip
120 | yliv
121 |
122 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall
123 | #into this category - it sometimes ends a sentence)
124 | e.g
125 | ent
126 | esim
127 | huom
128 | i.e
129 | ilm
130 | l
131 | mm
132 | myöh
133 | nk
134 | nyk
135 | par
136 | po
137 | t
138 | v
139 |
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.fr:
--------------------------------------------------------------------------------
1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3 | #
4 | #any single upper case letter followed by a period is not a sentence ender
5 | #usually upper case letters are initials in a name
6 | #no French words end in single lower-case letters, so we throw those in too?
7 | A
8 | B
9 | C
10 | D
11 | E
12 | F
13 | G
14 | H
15 | I
16 | J
17 | K
18 | L
19 | M
20 | N
21 | O
22 | P
23 | Q
24 | R
25 | S
26 | T
27 | U
28 | V
29 | W
30 | X
31 | Y
32 | Z
33 | a
34 | b
35 | c
36 | d
37 | e
38 | f
39 | g
40 | h
41 | i
42 | j
43 | k
44 | l
45 | m
46 | n
47 | o
48 | p
49 | q
50 | r
51 | s
52 | t
53 | u
54 | v
55 | w
56 | x
57 | y
58 | z
59 |
60 | # Period-final abbreviation list for French
61 | A.C.N
62 | A.M
63 | art
64 | ann
65 | apr
66 | av
67 | auj
68 | lib
69 | B.P
70 | boul
71 | ca
72 | c.-à-d
73 | cf
74 | ch.-l
75 | chap
76 | contr
77 | C.P.I
78 | C.Q.F.D
79 | C.N
80 | C.N.S
81 | C.S
82 | dir
83 | éd
84 | e.g
85 | env
86 | al
87 | etc
88 | E.V
89 | ex
90 | fasc
91 | fém
92 | fig
93 | fr
94 | hab
95 | ibid
96 | id
97 | i.e
98 | inf
99 | LL.AA
100 | LL.AA.II
101 | LL.AA.RR
102 | LL.AA.SS
103 | L.D
104 | LL.EE
105 | LL.MM
106 | LL.MM.II.RR
107 | loc.cit
108 | masc
109 | MM
110 | ms
111 | N.B
112 | N.D.A
113 | N.D.L.R
114 | N.D.T
115 | n/réf
116 | NN.SS
117 | N.S
118 | N.D
119 | N.P.A.I
120 | p.c.c
121 | pl
122 | pp
123 | p.ex
124 | p.j
125 | P.S
126 | R.A.S
127 | R.-V
128 | R.P
129 | R.I.P
130 | SS
131 | S.S
132 | S.A
133 | S.A.I
134 | S.A.R
135 | S.A.S
136 | S.E
137 | sec
138 | sect
139 | sing
140 | S.M
141 | S.M.I.R
142 | sq
143 | sqq
144 | suiv
145 | sup
146 | suppl
147 | tél
148 | T.S.V.P
149 | vb
150 | vol
151 | vs
152 | X.O
153 | Z.I
154 |
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.hu:
--------------------------------------------------------------------------------
1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3 |
4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
5 | #usually upper case letters are initials in a name
6 | A
7 | B
8 | C
9 | D
10 | E
11 | F
12 | G
13 | H
14 | I
15 | J
16 | K
17 | L
18 | M
19 | N
20 | O
21 | P
22 | Q
23 | R
24 | S
25 | T
26 | U
27 | V
28 | W
29 | X
30 | Y
31 | Z
32 | Á
33 | É
34 | Í
35 | Ó
36 | Ö
37 | Ő
38 | Ú
39 | Ü
40 | Ű
41 |
42 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
43 | Dr
44 | dr
45 | kb
46 | Kb
47 | vö
48 | Vö
49 | pl
50 | Pl
51 | ca
52 | Ca
53 | min
54 | Min
55 | max
56 | Max
57 | ún
58 | Ún
59 | prof
60 | Prof
61 | de
62 | De
63 | du
64 | Du
65 | Szt
66 | St
67 |
68 | #Numbers only. These are treated as non-breaking prefixes only when followed by a numeric sequence
69 | # add NUMERIC_ONLY after the word for this function
70 | #This case is mostly for the english "No." which can either be a sentence of its own, or
71 | #if followed by a number, a non-breaking prefix
72 |
73 | # Month name abbreviations
74 | jan #NUMERIC_ONLY#
75 | Jan #NUMERIC_ONLY#
76 | Feb #NUMERIC_ONLY#
77 | feb #NUMERIC_ONLY#
78 | márc #NUMERIC_ONLY#
79 | Márc #NUMERIC_ONLY#
80 | ápr #NUMERIC_ONLY#
81 | Ápr #NUMERIC_ONLY#
82 | máj #NUMERIC_ONLY#
83 | Máj #NUMERIC_ONLY#
84 | jún #NUMERIC_ONLY#
85 | Jún #NUMERIC_ONLY#
86 | Júl #NUMERIC_ONLY#
87 | júl #NUMERIC_ONLY#
88 | aug #NUMERIC_ONLY#
89 | Aug #NUMERIC_ONLY#
90 | Szept #NUMERIC_ONLY#
91 | szept #NUMERIC_ONLY#
92 | okt #NUMERIC_ONLY#
93 | Okt #NUMERIC_ONLY#
94 | nov #NUMERIC_ONLY#
95 | Nov #NUMERIC_ONLY#
96 | dec #NUMERIC_ONLY#
97 | Dec #NUMERIC_ONLY#
98 |
99 | # Other abbreviations
100 | tel #NUMERIC_ONLY#
101 | Tel #NUMERIC_ONLY#
102 | Fax #NUMERIC_ONLY#
103 | fax #NUMERIC_ONLY#
104 |
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.is:
--------------------------------------------------------------------------------
1 | no #NUMERIC_ONLY#
2 | No #NUMERIC_ONLY#
3 | nr #NUMERIC_ONLY#
4 | Nr #NUMERIC_ONLY#
5 | nR #NUMERIC_ONLY#
6 | NR #NUMERIC_ONLY#
7 | a
8 | b
9 | c
10 | d
11 | e
12 | f
13 | g
14 | h
15 | i
16 | j
17 | k
18 | l
19 | m
20 | n
21 | o
22 | p
23 | q
24 | r
25 | s
26 | t
27 | u
28 | v
29 | w
30 | x
31 | y
32 | z
33 | ^
34 | í
35 | á
36 | ó
37 | æ
38 | A
39 | B
40 | C
41 | D
42 | E
43 | F
44 | G
45 | H
46 | I
47 | J
48 | K
49 | L
50 | M
51 | N
52 | O
53 | P
54 | Q
55 | R
56 | S
57 | T
58 | U
59 | V
60 | W
61 | X
62 | Y
63 | Z
64 | ab.fn
65 | a.fn
66 | afs
67 | al
68 | alm
69 | alg
70 | andh
71 | ath
72 | aths
73 | atr
74 | ao
75 | au
76 | aukaf
77 | áfn
78 | áhrl.s
79 | áhrs
80 | ákv.gr
81 | ákv
82 | bh
83 | bls
84 | dr
85 | e.Kr
86 | et
87 | ef
88 | efn
89 | ennfr
90 | eink
91 | end
92 | e.st
93 | erl
94 | fél
95 | fskj
96 | fh
97 | f.hl
98 | físl
99 | fl
100 | fn
101 | fo
102 | forl
103 | frb
104 | frl
105 | frh
106 | frt
107 | fsl
108 | fsh
109 | fs
110 | fsk
111 | fst
112 | f.Kr
113 | ft
114 | fv
115 | fyrrn
116 | fyrrv
117 | germ
118 | gm
119 | gr
120 | hdl
121 | hdr
122 | hf
123 | hl
124 | hlsk
125 | hljsk
126 | hljv
127 | hljóðv
128 | hr
129 | hv
130 | hvk
131 | holl
132 | Hos
133 | höf
134 | hk
135 | hrl
136 | ísl
137 | kaf
138 | kap
139 | Khöfn
140 | kk
141 | kg
142 | kk
143 | km
144 | kl
145 | klst
146 | kr
147 | kt
148 | kgúrsk
149 | kvk
150 | leturbr
151 | lh
152 | lh.nt
153 | lh.þt
154 | lo
155 | ltr
156 | mlja
157 | mljó
158 | millj
159 | mm
160 | mms
161 | m.fl
162 | miðm
163 | mgr
164 | mst
165 | mín
166 | nf
167 | nh
168 | nhm
169 | nl
170 | nk
171 | nmgr
172 | no
173 | núv
174 | nt
175 | o.áfr
176 | o.m.fl
177 | ohf
178 | o.fl
179 | o.s.frv
180 | ófn
181 | ób
182 | óákv.gr
183 | óákv
184 | pfn
185 | PR
186 | pr
187 | Ritstj
188 | Rvík
189 | Rvk
190 | samb
191 | samhlj
192 | samn
193 | samn
194 | sbr
195 | sek
196 | sérn
197 | sf
198 | sfn
199 | sh
200 | sfn
201 | sh
202 | s.hl
203 | sk
204 | skv
205 | sl
206 | sn
207 | so
208 | ss.us
209 | s.st
210 | samþ
211 | sbr
212 | shlj
213 | sign
214 | skál
215 | st
216 | st.s
217 | stk
218 | sþ
219 | teg
220 | tbl
221 | tfn
222 | tl
223 | tvíhlj
224 | tvt
225 | till
226 | to
227 | umr
228 | uh
229 | us
230 | uppl
231 | útg
232 | vb
233 | Vf
234 | vh
235 | vkf
236 | Vl
237 | vl
238 | vlf
239 | vmf
240 | 8vo
241 | vsk
242 | vth
243 | þt
244 | þf
245 | þjs
246 | þgf
247 | þlt
248 | þolm
249 | þm
250 | þml
251 | þýð
252 |
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.it:
--------------------------------------------------------------------------------
1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3 |
4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
5 | #usually upper case letters are initials in a name
6 | A
7 | B
8 | C
9 | D
10 | E
11 | F
12 | G
13 | H
14 | I
15 | J
16 | K
17 | L
18 | M
19 | N
20 | O
21 | P
22 | Q
23 | R
24 | S
25 | T
26 | U
27 | V
28 | W
29 | X
30 | Y
31 | Z
32 |
33 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
34 | Adj
35 | Adm
36 | Adv
37 | Amn
38 | Arch
39 | Asst
40 | Avv
41 | Bart
42 | Bcc
43 | Bldg
44 | Brig
45 | Bros
46 | C.A.P
47 | C.P
48 | Capt
49 | Cc
50 | Cmdr
51 | Co
52 | Col
53 | Comdr
54 | Con
55 | Corp
56 | Cpl
57 | DR
58 | Dott
59 | Dr
60 | Drs
61 | Egr
62 | Ens
63 | Gen
64 | Geom
65 | Gov
66 | Hon
67 | Hosp
68 | Hr
69 | Id
70 | Ing
71 | Insp
72 | Lt
73 | MM
74 | MR
75 | MRS
76 | MS
77 | Maj
78 | Messrs
79 | Mlle
80 | Mme
81 | Mo
82 | Mons
83 | Mr
84 | Mrs
85 | Ms
86 | Msgr
87 | N.B
88 | Op
89 | Ord
90 | P.S
91 | P.T
92 | Pfc
93 | Ph
94 | Prof
95 | Pvt
96 | RP
97 | RSVP
98 | Rag
99 | Rep
100 | Reps
101 | Res
102 | Rev
103 | Rif
104 | Rt
105 | S.A
106 | S.B.F
107 | S.P.M
108 | S.p.A
109 | S.r.l
110 | Sen
111 | Sens
112 | Sfc
113 | Sgt
114 | Sig
115 | Sigg
116 | Soc
117 | Spett
118 | Sr
119 | St
120 | Supt
121 | Surg
122 | V.P
123 |
124 | # other
125 | a.c
126 | acc
127 | all
128 | banc
129 | c.a
130 | c.c.p
131 | c.m
132 | c.p
133 | c.s
134 | c.v
135 | corr
136 | dott
137 | e.p.c
138 | ecc
139 | es
140 | fatt
141 | gg
142 | int
143 | lett
144 | ogg
145 | on
146 | p.c
147 | p.c.c
148 | p.es
149 | p.f
150 | p.r
151 | p.v
152 | post
153 | pp
154 | racc
155 | ric
156 | s.n.c
157 | seg
158 | sgg
159 | ss
160 | tel
161 | u.s
162 | v.r
163 | v.s
164 |
165 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
166 | v
167 | vs
168 | i.e
169 | rev
170 | e.g
171 |
172 | #Numbers only. These are treated as non-breaking prefixes only when followed by a numeric sequence
173 | # add NUMERIC_ONLY after the word for this function
174 | #This case is mostly for the english "No." which can either be a sentence of its own, or
175 | #if followed by a number, a non-breaking prefix
176 | No #NUMERIC_ONLY#
177 | Nos
178 | Art #NUMERIC_ONLY#
179 | Nr
180 | pp #NUMERIC_ONLY#
181 |
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.lv:
--------------------------------------------------------------------------------
1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3 |
4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
5 | #usually upper case letters are initials in a name
6 | A
7 | Ā
8 | B
9 | C
10 | Č
11 | D
12 | E
13 | Ē
14 | F
15 | G
16 | Ģ
17 | H
18 | I
19 | Ī
20 | J
21 | K
22 | Ķ
23 | L
24 | Ļ
25 | M
26 | N
27 | Ņ
28 | O
29 | P
30 | Q
31 | R
32 | S
33 | Š
34 | T
35 | U
36 | Ū
37 | V
38 | W
39 | X
40 | Y
41 | Z
42 | Ž
43 |
44 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
45 | dr
46 | Dr
47 | med
48 | prof
49 | Prof
50 | inž
51 | Inž
52 | ist.loc
53 | Ist.loc
54 | kor.loc
55 | Kor.loc
56 | v.i
57 | vietn
58 | Vietn
59 |
60 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
61 | a.l
62 | t.p
63 | pārb
64 | Pārb
65 | vec
66 | Vec
67 | inv
68 | Inv
69 | sk
70 | Sk
71 | spec
72 | Spec
73 | vienk
74 | Vienk
75 | virz
76 | Virz
77 | māksl
78 | Māksl
79 | mūz
80 | Mūz
81 | akad
82 | Akad
83 | soc
84 | Soc
85 | galv
86 | Galv
87 | vad
88 | Vad
89 | sertif
90 | Sertif
91 | folkl
92 | Folkl
93 | hum
94 | Hum
95 |
96 | #Numbers only. These are treated as non-breaking prefixes only when followed by a numeric sequence
97 | # add NUMERIC_ONLY after the word for this function
98 | #This case is mostly for the english "No." which can either be a sentence of its own, or
99 | #if followed by a number, a non-breaking prefix
100 | Nr #NUMERIC_ONLY#
101 |
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.nl:
--------------------------------------------------------------------------------
1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3 | #Sources: http://nl.wikipedia.org/wiki/Lijst_van_afkortingen
4 | # http://nl.wikipedia.org/wiki/Aanspreekvorm
5 | # http://nl.wikipedia.org/wiki/Titulatuur_in_het_Nederlands_hoger_onderwijs
6 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
7 | #usually upper case letters are initials in a name
8 | A
9 | B
10 | C
11 | D
12 | E
13 | F
14 | G
15 | H
16 | I
17 | J
18 | K
19 | L
20 | M
21 | N
22 | O
23 | P
24 | Q
25 | R
26 | S
27 | T
28 | U
29 | V
30 | W
31 | X
32 | Y
33 | Z
34 |
35 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
36 | bacc
37 | bc
38 | bgen
39 | c.i
40 | dhr
41 | dr
42 | dr.h.c
43 | drs
44 | drs
45 | ds
46 | eint
47 | fa
48 | Fa
49 | fam
50 | gen
51 | genm
52 | ing
53 | ir
54 | jhr
55 | jkvr
56 | jr
57 | kand
58 | kol
59 | lgen
60 | lkol
61 | Lt
62 | maj
63 | Mej
64 | mevr
65 | Mme
66 | mr
67 | mr
68 | Mw
69 | o.b.s
70 | plv
71 | prof
72 | ritm
73 | tint
74 | Vz
75 | Z.D
76 | Z.D.H
77 | Z.E
78 | Z.Em
79 | Z.H
80 | Z.K.H
81 | Z.K.M
82 | Z.M
83 | z.v
84 |
85 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
86 | #we seem to have a lot of these in Dutch, e.g. i.p.v - in plaats van (instead of) - which never ends a sentence
87 | a.g.v
88 | bijv
89 | bijz
90 | bv
91 | d.w.z
92 | e.c
93 | e.g
94 | e.k
95 | ev
96 | i.p.v
97 | i.s.m
98 | i.t.t
99 | i.v.m
100 | m.a.w
101 | m.b.t
102 | m.b.v
103 | m.h.o
104 | m.i
105 | m.i.v
106 | v.w.t
107 |
108 | #Numbers only. These are treated as non-breaking prefixes only when followed by a numeric sequence
109 | # add NUMERIC_ONLY after the word for this function
110 | #This case is mostly for the english "No." which can either be a sentence of its own, or
111 | #if followed by a number, a non-breaking prefix
112 | Nr #NUMERIC_ONLY#
113 | Nrs
114 | nrs
115 | nr #NUMERIC_ONLY#
116 |
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.pl:
--------------------------------------------------------------------------------
1 | adw
2 | afr
3 | akad
4 | al
5 | Al
6 | am
7 | amer
8 | arch
9 | art
10 | Art
11 | artyst
12 | astr
13 | austr
14 | bałt
15 | bdb
16 | bł
17 | bm
18 | br
19 | bryg
20 | bryt
21 | centr
22 | ces
23 | chem
24 | chiń
25 | chir
26 | c.k
27 | c.o
28 | cyg
29 | cyw
30 | cyt
31 | czes
32 | czw
33 | cd
34 | Cd
35 | czyt
36 | ćw
37 | ćwicz
38 | daw
39 | dcn
40 | dekl
41 | demokr
42 | det
43 | diec
44 | dł
45 | dn
46 | dot
47 | dol
48 | dop
49 | dost
50 | dosł
51 | h.c
52 | ds
53 | dst
54 | duszp
55 | dypl
56 | egz
57 | ekol
58 | ekon
59 | elektr
60 | em
61 | ew
62 | fab
63 | farm
64 | fot
65 | fr
66 | gat
67 | gastr
68 | geogr
69 | geol
70 | gimn
71 | głęb
72 | gm
73 | godz
74 | górn
75 | gosp
76 | gr
77 | gram
78 | hist
79 | hiszp
80 | hr
81 | Hr
82 | hot
83 | id
84 | in
85 | im
86 | iron
87 | jn
88 | kard
89 | kat
90 | katol
91 | k.k
92 | kk
93 | kol
94 | kl
95 | k.p.a
96 | kpc
97 | k.p.c
98 | kpt
99 | kr
100 | k.r
101 | krak
102 | k.r.o
103 | kryt
104 | kult
105 | laic
106 | łac
107 | niem
108 | woj
109 | nb
110 | np
111 | Nb
112 | Np
113 | pol
114 | pow
115 | m.in
116 | pt
117 | ps
118 | Pt
119 | Ps
120 | cdn
121 | jw
122 | ryc
123 | rys
124 | Ryc
125 | Rys
126 | tj
127 | tzw
128 | Tzw
129 | tzn
130 | zob
131 | ang
132 | ub
133 | ul
134 | pw
135 | pn
136 | pl
137 | al
138 | k
139 | n
140 | nr #NUMERIC_ONLY#
141 | Nr #NUMERIC_ONLY#
142 | ww
143 | wł
144 | ur
145 | zm
146 | żyd
147 | żarg
148 | żyw
149 | wył
150 | bp
151 | bp
152 | wyst
153 | tow
154 | Tow
155 | o
156 | sp
157 | Sp
158 | st
159 | spółdz
160 | Spółdz
161 | społ
162 | spółgł
163 | stoł
164 | stow
165 | Stoł
166 | Stow
167 | zn
168 | zew
169 | zewn
170 | zdr
171 | zazw
172 | zast
173 | zaw
174 | zał
175 | zal
176 | zam
177 | zak
178 | zakł
179 | zagr
180 | zach
181 | adw
182 | Adw
183 | lek
184 | Lek
185 | med
186 | mec
187 | Mec
188 | doc
189 | Doc
190 | dyw
191 | dyr
192 | Dyw
193 | Dyr
194 | inż
195 | Inż
196 | mgr
197 | Mgr
198 | dh
199 | dr
200 | Dh
201 | Dr
202 | p
203 | P
204 | red
205 | Red
206 | prof
207 | prok
208 | Prof
209 | Prok
210 | hab
211 | płk
212 | Płk
213 | nadkom
214 | Nadkom
215 | podkom
216 | Podkom
217 | ks
218 | Ks
219 | gen
220 | Gen
221 | por
222 | Por
223 | reż
224 | Reż
225 | przyp
226 | Przyp
227 | śp
228 | św
229 | śW
230 | Śp
231 | Św
232 | ŚW
233 | szer
234 | Szer
235 | pkt #NUMERIC_ONLY#
236 | str #NUMERIC_ONLY#
237 | tab #NUMERIC_ONLY#
238 | Tab #NUMERIC_ONLY#
239 | tel
240 | ust #NUMERIC_ONLY#
241 | par #NUMERIC_ONLY#
242 | poz
243 | pok
244 | oo
245 | oO
246 | Oo
247 | OO
248 | r #NUMERIC_ONLY#
249 | l #NUMERIC_ONLY#
250 | s #NUMERIC_ONLY#
251 | najśw
252 | Najśw
253 | A
254 | B
255 | C
256 | D
257 | E
258 | F
259 | G
260 | H
261 | I
262 | J
263 | K
264 | L
265 | M
266 | N
267 | O
268 | P
269 | Q
270 | R
271 | S
272 | T
273 | U
274 | V
275 | W
276 | X
277 | Y
278 | Z
279 | Ś
280 | Ć
281 | Ż
282 | Ź
283 | Dz
284 |
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.pt:
--------------------------------------------------------------------------------
1 | #File adapted for PT by H. Leal Fontes from the EN & DE versions published with moses-2009-04-13. Last update: 10.11.2009.
2 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
3 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
4 |
5 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
6 | #usually upper case letters are initials in a name
7 | A
8 | B
9 | C
10 | D
11 | E
12 | F
13 | G
14 | H
15 | I
16 | J
17 | K
18 | L
19 | M
20 | N
21 | O
22 | P
23 | Q
24 | R
25 | S
26 | T
27 | U
28 | V
29 | W
30 | X
31 | Y
32 | Z
33 | a
34 | b
35 | c
36 | d
37 | e
38 | f
39 | g
40 | h
41 | i
42 | j
43 | k
44 | l
45 | m
46 | n
47 | o
48 | p
49 | q
50 | r
51 | s
52 | t
53 | u
54 | v
55 | w
56 | x
57 | y
58 | z
59 |
60 |
61 | #Roman Numerals. A dot after one of these is not a sentence break in Portuguese.
62 | I
63 | II
64 | III
65 | IV
66 | V
67 | VI
68 | VII
69 | VIII
70 | IX
71 | X
72 | XI
73 | XII
74 | XIII
75 | XIV
76 | XV
77 | XVI
78 | XVII
79 | XVIII
80 | XIX
81 | XX
82 | i
83 | ii
84 | iii
85 | iv
86 | v
87 | vi
88 | vii
89 | viii
90 | ix
91 | x
92 | xi
93 | xii
94 | xiii
95 | xiv
96 | xv
97 | xvi
98 | xvii
99 | xviii
100 | xix
101 | xx
102 |
103 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
104 | Adj
105 | Adm
106 | Adv
107 | Art
108 | Ca
109 | Capt
110 | Cmdr
111 | Col
112 | Comdr
113 | Con
114 | Corp
115 | Cpl
116 | DR
117 | DRA
118 | Dr
119 | Dra
120 | Dras
121 | Drs
122 | Eng
123 | Enga
124 | Engas
125 | Engos
126 | Ex
127 | Exo
128 | Exmo
129 | Fig
130 | Gen
131 | Hosp
132 | Insp
133 | Lda
134 | MM
135 | MR
136 | MRS
137 | MS
138 | Maj
139 | Mrs
140 | Ms
141 | Msgr
142 | Op
143 | Ord
144 | Pfc
145 | Ph
146 | Prof
147 | Pvt
148 | Rep
149 | Reps
150 | Res
151 | Rev
152 | Rt
153 | Sen
154 | Sens
155 | Sfc
156 | Sgt
157 | Sr
158 | Sra
159 | Sras
160 | Srs
161 | Sto
162 | Supt
163 | Surg
164 | adj
165 | adm
166 | adv
167 | art
168 | cit
169 | col
170 | con
171 | corp
172 | cpl
173 | dr
174 | dra
175 | dras
176 | drs
177 | eng
178 | enga
179 | engas
180 | engos
181 | ex
182 | exo
183 | exmo
184 | fig
185 | op
186 | prof
187 | sr
188 | sra
189 | sras
190 | srs
191 | sto
192 |
193 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
194 | v
195 | vs
196 | i.e
197 | rev
198 | e.g
199 |
200 | #Numbers only. These are treated as non-breaking prefixes only when followed by a numeric sequence
201 | # add NUMERIC_ONLY after the word for this function
202 | #This case is mostly for the english "No." which can either be a sentence of its own, or
203 | #if followed by a number, a non-breaking prefix
204 | No #NUMERIC_ONLY#
205 | Nos
206 | Art #NUMERIC_ONLY#
207 | Nr
208 | p #NUMERIC_ONLY#
209 | pp #NUMERIC_ONLY#
210 |
211 |
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.ro:
--------------------------------------------------------------------------------
1 | A
2 | B
3 | C
4 | D
5 | E
6 | F
7 | G
8 | H
9 | I
10 | J
11 | K
12 | L
13 | M
14 | N
15 | O
16 | P
17 | Q
18 | R
19 | S
20 | T
21 | U
22 | V
23 | W
24 | X
25 | Y
26 | Z
27 | dpdv
28 | etc
29 | șamd
30 | M.Ap.N
31 | dl
32 | Dl
33 | d-na
34 | D-na
35 | dvs
36 | Dvs
37 | pt
38 | Pt
39 |
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.ru:
--------------------------------------------------------------------------------
1 | # added Cyrillic uppercase letters [А-Я]
2 | # removed 000D carriage return (this is not removed by chomp in tokenizer.perl, and prevents recognition of the prefixes)
3 | # edited by Kate Young (nspaceanalysis@earthlink.net) 21 May 2013
4 | А
5 | Б
6 | В
7 | Г
8 | Д
9 | Е
10 | Ж
11 | З
12 | И
13 | Й
14 | К
15 | Л
16 | М
17 | Н
18 | О
19 | П
20 | Р
21 | С
22 | Т
23 | У
24 | Ф
25 | Х
26 | Ц
27 | Ч
28 | Ш
29 | Щ
30 | Ъ
31 | Ы
32 | Ь
33 | Э
34 | Ю
35 | Я
36 | A
37 | B
38 | C
39 | D
40 | E
41 | F
42 | G
43 | H
44 | I
45 | J
46 | K
47 | L
48 | M
49 | N
50 | O
51 | P
52 | Q
53 | R
54 | S
55 | T
56 | U
57 | V
58 | W
59 | X
60 | Y
61 | Z
62 | 0гг
63 | 1гг
64 | 2гг
65 | 3гг
66 | 4гг
67 | 5гг
68 | 6гг
69 | 7гг
70 | 8гг
71 | 9гг
72 | 0г
73 | 1г
74 | 2г
75 | 3г
76 | 4г
77 | 5г
78 | 6г
79 | 7г
80 | 8г
81 | 9г
82 | Xвв
83 | Vвв
84 | Iвв
85 | Lвв
86 | Mвв
87 | Cвв
88 | Xв
89 | Vв
90 | Iв
91 | Lв
92 | Mв
93 | Cв
94 | 0м
95 | 1м
96 | 2м
97 | 3м
98 | 4м
99 | 5м
100 | 6м
101 | 7м
102 | 8м
103 | 9м
104 | 0мм
105 | 1мм
106 | 2мм
107 | 3мм
108 | 4мм
109 | 5мм
110 | 6мм
111 | 7мм
112 | 8мм
113 | 9мм
114 | 0см
115 | 1см
116 | 2см
117 | 3см
118 | 4см
119 | 5см
120 | 6см
121 | 7см
122 | 8см
123 | 9см
124 | 0дм
125 | 1дм
126 | 2дм
127 | 3дм
128 | 4дм
129 | 5дм
130 | 6дм
131 | 7дм
132 | 8дм
133 | 9дм
134 | 0л
135 | 1л
136 | 2л
137 | 3л
138 | 4л
139 | 5л
140 | 6л
141 | 7л
142 | 8л
143 | 9л
144 | 0км
145 | 1км
146 | 2км
147 | 3км
148 | 4км
149 | 5км
150 | 6км
151 | 7км
152 | 8км
153 | 9км
154 | 0га
155 | 1га
156 | 2га
157 | 3га
158 | 4га
159 | 5га
160 | 6га
161 | 7га
162 | 8га
163 | 9га
164 | 0кг
165 | 1кг
166 | 2кг
167 | 3кг
168 | 4кг
169 | 5кг
170 | 6кг
171 | 7кг
172 | 8кг
173 | 9кг
174 | 0т
175 | 1т
176 | 2т
177 | 3т
178 | 4т
179 | 5т
180 | 6т
181 | 7т
182 | 8т
183 | 9т
184 | 0г
185 | 1г
186 | 2г
187 | 3г
188 | 4г
189 | 5г
190 | 6г
191 | 7г
192 | 8г
193 | 9г
194 | 0мг
195 | 1мг
196 | 2мг
197 | 3мг
198 | 4мг
199 | 5мг
200 | 6мг
201 | 7мг
202 | 8мг
203 | 9мг
204 | бульв
205 | в
206 | вв
207 | г
208 | га
209 | гг
210 | гл
211 | гос
212 | д
213 | дм
214 | доп
215 | др
216 | е
217 | ед
218 | ед
219 | зам
220 | и
221 | инд
222 | исп
223 | Исп
224 | к
225 | кап
226 | кг
227 | кв
228 | кл
229 | км
230 | кол
231 | комн
232 | коп
233 | куб
234 | л
235 | лиц
236 | лл
237 | м
238 | макс
239 | мг
240 | мин
241 | мл
242 | млн
243 | млрд
244 | мм
245 | н
246 | наб
247 | нач
248 | неуд
249 | ном
250 | о
251 | обл
252 | обр
253 | общ
254 | ок
255 | ост
256 | отл
257 | п
258 | пер
259 | перераб
260 | пл
261 | пос
262 | пр
263 | просп
264 | проф
265 | р
266 | ред
267 | руб
268 | с
269 | сб
270 | св
271 | см
272 | соч
273 | ср
274 | ст
275 | стр
276 | т
277 | тел
278 | Тел
279 | тех
280 | тт
281 | туп
282 | тыс
283 | уд
284 | ул
285 | уч
286 | физ
287 | х
288 | хор
289 | ч
290 | чел
291 | шт
292 | экз
293 | э
294 |
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.sk:
--------------------------------------------------------------------------------
1 | Bc
2 | Mgr
3 | RNDr
4 | PharmDr
5 | PhDr
6 | JUDr
7 | PaedDr
8 | ThDr
9 | Ing
10 | MUDr
11 | MDDr
12 | MVDr
13 | Dr
14 | ThLic
15 | PhD
16 | ArtD
17 | ThDr
18 | Dr
19 | DrSc
20 | CSs
21 | prof
22 | obr
23 | Obr
24 | Č
25 | č
26 | absol
27 | adj
28 | admin
29 | adr
30 | Adr
31 | adv
32 | advok
33 | afr
34 | ak
35 | akad
36 | akc
37 | akuz
38 | et
39 | al
40 | alch
41 | amer
42 | anat
43 | angl
44 | Angl
45 | anglosas
46 | anorg
47 | ap
48 | apod
49 | arch
50 | archeol
51 | archit
52 | arg
53 | art
54 | astr
55 | astrol
56 | astron
57 | atp
58 | atď
59 | austr
60 | Austr
61 | aut
62 | belg
63 | Belg
64 | bibl
65 | Bibl
66 | biol
67 | bot
68 | bud
69 | bás
70 | býv
71 | cest
72 | chem
73 | cirk
74 | csl
75 | čs
76 | Čs
77 | dat
78 | dep
79 | det
80 | dial
81 | diaľ
82 | dipl
83 | distrib
84 | dokl
85 | dosl
86 | dopr
87 | dram
88 | duš
89 | dv
90 | dvojčl
91 | dór
92 | ekol
93 | ekon
94 | el
95 | elektr
96 | elektrotech
97 | energet
98 | epic
99 | est
100 | etc
101 | etonym
102 | eufem
103 | európ
104 | Európ
105 | ev
106 | evid
107 | expr
108 | fa
109 | fam
110 | farm
111 | fem
112 | feud
113 | fil
114 | filat
115 | filoz
116 | fi
117 | fon
118 | form
119 | fot
120 | fr
121 | Fr
122 | franc
123 | Franc
124 | fraz
125 | fut
126 | fyz
127 | fyziol
128 | garb
129 | gen
130 | genet
131 | genpor
132 | geod
133 | geogr
134 | geol
135 | geom
136 | germ
137 | gr
138 | Gr
139 | gréc
140 | Gréc
141 | gréckokat
142 | hebr
143 | herald
144 | hist
145 | hlav
146 | hosp
147 | hromad
148 | hud
149 | hypok
150 | ident
151 | i.e
152 | ident
153 | imp
154 | impf
155 | indoeur
156 | inf
157 | inform
158 | instr
159 | int
160 | interj
161 | inšt
162 | inštr
163 | iron
164 | jap
165 | Jap
166 | jaz
167 | jedn
168 | juhoamer
169 | juhových
170 | juhozáp
171 | juž
172 | kanad
173 | Kanad
174 | kanc
175 | kapit
176 | kpt
177 | kart
178 | katastr
179 | knih
180 | kniž
181 | komp
182 | konj
183 | konkr
184 | kozmet
185 | krajč
186 | kresť
187 | kt
188 | kuch
189 | lat
190 | latinskoamer
191 | lek
192 | lex
193 | lingv
194 | lit
195 | litur
196 | log
197 | lok
198 | max
199 | Max
200 | maď
201 | Maď
202 | medzinár
203 | mest
204 | metr
205 | mil
206 | Mil
207 | min
208 | Min
209 | miner
210 | ml
211 | mld
212 | mn
213 | mod
214 | mytol
215 | napr
216 | nar
217 | Nar
218 | nasl
219 | nedok
220 | neg
221 | negat
222 | neklas
223 | nem
224 | Nem
225 | neodb
226 | neos
227 | neskl
228 | nesklon
229 | nespis
230 | nespráv
231 | neved
232 | než
233 | niekt
234 | niž
235 | nom
236 | náb
237 | nákl
238 | námor
239 | nár
240 | obch
241 | obj
242 | obv
243 | obyč
244 | obč
245 | občian
246 | odb
247 | odd
248 | ods
249 | ojed
250 | okr
251 | Okr
252 | opt
253 | opyt
254 | org
255 | os
256 | osob
257 | ot
258 | ovoc
259 | par
260 | part
261 | pejor
262 | pers
263 | pf
264 | Pf
265 | P.f
266 | p.f
267 | pl
268 | Plk
269 | pod
270 | podst
271 | pokl
272 | polit
273 | politol
274 | polygr
275 | pomn
276 | popl
277 | por
278 | porad
279 | porov
280 | posch
281 | potrav
282 | použ
283 | poz
284 | pozit
285 | poľ
286 | poľno
287 | poľnohosp
288 | poľov
289 | pošt
290 | pož
291 | prac
292 | predl
293 | pren
294 | prep
295 | preuk
296 | priezv
297 | Priezv
298 | privl
299 | prof
300 | práv
301 | príd
302 | príj
303 | prík
304 | príp
305 | prír
306 | prísl
307 | príslov
308 | príč
309 | psych
310 | publ
311 | pís
312 | písm
313 | pôv
314 | refl
315 | reg
316 | rep
317 | resp
318 | rozk
319 | rozlič
320 | rozpráv
321 | roč
322 | Roč
323 | ryb
324 | rádiotech
325 | rím
326 | samohl
327 | semest
328 | sev
329 | severoamer
330 | severových
331 | severozáp
332 | sg
333 | skr
334 | skup
335 | sl
336 | Sloven
337 | soc
338 | soch
339 | sociol
340 | sp
341 | spol
342 | Spol
343 | spoloč
344 | spoluhl
345 | správ
346 | spôs
347 | st
348 | star
349 | starogréc
350 | starorím
351 | s.r.o
352 | stol
353 | stor
354 | str
355 | stredoamer
356 | stredoškol
357 | subj
358 | subst
359 | superl
360 | sv
361 | sz
362 | súkr
363 | súp
364 | súvzť
365 | tal
366 | Tal
367 | tech
368 | tel
369 | Tel
370 | telef
371 | teles
372 | telev
373 | teol
374 | trans
375 | turist
376 | tuzem
377 | typogr
378 | tzn
379 | tzv
380 | ukaz
381 | ul
382 | Ul
383 | umel
384 | univ
385 | ust
386 | ved
387 | vedľ
388 | verb
389 | veter
390 | vin
391 | viď
392 | vl
393 | vod
394 | vodohosp
395 | pnl
396 | vulg
397 | vyj
398 | vys
399 | vysokoškol
400 | vzťaž
401 | vôb
402 | vých
403 | výd
404 | výrob
405 | výsk
406 | výsl
407 | výtv
408 | výtvar
409 | význ
410 | včel
411 | vš
412 | všeob
413 | zahr
414 | zar
415 | zariad
416 | zast
417 | zastar
418 | zastaráv
419 | zb
420 | zdravot
421 | združ
422 | zjemn
423 | zlat
424 | zn
425 | Zn
426 | zool
427 | zr
428 | zried
429 | zv
430 | záhr
431 | zák
432 | zákl
433 | zám
434 | záp
435 | západoeur
436 | zázn
437 | územ
438 | účt
439 | čast
440 | čes
441 | Čes
442 | čl
443 | čísl
444 | živ
445 | pr
446 | fak
447 | Kr
448 | p.n.l
449 | A
450 | B
451 | C
452 | D
453 | E
454 | F
455 | G
456 | H
457 | I
458 | J
459 | K
460 | L
461 | M
462 | N
463 | O
464 | P
465 | Q
466 | R
467 | S
468 | T
469 | U
470 | V
471 | W
472 | X
473 | Y
474 | Z
475 |
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.sl:
--------------------------------------------------------------------------------
1 | dr
2 | Dr
3 | itd
4 | itn
5 | št #NUMERIC_ONLY#
6 | Št #NUMERIC_ONLY#
7 | d
8 | jan
9 | Jan
10 | feb
11 | Feb
12 | mar
13 | Mar
14 | apr
15 | Apr
16 | jun
17 | Jun
18 | jul
19 | Jul
20 | avg
21 | Avg
22 | sept
23 | Sept
24 | sep
25 | Sep
26 | okt
27 | Okt
28 | nov
29 | Nov
30 | dec
31 | Dec
32 | tj
33 | Tj
34 | npr
35 | Npr
36 | sl
37 | Sl
38 | op
39 | Op
40 | gl
41 | Gl
42 | oz
43 | Oz
44 | prev
45 | dipl
46 | ing
47 | prim
48 | Prim
49 | cf
50 | Cf
51 | gl
52 | Gl
53 | A
54 | B
55 | C
56 | D
57 | E
58 | F
59 | G
60 | H
61 | I
62 | J
63 | K
64 | L
65 | M
66 | N
67 | O
68 | P
69 | Q
70 | R
71 | S
72 | T
73 | U
74 | V
75 | W
76 | X
77 | Y
78 | Z
79 |
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.sv:
--------------------------------------------------------------------------------
1 | #single upper case letters are usually initials
2 | A
3 | B
4 | C
5 | D
6 | E
7 | F
8 | G
9 | H
10 | I
11 | J
12 | K
13 | L
14 | M
15 | N
16 | O
17 | P
18 | Q
19 | R
20 | S
21 | T
22 | U
23 | V
24 | W
25 | X
26 | Y
27 | Z
28 | #misc abbreviations
29 | AB
30 | G
31 | VG
32 | dvs
33 | etc
34 | from
35 | iaf
36 | jfr
37 | kl
38 | kr
39 | mao
40 | mfl
41 | mm
42 | osv
43 | pga
44 | tex
45 | tom
46 | vs
47 |
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.ta:
--------------------------------------------------------------------------------
1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3 |
4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
5 | #usually upper case letters are initials in a name
6 | அ
7 | ஆ
8 | இ
9 | ஈ
10 | உ
11 | ஊ
12 | எ
13 | ஏ
14 | ஐ
15 | ஒ
16 | ஓ
17 | ஔ
18 | ஃ
19 | க
20 | கா
21 | கி
22 | கீ
23 | கு
24 | கூ
25 | கெ
26 | கே
27 | கை
28 | கொ
29 | கோ
30 | கௌ
31 | க்
32 | ச
33 | சா
34 | சி
35 | சீ
36 | சு
37 | சூ
38 | செ
39 | சே
40 | சை
41 | சொ
42 | சோ
43 | சௌ
44 | ச்
45 | ட
46 | டா
47 | டி
48 | டீ
49 | டு
50 | டூ
51 | டெ
52 | டே
53 | டை
54 | டொ
55 | டோ
56 | டௌ
57 | ட்
58 | த
59 | தா
60 | தி
61 | தீ
62 | து
63 | தூ
64 | தெ
65 | தே
66 | தை
67 | தொ
68 | தோ
69 | தௌ
70 | த்
71 | ப
72 | பா
73 | பி
74 | பீ
75 | பு
76 | பூ
77 | பெ
78 | பே
79 | பை
80 | பொ
81 | போ
82 | பௌ
83 | ப்
84 | ற
85 | றா
86 | றி
87 | றீ
88 | று
89 | றூ
90 | றெ
91 | றே
92 | றை
93 | றொ
94 | றோ
95 | றௌ
96 | ற்
97 | ய
98 | யா
99 | யி
100 | யீ
101 | யு
102 | யூ
103 | யெ
104 | யே
105 | யை
106 | யொ
107 | யோ
108 | யௌ
109 | ய்
110 | ர
111 | ரா
112 | ரி
113 | ரீ
114 | ரு
115 | ரூ
116 | ரெ
117 | ரே
118 | ரை
119 | ரொ
120 | ரோ
121 | ரௌ
122 | ர்
123 | ல
124 | லா
125 | லி
126 | லீ
127 | லு
128 | லூ
129 | லெ
130 | லே
131 | லை
132 | லொ
133 | லோ
134 | லௌ
135 | ல்
136 | வ
137 | வா
138 | வி
139 | வீ
140 | வு
141 | வூ
142 | வெ
143 | வே
144 | வை
145 | வொ
146 | வோ
147 | வௌ
148 | வ்
149 | ள
150 | ளா
151 | ளி
152 | ளீ
153 | ளு
154 | ளூ
155 | ளெ
156 | ளே
157 | ளை
158 | ளொ
159 | ளோ
160 | ளௌ
161 | ள்
162 | ழ
163 | ழா
164 | ழி
165 | ழீ
166 | ழு
167 | ழூ
168 | ழெ
169 | ழே
170 | ழை
171 | ழொ
172 | ழோ
173 | ழௌ
174 | ழ்
175 | ங
176 | ஙா
177 | ஙி
178 | ஙீ
179 | ஙு
180 | ஙூ
181 | ஙெ
182 | ஙே
183 | ஙை
184 | ஙொ
185 | ஙோ
186 | ஙௌ
187 | ங்
188 | ஞ
189 | ஞா
190 | ஞி
191 | ஞீ
192 | ஞு
193 | ஞூ
194 | ஞெ
195 | ஞே
196 | ஞை
197 | ஞொ
198 | ஞோ
199 | ஞௌ
200 | ஞ்
201 | ண
202 | ணா
203 | ணி
204 | ணீ
205 | ணு
206 | ணூ
207 | ணெ
208 | ணே
209 | ணை
210 | ணொ
211 | ணோ
212 | ணௌ
213 | ண்
214 | ந
215 | நா
216 | நி
217 | நீ
218 | நு
219 | நூ
220 | நெ
221 | நே
222 | நை
223 | நொ
224 | நோ
225 | நௌ
226 | ந்
227 | ம
228 | மா
229 | மி
230 | மீ
231 | மு
232 | மூ
233 | மெ
234 | மே
235 | மை
236 | மொ
237 | மோ
238 | மௌ
239 | ம்
240 | ன
241 | னா
242 | னி
243 | னீ
244 | னு
245 | னூ
246 | னெ
247 | னே
248 | னை
249 | னொ
250 | னோ
251 | னௌ
252 | ன்
253 |
254 |
255 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
256 | திரு
257 | திருமதி
258 | வண
259 | கௌரவ
260 |
261 |
262 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
263 | உ.ம்
264 | #கா.ம்
265 | #எ.ம்
266 |
267 |
268 | #Numbers only. These should only be treated as non-breaking prefixes when followed by a numeric sequence
269 | # add NUMERIC_ONLY after the word for this function
270 | #This case is mostly for the english "No." which can either be a sentence of its own, or
271 | #if followed by a number, a non-breaking prefix
272 | No #NUMERIC_ONLY#
273 | Nos
274 | Art #NUMERIC_ONLY#
275 | Nr
276 | pp #NUMERIC_ONLY#
277 |
--------------------------------------------------------------------------------
/data/preprocess.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# This script preprocesses bitext with Byte Pair Encoding for NMT.
# Executes the following steps:
#     1. Tokenize source and target side of bitext
#     2. Learn BPE-codes for both source and target side
#     3. Encode source and target side using the codes learned
#     4. Shuffle bitext for SGD
#     5. Build source and target dictionaries
#
# Usage: preprocess.sh src trg path_to_data path_to_subword
# It calls ./merge.sh, so it has to be started from the directory that
# contains merge.sh. The learned BPE code files (<lang>.bpe) are written to
# the current directory; every corpus file lives under path_to_data.

if [ "$#" -ne 4 ]; then
    echo ""
    echo "Usage: $0 src trg path_to_data path_to_subword"
    echo ""
    exit 1
fi

# number of merge ops (codes) for bpe
SRC_CODE_SIZE=20000
TRG_CODE_SIZE=20000

# source language (example: fr)
S=$1
# target language (example: en)
T=$2

# path to dl4mt/data
P1=$3

# path to subword NMT scripts (can be downloaded from https://github.com/rsennrich/subword-nmt)
P2=$4

# common prefix of the merged training corpus, e.g. <data>/all_fr-en
CORPUS="${P1}/all_${S}-${T}"


# merge all parallel corpora
./merge.sh "${S}" "${T}" "${P1}"

# tokenize training and validation data
perl "${P1}/tokenizer.perl" -threads 5 -l "${S}" < "${CORPUS}.${S}" > "${CORPUS}.${S}.tok"
perl "${P1}/tokenizer.perl" -threads 5 -l "${T}" < "${CORPUS}.${T}" > "${CORPUS}.${T}.tok"
perl "${P1}/tokenizer.perl" -threads 5 -l "${S}" < "${P1}/test2011/newstest2011.${S}" > "${P1}/newstest2011.${S}.tok"
perl "${P1}/tokenizer.perl" -threads 5 -l "${T}" < "${P1}/test2011/newstest2011.${T}" > "${P1}/newstest2011.${T}.tok"

# BPE: learn the codes once; an existing code file is reused.
# The tokenized corpus is read from the data directory, which is where the
# tokenizer step above wrote it.
if [ ! -f "${S}.bpe" ]; then
    python "${P2}/learn_bpe.py" -s "${SRC_CODE_SIZE}" < "${CORPUS}.${S}.tok" > "${S}.bpe"
fi
if [ ! -f "${T}.bpe" ]; then
    python "${P2}/learn_bpe.py" -s "${TRG_CODE_SIZE}" < "${CORPUS}.${T}.tok" > "${T}.bpe"
fi

# utility function to encode a file with bpe
#   $1: BPE code file, $2: tokenized input, $3: encoded output
# An existing output file is never overwritten.
encode () {
    if [ ! -f "$3" ]; then
        python "${P2}/apply_bpe.py" -c "$1" < "$2" > "$3"
    else
        echo "$3 exists, pass"
    fi
}

# apply bpe to training data
encode "${S}.bpe" "${CORPUS}.${S}.tok" "${CORPUS}.${S}.tok.bpe"
encode "${T}.bpe" "${CORPUS}.${T}.tok" "${CORPUS}.${T}.tok.bpe"
encode "${S}.bpe" "${P1}/newstest2011.${S}.tok" "${P1}/newstest2011.${S}.tok.bpe"
encode "${T}.bpe" "${P1}/newstest2011.${T}.tok" "${P1}/newstest2011.${T}.tok.bpe"

# shuffle (reads the encoded files from the data directory, where encode put them)
python "${P1}/shuffle.py" "${CORPUS}.${S}.tok.bpe" "${CORPUS}.${T}.tok.bpe"

# build dictionary
python "${P1}/build_dictionary.py" "${CORPUS}.${S}.tok.bpe"
python "${P1}/build_dictionary.py" "${CORPUS}.${T}.tok.bpe"
71 |
72 |
--------------------------------------------------------------------------------
/data/scan_example.py:
--------------------------------------------------------------------------------
1 | import numpy
2 | import theano
3 |
4 | from theano import tensor
5 |
6 |
# some numbers
# n_steps:    number of scan iterations; first axis of the example input
# n_samples:  second axis of the example input and rows of the initial state
# dim:        width of the recurrent state (columns of W_x, both axes of W_h)
# input_dim:  last axis of the example input (rows of W_x)
# output_dim: width of the per-step output (columns of W_o)
n_steps = 10
n_samples = 5
dim = 10
input_dim = 20
output_dim = 2
13 |
14 |
15 | # one step function that will be used by scan
def oneStep(x_t, h_tm1, W_x, W_h, W_o):
    """Compute one recurrence step; this is the ``fn`` handed to scan.

    The new state is ``tanh(x_t . W_x + h_tm1 . W_h)`` and the step output is
    that state projected through ``W_o``.

    Returns the pair ``(h_t, o_t)``.
    """
    pre_activation = tensor.dot(x_t, W_x) + tensor.dot(h_tm1, W_h)
    h_t = tensor.tanh(pre_activation)
    return h_t, tensor.dot(h_t, W_o)
23 |
# spawn theano tensor variable, our symbolic input
# a 3D tensor; the concrete array fed below has shape
# (n_steps, n_samples, input_dim)
x = tensor.tensor3(dtype='float32')

# initial state of our rnn: an (n_samples, dim) block of zeros
init_state = tensor.alloc(0., n_samples, dim)

# create parameters that we will use,
# note that, parameters are theano shared variables

# parameters for input to hidden states
W_x_ = numpy.random.randn(input_dim, dim).astype('float32')
W_x = theano.shared(W_x_)

# parameters for hidden state transition
W_h_ = numpy.random.randn(dim, dim).astype('float32')
W_h = theano.shared(W_h_)

# parameters from hidden state to output
W_o_ = numpy.random.randn(dim, output_dim).astype('float32')
W_o = theano.shared(W_o_)

# scan function
# outputs_info pairs up with oneStep's two return values: the first (h_t) is
# seeded with init_state and fed back as h_tm1, the second (o_t) has no
# initial value (None).
([h_vals, o_vals], updates) = theano.scan(
    fn=oneStep,
    sequences=[x],
    outputs_info=[init_state, None],
    non_sequences=[W_x, W_h, W_o],
    n_steps=n_steps,
    strict=True)

# let us now compile a function to get the output
f = theano.function([x], [h_vals, o_vals])

# now we will call the compiled function with actual input
actual_input = numpy.random.randn(
    n_steps, n_samples, input_dim).astype('float32')
h_vals_, o_vals_ = f(actual_input)

# print the shapes
# %-formatting with a parenthesised single argument prints the same text on
# Python 2 and Python 3 (the bare `print 'x', y` statement is Python 2 only).
print('shape of input : %s' % (actual_input.shape,))
print('shape of h_vals: %s' % (h_vals_.shape,))
print('shape of o_vals: %s' % (o_vals_.shape,))
67 |
--------------------------------------------------------------------------------
/data/setup_cluster_env.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# This script sets up development and data environments for
# fionn cluster, copy under your home directory and run.
#
# It is safe to run more than once: repositories that are already cloned are
# left alone instead of making `git clone` fail on a non-empty directory.

# this file is for the dependencies
LOCAL_INSTALL_FILE=/ichec/work/dl4mt_data/local_install.tgz

# code directory for cloned repositories
CODE_DIR=${HOME}/codes/dl4mt-material

# code repository
CODE_CENTRAL=https://github.com/kyunghyuncho/dl4mt-material

# reference files directory
REF_DATA_DIR=/ichec/work/dl4mt_data/nec_files

# our input files will reside here
DATA_DIR=${HOME}/data

# our trained models will be saved here
MODELS_DIR=${HOME}/models

# theano repository
THEANO_GIT=https://github.com/Theano/Theano.git

# theano install dir
THEANO_DIR=${HOME}/repo/Theano

# move to home directory (the tarball below is extracted relative to it)
cd || exit 1

# copy dependency file to your local and extract
echo "Copying and extracting dependency file"
rsync --bwlimit=20000 -Pavz "${LOCAL_INSTALL_FILE}" "${HOME}"
tar zxvf "${HOME}/local_install.tgz"

# clone the repository from github into code directory
echo "Cloning lab repository"
if [ ! -d "${CODE_DIR}/.git" ]; then
    mkdir -p "${CODE_DIR}"
    git clone "${CODE_CENTRAL}" "${CODE_DIR}"
fi

# copy corpora, dictionaries etc for training and dev
# (the globs stay outside the quotes so the shell still expands them)
echo "Copying data"
mkdir -p "${DATA_DIR}"
rsync --bwlimit=20000 -Pavz "${REF_DATA_DIR}"/all.* "${DATA_DIR}"
rsync --bwlimit=20000 -Pavz "${REF_DATA_DIR}"/news* "${DATA_DIR}"

# create model output directory if it does not exist
mkdir -p "${MODELS_DIR}"

# clone and install Theano
echo "Cloning/installing Theano"
if [ ! -d "${THEANO_DIR}/.git" ]; then
    mkdir -p "${THEANO_DIR}"
    git clone "${THEANO_GIT}" "${THEANO_DIR}"
fi
# Stop here if the checkout is missing: running setup.py from whatever the
# current directory happens to be would install the wrong thing or nothing.
cd "${THEANO_DIR}" || exit 1
python setup.py install --user

# check if theano is working
# print(...) with a single string is valid on both Python 2 and Python 3, so a
# SyntaxError can no longer be mistaken for a missing Theano install.
python -c "import theano;print('theano available!')"
66 |
67 |
--------------------------------------------------------------------------------
/data/setup_local_env.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# This script sets up development and data environments for
# a local machine, copy under your home directory and run.
# Note that, Theano is NOT installed by this script.
# To use Byte Pair Encoding, simply pass -b argument.

BPE=false

# The leading ':' puts getopts in silent mode, so any unknown option lands in
# the catch-all branch, which prints the usage and aborts.
while getopts ':b' flag; do
  case "${flag}" in
    b) BPE=true
       echo "Using Byte Pair Encoding" ;;
    *) echo "" >&2
       echo "Usage: $0 [-b]" >&2
       echo "" >&2
       exit 1 ;;
  esac
done


# code directory for cloned repositories
CODE_DIR=${HOME}/git/dl4mt-tutorial

# code repository
CODE_CENTRAL=https://github.com/kyunghyuncho/dl4mt-tutorial

# our input files will reside here
DATA_DIR=${CODE_DIR}/data

# our trained models will be saved here
MODELS_DIR=${HOME}/models


# clone the repository from github into code directory
if [ ! -d "${CODE_DIR}" ]; then
    echo "Cloning central ..."
    mkdir -p "${CODE_DIR}"
    git clone "${CODE_CENTRAL}" "${CODE_DIR}"
fi

# download the europarl v7 and validation sets and extract
python "${CODE_DIR}/data/download_files.py" \
    -s='fr' -t='en' \
    --source-dev=newstest2011.fr \
    --target-dev=newstest2011.en \
    --outdir="${DATA_DIR}"

if [ "$BPE" = true ] ; then

    BPE_DIR=${HOME}/codes/subword-nmt
    BPE_CENTRAL=https://github.com/rsennrich/subword-nmt

    # clone subword-nmt repository
    if [ ! -d "${BPE_DIR}" ]; then
        echo "Cloning BPE central ..."
        mkdir -p "${BPE_DIR}"
        git clone "${BPE_CENTRAL}" "${BPE_DIR}"
    fi

    # follow the preprocessing pipeline for BPE
    # preprocess.sh ships in the data directory of the checkout and itself
    # calls ./merge.sh, so run it from there (in a subshell, to leave this
    # script's working directory untouched) instead of relying on the
    # directory this script was started from.
    ( cd "${DATA_DIR}" && ./preprocess.sh 'fr' 'en' "${DATA_DIR}" "${BPE_DIR}" )

else

    # tokenize corresponding files
    perl "${CODE_DIR}/data/tokenizer.perl" -l 'fr' < "${DATA_DIR}/test2011/newstest2011.fr" > "${DATA_DIR}/newstest2011.fr.tok"
    perl "${CODE_DIR}/data/tokenizer.perl" -l 'en' < "${DATA_DIR}/test2011/newstest2011.en" > "${DATA_DIR}/newstest2011.en.tok"
    perl "${CODE_DIR}/data/tokenizer.perl" -l 'fr' < "${DATA_DIR}/europarl-v7.fr-en.fr" > "${DATA_DIR}/europarl-v7.fr-en.fr.tok"
    perl "${CODE_DIR}/data/tokenizer.perl" -l 'en' < "${DATA_DIR}/europarl-v7.fr-en.en" > "${DATA_DIR}/europarl-v7.fr-en.en.tok"

    # extract dictionaries
    python "${CODE_DIR}/data/build_dictionary.py" "${DATA_DIR}/europarl-v7.fr-en.fr.tok"
    python "${CODE_DIR}/data/build_dictionary.py" "${DATA_DIR}/europarl-v7.fr-en.en.tok"

    # shuffle training data
    python "${CODE_DIR}/data/shuffle.py" "${DATA_DIR}/europarl-v7.fr-en.en.tok" "${DATA_DIR}/europarl-v7.fr-en.fr.tok"
fi

# create model output directory if it does not exist
mkdir -p "${MODELS_DIR}"

# check if theano is working
# print(...) with a single string is valid on both Python 2 and Python 3.
python -c "import theano;print('theano available!')"
87 |
--------------------------------------------------------------------------------
/data/shuffle.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import random
4 |
5 | from tempfile import mkstemp
6 | from subprocess import call
7 |
8 |
9 |
def main(files):
    """Shuffle several line-aligned files with one shared permutation.

    Line ``i`` of every input file is treated as one record, so parallel
    sentences stay aligned after shuffling. For each input path ``p`` the
    shuffled lines are written to ``p + '.shuf'``.

    The first file drives the iteration: the other files are read one line per
    line of the first file, and contribute an empty field once they run out.
    Every line is stripped of surrounding whitespace before it is written
    back, terminated by a single newline.

    files: list of input file paths; an empty list is a no-op.
    """
    if not files:
        return

    # Binary mode keeps the text byte-for-byte regardless of its encoding and
    # behaves the same on Python 2 and Python 3.
    sources = [open(path, 'rb') for path in files]
    try:
        # Records are kept as lists of fields rather than joined with a
        # '|||' separator, so a sentence containing '|||' cannot be split
        # into the wrong columns.
        records = []
        for first in sources[0]:
            records.append(
                [first.strip()] +
                [src.readline().strip() for src in sources[1:]])
    finally:
        for src in sources:
            src.close()

    random.shuffle(records)

    sinks = [open(path + '.shuf', 'wb') for path in files]
    try:
        for record in records:
            for sink, field in zip(sinks, record):
                sink.write(field + b'\n')
    finally:
        for sink in sinks:
            sink.close()
38 |
39 | if __name__ == '__main__':
40 | main(sys.argv[1:])
41 |
42 |
43 |
44 |
45 |
--------------------------------------------------------------------------------
/data/strip_sgml.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import re
3 |
4 |
def main(fin=None, fout=None):
    """Copy text lines from *fin* to *fout* with SGML/XML-style tags removed.

    Each line is stripped, every ``<...>`` span is deleted, and the result is
    stripped again. Lines that end up empty are dropped; the others are
    written followed by a newline.

    fin:  iterable of text lines; defaults to ``sys.stdin``.
    fout: object with a ``write`` method; defaults to ``sys.stdout``.
    """
    if fin is None:
        fin = sys.stdin
    if fout is None:
        fout = sys.stdout

    # Compiled once instead of being looked up for every line.
    tag = re.compile('<[^<]+>')

    for raw in fin:
        text = tag.sub("", raw.strip()).strip()
        if text:
            # write() instead of the Python-2-only `print >>fout` statement
            fout.write(text + "\n")
14 |
15 |
16 | if __name__ == "__main__":
17 | main()
18 |
19 |
--------------------------------------------------------------------------------
/data/tokenize_all.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Tokenize every corpus file under ./training and ./dev with tokenizer.perl.
# The language passed to the tokenizer is the last two characters of the file
# name (e.g. "corpus.en" -> en); output goes to "<file>.tok".

# Tokenize a single file, echoing the command first.
tokenize_file () {
    local F=$1
    echo "perl ./tokenizer.perl -l ${F:(-2)} < $F > $F.tok"
    perl ./tokenizer.perl -l "${F:(-2)}" < "$F" > "$F.tok"
}

# Globs are used instead of parsing `ls` output so that unusual file names
# survive; the -f test also skips the literal pattern when nothing matches.

# training data: skip pickled dictionaries and already-tokenized files
for F in ./training/*
do
    case "$F" in
        *pkl*|*tok*) continue ;;
    esac
    [ -f "$F" ] || continue
    tokenize_file "$F"
done

# dev data: only files with a two-character extension, minus tokenized ones
for F in ./dev/*.??
do
    case "$F" in
        *tok*) continue ;;
    esac
    [ -f "$F" ] || continue
    tokenize_file "$F"
done
14 |
--------------------------------------------------------------------------------
/data/tokenizer.perl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | #
3 | # This file is part of moses. Its use is licensed under the GNU Lesser General
4 | # Public License version 2.1 or, at your option, any later version.
5 |
6 | use warnings;
7 |
8 | # Sample Tokenizer
9 | ### Version 1.1
10 | # written by Pidong Wang, based on the code written by Josh Schroeder and Philipp Koehn
11 | # Version 1.1 updates:
12 | # (1) add multithreading option "-threads NUM_THREADS" (default is 1);
13 | # (2) add a timing option "-time" to calculate the average speed of this tokenizer;
14 | # (3) add an option "-lines NUM_SENTENCES_PER_THREAD" to set the number of lines for each thread (default is 2000), and this option controls the memory amount needed: the larger this number is, the larger memory is required (the higher tokenization speed);
15 | ### Version 1.0
16 | # $Id: tokenizer.perl 915 2009-08-10 08:15:49Z philipp $
17 | # written by Josh Schroeder, based on code by Philipp Koehn
18 |
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");

use warnings;
use FindBin qw($RealBin);
use strict;
use Time::HiRes;

# Thread support is optional: the module is only imported when it can be loaded.
if (eval {require Thread;1;}) {
    #module loaded
    Thread->import();
}

# directory holding the per-language nonbreaking prefix lists
my $mydir = "$RealBin/nonbreaking_prefixes";

my %NONBREAKING_PREFIX = ();
my @protected_patterns = ();
my $protected_patterns_file = "";
my $language = "en";
my $QUIET = 0;
my $HELP = 0;
my $AGGRESSIVE = 0;
my $SKIP_XML = 0;
my $TIMING = 0;
my $NUM_THREADS = 1;
my $NUM_SENTENCES_PER_THREAD = 2000;
my $PENN = 0;
my $NO_ESCAPING = 0;

# Command-line parsing. Each line is "pattern && (action, next)"; options that
# take a value consume the following argument with shift. Arguments matching
# none of the patterns are dropped silently.
while (@ARGV)
{
    $_ = shift;
    /^-b$/ && ($| = 1, next);
    /^-l$/ && ($language = shift, next);
    /^-q$/ && ($QUIET = 1, next);
    /^-h$/ && ($HELP = 1, next);
    /^-x$/ && ($SKIP_XML = 1, next);
    /^-a$/ && ($AGGRESSIVE = 1, next);
    /^-time$/ && ($TIMING = 1, next);
    # Option to add list of regexps to be protected
    /^-protected/ && ($protected_patterns_file = shift, next);
    /^-threads$/ && ($NUM_THREADS = int(shift), next);
    /^-lines$/ && ($NUM_SENTENCES_PER_THREAD = int(shift), next);
    /^-penn$/ && ($PENN = 1, next);
    /^-no-escape/ && ($NO_ESCAPING = 1, next);
}

# for time calculation
my $start_time;
if ($TIMING)
{
    $start_time = [ Time::HiRes::gettimeofday( ) ];
}

# print help message
if ($HELP)
{
    print "Usage ./tokenizer.perl (-l [en|de|...]) (-threads 4) < textfile > tokenizedfile\n";
    print "Options:\n";
    print "  -q     ... quiet.\n";
    print "  -a     ... aggressive hyphen splitting.\n";
    print "  -b     ... disable Perl buffering.\n";
    print "  -time  ... enable processing time calculation.\n";
    print "  -penn  ... use Penn treebank-like tokenization.\n";
    print "  -protected FILE  ... specify file with patters to be protected in tokenisation.\n";
    print "  -no-escape ... don't perform HTML escaping on apostrophy, quotes, etc.\n";
    exit;
}

if (!$QUIET)
{
    print STDERR "Tokenizer Version 1.1\n";
    print STDERR "Language: $language\n";
    print STDERR "Number of threads: $NUM_THREADS\n";
}

# load the language-specific non-breaking prefix info from files in the directory nonbreaking_prefixes
load_prefixes($language,\%NONBREAKING_PREFIX);

# an empty hash is false in boolean context
if (!%NONBREAKING_PREFIX)
{
    print STDERR "Warning: No known abbreviations for language '$language'\n";
}

# Load protected patterns: one regular expression per line of the file
if ($protected_patterns_file)
{
    open(PP,$protected_patterns_file) || die "Unable to open $protected_patterns_file";
    # read from the PP handle; a bare while() would never read a line and
    # would never terminate
    while(<PP>) {
        chomp;
        push @protected_patterns, $_;
    }
    close(PP);
}
111 |
my @batch_sentences = ();
my @thread_list = ();
my $count_sentences = 0;   # number of input lines read, used by -time

if ($NUM_THREADS > 1)
{# multi-threading tokenization
    # Lines are collected until there is one full sub-batch for every thread,
    # then the threads are started and joined in creation order so that the
    # output keeps the input order.
    while(<STDIN>)
    {
        $count_sentences = $count_sentences + 1;
        push(@batch_sentences, $_);
        if (scalar(@batch_sentences)>=($NUM_SENTENCES_PER_THREAD*$NUM_THREADS))
        {
            # assign each thread work
            for (my $i=0; $i<$NUM_THREADS; $i++)
            {
                my $start_index = $i*$NUM_SENTENCES_PER_THREAD;
                my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1;
                my @subbatch_sentences = @batch_sentences[$start_index..$end_index];
                my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences;
                push(@thread_list, $new_thread);
            }
            foreach (@thread_list)
            {
                my $tokenized_list = $_->join;
                foreach (@$tokenized_list)
                {
                    print $_;
                }
            }
            # reset for the new run
            @thread_list = ();
            @batch_sentences = ();
        }
    }
    # the last batch (possibly too short to give every thread a full share)
    if (scalar(@batch_sentences)>0)
    {
        # assign each thread work
        for (my $i=0; $i<$NUM_THREADS; $i++)
        {
            my $start_index = $i*$NUM_SENTENCES_PER_THREAD;
            if ($start_index >= scalar(@batch_sentences))
            {
                last;
            }
            my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1;
            if ($end_index >= scalar(@batch_sentences))
            {
                $end_index = scalar(@batch_sentences)-1;
            }
            my @subbatch_sentences = @batch_sentences[$start_index..$end_index];
            my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences;
            push(@thread_list, $new_thread);
        }
        foreach (@thread_list)
        {
            my $tokenized_list = $_->join;
            foreach (@$tokenized_list)
            {
                print $_;
            }
        }
    }
}
else
{# single thread only
    while(<STDIN>)
    {
        # counted here as well, otherwise the -time report below would
        # divide by zero in single-thread mode
        $count_sentences = $count_sentences + 1;
        if (($SKIP_XML && /^<.+>$/) || /^\s*$/)
        {
            #don't try to tokenize XML/HTML tag lines
            print $_;
        }
        else
        {
            print &tokenize($_);
        }
    }
}

if ($TIMING)
{
    my $duration = Time::HiRes::tv_interval( $start_time );
    print STDERR ("TOTAL EXECUTION TIME: ".$duration."\n");
    # no per-line speed can be reported for empty input
    if ($count_sentences > 0)
    {
        print STDERR ("TOKENIZATION SPEED: ".($duration/$count_sentences*1000)." milliseconds/line\n");
    }
}
198 |
199 | #####################################################################################
200 | # subroutines afterward
201 |
202 | # tokenize a batch of texts saved in an array
203 | # input: an array containing a batch of texts
204 | # return: another array containing a batch of tokenized texts for the input array
sub tokenize_batch
{
    # Map every input line to one output line, in order. Blank lines, and
    # XML/HTML tag lines when $SKIP_XML is set, are passed through
    # untouched; every other line goes through tokenize().
    # Returns a reference to the array of output lines.
    my @tokenized_list = map {
        my $passthrough = ($SKIP_XML && /^<.+>$/) || /^\s*$/;
        $passthrough ? $_ : &tokenize($_);
    } @_;
    return \@tokenized_list;
}
223 |
224 | # the actual tokenize function which tokenizes one input string
225 | # input: one string
226 | # return: the tokenized string for the input string
sub tokenize
{
    # Tokenize one line of text and return it as a single string that ends
    # with a newline. With -penn the work is delegated to tokenize_penn().
    #
    # NOTE(review): in the dump this file was taken from, the text between a
    # '<' and the next '>' had been stripped and HTML entities had been
    # decoded. The abbreviation check, the clean-up, the restoration of
    # protected patterns and multi-dots, and the entity escapes below were
    # rebuilt from the surviving fragments -- confirm against the upstream
    # Moses tokenizer.perl before relying on exact output.
    my($text) = @_;

    if ($PENN) {
        return tokenize_penn($text);
    }

    chomp($text);
    $text = " $text ";

    # remove ASCII junk
    $text =~ s/\s+/ /g;
    $text =~ s/[\000-\037]//g;

    # Find protected patterns: collect every match of every pattern, scanning
    # the remainder of the line after each hit
    my @protected = ();
    foreach my $protected_pattern (@protected_patterns) {
        my $t = $text;
        while ($t =~ /($protected_pattern)(.*)$/) {
            push @protected, $1;
            $t = $2;
        }
    }

    # swap each protected span for a numbered placeholder that the rules
    # below leave alone; the placeholders are swapped back near the end
    for (my $i = 0; $i < scalar(@protected); ++$i) {
        my $subst = sprintf("THISISPROTECTED%.3d", $i);
        $text =~ s,\Q$protected[$i], $subst ,g;
    }
    $text =~ s/ +/ /g;
    $text =~ s/^ //g;
    $text =~ s/ $//g;

    # separate out all "other" special characters
    $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;

    # aggressive hyphen splitting
    if ($AGGRESSIVE)
    {
        $text =~ s/([\p{IsAlnum}])\-(?=[\p{IsAlnum}])/$1 \@-\@ /g;
    }

    #multi-dots stay together: each run of dots becomes one DOTMULTI token,
    #with one DOT prefix per dot so the run can be rebuilt later
    $text =~ s/\.([\.]+)/ DOTMULTI$1/g;
    while($text =~ /DOTMULTI\./)
    {
        $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g;
        $text =~ s/DOTMULTI\./DOTDOTMULTI/g;
    }

    # separate out "," except if within numbers (5,300)
    #$text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;

    # separate out "," except if within numbers (5,300)
    # previous "global" application skips some:  A,B,C,D,E > A , B,C , D,E
    # first application uses up B so rule can't see B,C
    # two-step version here may create extra spaces but these are removed later
    # will also space digit,letter or letter,digit forms (redundant with next section)
    $text =~ s/([^\p{IsN}])[,]/$1 , /g;
    $text =~ s/[,]([^\p{IsN}])/ , $1/g;

    # separate , pre and post number
    #$text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
    #$text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;

    # turn `into '
    #$text =~ s/\`/\'/g;

    #turn '' into "
    #$text =~ s/\'\'/ \" /g;

    if ($language eq "en")
    {
        #split contractions right
        $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
        $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g;
        $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
        $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g;
        #special case for "1990's"
        $text =~ s/([\p{IsN}])[']([s])/$1 '$2/g;
    }
    elsif (($language eq "fr") or ($language eq "it"))
    {
        #split contractions left
        $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
        $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
        $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
        $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g;
    }
    else
    {
        $text =~ s/\'/ \' /g;
    }

    #word token method: decide for every word ending in "." whether the
    #period belongs to the word or is split off as its own token
    my @words = split(/\s/,$text);
    $text = "";
    for (my $i=0;$i<(scalar(@words));$i++)
    {
        my $word = $words[$i];
        if ( $word =~ /^(\S+)\.$/)
        {
            my $pre = $1;
            if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/)))
            {
                #no change: dotted abbreviation (e.g. "U.S"), known
                #nonbreaking prefix, or next word starts in lower case
            }
            elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/)))
            {
                #no change: numeric-only prefix followed by a number
            }
            else
            {
                $word = $pre." .";
            }
        }
        $text .= $word." ";
    }

    # clean up extraneous spaces
    $text =~ s/ +/ /g;
    $text =~ s/^ //g;
    $text =~ s/ $//g;

    # restore protected
    for (my $i = 0; $i < scalar(@protected); ++$i) {
        my $subst = sprintf("THISISPROTECTED%.3d", $i);
        $text =~ s/$subst/$protected[$i]/g;
    }

    #restore multi-dots
    while($text =~ /DOTDOTMULTI/)
    {
        $text =~ s/DOTDOTMULTI/DOTMULTI./g;
    }
    $text =~ s/DOTMULTI/./g;

    #escape special chars
    if (!$NO_ESCAPING)
    {
        $text =~ s/\&/\&amp;/g;   # escape escape
        $text =~ s/\|/\&#124;/g;  # factor separator
        $text =~ s/\</\&lt;/g;    # xml
        $text =~ s/\>/\&gt;/g;    # xml
        $text =~ s/\'/\&apos;/g;  # xml
        $text =~ s/\"/\&quot;/g;  # xml
        $text =~ s/\[/\&#91;/g;   # syntax non-terminal
        $text =~ s/\]/\&#93;/g;   # syntax non-terminal
    }

    #ensure final line break
    $text .= "\n" unless $text =~ /\n$/;

    return $text;
}
382 |
383 | sub tokenize_penn
384 | {
385 | # Improved compatibility with Penn Treebank tokenization. Useful if
386 | # the text is to later be parsed with a PTB-trained parser.
387 | #
388 | # Adapted from Robert MacIntyre's sed script:
389 | # http://www.cis.upenn.edu/~treebank/tokenizer.sed
390 |
391 | my($text) = @_;
392 | chomp($text);
393 |
394 | # remove ASCII junk
395 | $text =~ s/\s+/ /g;
396 | $text =~ s/[\000-\037]//g;
397 |
398 | # attempt to get correct directional quotes
399 | $text =~ s/^``/`` /g;
400 | $text =~ s/^"/`` /g;
401 | $text =~ s/^`([^`])/` $1/g;
402 | $text =~ s/^'/` /g;
403 | $text =~ s/([ ([{<])"/$1 `` /g;
404 | $text =~ s/([ ([{<])``/$1 `` /g;
405 | $text =~ s/([ ([{<])`([^`])/$1 ` $2/g;
406 | $text =~ s/([ ([{<])'/$1 ` /g;
407 | # close quotes handled at end
408 |
409 | $text =~ s=\.\.\.= _ELLIPSIS_ =g;
410 |
411 | # separate out "," except if within numbers (5,300)
412 | $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
413 | # separate , pre and post number
414 | $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
415 | $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
416 |
417 | #$text =~ s=([;:@#\$%&\p{IsSc}])= $1 =g;
418 | $text =~ s=([;:@#\$%&\p{IsSc}\p{IsSo}])= $1 =g;
419 |
420 | # Separate out intra-token slashes. PTB tokenization doesn't do this, so
421 | # the tokens should be merged prior to parsing with a PTB-trained parser
422 | # (see syntax-hyphen-splitting.perl).
423 | $text =~ s/([\p{IsAlnum}])\/([\p{IsAlnum}])/$1 \@\/\@ $2/g;
424 |
425 | # Assume sentence tokenization has been done first, so split FINAL periods
426 | # only.
427 | $text =~ s=([^.])([.])([\]\)}>"']*) ?$=$1 $2$3 =g;
428 | # however, we may as well split ALL question marks and exclamation points,
429 | # since they shouldn't have the abbrev.-marker ambiguity problem
430 | $text =~ s=([?!])= $1 =g;
431 |
432 | # parentheses, brackets, etc.
433 | $text =~ s=([\]\[\(\){}<>])= $1 =g;
434 | $text =~ s/\(/-LRB-/g;
435 | $text =~ s/\)/-RRB-/g;
436 | $text =~ s/\[/-LSB-/g;
437 | $text =~ s/\]/-RSB-/g;
438 | $text =~ s/{/-LCB-/g;
439 | $text =~ s/}/-RCB-/g;
440 |
441 | $text =~ s=--= -- =g;
442 |
443 | # First off, add a space to the beginning and end of each line, to reduce
444 | # necessary number of regexps.
445 | $text =~ s=$= =;
446 | $text =~ s=^= =;
447 |
448 | $text =~ s="= '' =g;
449 | # possessive or close-single-quote
450 | $text =~ s=([^'])' =$1 ' =g;
451 | # as in it's, I'm, we'd
452 | $text =~ s='([sSmMdD]) = '$1 =g;
453 | $text =~ s='ll = 'll =g;
454 | $text =~ s='re = 're =g;
455 | $text =~ s='ve = 've =g;
456 | $text =~ s=n't = n't =g;
457 | $text =~ s='LL = 'LL =g;
458 | $text =~ s='RE = 'RE =g;
459 | $text =~ s='VE = 'VE =g;
460 | $text =~ s=N'T = N'T =g;
461 |
462 | $text =~ s= ([Cc])annot = $1an not =g;
463 | $text =~ s= ([Dd])'ye = $1' ye =g;
464 | $text =~ s= ([Gg])imme = $1im me =g;
465 | $text =~ s= ([Gg])onna = $1on na =g;
466 | $text =~ s= ([Gg])otta = $1ot ta =g;
467 | $text =~ s= ([Ll])emme = $1em me =g;
468 | $text =~ s= ([Mm])ore'n = $1ore 'n =g;
469 | $text =~ s= '([Tt])is = '$1 is =g;
470 | $text =~ s= '([Tt])was = '$1 was =g;
471 | $text =~ s= ([Ww])anna = $1an na =g;
472 |
473 | #word token method
474 | my @words = split(/\s/,$text);
475 | $text = "";
476 | for (my $i=0;$i<(scalar(@words));$i++)
477 | {
478 | my $word = $words[$i];
479 | if ( $word =~ /^(\S+)\.$/)
480 | {
481 | my $pre = $1;
482 | if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i/\>/g; # xml
511 | $text =~ s/\'/\'/g; # xml
512 | $text =~ s/\"/\"/g; # xml
513 | $text =~ s/\[/\[/g; # syntax non-terminal
514 | $text =~ s/\]/\]/g; # syntax non-terminal
515 |
516 | #ensure final line break
517 | $text .= "\n" unless $text =~ /\n$/;
518 |
519 | return $text;
520 | }
521 |
sub load_prefixes
{
    # Load the nonbreaking-prefix list for $language into the hash referenced
    # by $PREFIX_REF.
    #
    # The list is read from "$mydir/nonbreaking_prefix.$language"; when that
    # file does not exist the English list is used instead (with a warning),
    # and the script dies if the English list is missing as well.
    #
    # Each non-empty line that does not start with '#' is one entry:
    #   "<prefix> #NUMERIC_ONLY#"  -> stored with value 2
    #   "<prefix>"                 -> stored with value 1
    my ($language, $PREFIX_REF) = @_;

    my $prefixfile = "$mydir/nonbreaking_prefix.$language";

    #default back to English if we don't have a language-specific prefix file
    if (!(-e $prefixfile))
    {
        $prefixfile = "$mydir/nonbreaking_prefix.en";
        print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
        die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
    }

    if (-e "$prefixfile")
    {
        # Fail loudly instead of silently continuing with an empty prefix list.
        open(PREFIX, "<:utf8", "$prefixfile")
            or die ("ERROR: Cannot open $prefixfile: $!\n");
        # Read the file line by line; a bare `while ()` would loop forever
        # without ever reading from the handle.
        while (<PREFIX>)
        {
            my $item = $_;
            chomp($item);
            if (($item) && (substr($item,0,1) ne "#"))
            {
                if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/)
                {
                    $PREFIX_REF->{$1} = 2;
                }
                else
                {
                    $PREFIX_REF->{$item} = 1;
                }
            }
        }
        close(PREFIX);
    }
}
558 |
--------------------------------------------------------------------------------
/data/translate.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Run translate_uni.py with a fixed model, dictionary pair, input file and
# output path.

model=".pretrained/model_wmt15_bpe2k_uni_en-ru.npz"
dict="/misc/kcgscratch1/ChoGroup/junyoung_exp/wmt15/ruen/train/all_ru-en.en.tok.bpe.word.pkl"
dict_rev="/misc/kcgscratch1/ChoGroup/junyoung_exp/wmt15/ruen/train/all_ru-en.ru.tok.bpe.word.pkl"
source="/misc/kcgscratch1/ChoGroup/junyoung_exp/wmt15/ruen/dev/newstest2013-src.en.tok.bpe"
saveto=".translate/standard.trans"

# Quote every expansion so paths containing spaces or glob characters are
# passed through as single arguments.
python translate_uni.py "$model" "$dict" "$dict_rev" "$source" "$saveto"
--------------------------------------------------------------------------------
/data_iterator.py:
--------------------------------------------------------------------------------
1 | import numpy
2 |
3 | import cPickle as pkl
4 | import gzip
5 |
6 |
def fopen(filename, mode='r'):
    """Open `filename` with `mode`, using gzip for names ending in '.gz'."""
    opener = gzip.open if filename.endswith('.gz') else open
    return opener(filename, mode)
11 |
12 |
class TextIterator:
    """Simple Bitext iterator.

    Iterates over two line-aligned text files and yields
    `(source, target)` batches, where each element is a list of sentences
    and each sentence is a list of integer word ids.  Words missing from a
    dictionary, or whose id is >= the corresponding `n_words_*` limit
    (when that limit is positive), are mapped to id 1.

    Parameters:
        source, target: paths of the two text files ('.gz' is opened
            through gzip via `fopen`).
        source_dict, target_dict: paths of pickled word -> id mappings.
        batch_size: maximum number of sentence pairs per batch.
        maxlen: length threshold used when filtering pairs (see `next`).
        n_words_source, n_words_target: vocabulary limits; values <= 0
            disable the limit.
        cache: the read buffer holds `batch_size * cache` sentence pairs.
        eos: accepted for interface compatibility; not used by this class.

    Attributes:
        num: number of lines found in the source file.
    """

    def __init__(self, source, target,
                 source_dict, target_dict,
                 batch_size=128,
                 maxlen=100,
                 n_words_source=-1,
                 n_words_target=-1,
                 cache=10,
                 eos=False):

        # First pass: count the lines of both files so a mismatch is caught
        # before training starts.  The counts are true line counts (the
        # last `enumerate` index would be one short and undefined for an
        # empty file).
        print('scan the dataset.')
        n_source = self._count_lines(source)
        n_target = self._count_lines(target)

        assert n_source == n_target, 'the number of the source and target document must the same'
        print('scanned {} lines'.format(n_source))

        self.source = fopen(source, 'r')
        self.target = fopen(target, 'r')

        with open(source_dict, 'rb') as f:
            self.source_dict = pkl.load(f)
        with open(target_dict, 'rb') as f:
            self.target_dict = pkl.load(f)

        self.num = n_source
        self.batch_size = batch_size
        self.maxlen = maxlen

        self.n_words_source = n_words_source
        self.n_words_target = n_words_target

        self.source_buffer = []
        self.target_buffer = []
        self.k = batch_size * cache

        self.end_of_data = False

    @staticmethod
    def _count_lines(filename):
        """Return the number of lines in `filename`, closing it afterwards."""
        handle = fopen(filename, 'r')
        try:
            return sum(1 for _ in handle)
        finally:
            handle.close()

    def __iter__(self):
        return self

    def reset(self):
        """Rewind both files to their beginning."""
        self.source.seek(0)
        self.target.seek(0)

    def _fill_buffers(self):
        """Read up to `self.k` sentence pairs and sort them by target length."""
        for _ in range(self.k):
            ss = self.source.readline()
            if not ss:
                break
            tt = self.target.readline()
            if not tt:
                break

            self.source_buffer.append(ss.strip().split())
            self.target_buffer.append(tt.strip().split())

        # sort by target buffer; `next` pops from the end, so the longest
        # targets of the buffer are served first
        tlen = numpy.array([len(t) for t in self.target_buffer])
        tidx = tlen.argsort()

        self.source_buffer = [self.source_buffer[i] for i in tidx]
        self.target_buffer = [self.target_buffer[i] for i in tidx]

    def next(self):
        """Return the next `(source, target)` batch.

        Raises StopIteration (after rewinding the files) when no further
        batch can be produced, so the iterator can be reused for another
        epoch.
        """
        if self.end_of_data:
            self.end_of_data = False
            self.reset()
            raise StopIteration

        source = []
        target = []

        # fill buffer, if it's empty
        assert len(self.source_buffer) == len(self.target_buffer), 'Buffer size mismatch!'

        if len(self.source_buffer) == 0:
            self._fill_buffers()

        if len(self.source_buffer) == 0 or len(self.target_buffer) == 0:
            self.end_of_data = False
            self.reset()
            raise StopIteration

        # Only in-memory buffers are touched below, so no I/O error handling
        # is needed here.
        while self.source_buffer:
            # map source words to indices
            ss = self.source_buffer.pop()
            ss = [self.source_dict[w] if w in self.source_dict else 1
                  for w in ss]
            if self.n_words_source > 0:
                ss = [w if w < self.n_words_source else 1 for w in ss]

            # map target words to indices
            tt = self.target_buffer.pop()
            tt = [self.target_dict[w] if w in self.target_dict else 1
                  for w in tt]
            if self.n_words_target > 0:
                tt = [w if w < self.n_words_target else 1 for w in tt]

            # NOTE(review): a pair is dropped only when BOTH sides exceed
            # maxlen; confirm whether `or` was intended.  Left unchanged
            # because it alters which training pairs are produced.
            if len(ss) > self.maxlen and len(tt) > self.maxlen:
                continue

            source.append(ss)
            target.append(tt)

            if len(source) >= self.batch_size or \
                    len(target) >= self.batch_size:
                break

        if len(source) <= 0 or len(target) <= 0:
            self.end_of_data = False
            self.reset()
            raise StopIteration

        return source, target

    # Python 3 iterator protocol; `next` stays available for Python 2.
    __next__ = next
149 |
150 |
def iterate(fname, word_dict, n_words):
    """Yield each line of `fname` as a list of word ids terminated by 0.

    Words missing from `word_dict`, and words whose id is >= `n_words`,
    are mapped to id 1.  A blank line yields `[0]`.
    """
    with open(fname, 'r') as f:
        for line in f:
            # Build real lists (not `map` objects) so appending the
            # end-of-sentence id works on Python 3 as well as Python 2.
            ids = [word_dict[w] if w in word_dict else 1
                   for w in line.strip().split()]
            x = [ii if ii < n_words else 1 for ii in ids]
            x += [0]
            yield x
159 |
160 |
def check_length(fname):
    """Return the number of lines in the text file `fname`."""
    handle = open(fname, 'r')
    total = sum(1 for _line in handle)
    handle.close()
    return total
--------------------------------------------------------------------------------
/data_iterator.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nyu-dl/dl4mt-simul-trans/392ff3148e944be6fbc475d5285441807902e2e0/data_iterator.pyc
--------------------------------------------------------------------------------
/insepection.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import matplotlib
3 | # matplotlib.use('agg')
4 | import copy
5 | import numpy
6 | import os
7 | import seaborn as sns
8 | import pandas as pd
9 | sns.set(context="paper", font="monospace", style='whitegrid')
10 | from matplotlib import pyplot as plot
11 | from matplotlib import rc
12 |
# Module-wide matplotlib settings, applied at import time: Verdana font at
# size 8, with text rendered through LaTeX (usetex=True).
rc('font',**{'family':'Verdana', 'weight': 'normal'})
rc('font', size=8)
rc('text', usetex=True)
rc('text.latex',unicode=True)
# NOTE(review): the four calls below all assign the same
# 'text.latex.preamble' setting, so presumably only the last one (ngerman
# babel) remains in effect -- confirm which packages are really needed.
rc('text.latex',preamble='\usepackage[utf8]{inputenc}')
rc('text.latex',preamble='\usepackage[russian]{babel}')
rc('text.latex',preamble='\usepackage[german]{babel}')
rc('text.latex',preamble='\usepackage[ngerman]{babel}')

# Tick label size for both axes.
matplotlib.rcParams['ytick.labelsize'] = 11
matplotlib.rcParams['xtick.labelsize'] = 11
24 |
def heatmap(sources, refs, trans, actions, idx, atten=None, savefig=True, name='test', info=None, show=False):
    """Plot the action path of sentence `idx` as an annotated heatmap.

    Rows are the source tokens (plus a trailing '||'), columns are the
    translated tokens (with a leading '*' and trailing '||').  Starting at
    cell (0, 0), marked 'S', every action `a` moves the path by `a`
    columns and `1 - a` rows; the visited cell is set to 1 and annotated
    'W' when `a == 0` and 'C' otherwise.  When `atten` is given, its
    entry for `idx` is added (transposed) on top of the path.

    Parameters:
        sources, trans: sequences of UTF-8 encoded sentences; '@@' is
            displayed as '--'.
        refs: not used by this function.
        actions: per-sentence action sequences.
        idx: index of the sentence to plot.
        atten: optional per-sentence attention matrices.
        savefig: save the figure as a PDF under '.images/C_<name>/'.
        name: suffix of the output directory.
        info: mapping with an 'index' entry plus numeric values that are
            encoded into the file name; defaults to `{'index': idx}`.
        show: display the figure interactively.
    """
    source = [s.strip() for s in sources[idx].decode('utf8').replace('@@', '--').split()] + ['||']
    target = ['*'] + [s.strip() for s in trans[idx].decode('utf8').replace('@@', '--').split()] + ['||']
    action = actions[idx]

    if atten:
        attention = numpy.array(atten[idx])

    def track(acts, data, annote):
        # Walk the grid following the action sequence.
        x, y = 0, 0
        for a in acts:
            x += a
            y += 1 - a
            data[y, x] = 1
            annote[y, x] = 'W' if a == 0 else 'C'

        return data, annote

    data = numpy.zeros((len(source), len(target)))
    annote = numpy.chararray(data.shape, itemsize=8)
    annote[:] = ''
    data, annote = track(action, data, annote)
    data[0, 0] = 1
    annote[0, 0] = 'S'
    if atten:
        data[:-1, 1:] += attention.T

    d = pd.DataFrame(data=data, columns=target, index=source)
    f, ax = plot.subplots(figsize=(11, 11))
    f.set_canvas(plot.gcf().canvas)
    g = sns.heatmap(d, ax=ax, annot=annote, fmt='s')
    g.xaxis.tick_top()

    plot.xticks(rotation=90)
    plot.yticks(rotation=0)
    if savefig:
        # Fall back to the sentence index so the default arguments do not
        # crash on `info['index']`.
        if info is None:
            info = {'index': idx}

        out_dir = '.images/C_{}'.format(name)
        if not os.path.exists(out_dir):
            # makedirs also creates '.images' itself when it is missing
            os.makedirs(out_dir)

        filename = 'Idx={}||'.format(info['index'])
        for w in info:
            # compare by value; identity (`is not`) on strings is unreliable
            if w != 'index':
                filename += '.{}={:.2f}'.format(w, float(info[w]))

        print('saving...')
        f.savefig(out_dir + '/{}'.format(filename) + '.pdf', dpi=100)
    if show:
        plot.show()

    print('plotting done.')
    plot.close()
81 |
def heatmap2(sources, refs, trans, actions, idx, atten=None, full_atten=None, savefig=True, name='test', info=None, show=False):
    """Plot two attention heatmaps for sentence `idx` side by side.

    The left panel uses `atten[idx]`, the right panel `full_atten[idx]`.
    Rows are '*' + source tokens + '||'; columns are '*' + translated
    tokens + '||' + '*'.  Cell (1, 0) is always set to 1, and each panel
    adds its (transposed) attention to the interior cells.  A panel whose
    attention was not supplied is drawn without attention values.  The
    action sequence is not rendered in this figure.

    Parameters:
        sources, trans: sequences of UTF-8 encoded sentences; '@@' is
            displayed as '--'.
        refs, actions: not used by this function.
        idx: index of the sentence to plot.
        atten, full_atten: optional per-sentence attention matrices.
        savefig: save the figure as a PDF under '.images/M_<name>/'.
        name: suffix of the output directory.
        info: mapping with an 'index' entry plus numeric values that are
            encoded into the file name; defaults to `{'index': idx}`.
        show: display the figure interactively.
    """
    source = ['*'] + [s.strip() for s in sources[idx].decode('utf8').replace('@@', '--').split()] + ['||']
    target = ['*'] + [s.strip() for s in trans[idx].decode('utf8').replace('@@', '--').split()] + ['||'] + ['*']

    attention = numpy.array(atten[idx]) if atten else None
    fullatten = numpy.array(full_atten[idx]) if full_atten else None

    data = numpy.zeros((len(source), len(target)))
    annote = numpy.chararray(data.shape, itemsize=8)
    annote[:] = ''
    data[1, 0] = 1

    def draw(data_t, ax, attention=None):
        # Work on a copy so both panels start from the same base grid.
        data = copy.copy(data_t)
        if attention is not None:
            data[1:-1, 1:-1] += attention.T
        d = pd.DataFrame(data=data, columns=target, index=source)
        g = sns.heatmap(d, mask=(data==0), square=True, cbar=False, linewidths=0.1, ax=ax, annot=annote, fmt='s')
        g.xaxis.tick_top()

        for tick in ax.get_xticklabels():
            tick.set_rotation(90)
        for tick in ax.get_yticklabels():
            tick.set_rotation(0)

        ax.grid(True)

    f, [ax1, ax2] = plot.subplots(1, 2, figsize=(22, 11))
    f.set_canvas(plot.gcf().canvas)

    draw(data, ax1, attention)
    draw(data, ax2, fullatten)

    if savefig:
        # Fall back to the sentence index so the default arguments do not
        # crash on `info['index']`.
        if info is None:
            info = {'index': idx}

        out_dir = '.images/M_{}'.format(name)
        if not os.path.exists(out_dir):
            # makedirs also creates '.images' itself when it is missing
            os.makedirs(out_dir)

        filename = 'Idx={}||'.format(info['index'])
        for w in info:
            # compare by value; identity (`is not`) on strings is unreliable
            if w != 'index':
                filename += '.{}={:.2f}'.format(w, float(info[w]))

        plot.savefig(out_dir + '/{}'.format(filename) + '.pdf', dpi=100)

    if show:
        plot.show()

    plot.close()
164 |
165 |
166 |
167 |
168 |
169 |
def visualize(sources, refs, trans, aligns, idx, savefig=True, name='test', info=None):
    """Draw sentence `idx` as two rows of text linked by alignment wedges.

    Source tokens are laid out on the upper row and translated tokens on
    the lower row; `aligns[idx]` gives one source position per translated
    token.  Consecutive segments alternate between two colours, and dotted
    lines plus shaded triangles connect each source position to the span
    of translated tokens aligned to it.

    Parameters:
        sources: sequence of source sentences; '@@' is displayed as '--'.
        refs: not used by this function.
        trans: sequence of UTF-8 encoded translations.
        aligns: per-sentence alignment sequences.
        idx: index of the sentence to plot.
        savefig: save the figure as a PDF under '.images/<name>/'.
        name: name of the output directory.
        info: mapping with an 'index' entry plus numeric values that are
            encoded into the file name; defaults to `{'index': idx}`.
    """
    colors = ['b', 'g']

    fig = plot.figure(figsize=(20, 2))
    ax = plot.gca()

    plot.xlim([0., 10.])

    # Computed once instead of on every loop iteration.
    src_tokens = [s_.replace('@@', '--').strip() for s_ in sources[idx].split()] + ['']
    boundaries = numpy.unique(aligns[idx])

    # Colour per source token: flip colour whenever a boundary is reached.
    scolors = []
    caidx = 0
    coloridx = 0
    for sidx in range(len(src_tokens)):
        if caidx >= len(boundaries) or sidx >= boundaries[caidx]:
            caidx = caidx + 1
            coloridx = 1 - coloridx
        scolors.append(colors[coloridx])

    # Colour per translated token: flip whenever the aligned position changes.
    tcolors = []
    lastidx = -1
    coloridx = 1
    for tt in aligns[idx]:
        if tt != lastidx:
            lastidx = tt
            coloridx = 1 - coloridx
        tcolors.append(colors[coloridx])

    # Lay out the source row and record the x position after each token.
    x, y = 0., 1.
    s_pos = [(x, y)]
    for ii, ss in enumerate(src_tokens):

        # Text is rendered through LaTeX, where a bare '%' starts a comment;
        # the result must be assigned because str.replace returns a new string.
        ss = ss.replace('%', '\\%')
        xx = plot.text(x, y, ss)
        xx.set_bbox(dict(color=scolors[ii], alpha=0.1, edgecolor=scolors[ii]))
        xx._renderer = fig.canvas.get_renderer()
        wext = xx.get_window_extent()
        bbox = ax.transData.inverted().transform(wext)
        x = bbox[1, 0] + 0.
        s_pos.append((x, y))
    s_pos.append((bbox[1, 0], y))

    # Lay out the translation row and record the left edge of each token.
    x, y = 0., .95
    t_pos = []
    for ii, ss in enumerate([s_.decode('utf8').replace('@@', '--') for s_ in trans[idx].split()]):

        ss = ss.replace('%', '\\%')
        xx = plot.text(x, y, ss)
        xx._renderer = fig.canvas.get_renderer()
        wext = xx.get_window_extent()
        bbox = ax.transData.inverted().transform(wext)
        t_pos.append((bbox[0, 0], bbox[0, 1] + 0.03))
        x = bbox[1, 0] + 0.
    t_pos.append((bbox[1, 0], bbox[0, 1] + 0.03))

    # Connect each new source position to the span of tokens aligned to it.
    lasttidx = 0
    lastidx = -1
    for tidx, sidx in enumerate(aligns[idx]):
        if lastidx != sidx:
            lastidx = sidx
            lasttidx = tidx
            sidx = numpy.minimum(sidx, len(s_pos) - 1)
            plot.arrow(s_pos[sidx][0], s_pos[sidx][1],
                       t_pos[tidx][0] - s_pos[sidx][0],
                       t_pos[tidx][1] - s_pos[sidx][1],
                       head_width=0., head_length=0.,
                       fc=tcolors[tidx], ec=tcolors[tidx],
                       linestyle='dotted', width=0.0001)
            for tt in range(tidx, len(aligns[idx])):
                if aligns[idx][tt] != sidx:
                    plot.arrow(s_pos[sidx][0], s_pos[sidx][1],
                               t_pos[tt][0] - s_pos[sidx][0],
                               t_pos[tt][1] - s_pos[sidx][1],
                               head_width=0., head_length=0.,
                               fc=tcolors[tidx], ec=tcolors[tidx],
                               linestyle='dotted', width=0.0001)
                    plot.fill_between([t_pos[tidx][0], s_pos[sidx][0], t_pos[tt][0]],
                                      [t_pos[tidx][1], s_pos[sidx][1], t_pos[tt][1]],
                                      facecolor=tcolors[tidx], alpha=0.1)
                    break
    # Close the final segment against the end of the translation row.
    plot.arrow(s_pos[sidx][0], s_pos[sidx][1],
               t_pos[-1][0] - s_pos[sidx][0],
               t_pos[-1][1] - s_pos[sidx][1],
               head_width=0., head_length=0.,
               fc=tcolors[-1], ec=tcolors[-1],
               linestyle='dotted', width=0.0001)
    plot.fill_between([t_pos[lasttidx][0], s_pos[sidx][0], t_pos[-1][0]],
                      [t_pos[lasttidx][1], s_pos[sidx][1], t_pos[-1][1]],
                      facecolor=tcolors[tidx], alpha=0.1)

    plot.axis('off')
    plot.ylim([0.95, 1.01])
    plot.tight_layout()

    if savefig:
        # Fall back to the sentence index so the default arguments do not
        # crash on `info['index']`.
        if info is None:
            info = {'index': idx}

        out_dir = '.images/{}'.format(name)
        if not os.path.exists(out_dir):
            # makedirs also creates '.images' itself when it is missing
            os.makedirs(out_dir)

        filename = 'Idx={}||'.format(info['index'])
        for w in info:
            # compare by value; identity (`is not`) on strings is unreliable
            if w != 'index':
                filename += '.{}={:.2f}'.format(w, float(info[w]))

        plot.savefig(out_dir + '/{}'.format(filename) + '.pdf', dpi=300)

    print('plotting done.')
    plot.close()
280 | # plot.show()
281 |
282 |
283 | if __name__ == "__main__":
284 |
285 | sources = ['I cannot understand .']
286 | targets = ['Ich verstehe nicht .']
287 | actions = [[0, 0, 1, 1, 2, 0, 1, 2, 2, 0, 1]]
288 | heatmap2(sources, targets, targets, actions, 0, savefig=False, show=True)
289 |
--------------------------------------------------------------------------------
/insepection.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nyu-dl/dl4mt-simul-trans/392ff3148e944be6fbc475d5285441807902e2e0/insepection.pyc
--------------------------------------------------------------------------------
/itchat.pkl:
--------------------------------------------------------------------------------
1 | (dp0
2 | S'cookies'
3 | p1
4 | (dp2
5 | S'webwx_data_ticket'
6 | p3
7 | S'gSdzU7D7VCmK6kLYm/REsyf8'
8 | p4
9 | sS'wxuin'
10 | p5
11 | S'1059617351'
12 | p6
13 | sS'webwxuvid'
14 | p7
15 | S'7e7876624bedc284f3184c1f6790bab3fb32382982efcab8e8e342feffde08084d7eacd58ec0dd5b5934762d984ea238'
16 | p8
17 | sS'webwx_auth_ticket'
18 | p9
19 | S'CIsBEIHz4qwOGoAB6e/2gBxNrbvLSYvZ6sEa7pcl65diPjyZ2lDKbWoj6R1hg1cyC3eMtluSIcwockeE1rFtthBYz0fgcSK9CKijLujMJxe+V9SAtUxLxdZDUdN/QHJgDAa6zTkGYu+lwz7sXk6T0LmWCSzbGohUtURcm9PybIL/9mUkTldZR3Y8S0Q='
20 | p10
21 | sS'wxloadtime'
22 | p11
23 | S'1488957301_expired'
24 | p12
25 | sS'wxpluginkey'
26 | p13
27 | S'1488934262'
28 | p14
29 | sS'wxsid'
30 | p15
31 | S'f7PxyPc1Cfip7gPz'
32 | p16
33 | sS'mm_lang'
34 | p17
35 | S'zh_CN'
36 | p18
37 | ssS'version'
38 | p19
39 | S'1.2.27'
40 | p20
41 | sS'storage'
42 | p21
43 | (dp22
44 | S'userName'
45 | p23
46 | V@81d220b3ed273e9d1a9d0bd871cc198f5999208926fda248aa8006daa5ffffb0
47 | p24
48 | sS'lastInputUserName'
49 | p25
50 | NsS'memberList'
51 | p26
52 | (lp27
53 | (dp28
54 | S'UserName'
55 | p29
56 | g24
57 | sS'City'
58 | p30
59 | S''
60 | p31
61 | sS'DisplayName'
62 | p32
63 | g31
64 | sS'UniFriend'
65 | p33
66 | I0
67 | sS'OwnerUin'
68 | p34
69 | I0
70 | sS'MemberList'
71 | p35
72 | (lp36
73 | sS'PYQuanPin'
74 | p37
75 | V
76 | p38
77 | sS'RemarkPYInitial'
78 | p39
79 | g38
80 | sS'Uin'
81 | p40
82 | I1059617351
83 | sS'AppAccountFlag'
84 | p41
85 | I0
86 | sS'VerifyFlag'
87 | p42
88 | I0
89 | sS'Province'
90 | p43
91 | g31
92 | sS'KeyWord'
93 | p44
94 | g31
95 | sS'RemarkName'
96 | p45
97 | g38
98 | sS'PYInitial'
99 | p46
100 | g38
101 | sS'ChatRoomId'
102 | p47
103 | I0
104 | sS'HideInputBarFlag'
105 | p48
106 | I0
107 | sVHeadImgFlag
108 | p49
109 | I1
110 | sS'EncryChatRoomId'
111 | p50
112 | g31
113 | sS'AttrStatus'
114 | p51
115 | I0
116 | sS'SnsFlag'
117 | p52
118 | I0
119 | sS'MemberCount'
120 | p53
121 | I0
122 | sVWebWxPluginSwitch
123 | p54
124 | I0
125 | sS'Alias'
126 | p55
127 | g31
128 | sS'Signature'
129 | p56
130 | g38
131 | sS'ContactFlag'
132 | p57
133 | I0
134 | sS'NickName'
135 | p58
136 | VCoral
137 | p59
138 | sS'RemarkPYQuanPin'
139 | p60
140 | g38
141 | sS'HeadImgUrl'
142 | p61
143 | V/cgi-bin/mmwebwx-bin/webwxgeticon?seq=1782193440&username=@81d220b3ed273e9d1a9d0bd871cc198f5999208926fda248aa8006daa5ffffb0&skey=@crypt_4c00d0e1_bf264d5f643dcc22a8b1701b72f216ca
144 | p62
145 | sS'Sex'
146 | p63
147 | I0
148 | sS'StarFriend'
149 | p64
150 | I0
151 | sS'Statues'
152 | p65
153 | I0
154 | sa(dp66
155 | VUserName
156 | p67
157 | V@f3ca6485604bf7a0b3518d4930e5cbdee824941efc1905597b5e6ddad15d3658
158 | p68
159 | sVCity
160 | p69
161 | V\u4e2d\u897f\u533a
162 | p70
163 | sVDisplayName
164 | p71
165 | g38
166 | sVUniFriend
167 | p72
168 | I0
169 | sVMemberList
170 | p73
171 | (lp74
172 | sVPYQuanPin
173 | p75
174 | g38
175 | sVRemarkPYInitial
176 | p76
177 | g38
178 | sVSex
179 | p77
180 | I1
181 | sVAppAccountFlag
182 | p78
183 | I0
184 | sVVerifyFlag
185 | p79
186 | I0
187 | sVProvince
188 | p80
189 | V\u9999\u6e2f
190 | p81
191 | sVKeyWord
192 | p82
193 | g38
194 | sVRemarkName
195 | p83
196 | g38
197 | sVPYInitial
198 | p84
199 | g38
200 | sVIsOwner
201 | p85
202 | I0
203 | sVChatRoomId
204 | p86
205 | I0
206 | sVHideInputBarFlag
207 | p87
208 | I0
209 | sVEncryChatRoomId
210 | p88
211 | g38
212 | sVAttrStatus
213 | p89
214 | I37847143
215 | sVSnsFlag
216 | p90
217 | I49
218 | sVMemberCount
219 | p91
220 | I0
221 | sVOwnerUin
222 | p92
223 | I0
224 | sVAlias
225 | p93
226 | VThoma_Gu
227 | p94
228 | sVSignature
229 | p95
230 | V\u80f8\u304c\u75db\u3044\u3001\u65e5\u3005\u5f37\u307e\u308b
231 | p96
232 | sVContactFlag
233 | p97
234 | I3
235 | sVNickName
236 | p98
237 | V\u30de\u30eb\u30c1\u30d1\u30b9
238 | p99
239 | sVRemarkPYQuanPin
240 | p100
241 | g38
242 | sVHeadImgUrl
243 | p101
244 | V/cgi-bin/mmwebwx-bin/webwxgeticon?seq=647880064&username=@f3ca6485604bf7a0b3518d4930e5cbdee824941efc1905597b5e6ddad15d3658&skey=@crypt_4c00d0e1_bf264d5f643dcc22a8b1701b72f216ca
245 | p102
246 | sVUin
247 | p103
248 | Vwxid_xgph596ajfxh12
249 | p104
250 | sVStarFriend
251 | p105
252 | I0
253 | sVStatues
254 | p106
255 | I0
256 | sasS'chatroomList'
257 | p107
258 | (lp108
259 | sS'nickName'
260 | p109
261 | g59
262 | sS'mpList'
263 | p110
264 | (lp111
265 | ssS'loginInfo'
266 | p112
267 | (dp113
268 | S'SyncKey'
269 | p114
270 | (dp115
271 | VCount
272 | p116
273 | I9
274 | sVList
275 | p117
276 | (lp118
277 | (dp119
278 | VVal
279 | p120
280 | I647880085
281 | sVKey
282 | p121
283 | I1
284 | sa(dp122
285 | VVal
286 | p123
287 | I647880092
288 | sVKey
289 | p124
290 | I2
291 | sa(dp125
292 | VVal
293 | p126
294 | I647880064
295 | sVKey
296 | p127
297 | I3
298 | sa(dp128
299 | VVal
300 | p129
301 | I647880012
302 | sVKey
303 | p130
304 | I11
305 | sa(dp131
306 | VVal
307 | p132
308 | I647880012
309 | sVKey
310 | p133
311 | I13
312 | sa(dp134
313 | VVal
314 | p135
315 | I1488959281
316 | sVKey
317 | p136
318 | I201
319 | sa(dp137
320 | VVal
321 | p138
322 | I1488934262
323 | sVKey
324 | p139
325 | I1000
326 | sa(dp140
327 | VVal
328 | p141
329 | I1488934292
330 | sVKey
331 | p142
332 | I1001
333 | sa(dp143
334 | VVal
335 | p144
336 | I1488859979
337 | sVKey
338 | p145
339 | I1003
340 | sassS'syncUrl'
341 | p146
342 | S'https://webpush.web.wechat.com/cgi-bin/mmwebwx-bin'
343 | p147
344 | sS'skey'
345 | p148
346 | V@crypt_4c00d0e1_bf264d5f643dcc22a8b1701b72f216ca
347 | p149
348 | sS'wxuin'
349 | p150
350 | V1059617351
351 | p151
352 | sS'synckey'
353 | p152
354 | S'1_647880085|2_647880092|3_647880064|11_647880012|13_647880012|201_1488959281|1000_1488934262|1001_1488934292|1003_1488859979'
355 | p153
356 | sS'url'
357 | p154
358 | Vhttps://web.wechat.com/cgi-bin/mmwebwx-bin
359 | p155
360 | sS'pass_ticket'
361 | p156
362 | V35aSttODnTZMiou7%2BxJ9v9C087xjTCfcWoYWANyF03knm19w4vbp6dnQGT1FCf14
363 | p157
364 | sS'wxsid'
365 | p158
366 | Vf7PxyPc1Cfip7gPz
367 | p159
368 | sS'User'
369 | p160
370 | g28
371 | sS'InviteStartCount'
372 | p161
373 | I40
374 | sS'fileUrl'
375 | p162
376 | S'https://file.web.wechat.com/cgi-bin/mmwebwx-bin'
377 | p163
378 | sS'BaseRequest'
379 | p164
380 | (dp165
381 | S'Sid'
382 | p166
383 | g159
384 | sS'Skey'
385 | p167
386 | g149
387 | sS'DeviceID'
388 | p168
389 | g157
390 | sg40
391 | g151
392 | ssS'deviceid'
393 | p169
394 | S'e275584183735061'
395 | p170
396 | ss.
--------------------------------------------------------------------------------
/layers.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nyu-dl/dl4mt-simul-trans/392ff3148e944be6fbc475d5285441807902e2e0/layers.pyc
--------------------------------------------------------------------------------
/mteval.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Score $DIR/test.txt against $DIR/ref.txt with multi-bleu.perl.

# ref=" /misc/kcgscratch1/ChoGroup/junyoung_exp/wmt15/ruen/dev/newstest2013-ref.ru.tok"
# sed -i 's/@@ //g' $1

DIR="/work/jg5223/work/SimulTrans/.translate/"

# Quote the paths so a directory containing spaces is not word-split.
./data/multi-bleu.perl "$DIR/ref.txt" < "$DIR/test.txt"
9 |
--------------------------------------------------------------------------------
/nmt_uni.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nyu-dl/dl4mt-simul-trans/392ff3148e944be6fbc475d5285441807902e2e0/nmt_uni.pyc
--------------------------------------------------------------------------------
/optimizer.py:
--------------------------------------------------------------------------------
1 | import theano
2 | import theano.tensor as tensor
3 | import numpy
4 |
5 | from layers import *
6 | profile = False
7 |
8 | # optimizers
9 | # name(hyperp, tparams, grads, inputs (list), cost) = f_grad_shared, f_update
10 |
11 |
12 | # gradient clipping
def grad_clip(grad, clip_c=1.):
    """Rescale a list of gradients so their global L2 norm is at most `clip_c`.

    The squared norm is summed over every gradient in `grad`.  Each
    gradient is replaced by a symbolic switch that yields
    `g / sqrt(norm2) * clip_c` when `norm2 > clip_c ** 2` and `g`
    otherwise.

    Parameters:
        grad: list of gradient expressions.
        clip_c: norm threshold (default 1., the previously hard-coded
            value); a value <= 0 disables clipping.

    Returns:
        The list of clipped gradients, or `grad` itself when clipping is
        disabled.
    """
    if clip_c <= 0.:
        return grad

    g2 = 0.
    for g in grad:
        g2 += (g ** 2).sum()

    return [tensor.switch(g2 > (clip_c ** 2), g / tensor.sqrt(g2) * clip_c, g)
            for g in grad]
24 |
25 |
def adam(lr, tparams, grads, inp, cost):
    """Build the two Theano functions of an Adam-style optimizer.

    Returns `(f_grad_shared, f_update)`:
      * `f_grad_shared(*inp)` evaluates `cost` and stores `grads` in one
        shared buffer per parameter;
      * `f_update(lr)` applies the parameter update from those buffers.

    The moving averages are written as `b * g + (1 - b) * avg`, so `b1`
    and `b2` weight the *new* gradient.
    NOTE(review): the correction terms use `b1 ** t` and `b2 ** t` with
    these constants -- confirm this matches the intended Adam variant.
    """
    # one zero-initialised gradient buffer per parameter
    grad_buffers = []
    for k, p in tparams.iteritems():
        grad_buffers.append(theano.shared(p.get_value() * 0.,
                                          name='%s_grad' % k))
    store_grads = list(zip(grad_buffers, grads))

    f_grad_shared = theano.function(inp, cost, updates=store_grads,
                                    profile=profile, on_unused_input='ignore')

    base_lr = lr  # 0.0002
    b1 = 0.1
    b2 = 0.001
    eps = 1e-8

    # step counter and step-dependent learning rate
    step = theano.shared(numpy.float32(0.))
    step_next = step + 1.
    fix1 = 1. - b1**(step_next)
    fix2 = 1. - b2**(step_next)
    lr_t = base_lr * (tensor.sqrt(fix2) / fix1)

    updates = []
    for p, g in zip(tparams.values(), grad_buffers):
        m = theano.shared(p.get_value() * 0.)
        v = theano.shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * tensor.sqr(g)) + ((1. - b2) * v)
        direction = m_t / (tensor.sqrt(v_t) + eps)
        updates.extend([(m, m_t),
                        (v, v_t),
                        (p, p - (lr_t * direction))])
    updates.append((step, step_next))

    print('build optimizer with Adam')
    f_update = theano.function([lr], [], updates=updates,
                               on_unused_input='ignore', profile=profile)

    return f_grad_shared, f_update
64 |
65 |
def adadelta(lr, tparams, grads, inp, cost):
    """Build the two Theano functions of the Adadelta optimizer.

    Returns `(f_grad_shared, f_update)`:
      * `f_grad_shared(*inp)` evaluates `cost`, stores `grads` in shared
        buffers and updates the running average of squared gradients;
      * `f_update(lr)` applies the update and refreshes the running
        average of squared updates.  `lr` is accepted as an input but does
        not appear in the update expressions.
    """
    def zeros_like_params(suffix):
        # one zero-initialised shared variable per parameter
        return [theano.shared(p.get_value() * numpy.float32(0.),
                              name='%s_%s' % (k, suffix))
                for k, p in tparams.iteritems()]

    zipped_grads = zeros_like_params('grad')
    running_up2 = zeros_like_params('rup2')
    running_grads2 = zeros_like_params('rgrad2')

    store_grads = list(zip(zipped_grads, grads))
    avg_sq_grads = []
    for rg2, g in zip(running_grads2, grads):
        avg_sq_grads.append((rg2, 0.95 * rg2 + 0.05 * (g ** 2)))

    f_grad_shared = theano.function(inp, cost,
                                    updates=store_grads + avg_sq_grads,
                                    profile=profile)

    updir = []
    for zg, ru2, rg2 in zip(zipped_grads, running_up2, running_grads2):
        updir.append(-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg)

    avg_sq_updates = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
                      for ru2, ud in zip(running_up2, updir)]
    param_updates = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)]

    f_update = theano.function([lr], [],
                               updates=avg_sq_updates + param_updates,
                               on_unused_input='ignore', profile=profile)

    print('build optimizer with Adadelta')
    return f_grad_shared, f_update
96 |
97 |
def rmsprop(lr, tparams, grads, inp, cost):
    """Build the two Theano functions of an RMSprop optimizer with momentum.

    Returns `(f_grad_shared, f_update)`:
      * `f_grad_shared(*inp)` evaluates `cost`, stores `grads` in shared
        buffers and updates the running averages of the gradients and of
        their squares;
      * `f_update(lr)` updates the per-parameter step buffers and applies
        them.  `lr` is accepted as an input but does not appear in the
        update expressions (the step size 1e-4 is fixed).
    """
    def zeros_like_params(suffix):
        # one zero-initialised shared variable per parameter
        return [theano.shared(p.get_value() * numpy.float32(0.),
                              name='%s_%s' % (k, suffix))
                for k, p in tparams.iteritems()]

    zipped_grads = zeros_like_params('grad')
    running_grads = zeros_like_params('rgrad')
    running_grads2 = zeros_like_params('rgrad2')

    store_grads = list(zip(zipped_grads, grads))
    avg_grads = []
    for rg, g in zip(running_grads, grads):
        avg_grads.append((rg, 0.95 * rg + 0.05 * g))
    avg_sq_grads = []
    for rg2, g in zip(running_grads2, grads):
        avg_sq_grads.append((rg2, 0.95 * rg2 + 0.05 * (g ** 2)))

    f_grad_shared = theano.function(inp, cost,
                                    updates=store_grads + avg_grads + avg_sq_grads,
                                    profile=profile)

    updir = zeros_like_params('updir')
    updir_new = []
    for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads,
                               running_grads2):
        step = 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4)
        updir_new.append((ud, step))

    # parameters move by the freshly computed step (second tuple element)
    param_updates = [(p, p + new_step[1])
                     for p, new_step in zip(itemlist(tparams), updir_new)]
    f_update = theano.function([lr], [], updates=updir_new + param_updates,
                               on_unused_input='ignore', profile=profile)

    print('build optimizer with Rmsprop')
    return f_grad_shared, f_update
130 |
131 |
def sgd(lr, tparams, grads, x, mask, y, cost):
    """Build the two Theano functions of plain stochastic gradient descent.

    Returns `(f_grad_shared, f_update)`:
      * `f_grad_shared(x, mask, y)` evaluates `cost` and stores `grads`
        in one shared buffer per parameter;
      * `f_update(lr)` subtracts `lr` times the stored gradient from each
        parameter.
    """
    # one zero-initialised gradient buffer per parameter
    grad_buffers = []
    for k, p in tparams.iteritems():
        grad_buffers.append(theano.shared(p.get_value() * 0.,
                                          name='%s_grad' % k))
    store_grads = list(zip(grad_buffers, grads))

    f_grad_shared = theano.function([x, mask, y], cost, updates=store_grads,
                                    profile=profile)

    param_updates = []
    for p, g in zip(itemlist(tparams), grad_buffers):
        param_updates.append((p, p - lr * g))
    f_update = theano.function([lr], [], updates=param_updates,
                               profile=profile)

    print('build optimizer with SGD')
    return f_grad_shared, f_update
146 |
147 |
--------------------------------------------------------------------------------
/optimizer.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nyu-dl/dl4mt-simul-trans/392ff3148e944be6fbc475d5285441807902e2e0/optimizer.pyc
--------------------------------------------------------------------------------
/policy.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nyu-dl/dl4mt-simul-trans/392ff3148e944be6fbc475d5285441807902e2e0/policy.pyc
--------------------------------------------------------------------------------
/pretrain_uni.py:
--------------------------------------------------------------------------------
1 | from nmt_uni import train
2 | from config import pretrain_config
3 |
4 |
def main(job_id, params):
    """Print the pretraining settings, run training and return its result.

    ``job_id`` is accepted but not used in this function.  ``params`` is a
    mapping whose entries are forwarded to ``train`` as keyword arguments;
    they are listed in key order before training starts.
    """
    print('pretraining settings:')
    for name in sorted(params):
        print('{}: {}'.format(name, params[name]))

    return train(**params)
12 |
13 | if __name__ == '__main__':
14 | main(0, pretrain_config())
15 |
16 |
17 |
--------------------------------------------------------------------------------
/reward.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nyu-dl/dl4mt-simul-trans/392ff3148e944be6fbc475d5285441807902e2e0/reward.pyc
--------------------------------------------------------------------------------
/run_eval.sh:
--------------------------------------------------------------------------------
# Run the simultaneous-translation evaluation script on gpu2.
# The script name previously read "simualtrans_eval.py", which does not exist
# in the repository; the file is simultrans_eval.py.
THEANO_FLAGS=device=gpu2 python simultrans_eval.py --sample 1 --batchsize 1 --target 10 --sinit 1 --gamma 1 --recurrent True --Rtype 10 --coverage True
2 |
--------------------------------------------------------------------------------
/run_train.sh:
--------------------------------------------------------------------------------
1 | export THEANO_FLAGS=device=gpu1,floatX=float32
2 | python simultrans_train.py
3 |
4 |
--------------------------------------------------------------------------------
/simultrans_eval.py:
--------------------------------------------------------------------------------
1 | """
2 | Simultaneous Machine Translation: Training with Policy Gradient
3 |
4 | """
5 | import argparse
6 | import os
7 | import cPickle as pkl
8 |
9 | from bleu import *
10 | from nmt_uni import *
11 | from policy import Controller as Policy
12 | from utils import Progbar, Monitor
13 |
14 | from simultrans_beam import simultaneous_decoding
15 | from simultrans_model import _seqs2words, _bpe2words, _action2delay, PIPE, _padding
16 |
17 | import time
18 |
19 | numpy.random.seed(19920206)
20 | timer = time.time
21 |
22 | WORK = '/misc/kcgscratch1/ChoGroup/thoma_exp/SimulTrans/'
23 | EXP = WORK
24 |
25 | # check hidden folders
def check_env(root=None):
    """Create the hidden working folders if they are missing.

    The original body referenced ``os.mkdir`` without calling it, so no
    folder was ever created.

    root: prefix the folder names are appended to by plain string
        concatenation, so it should end with a path separator.  Defaults
        to the module-level ``WORK``.
    """
    import os

    if root is None:
        root = WORK

    for name in ('.policy', '.pretrained', '.log',
                 '.config', '.images', '.translate'):
        path = root + name
        if not os.path.exists(path):
            # makedirs also creates a missing workspace root.
            os.makedirs(path)
33 | # run training function:: >>>
34 | def run_simultrans(model,
35 | options_file=None,
36 | config=None,
37 | policy=None,
38 | id=None,
39 | remote=False):
40 | # check environments
41 | check_env()
42 | if id is not None:
43 | fcon = WORK + '.config/{}.conf'.format(id)
44 | if os.path.exists(fcon):
45 | print 'load config files'
46 | policy, config = pkl.load(open(fcon, 'r'))
47 |
48 | # ============================================================================== #
49 | # load model model_options
50 | # ============================================================================== #
51 | _model = model
52 | model = WORK + '.pretrained/{}'.format(model)
53 |
54 | if options_file is not None:
55 | with open(options_file, 'rb') as f:
56 | options = pkl.load(f)
57 | else:
58 | with open('%s.pkl' % model, 'rb') as f:
59 | options = pkl.load(f)
60 |
61 | print 'load options...'
62 | for w, p in sorted(options.items(), key=lambda x: x[0]):
63 | print '{}: {}'.format(w, p)
64 |
65 | # load detail settings from option file:
66 | dictionary, dictionary_target = options['dictionaries']
67 |
def _iter(fname):
    """Yield every line of ``fname`` as a list of word indices.

    Words missing from ``word_dict`` and indices that are not below
    ``options['n_words']`` are both mapped to 1; a trailing 0 is appended
    to every sequence.
    """
    with open(fname, 'r') as f:
        for line in f:
            ids = []
            for token in line.strip().split():
                idx = word_dict[token] if token in word_dict else 1
                if not idx < options['n_words']:
                    idx = 1
                ids.append(idx)
            ids.append(0)
            yield ids
76 |
77 | def _check_length(fname):
78 | f = open(fname, 'r')
79 | count = 0
80 | for _ in f:
81 | count += 1
82 | f.close()
83 |
84 | return count
85 |
86 | # load source dictionary and invert
87 | with open(dictionary, 'rb') as f:
88 | word_dict = pkl.load(f)
89 | word_idict = dict()
90 | for kk, vv in word_dict.iteritems():
91 | word_idict[vv] = kk
92 | word_idict[0] = ''
93 | word_idict[1] = 'UNK'
94 |
95 | # load target dictionary and invert
96 | with open(dictionary_target, 'rb') as f:
97 | word_dict_trg = pkl.load(f)
98 | word_idict_trg = dict()
99 | for kk, vv in word_dict_trg.iteritems():
100 | word_idict_trg[vv] = kk
101 | word_idict_trg[0] = ''
102 | word_idict_trg[1] = 'UNK'
103 |
104 | ## use additional input for the policy network
105 | options['pre'] = config['pre']
106 |
107 | # ================================================================================= #
108 | # Build a Simultaneous Translator
109 | # ================================================================================= #
110 |
111 | # allocate model parameters
112 | params = init_params(options)
113 | params = load_params(model, params)
114 | tparams = init_tparams(params)
115 |
116 | # print 'build the model for computing cost (full source sentence).'
117 | trng, use_noise, \
118 | _x, _x_mask, _y, _y_mask, \
119 | opt_ret, \
120 | cost, f_cost = build_model(tparams, options)
121 | print 'done'
122 |
123 | # functions for sampler
124 | f_sim_ctx, f_sim_init, f_sim_next = build_simultaneous_sampler(tparams, options, trng)
125 |
126 | # function for finetune
127 | if config['finetune'] != 'nope':
128 | f_fine_init, f_fine_cost, f_fine_update = build_fine(tparams, options,
129 | fullmodel=True if config['finetune'] == 'full'
130 | else False)
131 |
132 | def _translate(src, trg, train=False, samples=config['sample'], greedy=False):
133 | ret = simultaneous_decoding(
134 | f_sim_ctx, f_sim_init,
135 | f_sim_next, f_cost,
136 | _policy,
137 | src, trg, word_idict_trg,
138 | step=config['step'], peek=config['peek'], sidx=config['s0'],
139 | n_samples=samples,
140 | reward_config={'target': config['target'],
141 | 'gamma': config['gamma'],
142 | 'Rtype': config['Rtype'],
143 | 'maxsrc': config['maxsrc'],
144 | 'greedy': greedy,
145 | 'upper': config['upper']},
146 | train=train,
147 | use_forget=config['forget'],
148 | use_newinput=config['pre'],
149 | use_coverage=config['coverage'],
150 | on_groundtruth=0 if config['finetune'] == 'nope' else 10)
151 |
152 | print ret
153 | import sys; sys.exit(-1)
154 |
155 |
156 | return ret
157 |
158 | # if not train:
159 | # sample, score, actions, R, tracks, attentions = ret
160 | # return sample, score, actions, R, tracks
161 | # else:
162 | # sample, score, actions, R, info, pipe_t = ret
163 | # return sample, score, actions, R, info, pipe_t
164 |
165 | # check the ID:
166 | policy['base'] = _model
167 | _policy = Policy(trng, options, policy, config,
168 | n_in=options['readout_dim'] + 1 if config['coverage'] else options['readout_dim'],
169 | n_out=3 if config['forget'] else 2,
170 | recurrent=policy['recurrent'], id=id)
171 |
172 | # make the dataset ready for training & validation
173 | # train_ = options['datasets'][0]
174 | # train_num = _check_length
175 | trainIter = TextIterator(options['datasets'][0], options['datasets'][1],
176 | options['dictionaries'][0], options['dictionaries'][1],
177 | n_words_source=options['n_words_src'], n_words_target=options['n_words'],
178 | batch_size=config['batchsize'],
179 | maxlen=options['maxlen'])
180 |
181 | train_num = trainIter.num
182 |
183 | validIter = TextIterator(options['valid_datasets'][0], options['valid_datasets'][1],
184 | options['dictionaries'][0], options['dictionaries'][1],
185 | n_words_source=options['n_words_src'], n_words_target=options['n_words'],
186 | batch_size=1, cache=1,
187 | maxlen=1000000)
188 |
189 | valid_num = validIter.num
190 |
191 | valid_ = options['valid_datasets'][0]
192 | valid_num = _check_length(valid_)
193 | print 'training set {} lines / validation set {} lines'.format(train_num, valid_num)
194 | print 'use the reward function {}'.format(chr(config['Rtype'] + 65))
195 |
196 | # ================================================================================= #
197 | # Main Loop: Run
198 | # ================================================================================= #
199 | print 'Start Simultaneous Translator...'
200 | probar = Progbar(train_num / config['batchsize'], with_history=False)
201 | monitor = None
202 | if remote:
203 | monitor = Monitor(root='http://localhost:9000')
204 |
205 | # freqs
206 | save_freq = 200
207 | sample_freq = 10
208 | valid_freq = 200
209 | valid_size = 200
210 | display_freq = 50
211 | finetune_freq = 5
212 |
213 | history, last_it = _policy.load()
214 | action_space = ['W', 'C', 'F']
215 | Log_avg = {}
216 | time0 = timer()
217 | pipe = PIPE(['x', 'x_mask', 'y', 'y_mask', 'c_mask'])
218 |
219 | for it, (srcs, trgs) in enumerate(trainIter): # only one sentence each iteration
220 | if it < last_it: # go over the scanned lines.
221 | continue
222 |
223 | # for validation
224 | # doing the whole validation!!
225 | reference = []
226 | system = []
227 |
228 | reference2 = []
229 | system2 = []
230 |
231 | if it % valid_freq == 0:
232 | print 'start validation'
233 |
234 | collections = [[], [], [], [], []]
235 | probar_v = Progbar(valid_num / 64 + 1)
236 | for ij, (srcs, trgs) in enumerate(validIter):
237 |
238 | # new_srcs, new_trgs = [], []
239 |
240 | # for src, trg in zip(srcs, trgs):
241 | # if len(src) < config['s0']:
242 | # continue # ignore when the source sentence is less than sidx. we don't use the policy\
243 | # else:
244 | # new_srcs += [src]
245 | # new_trgs += [trg]
246 |
247 | # if len(new_srcs) == 0:
248 | # continue
249 | # srcs, trgs = new_srcs, new_trgs
250 |
251 | statistics = _translate(srcs, trgs, train=False, samples=1, greedy=True)
252 |
253 | quality, delay, reward = zip(*statistics['track'])
254 | reference += statistics['Ref']
255 | system += statistics['Sys']
256 |
257 | # print ' '.join(reference[-1][0])
258 | # print ' '.join(system[-1])
259 |
260 |
261 | # compute the average consecutive waiting length
262 | def _consective(action):
263 | waits = []
264 | temp = 0
265 | for a in action:
266 | if a == 0:
267 | temp += 1
268 | elif temp > 0:
269 | waits += [temp]
270 | temp = 0
271 |
272 | if temp > 0:
273 | waits += [temp]
274 |
275 | mean = numpy.mean(waits)
276 | gec = numpy.max(waits) # numpy.prod(waits) ** (1./len(waits))
277 | return mean, gec
278 |
279 | def _max_length(action):
280 | _cur = 0
281 | _end = 0
282 | _max = 0
283 | for it, a in enumerate(action):
284 | if a == 0:
285 | _cur += 1
286 | elif a == 2:
287 | _end += 1
288 |
289 | temp = _cur - _end
290 | if temp > _max:
291 | _max = temp
292 | return _max
293 |
294 | maxlen = [_max_length(action) for action in statistics['action']]
295 | means, gecs = zip(*(_consective(action) for action in statistics['action']))
296 |
297 | collections[0] += quality
298 | collections[1] += delay
299 | collections[2] += means
300 | collections[3] += gecs
301 | collections[4] += maxlen
302 |
303 | values = [('quality', numpy.mean(quality)), ('delay', numpy.mean(delay)),
304 | ('wait_mean', numpy.mean(means)), ('wait_max', numpy.mean(gecs)),
305 | ('max_len', numpy.mean(maxlen))]
306 | probar_v.update(ij + 1, values=values)
307 |
308 |
309 | validIter.reset()
310 | valid_bleu, valid_delay, valid_wait, valid_wait_gec, valid_mx = [numpy.mean(a) for a in collections]
311 | print 'Iter = {}: AVG BLEU = {}, DELAY = {}, WAIT(MEAN) = {}, WAIT(MAX) = {}, MaxLen={}'.format(
312 | it, valid_bleu, valid_delay, valid_wait, valid_wait_gec, valid_mx)
313 |
314 | print 'Compute the Corpus BLEU={} (greedy)'.format(corpus_bleu(reference, system))
315 |
316 | with open(WORK + '.translate/test.txt', 'w') as fout:
317 | for sys in system:
318 | fout.write('{}\n'.format(' '.join(sys)))
319 |
320 | with open(WORK + '.translate/ref.txt', 'w') as fout:
321 | for ref in reference:
322 | fout.write('{}\n'.format(' '.join(ref[0])))
323 |
324 |
325 |
326 | if config['upper']:
327 | print 'done'
328 | import sys; sys.exit(-1)
329 |
330 |
331 | # training set sentence tuning
332 | new_srcs, new_trgs = [], []
333 | for src, trg in zip(srcs, trgs):
334 | if len(src) <= config['s0']:
335 | continue # ignore when the source sentence is less than sidx. we don't use the policy\
336 | else:
337 | new_srcs += [src]
338 | new_trgs += [trg]
339 |
340 | if len(new_srcs) == 0:
341 | continue
342 |
343 | srcs, trgs = new_srcs, new_trgs
344 | try:
345 | statistics, info, pipe_t = _translate(srcs, trgs, train=True)
346 | except Exception:
347 | print 'translate a empty sentence. bug.'
348 | continue
349 |
350 |
351 | # samples, scores, actions, rewards, info, pipe_t = _translate(srcs, trgs, train=True)
352 | # print pipe_t
353 |
354 |
355 | if config['finetune'] != 'nope':
356 |
357 | for idx, act in enumerate(pipe_t['action']):
358 | _start = 0
359 | _end = 0
360 | _mask = [0 for _ in srcs[0]]
361 | _cmask = []
362 |
363 | pipe.messages['x'] += srcs
364 | pipe.messages['y'] += [pipe_t['sample'][idx]]
365 |
366 | for a in act:
367 | # print _start, _end
368 | if a == 0:
369 | _mask[_start] = 1
370 | _start += 1
371 | elif a == 2:
372 | _mask[_end] = 0
373 | _end += 1
374 | else:
375 | _cmask.append(_mask)
376 | # print numpy.asarray(_cmask).shape
377 |
378 | pipe.messages['c_mask'].append(_cmask)
379 |
380 | if it % finetune_freq == (finetune_freq - 1):
381 | num = len(pipe.messages['x'])
382 | max_x = max([len(v) for v in pipe.messages['x']])
383 | max_y = max([len(v) for v in pipe.messages['y']])
384 |
385 | xx, xx_mask = _padding(pipe.messages['x'], shape=(max_x, num), return_mask=True, dtype='int64')
386 | yy, yy_mask = _padding(pipe.messages['y'], shape=(max_y, num), return_mask=True, dtype='int64')
387 | cc_mask = _padding(pipe.messages['c_mask'], shape=(max_y, num, max_x)).transpose([0, 2, 1])
388 |
389 | # fine-tune the EncDec of translation
390 | if config['finetune'] == 'full':
391 | cost = f_fine_cost(xx, xx_mask, yy, yy_mask, cc_mask)
392 | elif config['finetune'] == 'decoder':
393 | cost = f_fine_cost(xx, xx_mask, yy, yy_mask, cc_mask)
394 | else:
395 | raise NotImplementedError
396 |
397 | print '\nIter={} || cost = {}'.format(it, cost[0])
398 | f_fine_update(0.00001)
399 | pipe.reset()
400 |
401 | if it % sample_freq == 0:
402 |
403 | print '\nModel:{} has been trained for {} hours'.format(_policy.id, (timer() - time0) / 3600.)
404 | print 'source: ', _bpe2words(_seqs2words([srcs[0]], word_idict))[0]
405 | print 'target: ', _bpe2words(_seqs2words([trgs[0]], word_idict_trg))[0]
406 |
407 | # obtain the translation results
408 | samples = _bpe2words(_seqs2words(statistics['sample'], word_idict_trg))
409 |
410 | # obtain the delay (normalized)
411 | # delays = _action2delay(srcs[0], statistics['action'])
412 |
413 | c = 0
414 | for j in xrange(len(samples)):
415 |
416 | if statistics['secs'][j][0] == 0:
417 | if c < 5:
418 | c += 1
419 |
420 | print '---ID: {}'.format(_policy.id)
421 | print 'sample: ', samples[j]
422 | # print 'action: ', ','.join(
423 | # ['{}({})'.format(action_space[t], f)
424 | # for t, f in
425 | # zip(statistics['action'][j], statistics['forgotten'][j])])
426 |
427 | print 'action: ', ','.join(
428 | ['{}'.format(action_space[t])
429 | for t in statistics['action'][j]])
430 |
431 | print 'quality:', statistics['track'][j][0]
432 | print 'delay:', statistics['track'][j][1]
433 | # print 'score:', statistics['score'][j]
434 | break
435 |
436 | values = [(w, info[w]) for w in info]
437 | probar.update(it + 1, values=values)
438 |
439 |
440 | # NaN detector
441 | for w in info:
442 | if numpy.isnan(info[w]) or numpy.isinf(info[w]):
443 | raise RuntimeError, 'NaN/INF is detected!! {} : ID={}'.format(w, id)
444 |
445 | # remote display
446 | if remote:
447 | logs = {'R': info['R'], 'Q': info['Q'],
448 | 'D': info['D'], 'P': float(info['P'])}
449 | # print logs
450 | for w in logs:
451 | Log_avg[w] = Log_avg.get(w, 0) + logs[w]
452 |
453 | if it % display_freq == (display_freq - 1):
454 | for w in Log_avg:
455 | Log_avg[w] /= display_freq
456 |
457 | monitor.display(it + 1, Log_avg)
458 | Log_avg = dict()
459 |
460 | # save the history & model
461 | history += [info]
462 | if it % save_freq == 0:
463 | _policy.save(history, it)
464 |
465 |
if __name__ == "__main__":

    def _str2bool(value):
        """Convert a command-line token such as ``True``/``False`` to bool.

        Without a converter argparse stores the raw string, and any
        non-empty string (including ``'False'``) is truthy, so a flag like
        ``--forget False`` used to switch the feature on.
        """
        if isinstance(value, bool):
            return value
        text = str(value).strip().lower()
        if text in ('true', 't', 'yes', 'y', '1'):
            return True
        if text in ('false', 'f', 'no', 'n', '0', ''):
            return False
        raise argparse.ArgumentTypeError(
            'expected a boolean value, got {!r}'.format(value))

    parser = argparse.ArgumentParser()
    parser.add_argument('-s', '--step', type=int, default=1)
    parser.add_argument('-k', '--peek', type=int, default=1)
    parser.add_argument('-i', '--sinit', type=int, default=1)
    parser.add_argument('-n', '--sample', type=int, default=20)
    parser.add_argument('-b', '--batchsize', type=int, default=10)
    parser.add_argument('-c', action="store_true", default=False)
    parser.add_argument('-o', type=str, default=None)

    parser.add_argument('--updater', type=str, default='REINFORCE')
    parser.add_argument('--recurrent', type=_str2bool, default=False)
    parser.add_argument('--layernorm', type=_str2bool, default=False)
    parser.add_argument('--upper', type=_str2bool, default=False)
    parser.add_argument('--target', type=float, default=0.5)
    parser.add_argument('--gamma', type=float, default=10)
    parser.add_argument('--prop', type=float, default=0.5)  # only useful for random policy
    parser.add_argument('--Rtype', type=int, default=0)  # 0, 1, 2, 3
    parser.add_argument('--forget', type=_str2bool, default=False)
    parser.add_argument('--maxsrc', type=float, default=10)
    parser.add_argument('--pre', type=_str2bool, default=False)
    parser.add_argument('--coverage', type=_str2bool, default=False)
    parser.add_argument('--finetune', type=str, default='nope')
    parser.add_argument('--id', type=str, default=None)
    # Alternative pretrained model: 'model_wmt15_bpe2k_uni_en-de.npz'
    parser.add_argument('-m', '--model', type=str,
                        default='model_wmt15_bpe2k_uni_en-ru.npz')
    parser.add_argument('--remote', type=_str2bool, default=False)
    args = parser.parse_args()
    print(args)  # print settings

    # Settings handed to the policy network.
    policy = OrderedDict()
    policy['prop'] = args.prop
    policy['recurrent'] = args.recurrent
    policy['layernorm'] = args.layernorm
    policy['updater'] = args.updater
    policy['act_mask'] = True

    # Settings for decoding and the reward.
    config = OrderedDict()
    config['step'] = args.step
    config['peek'] = args.peek
    config['s0'] = args.sinit
    config['sample'] = args.sample
    config['batchsize'] = args.batchsize
    config['target'] = args.target
    config['gamma'] = args.gamma
    config['Rtype'] = args.Rtype
    config['forget'] = args.forget
    config['maxsrc'] = args.maxsrc
    config['pre'] = args.pre
    config['coverage'] = args.coverage
    # NOTE: --upper is parsed above but this entry is always False.
    config['upper'] = False

    config['finetune'] = args.finetune

    run_simultrans(args.model,
                   options_file=args.o,
                   config=config,
                   policy=policy,
                   id=args.id,
                   remote=args.remote)
528 |
529 |
530 |
531 |
532 |
--------------------------------------------------------------------------------
/simultrans_model.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nyu-dl/dl4mt-simul-trans/392ff3148e944be6fbc475d5285441807902e2e0/simultrans_model.pyc
--------------------------------------------------------------------------------
/simultrans_model_clean.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nyu-dl/dl4mt-simul-trans/392ff3148e944be6fbc475d5285441807902e2e0/simultrans_model_clean.pyc
--------------------------------------------------------------------------------
/simultrans_train.py:
--------------------------------------------------------------------------------
1 | """
2 | Simultaneous Machine Translation: Training with Policy Gradient
3 |
4 | """
5 | import argparse
6 | import os
7 | import cPickle as pkl
8 |
9 | from bleu import *
10 | from nmt_uni import *
11 | from policy import Controller as Policy
12 | from utils import Progbar, Monitor
13 | from data_iterator import check_length, iterate
14 |
15 | from simultrans_model_clean import simultaneous_decoding
16 | from simultrans_model_clean import _seqs2words, _bpe2words, _padding
17 | from actors import get_actor
18 | import time
19 |
20 | numpy.random.seed(19920206)
21 | timer = time.time
22 |
23 |
24 | # run training function:: >>>
25 | def run_simultrans(model,
26 | options_file=None,
27 | config=None,
28 | id=None,
29 | remote=False):
30 |
31 | WORK = config['workspace']
32 |
33 | # check hidden folders
34 | paths = ['.policy', '.pretrained', '.log', '.config', '.images', '.translate']
35 | for p in paths:
36 | p = WORK + p
37 | if not os.path.exists(p):
38 | os.mkdir(p)
39 |
40 | if id is not None:
41 | fcon = WORK + '.config/{}.conf'.format(id)
42 | if os.path.exists(fcon):
43 | print 'load config files'
44 | policy, config = pkl.load(open(fcon, 'r'))
45 |
46 | # ============================================================================== #
47 | # load model model_options
48 | # ============================================================================== #
49 | _model = model.split('/')[-1]
50 |
51 | if options_file is not None:
52 | with open(options_file, 'rb') as f:
53 | options = pkl.load(f)
54 | else:
55 | with open('%s.pkl' % model, 'rb') as f:
56 | options = pkl.load(f)
57 |
58 | print 'merge configuration into options'
59 | for w in config:
60 | # if (w in options) and (config[w] is not None):
61 | options[w] = config[w]
62 |
63 | print 'load options...'
64 | for w, p in sorted(options.items(), key=lambda x: x[0]):
65 | print '{}: {}'.format(w, p)
66 |
67 | # load detail settings from option file:
68 | dictionary, dictionary_target = options['dictionaries']
69 |
70 | # load source dictionary and invert
71 | with open(dictionary, 'rb') as f:
72 | word_dict = pkl.load(f)
73 | word_idict = dict()
74 | for kk, vv in word_dict.iteritems():
75 | word_idict[vv] = kk
76 | word_idict[0] = ''
77 | word_idict[1] = 'UNK'
78 |
79 | # load target dictionary and invert
80 | with open(dictionary_target, 'rb') as f:
81 | word_dict_trg = pkl.load(f)
82 | word_idict_trg = dict()
83 | for kk, vv in word_dict_trg.iteritems():
84 | word_idict_trg[vv] = kk
85 | word_idict_trg[0] = ''
86 | word_idict_trg[1] = 'UNK'
87 |
88 | options['pre'] = config['pre']
89 |
90 | # ========================================================================= #
91 | # Build a Simultaneous Translator
92 | # ========================================================================= #
93 |
94 | # allocate model parameters
95 | params = init_params(options)
96 | params = load_params(model, params)
97 | tparams = init_tparams(params)
98 |
99 | # print 'build the model for computing cost (full source sentence).'
100 | trng, use_noise, \
101 | _x, _x_mask, _y, _y_mask, \
102 | opt_ret, \
103 | cost, f_cost = build_model(tparams, options)
104 | print 'done'
105 |
106 | # functions for sampler
107 | f_sim_ctx, f_sim_init, f_sim_next = build_simultaneous_sampler(tparams, options, trng)
108 |
109 | # function for finetune the underlying model
110 | if options['finetune']:
111 | ff_init, ff_cost, ff_update = build_simultaneous_model(tparams, options, rl=True)
112 | funcs = [f_sim_ctx, f_sim_init, f_sim_next, f_cost, ff_init, ff_cost, ff_update]
113 |
114 | else:
115 | funcs = [f_sim_ctx, f_sim_init, f_sim_next, f_cost]
116 |
117 | # build a res-predictor
118 | if options['predict']:
119 | params_act = get_actor('gru')[0](options, prefix='pdt',
120 | nin=options['dim'])
121 | pass
122 |
123 |
124 | # check the ID:
125 | options['base'] = _model
126 | agent = Policy(trng, options,
127 | n_in=options['readout_dim'] + 1 if options['coverage'] else options['readout_dim'],
128 | n_out=3 if config['forget'] else 2,
129 | recurrent=options['recurrent'], id=id)
130 |
131 | # make the dataset ready for training & validation
132 | trainIter = TextIterator(options['datasets'][0], options['datasets'][1],
133 | options['dictionaries'][0], options['dictionaries'][1],
134 | n_words_source=options['n_words_src'], n_words_target=options['n_words'],
135 | batch_size=config['batchsize'],
136 | maxlen=options['maxlen'])
137 |
138 | train_num = trainIter.num
139 |
140 | validIter = TextIterator(options['valid_datasets'][0], options['valid_datasets'][1],
141 | options['dictionaries'][0], options['dictionaries'][1],
142 | n_words_source=options['n_words_src'], n_words_target=options['n_words'],
143 | batch_size=20, cache=10,
144 | maxlen=1000000)
145 |
146 | valid_num = validIter.num
147 | print 'training set {} lines / validation set {} lines'.format(train_num, valid_num)
148 | print 'use the reward function {}'.format(chr(config['Rtype'] + 65))
149 |
150 | # ========================================================================== #
151 | # Main Loop: Run
152 | # ========================================================================== #
153 | print 'Start Simultaneous Translator...'
154 | monitor = None
155 | if remote:
156 | monitor = Monitor(root='http://localhost:9000')
157 |
158 | # freqs
159 | save_freq = 200
160 | sample_freq = 10
161 | valid_freq = 200
162 | valid_size = 200
163 | display_freq = 50
164 | finetune_freq = 5
165 |
166 | history, last_it = agent.load()
167 | action_space = ['W', 'C', 'F']
168 | Log_avg = {}
169 | time0 = timer()
170 |
171 | pipe = OrderedDict()
172 | for key in ['x', 'x_mask', 'y', 'y_mask', 'c_mask']:
173 | pipe[key] = []
174 |
175 | def _translate(src, trg, samples=None, train=False,
176 | greedy=False, show=False, full=False):
177 | time0 = time.time()
178 | if full:
179 | options1 = copy.copy(options)
180 | options1['upper'] = True
181 | else:
182 | options1 = options
183 |
184 | ret = simultaneous_decoding(
185 | funcs, agent, options1,
186 | src, trg, word_idict_trg,
187 | samples, greedy, train)
188 |
189 | if show:
190 | info = ret[1]
191 | values = [(w, float(info[w])) for w in info if w != 'advantages']
192 | print ' , '.join(['{}={:.3f}'.format(k, f) for k, f in values]),
193 | print '...{}s'.format(time.time() - time0)
194 |
195 | return ret
196 |
197 | for it, (srcs, trgs) in enumerate(trainIter): # only one sentence each iteration
198 | if it < last_it: # go over the scanned lines.
199 | continue
200 |
201 | # for validation
202 | # doing the whole validation!!
203 | reference = []
204 | system = []
205 |
206 | if it % valid_freq == (valid_freq-1):
207 | print 'start validation'
208 |
209 | collections = [[], [], [], [], []]
210 | probar_v = Progbar(valid_num / 20 + 1)
211 | for ij, (srcs, trgs) in enumerate(validIter):
212 |
213 | statistics = _translate(srcs, trgs, samples=1, train=False, greedy=True)
214 |
215 | quality, delay, reward = zip(*statistics['track'])
216 | reference += statistics['Ref']
217 | system += statistics['Sys']
218 |
219 | # compute the average consecutive waiting length
220 | def _consective(action):
221 | waits = []
222 | temp = 0
223 | for a in action:
224 | if a == 0:
225 | temp += 1
226 | elif temp > 0:
227 | waits += [temp]
228 | temp = 0
229 |
230 | if temp > 0:
231 | waits += [temp]
232 |
233 | mean = numpy.mean(waits)
234 | gec = numpy.max(waits) # numpy.prod(waits) ** (1./len(waits))
235 | return mean, gec
236 |
237 | def _max_length(action):
238 | _cur = 0
239 | _end = 0
240 | _max = 0
241 | for it, a in enumerate(action):
242 | if a == 0:
243 | _cur += 1
244 | elif a == 2:
245 | _end += 1
246 |
247 | temp = _cur - _end
248 | if temp > _max:
249 | _max = temp
250 | return _max
251 |
252 | maxlen = [_max_length(action) for action in statistics['action']]
253 | means, gecs = zip(*(_consective(action) for action in statistics['action']))
254 |
255 | collections[0] += quality
256 | collections[1] += delay
257 | collections[2] += means
258 | collections[3] += gecs
259 | collections[4] += maxlen
260 |
261 | values = [('quality', numpy.mean(quality)), ('delay', numpy.mean(delay)),
262 | ('wait_mean', numpy.mean(means)), ('wait_max', numpy.mean(gecs)),
263 | ('max_len', numpy.mean(maxlen))]
264 | probar_v.update(ij + 1, values=values)
265 |
266 | validIter.reset()
267 | valid_bleu, valid_delay, valid_wait, valid_wait_gec, valid_mx = [numpy.mean(a) for a in collections]
268 | print 'Iter = {}: AVG BLEU = {}, DELAY = {}, WAIT(MEAN) = {}, WAIT(MAX) = {}, MaxLen={}'.format(
269 | it, valid_bleu, valid_delay, valid_wait, valid_wait_gec, valid_mx)
270 |
271 | print 'Compute the Corpus BLEU={} (greedy)'.format(corpus_bleu(reference, system))
272 |
273 | with open(WORK + '.translate/test.txt', 'w') as fout:
274 | for sys in system:
275 | fout.write('{}\n'.format(' '.join(sys)))
276 |
277 | with open(WORK + '.translate/ref.txt', 'w') as fout:
278 | for ref in reference:
279 | fout.write('{}\n'.format(' '.join(ref[0])))
280 |
281 | history += [collections]
282 | print 'done'
283 |
284 | if options['upper']:
285 | print 'done'
286 | import sys; sys.exit(-1)
287 |
288 | # training set sentence tuning
289 | new_srcs, new_trgs = [], []
290 | for src, trg in zip(srcs, trgs):
291 | if len(src) <= options['s0']:
292 | continue # ignore when the source sentence is less than sidx.
293 | else:
294 | new_srcs += [src]
295 | new_trgs += [trg]
296 |
297 | if len(new_srcs) == 0:
298 | continue
299 |
300 | srcs, trgs = new_srcs, new_trgs
301 | statistics, info = _translate(srcs, trgs, train=True, show=True)
302 |
303 | if it % sample_freq == 0:
304 |
305 | # obtain the translation results
306 | samples = _bpe2words(
307 | _seqs2words(statistics['sample'], word_idict_trg,
308 | statistics['action'], 1))
309 | sources = _bpe2words(
310 | _seqs2words(statistics['SWord'], word_idict,
311 | statistics['action'], 0))
312 | targets = _bpe2words(
313 | _seqs2words(statistics['TWord'], word_idict_trg))
314 |
315 | # obtain the delay (normalized)
316 | # delays = _action2delay(srcs[0], statistics['action'])
317 |
318 | c = 0
319 | for j in xrange(len(samples)):
320 |
321 | if statistics['seq_info'][j][0] == 0:
322 | if c < (config['sample']/2.):
323 | c += 1
324 | continue
325 |
326 | print '--Iter: {}'.format(it)
327 | print 'source: ', sources[j]
328 | print 'sample: ', samples[j]
329 | print 'target: ', targets[j]
330 | print 'quality:', statistics['track'][j][0]
331 | print 'delay:', statistics['track'][j][1]
332 | print 'reward:', statistics['track'][j][2]
333 | break
334 |
335 |
336 | # NaN detector
337 | #for w in info:
338 | # if numpy.isnan(info[w]) or numpy.isinf(info[w]):
339 | # raise RuntimeError, 'NaN/INF is detected!! {} : ID={}'.format(w, id)
340 |
341 | # remote display
342 | if remote:
343 | logs = {'R': info['R'], 'Q': info['Q'],
344 | 'D': info['D'], 'P': float(info['P'])}
345 | if 'a_cost' in info:
346 | logs['A'] = info['a_cost']
347 |
348 | print logs
349 | for w in logs:
350 | Log_avg[w] = Log_avg.get(w, 0) + logs[w]
351 |
352 | if it % display_freq == (display_freq - 1):
353 | for w in Log_avg:
354 | Log_avg[w] /= display_freq
355 |
356 | monitor.display(it + 1, Log_avg)
357 | Log_avg = dict()
358 |
359 | # save the history & model
360 | history += [info]
361 | if it % save_freq == 0:
362 | agent.save(history, it)
363 |
364 |
365 | if __name__ == "__main__":
366 | from config import rl_config
367 | config = rl_config()
368 |
369 | run_simultrans(config['model'],
370 | options_file=config['option'],
371 | config=config,
372 | id=None,
373 | remote=False)
374 |
375 |
376 |
377 |
378 |
--------------------------------------------------------------------------------
/translate_uni.py:
--------------------------------------------------------------------------------
1 | '''
2 | Translates a source file using a translation model.
3 | '''
4 | import theano
5 | import argparse
6 |
7 | import numpy
8 | import cPickle as pkl
9 |
10 | from nmt_uni import (build_model, build_sampler, gen_sample, load_params,
11 | init_params, init_tparams, prepare_data)
12 |
13 | from multiprocessing import Process, Queue
14 |
15 |
16 | def translate_model(queue, rqueue, pid, model, options, k, normalize, kp, sigma):
17 |
18 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
19 | trng = RandomStreams(1234)
20 |
21 | # allocate model parameters
22 | params = init_params(options)
23 |
24 | # load model parameters and set theano shared variables
25 | params = load_params(model, params)
26 | tparams = init_tparams(params)
27 |
28 | trng, use_noise, \
29 | x, x_mask, y, y_mask, \
30 | opt_ret, \
31 | cost = \
32 | build_model(tparams, options)
33 | inps = [x, x_mask, y, y_mask]
34 |
35 | f_log_probs = theano.function(inps, cost)
36 |
37 | # word index
38 | f_init, f_next = build_sampler(tparams, options, trng)
39 |
40 | def _translate(idx, seq):
41 | all_samples = []
42 | all_scores = []
43 | all_c = []
44 | for kidx in xrange(kp):
45 | if kidx == 0:
46 | ss = -1.
47 | else:
48 | ss = sigma
49 | # sample given an input sequence and obtain scores
50 | sample, score, c = gen_sample(tparams, f_init, f_next,
51 | numpy.array(seq).reshape([len(seq), 1]),
52 | options, trng=trng, k=1, maxlen=200,
53 | stochastic=True, argmax=True, sigma=ss)
54 |
55 | # normalize scores according to sequence lengths
56 | if normalize:
57 | lengths = numpy.array([len(s) for s in sample])
58 | score = score / lengths
59 | #print idx, score
60 | sidx = numpy.argmin(score)
61 | all_samples.append(sample[sidx])
62 | all_scores.append(score[sidx])
63 | all_c.append(c[0])
64 |
65 | source_list = [seq] * kp
66 | x, x_mask, y, y_mask = prepare_data(source_list, all_samples, maxlen=None)
67 | all_scores = f_log_probs(x, x_mask, y, y_mask)
68 | if normalize:
69 | lengths = numpy.array([len(s) for s in all_samples])
70 | all_scores = all_scores / lengths
71 |
72 | print idx, all_scores
73 | sidx = numpy.argmin(all_scores)
74 | return all_samples[sidx], all_c[sidx]
75 |
76 | while True:
77 | req = queue.get()
78 | if req is None:
79 | break
80 |
81 | idx, x = req[0], req[1]
82 | print pid, '-', idx
83 | seq = _translate(idx, x)
84 |
85 | rqueue.put((idx, seq))
86 |
87 | return
88 |
89 |
90 | def main(model, dictionary, dictionary_target, source_file, saveto, k=5,
91 | normalize=False, n_process=5, chr_level=False,
92 | options_file=None, sigma=-1., kp=1):
93 |
94 | # load model model_options
95 | if options_file is not None:
96 | with open(options_file, 'rb') as f:
97 | options = pkl.load(f)
98 | else:
99 | with open('%s.pkl' % model, 'rb') as f:
100 | options = pkl.load(f)
101 |
102 | # load source dictionary and invert
103 | with open(dictionary, 'rb') as f:
104 | word_dict = pkl.load(f)
105 | word_idict = dict()
106 | for kk, vv in word_dict.iteritems():
107 | word_idict[vv] = kk
108 | word_idict[0] = ''
109 | word_idict[1] = 'UNK'
110 |
111 | # load target dictionary and invert
112 | with open(dictionary_target, 'rb') as f:
113 | word_dict_trg = pkl.load(f)
114 | word_idict_trg = dict()
115 | for kk, vv in word_dict_trg.iteritems():
116 | word_idict_trg[vv] = kk
117 | word_idict_trg[0] = ''
118 | word_idict_trg[1] = 'UNK'
119 |
120 | # create input and output queues for processes
121 | queue = Queue()
122 | rqueue = Queue()
123 | processes = [None] * n_process
124 | for midx in xrange(n_process):
125 | processes[midx] = Process(
126 | target=translate_model,
127 | args=(queue, rqueue, midx, model, options, k, normalize, kp, sigma))
128 | processes[midx].start()
129 |
130 | # utility function
131 | def _seqs2words(caps):
132 | capsw = []
133 | for cc in caps:
134 | ww = []
135 | for w in cc:
136 | if w == 0:
137 | break
138 | ww.append(word_idict_trg[w])
139 | capsw.append(' '.join(ww))
140 | return capsw
141 |
142 | def _send_jobs(fname):
143 | with open(fname, 'r') as f:
144 | for idx, line in enumerate(f):
145 | if chr_level:
146 | words = list(line.decode('utf-8').strip())
147 | else:
148 | words = line.strip().split()
149 | x = map(lambda w: word_dict[w] if w in word_dict else 1, words)
150 | x = map(lambda ii: ii if ii < options['n_words'] else 1, x)
151 | x += [0]
152 | queue.put((idx, x))
153 | return idx+1
154 |
155 | def _finish_processes():
156 | for midx in xrange(n_process):
157 | queue.put(None)
158 |
159 | def _retrieve_jobs(n_samples):
160 | trans = [None] * n_samples
161 | c = [None] * n_samples
162 | for idx in xrange(n_samples):
163 | resp = rqueue.get()
164 | trans[resp[0]] = resp[1][0]
165 | c[resp[0]] = resp[1][1]
166 | if numpy.mod(idx, 10) == 0:
167 | print 'Sample ', (idx+1), '/', n_samples, ' Done'
168 |
169 | return trans, c
170 |
171 | print 'Translating ', source_file, '...'
172 | n_samples = _send_jobs(source_file)
173 | trans, c = _retrieve_jobs(n_samples)
174 | trans = _seqs2words(trans)
175 | _finish_processes()
176 | with open(saveto, 'w') as f:
177 | print >>f, '\n'.join(trans)
178 | print >>f, '{}\n'.format(c)
179 | print 'Done'
180 |
181 |
if __name__ == "__main__":
    # Command-line front end: every option maps onto one argument of main().
    parser = argparse.ArgumentParser(
        description='Translates a source file using a translation model.')
    parser.add_argument('-k', type=int, default=5,
                        help='k value forwarded to the worker processes '
                             '(default: 5)')
    parser.add_argument('-kp', type=int, default=1,
                        help='number of candidates sampled per source '
                             'sentence (default: 1)')
    parser.add_argument('-p', type=int, default=5,
                        help='number of worker processes (default: 5)')
    parser.add_argument('-n', action="store_true", default=False,
                        help='normalize scores by sample length')
    parser.add_argument('-c', action="store_true", default=False,
                        help='split source lines into characters instead of '
                             'whitespace-separated tokens')
    parser.add_argument('-o', type=str, default=None,
                        help='pickled model options file '
                             '(default: <model>.pkl)')
    parser.add_argument('-s', type=float, default=-1.,
                        help='sigma passed to the sampler for every candidate '
                             'after the first (default: -1)')
    parser.add_argument('model', type=str,
                        help='path of the model parameters')
    parser.add_argument('dictionary', type=str,
                        help='pickled source dictionary')
    parser.add_argument('dictionary_target', type=str,
                        help='pickled target dictionary')
    parser.add_argument('source', type=str,
                        help='source text file, one sentence per line')
    parser.add_argument('saveto', type=str,
                        help='file the translations are written to')

    args = parser.parse_args()

    main(args.model, args.dictionary, args.dictionary_target, args.source,
         args.saveto, k=args.k, normalize=args.n, n_process=args.p,
         chr_level=args.c, options_file=args.o, kp=args.kp, sigma=args.s)
202 |
--------------------------------------------------------------------------------
/translate_uni.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Translate the en->ru dev set with translate_uni.py, then score the output
# with multi-bleu.perl against the reference.

# Stop on the first failing command or unset variable, so BLEU is never
# computed on a stale or missing output file.
set -eu

dataset=dev
model="/work/jg5223/work/SimulTrans/.pretrained/model_wmt15_bpe2k_uni_en-ru.npz"
dict="/scratch/jg5223/data/wmt15/ruen/train/all_ru-en.en.tok.bpe.word.pkl"
dict_rev="/scratch/jg5223/data/wmt15/ruen/train/all_ru-en.ru.tok.bpe.word.pkl"
source="/scratch/jg5223/data/wmt15/ruen/${dataset}/newstest2013-src.en.tok.bpe"
saveto="./enrugreedy.out"
reference="/scratch/jg5223/data/wmt15/ruen/${dataset}/newstest2013-src.ru.tok"

# pyenv local anaconda-2.4.0
# Paths are quoted so that directories containing spaces do not split into
# several arguments.
THEANO_FLAGS="floatX=float32, device=cpu" python translate_uni.py -p 8 -k 1 \
    "$model" "$dict" "$dict_rev" "$source" "$saveto"

./data/multi-bleu.perl "$reference" < "$saveto"
14 |
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is for functions to help the translation
3 | """
4 | import numpy as np
5 | import time
6 | import sys
7 | import json
8 |
class Monitor(object):
    '''
    Sends training statistics to a remote monitoring server over HTTP.
    '''
    def __init__(self, root='http://localhost:9000'):
        '''
        @param root: base URL of the monitoring server
        '''
        self.root = root

    def display(self, batch, logs=None):
        '''
        POST one record to '<root>/publish/epoch/end/'.

        @param batch: value sent under the 'epoch' key
        @param logs: optional mapping of extra values to send; its entries
                     are copied into the record after 'epoch' is set
        A failed request only prints a warning; it never raises.
        '''
        import requests
        send = {'epoch': batch}
        if logs is not None:
            for k, v in logs.items():
                send[k] = v

        try:
            requests.post(self.root + '/publish/epoch/end/',
                          {'data': json.dumps(send)})
        except Exception:
            # Best effort: monitoring must not stop training. Catching
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            print('Warning: could not reach RemoteMonitor '
                  'root server at ' + str(self.root))
26 |
27 |
28 |
class Progbar(object):
    '''
    Text progress bar written to stdout that also shows running averages
    of named values.
    '''
    def __init__(self, target, width=30, verbose=1, with_history=True):
        '''
            @param target: total number of steps expected
            @param width: width of the bar in characters
            @param verbose: 1 redraws the bar on every update, 2 prints a
                            single summary line once the target is reached
            @param with_history: if False, the accumulated averages are
                                 discarded at the start of every update
        '''
        self.width = width
        self.target = target
        self.sum_values = {}
        self.unique_values = []
        self.start = time.time()
        self.total_width = 0
        self.seen_so_far = 0
        self.verbose = verbose
        self.with_history = with_history

    def update(self, current, values=None):
        '''
            @param current: index of current step
            @param values: list of tuples (name, value_for_last_step).
            The progress bar will display averages for these values.
        '''
        if values is None:
            values = []

        if not self.with_history:
            self.sum_values = {}
            self.unique_values = []

        # each value is weighted by the number of steps since the last update
        step = current - self.seen_so_far
        for k, v in values:
            if k not in self.sum_values:
                self.sum_values[k] = [v * step, step]
                self.unique_values.append(k)
            else:
                self.sum_values[k][0] += v * step
                self.sum_values[k][1] += step
        self.seen_so_far = current

        now = time.time()
        if self.verbose == 1:
            # move the cursor back to the start of the line to redraw
            prev_total_width = self.total_width
            sys.stdout.write("\b" * prev_total_width)
            sys.stdout.write("\r")

            # A non-positive target has no logarithm and no meaningful
            # ratio: size the counter for one digit and draw a full bar
            # instead of crashing.
            has_target = self.target > 0
            numdigits = int(np.floor(np.log10(self.target if has_target else 1))) + 1
            barstr = '%%%dd/%%%dd [' % (numdigits, numdigits)
            bar = barstr % (current, self.target)
            prog = float(current)/self.target if has_target else 1.0
            prog_width = int(self.width*prog)
            if prog_width > 0:
                bar += ('.'*(prog_width-1))
                if current < self.target:
                    bar += '(-w-)'
                else:
                    bar += '(-v-)!!'
            bar += ('~' * (self.width-prog_width))
            bar += ']'
            sys.stdout.write(bar)
            self.total_width = len(bar)

            if current:
                time_per_unit = (now - self.start) / current
            else:
                time_per_unit = 0
            eta = time_per_unit*(self.target - current)
            info = ''
            if current < self.target:
                info += ' - ETA: %ds' % eta
            else:
                info += ' - %ds' % (now - self.start)
            for k in self.unique_values:
                # perplexity-like values are averaged in log space and
                # exponentiated for display
                if k == 'perplexity' or k == 'PPL':
                    info += ' - %s: %.4f' % (k, np.exp(self.sum_values[k][0] / max(1, self.sum_values[k][1])))
                else:
                    info += ' - %s: %.4f' % (k, self.sum_values[k][0] / max(1, self.sum_values[k][1]))

            # pad with spaces so a shorter line fully overwrites the old one
            self.total_width += len(info)
            if prev_total_width > self.total_width:
                info += ((prev_total_width-self.total_width) * " ")

            sys.stdout.write(info)
            sys.stdout.flush()

            if current >= self.target:
                sys.stdout.write("\n")

        if self.verbose == 2:
            if current >= self.target:
                info = '%ds' % (now - self.start)
                for k in self.unique_values:
                    info += ' - %s: %.4f' % (k, self.sum_values[k][0] / max(1, self.sum_values[k][1]))
                sys.stdout.write(info + "\n")

    def add(self, n, values=None):
        '''
            Advance the bar by `n` steps.

            @param n: number of steps done since the last update
            @param values: same as in `update`
        '''
        self.update(self.seen_so_far + n, values)

    def clear(self):
        '''
            Reset the accumulated values, the step counter and the line
            width; the start time is left untouched.
        '''
        self.sum_values = {}
        self.unique_values = []
        self.total_width = 0
        self.seen_so_far = 0
126 |
--------------------------------------------------------------------------------
/utils.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nyu-dl/dl4mt-simul-trans/392ff3148e944be6fbc475d5285441807902e2e0/utils.pyc
--------------------------------------------------------------------------------
/utils/msyh.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nyu-dl/dl4mt-simul-trans/392ff3148e944be6fbc475d5285441807902e2e0/utils/msyh.ttf
--------------------------------------------------------------------------------