├── .gitignore ├── CHANGELOG.md ├── Dockerfile.cpu ├── Dockerfile.gpu ├── LICENSE ├── README.md ├── data ├── README.md ├── build_dictionary.py ├── length.py ├── merge.sh ├── multi-bleu-detok.perl ├── multi-bleu.perl ├── nonbreaking_prefixes │ ├── README.txt │ ├── nonbreaking_prefix.ca │ ├── nonbreaking_prefix.cs │ ├── nonbreaking_prefix.de │ ├── nonbreaking_prefix.el │ ├── nonbreaking_prefix.en │ ├── nonbreaking_prefix.es │ ├── nonbreaking_prefix.fi │ ├── nonbreaking_prefix.fr │ ├── nonbreaking_prefix.hu │ ├── nonbreaking_prefix.is │ ├── nonbreaking_prefix.it │ ├── nonbreaking_prefix.lv │ ├── nonbreaking_prefix.nl │ ├── nonbreaking_prefix.pl │ ├── nonbreaking_prefix.pt │ ├── nonbreaking_prefix.ro │ ├── nonbreaking_prefix.ru │ ├── nonbreaking_prefix.sk │ ├── nonbreaking_prefix.sl │ ├── nonbreaking_prefix.sv │ └── nonbreaking_prefix.ta ├── postprocess.sh ├── preprocess.sh ├── shuffle.py ├── strip_sgml.py └── tokenizer.perl ├── doc ├── factored_neural_machine_translation.md └── multi_gpu_training.md ├── nematus ├── .gitignore ├── __init__.py ├── beam_search_sampler.py ├── config.py ├── data_iterator.py ├── exception.py ├── exponential_smoothing.py ├── initializers.py ├── layers.py ├── learning_schedule.py ├── metrics │ ├── __init__.py │ ├── beer.py │ ├── chrf.py │ ├── meteor.py │ ├── reference.py │ ├── scorer.py │ ├── scorer_interpolator.py │ ├── scorer_provider.py │ ├── sentence_bleu.py │ ├── test_chrf.py │ ├── test_scorer_provider.py │ └── test_sentence_bleu.py ├── model_inputs.py ├── model_loader.py ├── model_updater.py ├── mrt_utils.py ├── nmt.py ├── random_sampler.py ├── rescore.py ├── rnn_inference.py ├── rnn_model.py ├── sample_client.py ├── sampler_inputs.py ├── sampling_utils.py ├── score.py ├── server.py ├── server │ ├── README.md │ ├── __init__.py │ ├── api │ │ ├── __init__.py │ │ ├── nematus_style.py │ │ └── provider.py │ ├── request.py │ └── response.py ├── server_translator.py ├── settings.py ├── shuffle.py ├── tf_utils.py ├── theano_tf_convert.py ├── train.py ├── training_progress.py ├── transformer.py ├── transformer_attention_modules.py ├── transformer_blocks.py ├── transformer_inference.py ├── transformer_layers.py ├── translate.py ├── translate_utils.py └── util.py ├── setup.py ├── test ├── README.md ├── data │ ├── corpus.de │ ├── corpus.en │ ├── vocab.de.json │ ├── vocab.en.json │ └── vocab.json ├── en-de │ ├── in │ ├── ref │ ├── ref2 │ ├── ref_score │ └── references ├── en-ro │ ├── in │ ├── ref │ ├── ref_score │ └── references ├── models │ └── .gitignore ├── test_score.py ├── test_train.sh ├── test_train_l2_loss.sh ├── test_train_mapl2_loss.sh ├── test_train_outputactivations.sh ├── test_train_reload.sh ├── test_train_summaries.sh ├── test_train_transformer.sh ├── test_translate.py ├── test_translate_sampling.py └── test_utils.py └── utils ├── attention.js ├── attention_web.php ├── copy_unknown_words.py ├── plot_heatmap.py └── visualize_probs.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | build 3 | dist 4 | nmt.egg-info 5 | .idea 6 | .DS_Store 7 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | CHANGELOG 2 | --------- 3 | 4 | development version 5 | ----------- 6 | 7 | - main new features: 8 | - factored input for Transformers 9 | - DropHead regularization 10 | 11 | v0.5 (19/5/2020) 12 | ---------- 13 | 14 | changes since 0.4: 15 | 16 | - main new features: 17 | - minimum risk 
training (MRT) 18 | - new inference code with ensemble decoding support for Transformer/RNN mix 19 | - compatibility with TF 2 20 | 21 | - other new features 22 | - lexical model for RNNs 23 | - gradient accumulation support 24 | - exponential smoothing 25 | - warmup-plateau-decay learning schedule 26 | - sampling translation strategy 27 | 28 | - fixes 29 | - fix regressions with deep RNN decoders 30 | 31 | 32 | v0.4 (17/12/2018) 33 | ---------- 34 | 35 | changes since 0.3: 36 | 37 | - main new features: 38 | - Transformer architecture 39 | - multi-GPU training 40 | - codebase moved to Python 3 41 | 42 | - other new features: 43 | - label smoothing 44 | - mixture of softmaxes 45 | 46 | - fixes: 47 | - re-enable BLEU validation (via --valid_script) 48 | - fix MAP-L2 regularization 49 | - fix server mode 50 | 51 | v0.3 (23/5/2018) 52 | ---------- 53 | - Tensorflow backend. The main model was rewritten to support Tensorflow in lieu of Theano. 54 | A few features have not been implemented in the Tensorflow model. 55 | 56 | - currently supported: 57 | - re-implementation of default Nematus model 58 | - model compatibility with Theano version and conversion via `theano_tf_convert.py` 59 | - same scripts and command line API for training, translating and (re)scoring 60 | - layer normalisation 61 | - tied embeddings 62 | - deep models 63 | - ensemble decoding 64 | - input features 65 | 66 | - not yet supported: 67 | - minimum risk training 68 | - LSTM cells 69 | - learning rate annealing 70 | 71 | - new features: 72 | - batch decoding 73 | - more efficient training with --token_batch_size 74 | 75 | v0.2 (17/12/2017) 76 | ---------- 77 | 78 | - layer normalisation (Ba et al, 2016) https://arxiv.org/abs/1607.06450 79 | - weight normalisation (Salimans and Kingma, 2016) https://arxiv.org/abs/1602.07868 80 | - deep models (Zhou et al., 2016; Wu et al., 2016; Miceli Barone et al., 2017) https://arxiv.org/abs/1606.04199 https://arxiv.org/abs/1609.08144 https://arxiv.org/abs/1707.07631 81 | - better memory efficiency 82 | - save historical gradient information for seamless resuming of interrupted training runs 83 | - server mode 84 | - sgdmomentum optimizer 85 | - learning rate annealing 86 | - LSTM cells 87 | - deep fusion (https://arxiv.org/abs/1503.03535) 88 | - various bugfixes 89 | 90 | v0.1 (2/3/2017) 91 | --------------- 92 | 93 | - arbitrary input features (factored neural machine translation) http://www.statmt.org/wmt16/pdf/W16-2209.pdf 94 | - ensemble decoding (and new translation API to support it) 95 | - dropout on all layers (Gal, 2015) http://arxiv.org/abs/1512.05287 96 | - minimum risk training (Shen et al, 2016) http://aclweb.org/anthology/P16-1159 97 | - tied embeddings (Press and Wolf, 2016) https://arxiv.org/abs/1608.05859 98 | - command line interface for training 99 | - n-best output for decoder 100 | - more output options (attention weights; word-level probabilities) and visualization scripts 101 | - performance improvements to decoder 102 | - better memory efficiency 103 | - rescoring support 104 | - execute arbitrary validation scripts (for BLEU early stopping) 105 | - vocabulary files and model parameters are stored in JSON format (backward-compatible loading) 106 | -------------------------------------------------------------------------------- /Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | MAINTAINER Tom Kocmi 3 | 4 | RUN apt-get update && apt-get install -y \ 5 | cmake \ 6 | git \ 7 | python \ 8 | 
python3 \ 9 | vim \ 10 | nano \ 11 | libopenblas-dev \ 12 | python3-dev \ 13 | python3-pip \ 14 | xml-twig-tools 15 | 16 | RUN pip3 install --upgrade pip 17 | RUN pip3 install --upgrade setuptools 18 | RUN pip3 install tensorflow==1.15 19 | 20 | RUN mkdir -p /path/to 21 | WORKDIR /path/to/ 22 | 23 | # Install mosesdecoder 24 | RUN git clone https://github.com/moses-smt/mosesdecoder 25 | 26 | # Install subwords 27 | RUN git clone https://github.com/rsennrich/subword-nmt 28 | 29 | # Install nematus 30 | COPY . /path/to/nematus 31 | WORKDIR /path/to/nematus 32 | RUN python3 setup.py install 33 | 34 | WORKDIR / 35 | 36 | # playground will contain user defined scripts, it should be run as: 37 | # docker run -v `pwd`:/playground -it nematus-docker 38 | RUN mkdir playground 39 | WORKDIR /playground 40 | 41 | -------------------------------------------------------------------------------- /Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.0-cudnn7-devel 2 | MAINTAINER Tom Kocmi 3 | 4 | # Install git, wget, python-dev, pip and other dependencies 5 | RUN apt-get update && apt-get install -y \ 6 | git \ 7 | wget \ 8 | cmake \ 9 | vim \ 10 | nano \ 11 | python3 \ 12 | libopenblas-dev \ 13 | python3-dev \ 14 | python3-pip \ 15 | python3-nose \ 16 | python3-numpy \ 17 | python3-scipy \ 18 | python3-pygraphviz \ 19 | xml-twig-tools 20 | 21 | RUN pip3 install --upgrade pip 22 | RUN pip3 install -U setuptools 23 | RUN pip3 install tensorflow==1.15 24 | 25 | # Set CUDA_ROOT 26 | ENV CUDA_ROOT /usr/local/cuda/bin 27 | 28 | 29 | RUN mkdir -p /path/to 30 | WORKDIR /path/to/ 31 | 32 | # Install mosesdecoder 33 | RUN git clone https://github.com/moses-smt/mosesdecoder 34 | 35 | # Install subwords 36 | RUN git clone https://github.com/rsennrich/subword-nmt 37 | 38 | # Install nematus 39 | COPY . /path/to/nematus 40 | WORKDIR /path/to/nematus 41 | RUN python3 setup.py install 42 | 43 | WORKDIR / 44 | 45 | # playground will contain user defined scripts, it should be run as: 46 | # nvidia-docker run -v `pwd`:/playground -it nematus-docker 47 | RUN mkdir playground 48 | WORKDIR /playground 49 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, Kyunghyun Cho 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of Nematus nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | This directory contains small scripts for data processing and evaluation. 2 | Other useful scripts and sample data is provided at https://github.com/rsennrich/wmt16-scripts 3 | 4 | 5 | Evaluation 6 | ---------- 7 | 8 | This directory contains two evaluation scripts: 9 | 10 | - multi-bleu.perl (from Moses decoder) computes tokenized, case-sensitive BLEU 11 | scores. This script is widely used in NMT research, but we discourage its use 12 | for publication because different groups use different tokenization, which 13 | biases comparisons to previous work. 14 | 15 | usage: 16 | ./multi-bleu.perl ref_file < test_file 17 | 18 | - multi-bleu-detok.perl expects that the reference file and output file are not 19 | tokenized (untokenized reference; detokenized output). It performs tokenization 20 | internally, using the tokenization routine from the NIST BLEU scorer 21 | (mteval-v13a.pl). This script can be used as a plaintext alternative of 22 | mteval-v13a.pl, giving the same results. 23 | 24 | usage: 25 | ./multi-bleu-detok.perl ref_file < test_file -------------------------------------------------------------------------------- /data/build_dictionary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from collections import OrderedDict 4 | import fileinput 5 | import sys 6 | 7 | import numpy 8 | import json 9 | 10 | 11 | def main(): 12 | for filename in sys.argv[1:]: 13 | print('Processing', filename) 14 | word_freqs = OrderedDict() 15 | with open(filename, 'r', encoding='utf-8') as f: 16 | for line in f: 17 | words_in = line.strip().split(' ') 18 | for w in words_in: 19 | if w not in word_freqs: 20 | word_freqs[w] = 0 21 | word_freqs[w] += 1 22 | words = list(word_freqs.keys()) 23 | freqs = list(word_freqs.values()) 24 | 25 | sorted_idx = numpy.argsort(freqs) 26 | sorted_words = [words[ii] for ii in sorted_idx[::-1]] 27 | 28 | worddict = OrderedDict() 29 | worddict[''] = 0 30 | worddict[''] = 1 31 | worddict[''] = 2 32 | # FIXME We shouldn't assume , , and aren't BPE subwords. 33 | for ii, ww in enumerate(sorted_words): 34 | worddict[ww] = ii+3 35 | 36 | # The JSON RFC requires that JSON text be represented using either 37 | # UTF-8, UTF-16, or UTF-32, with UTF-8 being recommended. 38 | # We use UTF-8 regardless of the user's locale settings. 
39 | with open('%s.json'%filename, 'w', encoding='utf-8') as f: 40 | json.dump(worddict, f, indent=2, ensure_ascii=False) 41 | 42 | print('Done') 43 | 44 | if __name__ == '__main__': 45 | main() 46 | -------------------------------------------------------------------------------- /data/length.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy 4 | import sys 5 | 6 | for name in sys.argv[1:]: 7 | lens = [] 8 | with open(name, 'r') as f: 9 | for ll in f: 10 | lens.append(len(ll.strip().split(' '))) 11 | print(name, ' max ', numpy.max(lens), ' min ', numpy.min(lens), ' average ', numpy.mean(lens)) 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /data/merge.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | SRC=$1 5 | TRG=$2 6 | 7 | FSRC=all_${1}-${2}.${1} 8 | FTRG=all_${1}-${2}.${2} 9 | 10 | echo "" > $FSRC 11 | for F in *${1}-${2}.${1} 12 | do 13 | if [ "$F" = "$FSRC" ]; then 14 | echo "pass" 15 | else 16 | cat $F >> $FSRC 17 | fi 18 | done 19 | 20 | 21 | echo "" > $FTRG 22 | for F in *${1}-${2}.${2} 23 | do 24 | if [ "$F" = "$FTRG" ]; then 25 | echo "pass" 26 | else 27 | cat $F >> $FTRG 28 | fi 29 | done 30 | -------------------------------------------------------------------------------- /data/multi-bleu-detok.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 5 | 6 | # This file uses the internal tokenization of mteval-v13a.pl, 7 | # giving the exact same (case-sensitive) results on untokenized text. 8 | # 9 | # like multi-bleu.perl , it supports plain text input and multiple references. 
10 | 11 | # $Id$ 12 | use warnings; 13 | use strict; 14 | 15 | my $lowercase = 0; 16 | if ($ARGV[0] eq "-lc") { 17 | $lowercase = 1; 18 | shift; 19 | } 20 | 21 | my $stem = $ARGV[0]; 22 | if (!defined $stem) { 23 | print STDERR "usage: multi-bleu-detok.pl [-lc] reference < hypothesis\n"; 24 | print STDERR "Reads the references from reference or reference0, reference1, ...\n"; 25 | exit(1); 26 | } 27 | 28 | $stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0"; 29 | 30 | my @REF; 31 | my $ref=0; 32 | while(-e "$stem$ref") { 33 | &add_to_ref("$stem$ref",\@REF); 34 | $ref++; 35 | } 36 | &add_to_ref($stem,\@REF) if -e $stem; 37 | die("ERROR: could not find reference file $stem") unless scalar @REF; 38 | 39 | # add additional references explicitly specified on the command line 40 | shift; 41 | foreach my $stem (@ARGV) { 42 | &add_to_ref($stem,\@REF) if -e $stem; 43 | } 44 | 45 | 46 | 47 | sub add_to_ref { 48 | my ($file,$REF) = @_; 49 | my $s=0; 50 | if ($file =~ /.gz$/) { 51 | open(REF,"gzip -dc $file|") or die "Can't read $file"; 52 | } else { 53 | open(REF,$file) or die "Can't read $file"; 54 | } 55 | while() { 56 | chomp; 57 | $_ = tokenization($_); 58 | push @{$$REF[$s++]}, $_; 59 | } 60 | close(REF); 61 | } 62 | 63 | my(@CORRECT,@TOTAL,$length_translation,$length_reference); 64 | my $s=0; 65 | while() { 66 | chomp; 67 | $_ = lc if $lowercase; 68 | $_ = tokenization($_); 69 | my @WORD = split; 70 | my %REF_NGRAM = (); 71 | my $length_translation_this_sentence = scalar(@WORD); 72 | my ($closest_diff,$closest_length) = (9999,9999); 73 | foreach my $reference (@{$REF[$s]}) { 74 | # print "$s $_ <=> $reference\n"; 75 | $reference = lc($reference) if $lowercase; 76 | my @WORD = split(' ',$reference); 77 | my $length = scalar(@WORD); 78 | my $diff = abs($length_translation_this_sentence-$length); 79 | if ($diff < $closest_diff) { 80 | $closest_diff = $diff; 81 | $closest_length = $length; 82 | # print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n"; 83 | } elsif ($diff == $closest_diff) { 84 | $closest_length = $length if $length < $closest_length; 85 | # from two references with the same closeness to me 86 | # take the *shorter* into account, not the "first" one. 87 | } 88 | for(my $n=1;$n<=4;$n++) { 89 | my %REF_NGRAM_N = (); 90 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 91 | my $ngram = "$n"; 92 | for(my $w=0;$w<$n;$w++) { 93 | $ngram .= " ".$WORD[$start+$w]; 94 | } 95 | $REF_NGRAM_N{$ngram}++; 96 | } 97 | foreach my $ngram (keys %REF_NGRAM_N) { 98 | if (!defined($REF_NGRAM{$ngram}) || 99 | $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) { 100 | $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram}; 101 | # print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}
\n"; 102 | } 103 | } 104 | } 105 | } 106 | $length_translation += $length_translation_this_sentence; 107 | $length_reference += $closest_length; 108 | for(my $n=1;$n<=4;$n++) { 109 | my %T_NGRAM = (); 110 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 111 | my $ngram = "$n"; 112 | for(my $w=0;$w<$n;$w++) { 113 | $ngram .= " ".$WORD[$start+$w]; 114 | } 115 | $T_NGRAM{$ngram}++; 116 | } 117 | foreach my $ngram (keys %T_NGRAM) { 118 | $ngram =~ /^(\d+) /; 119 | my $n = $1; 120 | # my $corr = 0; 121 | # print "$i e $ngram $T_NGRAM{$ngram}
\n"; 122 | $TOTAL[$n] += $T_NGRAM{$ngram}; 123 | if (defined($REF_NGRAM{$ngram})) { 124 | if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) { 125 | $CORRECT[$n] += $T_NGRAM{$ngram}; 126 | # $corr = $T_NGRAM{$ngram}; 127 | # print "$i e correct1 $T_NGRAM{$ngram}
\n"; 128 | } 129 | else { 130 | $CORRECT[$n] += $REF_NGRAM{$ngram}; 131 | # $corr = $REF_NGRAM{$ngram}; 132 | # print "$i e correct2 $REF_NGRAM{$ngram}
\n"; 133 | } 134 | } 135 | # $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram}; 136 | # print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n" 137 | } 138 | } 139 | $s++; 140 | } 141 | my $brevity_penalty = 1; 142 | my $bleu = 0; 143 | 144 | my @bleu=(); 145 | 146 | for(my $n=1;$n<=4;$n++) { 147 | if (defined ($TOTAL[$n])){ 148 | $bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0; 149 | # print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n"; 150 | }else{ 151 | $bleu[$n]=0; 152 | } 153 | } 154 | 155 | if ($length_reference==0){ 156 | printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n"; 157 | exit(1); 158 | } 159 | 160 | if ($length_translation<$length_reference) { 161 | $brevity_penalty = exp(1-$length_reference/$length_translation); 162 | } 163 | $bleu = $brevity_penalty * exp((my_log( $bleu[1] ) + 164 | my_log( $bleu[2] ) + 165 | my_log( $bleu[3] ) + 166 | my_log( $bleu[4] ) ) / 4) ; 167 | printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n", 168 | 100*$bleu, 169 | 100*$bleu[1], 170 | 100*$bleu[2], 171 | 100*$bleu[3], 172 | 100*$bleu[4], 173 | $brevity_penalty, 174 | $length_translation / $length_reference, 175 | $length_translation, 176 | $length_reference; 177 | 178 | sub my_log { 179 | return -9999999999 unless $_[0]; 180 | return log($_[0]); 181 | } 182 | 183 | 184 | 185 | sub tokenization 186 | { 187 | my ($norm_text) = @_; 188 | 189 | # language-independent part: 190 | $norm_text =~ s///g; # strip "skipped" tags 191 | $norm_text =~ s/-\n//g; # strip end-of-line hyphenation and join lines 192 | $norm_text =~ s/\n/ /g; # join lines 193 | $norm_text =~ s/"/"/g; # convert SGML tag for quote to " 194 | $norm_text =~ s/&/&/g; # convert SGML tag for ampersand to & 195 | $norm_text =~ s/</ 196 | $norm_text =~ s/>/>/g; # convert SGML tag for greater-than to < 197 | 198 | # language-dependent part (assuming Western languages): 199 | $norm_text = " $norm_text "; 200 | $norm_text =~ s/([\{-\~\[-\` -\&\(-\+\:-\@\/])/ $1 /g; # tokenize punctuation 201 | $norm_text =~ s/([^0-9])([\.,])/$1 $2 /g; # tokenize period and comma unless preceded by a digit 202 | $norm_text =~ s/([\.,])([^0-9])/ $1 $2/g; # tokenize period and comma unless followed by a digit 203 | $norm_text =~ s/([0-9])(-)/$1 $2 /g; # tokenize dash when preceded by a digit 204 | $norm_text =~ s/\s+/ /g; # one space only between words 205 | $norm_text =~ s/^\s+//; # no leading space 206 | $norm_text =~ s/\s+$//; # no trailing space 207 | 208 | return $norm_text; 209 | } 210 | -------------------------------------------------------------------------------- /data/multi-bleu.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 
5 | 6 | # $Id$ 7 | use warnings; 8 | use strict; 9 | 10 | my $lowercase = 0; 11 | if ($ARGV[0] eq "-lc") { 12 | $lowercase = 1; 13 | shift; 14 | } 15 | 16 | my $stem = $ARGV[0]; 17 | if (!defined $stem) { 18 | print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n"; 19 | print STDERR "Reads the references from reference or reference0, reference1, ...\n"; 20 | exit(1); 21 | } 22 | 23 | $stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0"; 24 | 25 | my @REF; 26 | my $ref=0; 27 | while(-e "$stem$ref") { 28 | &add_to_ref("$stem$ref",\@REF); 29 | $ref++; 30 | } 31 | &add_to_ref($stem,\@REF) if -e $stem; 32 | die("ERROR: could not find reference file $stem") unless scalar @REF; 33 | 34 | # add additional references explicitly specified on the command line 35 | shift; 36 | foreach my $stem (@ARGV) { 37 | &add_to_ref($stem,\@REF) if -e $stem; 38 | } 39 | 40 | 41 | 42 | sub add_to_ref { 43 | my ($file,$REF) = @_; 44 | my $s=0; 45 | if ($file =~ /.gz$/) { 46 | open(REF,"gzip -dc $file|") or die "Can't read $file"; 47 | } else { 48 | open(REF,$file) or die "Can't read $file"; 49 | } 50 | while() { 51 | chomp; 52 | push @{$$REF[$s++]}, $_; 53 | } 54 | close(REF); 55 | } 56 | 57 | my(@CORRECT,@TOTAL,$length_translation,$length_reference); 58 | my $s=0; 59 | while() { 60 | chomp; 61 | $_ = lc if $lowercase; 62 | my @WORD = split; 63 | my %REF_NGRAM = (); 64 | my $length_translation_this_sentence = scalar(@WORD); 65 | my ($closest_diff,$closest_length) = (9999,9999); 66 | foreach my $reference (@{$REF[$s]}) { 67 | # print "$s $_ <=> $reference\n"; 68 | $reference = lc($reference) if $lowercase; 69 | my @WORD = split(' ',$reference); 70 | my $length = scalar(@WORD); 71 | my $diff = abs($length_translation_this_sentence-$length); 72 | if ($diff < $closest_diff) { 73 | $closest_diff = $diff; 74 | $closest_length = $length; 75 | # print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n"; 76 | } elsif ($diff == $closest_diff) { 77 | $closest_length = $length if $length < $closest_length; 78 | # from two references with the same closeness to me 79 | # take the *shorter* into account, not the "first" one. 80 | } 81 | for(my $n=1;$n<=4;$n++) { 82 | my %REF_NGRAM_N = (); 83 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 84 | my $ngram = "$n"; 85 | for(my $w=0;$w<$n;$w++) { 86 | $ngram .= " ".$WORD[$start+$w]; 87 | } 88 | $REF_NGRAM_N{$ngram}++; 89 | } 90 | foreach my $ngram (keys %REF_NGRAM_N) { 91 | if (!defined($REF_NGRAM{$ngram}) || 92 | $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) { 93 | $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram}; 94 | # print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}
\n"; 95 | } 96 | } 97 | } 98 | } 99 | $length_translation += $length_translation_this_sentence; 100 | $length_reference += $closest_length; 101 | for(my $n=1;$n<=4;$n++) { 102 | my %T_NGRAM = (); 103 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 104 | my $ngram = "$n"; 105 | for(my $w=0;$w<$n;$w++) { 106 | $ngram .= " ".$WORD[$start+$w]; 107 | } 108 | $T_NGRAM{$ngram}++; 109 | } 110 | foreach my $ngram (keys %T_NGRAM) { 111 | $ngram =~ /^(\d+) /; 112 | my $n = $1; 113 | # my $corr = 0; 114 | # print "$i e $ngram $T_NGRAM{$ngram}
\n"; 115 | $TOTAL[$n] += $T_NGRAM{$ngram}; 116 | if (defined($REF_NGRAM{$ngram})) { 117 | if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) { 118 | $CORRECT[$n] += $T_NGRAM{$ngram}; 119 | # $corr = $T_NGRAM{$ngram}; 120 | # print "$i e correct1 $T_NGRAM{$ngram}
\n"; 121 | } 122 | else { 123 | $CORRECT[$n] += $REF_NGRAM{$ngram}; 124 | # $corr = $REF_NGRAM{$ngram}; 125 | # print "$i e correct2 $REF_NGRAM{$ngram}
\n"; 126 | } 127 | } 128 | # $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram}; 129 | # print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n" 130 | } 131 | } 132 | $s++; 133 | } 134 | my $brevity_penalty = 1; 135 | my $bleu = 0; 136 | 137 | my @bleu=(); 138 | 139 | for(my $n=1;$n<=4;$n++) { 140 | if (defined ($TOTAL[$n])){ 141 | $bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0; 142 | # print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n"; 143 | }else{ 144 | $bleu[$n]=0; 145 | } 146 | } 147 | 148 | if ($length_reference==0){ 149 | printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n"; 150 | exit(1); 151 | } 152 | 153 | if ($length_translation<$length_reference) { 154 | $brevity_penalty = exp(1-$length_reference/$length_translation); 155 | } 156 | $bleu = $brevity_penalty * exp((my_log( $bleu[1] ) + 157 | my_log( $bleu[2] ) + 158 | my_log( $bleu[3] ) + 159 | my_log( $bleu[4] ) ) / 4) ; 160 | printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n", 161 | 100*$bleu, 162 | 100*$bleu[1], 163 | 100*$bleu[2], 164 | 100*$bleu[3], 165 | 100*$bleu[4], 166 | $brevity_penalty, 167 | $length_translation / $length_reference, 168 | $length_translation, 169 | $length_reference; 170 | 171 | 172 | print STDERR "It is not advisable to publish scores from multi-bleu.perl. The scores depend on your tokenizer, which is unlikely to be reproducible from your paper or consistent across research groups. Instead you should detokenize then use mteval-v14.pl, which has a standard tokenization. Scores from multi-bleu.perl can still be used for internal purposes when you have a consistent tokenizer.\n"; 173 | 174 | sub my_log { 175 | return -9999999999 unless $_[0]; 176 | return log($_[0]); 177 | } 178 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/README.txt: -------------------------------------------------------------------------------- 1 | The language suffix can be found here: 2 | 3 | http://www.loc.gov/standards/iso639-2/php/code_list.php 4 | 5 | This code includes data from Daniel Naber's Language Tools (czech abbreviations). 6 | This code includes data from czech wiktionary (also czech abbreviations). 7 | 8 | 9 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.ca: -------------------------------------------------------------------------------- 1 | Dr 2 | Dra 3 | pàg 4 | p 5 | c 6 | av 7 | Sr 8 | Sra 9 | adm 10 | esq 11 | Prof 12 | S.A 13 | S.L 14 | p.e 15 | ptes 16 | Sta 17 | St 18 | pl 19 | màx 20 | cast 21 | dir 22 | nre 23 | fra 24 | admdora 25 | Emm 26 | Excma 27 | espf 28 | dc 29 | admdor 30 | tel 31 | angl 32 | aprox 33 | ca 34 | dept 35 | dj 36 | dl 37 | dt 38 | ds 39 | dg 40 | dv 41 | ed 42 | entl 43 | al 44 | i.e 45 | maj 46 | smin 47 | n 48 | núm 49 | pta 50 | A 51 | B 52 | C 53 | D 54 | E 55 | F 56 | G 57 | H 58 | I 59 | J 60 | K 61 | L 62 | M 63 | N 64 | O 65 | P 66 | Q 67 | R 68 | S 69 | T 70 | U 71 | V 72 | W 73 | X 74 | Y 75 | Z 76 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.cs: -------------------------------------------------------------------------------- 1 | Bc 2 | BcA 3 | Ing 4 | Ing.arch 5 | MUDr 6 | MVDr 7 | MgA 8 | Mgr 9 | JUDr 10 | PhDr 11 | RNDr 12 | PharmDr 13 | ThLic 14 | ThDr 15 | Ph.D 16 | Th.D 17 | prof 18 | doc 19 | CSc 20 | DrSc 21 | dr. h. 
c 22 | PaedDr 23 | Dr 24 | PhMr 25 | DiS 26 | abt 27 | ad 28 | a.i 29 | aj 30 | angl 31 | anon 32 | apod 33 | atd 34 | atp 35 | aut 36 | bd 37 | biogr 38 | b.m 39 | b.p 40 | b.r 41 | cca 42 | cit 43 | cizojaz 44 | c.k 45 | col 46 | čes 47 | čín 48 | čj 49 | ed 50 | facs 51 | fasc 52 | fol 53 | fot 54 | franc 55 | h.c 56 | hist 57 | hl 58 | hrsg 59 | ibid 60 | il 61 | ind 62 | inv.č 63 | jap 64 | jhdt 65 | jv 66 | koed 67 | kol 68 | korej 69 | kl 70 | krit 71 | lat 72 | lit 73 | m.a 74 | maď 75 | mj 76 | mp 77 | násl 78 | např 79 | nepubl 80 | něm 81 | no 82 | nr 83 | n.s 84 | okr 85 | odd 86 | odp 87 | obr 88 | opr 89 | orig 90 | phil 91 | pl 92 | pokrač 93 | pol 94 | port 95 | pozn 96 | př.kr 97 | př.n.l 98 | přel 99 | přeprac 100 | příl 101 | pseud 102 | pt 103 | red 104 | repr 105 | resp 106 | revid 107 | rkp 108 | roč 109 | roz 110 | rozš 111 | samost 112 | sect 113 | sest 114 | seš 115 | sign 116 | sl 117 | srv 118 | stol 119 | sv 120 | šk 121 | šk.ro 122 | špan 123 | tab 124 | t.č 125 | tis 126 | tj 127 | tř 128 | tzv 129 | univ 130 | uspoř 131 | vol 132 | vl.jm 133 | vs 134 | vyd 135 | vyobr 136 | zal 137 | zejm 138 | zkr 139 | zprac 140 | zvl 141 | n.p 142 | např 143 | než 144 | MUDr 145 | abl 146 | absol 147 | adj 148 | adv 149 | ak 150 | ak. sl 151 | akt 152 | alch 153 | amer 154 | anat 155 | angl 156 | anglosas 157 | arab 158 | arch 159 | archit 160 | arg 161 | astr 162 | astrol 163 | att 164 | bás 165 | belg 166 | bibl 167 | biol 168 | boh 169 | bot 170 | bulh 171 | círk 172 | csl 173 | č 174 | čas 175 | čes 176 | dat 177 | děj 178 | dep 179 | dět 180 | dial 181 | dór 182 | dopr 183 | dosl 184 | ekon 185 | epic 186 | etnonym 187 | eufem 188 | f 189 | fam 190 | fem 191 | fil 192 | film 193 | form 194 | fot 195 | fr 196 | fut 197 | fyz 198 | gen 199 | geogr 200 | geol 201 | geom 202 | germ 203 | gram 204 | hebr 205 | herald 206 | hist 207 | hl 208 | hovor 209 | hud 210 | hut 211 | chcsl 212 | chem 213 | ie 214 | imp 215 | impf 216 | ind 217 | indoevr 218 | inf 219 | instr 220 | interj 221 | ión 222 | iron 223 | it 224 | kanad 225 | katalán 226 | klas 227 | kniž 228 | komp 229 | konj 230 | 231 | konkr 232 | kř 233 | kuch 234 | lat 235 | lék 236 | les 237 | lid 238 | lit 239 | liturg 240 | lok 241 | log 242 | m 243 | mat 244 | meteor 245 | metr 246 | mod 247 | ms 248 | mysl 249 | n 250 | náb 251 | námoř 252 | neklas 253 | něm 254 | nesklon 255 | nom 256 | ob 257 | obch 258 | obyč 259 | ojed 260 | opt 261 | part 262 | pas 263 | pejor 264 | pers 265 | pf 266 | pl 267 | plpf 268 | 269 | práv 270 | prep 271 | předl 272 | přivl 273 | r 274 | rcsl 275 | refl 276 | reg 277 | rkp 278 | ř 279 | řec 280 | s 281 | samohl 282 | sg 283 | sl 284 | souhl 285 | spec 286 | srov 287 | stfr 288 | střv 289 | stsl 290 | subj 291 | subst 292 | superl 293 | sv 294 | sz 295 | táz 296 | tech 297 | telev 298 | teol 299 | trans 300 | typogr 301 | var 302 | vedl 303 | verb 304 | vl. jm 305 | voj 306 | vok 307 | vůb 308 | vulg 309 | výtv 310 | vztaž 311 | zahr 312 | zájm 313 | zast 314 | zejm 315 | 316 | zeměd 317 | zkr 318 | zř 319 | mj 320 | dl 321 | atp 322 | sport 323 | Mgr 324 | horn 325 | MVDr 326 | JUDr 327 | RSDr 328 | Bc 329 | PhDr 330 | ThDr 331 | Ing 332 | aj 333 | apod 334 | PharmDr 335 | pomn 336 | ev 337 | slang 338 | nprap 339 | odp 340 | dop 341 | pol 342 | st 343 | stol 344 | p. n. l 345 | před n. l 346 | n. l 347 | př. Kr 348 | po Kr 349 | př. n. l 350 | odd 351 | RNDr 352 | tzv 353 | atd 354 | tzn 355 | resp 356 | tj 357 | p 358 | br 359 | č. j 360 | čj 361 | č. p 362 | čp 363 | a. 
s 364 | s. r. o 365 | spol. s r. o 366 | p. o 367 | s. p 368 | v. o. s 369 | k. s 370 | o. p. s 371 | o. s 372 | v. r 373 | v z 374 | ml 375 | vč 376 | kr 377 | mld 378 | hod 379 | popř 380 | ap 381 | event 382 | rus 383 | slov 384 | rum 385 | švýc 386 | P. T 387 | zvl 388 | hor 389 | dol 390 | S.O.S -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.de: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | #no german words end in single lower-case letters, so we throw those in too. 7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | 61 | #Roman Numerals. A dot after one of these is not a sentence break in German. 62 | I 63 | II 64 | III 65 | IV 66 | V 67 | VI 68 | VII 69 | VIII 70 | IX 71 | X 72 | XI 73 | XII 74 | XIII 75 | XIV 76 | XV 77 | XVI 78 | XVII 79 | XVIII 80 | XIX 81 | XX 82 | i 83 | ii 84 | iii 85 | iv 86 | v 87 | vi 88 | vii 89 | viii 90 | ix 91 | x 92 | xi 93 | xii 94 | xiii 95 | xiv 96 | xv 97 | xvi 98 | xvii 99 | xviii 100 | xix 101 | xx 102 | 103 | #Titles and Honorifics 104 | Adj 105 | Adm 106 | Adv 107 | Asst 108 | Bart 109 | Bldg 110 | Brig 111 | Bros 112 | Capt 113 | Cmdr 114 | Col 115 | Comdr 116 | Con 117 | Corp 118 | Cpl 119 | DR 120 | Dr 121 | Ens 122 | Gen 123 | Gov 124 | Hon 125 | Hosp 126 | Insp 127 | Lt 128 | MM 129 | MR 130 | MRS 131 | MS 132 | Maj 133 | Messrs 134 | Mlle 135 | Mme 136 | Mr 137 | Mrs 138 | Ms 139 | Msgr 140 | Op 141 | Ord 142 | Pfc 143 | Ph 144 | Prof 145 | Pvt 146 | Rep 147 | Reps 148 | Res 149 | Rev 150 | Rt 151 | Sen 152 | Sens 153 | Sfc 154 | Sgt 155 | Sr 156 | St 157 | Supt 158 | Surg 159 | 160 | #Misc symbols 161 | Mio 162 | Mrd 163 | bzw 164 | v 165 | vs 166 | usw 167 | d.h 168 | z.B 169 | u.a 170 | etc 171 | Mrd 172 | MwSt 173 | ggf 174 | d.J 175 | D.h 176 | m.E 177 | vgl 178 | I.F 179 | z.T 180 | sogen 181 | ff 182 | u.E 183 | g.U 184 | g.g.A 185 | c.-à-d 186 | Buchst 187 | u.s.w 188 | sog 189 | u.ä 190 | Std 191 | evtl 192 | Zt 193 | Chr 194 | u.U 195 | o.ä 196 | Ltd 197 | b.A 198 | z.Zt 199 | spp 200 | sen 201 | SA 202 | k.o 203 | jun 204 | i.H.v 205 | dgl 206 | dergl 207 | Co 208 | zzt 209 | usf 210 | s.p.a 211 | Dkr 212 | Corp 213 | bzgl 214 | BSE 215 | 216 | #Number indicators 217 | # add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it 218 | No 219 | Nos 220 | Art 221 | Nr 222 | pp 223 | ca 224 | Ca 225 | 226 | #Ordinals are done with . in German - "1." 
= "1st" in English 227 | 1 228 | 2 229 | 3 230 | 4 231 | 5 232 | 6 233 | 7 234 | 8 235 | 9 236 | 10 237 | 11 238 | 12 239 | 13 240 | 14 241 | 15 242 | 16 243 | 17 244 | 18 245 | 19 246 | 20 247 | 21 248 | 22 249 | 23 250 | 24 251 | 25 252 | 26 253 | 27 254 | 28 255 | 29 256 | 30 257 | 31 258 | 32 259 | 33 260 | 34 261 | 35 262 | 36 263 | 37 264 | 38 265 | 39 266 | 40 267 | 41 268 | 42 269 | 43 270 | 44 271 | 45 272 | 46 273 | 47 274 | 48 275 | 49 276 | 50 277 | 51 278 | 52 279 | 53 280 | 54 281 | 55 282 | 56 283 | 57 284 | 58 285 | 59 286 | 60 287 | 61 288 | 62 289 | 63 290 | 64 291 | 65 292 | 66 293 | 67 294 | 68 295 | 69 296 | 70 297 | 71 298 | 72 299 | 73 300 | 74 301 | 75 302 | 76 303 | 77 304 | 78 305 | 79 306 | 80 307 | 81 308 | 82 309 | 83 310 | 84 311 | 85 312 | 86 313 | 87 314 | 88 315 | 89 316 | 90 317 | 91 318 | 92 319 | 93 320 | 94 321 | 95 322 | 96 323 | 97 324 | 98 325 | 99 326 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.en: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 34 | Adj 35 | Adm 36 | Adv 37 | Asst 38 | Bart 39 | Bldg 40 | Brig 41 | Bros 42 | Capt 43 | Cmdr 44 | Col 45 | Comdr 46 | Con 47 | Corp 48 | Cpl 49 | DR 50 | Dr 51 | Drs 52 | Ens 53 | Gen 54 | Gov 55 | Hon 56 | Hr 57 | Hosp 58 | Insp 59 | Lt 60 | MM 61 | MR 62 | MRS 63 | MS 64 | Maj 65 | Messrs 66 | Mlle 67 | Mme 68 | Mr 69 | Mrs 70 | Ms 71 | Msgr 72 | Op 73 | Ord 74 | Pfc 75 | Ph 76 | Prof 77 | Pvt 78 | Rep 79 | Reps 80 | Res 81 | Rev 82 | Rt 83 | Sen 84 | Sens 85 | Sfc 86 | Sgt 87 | Sr 88 | St 89 | Supt 90 | Surg 91 | 92 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 93 | v 94 | vs 95 | i.e 96 | rev 97 | e.g 98 | 99 | #Numbers only. These should only induce breaks when followed by a numeric sequence 100 | # add NUMERIC_ONLY after the word for this function 101 | #This case is mostly for the english "No." which can either be a sentence of its own, or 102 | #if followed by a number, a non-breaking prefix 103 | No #NUMERIC_ONLY# 104 | Nos 105 | Art #NUMERIC_ONLY# 106 | Nr 107 | pp #NUMERIC_ONLY# 108 | 109 | #month abbreviations 110 | Jan 111 | Feb 112 | Mar 113 | Apr 114 | #May is a full word 115 | Jun 116 | Jul 117 | Aug 118 | Sep 119 | Oct 120 | Nov 121 | Dec 122 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.es: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 
3 | 4 | #any single upper case letter followed by a period is not a sentence ender 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | # Period-final abbreviation list from http://www.ctspanish.com/words/abbreviations.htm 34 | 35 | A.C 36 | Apdo 37 | Av 38 | Bco 39 | CC.AA 40 | Da 41 | Dep 42 | Dn 43 | Dr 44 | Dra 45 | EE.UU 46 | Excmo 47 | FF.CC 48 | Fil 49 | Gral 50 | J.C 51 | Let 52 | Lic 53 | N.B 54 | P.D 55 | P.V.P 56 | Prof 57 | Pts 58 | Rte 59 | S.A 60 | S.A.R 61 | S.E 62 | S.L 63 | S.R.C 64 | Sr 65 | Sra 66 | Srta 67 | Sta 68 | Sto 69 | T.V.E 70 | Tel 71 | Ud 72 | Uds 73 | V.B 74 | V.E 75 | Vd 76 | Vds 77 | a/c 78 | adj 79 | admón 80 | afmo 81 | apdo 82 | av 83 | c 84 | c.f 85 | c.g 86 | cap 87 | cm 88 | cta 89 | dcha 90 | doc 91 | ej 92 | entlo 93 | esq 94 | etc 95 | f.c 96 | gr 97 | grs 98 | izq 99 | kg 100 | km 101 | mg 102 | mm 103 | núm 104 | núm 105 | p 106 | p.a 107 | p.ej 108 | ptas 109 | pág 110 | págs 111 | pág 112 | págs 113 | q.e.g.e 114 | q.e.s.m 115 | s 116 | s.s.s 117 | vid 118 | vol 119 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.fi: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT 2 | #indicate an end-of-sentence marker. Special cases are included for prefixes 3 | #that ONLY appear before 0-9 numbers. 4 | 5 | #This list is compiled from omorfi database 6 | #by Tommi A Pirinen. 7 | 8 | 9 | #any single upper case letter followed by a period is not a sentence ender 10 | A 11 | B 12 | C 13 | D 14 | E 15 | F 16 | G 17 | H 18 | I 19 | J 20 | K 21 | L 22 | M 23 | N 24 | O 25 | P 26 | Q 27 | R 28 | S 29 | T 30 | U 31 | V 32 | W 33 | X 34 | Y 35 | Z 36 | Å 37 | Ä 38 | Ö 39 | 40 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 41 | alik 42 | alil 43 | amir 44 | apul 45 | apul.prof 46 | arkkit 47 | ass 48 | assist 49 | dipl 50 | dipl.arkkit 51 | dipl.ekon 52 | dipl.ins 53 | dipl.kielenk 54 | dipl.kirjeenv 55 | dipl.kosm 56 | dipl.urk 57 | dos 58 | erikoiseläinl 59 | erikoishammasl 60 | erikoisl 61 | erikoist 62 | ev.luutn 63 | evp 64 | fil 65 | ft 66 | hallinton 67 | hallintot 68 | hammaslääket 69 | jatk 70 | jääk 71 | kansaned 72 | kapt 73 | kapt.luutn 74 | kenr 75 | kenr.luutn 76 | kenr.maj 77 | kers 78 | kirjeenv 79 | kom 80 | kom.kapt 81 | komm 82 | konst 83 | korpr 84 | luutn 85 | maist 86 | maj 87 | Mr 88 | Mrs 89 | Ms 90 | M.Sc 91 | neuv 92 | nimim 93 | Ph.D 94 | prof 95 | puh.joht 96 | pääll 97 | res 98 | san 99 | siht 100 | suom 101 | sähköp 102 | säv 103 | toht 104 | toim 105 | toim.apul 106 | toim.joht 107 | toim.siht 108 | tuom 109 | ups 110 | vänr 111 | vääp 112 | ye.ups 113 | ylik 114 | ylil 115 | ylim 116 | ylimatr 117 | yliop 118 | yliopp 119 | ylip 120 | yliv 121 | 122 | #misc - odd period-ending items that NEVER indicate breaks (p.m. 
does NOT fall 123 | #into this category - it sometimes ends a sentence) 124 | e.g 125 | ent 126 | esim 127 | huom 128 | i.e 129 | ilm 130 | l 131 | mm 132 | myöh 133 | nk 134 | nyk 135 | par 136 | po 137 | t 138 | v 139 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.fr: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | # 4 | #any single upper case letter followed by a period is not a sentence ender 5 | #usually upper case letters are initials in a name 6 | #no French words end in single lower-case letters, so we throw those in too? 7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | # Period-final abbreviation list for French 61 | A.C.N 62 | A.M 63 | art 64 | ann 65 | apr 66 | av 67 | auj 68 | lib 69 | B.P 70 | boul 71 | ca 72 | c.-à-d 73 | cf 74 | ch.-l 75 | chap 76 | contr 77 | C.P.I 78 | C.Q.F.D 79 | C.N 80 | C.N.S 81 | C.S 82 | dir 83 | éd 84 | e.g 85 | env 86 | al 87 | etc 88 | E.V 89 | ex 90 | fasc 91 | fém 92 | fig 93 | fr 94 | hab 95 | ibid 96 | id 97 | i.e 98 | inf 99 | LL.AA 100 | LL.AA.II 101 | LL.AA.RR 102 | LL.AA.SS 103 | L.D 104 | LL.EE 105 | LL.MM 106 | LL.MM.II.RR 107 | loc.cit 108 | masc 109 | MM 110 | ms 111 | N.B 112 | N.D.A 113 | N.D.L.R 114 | N.D.T 115 | n/réf 116 | NN.SS 117 | N.S 118 | N.D 119 | N.P.A.I 120 | p.c.c 121 | pl 122 | pp 123 | p.ex 124 | p.j 125 | P.S 126 | R.A.S 127 | R.-V 128 | R.P 129 | R.I.P 130 | SS 131 | S.S 132 | S.A 133 | S.A.I 134 | S.A.R 135 | S.A.S 136 | S.E 137 | sec 138 | sect 139 | sing 140 | S.M 141 | S.M.I.R 142 | sq 143 | sqq 144 | suiv 145 | sup 146 | suppl 147 | tél 148 | T.S.V.P 149 | vb 150 | vol 151 | vs 152 | X.O 153 | Z.I 154 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.hu: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | Á 33 | É 34 | Í 35 | Ó 36 | Ö 37 | Ő 38 | Ú 39 | Ü 40 | Ű 41 | 42 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 43 | Dr 44 | dr 45 | kb 46 | Kb 47 | vö 48 | Vö 49 | pl 50 | Pl 51 | ca 52 | Ca 53 | min 54 | Min 55 | max 56 | Max 57 | ún 58 | Ún 59 | prof 60 | Prof 61 | de 62 | De 63 | du 64 | Du 65 | Szt 66 | St 67 | 68 | #Numbers only. 
These should only induce breaks when followed by a numeric sequence 69 | # add NUMERIC_ONLY after the word for this function 70 | #This case is mostly for the english "No." which can either be a sentence of its own, or 71 | #if followed by a number, a non-breaking prefix 72 | 73 | # Month name abbreviations 74 | jan #NUMERIC_ONLY# 75 | Jan #NUMERIC_ONLY# 76 | Feb #NUMERIC_ONLY# 77 | feb #NUMERIC_ONLY# 78 | márc #NUMERIC_ONLY# 79 | Márc #NUMERIC_ONLY# 80 | ápr #NUMERIC_ONLY# 81 | Ápr #NUMERIC_ONLY# 82 | máj #NUMERIC_ONLY# 83 | Máj #NUMERIC_ONLY# 84 | jún #NUMERIC_ONLY# 85 | Jún #NUMERIC_ONLY# 86 | Júl #NUMERIC_ONLY# 87 | júl #NUMERIC_ONLY# 88 | aug #NUMERIC_ONLY# 89 | Aug #NUMERIC_ONLY# 90 | Szept #NUMERIC_ONLY# 91 | szept #NUMERIC_ONLY# 92 | okt #NUMERIC_ONLY# 93 | Okt #NUMERIC_ONLY# 94 | nov #NUMERIC_ONLY# 95 | Nov #NUMERIC_ONLY# 96 | dec #NUMERIC_ONLY# 97 | Dec #NUMERIC_ONLY# 98 | 99 | # Other abbreviations 100 | tel #NUMERIC_ONLY# 101 | Tel #NUMERIC_ONLY# 102 | Fax #NUMERIC_ONLY# 103 | fax #NUMERIC_ONLY# 104 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.is: -------------------------------------------------------------------------------- 1 | no #NUMERIC_ONLY# 2 | No #NUMERIC_ONLY# 3 | nr #NUMERIC_ONLY# 4 | Nr #NUMERIC_ONLY# 5 | nR #NUMERIC_ONLY# 6 | NR #NUMERIC_ONLY# 7 | a 8 | b 9 | c 10 | d 11 | e 12 | f 13 | g 14 | h 15 | i 16 | j 17 | k 18 | l 19 | m 20 | n 21 | o 22 | p 23 | q 24 | r 25 | s 26 | t 27 | u 28 | v 29 | w 30 | x 31 | y 32 | z 33 | ^ 34 | í 35 | á 36 | ó 37 | æ 38 | A 39 | B 40 | C 41 | D 42 | E 43 | F 44 | G 45 | H 46 | I 47 | J 48 | K 49 | L 50 | M 51 | N 52 | O 53 | P 54 | Q 55 | R 56 | S 57 | T 58 | U 59 | V 60 | W 61 | X 62 | Y 63 | Z 64 | ab.fn 65 | a.fn 66 | afs 67 | al 68 | alm 69 | alg 70 | andh 71 | ath 72 | aths 73 | atr 74 | ao 75 | au 76 | aukaf 77 | áfn 78 | áhrl.s 79 | áhrs 80 | ákv.gr 81 | ákv 82 | bh 83 | bls 84 | dr 85 | e.Kr 86 | et 87 | ef 88 | efn 89 | ennfr 90 | eink 91 | end 92 | e.st 93 | erl 94 | fél 95 | fskj 96 | fh 97 | f.hl 98 | físl 99 | fl 100 | fn 101 | fo 102 | forl 103 | frb 104 | frl 105 | frh 106 | frt 107 | fsl 108 | fsh 109 | fs 110 | fsk 111 | fst 112 | f.Kr 113 | ft 114 | fv 115 | fyrrn 116 | fyrrv 117 | germ 118 | gm 119 | gr 120 | hdl 121 | hdr 122 | hf 123 | hl 124 | hlsk 125 | hljsk 126 | hljv 127 | hljóðv 128 | hr 129 | hv 130 | hvk 131 | holl 132 | Hos 133 | höf 134 | hk 135 | hrl 136 | ísl 137 | kaf 138 | kap 139 | Khöfn 140 | kk 141 | kg 142 | kk 143 | km 144 | kl 145 | klst 146 | kr 147 | kt 148 | kgúrsk 149 | kvk 150 | leturbr 151 | lh 152 | lh.nt 153 | lh.þt 154 | lo 155 | ltr 156 | mlja 157 | mljó 158 | millj 159 | mm 160 | mms 161 | m.fl 162 | miðm 163 | mgr 164 | mst 165 | mín 166 | nf 167 | nh 168 | nhm 169 | nl 170 | nk 171 | nmgr 172 | no 173 | núv 174 | nt 175 | o.áfr 176 | o.m.fl 177 | ohf 178 | o.fl 179 | o.s.frv 180 | ófn 181 | ób 182 | óákv.gr 183 | óákv 184 | pfn 185 | PR 186 | pr 187 | Ritstj 188 | Rvík 189 | Rvk 190 | samb 191 | samhlj 192 | samn 193 | samn 194 | sbr 195 | sek 196 | sérn 197 | sf 198 | sfn 199 | sh 200 | sfn 201 | sh 202 | s.hl 203 | sk 204 | skv 205 | sl 206 | sn 207 | so 208 | ss.us 209 | s.st 210 | samþ 211 | sbr 212 | shlj 213 | sign 214 | skál 215 | st 216 | st.s 217 | stk 218 | sþ 219 | teg 220 | tbl 221 | tfn 222 | tl 223 | tvíhlj 224 | tvt 225 | till 226 | to 227 | umr 228 | uh 229 | us 230 | uppl 231 | útg 232 | vb 233 | Vf 234 | vh 235 | vkf 236 | Vl 237 | vl 238 | vlf 239 | vmf 240 | 8vo 241 | vsk 
242 | vth 243 | þt 244 | þf 245 | þjs 246 | þgf 247 | þlt 248 | þolm 249 | þm 250 | þml 251 | þýð 252 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.it: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 34 | Adj 35 | Adm 36 | Adv 37 | Amn 38 | Arch 39 | Asst 40 | Avv 41 | Bart 42 | Bcc 43 | Bldg 44 | Brig 45 | Bros 46 | C.A.P 47 | C.P 48 | Capt 49 | Cc 50 | Cmdr 51 | Co 52 | Col 53 | Comdr 54 | Con 55 | Corp 56 | Cpl 57 | DR 58 | Dott 59 | Dr 60 | Drs 61 | Egr 62 | Ens 63 | Gen 64 | Geom 65 | Gov 66 | Hon 67 | Hosp 68 | Hr 69 | Id 70 | Ing 71 | Insp 72 | Lt 73 | MM 74 | MR 75 | MRS 76 | MS 77 | Maj 78 | Messrs 79 | Mlle 80 | Mme 81 | Mo 82 | Mons 83 | Mr 84 | Mrs 85 | Ms 86 | Msgr 87 | N.B 88 | Op 89 | Ord 90 | P.S 91 | P.T 92 | Pfc 93 | Ph 94 | Prof 95 | Pvt 96 | RP 97 | RSVP 98 | Rag 99 | Rep 100 | Reps 101 | Res 102 | Rev 103 | Rif 104 | Rt 105 | S.A 106 | S.B.F 107 | S.P.M 108 | S.p.A 109 | S.r.l 110 | Sen 111 | Sens 112 | Sfc 113 | Sgt 114 | Sig 115 | Sigg 116 | Soc 117 | Spett 118 | Sr 119 | St 120 | Supt 121 | Surg 122 | V.P 123 | 124 | # other 125 | a.c 126 | acc 127 | all 128 | banc 129 | c.a 130 | c.c.p 131 | c.m 132 | c.p 133 | c.s 134 | c.v 135 | corr 136 | dott 137 | e.p.c 138 | ecc 139 | es 140 | fatt 141 | gg 142 | int 143 | lett 144 | ogg 145 | on 146 | p.c 147 | p.c.c 148 | p.es 149 | p.f 150 | p.r 151 | p.v 152 | post 153 | pp 154 | racc 155 | ric 156 | s.n.c 157 | seg 158 | sgg 159 | ss 160 | tel 161 | u.s 162 | v.r 163 | v.s 164 | 165 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 166 | v 167 | vs 168 | i.e 169 | rev 170 | e.g 171 | 172 | #Numbers only. These should only induce breaks when followed by a numeric sequence 173 | # add NUMERIC_ONLY after the word for this function 174 | #This case is mostly for the english "No." which can either be a sentence of its own, or 175 | #if followed by a number, a non-breaking prefix 176 | No #NUMERIC_ONLY# 177 | Nos 178 | Art #NUMERIC_ONLY# 179 | Nr 180 | pp #NUMERIC_ONLY# 181 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.lv: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 
3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | Ā 8 | B 9 | C 10 | Č 11 | D 12 | E 13 | Ē 14 | F 15 | G 16 | Ģ 17 | H 18 | I 19 | Ī 20 | J 21 | K 22 | Ķ 23 | L 24 | Ļ 25 | M 26 | N 27 | Ņ 28 | O 29 | P 30 | Q 31 | R 32 | S 33 | Š 34 | T 35 | U 36 | Ū 37 | V 38 | W 39 | X 40 | Y 41 | Z 42 | Ž 43 | 44 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 45 | dr 46 | Dr 47 | med 48 | prof 49 | Prof 50 | inž 51 | Inž 52 | ist.loc 53 | Ist.loc 54 | kor.loc 55 | Kor.loc 56 | v.i 57 | vietn 58 | Vietn 59 | 60 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 61 | a.l 62 | t.p 63 | pārb 64 | Pārb 65 | vec 66 | Vec 67 | inv 68 | Inv 69 | sk 70 | Sk 71 | spec 72 | Spec 73 | vienk 74 | Vienk 75 | virz 76 | Virz 77 | māksl 78 | Māksl 79 | mūz 80 | Mūz 81 | akad 82 | Akad 83 | soc 84 | Soc 85 | galv 86 | Galv 87 | vad 88 | Vad 89 | sertif 90 | Sertif 91 | folkl 92 | Folkl 93 | hum 94 | Hum 95 | 96 | #Numbers only. These should only induce breaks when followed by a numeric sequence 97 | # add NUMERIC_ONLY after the word for this function 98 | #This case is mostly for the english "No." which can either be a sentence of its own, or 99 | #if followed by a number, a non-breaking prefix 100 | Nr #NUMERIC_ONLY# 101 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.nl: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | #Sources: http://nl.wikipedia.org/wiki/Lijst_van_afkortingen 4 | # http://nl.wikipedia.org/wiki/Aanspreekvorm 5 | # http://nl.wikipedia.org/wiki/Titulatuur_in_het_Nederlands_hoger_onderwijs 6 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 7 | #usually upper case letters are initials in a name 8 | A 9 | B 10 | C 11 | D 12 | E 13 | F 14 | G 15 | H 16 | I 17 | J 18 | K 19 | L 20 | M 21 | N 22 | O 23 | P 24 | Q 25 | R 26 | S 27 | T 28 | U 29 | V 30 | W 31 | X 32 | Y 33 | Z 34 | 35 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 36 | bacc 37 | bc 38 | bgen 39 | c.i 40 | dhr 41 | dr 42 | dr.h.c 43 | drs 44 | drs 45 | ds 46 | eint 47 | fa 48 | Fa 49 | fam 50 | gen 51 | genm 52 | ing 53 | ir 54 | jhr 55 | jkvr 56 | jr 57 | kand 58 | kol 59 | lgen 60 | lkol 61 | Lt 62 | maj 63 | Mej 64 | mevr 65 | Mme 66 | mr 67 | mr 68 | Mw 69 | o.b.s 70 | plv 71 | prof 72 | ritm 73 | tint 74 | Vz 75 | Z.D 76 | Z.D.H 77 | Z.E 78 | Z.Em 79 | Z.H 80 | Z.K.H 81 | Z.K.M 82 | Z.M 83 | z.v 84 | 85 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 86 | #we seem to have a lot of these in dutch i.e.: i.p.v - in plaats van (in stead of) never ends a sentence 87 | a.g.v 88 | bijv 89 | bijz 90 | bv 91 | d.w.z 92 | e.c 93 | e.g 94 | e.k 95 | ev 96 | i.p.v 97 | i.s.m 98 | i.t.t 99 | i.v.m 100 | m.a.w 101 | m.b.t 102 | m.b.v 103 | m.h.o 104 | m.i 105 | m.i.v 106 | v.w.t 107 | 108 | #Numbers only. 
These should only induce breaks when followed by a numeric sequence 109 | # add NUMERIC_ONLY after the word for this function 110 | #This case is mostly for the english "No." which can either be a sentence of its own, or 111 | #if followed by a number, a non-breaking prefix 112 | Nr #NUMERIC_ONLY# 113 | Nrs 114 | nrs 115 | nr #NUMERIC_ONLY# 116 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.pl: -------------------------------------------------------------------------------- 1 | adw 2 | afr 3 | akad 4 | al 5 | Al 6 | am 7 | amer 8 | arch 9 | art 10 | Art 11 | artyst 12 | astr 13 | austr 14 | bałt 15 | bdb 16 | bł 17 | bm 18 | br 19 | bryg 20 | bryt 21 | centr 22 | ces 23 | chem 24 | chiń 25 | chir 26 | c.k 27 | c.o 28 | cyg 29 | cyw 30 | cyt 31 | czes 32 | czw 33 | cd 34 | Cd 35 | czyt 36 | ćw 37 | ćwicz 38 | daw 39 | dcn 40 | dekl 41 | demokr 42 | det 43 | diec 44 | dł 45 | dn 46 | dot 47 | dol 48 | dop 49 | dost 50 | dosł 51 | h.c 52 | ds 53 | dst 54 | duszp 55 | dypl 56 | egz 57 | ekol 58 | ekon 59 | elektr 60 | em 61 | ew 62 | fab 63 | farm 64 | fot 65 | fr 66 | gat 67 | gastr 68 | geogr 69 | geol 70 | gimn 71 | głęb 72 | gm 73 | godz 74 | górn 75 | gosp 76 | gr 77 | gram 78 | hist 79 | hiszp 80 | hr 81 | Hr 82 | hot 83 | id 84 | in 85 | im 86 | iron 87 | jn 88 | kard 89 | kat 90 | katol 91 | k.k 92 | kk 93 | kol 94 | kl 95 | k.p.a 96 | kpc 97 | k.p.c 98 | kpt 99 | kr 100 | k.r 101 | krak 102 | k.r.o 103 | kryt 104 | kult 105 | laic 106 | łac 107 | niem 108 | woj 109 | nb 110 | np 111 | Nb 112 | Np 113 | pol 114 | pow 115 | m.in 116 | pt 117 | ps 118 | Pt 119 | Ps 120 | cdn 121 | jw 122 | ryc 123 | rys 124 | Ryc 125 | Rys 126 | tj 127 | tzw 128 | Tzw 129 | tzn 130 | zob 131 | ang 132 | ub 133 | ul 134 | pw 135 | pn 136 | pl 137 | al 138 | k 139 | n 140 | nr #NUMERIC_ONLY# 141 | Nr #NUMERIC_ONLY# 142 | ww 143 | wł 144 | ur 145 | zm 146 | żyd 147 | żarg 148 | żyw 149 | wył 150 | bp 151 | bp 152 | wyst 153 | tow 154 | Tow 155 | o 156 | sp 157 | Sp 158 | st 159 | spółdz 160 | Spółdz 161 | społ 162 | spółgł 163 | stoł 164 | stow 165 | Stoł 166 | Stow 167 | zn 168 | zew 169 | zewn 170 | zdr 171 | zazw 172 | zast 173 | zaw 174 | zał 175 | zal 176 | zam 177 | zak 178 | zakł 179 | zagr 180 | zach 181 | adw 182 | Adw 183 | lek 184 | Lek 185 | med 186 | mec 187 | Mec 188 | doc 189 | Doc 190 | dyw 191 | dyr 192 | Dyw 193 | Dyr 194 | inż 195 | Inż 196 | mgr 197 | Mgr 198 | dh 199 | dr 200 | Dh 201 | Dr 202 | p 203 | P 204 | red 205 | Red 206 | prof 207 | prok 208 | Prof 209 | Prok 210 | hab 211 | płk 212 | Płk 213 | nadkom 214 | Nadkom 215 | podkom 216 | Podkom 217 | ks 218 | Ks 219 | gen 220 | Gen 221 | por 222 | Por 223 | reż 224 | Reż 225 | przyp 226 | Przyp 227 | śp 228 | św 229 | śW 230 | Śp 231 | Św 232 | ŚW 233 | szer 234 | Szer 235 | pkt #NUMERIC_ONLY# 236 | str #NUMERIC_ONLY# 237 | tab #NUMERIC_ONLY# 238 | Tab #NUMERIC_ONLY# 239 | tel 240 | ust #NUMERIC_ONLY# 241 | par #NUMERIC_ONLY# 242 | poz 243 | pok 244 | oo 245 | oO 246 | Oo 247 | OO 248 | r #NUMERIC_ONLY# 249 | l #NUMERIC_ONLY# 250 | s #NUMERIC_ONLY# 251 | najśw 252 | Najśw 253 | A 254 | B 255 | C 256 | D 257 | E 258 | F 259 | G 260 | H 261 | I 262 | J 263 | K 264 | L 265 | M 266 | N 267 | O 268 | P 269 | Q 270 | R 271 | S 272 | T 273 | U 274 | V 275 | W 276 | X 277 | Y 278 | Z 279 | Ś 280 | Ć 281 | Ż 282 | Ź 283 | Dz 284 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.pt: 
-------------------------------------------------------------------------------- 1 | #File adapted for PT by H. Leal Fontes from the EN & DE versions published with moses-2009-04-13. Last update: 10.11.2009. 2 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 3 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 4 | 5 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 6 | #usually upper case letters are initials in a name 7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | 61 | #Roman Numerals. A dot after one of these is not a sentence break in Portuguese. 62 | I 63 | II 64 | III 65 | IV 66 | V 67 | VI 68 | VII 69 | VIII 70 | IX 71 | X 72 | XI 73 | XII 74 | XIII 75 | XIV 76 | XV 77 | XVI 78 | XVII 79 | XVIII 80 | XIX 81 | XX 82 | i 83 | ii 84 | iii 85 | iv 86 | v 87 | vi 88 | vii 89 | viii 90 | ix 91 | x 92 | xi 93 | xii 94 | xiii 95 | xiv 96 | xv 97 | xvi 98 | xvii 99 | xviii 100 | xix 101 | xx 102 | 103 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 104 | Adj 105 | Adm 106 | Adv 107 | Art 108 | Ca 109 | Capt 110 | Cmdr 111 | Col 112 | Comdr 113 | Con 114 | Corp 115 | Cpl 116 | DR 117 | DRA 118 | Dr 119 | Dra 120 | Dras 121 | Drs 122 | Eng 123 | Enga 124 | Engas 125 | Engos 126 | Ex 127 | Exo 128 | Exmo 129 | Fig 130 | Gen 131 | Hosp 132 | Insp 133 | Lda 134 | MM 135 | MR 136 | MRS 137 | MS 138 | Maj 139 | Mrs 140 | Ms 141 | Msgr 142 | Op 143 | Ord 144 | Pfc 145 | Ph 146 | Prof 147 | Pvt 148 | Rep 149 | Reps 150 | Res 151 | Rev 152 | Rt 153 | Sen 154 | Sens 155 | Sfc 156 | Sgt 157 | Sr 158 | Sra 159 | Sras 160 | Srs 161 | Sto 162 | Supt 163 | Surg 164 | adj 165 | adm 166 | adv 167 | art 168 | cit 169 | col 170 | con 171 | corp 172 | cpl 173 | dr 174 | dra 175 | dras 176 | drs 177 | eng 178 | enga 179 | engas 180 | engos 181 | ex 182 | exo 183 | exmo 184 | fig 185 | op 186 | prof 187 | sr 188 | sra 189 | sras 190 | srs 191 | sto 192 | 193 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 194 | v 195 | vs 196 | i.e 197 | rev 198 | e.g 199 | 200 | #Numbers only. These should only induce breaks when followed by a numeric sequence 201 | # add NUMERIC_ONLY after the word for this function 202 | #This case is mostly for the english "No." 
which can either be a sentence of its own, or 203 | #if followed by a number, a non-breaking prefix 204 | No #NUMERIC_ONLY# 205 | Nos 206 | Art #NUMERIC_ONLY# 207 | Nr 208 | p #NUMERIC_ONLY# 209 | pp #NUMERIC_ONLY# 210 | 211 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.ro: -------------------------------------------------------------------------------- 1 | A 2 | B 3 | C 4 | D 5 | E 6 | F 7 | G 8 | H 9 | I 10 | J 11 | K 12 | L 13 | M 14 | N 15 | O 16 | P 17 | Q 18 | R 19 | S 20 | T 21 | U 22 | V 23 | W 24 | X 25 | Y 26 | Z 27 | dpdv 28 | etc 29 | șamd 30 | M.Ap.N 31 | dl 32 | Dl 33 | d-na 34 | D-na 35 | dvs 36 | Dvs 37 | pt 38 | Pt 39 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.ru: -------------------------------------------------------------------------------- 1 | # added Cyrillic uppercase letters [А-Я] 2 | # removed 000D carriage return (this is not removed by chomp in tokenizer.perl, and prevents recognition of the prefixes) 3 | # edited by Kate Young (nspaceanalysis@earthlink.net) 21 May 2013 4 | А 5 | Б 6 | В 7 | Г 8 | Д 9 | Е 10 | Ж 11 | З 12 | И 13 | Й 14 | К 15 | Л 16 | М 17 | Н 18 | О 19 | П 20 | Р 21 | С 22 | Т 23 | У 24 | Ф 25 | Х 26 | Ц 27 | Ч 28 | Ш 29 | Щ 30 | Ъ 31 | Ы 32 | Ь 33 | Э 34 | Ю 35 | Я 36 | A 37 | B 38 | C 39 | D 40 | E 41 | F 42 | G 43 | H 44 | I 45 | J 46 | K 47 | L 48 | M 49 | N 50 | O 51 | P 52 | Q 53 | R 54 | S 55 | T 56 | U 57 | V 58 | W 59 | X 60 | Y 61 | Z 62 | 0гг 63 | 1гг 64 | 2гг 65 | 3гг 66 | 4гг 67 | 5гг 68 | 6гг 69 | 7гг 70 | 8гг 71 | 9гг 72 | 0г 73 | 1г 74 | 2г 75 | 3г 76 | 4г 77 | 5г 78 | 6г 79 | 7г 80 | 8г 81 | 9г 82 | Xвв 83 | Vвв 84 | Iвв 85 | Lвв 86 | Mвв 87 | Cвв 88 | Xв 89 | Vв 90 | Iв 91 | Lв 92 | Mв 93 | Cв 94 | 0м 95 | 1м 96 | 2м 97 | 3м 98 | 4м 99 | 5м 100 | 6м 101 | 7м 102 | 8м 103 | 9м 104 | 0мм 105 | 1мм 106 | 2мм 107 | 3мм 108 | 4мм 109 | 5мм 110 | 6мм 111 | 7мм 112 | 8мм 113 | 9мм 114 | 0см 115 | 1см 116 | 2см 117 | 3см 118 | 4см 119 | 5см 120 | 6см 121 | 7см 122 | 8см 123 | 9см 124 | 0дм 125 | 1дм 126 | 2дм 127 | 3дм 128 | 4дм 129 | 5дм 130 | 6дм 131 | 7дм 132 | 8дм 133 | 9дм 134 | 0л 135 | 1л 136 | 2л 137 | 3л 138 | 4л 139 | 5л 140 | 6л 141 | 7л 142 | 8л 143 | 9л 144 | 0км 145 | 1км 146 | 2км 147 | 3км 148 | 4км 149 | 5км 150 | 6км 151 | 7км 152 | 8км 153 | 9км 154 | 0га 155 | 1га 156 | 2га 157 | 3га 158 | 4га 159 | 5га 160 | 6га 161 | 7га 162 | 8га 163 | 9га 164 | 0кг 165 | 1кг 166 | 2кг 167 | 3кг 168 | 4кг 169 | 5кг 170 | 6кг 171 | 7кг 172 | 8кг 173 | 9кг 174 | 0т 175 | 1т 176 | 2т 177 | 3т 178 | 4т 179 | 5т 180 | 6т 181 | 7т 182 | 8т 183 | 9т 184 | 0г 185 | 1г 186 | 2г 187 | 3г 188 | 4г 189 | 5г 190 | 6г 191 | 7г 192 | 8г 193 | 9г 194 | 0мг 195 | 1мг 196 | 2мг 197 | 3мг 198 | 4мг 199 | 5мг 200 | 6мг 201 | 7мг 202 | 8мг 203 | 9мг 204 | бульв 205 | в 206 | вв 207 | г 208 | га 209 | гг 210 | гл 211 | гос 212 | д 213 | дм 214 | доп 215 | др 216 | е 217 | ед 218 | ед 219 | зам 220 | и 221 | инд 222 | исп 223 | Исп 224 | к 225 | кап 226 | кг 227 | кв 228 | кл 229 | км 230 | кол 231 | комн 232 | коп 233 | куб 234 | л 235 | лиц 236 | лл 237 | м 238 | макс 239 | мг 240 | мин 241 | мл 242 | млн 243 | млрд 244 | мм 245 | н 246 | наб 247 | нач 248 | неуд 249 | ном 250 | о 251 | обл 252 | обр 253 | общ 254 | ок 255 | ост 256 | отл 257 | п 258 | пер 259 | перераб 260 | пл 261 | пос 262 | пр 263 | просп 264 | проф 265 | р 266 | ред 267 | руб 268 | с 269 | сб 270 | св 271 | см 
272 | соч 273 | ср 274 | ст 275 | стр 276 | т 277 | тел 278 | Тел 279 | тех 280 | тт 281 | туп 282 | тыс 283 | уд 284 | ул 285 | уч 286 | физ 287 | х 288 | хор 289 | ч 290 | чел 291 | шт 292 | экз 293 | э 294 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.sk: -------------------------------------------------------------------------------- 1 | Bc 2 | Mgr 3 | RNDr 4 | PharmDr 5 | PhDr 6 | JUDr 7 | PaedDr 8 | ThDr 9 | Ing 10 | MUDr 11 | MDDr 12 | MVDr 13 | Dr 14 | ThLic 15 | PhD 16 | ArtD 17 | ThDr 18 | Dr 19 | DrSc 20 | CSs 21 | prof 22 | obr 23 | Obr 24 | Č 25 | č 26 | absol 27 | adj 28 | admin 29 | adr 30 | Adr 31 | adv 32 | advok 33 | afr 34 | ak 35 | akad 36 | akc 37 | akuz 38 | et 39 | al 40 | alch 41 | amer 42 | anat 43 | angl 44 | Angl 45 | anglosas 46 | anorg 47 | ap 48 | apod 49 | arch 50 | archeol 51 | archit 52 | arg 53 | art 54 | astr 55 | astrol 56 | astron 57 | atp 58 | atď 59 | austr 60 | Austr 61 | aut 62 | belg 63 | Belg 64 | bibl 65 | Bibl 66 | biol 67 | bot 68 | bud 69 | bás 70 | býv 71 | cest 72 | chem 73 | cirk 74 | csl 75 | čs 76 | Čs 77 | dat 78 | dep 79 | det 80 | dial 81 | diaľ 82 | dipl 83 | distrib 84 | dokl 85 | dosl 86 | dopr 87 | dram 88 | duš 89 | dv 90 | dvojčl 91 | dór 92 | ekol 93 | ekon 94 | el 95 | elektr 96 | elektrotech 97 | energet 98 | epic 99 | est 100 | etc 101 | etonym 102 | eufem 103 | európ 104 | Európ 105 | ev 106 | evid 107 | expr 108 | fa 109 | fam 110 | farm 111 | fem 112 | feud 113 | fil 114 | filat 115 | filoz 116 | fi 117 | fon 118 | form 119 | fot 120 | fr 121 | Fr 122 | franc 123 | Franc 124 | fraz 125 | fut 126 | fyz 127 | fyziol 128 | garb 129 | gen 130 | genet 131 | genpor 132 | geod 133 | geogr 134 | geol 135 | geom 136 | germ 137 | gr 138 | Gr 139 | gréc 140 | Gréc 141 | gréckokat 142 | hebr 143 | herald 144 | hist 145 | hlav 146 | hosp 147 | hromad 148 | hud 149 | hypok 150 | ident 151 | i.e 152 | ident 153 | imp 154 | impf 155 | indoeur 156 | inf 157 | inform 158 | instr 159 | int 160 | interj 161 | inšt 162 | inštr 163 | iron 164 | jap 165 | Jap 166 | jaz 167 | jedn 168 | juhoamer 169 | juhových 170 | juhozáp 171 | juž 172 | kanad 173 | Kanad 174 | kanc 175 | kapit 176 | kpt 177 | kart 178 | katastr 179 | knih 180 | kniž 181 | komp 182 | konj 183 | konkr 184 | kozmet 185 | krajč 186 | kresť 187 | kt 188 | kuch 189 | lat 190 | latinskoamer 191 | lek 192 | lex 193 | lingv 194 | lit 195 | litur 196 | log 197 | lok 198 | max 199 | Max 200 | maď 201 | Maď 202 | medzinár 203 | mest 204 | metr 205 | mil 206 | Mil 207 | min 208 | Min 209 | miner 210 | ml 211 | mld 212 | mn 213 | mod 214 | mytol 215 | napr 216 | nar 217 | Nar 218 | nasl 219 | nedok 220 | neg 221 | negat 222 | neklas 223 | nem 224 | Nem 225 | neodb 226 | neos 227 | neskl 228 | nesklon 229 | nespis 230 | nespráv 231 | neved 232 | než 233 | niekt 234 | niž 235 | nom 236 | náb 237 | nákl 238 | námor 239 | nár 240 | obch 241 | obj 242 | obv 243 | obyč 244 | obč 245 | občian 246 | odb 247 | odd 248 | ods 249 | ojed 250 | okr 251 | Okr 252 | opt 253 | opyt 254 | org 255 | os 256 | osob 257 | ot 258 | ovoc 259 | par 260 | part 261 | pejor 262 | pers 263 | pf 264 | Pf 265 | P.f 266 | p.f 267 | pl 268 | Plk 269 | pod 270 | podst 271 | pokl 272 | polit 273 | politol 274 | polygr 275 | pomn 276 | popl 277 | por 278 | porad 279 | porov 280 | posch 281 | potrav 282 | použ 283 | poz 284 | pozit 285 | poľ 286 | poľno 287 | poľnohosp 288 | poľov 289 | pošt 290 | pož 291 | prac 292 | predl 293 | pren 294 | 
prep 295 | preuk 296 | priezv 297 | Priezv 298 | privl 299 | prof 300 | práv 301 | príd 302 | príj 303 | prík 304 | príp 305 | prír 306 | prísl 307 | príslov 308 | príč 309 | psych 310 | publ 311 | pís 312 | písm 313 | pôv 314 | refl 315 | reg 316 | rep 317 | resp 318 | rozk 319 | rozlič 320 | rozpráv 321 | roč 322 | Roč 323 | ryb 324 | rádiotech 325 | rím 326 | samohl 327 | semest 328 | sev 329 | severoamer 330 | severových 331 | severozáp 332 | sg 333 | skr 334 | skup 335 | sl 336 | Sloven 337 | soc 338 | soch 339 | sociol 340 | sp 341 | spol 342 | Spol 343 | spoloč 344 | spoluhl 345 | správ 346 | spôs 347 | st 348 | star 349 | starogréc 350 | starorím 351 | s.r.o 352 | stol 353 | stor 354 | str 355 | stredoamer 356 | stredoškol 357 | subj 358 | subst 359 | superl 360 | sv 361 | sz 362 | súkr 363 | súp 364 | súvzť 365 | tal 366 | Tal 367 | tech 368 | tel 369 | Tel 370 | telef 371 | teles 372 | telev 373 | teol 374 | trans 375 | turist 376 | tuzem 377 | typogr 378 | tzn 379 | tzv 380 | ukaz 381 | ul 382 | Ul 383 | umel 384 | univ 385 | ust 386 | ved 387 | vedľ 388 | verb 389 | veter 390 | vin 391 | viď 392 | vl 393 | vod 394 | vodohosp 395 | pnl 396 | vulg 397 | vyj 398 | vys 399 | vysokoškol 400 | vzťaž 401 | vôb 402 | vých 403 | výd 404 | výrob 405 | výsk 406 | výsl 407 | výtv 408 | výtvar 409 | význ 410 | včel 411 | vš 412 | všeob 413 | zahr 414 | zar 415 | zariad 416 | zast 417 | zastar 418 | zastaráv 419 | zb 420 | zdravot 421 | združ 422 | zjemn 423 | zlat 424 | zn 425 | Zn 426 | zool 427 | zr 428 | zried 429 | zv 430 | záhr 431 | zák 432 | zákl 433 | zám 434 | záp 435 | západoeur 436 | zázn 437 | územ 438 | účt 439 | čast 440 | čes 441 | Čes 442 | čl 443 | čísl 444 | živ 445 | pr 446 | fak 447 | Kr 448 | p.n.l 449 | A 450 | B 451 | C 452 | D 453 | E 454 | F 455 | G 456 | H 457 | I 458 | J 459 | K 460 | L 461 | M 462 | N 463 | O 464 | P 465 | Q 466 | R 467 | S 468 | T 469 | U 470 | V 471 | W 472 | X 473 | Y 474 | Z 475 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.sl: -------------------------------------------------------------------------------- 1 | dr 2 | Dr 3 | itd 4 | itn 5 | št #NUMERIC_ONLY# 6 | Št #NUMERIC_ONLY# 7 | d 8 | jan 9 | Jan 10 | feb 11 | Feb 12 | mar 13 | Mar 14 | apr 15 | Apr 16 | jun 17 | Jun 18 | jul 19 | Jul 20 | avg 21 | Avg 22 | sept 23 | Sept 24 | sep 25 | Sep 26 | okt 27 | Okt 28 | nov 29 | Nov 30 | dec 31 | Dec 32 | tj 33 | Tj 34 | npr 35 | Npr 36 | sl 37 | Sl 38 | op 39 | Op 40 | gl 41 | Gl 42 | oz 43 | Oz 44 | prev 45 | dipl 46 | ing 47 | prim 48 | Prim 49 | cf 50 | Cf 51 | gl 52 | Gl 53 | A 54 | B 55 | C 56 | D 57 | E 58 | F 59 | G 60 | H 61 | I 62 | J 63 | K 64 | L 65 | M 66 | N 67 | O 68 | P 69 | Q 70 | R 71 | S 72 | T 73 | U 74 | V 75 | W 76 | X 77 | Y 78 | Z 79 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.sv: -------------------------------------------------------------------------------- 1 | #single upper case letter are usually initials 2 | A 3 | B 4 | C 5 | D 6 | E 7 | F 8 | G 9 | H 10 | I 11 | J 12 | K 13 | L 14 | M 15 | N 16 | O 17 | P 18 | Q 19 | R 20 | S 21 | T 22 | U 23 | V 24 | W 25 | X 26 | Y 27 | Z 28 | #misc abbreviations 29 | AB 30 | G 31 | VG 32 | dvs 33 | etc 34 | from 35 | iaf 36 | jfr 37 | kl 38 | kr 39 | mao 40 | mfl 41 | mm 42 | osv 43 | pga 44 | tex 45 | tom 46 | vs 47 | -------------------------------------------------------------------------------- 
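The `#NUMERIC_ONLY#` marker used throughout these prefix lists (see the comments in the `.lv`, `.nl`, `.pt` and `.ta` files) is straightforward to consume programmatically. The sketch below is illustrative only — it is not the logic of `tokenizer.perl`, and the helper names `load_prefixes` and `breaks_sentence` are hypothetical:

```python
# Minimal sketch of how a sentence splitter might use a nonbreaking_prefix.* file.
# Assumption: plain entries never allow a break after "<entry>."; entries marked
# #NUMERIC_ONLY# only suppress the break when the next token starts with a digit.

def load_prefixes(path):
    """Return {prefix: is_numeric_only} parsed from a nonbreaking-prefix file."""
    prefixes = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue  # skip blank lines and comments
            if "#NUMERIC_ONLY#" in line:
                prefixes[line.split("#")[0].strip()] = True
            else:
                prefixes[line] = False
    return prefixes

def breaks_sentence(token, next_token, prefixes):
    """Does the period ending `token` terminate a sentence?"""
    if not token.endswith("."):
        return False
    word = token[:-1]
    if word in prefixes:
        if not prefixes[word]:            # ordinary prefix: never a break
            return False
        if next_token[:1].isdigit():      # NUMERIC_ONLY: no break before a number
            return False
    return True
```

With the Dutch list, for instance, `breaks_sentence("Nr.", "5", prefixes)` returns `False`, while `breaks_sentence("Nr.", "Hij", prefixes)` returns `True`, matching the "No."-style behaviour described in the comments above.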
/data/nonbreaking_prefixes/nonbreaking_prefix.ta: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | அ 7 | ஆ 8 | இ 9 | ஈ 10 | உ 11 | ஊ 12 | எ 13 | ஏ 14 | ஐ 15 | ஒ 16 | ஓ 17 | ஔ 18 | ஃ 19 | க 20 | கா 21 | கி 22 | கீ 23 | கு 24 | கூ 25 | கெ 26 | கே 27 | கை 28 | கொ 29 | கோ 30 | கௌ 31 | க் 32 | ச 33 | சா 34 | சி 35 | சீ 36 | சு 37 | சூ 38 | செ 39 | சே 40 | சை 41 | சொ 42 | சோ 43 | சௌ 44 | ச் 45 | ட 46 | டா 47 | டி 48 | டீ 49 | டு 50 | டூ 51 | டெ 52 | டே 53 | டை 54 | டொ 55 | டோ 56 | டௌ 57 | ட் 58 | த 59 | தா 60 | தி 61 | தீ 62 | து 63 | தூ 64 | தெ 65 | தே 66 | தை 67 | தொ 68 | தோ 69 | தௌ 70 | த் 71 | ப 72 | பா 73 | பி 74 | பீ 75 | பு 76 | பூ 77 | பெ 78 | பே 79 | பை 80 | பொ 81 | போ 82 | பௌ 83 | ப் 84 | ற 85 | றா 86 | றி 87 | றீ 88 | று 89 | றூ 90 | றெ 91 | றே 92 | றை 93 | றொ 94 | றோ 95 | றௌ 96 | ற் 97 | ய 98 | யா 99 | யி 100 | யீ 101 | யு 102 | யூ 103 | யெ 104 | யே 105 | யை 106 | யொ 107 | யோ 108 | யௌ 109 | ய் 110 | ர 111 | ரா 112 | ரி 113 | ரீ 114 | ரு 115 | ரூ 116 | ரெ 117 | ரே 118 | ரை 119 | ரொ 120 | ரோ 121 | ரௌ 122 | ர் 123 | ல 124 | லா 125 | லி 126 | லீ 127 | லு 128 | லூ 129 | லெ 130 | லே 131 | லை 132 | லொ 133 | லோ 134 | லௌ 135 | ல் 136 | வ 137 | வா 138 | வி 139 | வீ 140 | வு 141 | வூ 142 | வெ 143 | வே 144 | வை 145 | வொ 146 | வோ 147 | வௌ 148 | வ் 149 | ள 150 | ளா 151 | ளி 152 | ளீ 153 | ளு 154 | ளூ 155 | ளெ 156 | ளே 157 | ளை 158 | ளொ 159 | ளோ 160 | ளௌ 161 | ள் 162 | ழ 163 | ழா 164 | ழி 165 | ழீ 166 | ழு 167 | ழூ 168 | ழெ 169 | ழே 170 | ழை 171 | ழொ 172 | ழோ 173 | ழௌ 174 | ழ் 175 | ங 176 | ஙா 177 | ஙி 178 | ஙீ 179 | ஙு 180 | ஙூ 181 | ஙெ 182 | ஙே 183 | ஙை 184 | ஙொ 185 | ஙோ 186 | ஙௌ 187 | ங் 188 | ஞ 189 | ஞா 190 | ஞி 191 | ஞீ 192 | ஞு 193 | ஞூ 194 | ஞெ 195 | ஞே 196 | ஞை 197 | ஞொ 198 | ஞோ 199 | ஞௌ 200 | ஞ் 201 | ண 202 | ணா 203 | ணி 204 | ணீ 205 | ணு 206 | ணூ 207 | ணெ 208 | ணே 209 | ணை 210 | ணொ 211 | ணோ 212 | ணௌ 213 | ண் 214 | ந 215 | நா 216 | நி 217 | நீ 218 | நு 219 | நூ 220 | நெ 221 | நே 222 | நை 223 | நொ 224 | நோ 225 | நௌ 226 | ந் 227 | ம 228 | மா 229 | மி 230 | மீ 231 | மு 232 | மூ 233 | மெ 234 | மே 235 | மை 236 | மொ 237 | மோ 238 | மௌ 239 | ம் 240 | ன 241 | னா 242 | னி 243 | னீ 244 | னு 245 | னூ 246 | னெ 247 | னே 248 | னை 249 | னொ 250 | னோ 251 | னௌ 252 | ன் 253 | 254 | 255 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 256 | திரு 257 | திருமதி 258 | வண 259 | கௌரவ 260 | 261 | 262 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 263 | உ.ம் 264 | #கா.ம் 265 | #எ.ம் 266 | 267 | 268 | #Numbers only. These should only induce breaks when followed by a numeric sequence 269 | # add NUMERIC_ONLY after the word for this function 270 | #This case is mostly for the english "No." 
which can either be a sentence of its own, or 271 | #if followed by a number, a non-breaking prefix 272 | No #NUMERIC_ONLY# 273 | Nos 274 | Art #NUMERIC_ONLY# 275 | Nr 276 | pp #NUMERIC_ONLY# 277 | -------------------------------------------------------------------------------- /data/postprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # merges subword units that were split by BPE 4 | 5 | sed -r 's/\@\@ //g' -------------------------------------------------------------------------------- /data/preprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | P=$1 4 | 5 | # source language (example: fr) 6 | S=$2 7 | # target language (example: en) 8 | T=$3 9 | 10 | # path to nematus/data 11 | P1=$4 12 | 13 | # path to subword NMT scripts (can be downloaded from https://github.com/rsennrich/subword-nmt) 14 | P2=$5 15 | 16 | # tokenize 17 | perl $P1/tokenizer.perl -threads 5 -l $S < {P}.${S} > {P}.${S}.tok 18 | perl $P1/tokenizer.perl -threads 5 -l $T < {P}.${T} > {P}.${T}.tok 19 | 20 | # learn BPE on joint vocabulary: 21 | cat {P}.${S}.tok {P}.${T}.tok | python $P2/learn_bpe.py -s 20000 > ${S}${T}.bpe 22 | 23 | python3 $P2/apply_bpe.py -c ${S}${T}.bpe < {P}.${S}.tok > {P}.${S}.tok.bpe 24 | python3 $P2/apply_bpe.py -c ${S}${T}.bpe < {P}.${T}.tok > {P}.${T}.tok.bpe 25 | 26 | # build dictionary 27 | python3 $P1/build_dictionary.py {P}.${S}.tok.bpe 28 | python3 $P1/build_dictionary.py {P}.${T}.tok.bpe 29 | 30 | -------------------------------------------------------------------------------- /data/shuffle.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import math 4 | import os 5 | import random 6 | import sys 7 | import tempfile 8 | 9 | 10 | # TODO Make CHUNK_SIZE user configurable? 11 | CHUNK_SIZE = 10000000 # Number of lines. 12 | 13 | def jointly_shuffle_files(files, temporary=False): 14 | """Randomly shuffle the given files, applying the same permutation to each. 15 | 16 | Since the same permutation is applied to all input files, they must 17 | contain the same number of input lines. 18 | 19 | If 'temporary' is True then the shuffled files are written to temporary 20 | files. Otherwise, the shuffled files are written to files with the same 21 | paths as the originals, but with the added suffix '.shuf'. 22 | 23 | In addition to shuffling the files, any leading or trailing whitespace is 24 | removed from each line. 25 | 26 | In order to handle large files, the input files are not read into memory 27 | in full, but instead are read in chunks of size CHUNK_SIZE. 28 | 29 | Args: 30 | files: a list of strings specifying the paths of the input files. 31 | temporary: a Boolean (see description above). 32 | 33 | Returns: 34 | A list containing a file object for each shuffled file, in the same 35 | order as the input files. Each file object is open and positioned at 36 | the start of the file. 37 | """ 38 | 39 | # Determine the number of lines (should be the same for all files). 40 | total_lines = 0 41 | for _ in open(files[0]): 42 | total_lines += 1 43 | 44 | # Randomly permute the list of line numbers. 45 | perm = list(range(total_lines)) 46 | random.shuffle(perm) 47 | 48 | # Convert the list of line numbers to a list of chunk indices and offsets. 49 | ordering = [(i // CHUNK_SIZE, i % CHUNK_SIZE) for i in perm] 50 | 51 | # Sort each file according to the generated ordering. 
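    # Illustration of the (chunk, offset) encoding above, using a hypothetical
    # CHUNK_SIZE of 3 instead of 10000000: a permutation [4, 0, 2, 1, 3] over five
    # lines becomes ordering = [(1, 1), (0, 0), (0, 2), (0, 1), (1, 0)], telling
    # _sort_file which chunk and offset each output line should be copied from.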
52 | return [_sort_file(path, ordering, temporary) for path in files] 53 | 54 | 55 | def _sort_file(path, ordering, temporary): 56 | 57 | # Open a temporary file for each chunk. 58 | 59 | num_chunks = math.ceil(len(ordering) / CHUNK_SIZE) 60 | dirname, filename = os.path.split(os.path.realpath(path)) 61 | chunk_files = [tempfile.TemporaryFile(prefix=filename+'.chunk'+str(i), 62 | dir=dirname, mode='w+', 63 | encoding="UTF-8") 64 | for i in range(num_chunks)] 65 | 66 | # Read one chunk at a time from path and write the lines to the temporary 67 | # files in the order specified by ordering. 68 | 69 | def _write_chunk_in_order(chunk, chunk_num, out_file): 70 | for i, j in ordering: 71 | if i == chunk_num: 72 | out_file.write(chunk[j] + '\n') 73 | 74 | chunk = [] 75 | chunk_num = 0 76 | for i, line in enumerate(open(path)): 77 | if i > 0 and (i % CHUNK_SIZE) == 0: 78 | _write_chunk_in_order(chunk, chunk_num, chunk_files[chunk_num]) 79 | chunk = [] 80 | chunk_num += 1 81 | chunk.append(line.strip()) 82 | if chunk: 83 | _write_chunk_in_order(chunk, chunk_num, chunk_files[chunk_num]) 84 | 85 | # Open the output file. 86 | if temporary: 87 | out_file = tempfile.TemporaryFile(prefix=filename+'.shuf', dir=dirname, 88 | mode='w+', encoding='UTF-8') 89 | else: 90 | out_file = open(path+'.shuf', mode='w', encoding='UTF-8') 91 | 92 | # Seek to the start of the chunk files. 93 | for chunk_file in chunk_files: 94 | chunk_file.seek(0) 95 | 96 | # Write the output. 97 | for i, _ in ordering: 98 | line = chunk_files[i].readline() 99 | out_file.write(line) 100 | 101 | # Seek to the start so that the file object is ready for reading. 102 | out_file.seek(0) 103 | 104 | return out_file 105 | 106 | 107 | if __name__ == '__main__': 108 | jointly_shuffle_files(sys.argv[1:]) 109 | -------------------------------------------------------------------------------- /data/strip_sgml.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import re 3 | 4 | 5 | def main(): 6 | fin = sys.stdin 7 | fout = sys.stdout 8 | for l in fin: 9 | line = l.strip() 10 | text = re.sub('<[^<]+>', "", line).strip() 11 | if len(text) == 0: 12 | continue 13 | print(text, file=fout) 14 | 15 | 16 | if __name__ == "__main__": 17 | main() 18 | 19 | -------------------------------------------------------------------------------- /doc/factored_neural_machine_translation.md: -------------------------------------------------------------------------------- 1 | FACTORED NEURAL MACHINE TRANSLATION 2 | ----------------------------------- 3 | 4 | Nematus supports arbitrary input features through factored representations, similar to factored models popularized with Moses. 5 | This can be used to add linguistic features such as lemmas, POS, or dependency labels, or potentially other types of information. 6 | The pipe symbol "|" serves as a factor separator and should not otherwise appear in the text. 7 | 8 | To use factored models, follow these steps: 9 | 10 | - preprocess the source side of the training, development and test data to include factors. Consider this example sentence, in an unfactored (or 1-factored) representation, and with 4 factors per word: 11 | 12 | Leonidas begged in the arena . 
13 | 14 | Leonidas|Leonidas|NNP|nsubj begged|beg|VBD|root in|in|IN|prep the|the|DT|det gladiatorial|gladiatorial|JJ|amod arena|arena|NN|pobj 15 | 16 | https://github.com/rsennrich/wmt16-scripts/tree/master/factored_sample provides sample scripts to produce a factored representation from a CoNLL file, and BPE-segmented text. 17 | 18 | - in the arguments to nematus.nmt.train, adjust the following options: 19 | - factors: the number of factors per word 20 | - dim_per_factor: the size of the embedding layer for each factor (a list of integers) 21 | - dim_word: the total size of the input embedding (must match the sum of dim_per_factor) 22 | - dictionaries: add a vocabulary file for each factor (in the order they appear), plus a vocabulary file for the target side 23 | 24 | an example config is shown at https://github.com/rsennrich/wmt16-scripts/blob/master/factored_sample/config.py 25 | 26 | - commands for training and running Nematus are otherwise identical to the non-factored version 27 | 28 | 29 | PUBLICATIONS 30 | ------------ 31 | 32 | factored neural machine translation is described in: 33 | 34 | Sennrich, Rico, Haddow, Barry (2016): Linguistic Input Features Improve Neural Machine Translation, Proc. of the First Conference on Machine Translation (WMT16). Berlin, Germany -------------------------------------------------------------------------------- /doc/multi_gpu_training.md: -------------------------------------------------------------------------------- 1 | Multi-GPU Training with Nematus 2 | ------------------------------- 3 | 4 | Nematus supports multi-GPU training; this shows how to make the best use of it. 5 | 6 | Controlling devices: 7 | -------------------- 8 | 9 | by default, Nematus will split training across all available devices. 10 | To control which device(s) to use for training, use `CUDA_VISIBLE_DEVICES`. 11 | 12 | For example, this command uses the first two devices: 13 | 14 | ``` 15 | CUDA_VISIBLE_DEVICES=0,1 python3 nematus/train.py 16 | ``` 17 | 18 | Update strategy and batch size: 19 | ------------------------------- 20 | 21 | Nematus will perform an update after a fixed number of sentences (`--batch_size`) or tokens (rounded down to full sentences; `--token_batch_size`). If both are defined, `--token_batch_size` takes priority. 22 | 23 | When training on multiple devices, Nematus uses Synchronous SGD, and sentences in a batch are split between GPUs. 24 | We choose this strategy for transparency. In principle, if training a model on the same data with the same command line parameters, 25 | you should get similar results (except for random variation), even if systems are trained on different (number of) GPUs. 26 | 27 | Generally, you should choose a large batch size to benefit from multi-GPU training and stabilize training of Transformers. 28 | Our [baseline configuration](https://github.com/EdinburghNLP/wmt17-transformer-scripts/blob/master/training/scripts/train.sh) uses a `token_batch_size` of 16384, 29 | and was tested on 4 GPUs with 12GB of memory each. 30 | 31 | If you want to train a model with a batch size between updates that exceeds the memory available on your devices (because you are limited in the size and/or number of GPUs), 32 | Nematus supports two ways of further splitting up the batch. 33 | 34 | - define `--max_sentences_per_device` or `--max_tokens_per_device`. This is the size of the batch that is processed on a single device at once. Batches are accumulated until reaching the total batch size. 
For example, defining `--max_tokens_per_device 4096` should ensure that the Transformer baseline will train successfully with 1-4 GPUs without running out of memory. 35 | - define `--gradient_aggregation_steps`. This will split the minibatch that is sent to a device into X steps, and the gradients from all steps are accumulated. For example, defining `--gradient_aggregation_steps 4` on a training run with 1 device should result in the same memory consumption as `--gradient_aggregation_steps 1` with 4 devices. 36 | -------------------------------------------------------------------------------- /nematus/.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.pyc 3 | -------------------------------------------------------------------------------- /nematus/__init__.py: -------------------------------------------------------------------------------- 1 | from nematus import * 2 | from . import rescore 3 | from . import translate 4 | -------------------------------------------------------------------------------- /nematus/exception.py: -------------------------------------------------------------------------------- 1 | class Error(Exception): 2 | def __init__(self, msg): 3 | self.msg = msg 4 | -------------------------------------------------------------------------------- /nematus/exponential_smoothing.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | # How often to update smoothed variables (in terms of training steps). 5 | DEFAULT_UPDATE_FREQUENCY = 5 6 | 7 | 8 | class ExponentialSmoothing(object): 9 | """Defines TensorFlow variables and operations for exponential smoothing. 10 | 11 | Following Marian [1], we maintain smoothed versions of all trainable 12 | variables. This class creates the smoothed variables (assuming that the 13 | model has already been initialized) and provides operations that can be 14 | run to update the variables and to interchange the values of the raw and 15 | the smoothed variables (which can be used to swap-in the smoothed versions 16 | for validation, for instance). 17 | 18 | Ideally, the smoothed variables would be updated after every training step, 19 | but in practice that introduces a noticeable overhead (around 20%) 20 | due to the need to transfer tensor values from GPU memory into CPU memory. 21 | Instead we allow updating after every N steps by increasing the smoothing 22 | factor accordingly. The default N=5 seems to be a good compromise. 23 | 24 | [1] 25 | "Marian: Fast Neural Machine Translation in C++", 26 | Junczys-Dowmunt et al., in Proceedings of ACL 2018, System Demonstrations. 27 | """ 28 | 29 | def __init__(self, smoothing_factor, 30 | update_frequency=DEFAULT_UPDATE_FREQUENCY): 31 | """Creates TF variables and operations. 32 | 33 | Args: 34 | smoothing_factor: float controlling weight of past vs new values. 35 | update_frequency: integer indicating how often updates will occur. 36 | """ 37 | self._update_frequency = update_frequency 38 | adjusted_smoothing_factor = smoothing_factor * update_frequency 39 | # Smoothed variables are stored in CPU memory to avoid eating into 40 | # valuable GPU memory. 41 | device_spec = tf.DeviceSpec(device_type="CPU", device_index=0) 42 | with tf.device(device_spec): 43 | # Create variables to hold the smoothed versions of all trainable 44 | # variables. 
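            # The update op built below implements
            #     s <- (1 - f*N) * s + (f*N) * v
            # with f = smoothing_factor and N = update_frequency. With purely
            # illustrative values f = 1e-4 and N = 5, every fifth step applies an
            # effective factor of 5e-4, approximating a per-step factor of 1e-4.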
45 | smooth_vars = {} 46 | for v in tf.compat.v1.trainable_variables(): 47 | assert v.name[-2:] == ":0" 48 | name = v.name[:-2] + "_smooth" 49 | s = tf.compat.v1.get_variable(name=name, 50 | initializer=tf.zeros_like(v), 51 | trainable=False, 52 | use_resource=True) 53 | smooth_vars[v.name] = s 54 | # Define the ops to update the smoothed variables. 55 | self._update_ops = [] 56 | for v in tf.compat.v1.trainable_variables(): 57 | s = smooth_vars[v.name] 58 | updated_s = (1 - adjusted_smoothing_factor) * s \ 59 | + adjusted_smoothing_factor * v 60 | self._update_ops += [tf.compat.v1.assign(s, updated_s)] 61 | # Define the ops to swap the raw and smoothed variables. 62 | self._swap_ops = [] 63 | for v in tf.compat.v1.trainable_variables(): 64 | s = smooth_vars[v.name] 65 | v_value = v.read_value() 66 | s_value = s.read_value() 67 | with tf.control_dependencies([v_value, s_value]): 68 | self._swap_ops += [v.assign(s_value)] 69 | self._swap_ops += [s.assign(v_value)] 70 | 71 | @property 72 | def update_ops(self): 73 | return self._update_ops 74 | 75 | @property 76 | def swap_ops(self): 77 | return self._swap_ops 78 | 79 | @property 80 | def update_frequency(self): 81 | return self._update_frequency 82 | -------------------------------------------------------------------------------- /nematus/initializers.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Parameter initializers 3 | ''' 4 | 5 | import numpy 6 | 7 | def ortho_weight(ndim): 8 | W = numpy.random.randn(ndim, ndim) 9 | u, s, v = numpy.linalg.svd(W) 10 | return u.astype('float32') 11 | 12 | def norm_weight(nin, nout=None, scale=0.01, ortho=True): 13 | if nout is None: 14 | nout = nin 15 | if nout == nin and ortho: 16 | W = ortho_weight(nin) 17 | else: 18 | W = scale * numpy.random.randn(nin, nout) 19 | return W.astype('float32') 20 | 21 | -------------------------------------------------------------------------------- /nematus/learning_schedule.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | class ConstantSchedule(object): 5 | """Implements a trivial learning schedule with a fixed learning rate.""" 6 | 7 | def __init__(self, learning_rate): 8 | """Builds TF graph nodes defining the learning rate function. 9 | 10 | Args: 11 | learning_rate: a float specifying the learning rate. 12 | """ 13 | self._learning_rate = tf.constant(learning_rate) 14 | 15 | @property 16 | def learning_rate(self): 17 | return self._learning_rate 18 | 19 | 20 | class TransformerSchedule(object): 21 | """Implements the learning schedule from the original Transformer paper. 22 | 23 | See Section 5.3 of "Attention Is All You Need" (Vaswani et al., 2017). 24 | """ 25 | 26 | def __init__(self, global_step, dim, warmup_steps): 27 | """Builds TF graph nodes defining the learning rate function. 28 | 29 | Args: 30 | global_step: a tf.Variable containing the current update step. 31 | dim: an integer specifying the model's hidden state size. 32 | warmup_steps: an integer specifying the number of warm-up steps. 33 | """ 34 | t = tf.cast(global_step+1, tf.float32) 35 | a = tf.pow(t, -0.5) 36 | b = t * (warmup_steps ** (-1.5)) 37 | self._learning_rate = dim ** (-0.5) * tf.minimum(a, b) 38 | 39 | @property 40 | def learning_rate(self): 41 | return self._learning_rate 42 | 43 | 44 | class WarmupPlateauDecaySchedule(object): 45 | """Implements a parameterized warm-up / plateau / decay learning schedule. 
46 | 47 | The schedule begins with a warm-up phase where the learning rate is 48 | linearly increased from zero to the peak learning rate. The rate is then 49 | held constant for a pre-defined period (possibly zero steps, making this 50 | phase optional). Finally the rate is decayed (currently according to an 51 | inverse square-root function, but this could be made configurable in the 52 | future). 53 | """ 54 | 55 | def __init__(self, global_step, peak_learning_rate, warmup_steps, 56 | plateau_steps): 57 | """Builds TF graph nodes defining the learning rate function. 58 | 59 | Args: 60 | global_step: a tf.Variable containing the current update step. 61 | peak_learning_rate: a float specifying the peak learning rate. 62 | warmup_steps: an integer specifying the number of warm-up steps. 63 | plateau_steps: an integer specifying the number of plateau steps. 64 | """ 65 | t = tf.cast(global_step+1, tf.float32) 66 | warmup_float = tf.cast(warmup_steps, tf.float32) 67 | # Function a: warmup 68 | a = (t / warmup_float) * peak_learning_rate 69 | # Function b: plateau 70 | b = peak_learning_rate 71 | # Function c: decay 72 | decay_start = warmup_float + plateau_steps 73 | c = (tf.sqrt(decay_start) / tf.sqrt(t)) * peak_learning_rate 74 | # Take the minimum of a, b, and c. This will be a for t < warmup_steps, 75 | # c for t > decay_start, and b in-between. 76 | self._learning_rate = tf.minimum(tf.minimum(a, b), c) 77 | 78 | @property 79 | def learning_rate(self): 80 | return self._learning_rate 81 | -------------------------------------------------------------------------------- /nematus/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdinburghNLP/nematus/49d050863bc9644b8c0a9d9ab6e54ccd30f927dd/nematus/metrics/__init__.py -------------------------------------------------------------------------------- /nematus/metrics/beer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import subprocess, threading 5 | 6 | # ModuleNotFoundError is new in 3.6; older versions will throw SystemError 7 | if sys.version_info < (3, 6): 8 | ModuleNotFoundError = SystemError 9 | 10 | try: 11 | from .scorer import Scorer 12 | from .reference import Reference 13 | except (ModuleNotFoundError, ImportError) as e: 14 | from metrics.scorer import Scorer 15 | from metrics.reference import Reference 16 | 17 | 18 | class BeerError(Exception): 19 | def __init__(self, value): 20 | self.value = value 21 | def __str__(self): 22 | return repr(self.value) 23 | 24 | class BeerScorer(Scorer): 25 | """ 26 | Python wrapper for the BEER metric. Starts a BEER process and keeps it alive, so that the model 27 | can be kept in memeory. Arguments are the BEER language abbreviation and the path to the BEER 28 | installation. They need to be specified as follows:"beer_language=lg,beer_path=path" (any order). 
29 | """ 30 | def __init__(self, argument_string): 31 | Scorer.__init__(self, argument_string) 32 | 33 | #Lock for the BEER process, which can only handle one request at a time: 34 | self.lock = threading.Lock() 35 | 36 | #Get necessary arguments for starting BEER from argument string parsed in Scorer.__init__() 37 | self._beer_language = self._arguments["beer_language"] 38 | self._beer_path = self._arguments["beer_path"] + "/" 39 | 40 | #Start a BEER process: 41 | command = self._beer_path+"beer -l "+self._beer_language+" --workingMode interactive " 42 | self.beer_process = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) 43 | 44 | def set_reference(self, reference_tokens): 45 | """ 46 | Construct a BeerReference from a sequence of tokens and make it the reference against which the scorer evaluates hypotheses. 47 | This can be done any time. 48 | """ 49 | self.lock.acquire() 50 | self._reference = BeerReference(reference_tokens, self) 51 | self.lock.release() 52 | 53 | def terminate_process(self): 54 | """ 55 | Waits for the current request to be processed and terminates the BEER process. 56 | """ 57 | self.lock.acquire() 58 | self.beer_process.terminate() 59 | self.lock.release() 60 | 61 | def kill_process(self): 62 | """ 63 | Kills the BEER process right away. 64 | """ 65 | self.beer_process.kill() 66 | 67 | class BeerReference(Reference): 68 | """ 69 | BEER reference object, against which hypotheses can be scored. 70 | """ 71 | def __init__(self, reference_tokens, beer_scorer): 72 | Reference.__init__(self, reference_tokens) 73 | 74 | #Construct reference string from tokens 75 | self._reference_string = " ".join(reference_tokens) 76 | self._beer_scorer = beer_scorer 77 | 78 | def score(self, hypothesis_tokens): 79 | 80 | #Construct hypothesis string from hypothesis tokens: 81 | hypothesis_string = " ".join(hypothesis_tokens) 82 | 83 | #Acquire lock to make sure BEER process is not in use: 84 | self._beer_scorer.lock.acquire() 85 | 86 | #Score hypothesis string against reference string 87 | try: 88 | self._beer_scorer.beer_process.stdin.write("EVAL ||| "+hypothesis_string+" ||| "+self._reference_string+"\n") 89 | except: 90 | raise BeerError("Beer returned the following error: "+ self._beer_scorer.beer_process.stderr.readline().strip()) 91 | 92 | #Read feature values from process output 93 | std_out = self._beer_scorer.beer_process.stdout.readline() 94 | #Release the process lock 95 | self._beer_scorer.lock.release() 96 | 97 | #Check if BEER returned a score: 98 | try: 99 | n = float(std_out) 100 | except: 101 | raise BeerError("Beer returned the following error: "+ self._beer_scorer.beer_process.stderr.readline().strip()) 102 | #Return final score 103 | return n 104 | -------------------------------------------------------------------------------- /nematus/metrics/chrf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | # ModuleNotFoundError is new in 3.6; older versions will throw SystemError 6 | if sys.version_info < (3, 6): 7 | ModuleNotFoundError = SystemError 8 | 9 | try: 10 | from .scorer import Scorer 11 | from .reference import Reference 12 | except (ModuleNotFoundError, ImportError) as e: 13 | from metrics.scorer import Scorer 14 | from metrics.reference import Reference 15 | 16 | class CharacterFScorer(Scorer): 17 | """ 18 | Scores CharacterFScoreReference objects. 
19 | """ 20 | 21 | def __init__(self, argument_string): 22 | """ 23 | Initialises metric-specific parameters. 24 | """ 25 | Scorer.__init__(self, argument_string) 26 | # use character n-gram order of 4 by default 27 | if not 'n' in list(self._arguments.keys()): 28 | self._arguments['n'] = 6 29 | # use beta = 1 by default (recommendation by Maja Popovic for generative modelling) 30 | if not 'beta' in list(self._arguments.keys()): 31 | self._arguments['beta'] = 1 32 | 33 | def set_reference(self, reference_tokens): 34 | """ 35 | Sets the reference against hypotheses are scored. 36 | """ 37 | self._reference = CharacterFScoreReference( 38 | reference_tokens, 39 | self._arguments['n'], 40 | self._arguments['beta'] 41 | ) 42 | 43 | class CharacterFScoreReference(Reference): 44 | """ 45 | References for Character F-Score, as proposed by Popovic (2015): http://www.statmt.org/wmt15/pdf/WMT49.pdf 46 | """ 47 | 48 | def __init__(self, reference_tokens, n=6, beta=1): 49 | """ 50 | @param reference the reference translation that hypotheses shall be 51 | scored against. 52 | @param n maximum character n-gram order to consider. 53 | @param beta algorithm paramater beta (interpolation weight, needs to be > 0). 54 | """ 55 | if beta <= 0: 56 | raise ValueError("Value of beta needs to be larger than zero!") 57 | 58 | Reference.__init__(self, reference_tokens) 59 | self.n = n 60 | self.max_order = n 61 | self.beta_squared = beta ** 2 62 | 63 | # The paper specifies that whitespace is ignored, but for a training objective, 64 | #it's perhaps better to leave it in. According to the paper, it makes no 65 | #difference in practise for scoring. 66 | self._reference_string = " ".join(reference_tokens).strip() 67 | 68 | # Get n-grams from reference: 69 | self._reference_ngrams = self._get_ngrams(self._reference_string, self.n) 70 | 71 | def _get_ngrams(self, tokens, n): 72 | """ 73 | Extracts all n-grams up to order @param n from a list of @param tokens. 74 | """ 75 | n_grams_dict = {} 76 | length = len(tokens) 77 | #If the reference is shorter than n characters, insist on an exact match: 78 | if len(tokens) < n: 79 | self.max_order = len(tokens) 80 | m = 1 81 | while m <= n: #n-gram order 82 | i = m 83 | n_grams_list = [] 84 | order_dict = {} 85 | while (i <= length): 86 | n_grams_list.append(tokens[i-m:i]) 87 | i += 1 88 | for ngr in n_grams_list: 89 | order_dict[ngr] = order_dict.setdefault(ngr,0) + 1 90 | n_grams_dict[m] = order_dict 91 | m += 1 92 | return n_grams_dict 93 | 94 | def score(self, hypothesis_tokens): 95 | """ 96 | Scores @param hypothesis against this reference. 97 | 98 | @return the sentence-level ChrF score: 1.0 is best, 0.0 worst. 99 | """ 100 | #See comment above on treating whitespace. 
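        # Worked example (mirrors test_chrf.test_half_right): reference "AB" vs.
        # hypothesis "AA" with n=6, beta=3 yields chrP = chrR = 0.25, so
        # (1 + beta^2) * chrP * chrR / (beta^2 * chrP + chrR) = 0.25.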
101 | hypothesis_string = " ".join(hypothesis_tokens).strip() 102 | 103 | #If the hypothesis or the reference is empty, insist on an exact match: 104 | if len(self._reference_string) < 1 or len(hypothesis_string) < 1: 105 | if hypothesis_string == self._reference_string: 106 | return 1.0 107 | else: 108 | return 0.0 109 | 110 | hypothesis_ngrams = self._get_ngrams(hypothesis_string, self.n) 111 | 112 | #Calculate character precision: 113 | chrP = 0.0 114 | chrR = 0.0 115 | for m in range(1,self.n+1): 116 | hyp_count = 0.0 117 | count_total = 0.0 118 | count_in = 0.0 119 | for ngr in hypothesis_ngrams[m]: 120 | hyp_count = hypothesis_ngrams[m][ngr] 121 | count_total += hyp_count 122 | if ngr in self._reference_ngrams[m]: 123 | count_in += min(hyp_count, self._reference_ngrams[m][ngr]) 124 | #Catch division by zero: 125 | if count_total == 0.0: 126 | chrP += 0.0 127 | else: 128 | chrP += count_in / count_total 129 | #average chrP over n-gram orders: 130 | chrP = chrP / float(self.max_order) 131 | 132 | #Calculate character recall: 133 | for m in range(1,self.n+1): 134 | ref_count = 0.0 135 | count_total = 0.0 136 | count_in = 0.0 137 | for ngr in self._reference_ngrams[m]: 138 | ref_count = self._reference_ngrams[m][ngr] 139 | count_total += ref_count 140 | if ngr in hypothesis_ngrams[m]: 141 | count_in += min(ref_count, hypothesis_ngrams[m][ngr]) 142 | #Catch division by zero: 143 | if count_total == 0.0: 144 | chrR += 0.0 145 | else: 146 | chrR += count_in/count_total 147 | #average chrR over n-gram orders: 148 | chrR = chrR / float(self.max_order) 149 | 150 | #Catch division by zero: 151 | if chrP == 0.0 and chrR == 0.0: 152 | return 0.0 153 | return (1 + self.beta_squared) * (chrP*chrR) / ((self.beta_squared * chrP) + chrR) 154 | -------------------------------------------------------------------------------- /nematus/metrics/meteor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import subprocess, threading 5 | 6 | # ModuleNotFoundError is new in 3.6; older versions will throw SystemError 7 | if sys.version_info < (3, 6): 8 | ModuleNotFoundError = SystemError 9 | 10 | try: 11 | from .scorer import Scorer 12 | from .reference import Reference 13 | except (ModuleNotFoundError, ImportError) as e: 14 | from metrics.scorer import Scorer 15 | from metrics.reference import Reference 16 | 17 | class MeteorError(Exception): 18 | def __init__(self, value): 19 | self.value = value 20 | def __str__(self): 21 | return repr(self.value) 22 | 23 | class MeteorScorer(Scorer): 24 | """ 25 | Python wrapper for the METEOR metric. Starts a METEOR process and keeps it alive, so that the model 26 | can be kept in memeory. Arguments are the meteor language abbreviation and the path to the METEOR 27 | installation. They need to be specified as follows:"meteor_language=lg,meteor_path=path" (any order). 
28 | """ 29 | def __init__(self, argument_string): 30 | Scorer.__init__(self, argument_string) 31 | 32 | #Lock for the METEOR process, which can only handle one request at a time: 33 | self.lock = threading.Lock() 34 | 35 | #Get necessary arguments for starting METEOR from argument string parsed in Scorer.__init__() 36 | self._meteor_language = self._arguments["meteor_language"] 37 | self._meteor_path = self._arguments["meteor_path"] + "/" 38 | 39 | #Start a METEOR process: 40 | command = "java -Xmx2G -jar "+self._meteor_path+"meteor-*.jar - - -l "+self._meteor_language+" -stdio" 41 | self.meteor_process = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) 42 | 43 | def set_reference(self, reference_tokens): 44 | """ 45 | Construct a MeteorReference from a sequence of tokens and make it the reference against which the scorer evaluates hypotheses. 46 | This can be done any time. 47 | """ 48 | self.lock.acquire() 49 | self._reference = MeteorReference(reference_tokens, self) 50 | self.lock.release() 51 | 52 | def terminate_process(self): 53 | """ 54 | Waits for the current request to be processed and terminates the METEOR process. 55 | """ 56 | self.lock.acquire() 57 | self.meteor_process.terminate() 58 | self.lock.release() 59 | 60 | def kill_process(self): 61 | """ 62 | Kills the METEOR process right away. 63 | """ 64 | self.meteor_process.kill() 65 | 66 | class MeteorReference(Reference): 67 | """ 68 | METEOR reference object, against which hypotheses can be scored. 69 | """ 70 | def __init__(self, reference_tokens, meteor_scorer): 71 | Reference.__init__(self, reference_tokens) 72 | 73 | #Construct reference string from tokens 74 | self._reference_string = " ".join(reference_tokens) 75 | self._meteor_scorer = meteor_scorer 76 | 77 | def score(self, hypothesis_tokens): 78 | 79 | #Construct hypothesis string from hypothesis tokens: 80 | hypothesis_string = " ".join(hypothesis_tokens) 81 | 82 | #Acquire lock to make sure METEOR process is not in use: 83 | self._meteor_scorer.lock.acquire() 84 | 85 | #Score hypothesis string against reference string 86 | try: 87 | self._meteor_scorer.meteor_process.stdin.write("SCORE ||| "+self._reference_string+" ||| "+hypothesis_string+"\n") 88 | except: 89 | raise MeteorError("Meteor returned the following error: "+ self._meteor_scorer.meteor_process.stderr.readline().strip()) 90 | 91 | #Read feature values from process output 92 | std_out = self._meteor_scorer.meteor_process.stdout.readline() 93 | 94 | #Pass feature values to METEOR process for computation of the final score 95 | try: 96 | self._meteor_scorer.meteor_process.stdin.write("EVAL ||| "+std_out) 97 | except: 98 | raise MeteorError("Meteor returned the following error: "+ self._meteor_scorer.meteor_process.stderr.readline().strip()) 99 | std_out = self._meteor_scorer.meteor_process.stdout.readline() 100 | 101 | #Release the process lock 102 | self._meteor_scorer.lock.release() 103 | 104 | #Check if Meteor returned a score: 105 | try: 106 | n = float(std_out) 107 | except: 108 | raise MeteorError("Meteor returned the following error: "+ self._meteor_scorer.meteor_process.stderr.readline().strip()) 109 | 110 | #Return final score 111 | return n 112 | -------------------------------------------------------------------------------- /nematus/metrics/reference.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from abc import ABCMeta, abstractmethod 4 | 5 | class 
Reference(metaclass=ABCMeta): 6 | """ 7 | Abstract base class for re-usable translation reference. Hypotheses can be 8 | scored against this reference through the evaluation metric implemented in 9 | its `score` function. 10 | """ 11 | 12 | def __init__(self, reference_tokens): 13 | """ 14 | @param reference the reference translation that hypotheses shall be 15 | scored against. 16 | """ 17 | self._reference_tokens = reference_tokens 18 | #additional (metric-specific) parameters to be defined in subclass 19 | 20 | @abstractmethod 21 | def score(self, hypothesis_tokens): 22 | """ 23 | Scores @param hypothesis against this reference. 24 | """ 25 | pass #to be implemented in sublcass 26 | 27 | def score_matrix(self, hypothesis_matrix): 28 | """ 29 | Scores every hypothesis in @param hypotheses against this reference. 30 | @param hypothesis_matrix an iterable of iterables of tokens. 31 | """ 32 | return [self.score(hypothesis_tokens) for hypothesis_tokens in hypothesis_matrix] 33 | -------------------------------------------------------------------------------- /nematus/metrics/scorer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from abc import ABCMeta, abstractmethod 4 | 5 | class Scorer(metaclass=ABCMeta): 6 | """ 7 | Abstract base class for MT evaluation metric. Can be passed on to a 8 | Reference for scoring translation hypotheses. 9 | """ 10 | 11 | def __init__(self, argument_string): 12 | """ 13 | @param argument_string the metric-specific parameters (such as n-gram 14 | order for BLEU, language for METEOR, etc.) 15 | """ 16 | # parse arguments 17 | self._reference = None # to be set via `self.set_reference()` 18 | self._arguments = {} 19 | if argument_string: 20 | argument_strings = argument_string.split(",") 21 | for a in argument_strings: 22 | argument, value = a.split("=") 23 | argument = argument.strip() 24 | value = value.strip() 25 | try: 26 | value = int(value) # change type to int if applicable 27 | except ValueError: 28 | value = value 29 | self._arguments[argument] = value 30 | 31 | @abstractmethod 32 | def set_reference(self, reference_tokens): 33 | """ 34 | Sets the reference against which one or many hypotheses can be scored 35 | via `self.score()` and `self.score_matrix()`. 36 | """ 37 | pass # instantiate a Reference object and store it at self._reference 38 | 39 | def score(self, hypothesis_tokens): 40 | """ 41 | Scores @param hypothesis against this reference. 42 | """ 43 | return self._reference.score(hypothesis_tokens) 44 | 45 | def score_matrix(self, hypothesis_matrix): 46 | """ 47 | Scores every hypothesis in @param hypotheses against this reference. 48 | @param hypothesis_matrix an iterable of iterables of tokens. 49 | """ 50 | return self._reference.score_matrix(hypothesis_matrix) 51 | -------------------------------------------------------------------------------- /nematus/metrics/scorer_interpolator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | # ModuleNotFoundError is new in 3.6; older versions will throw SystemError 6 | if sys.version_info < (3, 6): 7 | ModuleNotFoundError = SystemError 8 | 9 | try: 10 | from .scorer import Scorer 11 | from . 
import scorer_provider as sp 12 | except (ModuleNotFoundError, ImportError) as e: 13 | from metrics.scorer import Scorer 14 | from metrics import scorer_provider as sp 15 | 16 | class ScorerInterpolator(Scorer): 17 | """ 18 | Creates a scorer that interpolates scores from 1..n sub-scorers, e.g., 19 | 0.5 * SENTENCEBLEU + 0.5 * METEOR. 20 | """ 21 | 22 | def __init__(self, config_string): 23 | """ 24 | @param config_string example: 25 | `INTERPOLATE w=0.5,0.5; SENTENCEBLEU n=4; METEOR meteor_language=fr, meteor_path=/foo/bar/meteor` 26 | """ 27 | self._scorers = [] 28 | self._weights = [] 29 | # parse arguments 30 | scorers = config_string.split(";") 31 | scorers = [scorer.strip() for scorer in scorers] 32 | try: 33 | instruction, weights = scorers[0].split("w=") 34 | assert instruction.strip() == "INTERPOLATE" 35 | weights = [float(w) for w in weights.split(',')] 36 | scorers = [sp.ScorerProvider().get(s) for s in scorers[1:]] 37 | except: 38 | raise SyntaxError("Ill-formated interpolation of metrics. Example of valid definition: `INTERPOLATE w=0.5,0.5`.") 39 | # assertions 40 | assert len(weights) == len(scorers) 41 | assert sum(weights) == 1.0 42 | # init scorers 43 | for i, scorer in enumerate(scorers): 44 | self._scorers.append(scorer) 45 | self._weights.append(weights[i]) 46 | 47 | def set_reference(self, reference_tokens): 48 | """ 49 | Sets the reference against which one or many hypotheses can be scored 50 | via `self.score()` and `self.score_matrix()`. 51 | """ 52 | for scorer in self._scorers: 53 | scorer.set_reference(reference_tokens) 54 | 55 | def score(self, hypothesis_tokens): 56 | """ 57 | Scores @param hypothesis with all scorers added via `self.add_scorer` 58 | and interpolates the scores with the respective weights. 59 | """ 60 | return sum([s.score(hypothesis_tokens) * w for w, s in zip(self._weights, self._scorers)]) 61 | 62 | def score_matrix(self, hypothesis_matrix): 63 | """ 64 | Scores every hypothesis in @param hypotheses with all scorers added via 65 | `self.add_scorer` and interpolates the scores with the respective 66 | weights. 67 | """ 68 | return sum([s.score_matrix(hypothesis_matrix) * w for w, s in zip(self._weights, self._scorers)]) 69 | -------------------------------------------------------------------------------- /nematus/metrics/scorer_provider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | # ModuleNotFoundError is new in 3.6; older versions will throw SystemError 6 | if sys.version_info < (3, 6): 7 | ModuleNotFoundError = SystemError 8 | 9 | try: 10 | from . import scorer_interpolator as si 11 | from .sentence_bleu import SentenceBleuScorer 12 | from .meteor import MeteorScorer 13 | from .beer import BeerScorer 14 | from .chrf import CharacterFScorer 15 | except: 16 | from metrics import scorer_interpolator as si 17 | from metrics.sentence_bleu import SentenceBleuScorer 18 | from metrics.meteor import MeteorScorer 19 | from metrics.beer import BeerScorer 20 | from metrics.chrf import CharacterFScorer 21 | 22 | class ScorerProvider: 23 | """ 24 | Parses a config string and returns a matching scorer object with the given 25 | parameters. 26 | """ 27 | #from bleu import SentenceBleuScorer 28 | 29 | def __init__(self): 30 | pass 31 | 32 | def get(self, config_string): 33 | """ 34 | Returns a scorer matching the metric and parameters defined in @param 35 | config string. 
36 | 37 | Example: ScorerProvider.get("BLEU n=4") returns a SmoothedBleuScorer 38 | object that considers n-gram precision up to n=4. 39 | 40 | If more than one metrics are provided (separated by `;`), 41 | an interpolated scorer will be returned. 42 | 43 | Example: ScorerProvider.get("INTERPOLATE w=0.5,0.5; SENTENCEBLEU n=4; METEOR meteor_language=fr, meteor_path=/foo/bar/meteor") 44 | returns an InterpolatedScorer object that scores hypotheses 45 | using 0.5 * bleu_score + 0.5 * meteor_score. 46 | """ 47 | # interpolation 48 | if config_string.startswith("INTERPOLATE"): 49 | return si.ScorerInterpolator(config_string) 50 | try: 51 | scorer, arguments = config_string.split(" ", 1) 52 | except ValueError: 53 | scorer = config_string 54 | arguments = '' 55 | if scorer == 'SENTENCEBLEU': 56 | return SentenceBleuScorer(arguments) 57 | elif scorer == 'METEOR': 58 | return MeteorScorer(arguments) 59 | elif scorer == 'BEER': 60 | return BeerScorer(arguments) 61 | elif scorer == 'CHRF': 62 | return CharacterFScorer(arguments) 63 | # add other scorers here 64 | else: 65 | raise NotImplementedError("No such scorer: %s" % scorer) 66 | -------------------------------------------------------------------------------- /nematus/metrics/sentence_bleu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | from math import exp 5 | from operator import mul 6 | from collections import defaultdict 7 | from functools import reduce 8 | 9 | # ModuleNotFoundError is new in 3.6; older versions will throw SystemError 10 | if sys.version_info < (3, 6): 11 | ModuleNotFoundError = SystemError 12 | 13 | try: 14 | from .scorer import Scorer 15 | from .reference import Reference 16 | except (ModuleNotFoundError, ImportError) as e: 17 | from metrics.scorer import Scorer 18 | from metrics.reference import Reference 19 | 20 | class SentenceBleuScorer(Scorer): 21 | """ 22 | Scores SmoothedBleuReference objects. 23 | """ 24 | 25 | def __init__(self, argument_string): 26 | """ 27 | Initialises metric-specific parameters. 28 | """ 29 | Scorer.__init__(self, argument_string) 30 | # use n-gram order of 4 by default 31 | if not 'n' in list(self._arguments.keys()): 32 | self._arguments['n'] = 4 33 | 34 | def set_reference(self, reference_tokens): 35 | """ 36 | Sets the reference against hypotheses are scored. 37 | """ 38 | self._reference = SentenceBleuReference( 39 | reference_tokens, 40 | self._arguments['n'] 41 | ) 42 | 43 | class SentenceBleuReference(Reference): 44 | """ 45 | Smoothed sentence-level BLEU as as proposed by Lin and Och (2004). 46 | Implemented as described in (Chen and Cherry, 2014). 47 | """ 48 | 49 | def __init__(self, reference_tokens, n=4): 50 | """ 51 | @param reference the reference translation that hypotheses shall be 52 | scored against. Must be an iterable of tokens (any 53 | type). 54 | @param n maximum n-gram order to consider. 55 | """ 56 | Reference.__init__(self, reference_tokens) 57 | self.n = n 58 | # preprocess reference 59 | self._reference_length = len(self._reference_tokens) 60 | self._reference_ngrams = self._get_ngrams(self._reference_tokens, self.n) 61 | 62 | def _get_ngrams(self, tokens, max_n): 63 | """ 64 | Extracts all n-grams of order 1 up to (and including) @param max_n from 65 | a list of @param tokens. 
66 | """ 67 | n_grams = [] 68 | for n in range(1, max_n+1): 69 | n_grams.append(defaultdict(int)) 70 | for n_gram in zip(*[tokens[i:] for i in range(n)]): 71 | n_grams[n-1][n_gram] += 1 72 | return n_grams 73 | 74 | def score(self, hypothesis_tokens): 75 | """ 76 | Scores @param hypothesis against this reference. 77 | 78 | @return the smoothed sentence-level BLEU score: 1.0 is best, 0.0 worst. 79 | """ 80 | def product(iterable): 81 | return reduce(mul, iterable, 1) 82 | def ngram_precisions(ref_ngrams, hyp_ngrams): 83 | precisions = [] 84 | for n in range(1, self.n+1): 85 | overlap = 0 86 | for ref_ngram, ref_ngram_count in list(ref_ngrams[n-1].items()): 87 | if ref_ngram in hyp_ngrams[n-1]: 88 | overlap += min(ref_ngram_count, hyp_ngrams[n-1][ref_ngram]) 89 | hyp_length = max(0, len(hypothesis_tokens)-n+1) 90 | if n >= 2: 91 | # smoothing as proposed by Lin and Och (2004), 92 | # implemented as described in (Chen and Cherry, 2014) 93 | overlap += 1 94 | hyp_length += 1 95 | precisions.append(overlap/hyp_length if hyp_length > 0 else 0.0) 96 | return precisions 97 | def brevity_penalty(ref_length, hyp_length): 98 | return min(1.0, exp(1-(ref_length/hyp_length if hyp_length > 0 else 0.0))) 99 | # preprocess hypothesis 100 | hypothesis_length = len(hypothesis_tokens) 101 | hypothesis_ngrams = self._get_ngrams(hypothesis_tokens, self.n) 102 | # calculate n-gram precision for all orders 103 | np = ngram_precisions(self._reference_ngrams, hypothesis_ngrams) 104 | # calculate brevity penalty 105 | bp = brevity_penalty(self._reference_length, hypothesis_length) 106 | # compose final BLEU score 107 | return product(np)**(1/self.n) * bp 108 | -------------------------------------------------------------------------------- /nematus/metrics/test_chrf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import unittest 4 | 5 | from metrics.chrf import CharacterFScorer 6 | 7 | class TestCharacterFScoreReference(unittest.TestCase): 8 | """ 9 | Regression tests for SmoothedBleuReference 10 | """ 11 | @staticmethod 12 | def tokenize(sentence): 13 | return sentence.split(" ") 14 | def test_identical_segments(self): 15 | segment = self.tokenize("Consistency is the last refuge of the unimaginative") 16 | scorer = CharacterFScorer('n=6,beta=3') 17 | scorer.set_reference(segment) 18 | self.assertEqual(scorer.score(segment), 1.0) 19 | def test_completely_different_segments(self): 20 | segment_a = self.tokenize("AAAAAA") 21 | segment_b = self.tokenize("BBBB") 22 | scorer = CharacterFScorer('n=3,beta=3') 23 | scorer.set_reference(segment_a) 24 | self.assertEqual(scorer.score(segment_b), 0.0) 25 | def test_empty_string(self): 26 | segment_a = self.tokenize("") 27 | segment_b = self.tokenize("") 28 | scorer = CharacterFScorer('n=6,beta=3') 29 | scorer.set_reference(segment_a) 30 | self.assertEqual(scorer.score(segment_b), 1.0) 31 | def test_one_character_empty_string(self): 32 | segment_a = self.tokenize("A") 33 | segment_b = self.tokenize("") 34 | scorer = CharacterFScorer('n=6,beta=3') 35 | scorer.set_reference(segment_a) 36 | self.assertEqual(scorer.score(segment_b), 0.0) 37 | def test_empty_string_one_character(self): 38 | segment_a = self.tokenize("") 39 | segment_b = self.tokenize("A") 40 | scorer = CharacterFScorer('n=6,beta=3') 41 | scorer.set_reference(segment_a) 42 | self.assertEqual(scorer.score(segment_b), 0.0) 43 | def test_half_right(self): 44 | segment_a = self.tokenize("AB") 45 | segment_b = self.tokenize("AA") 46 | scorer = 
CharacterFScorer('n=6,beta=3') 47 | scorer.set_reference(segment_a) 48 | self.assertEqual(scorer.score(segment_b), 0.25) 49 | def test_one_character(self): 50 | segment_a = self.tokenize("A") 51 | segment_b = self.tokenize("A") 52 | scorer = CharacterFScorer('n=6,beta=3') 53 | scorer.set_reference(segment_a) 54 | self.assertEqual(scorer.score(segment_b), 1.0) 55 | def test_almost_correct(self): 56 | segment_a = self.tokenize("risk assessment has to be undertaken by those who are qualified and expert in that area - that is the scientists .") 57 | segment_b = self.tokenize(" risk assessment must be made of those who are qualified and expertise in the sector - these are the scientists .") 58 | scorer = CharacterFScorer('n=6,beta=3') 59 | scorer.set_reference(segment_a) 60 | self.assertEqual('{0:.12f}'.format(scorer.score(segment_b)), "0.652414427449") 61 | 62 | if __name__ == '__main__': 63 | unittest.main() 64 | -------------------------------------------------------------------------------- /nematus/metrics/test_scorer_provider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import unittest 4 | 5 | from metrics.scorer_provider import ScorerProvider 6 | from metrics.sentence_bleu import SentenceBleuScorer 7 | 8 | class TestScorerProvider(unittest.TestCase): 9 | """ 10 | Regression tests for ScorerProvider 11 | """ 12 | @staticmethod 13 | def tokenize(sentence): 14 | return sentence.split(" ") 15 | 16 | def test_single_metric(self): 17 | config_string = "SENTENCEBLEU n=4" 18 | segment = self.tokenize("Consistency is the last refuge of the unimaginative") 19 | reference_scorer = SentenceBleuScorer('n=4') 20 | provided_scorer = ScorerProvider().get(config_string) 21 | reference_scorer.set_reference(segment) 22 | provided_scorer.set_reference(segment) 23 | self.assertEqual( 24 | reference_scorer.score(segment), 25 | provided_scorer.score(segment) 26 | ) 27 | 28 | def test_interpolated_metrics(self): 29 | config_string = "INTERPOLATE w=0.3,0.7; SENTENCEBLEU n=4; SENTENCEBLEU n=4" 30 | segment = self.tokenize("Consistency is the last refuge of the unimaginative") 31 | reference_scorer = SentenceBleuScorer('n=4') 32 | provided_scorer = ScorerProvider().get(config_string) # interpolating BLEU with BLEU should obviously result in the same as just using a single BLEU scorer 33 | reference_scorer.set_reference(segment) 34 | provided_scorer.set_reference(segment) 35 | self.assertEqual( 36 | reference_scorer.score(segment), 37 | provided_scorer.score(segment) 38 | ) 39 | 40 | 41 | if __name__ == '__main__': 42 | unittest.main() 43 | -------------------------------------------------------------------------------- /nematus/metrics/test_sentence_bleu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import unittest 4 | 5 | from metrics.sentence_bleu import SentenceBleuScorer 6 | 7 | class TestSentenceBleuReference(unittest.TestCase): 8 | """ 9 | Regression tests for SmoothedBleuReference 10 | """ 11 | @staticmethod 12 | def tokenize(sentence): 13 | return sentence.split(" ") 14 | def test_identical_segments(self): 15 | segment = self.tokenize("Consistency is the last refuge of the unimaginative") 16 | scorer = SentenceBleuScorer('n=4') 17 | scorer.set_reference(segment) 18 | self.assertEqual(scorer.score(segment), 1.0) 19 | def test_completely_different_segments(self): 20 | segment_a = self.tokenize("A A A") 21 | segment_b = self.tokenize("B B B") 22 | scorer = 
SentenceBleuScorer('n=4') 23 | scorer.set_reference(segment_a) 24 | self.assertEqual(scorer.score(segment_b), 0.0) 25 | def test_clipping(self): 26 | segment_a = self.tokenize("The very nice man") 27 | segment_b = self.tokenize("man man man man") 28 | scorer = SentenceBleuScorer('n=1') 29 | scorer.set_reference(segment_a) 30 | self.assertNotEqual(scorer.score(segment_b), 1.0) 31 | 32 | if __name__ == '__main__': 33 | unittest.main() 34 | -------------------------------------------------------------------------------- /nematus/model_inputs.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | class ModelInputs(object): 5 | def __init__(self, config): 6 | # variable dimensions 7 | seq_len, batch_size, mrt_sampleN= None, None, None 8 | # mrt_sampleN = batch_size X sampleN 9 | 10 | self.x = tf.compat.v1.placeholder( 11 | name='x', 12 | shape=(config.factors, seq_len, batch_size), 13 | dtype=tf.int32) 14 | 15 | self.x_mask = tf.compat.v1.placeholder( 16 | name='x_mask', 17 | shape=(seq_len, batch_size), 18 | dtype=tf.float32) 19 | 20 | self.y = tf.compat.v1.placeholder( 21 | name='y', 22 | shape=(seq_len, batch_size), 23 | dtype=tf.int32) 24 | 25 | self.y_mask = tf.compat.v1.placeholder( 26 | name='y_mask', 27 | shape=(seq_len, batch_size), 28 | dtype=tf.float32) 29 | 30 | self.scores = tf.compat.v1.placeholder( 31 | name='scores', 32 | shape=(mrt_sampleN), 33 | dtype=tf.float32) 34 | 35 | self.index = tf.compat.v1.placeholder( 36 | name='index', 37 | shape=(mrt_sampleN), 38 | dtype=tf.int32) 39 | 40 | self.training = tf.compat.v1.placeholder_with_default( 41 | False, 42 | name='training', 43 | shape=()) 44 | -------------------------------------------------------------------------------- /nematus/nmt.py: -------------------------------------------------------------------------------- 1 | train.py -------------------------------------------------------------------------------- /nematus/rescore.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | ''' 3 | Rescoring an n-best list of translations using a translation model. 4 | ''' 5 | 6 | import sys 7 | import logging 8 | if __name__ == '__main__': 9 | # Parse console arguments. 10 | from settings import RescorerSettings 11 | rescorer_settings = RescorerSettings(from_console_arguments=True) 12 | # Set the logging level. This needs to be done before the tensorflow 13 | # module is imported. 
14 | level = logging.DEBUG if rescorer_settings.verbose else logging.INFO 15 | logging.basicConfig(level=level, format='%(levelname)s: %(message)s') 16 | 17 | from tempfile import NamedTemporaryFile 18 | 19 | # ModuleNotFoundError is new in 3.6; older versions will throw SystemError 20 | if sys.version_info < (3, 6): 21 | ModuleNotFoundError = SystemError 22 | 23 | try: 24 | from .config import load_config_from_json_file 25 | from .score import calc_scores 26 | except (ModuleNotFoundError, ImportError) as e: 27 | from config import load_config_from_json_file 28 | from score import calc_scores 29 | 30 | 31 | 32 | def rescore(source_file, nbest_file, output_file, rescorer_settings, options): 33 | 34 | lines = source_file.readlines() 35 | nbest_lines = nbest_file.readlines() 36 | 37 | # create plain text file for scoring 38 | with NamedTemporaryFile(mode='w+', prefix='rescore-tmpin') as tmp_in, \ 39 | NamedTemporaryFile(mode='w+', prefix='rescore-tmpout') as tmp_out: 40 | for line in nbest_lines: 41 | linesplit = line.split(' ||| ') 42 | # Get the source file index (zero-based). 43 | idx = int(linesplit[0]) 44 | tmp_in.write(lines[idx]) 45 | tmp_out.write(linesplit[1] + '\n') 46 | 47 | tmp_in.seek(0) 48 | tmp_out.seek(0) 49 | scores = calc_scores(tmp_in, tmp_out, rescorer_settings, options) 50 | 51 | for i, line in enumerate(nbest_lines): 52 | score_str = ' '.join([str(s[i]) for s in scores]) 53 | output_file.write('{0} {1}\n'.format(line.strip(), score_str)) 54 | 55 | 56 | def main(source_file, nbest_file, output_file, rescorer_settings): 57 | # load model model_options 58 | options = [] 59 | for model in rescorer_settings.models: 60 | config = load_config_from_json_file(model) 61 | setattr(config, 'reload', model) 62 | options.append(config) 63 | 64 | rescore(source_file, nbest_file, output_file, rescorer_settings, options) 65 | 66 | 67 | if __name__ == "__main__": 68 | main(source_file=rescorer_settings.source, 69 | nbest_file=rescorer_settings.input, 70 | output_file=rescorer_settings.output, 71 | rescorer_settings=rescorer_settings) 72 | -------------------------------------------------------------------------------- /nematus/sample_client.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import json 4 | import requests # use `pip install requests` if not available on your system 5 | 6 | SOURCE_SEGMENTS = { 7 | "de":"Die Wahrheit ist selten rein und nie einfach .".split(), 8 | "en":"The truth is rarely pure and never simple .".split() 9 | } 10 | 11 | class Client(object): 12 | """ 13 | A sample client for Nematus Server instances. 14 | 15 | Uses the Nematus API style, i.e., the server (`server.py`) must be started 16 | with `style=Nematus` to serve requests from this client. 17 | """ 18 | def __init__(self, host, port): 19 | self.host = host 20 | self.port = port 21 | self.headers = { 22 | 'content-type': 'application/json' 23 | } 24 | 25 | def _get_url(self, path='/'): 26 | return "http://{0}:{1}{2}".format(self.host, self.port, path) 27 | 28 | def translate(self, segment): 29 | """ 30 | Returns the translation of a list of segments. 31 | """ 32 | return self.translate_segments([segment])[0] 33 | 34 | def translate_segments(self, segments): 35 | """ 36 | Returns the translation of a single segment. 
37 | """ 38 | payload = json.dumps({'segments': segments}) 39 | url = self._get_url('/translate') 40 | response = requests.post(url, headers=self.headers, data=payload) 41 | return [segment['translation'] for segment in response.json()['data']] 42 | 43 | def print_server_status(self): 44 | """ 45 | Prints the server's status report. 46 | """ 47 | url = self._get_url('/status') 48 | response = requests.get(url, headers=self.headers) 49 | print((json.dumps(response.json(), indent=4))) 50 | 51 | 52 | if __name__ == "__main__": 53 | host = 'localhost' 54 | port = 8080 55 | client = Client(host, port) 56 | client.print_server_status() 57 | source_segment = SOURCE_SEGMENTS['de'] 58 | print(('Translating "{0}"'.format(source_segment))) 59 | target_segment = client.translate(source_segment) 60 | print(target_segment) 61 | -------------------------------------------------------------------------------- /nematus/sampler_inputs.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | class SamplerInputs: 5 | """Input placeholders for RandomSampler and BeamSearchSampler.""" 6 | 7 | def __init__(self): 8 | 9 | # Number of sentences in the input. When sampling, this is not 10 | # necessarily the same as the batch size, hence the modified name. The 11 | # actual batch size (i.e. as seen by the model) will vary: usually 12 | # it's batch_size_x * beam_size because we tile the input sentences, 13 | # but in the Transformer encoder it's just batch_size_x. 14 | self.batch_size_x = tf.compat.v1.placeholder( 15 | name='batch_size_x', 16 | shape=(), 17 | dtype=tf.int32) 18 | 19 | # Maximum translation length. 20 | self.max_translation_len = tf.compat.v1.placeholder( 21 | name='max_translation_len', 22 | shape=(), 23 | dtype=tf.int32) 24 | 25 | # Alpha parameter for length normalization. 26 | self.normalization_alpha = tf.compat.v1.placeholder( 27 | name='normalization_alpha', 28 | shape=(), 29 | dtype=tf.float32) 30 | -------------------------------------------------------------------------------- /nematus/sampling_utils.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import tensorflow as tf 3 | import logging 4 | 5 | class SamplingUtils(object): 6 | def __init__(self, config_or_settings_obj): 7 | self.sampling_temperature = config_or_settings_obj.sampling_temperature 8 | self.translation_strategy = config_or_settings_obj.translation_strategy 9 | 10 | def adjust_logits(self, logits): 11 | if self.sampling_temperature != 1.0: 12 | logging.debug("adjust temperature") 13 | logits = logits / tf.constant(self.sampling_temperature, dtype=tf.float32) 14 | 15 | return logits 16 | 17 | 18 | -------------------------------------------------------------------------------- /nematus/score.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Given a parallel corpus of sentence pairs: with one-to-one of target and source sentences, 4 | produce the score. 5 | """ 6 | 7 | import logging 8 | if __name__ == '__main__': 9 | # Parse console arguments. 10 | from settings import ScorerSettings 11 | scorer_settings = ScorerSettings(from_console_arguments=True) 12 | # Set the logging level. This needs to be done before the tensorflow 13 | # module is imported. 
14 | level = logging.DEBUG if scorer_settings.verbose else logging.INFO 15 | logging.basicConfig(level=level, format='%(levelname)s: %(message)s') 16 | 17 | import argparse 18 | import sys 19 | import tempfile 20 | 21 | import tensorflow as tf 22 | 23 | # ModuleNotFoundError is new in 3.6; older versions will throw SystemError 24 | if sys.version_info < (3, 6): 25 | ModuleNotFoundError = SystemError 26 | 27 | try: 28 | from .config import load_config_from_json_file 29 | from .data_iterator import TextIterator 30 | from .exponential_smoothing import ExponentialSmoothing 31 | from . import model_loader 32 | from . import rnn_model 33 | from . import train 34 | from . import transformer 35 | except (ModuleNotFoundError, ImportError) as e: 36 | from config import load_config_from_json_file 37 | from data_iterator import TextIterator 38 | from exponential_smoothing import ExponentialSmoothing 39 | import model_loader 40 | import rnn_model 41 | import train 42 | import transformer 43 | 44 | 45 | 46 | # FIXME pass in paths not file objects, since we need to know the paths anyway 47 | def calc_scores(source_file, target_file, scorer_settings, configs): 48 | """Calculates sentence pair scores using each of the specified models. 49 | 50 | By default (when scorer_settings.normalization_alpha is 0.0), the score 51 | is the sentence-level cross entropy, otherwise it's a normalized version. 52 | 53 | Args: 54 | source_file: file object for file containing source sentences. 55 | target_file: file object for file containing target sentences. 56 | scorer_settings: a ScorerSettings object. 57 | configs: a list of Namespace objects specifying the model configs. 58 | 59 | Returns: 60 | A list of lists of floats. The outer list contains one list for each 61 | model (in the same order given by configs). The inner list contains 62 | one score for each sentence pair. 63 | """ 64 | scores = [] 65 | for config in configs: 66 | g = tf.Graph() 67 | with g.as_default(): 68 | tf_config = tf.compat.v1.ConfigProto() 69 | tf_config.allow_soft_placement = True 70 | with tf.compat.v1.Session(config=tf_config) as sess: 71 | 72 | logging.info('Building model...') 73 | 74 | # Create the model graph. 75 | if config.model_type == 'transformer': 76 | model = transformer.Transformer(config) 77 | else: 78 | model = rnn_model.RNNModel(config) 79 | 80 | # Add smoothing variables (if the model was trained with 81 | # smoothing). 82 | if config.exponential_smoothing > 0.0: 83 | smoothing = ExponentialSmoothing( 84 | config.exponential_smoothing) 85 | 86 | # Restore the model variables. 87 | saver = model_loader.init_or_restore_variables(config, sess) 88 | 89 | # Swap-in the smoothed versions of the variables (if present). 
90 | if config.exponential_smoothing > 0.0: 91 | sess.run(fetches=smoothing.swap_ops) 92 | 93 | text_iterator = TextIterator( 94 | source=source_file.name, 95 | target=target_file.name, 96 | source_dicts=config.source_dicts, 97 | target_dict=config.target_dict, 98 | model_type=config.model_type, 99 | batch_size=scorer_settings.minibatch_size, 100 | maxlen=float('inf'), 101 | source_vocab_sizes=config.source_vocab_sizes, 102 | target_vocab_size=config.target_vocab_size, 103 | use_factor=(config.factors > 1), 104 | sort_by_length=False) 105 | 106 | ce_vals, _ = train.calc_cross_entropy_per_sentence( 107 | sess, 108 | model, 109 | config, 110 | text_iterator, 111 | normalization_alpha=scorer_settings.normalization_alpha) 112 | 113 | scores.append(ce_vals) 114 | return scores 115 | 116 | 117 | def write_scores(source_file, target_file, scores, output_file, scorer_settings): 118 | 119 | source_file.seek(0) 120 | target_file.seek(0) 121 | source_lines = source_file.readlines() 122 | target_lines = target_file.readlines() 123 | 124 | for i, line in enumerate(target_lines): 125 | score_str = ' '.join(map(str,[s[i] for s in scores])) 126 | if scorer_settings.verbose: 127 | output_file.write('{0} '.format(line.strip())) 128 | output_file.write('{0}\n'.format(score_str)) 129 | 130 | 131 | def main(source_file, target_file, output_file, scorer_settings): 132 | # load model model_options 133 | configs = [] 134 | for model in scorer_settings.models: 135 | config = load_config_from_json_file(model) 136 | setattr(config, 'reload', model) 137 | configs.append(config) 138 | 139 | scores = calc_scores(source_file, target_file, scorer_settings, configs) 140 | write_scores(source_file, target_file, scores, output_file, scorer_settings) 141 | 142 | 143 | if __name__ == "__main__": 144 | main(source_file=scorer_settings.source, 145 | target_file=scorer_settings.target, 146 | output_file=scorer_settings.output, 147 | scorer_settings=scorer_settings) 148 | -------------------------------------------------------------------------------- /nematus/server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Runs Nematus as a Web Server. 5 | """ 6 | 7 | import json 8 | import pkg_resources 9 | import logging 10 | 11 | from bottle import Bottle, request, response 12 | from bottle_log import LoggingPlugin 13 | 14 | from server.response import TranslationResponse 15 | from server.api.provider import request_provider, response_provider 16 | 17 | from settings import ServerSettings 18 | from server_translator import Translator 19 | 20 | class NematusServer(object): 21 | """ 22 | Keeps a Nematus model in memory to answer http translation requests. 23 | """ 24 | 25 | STATUS_LOADING = 'loading' 26 | STATUS_OK = 'ok' 27 | 28 | def __init__(self, server_settings): 29 | """ 30 | Loads a translation model and initialises the webserver. 
31 | 32 | @param server_settings: see `settings.py` 33 | """ 34 | self._style = server_settings.style 35 | self._host = server_settings.host 36 | self._port = server_settings.port 37 | self._threads = server_settings.threads 38 | self._debug = server_settings.verbose 39 | self._models = server_settings.models 40 | self._num_processes = server_settings.num_processes 41 | self._status = self.STATUS_LOADING 42 | # start webserver 43 | self._server = Bottle() 44 | self._server.config['logging.level'] = 'DEBUG' if server_settings.verbose else 'WARNING' 45 | self._server.config['logging.format'] = '%(levelname)s: %(message)s' 46 | self._server.install(LoggingPlugin(self._server.config)) 47 | logging.info("Starting Nematus Server") 48 | # start translation workers 49 | logging.info("Loading translation models") 50 | self._translator = Translator(server_settings) 51 | self._status = self.STATUS_OK 52 | 53 | def status(self): 54 | """ 55 | Reports on the status of this translation server. 56 | """ 57 | response_data = { 58 | 'status': self._status, 59 | 'models': self._models, 60 | 'version': pkg_resources.require("nematus")[0].version, 61 | 'service': 'nematus', 62 | } 63 | response.content_type = "application/json" 64 | return json.dumps(response_data) 65 | 66 | def translate(self): 67 | """ 68 | Processes a translation request. 69 | """ 70 | translation_request = request_provider(self._style, request) 71 | logging.debug("REQUEST - " + repr(translation_request)) 72 | 73 | translations = self._translator.translate( 74 | translation_request.segments, 75 | translation_request.settings 76 | ) 77 | response_data = { 78 | 'status': TranslationResponse.STATUS_OK, 79 | 'segments': [translation.target_words for translation in translations], 80 | } 81 | translation_response = response_provider(self._style, **response_data) 82 | logging.debug("RESPONSE - " + repr(translation_response)) 83 | 84 | response.content_type = translation_response.get_content_type() 85 | return repr(translation_response) 86 | 87 | def start(self): 88 | """ 89 | Starts the webserver. 90 | """ 91 | self._route() 92 | self._server.run(host=self._host, port=self._port, debug=self._debug, server='tornado', threads=self._threads) 93 | self._cleanup() 94 | 95 | def _cleanup(self): 96 | """ 97 | Graceful exit for components. 98 | """ 99 | self._translator.shutdown() 100 | 101 | def _route(self): 102 | """ 103 | Routes webserver paths to functions. 104 | """ 105 | self._server.route('/status', method="GET", callback=self.status) 106 | self._server.route('/translate', method="POST", callback=self.translate) 107 | 108 | 109 | if __name__ == "__main__": 110 | # parse console arguments 111 | server_settings = ServerSettings(from_console_arguments=True) 112 | server = NematusServer(server_settings) 113 | server.start() 114 | -------------------------------------------------------------------------------- /nematus/server/README.md: -------------------------------------------------------------------------------- 1 | # Nematus Server 2 | Runs Nematus as a web service. 3 | 4 | ## Basic Usage 5 | 6 | The command 7 | 8 | ```bash 9 | python3 server.py -m model.npz 10 | ``` 11 | 12 | will start Nematus Server at `localhost` on port 8080, using translation model `model.npz`. Once the model has been loaded, the server is ready to answer translation requests according to the API outlined below. 13 | 14 | ### Required Arguments 15 | 16 | Nematus Server needs at least one translation model, provided via the `-m` or `--models` parameter. 
Multiple models (for ensemble decoding) are delimited with spaces: 17 | 18 | ```bash 19 | python3 server.py -m model1.npz model2.npz model3.npz model4.npz 20 | ``` 21 | 22 | ### Optional Arguments 23 | 24 | | Argument | Default Value | Description | 25 | | --------------------|---------------| -------------------------| 26 | | `--host` | `localhost` | Host name | 27 | | `--port` | `8080` | Port | 28 | | `-p`, | `1` | Number of translation processes to start. Each process loads all models specified in `-m`/`--models`. | 29 | | `--device-list` | any | The devices to start translation processes on, e.g., `gpu0 gpu1 gpu6`. Defaults to any available device. | 30 | | `-v` | off | Verbose mode | 31 | 32 | 33 | ## API 34 | Nematus Server supports several API styles. 35 | 36 | ### Nematus Translation API 37 | 38 | #### Translation Request 39 | 40 | `POST http://host:port/translate` 41 | 42 | Content-Type: application/json 43 | 44 | ##### Query Parameters 45 | 46 | | Parameter | Type | Default Value | Description | 47 | | --------------------|-----------------------|-----------|-------------| 48 | | ``segments`` | ``list(list(str))`` | | The sentences to be translated (source language). Each sentence is a list of tokens. | 49 | | ``normalize`` | ``boolean`` | ``true`` | Normalise scores by sentence length. | 50 | | ``beam_width`` | ``int`` | ``5`` | The beam width to be used for decoding. | 51 | | ``character_level`` | ``boolean`` | ``false`` | Enables character- rather than subword-level translation. | 52 | | ``n_best`` | ``int`` | ``1`` | Return n best translations per segment. | 53 | | ``suppress_unk`` | ``boolean`` | ``false`` | Suppress hypotheses containing UNK. | 54 | 55 | Sample request: 56 | 57 | ```json 58 | { 59 | "segments": [ 60 | ["I", "can", "resist", "everything", "except", "temptation", "."], 61 | ["The", "truth", "is", "rarely", "pure", "and", "never", "simple", "."] 62 | ], 63 | } 64 | ``` 65 | 66 | ##### Response Body 67 | 68 | If successful, the response body contains a JSON object with the following structure: 69 | 70 | ```json 71 | { 72 | "status": "ok", 73 | "data": [ 74 | { 75 | "translation": ["ich", "kann", "dem", "alles", "außer", "Versuchung", "widerstehen", "."], 76 | }, 77 | { 78 | "translation": ["die", "Wahrheit", "ist", "selten", "rein", "und", "nie", "einfach", "."], 79 | } 80 | ] 81 | } 82 | ``` 83 | 84 | #### Status Request 85 | 86 | `GET http://host:port/status` 87 | 88 | ##### Response Body 89 | 90 | If successful, the response body contains a JSON object with the following structure: 91 | 92 | ```json 93 | { 94 | "status": "ok", 95 | "models": [ 96 | "wmt16-en-de-model1.npz", 97 | "wmt16-en-de-model2.npz", 98 | "wmt16-en-de-model3.npz", 99 | "wmt16-en-de-model4.npz", 100 | ], 101 | "version": "0.1.dev0", 102 | "service": "nematus" 103 | } 104 | ``` 105 | 106 | 107 | ## Sample Client 108 | 109 | A sample client, written in Python, is available in `sample_client.py`. 
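
For a quick end-to-end check without the sample client, a translation request can also be sent directly with the `requests` library. The snippet below is only an illustrative sketch: it assumes a server started as shown under "Basic Usage", listening on `localhost:8080` with the default Nematus API style, and input that is already tokenized in the same way as the training data.

```python
import json

import requests  # third-party HTTP client: pip install requests

# Assumed endpoint of a locally running Nematus Server (see "Basic Usage").
url = "http://localhost:8080/translate"

payload = {
    "segments": [
        ["The", "truth", "is", "rarely", "pure", "and", "never", "simple", "."]
    ],
    "beam_width": 5,
}

response = requests.post(
    url,
    headers={"content-type": "application/json"},
    data=json.dumps(payload),
)

# Each item in "data" holds one translated segment as a list of tokens.
for item in response.json()["data"]:
    print(" ".join(item["translation"]))
```

For anything beyond a smoke test, `sample_client.py` is the better starting point, since it wraps the same request and response handling in a small reusable `Client` class.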
110 | -------------------------------------------------------------------------------- /nematus/server/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdinburghNLP/nematus/49d050863bc9644b8c0a9d9ab6e54ccd30f927dd/nematus/server/__init__.py -------------------------------------------------------------------------------- /nematus/server/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdinburghNLP/nematus/49d050863bc9644b8c0a9d9ab6e54ccd30f927dd/nematus/server/api/__init__.py -------------------------------------------------------------------------------- /nematus/server/api/nematus_style.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Defines the Nematus API for translation requests and responses. 5 | """ 6 | 7 | import json 8 | from ..request import TranslationRequest 9 | from ..response import TranslationResponse 10 | 11 | class TranslationRequestNematus(TranslationRequest): 12 | def _parse(self): 13 | # never produce search graph 14 | self.get_search_graph = False 15 | 16 | request = self._request.json 17 | if 'segments' in request: 18 | self.segments = [' '.join(tokens) for tokens in request['segments']] 19 | if 'beam_width' in request: 20 | self.settings.beam_width = request['beam_width'] 21 | if 'normalize' in request: 22 | self.settings.normalization_alpha = request['normalize'] 23 | if 'character_level' in request: 24 | self.settings.char_level = request['character_level'] 25 | if 'suppress_unk' in request: 26 | self.settings.suppress_unk = request['suppress_unk'] 27 | if 'return_word_alignment' in request: 28 | self.settings.get_alignment = request['return_word_alignment'] 29 | if 'return_word_probabilities' in request: 30 | self.settings.get_word_probs = request['return_word_probabilities'] 31 | 32 | def _format(self): 33 | request = { 34 | 'id': str(self.settings.request_id), 35 | 'data': [segment for segment in self.segments] 36 | } 37 | return json.dumps(request) 38 | 39 | class TranslationResponseNematus(TranslationResponse): 40 | def _format(self): 41 | response = { 42 | 'status': '', 43 | 'data': [], 44 | } 45 | if self._status == self.STATUS_OK: 46 | response['status'] = 'ok' 47 | for i, translation in enumerate(self._segments): 48 | segment = {'translation': translation} 49 | if self._word_alignments: 50 | segment['word_alignment'] = self._word_alignments[i] 51 | if self._word_probabilities: 52 | segment['word_probabilities'] = self._word_probabilities[i] 53 | response['data'].append(segment) 54 | else: 55 | response['status'] = 'error' 56 | return json.dumps(response) 57 | -------------------------------------------------------------------------------- /nematus/server/api/provider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Implements providors for TranslationRequest and TranslationResponse objects 5 | of a specific API style. 6 | """ 7 | 8 | def request_provider(style, request): 9 | """ 10 | Turns a raw request body into a TranslationRequest of a given API style 11 | @param style. 
12 | """ 13 | from .nematus_style import TranslationRequestNematus 14 | mapping = { 15 | 'Nematus': TranslationRequestNematus 16 | } 17 | try: 18 | return mapping[style](request) 19 | except KeyError: 20 | raise NotImplementedError("Invalid API style: {0}".format(style)) 21 | 22 | def response_provider(style, **response_args): 23 | """ 24 | Formats @param response_args as a TranslationResponse of a given API style 25 | @param style. 26 | """ 27 | from .nematus_style import TranslationResponseNematus 28 | mapping = { 29 | 'Nematus': TranslationResponseNematus 30 | } 31 | try: 32 | return mapping[style](**response_args) 33 | except KeyError: 34 | raise NotImplementedError("Invalid API style: {0}".format(style)) 35 | -------------------------------------------------------------------------------- /nematus/server/request.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Defines the abstract request format for Nematus server. 5 | """ 6 | 7 | from abc import ABCMeta, abstractmethod 8 | 9 | from settings import TranslationSettings 10 | 11 | class TranslationRequest(object, metaclass=ABCMeta): 12 | """ 13 | Abstract translation request base class. 14 | """ 15 | 16 | def __init__(self, request): 17 | """ 18 | Initialises a translation request. 19 | 20 | @type request: bottle.BaseRequest 21 | @param request: the POST request submitted to Nematus server. 22 | """ 23 | self._request = request 24 | self.segments = [] 25 | self.settings = TranslationSettings() # default values 26 | self._parse() 27 | 28 | @abstractmethod 29 | def _format(self): 30 | """ 31 | Formats this translation request. 32 | """ 33 | pass # to be implemented in subclasses 34 | 35 | def __repr__(self): 36 | """ 37 | Returns the raw body of this translation request. 38 | """ 39 | return self._format() 40 | 41 | @abstractmethod 42 | def _parse(self): 43 | """ 44 | Parses the request's raw body. Sets or overrides 45 | * self.segments 46 | * self.beam_width 47 | * self.normalize 48 | * self.character_level 49 | * self.n_best 50 | * self.suppress_unk 51 | * self.return_word_alignment 52 | * self.return_word_probabilities 53 | """ 54 | pass # to be implemented in subclasses 55 | -------------------------------------------------------------------------------- /nematus/server/response.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Defines the abstract response format for Nematus server. 5 | """ 6 | 7 | from abc import ABCMeta, abstractmethod 8 | 9 | class TranslationResponse(object, metaclass=ABCMeta): 10 | """ 11 | Abstract translation response base class. 12 | """ 13 | 14 | STATUS_OK = 0 15 | STATUS_ERROR = 1 16 | 17 | def __init__(self, status, segments, word_alignments=None, word_probabilities=None): 18 | """ 19 | Initialises a translation response. 20 | 21 | @type segments: list(str) 22 | @param segments: the translated segments to be included. 23 | """ 24 | self._content_type = "application/json" 25 | self._status = status 26 | self._segments = segments 27 | self._word_alignments = word_alignments 28 | self._word_probabilities = word_probabilities 29 | self._response = self._format() 30 | 31 | @abstractmethod 32 | def _format(self): 33 | """ 34 | Formats this translation response. 35 | """ 36 | pass # to be implemented in subclasses 37 | 38 | def __repr__(self): 39 | """ 40 | Returns the raw body of this translation response.
41 | """ 42 | return self._format() 43 | 44 | def get_content_type(self): 45 | return self._content_type 46 | -------------------------------------------------------------------------------- /nematus/shuffle.py: -------------------------------------------------------------------------------- 1 | ../data/shuffle.py -------------------------------------------------------------------------------- /nematus/tf_utils.py: -------------------------------------------------------------------------------- 1 | """TensorFlow-specific utility functions.""" 2 | 3 | import tensorflow as tf 4 | 5 | def assert_shapes(shapes): 6 | """Wrapper for tf.debugging.assert_shapes.""" 7 | 8 | # tf.debugging.assert_shapes is only supported in 1.14 and later, so 9 | # the call is wrapped in a try-except to allow Nematus to run on earlier 10 | # versions. 11 | try: 12 | assertion_op = tf.debugging.assert_shapes(shapes) 13 | with tf.control_dependencies([assertion_op]): 14 | pass 15 | except (AttributeError, TypeError) as e: 16 | pass 17 | 18 | 19 | def get_available_gpus(): 20 | """Returns a list of the identifiers of all visible GPUs. 21 | 22 | Source: https://stackoverflow.com/questions/38559755 23 | """ 24 | from tensorflow.python.client import device_lib 25 | local_device_protos = device_lib.list_local_devices() 26 | return [x.name for x in local_device_protos if x.device_type == 'GPU'] 27 | 28 | 29 | def get_shape_list(inputs): 30 | """Returns a list of input dimensions, statically where possible. 31 | 32 | TODO What is this useful for? 33 | 34 | Adopted from the tensor2tensor library. 35 | """ 36 | inputs = tf.convert_to_tensor(value=inputs) 37 | # If input's rank is unknown, return dynamic shape. 38 | if inputs.get_shape().dims is None: 39 | dims_list = tf.shape(input=inputs) 40 | else: 41 | static_dims_list = inputs.get_shape().as_list() 42 | dynamic_shape = tf.shape(input=inputs) 43 | # Replace the unspecified static dimensions with dynamic ones. 44 | dims_list = list() 45 | for i in range(len(static_dims_list)): 46 | dim = static_dims_list[i] 47 | if dim is None: 48 | dim = dynamic_shape[i] 49 | dims_list.append(dim) 50 | return dims_list 51 | -------------------------------------------------------------------------------- /nematus/training_progress.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Training progress 3 | ''' 4 | 5 | import json 6 | 7 | class TrainingProgress(object): 8 | ''' 9 | Object used to store, serialize and deserialize pure python variables that change during training and should be preserved in order to properly restart the training process 10 | ''' 11 | 12 | def load_from_json(self, file_name): 13 | with open(file_name, 'r', encoding='utf-8') as fh: 14 | self.__dict__.update(json.load(fh)) 15 | 16 | def save_to_json(self, file_name): 17 | with open(file_name, 'w', encoding='utf-8') as fh: 18 | # TODO ensure_ascii=False? 
19 | json.dump(self.__dict__, fh, indent=2) 20 | -------------------------------------------------------------------------------- /nematus/transformer_blocks.py: -------------------------------------------------------------------------------- 1 | """Adapted from Nematode: https://github.com/demelin/nematode """ 2 | 3 | import sys 4 | import tensorflow as tf 5 | 6 | # ModuleNotFoundError is new in 3.6; older versions will throw SystemError 7 | if sys.version_info < (3, 6): 8 | ModuleNotFoundError = SystemError 9 | 10 | try: 11 | from .transformer_attention_modules import MultiHeadAttentionLayer 12 | from .transformer_layers import \ 13 | ProcessingLayer, \ 14 | FeedForwardNetwork, \ 15 | LayerNormLayer, \ 16 | RMSNormLayer 17 | 18 | except (ModuleNotFoundError, ImportError) as e: 19 | from transformer_attention_modules import MultiHeadAttentionLayer 20 | from transformer_layers import \ 21 | ProcessingLayer, \ 22 | FeedForwardNetwork, \ 23 | LayerNormLayer, \ 24 | RMSNormLayer 25 | 26 | # from attention_modules import SingleHeadAttentionLayer, FineGrainedAttentionLayer 27 | 28 | 29 | class AttentionBlock(object): 30 | """ Defines a single attention block (referred to as 'sub-layer' in the paper) comprising of a single multi-head 31 | attention layer preceded by a pre-processing layer and followed by a post-processing layer. """ 32 | 33 | def __init__(self, 34 | config, 35 | float_dtype, 36 | self_attention, 37 | training, 38 | from_rnn=False, 39 | tie_attention=False): 40 | # Set attributes 41 | self.self_attention = self_attention 42 | if not tie_attention: 43 | if self_attention: 44 | attn_name = 'self_attn' 45 | else: 46 | attn_name = 'cross_attn' 47 | else: 48 | attn_name = 'tied_attn' 49 | 50 | memory_size = config.state_size 51 | if from_rnn: 52 | memory_size *= 2 53 | 54 | if config.layer_normalization_type == 'layernorm': 55 | layernorm = LayerNormLayer 56 | elif config.layer_normalization_type == 'rmsnorm': 57 | layernorm = RMSNormLayer 58 | 59 | # Build layers 60 | self.pre_attn = ProcessingLayer(config.state_size, 61 | use_layer_norm=layernorm, 62 | dropout_rate=0., 63 | training=training, 64 | name='pre_{:s}_sublayer'.format(attn_name)) 65 | 66 | self.attn = MultiHeadAttentionLayer(memory_size, 67 | config.state_size, 68 | config.state_size, 69 | config.state_size, 70 | config.state_size, 71 | config.transformer_num_heads, 72 | float_dtype, 73 | dropout_attn=config.transformer_dropout_attn, 74 | drophead=config.transformer_drophead, 75 | training=training, 76 | name='{:s}_sublayer'.format(attn_name)) 77 | 78 | self.post_attn = ProcessingLayer(config.state_size, 79 | use_layer_norm=False, 80 | dropout_rate=config.transformer_dropout_residual, 81 | training=training, 82 | name='post_{:s}_sublayer'.format(attn_name)) 83 | 84 | def forward(self, inputs, memory_context, attn_mask, layer_memories=None): 85 | """ Propagates input data through the block. """ 86 | if not self.self_attention: 87 | assert (memory_context is not None), \ 88 | 'Encoder memories have to be provided for encoder-decoder attention computation.' 
89 | attn_inputs = self.pre_attn.forward(inputs) 90 | attn_outputs, layer_memories = self.attn.forward(attn_inputs, memory_context, attn_mask, layer_memories) 91 | block_out = self.post_attn.forward(attn_outputs, residual_inputs=inputs) 92 | return block_out, layer_memories 93 | 94 | 95 | class FFNBlock(object): 96 | """ Defines a single feed-forward network block (referred to as 'sub-layer' in the transformer paper) comprising of 97 | a single feed-forward network preceded by a pre-processing layer and followed by a post-processing layer. """ 98 | 99 | def __init__(self, 100 | config, 101 | ffn_dims, 102 | float_dtype, 103 | is_final, 104 | training): 105 | # Set attributes 106 | self.is_final = is_final 107 | 108 | if config.layer_normalization_type == 'layernorm': 109 | layernorm = LayerNormLayer 110 | elif config.layer_normalization_type == 'rmsnorm': 111 | layernorm = RMSNormLayer 112 | 113 | # Build layers 114 | self.pre_ffn = ProcessingLayer(config.state_size, 115 | use_layer_norm=layernorm, 116 | dropout_rate=0., 117 | training=training, 118 | name='pre_ffn_sublayer') 119 | self.ffn = FeedForwardNetwork(ffn_dims, 120 | float_dtype, 121 | use_bias=True, 122 | activation=tf.nn.relu, 123 | use_layer_norm=False, 124 | dropout_rate=config.transformer_dropout_relu, 125 | training=training, 126 | name='ffn_sublayer') 127 | self.post_ffn = ProcessingLayer(config.state_size, 128 | use_layer_norm=False, 129 | dropout_rate=config.transformer_dropout_residual, 130 | training=training, 131 | name='post_ffn_sublayer') 132 | if is_final: 133 | self.pre_final = ProcessingLayer(config.state_size, 134 | use_layer_norm=layernorm, 135 | dropout_rate=0., 136 | training=training, 137 | name='final_transform') 138 | 139 | def forward(self, inputs): 140 | """ Propagates input data through the block. """ 141 | ffn_inputs = self.pre_ffn.forward(inputs) 142 | ffn_outputs = self.ffn.forward(ffn_inputs) 143 | block_out = self.post_ffn.forward(ffn_outputs, residual_inputs=inputs) 144 | if self.is_final: 145 | block_out = self.pre_final.forward(block_out) 146 | return block_out 147 | -------------------------------------------------------------------------------- /nematus/translate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """Translates a source file using a translation model (or ensemble).""" 4 | 5 | import sys 6 | import logging 7 | if __name__ == '__main__': 8 | # Parse console arguments. 9 | from settings import TranslationSettings 10 | settings = TranslationSettings(from_console_arguments=True) 11 | # Set the logging level. This needs to be done before the tensorflow 12 | # module is imported. 13 | level = logging.DEBUG if settings.verbose else logging.INFO 14 | logging.basicConfig(level=level, format='%(levelname)s: %(message)s') 15 | 16 | import argparse 17 | 18 | import tensorflow as tf 19 | 20 | # ModuleNotFoundError is new in 3.6; older versions will throw SystemError 21 | if sys.version_info < (3, 6): 22 | ModuleNotFoundError = SystemError 23 | 24 | try: 25 | from .beam_search_sampler import BeamSearchSampler 26 | from .config import load_config_from_json_file 27 | from .exponential_smoothing import ExponentialSmoothing 28 | from . import model_loader 29 | from .random_sampler import RandomSampler 30 | from . import rnn_model 31 | from .sampling_utils import SamplingUtils 32 | from .transformer import Transformer as TransformerModel 33 | from . 
import translate_utils 34 | except (ModuleNotFoundError, ImportError) as e: 35 | from beam_search_sampler import BeamSearchSampler 36 | from config import load_config_from_json_file 37 | from exponential_smoothing import ExponentialSmoothing 38 | import model_loader 39 | from random_sampler import RandomSampler 40 | import rnn_model 41 | from sampling_utils import SamplingUtils 42 | from transformer import Transformer as TransformerModel 43 | import translate_utils 44 | 45 | 46 | def main(settings): 47 | """ 48 | Translates a source language file (or STDIN) into a target language file 49 | (or STDOUT). 50 | """ 51 | # Create the TensorFlow session. 52 | g = tf.Graph() 53 | with g.as_default(): 54 | tf_config = tf.compat.v1.ConfigProto() 55 | tf_config.allow_soft_placement = True 56 | session = tf.compat.v1.Session(config=tf_config) 57 | 58 | # Load config file for each model. 59 | configs = [] 60 | for model in settings.models: 61 | config = load_config_from_json_file(model) 62 | setattr(config, 'reload', model) 63 | setattr(config, 'translation_maxlen', settings.translation_maxlen) 64 | configs.append(config) 65 | 66 | # Create the model graphs. 67 | logging.debug("Loading models\n") 68 | models = [] 69 | for i, config in enumerate(configs): 70 | with tf.compat.v1.variable_scope("model%d" % i) as scope: 71 | if config.model_type == "transformer": 72 | model = TransformerModel(config) 73 | else: 74 | model = rnn_model.RNNModel(config) 75 | model.sampling_utils = SamplingUtils(settings) 76 | models.append(model) 77 | 78 | # Add smoothing variables (if the models were trained with smoothing). 79 | #FIXME Assumes either all models were trained with smoothing or none were. 80 | if configs[0].exponential_smoothing > 0.0: 81 | smoothing = ExponentialSmoothing(configs[0].exponential_smoothing) 82 | 83 | # Restore the model variables. 84 | for i, config in enumerate(configs): 85 | with tf.compat.v1.variable_scope("model%d" % i) as scope: 86 | _ = model_loader.init_or_restore_variables(config, session, 87 | ensemble_scope=scope) 88 | 89 | # Swap-in the smoothed versions of the variables. 90 | if configs[0].exponential_smoothing > 0.0: 91 | session.run(fetches=smoothing.swap_ops) 92 | 93 | max_translation_len = settings.translation_maxlen 94 | 95 | # Create a BeamSearchSampler / RandomSampler. 96 | if settings.translation_strategy == 'beam_search': 97 | sampler = BeamSearchSampler(models, configs, settings.beam_size) 98 | else: 99 | assert settings.translation_strategy == 'sampling' 100 | sampler = RandomSampler(models, configs, settings.beam_size) 101 | 102 | # Warn about the change from neg log probs to log probs for the RNN. 103 | if settings.n_best: 104 | model_types = [config.model_type for config in configs] 105 | if 'rnn' in model_types: 106 | logging.warn('n-best scores for RNN models have changed from ' 107 | 'positive to negative (as of commit 95793196...). ' 108 | 'If you are using the scores for reranking etc, then ' 109 | 'you may need to update your scripts.') 110 | 111 | # Translate the source file. 
112 | translate_utils.translate_file( 113 | input_file=settings.input, 114 | output_file=settings.output, 115 | session=session, 116 | sampler=sampler, 117 | config=configs[0], 118 | max_translation_len=max_translation_len, 119 | normalization_alpha=settings.normalization_alpha, 120 | nbest=settings.n_best, 121 | minibatch_size=settings.minibatch_size, 122 | maxibatch_size=settings.maxibatch_size) 123 | 124 | 125 | if __name__ == "__main__": 126 | main(settings) 127 | -------------------------------------------------------------------------------- /nematus/translate_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | import time 4 | 5 | import numpy 6 | import tensorflow as tf 7 | 8 | # ModuleNotFoundError is new in 3.6; older versions will throw SystemError 9 | if sys.version_info < (3, 6): 10 | ModuleNotFoundError = SystemError 11 | 12 | try: 13 | from . import exception 14 | from . import util 15 | except (ModuleNotFoundError, ImportError) as e: 16 | import exception 17 | import util 18 | 19 | 20 | def translate_batch(session, sampler, x, x_mask, max_translation_len, 21 | normalization_alpha): 22 | """Translate a batch using a RandomSampler or BeamSearchSampler. 23 | 24 | Args: 25 | session: a TensorFlow session. 26 | sampler: a BeamSearchSampler or RandomSampler object. 27 | x: input Tensor with shape (factors, max_seq_len, batch_size). 28 | x_mask: mask Tensor for x with shape (max_seq_len, batch_size). 29 | max_translation_len: integer specifying maximum translation length. 30 | normalization_alpha: float specifying alpha parameter for length 31 | normalization. 32 | 33 | Returns: 34 | A list of lists of (translation, score) pairs. The outer list contains 35 | one list for each input sentence in the batch. The inner lists contain 36 | k elements (where k is the beam size), sorted by score in best-first 37 | order. 38 | """ 39 | 40 | x_tiled = numpy.tile(x, reps=[1, 1, sampler.beam_size]) 41 | x_mask_tiled = numpy.tile(x_mask, reps=[1, sampler.beam_size]) 42 | 43 | feed_dict = {} 44 | 45 | # Feed inputs to the models. 46 | for model, config in zip(sampler.models, sampler.configs): 47 | if config.model_type == 'rnn': 48 | feed_dict[model.inputs.x] = x_tiled 49 | feed_dict[model.inputs.x_mask] = x_mask_tiled 50 | else: 51 | assert config.model_type == 'transformer' 52 | # Inputs don't need to be tiled in the Transformer because it 53 | # checks for different batch sizes in the encoder and decoder and 54 | # does its own tiling internally at the connection points. 55 | feed_dict[model.inputs.x] = x 56 | feed_dict[model.inputs.x_mask] = x_mask 57 | feed_dict[model.inputs.training] = False 58 | 59 | # Feed inputs to the sampler. 60 | feed_dict[sampler.inputs.batch_size_x] = x.shape[-1] 61 | feed_dict[sampler.inputs.max_translation_len] = max_translation_len 62 | feed_dict[sampler.inputs.normalization_alpha] = normalization_alpha 63 | 64 | # Run the sampler. 65 | translations, scores = session.run(sampler.outputs, feed_dict=feed_dict) 66 | 67 | assert len(translations) == x.shape[-1] 68 | assert len(scores) == x.shape[-1] 69 | 70 | # Sort the translations by score. The scores are (optionally normalized) 71 | # log probs so higher values are better. 
72 | beams = [] 73 | for i in range(len(translations)): 74 | pairs = zip(translations[i], scores[i]) 75 | beams.append(sorted(pairs, key=lambda pair: pair[1], reverse=True)) 76 | 77 | return beams 78 | 79 | 80 | def translate_file(input_file, output_file, session, sampler, config, 81 | max_translation_len, normalization_alpha, nbest=False, 82 | minibatch_size=80, maxibatch_size=20): 83 | """Translates a source file using a RandomSampler or BeamSearchSampler. 84 | 85 | Args: 86 | input_file: file object from which source sentences will be read. 87 | output_file: file object to which translations will be written. 88 | session: TensorFlow session. 89 | sampler: BeamSearchSampler or RandomSampler object. 90 | config: model config. 91 | max_translation_len: integer specifying maximum translation length. 92 | normalization_alpha: float specifying alpha parameter for length 93 | normalization. 94 | nbest: if True, produce n-best output with scores; otherwise 1-best. 95 | minibatch_size: minibatch size in sentences. 96 | maxibatch_size: number of minibatches to read and sort, pre-translation. 97 | """ 98 | 99 | def translate_maxibatch(maxibatch, num_to_target, num_prev_translated): 100 | """Translates an individual maxibatch. 101 | 102 | Args: 103 | maxibatch: a list of sentences. 104 | num_to_target: dictionary mapping target vocabulary IDs to strings. 105 | num_prev_translated: the number of previously translated sentences. 106 | """ 107 | 108 | # Sort the maxibatch by length and split into minibatches. 109 | try: 110 | minibatches, idxs = util.read_all_lines(config, maxibatch, 111 | minibatch_size) 112 | except exception.Error as x: 113 | logging.error(x.msg) 114 | sys.exit(1) 115 | 116 | # Translate the minibatches and store the resulting beam (i.e. 117 | # translations and scores) for each sentence. 118 | beams = [] 119 | for x in minibatches: 120 | y_dummy = numpy.zeros(shape=(len(x),1)) 121 | x, x_mask, _, _ = util.prepare_data(x, y_dummy, config.factors, 122 | maxlen=None) 123 | sample = translate_batch(session, sampler, x, x_mask, 124 | max_translation_len, normalization_alpha) 125 | beams.extend(sample) 126 | num_translated = num_prev_translated + len(beams) 127 | logging.info('Translated {} sents'.format(num_translated)) 128 | 129 | # Put beams into the same order as the input maxibatch. 130 | tmp = numpy.array(beams, dtype=numpy.object) 131 | ordered_beams = tmp[idxs.argsort()] 132 | 133 | # Write the translations to the output file. 
134 | for i, beam in enumerate(ordered_beams): 135 | if nbest: 136 | num = num_prev_translated + i 137 | for sent, cost in beam: 138 | translation = util.seq2words(sent, num_to_target) 139 | line = "{} ||| {} ||| {}\n".format(num, translation, 140 | str(cost)) 141 | output_file.write(line) 142 | else: 143 | best_hypo, cost = beam[0] 144 | line = util.seq2words(best_hypo, num_to_target) + '\n' 145 | output_file.write(line) 146 | 147 | _, _, _, num_to_target = util.load_dictionaries(config) 148 | 149 | logging.info("NOTE: Length of translations is capped to {}".format( 150 | max_translation_len)) 151 | 152 | start_time = time.time() 153 | 154 | num_translated = 0 155 | maxibatch = [] 156 | while True: 157 | line = input_file.readline() 158 | if line == "": 159 | if len(maxibatch) > 0: 160 | translate_maxibatch(maxibatch, num_to_target, num_translated) 161 | num_translated += len(maxibatch) 162 | break 163 | maxibatch.append(line) 164 | if len(maxibatch) == (maxibatch_size * minibatch_size): 165 | translate_maxibatch(maxibatch, num_to_target, num_translated) 166 | num_translated += len(maxibatch) 167 | maxibatch = [] 168 | 169 | duration = time.time() - start_time 170 | logging.info('Translated {} sents in {} sec. Speed {} sents/sec'.format( 171 | num_translated, duration, num_translated/duration)) 172 | -------------------------------------------------------------------------------- /nematus/util.py: -------------------------------------------------------------------------------- 1 | """Utility functions.""" 2 | 3 | import pickle as pkl 4 | import json 5 | import logging 6 | import numpy 7 | import sys 8 | 9 | # ModuleNotFoundError is new in 3.6; older versions will throw SystemError 10 | if sys.version_info < (3, 6): 11 | ModuleNotFoundError = SystemError 12 | 13 | try: 14 | from . import exception 15 | except (ModuleNotFoundError, ImportError) as e: 16 | import exception 17 | 18 | # batch preparation 19 | def prepare_data(seqs_x, seqs_y, n_factors, maxlen=None): 20 | # x: a list of sentences 21 | lengths_x = [len(s) for s in seqs_x] 22 | lengths_y = [len(s) for s in seqs_y] 23 | 24 | if maxlen is not None: 25 | new_seqs_x = [] 26 | new_seqs_y = [] 27 | new_lengths_x = [] 28 | new_lengths_y = [] 29 | for l_x, s_x, l_y, s_y in zip(lengths_x, seqs_x, lengths_y, seqs_y): 30 | if l_x < maxlen and l_y < maxlen: 31 | new_seqs_x.append(s_x) 32 | new_lengths_x.append(l_x) 33 | new_seqs_y.append(s_y) 34 | new_lengths_y.append(l_y) 35 | lengths_x = new_lengths_x 36 | seqs_x = new_seqs_x 37 | lengths_y = new_lengths_y 38 | seqs_y = new_seqs_y 39 | 40 | if len(lengths_x) < 1 or len(lengths_y) < 1: 41 | return None, None, None, None 42 | 43 | n_samples = len(seqs_x) 44 | maxlen_x = numpy.max(lengths_x) + 1 45 | maxlen_y = numpy.max(lengths_y) + 1 46 | 47 | x = numpy.zeros((n_factors, maxlen_x, n_samples)).astype('int64') 48 | y = numpy.zeros((maxlen_y, n_samples)).astype('int64') 49 | x_mask = numpy.zeros((maxlen_x, n_samples)).astype('float32') 50 | y_mask = numpy.zeros((maxlen_y, n_samples)).astype('float32') 51 | for idx, [s_x, s_y] in enumerate(zip(seqs_x, seqs_y)): 52 | x[:, :lengths_x[idx], idx] = list(zip(*s_x)) 53 | x_mask[:lengths_x[idx]+1, idx] = 1. 54 | y[:lengths_y[idx], idx] = s_y 55 | y_mask[:lengths_y[idx]+1, idx] = 1. 56 | 57 | return x, x_mask, y, y_mask 58 | 59 | 60 | def load_dict(filename, model_type): 61 | try: 62 | # build_dictionary.py writes JSON files as UTF-8 so assume that here. 
63 | with open(filename, 'r', encoding='utf-8') as f: 64 | d = json.load(f) 65 | except: 66 | # Fall back to a pickled dictionary; pickle.load needs the file opened in binary mode. 67 | with open(filename, 'rb') as f: 68 | d = pkl.load(f) 69 | 70 | # The transformer model requires vocab dictionaries to use the new style 71 | # special symbols. If the dictionary looks like an old one then tell the 72 | # user to update it. 73 | if model_type == 'transformer' and ("<GO>" not in d or d["<GO>"] != 1): 74 | logging.error('you must update \'{}\' for use with the ' 75 | '\'transformer\' model type. Please re-run ' 76 | 'build_dictionary.py to generate a new vocabulary ' 77 | 'dictionary.'.format(filename)) 78 | sys.exit(1) 79 | 80 | return d 81 | 82 | 83 | def seq2words(seq, inverse_dictionary, join=True): 84 | seq = numpy.array(seq, dtype='int64') 85 | assert len(seq.shape) == 1 86 | return factoredseq2words(seq.reshape([seq.shape[0], 1]), 87 | [inverse_dictionary], 88 | join) 89 | 90 | def factoredseq2words(seq, inverse_dictionaries, join=True): 91 | assert len(seq.shape) == 2 92 | assert len(inverse_dictionaries) == seq.shape[1] 93 | words = [] 94 | eos_reached = False 95 | for i, w in enumerate(seq): 96 | if eos_reached: 97 | break 98 | factors = [] 99 | for j, f in enumerate(w): 100 | if f == 0: 101 | eos_reached = True 102 | break 103 | # This assert has been commented out because it's possible for 104 | # non-zero values to follow zero values for Transformer models. 105 | # TODO Check why this happens 106 | #assert (i == len(seq) - 1) or (seq[i+1][j] == 0), \ 107 | # ('Zero not at the end of sequence', seq) 108 | elif f in inverse_dictionaries[j]: 109 | factors.append(inverse_dictionaries[j][f]) 110 | else: 111 | factors.append('UNK') 112 | word = '|'.join(factors) 113 | words.append(word) 114 | return ' '.join(words) if join else words 115 | 116 | def reverse_dict(dictt): 117 | keys, values = list(zip(*list(dictt.items()))) 118 | r_dictt = dict(list(zip(values, keys))) 119 | return r_dictt 120 | 121 | 122 | def load_dictionaries(config): 123 | model_type = config.model_type 124 | source_to_num = [load_dict(d, model_type) for d in config.source_dicts] 125 | target_to_num = load_dict(config.target_dict, model_type) 126 | num_to_source = [reverse_dict(d) for d in source_to_num] 127 | num_to_target = reverse_dict(target_to_num) 128 | return source_to_num, target_to_num, num_to_source, num_to_target 129 | 130 | 131 | def read_all_lines(config, sentences, batch_size): 132 | source_to_num, _, _, _ = load_dictionaries(config) 133 | 134 | if config.source_vocab_sizes != None: 135 | assert len(config.source_vocab_sizes) == len(source_to_num) 136 | for d, vocab_size in zip(source_to_num, config.source_vocab_sizes): 137 | if vocab_size != None and vocab_size > 0: 138 | for key, idx in list(d.items()): 139 | if idx >= vocab_size: 140 | del d[key] 141 | 142 | lines = [] 143 | for sent in sentences: 144 | line = [] 145 | for w in sent.strip().split(): 146 | if config.factors == 1: 147 | w = [source_to_num[0][w] if w in source_to_num[0] else 2] 148 | else: 149 | w = [source_to_num[i][f] if f in source_to_num[i] else 2 150 | for (i,f) in enumerate(w.split('|'))] 151 | if len(w) != config.factors: 152 | raise exception.Error( 153 | 'Expected {0} factors, but input word has {1}\n'.format( 154 | config.factors, len(w))) 155 | line.append(w) 156 | lines.append(line) 157 | lines = numpy.array(lines) 158 | lengths = numpy.array([len(l) for l in lines]) 159 | idxs = lengths.argsort() 160 | lines = lines[idxs] 161 | 162 | #merge into batches 163 | 
batches = [] 164 | for i in range(0, len(lines), batch_size): 165 | batch = lines[i:i+batch_size] 166 | batches.append(batch) 167 | 168 | return batches, idxs 169 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import io 5 | import setuptools 6 | 7 | setuptools.setup( 8 | name='nematus', 9 | version='0.5', 10 | description='Neural machine translation tools on top of Tensorflow', 11 | long_description=io.open(os.path.join(os.path.dirname( 12 | os.path.abspath(__file__)), 'README.md'),encoding='UTF-8').read(), 13 | license='BSD 3-clause', 14 | url='http://github.com/EdinburghNLP/nematus', 15 | install_requires=['numpy', 16 | 'tensorflow'], 17 | classifiers=['Development Status :: 3 - Alpha', 18 | 'Intended Audience :: Science/Research', 19 | 'License :: OSI Approved :: BSD License', 20 | 'Operating System :: OS Independent', 21 | 'Topic :: Scientific/Engineering'], 22 | packages = ['nematus', 'nematus.metrics'], 23 | ) 24 | -------------------------------------------------------------------------------- /test/README.md: -------------------------------------------------------------------------------- 1 | Testing Nematus 2 | --------------- 3 | 4 | To test translation (on GPU 0), execute 5 | 6 | CUDA_VISIBLE_DEVICES=0 python3 test_translate.py 7 | 8 | To test scoring (on GPU 0), execute 9 | 10 | CUDA_VISIBLE_DEVICES=0 python3 test_score.py 11 | 12 | more sample models (including scripts for pre- and postprocessing) 13 | are provided at: http://statmt.org/rsennrich/wmt16_systems/ 14 | 15 | to test training (on GPU 0), execute 16 | 17 | CUDA_VISIBLE_DEVICES=0 ./test_train.sh 18 | 19 | note that the training script is just a toy setup to make sure the scripts run, 20 | and to allow for speed comparisons. For instructions to train a 21 | real-scale system, check the instructions at https://github.com/rsennrich/wmt16-scripts 22 | -------------------------------------------------------------------------------- /test/en-de/in: -------------------------------------------------------------------------------- 1 | a Republican strategy to counter the re-election of Obama 2 | Republican leaders justified their policy by the need to combat electoral fraud . 3 | however , the Brenn@@ an Centre considers this a myth , stating that electoral fraud is rar@@ er in the United States than the number of people killed by lightning . 4 | indeed , Republican lawyers identified only 300 cases of electoral fraud in the United States in a decade . 5 | one thing is certain : these new provisions will have a negative impact on voter turn@@ -out . 6 | -------------------------------------------------------------------------------- /test/en-de/ref: -------------------------------------------------------------------------------- 1 | eine republi@@ kanische Strategie gegen die Wiederwahl Obamas 2 | 0.977844655514 0.90209954977 0.927412986755 0.984532177448 0.183520868421 0.907861471176 0.994144678116 0.917708992958 0.990146577358 3 | die republi@@ kanische Führung begründet ihre Politik mit der Notwendigkeit , Wahl@@ betrug zu bekämpfen . 
4 | 0.624975204468 0.467659324408 0.895200014114 0.922666728497 0.332508355379 0.962346553802 0.985188066959 0.511733949184 0.702501058578 0.733234107494 0.834280848503 0.298875242472 0.978177785873 0.962297916412 0.991670489311 0.998888731003 0.999692261219 5 | das Brenn@@ an Zentrum hält dies aber für einen Mythos , der besagt , dass Wahl@@ betrug in den USA seltener ist als die Zahl der getö@@ teten Menschen . 6 | 0.153531059623 0.871728599072 0.346277505159 0.747219443321 0.871806800365 0.120552673936 0.37667247653 0.782940626144 0.822250068188 0.98460739851 0.73440104723 0.481711357832 0.311930894852 0.961221635342 0.896834015846 0.427923560143 0.903929233551 0.673036038876 0.992655754089 0.739101171494 0.754340946674 0.522766292095 0.916598856449 0.96203070879 0.791576385498 0.890906095505 0.162579834461 0.99129909277 0.765361487865 0.619172334671 0.999593555927 7 | tatsächlich wurden in den USA in einem Jahrzehnt nur 300 Fälle von Wahl@@ betrug in den USA festgestellt . 8 | 0.874663293362 0.193072125316 0.830588340759 0.950349152088 0.536000072956 0.732309579849 0.601523339748 0.985651493073 0.771518468857 0.963857293129 0.582112908363 0.782780885696 0.960188984871 0.962329685688 0.735553085804 0.973220407963 0.69519174099 0.764474630356 0.998193442822 0.999425113201 9 | eines ist sicher : diese neuen Bestimmungen werden negative Auswirkungen auf die Wahlbeteiligung haben . 10 | 0.634134709835 0.78360158205 0.81129103899 0.985949218273 0.919415593147 0.925939559937 0.844495713711 0.82704269886 0.344317674637 0.952615022659 0.954769909382 0.629434704781 0.463058054447 0.923200011253 0.998686730862 0.999255955219 11 | -------------------------------------------------------------------------------- /test/en-de/ref2: -------------------------------------------------------------------------------- 1 | eine republi@@ kanische Strategie gegen die Wiederwahl Obamas 2 | die republi@@ kanische Führung begründet ihre Politik mit der Notwendigkeit , Wahl@@ betrug zu bekämpfen . 3 | das Brenn@@ an Zentrum hält dies aber für einen Mythos , der besagt , dass Wahl@@ betrug in den USA seltener ist als die Zahl der getö@@ teten Menschen . 4 | tatsächlich wurden in den USA in einem Jahrzehnt nur 300 Fälle von Wahl@@ betrug in den USA festgestellt . 5 | eines ist sicher : diese neuen Bestimmungen werden negative Auswirkungen auf die Wahlbeteiligung haben . 6 | -------------------------------------------------------------------------------- /test/en-de/ref_score: -------------------------------------------------------------------------------- 1 | eine republi@@ kanische Strategie , um der Wiederwahl von Obama entgegenzutreten 0.688558 2 | die Führungskräfte der Republikaner rechtfertigen ihre Politik mit der Notwendigkeit , den Wahl@@ betrug zu bekämpfen . 1.18311 3 | allerdings hält das Brenn@@ an Center letzteres für einen Mythos , indem es bekräftigt , dass der Wahl@@ betrug in den USA seltener ist als die Anzahl der vom Blitz@@ schlag getö@@ teten Menschen . 1.44055 4 | die Rechtsanwälte der Republikaner haben in 10 Jahren in den USA übrigens nur 300 Fälle von Wahl@@ betrug verzeichnet . 2.32595 5 | eins ist sicher : diese neuen Bestimmungen werden sich negativ auf die Wahlbeteiligung auswirken . 
0.40967 6 | -------------------------------------------------------------------------------- /test/en-de/references: -------------------------------------------------------------------------------- 1 | eine republi@@ kanische Strategie , um der Wiederwahl von Obama entgegenzutreten 2 | die Führungskräfte der Republikaner rechtfertigen ihre Politik mit der Notwendigkeit , den Wahl@@ betrug zu bekämpfen . 3 | allerdings hält das Brenn@@ an Center letzteres für einen Mythos , indem es bekräftigt , dass der Wahl@@ betrug in den USA seltener ist als die Anzahl der vom Blitz@@ schlag getö@@ teten Menschen . 4 | die Rechtsanwälte der Republikaner haben in 10 Jahren in den USA übrigens nur 300 Fälle von Wahl@@ betrug verzeichnet . 5 | eins ist sicher : diese neuen Bestimmungen werden sich negativ auf die Wahlbeteiligung auswirken . 6 | -------------------------------------------------------------------------------- /test/en-ro/in: -------------------------------------------------------------------------------- 1 | the European Commission decided on Tuesday to resume payments for Romania under the " Economic competitiveness " and " Environment " programs , both interrupted in early April 2015 . 2 | the judge did not rule on whether L@@ M@@ FAO 's song itself was an un@@ authorized copy of " H@@ ust@@ lin ' . " 3 | the Romanian national team is part of Group D in the World Cup in England , along with France , Ireland , Canada and Italy . 4 | it sends a message : your country does not value you becoming a parent . 5 | the round@@ about will be made at the appropriate time , we must consider the trams traffic in the area , and we also need an approval from the National Roads . 6 | -------------------------------------------------------------------------------- /test/en-ro/ref: -------------------------------------------------------------------------------- 1 | Comisia Europeană a decis , marți , să reia plățile pentru România în cadrul programelor " competitivitate economică " și " Mediu " , ambele întrerupte la începutul lunii aprilie 2015 . 2 | 0.995251238346 0.554548621178 0.986067473888 0.977536916733 0.471415698528 0.965951085091 0.991383254528 0.735538363457 0.99354493618 0.959721267223 0.960633397102 0.987248241901 0.73650187254 0.958207905293 0.329731225967 0.941679000854 0.48397654295 0.872097313404 0.995552778244 0.99405169487 0.820243418217 0.72900468111 0.978062391281 0.980996310711 0.959786713123 0.870699226856 0.956985473633 0.989414513111 0.948426306248 0.996526777744 0.996653676033 0.995466053486 0.999979257584 3 | judecătorul nu a exclus dacă melodia L@@ M@@ FAO în sine a fost o copie ne@@ autorizată a " H@@ ust@@ lin " " . 4 | 0.748930931091 0.976350605488 0.90377175808 0.238382071257 0.800515711308 0.51756888628 0.782619535923 0.955519676208 0.894009530544 0.183243229985 0.996174514294 0.782620131969 0.927685260773 0.802042484283 0.788843691349 0.390572547913 0.356075167656 0.823610961437 0.785067260265 0.941457808018 0.976138412952 0.979526996613 0.859899282455 0.516458272934 0.989753842354 0.999218225479 5 | naționala României face parte din Grupa D în Cupa Mondială din Anglia , alături de Franța , Irlanda , Canada și Italia . 
6 | 0.336522132158 0.97390460968 0.485618531704 0.998266816139 0.977845489979 0.972954690456 0.995464265347 0.582527756691 0.900587379932 0.904148697853 0.926693975925 0.990065574646 0.982615590096 0.970086634159 0.995798170567 0.985046744347 0.999237596989 0.992471039295 0.998591423035 0.994875609875 0.995780050755 0.996373534203 0.996117174625 0.99995225668 7 | transmite un mesaj : țara dumneavoastră nu apreciază că devine părinte . 8 | 0.343225359917 0.930076539516 0.998842597008 0.99683535099 0.859772562981 0.548672556877 0.990485429764 0.126094281673 0.79455691576 0.418934345245 0.782570242882 0.974693894386 0.999907135963 9 | discu@@ tia despre care se va face la momentul oportun , trebuie sa avem in vedere traficul de tramvaie din zona si avem nevoie si de o aprobare de la Compania Nationala de auto@@ str@@ azi . 10 | 0.0200441926718 0.983767747879 0.254417777061 0.626059830189 0.914376199245 0.6536039114 0.598313570023 0.473640501499 0.660739302635 0.727347791195 0.59266859293 0.936970472336 0.982369661331 0.226246803999 0.963698983192 0.996792733669 0.877014875412 0.466113090515 0.902705550194 0.558049559593 0.952391505241 0.783247053623 0.856589257717 0.994170725346 0.818028509617 0.973481237888 0.627151310444 0.944025158882 0.787506222725 0.954702436924 0.380503386259 0.954006671906 0.737255275249 0.340464830399 0.983342587948 0.980352401733 0.994605183601 0.99982637167 11 | -------------------------------------------------------------------------------- /test/en-ro/ref_score: -------------------------------------------------------------------------------- 1 | Comisia Europeana a luat marti decizia de a relua pl@@ atile pentru Romania în cadrul programelor " Competitivitate Econom@@ ica " și " Mediu " , ambele intre@@ rupte la inceputul lunii aprilie 2015 . 1.10127 2 | judecătoarea nu a hotărât dacă melodia trupei L@@ M@@ FAO este o copie neautorizată a lui " H@@ ust@@ lin ' " . 1.43826 3 | nationala " tricol@@ ora " face parte din Grupa D la Mondi@@ alul din Anglia , alaturi de Franta , Irlanda , Canada și Italia . 1.16586 4 | trimite un mesaj : țara ta nu pune vreo valoare pe faptul că vei deveni părinte . 2.04865 5 | Gir@@ ația va fi făcută la momentul potrivit , trebuie să ținem cont de circulația tramv@@ aielor în zonă , trebuie un aviz și de la Drumuri Naționale . 2.03933 6 | -------------------------------------------------------------------------------- /test/en-ro/references: -------------------------------------------------------------------------------- 1 | Comisia Europeana a luat marti decizia de a relua pl@@ atile pentru Romania în cadrul programelor " Competitivitate Econom@@ ica " și " Mediu " , ambele intre@@ rupte la inceputul lunii aprilie 2015 . 2 | judecătoarea nu a hotărât dacă melodia trupei L@@ M@@ FAO este o copie neautorizată a lui " H@@ ust@@ lin ' " . 3 | nationala " tricol@@ ora " face parte din Grupa D la Mondi@@ alul din Anglia , alaturi de Franta , Irlanda , Canada și Italia . 4 | trimite un mesaj : țara ta nu pune vreo valoare pe faptul că vei deveni părinte . 5 | Gir@@ ația va fi făcută la momentul potrivit , trebuie să ținem cont de circulația tramv@@ aielor în zonă , trebuie un aviz și de la Drumuri Naționale . 
6 | -------------------------------------------------------------------------------- /test/models/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | build 3 | dist 4 | nmt.egg-info 5 | -------------------------------------------------------------------------------- /test/test_score.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import os 5 | import unittest 6 | import logging 7 | 8 | sys.path.append(os.path.abspath('../nematus')) 9 | from score import main as score 10 | from settings import ScorerSettings 11 | from test_utils import load_wmt16_model 12 | 13 | level = logging.DEBUG 14 | logging.basicConfig(level=level, format='%(levelname)s: %(message)s') 15 | 16 | class TestScore(unittest.TestCase): 17 | """ 18 | Regression tests for scoring with WMT16 models 19 | """ 20 | 21 | def setUp(self): 22 | """ 23 | Download pre-trained models 24 | """ 25 | load_wmt16_model('en','de') 26 | 27 | def scoreEqual(self, output1, output2): 28 | """Given two files with translation scores, check that probabilities 29 | are equal within rounding error. 30 | """ 31 | with open(output1, 'r', encoding='utf-8') as out1, \ 32 | open(output2, 'r', encoding='utf-8') as out2: 33 | for (line1, line2) in zip(out1.readlines(), out2.readlines()): 34 | score1 = float(line1.split()[-1]) 35 | score2 = float(line2.split()[-1]) 36 | self.assertAlmostEqual(score1, score2, places=5) 37 | 38 | # English-German WMT16 system, no dropout 39 | def test_ende(self): 40 | os.chdir('models/en-de/') 41 | with open('../../en-de/in', 'r', encoding='utf-8') as in_file, \ 42 | open('../../en-de/references', 'r', encoding='utf-8') as ref_file, \ 43 | open('../../en-de/out_score', 'w', encoding='utf-8') as score_file: 44 | settings = ScorerSettings() 45 | settings.models = ['model.npz'] 46 | settings.minibatch_size = 80 47 | settings.normalization_alpha = 1.0 48 | score(in_file, ref_file, score_file, settings) 49 | os.chdir('../..') 50 | self.scoreEqual('en-de/ref_score', 'en-de/out_score') 51 | 52 | 53 | if __name__ == '__main__': 54 | unittest.main() 55 | -------------------------------------------------------------------------------- /test/test_train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # warning: this test is useful to check if training fails, and what speed you can achieve 4 | # the toy datasets are too small to obtain useful translation results, 5 | # and hyperparameters are chosen for speed, not for quality. 
6 | # For a setup that preprocesses and trains a larger data set, 7 | # check https://github.com/rsennrich/wmt16-scripts/tree/master/sample 8 | 9 | ../nematus/train.py \ 10 | --model models/model.npz \ 11 | --datasets data/corpus.en data/corpus.de \ 12 | --dictionaries data/vocab.en.json data/vocab.de.json \ 13 | --dim_word 256 \ 14 | --dim 512 \ 15 | --n_words_src 30000 \ 16 | --n_words 30000 \ 17 | --maxlen 50 \ 18 | --optimizer adam \ 19 | --lrate 0.0001 \ 20 | --batch_size 40 \ 21 | --no_shuffle \ 22 | --dispFreq 500 \ 23 | --finish_after 500 24 | -------------------------------------------------------------------------------- /test/test_train_l2_loss.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # warning: this test is useful to check if training fails, and what speed you can achieve 4 | # the toy datasets are too small to obtain useful translation results, 5 | # and hyperparameters are chosen for speed, not for quality. 6 | # For a setup that preprocesses and trains a larger data set, 7 | # check https://github.com/rsennrich/wmt16-scripts/tree/master/sample 8 | 9 | ../nematus/train.py \ 10 | --model models/model.npz \ 11 | --datasets data/corpus.en data/corpus.de \ 12 | --dictionaries data/vocab.en.json data/vocab.de.json \ 13 | --dim_word 256 \ 14 | --dim 512 \ 15 | --n_words_src 30000 \ 16 | --n_words 30000 \ 17 | --maxlen 50 \ 18 | --optimizer adam \ 19 | --lrate 0.0001 \ 20 | --batch_size 40 \ 21 | --no_shuffle \ 22 | --dispFreq 500 \ 23 | --finish_after 500 \ 24 | --decay_c 0.0001 25 | -------------------------------------------------------------------------------- /test/test_train_mapl2_loss.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # warning: this test is useful to check if training fails, and what speed you can achieve 4 | # the toy datasets are too small to obtain useful translation results, 5 | # and hyperparameters are chosen for speed, not for quality. 6 | # For a setup that preprocesses and trains a larger data set, 7 | # check https://github.com/rsennrich/wmt16-scripts/tree/master/sample 8 | 9 | ../nematus/train.py \ 10 | --model models/model.npz \ 11 | --datasets data/corpus.en data/corpus.de \ 12 | --dictionaries data/vocab.en.json data/vocab.de.json \ 13 | --dim_word 256 \ 14 | --dim 512 \ 15 | --n_words_src 30000 \ 16 | --n_words 30000 \ 17 | --maxlen 50 \ 18 | --optimizer adam \ 19 | --lrate 0.0001 \ 20 | --batch_size 40 \ 21 | --no_shuffle \ 22 | --dispFreq 500 \ 23 | --finish_after 500 \ 24 | --map_decay_c 0.0001 \ 25 | --prior_model models/model.npz-500 \ 26 | --reload latest_checkpoint 27 | -------------------------------------------------------------------------------- /test/test_train_outputactivations.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # warning: this test is useful to check if training fails, and what speed you can achieve 4 | # the toy datasets are too small to obtain useful translation results, 5 | # and hyperparameters are chosen for speed, not for quality. 
6 | # For a setup that preprocesses and trains a larger data set, 7 | # check https://github.com/rsennrich/wmt16-scripts/tree/master/sample 8 | 9 | ../nematus/train.py \ 10 | --model models/model.npz \ 11 | --datasets data/corpus.en data/corpus.de \ 12 | --dictionaries data/vocab.en.json data/vocab.de.json \ 13 | --dim_word 256 \ 14 | --dim 512 \ 15 | --n_words_src 30000 \ 16 | --n_words 30000 \ 17 | --maxlen 50 \ 18 | --optimizer adam \ 19 | --lrate 0.0001 \ 20 | --batch_size 40 \ 21 | --no_shuffle \ 22 | --dispFreq 500 \ 23 | --finish_after 500 \ 24 | --output_hidden_activation relu 25 | -------------------------------------------------------------------------------- /test/test_train_reload.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # warning: this test is useful to check if training fails, and what speed you can achieve 4 | # the toy datasets are too small to obtain useful translation results, 5 | # and hyperparameters are chosen for speed, not for quality. 6 | # For a setup that preprocesses and trains a larger data set, 7 | # check https://github.com/rsennrich/wmt16-scripts/tree/master/sample 8 | 9 | ../nematus/train.py \ 10 | --model models/model.npz \ 11 | --datasets data/corpus.en data/corpus.de \ 12 | --dictionaries data/vocab.en.json data/vocab.de.json \ 13 | --dim_word 256 \ 14 | --dim 512 \ 15 | --n_words_src 30000 \ 16 | --n_words 30000 \ 17 | --maxlen 50 \ 18 | --optimizer adam \ 19 | --lrate 0.0001 \ 20 | --batch_size 40 \ 21 | --no_shuffle \ 22 | --dispFreq 500 \ 23 | --finish_after 500 \ 24 | --reload latest_checkpoint 25 | -------------------------------------------------------------------------------- /test/test_train_summaries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # warning: this test is useful to check if training fails, and what speed you can achieve 4 | # the toy datasets are too small to obtain useful translation results, 5 | # and hyperparameters are chosen for speed, not for quality. 6 | # For a setup that preprocesses and trains a larger data set, 7 | # check https://github.com/rsennrich/wmt16-scripts/tree/master/sample 8 | 9 | ../nematus/train.py \ 10 | --model models/model.npz \ 11 | --datasets data/corpus.en data/corpus.de \ 12 | --dictionaries data/vocab.en.json data/vocab.de.json \ 13 | --dim_word 256 \ 14 | --dim 512 \ 15 | --n_words_src 30000 \ 16 | --n_words 30000 \ 17 | --maxlen 50 \ 18 | --optimizer adam \ 19 | --lrate 0.0001 \ 20 | --batch_size 40 \ 21 | --no_shuffle \ 22 | --dispFreq 500 \ 23 | --finish_after 500 \ 24 | --reload latest_checkpoint \ 25 | --summaryFreq 30000 26 | -------------------------------------------------------------------------------- /test/test_train_transformer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # warning: this test is useful to check if training fails, and what speed you can achieve 4 | # the toy datasets are too small to obtain useful translation results, 5 | # and hyperparameters are chosen for speed, not for quality. 
6 | # For a setup that preprocesses and trains a larger data set, 7 | # check https://github.com/rsennrich/wmt16-scripts/tree/master/sample 8 | 9 | ../nematus/train.py \ 10 | --model models/model.npz \ 11 | --datasets data/corpus.en data/corpus.de \ 12 | --dictionaries data/vocab.json data/vocab.json \ 13 | --n_words_src 10000 \ 14 | --n_words 10000 \ 15 | --model_type transformer \ 16 | --embedding_size 128 \ 17 | --tie_encoder_decoder_embeddings \ 18 | --tie_decoder_embeddings \ 19 | --state_size 128 \ 20 | --transformer_enc_depth 2 \ 21 | --transformer_dec_depth 2 \ 22 | --transformer_ffn_hidden_size 256 \ 23 | --loss_function per-token-cross-entropy \ 24 | --clip_c 0.0 \ 25 | --label_smoothing 0.1 \ 26 | --optimizer adam \ 27 | --adam_beta1 0.9 \ 28 | --adam_beta2 0.98 \ 29 | --adam_epsilon 1e-09 \ 30 | --learning_schedule transformer \ 31 | --warmup_steps 4000 \ 32 | --maxlen 100 \ 33 | --batch_size 300 \ 34 | --token_batch_size 3000 \ 35 | --disp_freq 500 \ 36 | --finish_after 500 37 | -------------------------------------------------------------------------------- /test/test_translate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import os 5 | import unittest 6 | import logging 7 | 8 | sys.path.append(os.path.abspath('../nematus')) 9 | from translate import main as translate 10 | from settings import TranslationSettings 11 | from test_utils import load_wmt16_model 12 | 13 | level = logging.DEBUG 14 | logging.basicConfig(level=level, format='%(levelname)s: %(message)s') 15 | 16 | class TestTranslate(unittest.TestCase): 17 | """ 18 | Regression tests for translation with WMT16 models 19 | """ 20 | 21 | def setUp(self): 22 | """ 23 | Download pre-trained models 24 | """ 25 | load_wmt16_model('en','de') 26 | 27 | def outputEqual(self, output1, output2): 28 | """given two translation outputs, check that output string is identical 29 | """ 30 | with open(output1, 'r', encoding='utf-8') as out1, \ 31 | open(output2, 'r', encoding='utf-8') as out2: 32 | for (line1, line2) in zip(out1.readlines(), out2.readlines()): 33 | self.assertEqual(line1.strip(), line2.strip()) 34 | 35 | # English-German WMT16 system, no dropout 36 | def test_ende(self): 37 | with open('en-de/in', 'r', encoding='utf-8') as in_file, \ 38 | open('en-de/out', 'w', encoding='utf-8') as out_file: 39 | os.chdir('models/en-de/') 40 | settings = TranslationSettings() 41 | settings.input = in_file 42 | settings.output = out_file 43 | settings.models = ["model.npz"] 44 | settings.beam_size = 12 45 | settings.normalization_alpha = 1.0 46 | translate(settings=settings) 47 | os.chdir('../..') 48 | self.outputEqual('en-de/ref2','en-de/out') 49 | 50 | 51 | if __name__ == '__main__': 52 | unittest.main() 53 | -------------------------------------------------------------------------------- /test/test_translate_sampling.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import os 5 | import unittest 6 | import logging 7 | 8 | sys.path.append(os.path.abspath('../nematus')) 9 | from translate import main as translate 10 | from settings import TranslationSettings 11 | from test_utils import load_wmt16_model 12 | 13 | level = logging.DEBUG 14 | logging.basicConfig(level=level, format='%(levelname)s: %(message)s') 15 | 16 | class TestTranslate(unittest.TestCase): 17 | """ 18 | Regression tests for translation with WMT16 models 19 | """ 20 | 21 | def setUp(self): 22 | """ 
23 | Download pre-trained models 24 | """ 25 | load_wmt16_model('en','de') 26 | 27 | def outputEqual(self, output1, output2): 28 | """given two translation outputs, check that output string is identical 29 | """ 30 | with open(output1, 'r', encoding='utf-8') as out1, \ 31 | open(output2, 'r', encoding='utf-8') as out2: 32 | for (line1, line2) in zip(out1.readlines(), out2.readlines()): 33 | self.assertEqual(line1.strip(), line2.strip()) 34 | 35 | # English-German WMT16 system, no dropout 36 | def test_ende(self): 37 | with open('en-de/in', 'r', encoding='utf-8') as in_file, \ 38 | open('en-de/out', 'w', encoding='utf-8') as out_file: 39 | os.chdir('models/en-de/') 40 | settings = TranslationSettings() 41 | settings.input = in_file 42 | settings.output = out_file 43 | settings.models = ["model.npz"] 44 | settings.beam_size = 12 45 | settings.normalization_alpha = 1.0 46 | settings.translation_strategy = 'sampling' 47 | settings.sampling_temperature = 0.4 48 | translate(settings=settings) 49 | os.chdir('../..') 50 | self.outputEqual('en-de/ref2','en-de/out') 51 | 52 | 53 | if __name__ == '__main__': 54 | unittest.main() 55 | -------------------------------------------------------------------------------- /test/test_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import os 5 | import requests 6 | from shutil import copyfile 7 | 8 | sys.path.append(os.path.abspath('../nematus')) 9 | from theano_tf_convert import theano_to_tensorflow_model 10 | 11 | def load_wmt16_model(src, target): 12 | path = os.path.join('models', '{0}-{1}'.format(src,target)) 13 | try: 14 | os.makedirs(path) 15 | except OSError: 16 | pass 17 | for filename in ['model.npz.json', 'model.npz', 'vocab.{0}.json'.format(src), 'vocab.{0}.json'.format(target)]: 18 | if not os.path.exists(os.path.join(path, filename)): 19 | if filename == 'model.npz' and os.path.exists(os.path.join(path, 'model.npz.index')): 20 | continue 21 | r = requests.get('http://data.statmt.org/rsennrich/wmt16_systems/{0}-{1}/'.format(src,target) + filename, stream=True) 22 | with open(os.path.join(path, filename), 'wb') as f: 23 | for chunk in r.iter_content(1024**2): 24 | f.write(chunk) 25 | 26 | # regression test is based on Theano model - convert to TF names 27 | if filename == 'model.npz.json' and not os.path.exists(os.path.join(path, 'model.npz.index')): 28 | copyfile(os.path.join(path, 'model.npz.json'), os.path.join(path, 'model-theano.npz.json')) 29 | elif filename == 'model.npz' and not os.path.exists(os.path.join(path, 'model.npz.index')): 30 | os.rename(os.path.join(path, 'model.npz'), os.path.join(path, 'model-theano.npz')) 31 | theano_to_tensorflow_model(os.path.join(path, 'model-theano.npz'), os.path.join(path, 'model.npz')) 32 | -------------------------------------------------------------------------------- /utils/copy_unknown_words.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This script is to replace the unknown words in target sentences with their aligned words in source sentences. 
3 | Args: 4 | - input: a text file (json format), each line 5 | including a full alignment matrix, a pair of source and target sentences 6 | - output (optional): updated text file (json format) 7 | - unknown word token (optional): a string, default="UNK" 8 | To use: 9 | python copy_unknown_words.py -i translation.txt -o updated_translation.txt -u 'UNK' 10 | ''' 11 | 12 | import json 13 | import numpy 14 | import argparse 15 | import sys 16 | 17 | ''' 18 | Example input file: 19 | {"id": 0, "prob": 0, "target_sent": "Obama empfängt Netanjahu", "matrix": [[0.9239920377731323, 0.04680762067437172, 0.003626488381996751, 0.02343202754855156, 0.0021418146789073944], [0.009942686185240746, 0.4995519518852234, 0.44341862201690674, 0.02077348716557026, 0.026313267648220062], [0.01032756082713604, 0.6475557088851929, 0.029476342722773552, 0.27724361419677734, 0.035396818071603775], [0.0010026689851656556, 0.35200807452201843, 0.06362949311733246, 0.4778701961040497, 0.1054895892739296]], "source_sent": "Obama kindly receives Netanjahu"} 20 | ''' 21 | 22 | def copy_unknown_words(filename, out_filename, unk_token): 23 | for line in filename: 24 | sent_pair = json.loads(line) 25 | # print "Translation:" 26 | # print sent_pair 27 | source_sent = sent_pair["source_sent"] 28 | target_sent = sent_pair["target_sent"] 29 | # matrix dimension: (len(target_sent) + 1) * (len(source_sent) + 1) 30 | # sum of values in a row = 1 31 | full_alignment = sent_pair["matrix"] 32 | source_words = source_sent.split() 33 | target_words = target_sent.split() 34 | # get the indices of maximum values in each row 35 | # (best alignment for each target word) 36 | hard_alignment = numpy.argmax(full_alignment, axis=1) 37 | # print hard_alignment 38 | 39 | updated_target_words = [] 40 | for j in range(len(target_words)): 41 | if target_words[j] == unk_token: 42 | unk_source = source_words[hard_alignment[j]] 43 | updated_target_words.append(unk_source) 44 | else: 45 | updated_target_words.append(target_words[j]) 46 | 47 | sent_pair["target_sent"] = " ".join(updated_target_words) 48 | # print "Updated translation:" 49 | # print sent_pair 50 | sent_pair = json.dumps(sent_pair).decode('unicode-escape').encode('utf8') 51 | print(sent_pair, file=out_filename) 52 | 53 | if __name__ == "__main__": 54 | parser = argparse.ArgumentParser() 55 | parser.add_argument('--input', '-i', type=argparse.FileType('r'), 56 | metavar='PATH', required=True, 57 | help='''Input text file in json format including alignment matrix, 58 | source sentences, target sentences''') 59 | parser.add_argument('--output', '-o', type=argparse.FileType('w'), 60 | default=sys.stdout, metavar='PATH', 61 | help="Output file (default: standard output)") 62 | parser.add_argument('--unknown', '-u', type=str, nargs = '?', default="UNK", 63 | help='Unknown token to be replaced (default: "UNK")') 64 | 65 | args = parser.parse_args() 66 | 67 | copy_unknown_words(args.input, args.output, args.unk) 68 | 69 | 70 | -------------------------------------------------------------------------------- /utils/plot_heatmap.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import matplotlib.pyplot as plt 3 | import sys 4 | import json 5 | import argparse 6 | 7 | # input: 8 | # alignment matrix - numpy array 9 | # shape (target tokens + eos, number of hidden source states = source tokens +eos) 10 | # one line correpsonds to one decoding step producing one target token 11 | # each line has the attention model weights corresponding to that 
decoding step 12 | # each float on a line is the attention model weight for a corresponding source state. 13 | # plot: a heat map of the alignment matrix 14 | # x axis are the source tokens (alignment is to source hidden state that roughly corresponds to a source token) 15 | # y axis are the target tokens 16 | 17 | # http://stackoverflow.com/questions/14391959/heatmap-in-matplotlib-with-pcolor 18 | def plot_head_map(mma, target_labels, source_labels): 19 | fig, ax = plt.subplots() 20 | heatmap = ax.pcolor(mma, cmap=plt.cm.Blues) 21 | 22 | # put the major ticks at the middle of each cell 23 | ax.set_xticks(numpy.arange(mma.shape[1])+0.5, minor=False) 24 | ax.set_yticks(numpy.arange(mma.shape[0])+0.5, minor=False) 25 | 26 | # without this I get some extra columns rows 27 | # http://stackoverflow.com/questions/31601351/why-does-this-matplotlib-heatmap-have-an-extra-blank-column 28 | ax.set_xlim(0, int(mma.shape[1])) 29 | ax.set_ylim(0, int(mma.shape[0])) 30 | 31 | # want a more natural, table-like display 32 | ax.invert_yaxis() 33 | ax.xaxis.tick_top() 34 | 35 | # source words -> column labels 36 | ax.set_xticklabels(source_labels, minor=False) 37 | # target words -> row labels 38 | ax.set_yticklabels(target_labels, minor=False) 39 | 40 | plt.xticks(rotation=45) 41 | 42 | #plt.tight_layout() 43 | plt.show() 44 | 45 | # column labels -> target words 46 | # row labels -> source words 47 | 48 | def read_alignment_matrix(f): 49 | header = f.readline().strip().split('|||') 50 | if header[0] == '': 51 | return None, None, None, None 52 | sid = int(header[0].strip()) 53 | # number of tokens in source and translation +1 for eos 54 | src_count, trg_count = map(int,header[-1].split()) 55 | # source words 56 | source_labels = header[3].decode('UTF-8').split() 57 | source_labels.append('') 58 | # target words 59 | target_labels = header[1].decode('UTF-8').split() 60 | target_labels.append('') 61 | 62 | mm = [] 63 | for r in range(trg_count): 64 | alignment = map(float,f.readline().strip().split()) 65 | mm.append(alignment) 66 | mma = numpy.array(mm) 67 | return sid,mma, target_labels, source_labels 68 | 69 | 70 | def read_plot_alignment_matrices(f, n): 71 | while(f): 72 | sid, mma, target_labels, source_labels = read_alignment_matrix(f) 73 | if mma is None: 74 | return 75 | if sid >n: 76 | return 77 | plot_head_map(mma, target_labels, source_labels) 78 | # empty line separating the matrices 79 | f.readline() 80 | 81 | 82 | """ 83 | Adding functions to read the json format. 84 | """ 85 | 86 | def read_plot_alignment_json(file, n): 87 | while (file): 88 | sid, mma, target_labels, source_labels = read_alignment_json(file) 89 | if mma is None: 90 | return 91 | if sid > n: 92 | return 93 | plot_head_map(mma, target_labels, source_labels) 94 | 95 | def read_alignment_json(file): 96 | data = file.readline() ##one line containing the json object. 97 | if len(data.strip()) == 0: 98 | return None, None, None, None 99 | jdata = json.loads(data) 100 | ## messy json encodings... 
TODO: make this better 101 | # (the decode/encode round-trip used under Python 2 is unnecessary in Python 3) 102 | #print jdata 103 | sid = int(jdata["id"]) 104 | mma = numpy.array(jdata["matrix"]) 105 | ##target words 106 | target_labels = jdata["target_sent"].split() 107 | target_labels.append('</s>') 108 | ##source words 109 | source_labels = jdata["source_sent"].split() 110 | source_labels.append('</s>') 111 | return sid,mma, target_labels, source_labels 112 | 113 | if __name__ == "__main__": 114 | 115 | parser = argparse.ArgumentParser() 116 | # '/Users/mnadejde/Documents/workspace/MTMA2016/models/wmt16_systems/en-de/test.alignment' 117 | parser.add_argument('--input', '-i', type=argparse.FileType('r'), 118 | default='/Users/mnadejde/Documents/workspace/MTMA2016/models/wmt16_systems/ro-en/newstest2016-roen-src.ro.alignment', metavar='PATH', 119 | help="Input file (default: standard input)") 120 | 121 | parser.add_argument('--json', '-j', required = False,action="store_true", 122 | help="If this option is used, then read alignment matrix from a Json formatted file.") 123 | args = parser.parse_args() 124 | 125 | if args.json: 126 | read_plot_alignment_json(args.input, 10) ##n is the maximum number of sentences to process. 127 | else: 128 | read_plot_alignment_matrices(args.input,10) 129 | 130 | -------------------------------------------------------------------------------- /utils/visualize_probs.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | 4 | # given a source sentence, a target sentence, and a sequence of probabilities (one per target word, plus an end-of-sentence probability), 5 | # visualize the probability of each target word via HTML output. 6 | # black fields indicate high confidence, light fields low confidence. 7 | # example input: 8 | """ 9 | Unsere digitalen Leben haben die Notwendigkeit, stark, lebenslustig und erfolgreich zu erscheinen, verdoppelt. 10 | Our digital lives have doubled the need to appear strong, lifel... ike and successful . 11 | 0.882218956947 0.989946246147 0.793388187885 0.790167689323 0.768674969673 0.941913545132 0.955783545971 0.777168631554 0.266917765141 0.909709095955 0.990240097046 0.341023534536 0.828059256077 0.854399263859 0.906807541847 0.960786998272 0.997184157372""" 12 | 13 | html_text = """ 15 | 16 | 17 | 18 | Results page 19 | 20 | 35 | 36 | \n 37 | \n 38 | 39 | 40 | {0} 41 | 
42 | 43 | 44 | 45 | """ 46 | 47 | 48 | def print_probdist(infile, outfile): 49 | 50 | entries = [] 51 | 52 | for i, line in enumerate(infile): 53 | if i % 3 == 0: 54 | #words = line.split() 55 | entry = "" 56 | #for w in words: 57 | #entry += "" + w + "\n" 58 | entry = "" + line + "\n" 59 | entries.append(entry) 60 | 61 | if i % 3 == 1: 62 | words = line.split() 63 | words.append('</s>') 64 | elif i % 3 == 2: 65 | probs = list(map(float, line.split())) 66 | entry = "" 67 | for w,p in zip(words, probs): 68 | color = '#%02x%02x%02x' % (int((1-p)*255), int((1-p)*255), int((1-p)*255)) 69 | entry += "{1}".format(color, w) 70 | entry = "" + entry + "\n" 71 | entries.append(entry) 72 | 73 | 74 | outfile.write(html_text.format('\n'.join(entries))) 75 | 76 | 77 | parser = argparse.ArgumentParser() 78 | parser.add_argument('--input', '-i', type=argparse.FileType('r'), 79 | default=sys.stdin, metavar='PATH', 80 | help="Input file (default: standard input)") 81 | parser.add_argument('--output', '-o', type=argparse.FileType('w'), 82 | default=sys.stdout, metavar='PATH', 83 | help="Output file (default: standard output)") 84 | 85 | args = parser.parse_args() 86 | 87 | print_probdist(args.input, args.output) --------------------------------------------------------------------------------