├── .gitignore ├── CHANGELOG.md ├── Dockerfile.cpu ├── Dockerfile.gpu ├── LICENSE ├── README.md ├── data ├── README.md ├── build_dictionary.py ├── length.py ├── merge.sh ├── multi-bleu-detok.perl ├── multi-bleu.perl ├── nonbreaking_prefixes │ ├── README.txt │ ├── nonbreaking_prefix.ca │ ├── nonbreaking_prefix.cs │ ├── nonbreaking_prefix.de │ ├── nonbreaking_prefix.el │ ├── nonbreaking_prefix.en │ ├── nonbreaking_prefix.es │ ├── nonbreaking_prefix.fi │ ├── nonbreaking_prefix.fr │ ├── nonbreaking_prefix.hu │ ├── nonbreaking_prefix.is │ ├── nonbreaking_prefix.it │ ├── nonbreaking_prefix.lv │ ├── nonbreaking_prefix.nl │ ├── nonbreaking_prefix.pl │ ├── nonbreaking_prefix.pt │ ├── nonbreaking_prefix.ro │ ├── nonbreaking_prefix.ru │ ├── nonbreaking_prefix.sk │ ├── nonbreaking_prefix.sl │ ├── nonbreaking_prefix.sv │ └── nonbreaking_prefix.ta ├── postprocess.sh ├── preprocess.sh ├── shuffle.py ├── strip_sgml.py └── tokenizer.perl ├── doc ├── factored_neural_machine_translation.md └── multi_gpu_training.md ├── nematus ├── .gitignore ├── __init__.py ├── beam_search_sampler.py ├── config.py ├── data_iterator.py ├── exception.py ├── exponential_smoothing.py ├── initializers.py ├── layers.py ├── learning_schedule.py ├── metrics │ ├── __init__.py │ ├── beer.py │ ├── chrf.py │ ├── meteor.py │ ├── reference.py │ ├── scorer.py │ ├── scorer_interpolator.py │ ├── scorer_provider.py │ ├── sentence_bleu.py │ ├── test_chrf.py │ ├── test_scorer_provider.py │ └── test_sentence_bleu.py ├── model_inputs.py ├── model_loader.py ├── model_updater.py ├── mrt_utils.py ├── nmt.py ├── random_sampler.py ├── rescore.py ├── rnn_inference.py ├── rnn_model.py ├── sample_client.py ├── sampler_inputs.py ├── sampling_utils.py ├── score.py ├── server.py ├── server │ ├── README.md │ ├── __init__.py │ ├── api │ │ ├── __init__.py │ │ ├── nematus_style.py │ │ └── provider.py │ ├── request.py │ └── response.py ├── server_translator.py ├── settings.py ├── shuffle.py ├── tf_utils.py ├── theano_tf_convert.py ├── train.py ├── training_progress.py ├── transformer.py ├── transformer_attention_modules.py ├── transformer_blocks.py ├── transformer_inference.py ├── transformer_layers.py ├── translate.py ├── translate_utils.py └── util.py ├── setup.py ├── test ├── README.md ├── data │ ├── corpus.de │ ├── corpus.en │ ├── vocab.de.json │ ├── vocab.en.json │ └── vocab.json ├── en-de │ ├── in │ ├── ref │ ├── ref2 │ ├── ref_score │ └── references ├── en-ro │ ├── in │ ├── ref │ ├── ref_score │ └── references ├── models │ └── .gitignore ├── test_score.py ├── test_train.sh ├── test_train_l2_loss.sh ├── test_train_mapl2_loss.sh ├── test_train_outputactivations.sh ├── test_train_reload.sh ├── test_train_summaries.sh ├── test_train_transformer.sh ├── test_translate.py ├── test_translate_sampling.py └── test_utils.py └── utils ├── attention.js ├── attention_web.php ├── copy_unknown_words.py ├── plot_heatmap.py └── visualize_probs.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | build 3 | dist 4 | nmt.egg-info 5 | .idea 6 | .DS_Store 7 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | CHANGELOG 2 | --------- 3 | 4 | development version 5 | ----------- 6 | 7 | - main new features: 8 | - factored input for Transformers 9 | - DropHead regularization 10 | 11 | v0.5 (19/5/2020) 12 | ---------- 13 | 14 | changes since 0.4: 15 | 16 | - main new features: 17 | - minimum risk 
training (MRT) 18 | - new inference code with ensemble decoding support for Transformer/RNN mix 19 | - compatibility with TF 2 20 | 21 | - other new features 22 | - lexical model for RNNs 23 | - gradient accumulation support 24 | - exponential smoothing 25 | - warmup-plateau-decay learning schedule 26 | - sampling translation strategy 27 | 28 | - fixes 29 | - fix regressions with deep RNN decoders 30 | 31 | 32 | v0.4 (17/12/2018) 33 | ---------- 34 | 35 | changes since 0.3: 36 | 37 | - main new features: 38 | - Transformer architecture 39 | - multi-GPU training 40 | - codebase moved to Python 3 41 | 42 | - other new features: 43 | - label smoothing 44 | - mixture of softmaxes 45 | 46 | - fixes: 47 | - re-enable BLEU validation (via --valid_script) 48 | - fix MAP-L2 regularization 49 | - fix server mode 50 | 51 | v0.3 (23/5/2018) 52 | ---------- 53 | - Tensorflow backend. The main model was rewritten to support Tensorflow in lieu of Theano. 54 | A few features have not been implemented in the Tensorflow model. 55 | 56 | - currently supported: 57 | - re-implementation of default Nematus model 58 | - model compatibility with Theano version and conversion via `theano_tf_convert.py` 59 | - same scripts and command line API for training, translating and (re)scoring 60 | - layer normalisation 61 | - tied embeddings 62 | - deep models 63 | - ensemble decoding 64 | - input features 65 | 66 | - not yet supported: 67 | - minimum risk training 68 | - LSTM cells 69 | - learning rate annealing 70 | 71 | - new features: 72 | - batch decoding 73 | - more efficient training with --token_batch_size 74 | 75 | v0.2 (17/12/2017) 76 | ---------- 77 | 78 | - layer normalisation (Ba et al, 2016) https://arxiv.org/abs/1607.06450 79 | - weight normalisation (Salimans and Kingma, 2016) https://arxiv.org/abs/1602.07868 80 | - deep models (Zhou et al., 2016; Wu et al., 2016; Miceli Barone et al., 2017) https://arxiv.org/abs/1606.04199 https://arxiv.org/abs/1609.08144 https://arxiv.org/abs/1707.07631 81 | - better memory efficiency 82 | - save historical gradient information for seamless resuming of interrupted training runs 83 | - server mode 84 | - sgdmomentum optimizer 85 | - learning rate annealing 86 | - LSTM cells 87 | - deep fusion (https://arxiv.org/abs/1503.03535) 88 | - various bugfixes 89 | 90 | v0.1 (2/3/2017) 91 | --------------- 92 | 93 | - arbitrary input features (factored neural machine translation) http://www.statmt.org/wmt16/pdf/W16-2209.pdf 94 | - ensemble decoding (and new translation API to support it) 95 | - dropout on all layers (Gal, 2015) http://arxiv.org/abs/1512.05287 96 | - minimum risk training (Shen et al, 2016) http://aclweb.org/anthology/P16-1159 97 | - tied embeddings (Press and Wolf, 2016) https://arxiv.org/abs/1608.05859 98 | - command line interface for training 99 | - n-best output for decoder 100 | - more output options (attention weights; word-level probabilities) and visualization scripts 101 | - performance improvements to decoder 102 | - better memory efficiency 103 | - rescoring support 104 | - execute arbitrary validation scripts (for BLEU early stopping) 105 | - vocabulary files and model parameters are stored in JSON format (backward-compatible loading) 106 | -------------------------------------------------------------------------------- /Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | MAINTAINER Tom Kocmi 3 | 4 | RUN apt-get update && apt-get install -y \ 5 | cmake \ 6 | git \ 7 | python \ 8 | 
python3 \ 9 | vim \ 10 | nano \ 11 | libopenblas-dev \ 12 | python3-dev \ 13 | python3-pip \ 14 | xml-twig-tools 15 | 16 | RUN pip3 install --upgrade pip 17 | RUN pip3 install --upgrade setuptools 18 | RUN pip3 install tensorflow==1.15 19 | 20 | RUN mkdir -p /path/to 21 | WORKDIR /path/to/ 22 | 23 | # Install mosesdecoder 24 | RUN git clone https://github.com/moses-smt/mosesdecoder 25 | 26 | # Install subwords 27 | RUN git clone https://github.com/rsennrich/subword-nmt 28 | 29 | # Install nematus 30 | COPY . /path/to/nematus 31 | WORKDIR /path/to/nematus 32 | RUN python3 setup.py install 33 | 34 | WORKDIR / 35 | 36 | # playground will contain user defined scripts, it should be run as: 37 | # docker run -v `pwd`:/playground -it nematus-docker 38 | RUN mkdir playground 39 | WORKDIR /playground 40 | 41 | -------------------------------------------------------------------------------- /Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.0-cudnn7-devel 2 | MAINTAINER Tom Kocmi 3 | 4 | # Install git, wget, python-dev, pip and other dependencies 5 | RUN apt-get update && apt-get install -y \ 6 | git \ 7 | wget \ 8 | cmake \ 9 | vim \ 10 | nano \ 11 | python3 \ 12 | libopenblas-dev \ 13 | python3-dev \ 14 | python3-pip \ 15 | python3-nose \ 16 | python3-numpy \ 17 | python3-scipy \ 18 | python3-pygraphviz \ 19 | xml-twig-tools 20 | 21 | RUN pip3 install --upgrade pip 22 | RUN pip3 install -U setuptools 23 | RUN pip3 install tensorflow==1.15 24 | 25 | # Set CUDA_ROOT 26 | ENV CUDA_ROOT /usr/local/cuda/bin 27 | 28 | 29 | RUN mkdir -p /path/to 30 | WORKDIR /path/to/ 31 | 32 | # Install mosesdecoder 33 | RUN git clone https://github.com/moses-smt/mosesdecoder 34 | 35 | # Install subwords 36 | RUN git clone https://github.com/rsennrich/subword-nmt 37 | 38 | # Install nematus 39 | COPY . /path/to/nematus 40 | WORKDIR /path/to/nematus 41 | RUN python3 setup.py install 42 | 43 | WORKDIR / 44 | 45 | # playground will contain user defined scripts, it should be run as: 46 | # nvidia-docker run -v `pwd`:/playground -it nematus-docker 47 | RUN mkdir playground 48 | WORKDIR /playground 49 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, Kyunghyun Cho 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of Nematus nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | This directory contains small scripts for data processing and evaluation. 2 | Other useful scripts and sample data is provided at https://github.com/rsennrich/wmt16-scripts 3 | 4 | 5 | Evaluation 6 | ---------- 7 | 8 | This directory contains two evaluation scripts: 9 | 10 | - multi-bleu.perl (from Moses decoder) computes tokenized, case-sensitive BLEU 11 | scores. This script is widely used in NMT research, but we discourage its use 12 | for publication because different groups use different tokenization, which 13 | biases comparisons to previous work. 14 | 15 | usage: 16 | ./multi-bleu.perl ref_file < test_file 17 | 18 | - multi-bleu-detok.perl expects that the reference file and output file are not 19 | tokenized (untokenized reference; detokenized output). It performs tokenization 20 | internally, using the tokenization routine from the NIST BLEU scorer 21 | (mteval-v13a.pl). This script can be used as a plaintext alternative of 22 | mteval-v13a.pl, giving the same results. 23 | 24 | usage: 25 | ./multi-bleu-detok.perl ref_file < test_file -------------------------------------------------------------------------------- /data/build_dictionary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from collections import OrderedDict 4 | import fileinput 5 | import sys 6 | 7 | import numpy 8 | import json 9 | 10 | 11 | def main(): 12 | for filename in sys.argv[1:]: 13 | print('Processing', filename) 14 | word_freqs = OrderedDict() 15 | with open(filename, 'r', encoding='utf-8') as f: 16 | for line in f: 17 | words_in = line.strip().split(' ') 18 | for w in words_in: 19 | if w not in word_freqs: 20 | word_freqs[w] = 0 21 | word_freqs[w] += 1 22 | words = list(word_freqs.keys()) 23 | freqs = list(word_freqs.values()) 24 | 25 | sorted_idx = numpy.argsort(freqs) 26 | sorted_words = [words[ii] for ii in sorted_idx[::-1]] 27 | 28 | worddict = OrderedDict() 29 | worddict[''] = 0 30 | worddict[''] = 1 31 | worddict[''] = 2 32 | # FIXME We shouldn't assume , , and aren't BPE subwords. 33 | for ii, ww in enumerate(sorted_words): 34 | worddict[ww] = ii+3 35 | 36 | # The JSON RFC requires that JSON text be represented using either 37 | # UTF-8, UTF-16, or UTF-32, with UTF-8 being recommended. 38 | # We use UTF-8 regardless of the user's locale settings. 
39 | with open('%s.json'%filename, 'w', encoding='utf-8') as f: 40 | json.dump(worddict, f, indent=2, ensure_ascii=False) 41 | 42 | print('Done') 43 | 44 | if __name__ == '__main__': 45 | main() 46 | -------------------------------------------------------------------------------- /data/length.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy 4 | import sys 5 | 6 | for name in sys.argv[1:]: 7 | lens = [] 8 | with open(name, 'r') as f: 9 | for ll in f: 10 | lens.append(len(ll.strip().split(' '))) 11 | print(name, ' max ', numpy.max(lens), ' min ', numpy.min(lens), ' average ', numpy.mean(lens)) 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /data/merge.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | SRC=$1 5 | TRG=$2 6 | 7 | FSRC=all_${1}-${2}.${1} 8 | FTRG=all_${1}-${2}.${2} 9 | 10 | echo "" > $FSRC 11 | for F in *${1}-${2}.${1} 12 | do 13 | if [ "$F" = "$FSRC" ]; then 14 | echo "pass" 15 | else 16 | cat $F >> $FSRC 17 | fi 18 | done 19 | 20 | 21 | echo "" > $FTRG 22 | for F in *${1}-${2}.${2} 23 | do 24 | if [ "$F" = "$FTRG" ]; then 25 | echo "pass" 26 | else 27 | cat $F >> $FTRG 28 | fi 29 | done 30 | -------------------------------------------------------------------------------- /data/multi-bleu-detok.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 5 | 6 | # This file uses the internal tokenization of mteval-v13a.pl, 7 | # giving the exact same (case-sensitive) results on untokenized text. 8 | # 9 | # like multi-bleu.perl , it supports plain text input and multiple references. 
10 | 11 | # $Id$ 12 | use warnings; 13 | use strict; 14 | 15 | my $lowercase = 0; 16 | if ($ARGV[0] eq "-lc") { 17 | $lowercase = 1; 18 | shift; 19 | } 20 | 21 | my $stem = $ARGV[0]; 22 | if (!defined $stem) { 23 | print STDERR "usage: multi-bleu-detok.pl [-lc] reference < hypothesis\n"; 24 | print STDERR "Reads the references from reference or reference0, reference1, ...\n"; 25 | exit(1); 26 | } 27 | 28 | $stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0"; 29 | 30 | my @REF; 31 | my $ref=0; 32 | while(-e "$stem$ref") { 33 | &add_to_ref("$stem$ref",\@REF); 34 | $ref++; 35 | } 36 | &add_to_ref($stem,\@REF) if -e $stem; 37 | die("ERROR: could not find reference file $stem") unless scalar @REF; 38 | 39 | # add additional references explicitly specified on the command line 40 | shift; 41 | foreach my $stem (@ARGV) { 42 | &add_to_ref($stem,\@REF) if -e $stem; 43 | } 44 | 45 | 46 | 47 | sub add_to_ref { 48 | my ($file,$REF) = @_; 49 | my $s=0; 50 | if ($file =~ /.gz$/) { 51 | open(REF,"gzip -dc $file|") or die "Can't read $file"; 52 | } else { 53 | open(REF,$file) or die "Can't read $file"; 54 | } 55 | while() { 56 | chomp; 57 | $_ = tokenization($_); 58 | push @{$$REF[$s++]}, $_; 59 | } 60 | close(REF); 61 | } 62 | 63 | my(@CORRECT,@TOTAL,$length_translation,$length_reference); 64 | my $s=0; 65 | while() { 66 | chomp; 67 | $_ = lc if $lowercase; 68 | $_ = tokenization($_); 69 | my @WORD = split; 70 | my %REF_NGRAM = (); 71 | my $length_translation_this_sentence = scalar(@WORD); 72 | my ($closest_diff,$closest_length) = (9999,9999); 73 | foreach my $reference (@{$REF[$s]}) { 74 | # print "$s $_ <=> $reference\n"; 75 | $reference = lc($reference) if $lowercase; 76 | my @WORD = split(' ',$reference); 77 | my $length = scalar(@WORD); 78 | my $diff = abs($length_translation_this_sentence-$length); 79 | if ($diff < $closest_diff) { 80 | $closest_diff = $diff; 81 | $closest_length = $length; 82 | # print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n"; 83 | } elsif ($diff == $closest_diff) { 84 | $closest_length = $length if $length < $closest_length; 85 | # from two references with the same closeness to me 86 | # take the *shorter* into account, not the "first" one. 87 | } 88 | for(my $n=1;$n<=4;$n++) { 89 | my %REF_NGRAM_N = (); 90 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 91 | my $ngram = "$n"; 92 | for(my $w=0;$w<$n;$w++) { 93 | $ngram .= " ".$WORD[$start+$w]; 94 | } 95 | $REF_NGRAM_N{$ngram}++; 96 | } 97 | foreach my $ngram (keys %REF_NGRAM_N) { 98 | if (!defined($REF_NGRAM{$ngram}) || 99 | $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) { 100 | $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram}; 101 | # print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}
\n"; 102 | } 103 | } 104 | } 105 | } 106 | $length_translation += $length_translation_this_sentence; 107 | $length_reference += $closest_length; 108 | for(my $n=1;$n<=4;$n++) { 109 | my %T_NGRAM = (); 110 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 111 | my $ngram = "$n"; 112 | for(my $w=0;$w<$n;$w++) { 113 | $ngram .= " ".$WORD[$start+$w]; 114 | } 115 | $T_NGRAM{$ngram}++; 116 | } 117 | foreach my $ngram (keys %T_NGRAM) { 118 | $ngram =~ /^(\d+) /; 119 | my $n = $1; 120 | # my $corr = 0; 121 | # print "$i e $ngram $T_NGRAM{$ngram}
\n"; 122 | $TOTAL[$n] += $T_NGRAM{$ngram}; 123 | if (defined($REF_NGRAM{$ngram})) { 124 | if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) { 125 | $CORRECT[$n] += $T_NGRAM{$ngram}; 126 | # $corr = $T_NGRAM{$ngram}; 127 | # print "$i e correct1 $T_NGRAM{$ngram}
\n"; 128 | } 129 | else { 130 | $CORRECT[$n] += $REF_NGRAM{$ngram}; 131 | # $corr = $REF_NGRAM{$ngram}; 132 | # print "$i e correct2 $REF_NGRAM{$ngram}
\n"; 133 | } 134 | } 135 | # $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram}; 136 | # print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n" 137 | } 138 | } 139 | $s++; 140 | } 141 | my $brevity_penalty = 1; 142 | my $bleu = 0; 143 | 144 | my @bleu=(); 145 | 146 | for(my $n=1;$n<=4;$n++) { 147 | if (defined ($TOTAL[$n])){ 148 | $bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0; 149 | # print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n"; 150 | }else{ 151 | $bleu[$n]=0; 152 | } 153 | } 154 | 155 | if ($length_reference==0){ 156 | printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n"; 157 | exit(1); 158 | } 159 | 160 | if ($length_translation<$length_reference) { 161 | $brevity_penalty = exp(1-$length_reference/$length_translation); 162 | } 163 | $bleu = $brevity_penalty * exp((my_log( $bleu[1] ) + 164 | my_log( $bleu[2] ) + 165 | my_log( $bleu[3] ) + 166 | my_log( $bleu[4] ) ) / 4) ; 167 | printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n", 168 | 100*$bleu, 169 | 100*$bleu[1], 170 | 100*$bleu[2], 171 | 100*$bleu[3], 172 | 100*$bleu[4], 173 | $brevity_penalty, 174 | $length_translation / $length_reference, 175 | $length_translation, 176 | $length_reference; 177 | 178 | sub my_log { 179 | return -9999999999 unless $_[0]; 180 | return log($_[0]); 181 | } 182 | 183 | 184 | 185 | sub tokenization 186 | { 187 | my ($norm_text) = @_; 188 | 189 | # language-independent part: 190 | $norm_text =~ s///g; # strip "skipped" tags 191 | $norm_text =~ s/-\n//g; # strip end-of-line hyphenation and join lines 192 | $norm_text =~ s/\n/ /g; # join lines 193 | $norm_text =~ s/"/"/g; # convert SGML tag for quote to " 194 | $norm_text =~ s/&/&/g; # convert SGML tag for ampersand to & 195 | $norm_text =~ s/</ 196 | $norm_text =~ s/>/>/g; # convert SGML tag for greater-than to < 197 | 198 | # language-dependent part (assuming Western languages): 199 | $norm_text = " $norm_text "; 200 | $norm_text =~ s/([\{-\~\[-\` -\&\(-\+\:-\@\/])/ $1 /g; # tokenize punctuation 201 | $norm_text =~ s/([^0-9])([\.,])/$1 $2 /g; # tokenize period and comma unless preceded by a digit 202 | $norm_text =~ s/([\.,])([^0-9])/ $1 $2/g; # tokenize period and comma unless followed by a digit 203 | $norm_text =~ s/([0-9])(-)/$1 $2 /g; # tokenize dash when preceded by a digit 204 | $norm_text =~ s/\s+/ /g; # one space only between words 205 | $norm_text =~ s/^\s+//; # no leading space 206 | $norm_text =~ s/\s+$//; # no trailing space 207 | 208 | return $norm_text; 209 | } 210 | -------------------------------------------------------------------------------- /data/multi-bleu.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 
5 | 6 | # $Id$ 7 | use warnings; 8 | use strict; 9 | 10 | my $lowercase = 0; 11 | if ($ARGV[0] eq "-lc") { 12 | $lowercase = 1; 13 | shift; 14 | } 15 | 16 | my $stem = $ARGV[0]; 17 | if (!defined $stem) { 18 | print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n"; 19 | print STDERR "Reads the references from reference or reference0, reference1, ...\n"; 20 | exit(1); 21 | } 22 | 23 | $stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0"; 24 | 25 | my @REF; 26 | my $ref=0; 27 | while(-e "$stem$ref") { 28 | &add_to_ref("$stem$ref",\@REF); 29 | $ref++; 30 | } 31 | &add_to_ref($stem,\@REF) if -e $stem; 32 | die("ERROR: could not find reference file $stem") unless scalar @REF; 33 | 34 | # add additional references explicitly specified on the command line 35 | shift; 36 | foreach my $stem (@ARGV) { 37 | &add_to_ref($stem,\@REF) if -e $stem; 38 | } 39 | 40 | 41 | 42 | sub add_to_ref { 43 | my ($file,$REF) = @_; 44 | my $s=0; 45 | if ($file =~ /.gz$/) { 46 | open(REF,"gzip -dc $file|") or die "Can't read $file"; 47 | } else { 48 | open(REF,$file) or die "Can't read $file"; 49 | } 50 | while() { 51 | chomp; 52 | push @{$$REF[$s++]}, $_; 53 | } 54 | close(REF); 55 | } 56 | 57 | my(@CORRECT,@TOTAL,$length_translation,$length_reference); 58 | my $s=0; 59 | while() { 60 | chomp; 61 | $_ = lc if $lowercase; 62 | my @WORD = split; 63 | my %REF_NGRAM = (); 64 | my $length_translation_this_sentence = scalar(@WORD); 65 | my ($closest_diff,$closest_length) = (9999,9999); 66 | foreach my $reference (@{$REF[$s]}) { 67 | # print "$s $_ <=> $reference\n"; 68 | $reference = lc($reference) if $lowercase; 69 | my @WORD = split(' ',$reference); 70 | my $length = scalar(@WORD); 71 | my $diff = abs($length_translation_this_sentence-$length); 72 | if ($diff < $closest_diff) { 73 | $closest_diff = $diff; 74 | $closest_length = $length; 75 | # print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n"; 76 | } elsif ($diff == $closest_diff) { 77 | $closest_length = $length if $length < $closest_length; 78 | # from two references with the same closeness to me 79 | # take the *shorter* into account, not the "first" one. 80 | } 81 | for(my $n=1;$n<=4;$n++) { 82 | my %REF_NGRAM_N = (); 83 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 84 | my $ngram = "$n"; 85 | for(my $w=0;$w<$n;$w++) { 86 | $ngram .= " ".$WORD[$start+$w]; 87 | } 88 | $REF_NGRAM_N{$ngram}++; 89 | } 90 | foreach my $ngram (keys %REF_NGRAM_N) { 91 | if (!defined($REF_NGRAM{$ngram}) || 92 | $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) { 93 | $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram}; 94 | # print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}
\n"; 95 | } 96 | } 97 | } 98 | } 99 | $length_translation += $length_translation_this_sentence; 100 | $length_reference += $closest_length; 101 | for(my $n=1;$n<=4;$n++) { 102 | my %T_NGRAM = (); 103 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 104 | my $ngram = "$n"; 105 | for(my $w=0;$w<$n;$w++) { 106 | $ngram .= " ".$WORD[$start+$w]; 107 | } 108 | $T_NGRAM{$ngram}++; 109 | } 110 | foreach my $ngram (keys %T_NGRAM) { 111 | $ngram =~ /^(\d+) /; 112 | my $n = $1; 113 | # my $corr = 0; 114 | # print "$i e $ngram $T_NGRAM{$ngram}
\n"; 115 | $TOTAL[$n] += $T_NGRAM{$ngram}; 116 | if (defined($REF_NGRAM{$ngram})) { 117 | if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) { 118 | $CORRECT[$n] += $T_NGRAM{$ngram}; 119 | # $corr = $T_NGRAM{$ngram}; 120 | # print "$i e correct1 $T_NGRAM{$ngram}
\n"; 121 | } 122 | else { 123 | $CORRECT[$n] += $REF_NGRAM{$ngram}; 124 | # $corr = $REF_NGRAM{$ngram}; 125 | # print "$i e correct2 $REF_NGRAM{$ngram}
\n"; 126 | } 127 | } 128 | # $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram}; 129 | # print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n" 130 | } 131 | } 132 | $s++; 133 | } 134 | my $brevity_penalty = 1; 135 | my $bleu = 0; 136 | 137 | my @bleu=(); 138 | 139 | for(my $n=1;$n<=4;$n++) { 140 | if (defined ($TOTAL[$n])){ 141 | $bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0; 142 | # print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n"; 143 | }else{ 144 | $bleu[$n]=0; 145 | } 146 | } 147 | 148 | if ($length_reference==0){ 149 | printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n"; 150 | exit(1); 151 | } 152 | 153 | if ($length_translation<$length_reference) { 154 | $brevity_penalty = exp(1-$length_reference/$length_translation); 155 | } 156 | $bleu = $brevity_penalty * exp((my_log( $bleu[1] ) + 157 | my_log( $bleu[2] ) + 158 | my_log( $bleu[3] ) + 159 | my_log( $bleu[4] ) ) / 4) ; 160 | printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n", 161 | 100*$bleu, 162 | 100*$bleu[1], 163 | 100*$bleu[2], 164 | 100*$bleu[3], 165 | 100*$bleu[4], 166 | $brevity_penalty, 167 | $length_translation / $length_reference, 168 | $length_translation, 169 | $length_reference; 170 | 171 | 172 | print STDERR "It is not advisable to publish scores from multi-bleu.perl. The scores depend on your tokenizer, which is unlikely to be reproducible from your paper or consistent across research groups. Instead you should detokenize then use mteval-v14.pl, which has a standard tokenization. Scores from multi-bleu.perl can still be used for internal purposes when you have a consistent tokenizer.\n"; 173 | 174 | sub my_log { 175 | return -9999999999 unless $_[0]; 176 | return log($_[0]); 177 | } 178 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/README.txt: -------------------------------------------------------------------------------- 1 | The language suffix can be found here: 2 | 3 | http://www.loc.gov/standards/iso639-2/php/code_list.php 4 | 5 | This code includes data from Daniel Naber's Language Tools (czech abbreviations). 6 | This code includes data from czech wiktionary (also czech abbreviations). 7 | 8 | 9 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.ca: -------------------------------------------------------------------------------- 1 | Dr 2 | Dra 3 | pàg 4 | p 5 | c 6 | av 7 | Sr 8 | Sra 9 | adm 10 | esq 11 | Prof 12 | S.A 13 | S.L 14 | p.e 15 | ptes 16 | Sta 17 | St 18 | pl 19 | màx 20 | cast 21 | dir 22 | nre 23 | fra 24 | admdora 25 | Emm 26 | Excma 27 | espf 28 | dc 29 | admdor 30 | tel 31 | angl 32 | aprox 33 | ca 34 | dept 35 | dj 36 | dl 37 | dt 38 | ds 39 | dg 40 | dv 41 | ed 42 | entl 43 | al 44 | i.e 45 | maj 46 | smin 47 | n 48 | núm 49 | pta 50 | A 51 | B 52 | C 53 | D 54 | E 55 | F 56 | G 57 | H 58 | I 59 | J 60 | K 61 | L 62 | M 63 | N 64 | O 65 | P 66 | Q 67 | R 68 | S 69 | T 70 | U 71 | V 72 | W 73 | X 74 | Y 75 | Z 76 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.cs: -------------------------------------------------------------------------------- 1 | Bc 2 | BcA 3 | Ing 4 | Ing.arch 5 | MUDr 6 | MVDr 7 | MgA 8 | Mgr 9 | JUDr 10 | PhDr 11 | RNDr 12 | PharmDr 13 | ThLic 14 | ThDr 15 | Ph.D 16 | Th.D 17 | prof 18 | doc 19 | CSc 20 | DrSc 21 | dr. h. 
c 22 | PaedDr 23 | Dr 24 | PhMr 25 | DiS 26 | abt 27 | ad 28 | a.i 29 | aj 30 | angl 31 | anon 32 | apod 33 | atd 34 | atp 35 | aut 36 | bd 37 | biogr 38 | b.m 39 | b.p 40 | b.r 41 | cca 42 | cit 43 | cizojaz 44 | c.k 45 | col 46 | čes 47 | čín 48 | čj 49 | ed 50 | facs 51 | fasc 52 | fol 53 | fot 54 | franc 55 | h.c 56 | hist 57 | hl 58 | hrsg 59 | ibid 60 | il 61 | ind 62 | inv.č 63 | jap 64 | jhdt 65 | jv 66 | koed 67 | kol 68 | korej 69 | kl 70 | krit 71 | lat 72 | lit 73 | m.a 74 | maď 75 | mj 76 | mp 77 | násl 78 | např 79 | nepubl 80 | něm 81 | no 82 | nr 83 | n.s 84 | okr 85 | odd 86 | odp 87 | obr 88 | opr 89 | orig 90 | phil 91 | pl 92 | pokrač 93 | pol 94 | port 95 | pozn 96 | př.kr 97 | př.n.l 98 | přel 99 | přeprac 100 | příl 101 | pseud 102 | pt 103 | red 104 | repr 105 | resp 106 | revid 107 | rkp 108 | roč 109 | roz 110 | rozš 111 | samost 112 | sect 113 | sest 114 | seš 115 | sign 116 | sl 117 | srv 118 | stol 119 | sv 120 | šk 121 | šk.ro 122 | špan 123 | tab 124 | t.č 125 | tis 126 | tj 127 | tř 128 | tzv 129 | univ 130 | uspoř 131 | vol 132 | vl.jm 133 | vs 134 | vyd 135 | vyobr 136 | zal 137 | zejm 138 | zkr 139 | zprac 140 | zvl 141 | n.p 142 | např 143 | než 144 | MUDr 145 | abl 146 | absol 147 | adj 148 | adv 149 | ak 150 | ak. sl 151 | akt 152 | alch 153 | amer 154 | anat 155 | angl 156 | anglosas 157 | arab 158 | arch 159 | archit 160 | arg 161 | astr 162 | astrol 163 | att 164 | bás 165 | belg 166 | bibl 167 | biol 168 | boh 169 | bot 170 | bulh 171 | círk 172 | csl 173 | č 174 | čas 175 | čes 176 | dat 177 | děj 178 | dep 179 | dět 180 | dial 181 | dór 182 | dopr 183 | dosl 184 | ekon 185 | epic 186 | etnonym 187 | eufem 188 | f 189 | fam 190 | fem 191 | fil 192 | film 193 | form 194 | fot 195 | fr 196 | fut 197 | fyz 198 | gen 199 | geogr 200 | geol 201 | geom 202 | germ 203 | gram 204 | hebr 205 | herald 206 | hist 207 | hl 208 | hovor 209 | hud 210 | hut 211 | chcsl 212 | chem 213 | ie 214 | imp 215 | impf 216 | ind 217 | indoevr 218 | inf 219 | instr 220 | interj 221 | ión 222 | iron 223 | it 224 | kanad 225 | katalán 226 | klas 227 | kniž 228 | komp 229 | konj 230 | 231 | konkr 232 | kř 233 | kuch 234 | lat 235 | lék 236 | les 237 | lid 238 | lit 239 | liturg 240 | lok 241 | log 242 | m 243 | mat 244 | meteor 245 | metr 246 | mod 247 | ms 248 | mysl 249 | n 250 | náb 251 | námoř 252 | neklas 253 | něm 254 | nesklon 255 | nom 256 | ob 257 | obch 258 | obyč 259 | ojed 260 | opt 261 | part 262 | pas 263 | pejor 264 | pers 265 | pf 266 | pl 267 | plpf 268 | 269 | práv 270 | prep 271 | předl 272 | přivl 273 | r 274 | rcsl 275 | refl 276 | reg 277 | rkp 278 | ř 279 | řec 280 | s 281 | samohl 282 | sg 283 | sl 284 | souhl 285 | spec 286 | srov 287 | stfr 288 | střv 289 | stsl 290 | subj 291 | subst 292 | superl 293 | sv 294 | sz 295 | táz 296 | tech 297 | telev 298 | teol 299 | trans 300 | typogr 301 | var 302 | vedl 303 | verb 304 | vl. jm 305 | voj 306 | vok 307 | vůb 308 | vulg 309 | výtv 310 | vztaž 311 | zahr 312 | zájm 313 | zast 314 | zejm 315 | 316 | zeměd 317 | zkr 318 | zř 319 | mj 320 | dl 321 | atp 322 | sport 323 | Mgr 324 | horn 325 | MVDr 326 | JUDr 327 | RSDr 328 | Bc 329 | PhDr 330 | ThDr 331 | Ing 332 | aj 333 | apod 334 | PharmDr 335 | pomn 336 | ev 337 | slang 338 | nprap 339 | odp 340 | dop 341 | pol 342 | st 343 | stol 344 | p. n. l 345 | před n. l 346 | n. l 347 | př. Kr 348 | po Kr 349 | př. n. l 350 | odd 351 | RNDr 352 | tzv 353 | atd 354 | tzn 355 | resp 356 | tj 357 | p 358 | br 359 | č. j 360 | čj 361 | č. p 362 | čp 363 | a. 
s 364 | s. r. o 365 | spol. s r. o 366 | p. o 367 | s. p 368 | v. o. s 369 | k. s 370 | o. p. s 371 | o. s 372 | v. r 373 | v z 374 | ml 375 | vč 376 | kr 377 | mld 378 | hod 379 | popř 380 | ap 381 | event 382 | rus 383 | slov 384 | rum 385 | švýc 386 | P. T 387 | zvl 388 | hor 389 | dol 390 | S.O.S -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.de: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | #no german words end in single lower-case letters, so we throw those in too. 7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | 61 | #Roman Numerals. A dot after one of these is not a sentence break in German. 62 | I 63 | II 64 | III 65 | IV 66 | V 67 | VI 68 | VII 69 | VIII 70 | IX 71 | X 72 | XI 73 | XII 74 | XIII 75 | XIV 76 | XV 77 | XVI 78 | XVII 79 | XVIII 80 | XIX 81 | XX 82 | i 83 | ii 84 | iii 85 | iv 86 | v 87 | vi 88 | vii 89 | viii 90 | ix 91 | x 92 | xi 93 | xii 94 | xiii 95 | xiv 96 | xv 97 | xvi 98 | xvii 99 | xviii 100 | xix 101 | xx 102 | 103 | #Titles and Honorifics 104 | Adj 105 | Adm 106 | Adv 107 | Asst 108 | Bart 109 | Bldg 110 | Brig 111 | Bros 112 | Capt 113 | Cmdr 114 | Col 115 | Comdr 116 | Con 117 | Corp 118 | Cpl 119 | DR 120 | Dr 121 | Ens 122 | Gen 123 | Gov 124 | Hon 125 | Hosp 126 | Insp 127 | Lt 128 | MM 129 | MR 130 | MRS 131 | MS 132 | Maj 133 | Messrs 134 | Mlle 135 | Mme 136 | Mr 137 | Mrs 138 | Ms 139 | Msgr 140 | Op 141 | Ord 142 | Pfc 143 | Ph 144 | Prof 145 | Pvt 146 | Rep 147 | Reps 148 | Res 149 | Rev 150 | Rt 151 | Sen 152 | Sens 153 | Sfc 154 | Sgt 155 | Sr 156 | St 157 | Supt 158 | Surg 159 | 160 | #Misc symbols 161 | Mio 162 | Mrd 163 | bzw 164 | v 165 | vs 166 | usw 167 | d.h 168 | z.B 169 | u.a 170 | etc 171 | Mrd 172 | MwSt 173 | ggf 174 | d.J 175 | D.h 176 | m.E 177 | vgl 178 | I.F 179 | z.T 180 | sogen 181 | ff 182 | u.E 183 | g.U 184 | g.g.A 185 | c.-à-d 186 | Buchst 187 | u.s.w 188 | sog 189 | u.ä 190 | Std 191 | evtl 192 | Zt 193 | Chr 194 | u.U 195 | o.ä 196 | Ltd 197 | b.A 198 | z.Zt 199 | spp 200 | sen 201 | SA 202 | k.o 203 | jun 204 | i.H.v 205 | dgl 206 | dergl 207 | Co 208 | zzt 209 | usf 210 | s.p.a 211 | Dkr 212 | Corp 213 | bzgl 214 | BSE 215 | 216 | #Number indicators 217 | # add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it 218 | No 219 | Nos 220 | Art 221 | Nr 222 | pp 223 | ca 224 | Ca 225 | 226 | #Ordinals are done with . in German - "1." 
= "1st" in English 227 | 1 228 | 2 229 | 3 230 | 4 231 | 5 232 | 6 233 | 7 234 | 8 235 | 9 236 | 10 237 | 11 238 | 12 239 | 13 240 | 14 241 | 15 242 | 16 243 | 17 244 | 18 245 | 19 246 | 20 247 | 21 248 | 22 249 | 23 250 | 24 251 | 25 252 | 26 253 | 27 254 | 28 255 | 29 256 | 30 257 | 31 258 | 32 259 | 33 260 | 34 261 | 35 262 | 36 263 | 37 264 | 38 265 | 39 266 | 40 267 | 41 268 | 42 269 | 43 270 | 44 271 | 45 272 | 46 273 | 47 274 | 48 275 | 49 276 | 50 277 | 51 278 | 52 279 | 53 280 | 54 281 | 55 282 | 56 283 | 57 284 | 58 285 | 59 286 | 60 287 | 61 288 | 62 289 | 63 290 | 64 291 | 65 292 | 66 293 | 67 294 | 68 295 | 69 296 | 70 297 | 71 298 | 72 299 | 73 300 | 74 301 | 75 302 | 76 303 | 77 304 | 78 305 | 79 306 | 80 307 | 81 308 | 82 309 | 83 310 | 84 311 | 85 312 | 86 313 | 87 314 | 88 315 | 89 316 | 90 317 | 91 318 | 92 319 | 93 320 | 94 321 | 95 322 | 96 323 | 97 324 | 98 325 | 99 326 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.en: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 34 | Adj 35 | Adm 36 | Adv 37 | Asst 38 | Bart 39 | Bldg 40 | Brig 41 | Bros 42 | Capt 43 | Cmdr 44 | Col 45 | Comdr 46 | Con 47 | Corp 48 | Cpl 49 | DR 50 | Dr 51 | Drs 52 | Ens 53 | Gen 54 | Gov 55 | Hon 56 | Hr 57 | Hosp 58 | Insp 59 | Lt 60 | MM 61 | MR 62 | MRS 63 | MS 64 | Maj 65 | Messrs 66 | Mlle 67 | Mme 68 | Mr 69 | Mrs 70 | Ms 71 | Msgr 72 | Op 73 | Ord 74 | Pfc 75 | Ph 76 | Prof 77 | Pvt 78 | Rep 79 | Reps 80 | Res 81 | Rev 82 | Rt 83 | Sen 84 | Sens 85 | Sfc 86 | Sgt 87 | Sr 88 | St 89 | Supt 90 | Surg 91 | 92 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 93 | v 94 | vs 95 | i.e 96 | rev 97 | e.g 98 | 99 | #Numbers only. These should only induce breaks when followed by a numeric sequence 100 | # add NUMERIC_ONLY after the word for this function 101 | #This case is mostly for the english "No." which can either be a sentence of its own, or 102 | #if followed by a number, a non-breaking prefix 103 | No #NUMERIC_ONLY# 104 | Nos 105 | Art #NUMERIC_ONLY# 106 | Nr 107 | pp #NUMERIC_ONLY# 108 | 109 | #month abbreviations 110 | Jan 111 | Feb 112 | Mar 113 | Apr 114 | #May is a full word 115 | Jun 116 | Jul 117 | Aug 118 | Sep 119 | Oct 120 | Nov 121 | Dec 122 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.es: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 
3 | 4 | #any single upper case letter followed by a period is not a sentence ender 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | # Period-final abbreviation list from http://www.ctspanish.com/words/abbreviations.htm 34 | 35 | A.C 36 | Apdo 37 | Av 38 | Bco 39 | CC.AA 40 | Da 41 | Dep 42 | Dn 43 | Dr 44 | Dra 45 | EE.UU 46 | Excmo 47 | FF.CC 48 | Fil 49 | Gral 50 | J.C 51 | Let 52 | Lic 53 | N.B 54 | P.D 55 | P.V.P 56 | Prof 57 | Pts 58 | Rte 59 | S.A 60 | S.A.R 61 | S.E 62 | S.L 63 | S.R.C 64 | Sr 65 | Sra 66 | Srta 67 | Sta 68 | Sto 69 | T.V.E 70 | Tel 71 | Ud 72 | Uds 73 | V.B 74 | V.E 75 | Vd 76 | Vds 77 | a/c 78 | adj 79 | admón 80 | afmo 81 | apdo 82 | av 83 | c 84 | c.f 85 | c.g 86 | cap 87 | cm 88 | cta 89 | dcha 90 | doc 91 | ej 92 | entlo 93 | esq 94 | etc 95 | f.c 96 | gr 97 | grs 98 | izq 99 | kg 100 | km 101 | mg 102 | mm 103 | núm 104 | núm 105 | p 106 | p.a 107 | p.ej 108 | ptas 109 | pág 110 | págs 111 | pág 112 | págs 113 | q.e.g.e 114 | q.e.s.m 115 | s 116 | s.s.s 117 | vid 118 | vol 119 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.fi: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT 2 | #indicate an end-of-sentence marker. Special cases are included for prefixes 3 | #that ONLY appear before 0-9 numbers. 4 | 5 | #This list is compiled from omorfi database 6 | #by Tommi A Pirinen. 7 | 8 | 9 | #any single upper case letter followed by a period is not a sentence ender 10 | A 11 | B 12 | C 13 | D 14 | E 15 | F 16 | G 17 | H 18 | I 19 | J 20 | K 21 | L 22 | M 23 | N 24 | O 25 | P 26 | Q 27 | R 28 | S 29 | T 30 | U 31 | V 32 | W 33 | X 34 | Y 35 | Z 36 | Å 37 | Ä 38 | Ö 39 | 40 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 41 | alik 42 | alil 43 | amir 44 | apul 45 | apul.prof 46 | arkkit 47 | ass 48 | assist 49 | dipl 50 | dipl.arkkit 51 | dipl.ekon 52 | dipl.ins 53 | dipl.kielenk 54 | dipl.kirjeenv 55 | dipl.kosm 56 | dipl.urk 57 | dos 58 | erikoiseläinl 59 | erikoishammasl 60 | erikoisl 61 | erikoist 62 | ev.luutn 63 | evp 64 | fil 65 | ft 66 | hallinton 67 | hallintot 68 | hammaslääket 69 | jatk 70 | jääk 71 | kansaned 72 | kapt 73 | kapt.luutn 74 | kenr 75 | kenr.luutn 76 | kenr.maj 77 | kers 78 | kirjeenv 79 | kom 80 | kom.kapt 81 | komm 82 | konst 83 | korpr 84 | luutn 85 | maist 86 | maj 87 | Mr 88 | Mrs 89 | Ms 90 | M.Sc 91 | neuv 92 | nimim 93 | Ph.D 94 | prof 95 | puh.joht 96 | pääll 97 | res 98 | san 99 | siht 100 | suom 101 | sähköp 102 | säv 103 | toht 104 | toim 105 | toim.apul 106 | toim.joht 107 | toim.siht 108 | tuom 109 | ups 110 | vänr 111 | vääp 112 | ye.ups 113 | ylik 114 | ylil 115 | ylim 116 | ylimatr 117 | yliop 118 | yliopp 119 | ylip 120 | yliv 121 | 122 | #misc - odd period-ending items that NEVER indicate breaks (p.m. 
does NOT fall 123 | #into this category - it sometimes ends a sentence) 124 | e.g 125 | ent 126 | esim 127 | huom 128 | i.e 129 | ilm 130 | l 131 | mm 132 | myöh 133 | nk 134 | nyk 135 | par 136 | po 137 | t 138 | v 139 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.fr: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | # 4 | #any single upper case letter followed by a period is not a sentence ender 5 | #usually upper case letters are initials in a name 6 | #no French words end in single lower-case letters, so we throw those in too? 7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | # Period-final abbreviation list for French 61 | A.C.N 62 | A.M 63 | art 64 | ann 65 | apr 66 | av 67 | auj 68 | lib 69 | B.P 70 | boul 71 | ca 72 | c.-à-d 73 | cf 74 | ch.-l 75 | chap 76 | contr 77 | C.P.I 78 | C.Q.F.D 79 | C.N 80 | C.N.S 81 | C.S 82 | dir 83 | éd 84 | e.g 85 | env 86 | al 87 | etc 88 | E.V 89 | ex 90 | fasc 91 | fém 92 | fig 93 | fr 94 | hab 95 | ibid 96 | id 97 | i.e 98 | inf 99 | LL.AA 100 | LL.AA.II 101 | LL.AA.RR 102 | LL.AA.SS 103 | L.D 104 | LL.EE 105 | LL.MM 106 | LL.MM.II.RR 107 | loc.cit 108 | masc 109 | MM 110 | ms 111 | N.B 112 | N.D.A 113 | N.D.L.R 114 | N.D.T 115 | n/réf 116 | NN.SS 117 | N.S 118 | N.D 119 | N.P.A.I 120 | p.c.c 121 | pl 122 | pp 123 | p.ex 124 | p.j 125 | P.S 126 | R.A.S 127 | R.-V 128 | R.P 129 | R.I.P 130 | SS 131 | S.S 132 | S.A 133 | S.A.I 134 | S.A.R 135 | S.A.S 136 | S.E 137 | sec 138 | sect 139 | sing 140 | S.M 141 | S.M.I.R 142 | sq 143 | sqq 144 | suiv 145 | sup 146 | suppl 147 | tél 148 | T.S.V.P 149 | vb 150 | vol 151 | vs 152 | X.O 153 | Z.I 154 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.hu: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | Á 33 | É 34 | Í 35 | Ó 36 | Ö 37 | Ő 38 | Ú 39 | Ü 40 | Ű 41 | 42 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 43 | Dr 44 | dr 45 | kb 46 | Kb 47 | vö 48 | Vö 49 | pl 50 | Pl 51 | ca 52 | Ca 53 | min 54 | Min 55 | max 56 | Max 57 | ún 58 | Ún 59 | prof 60 | Prof 61 | de 62 | De 63 | du 64 | Du 65 | Szt 66 | St 67 | 68 | #Numbers only. 
These should only induce breaks when followed by a numeric sequence 69 | # add NUMERIC_ONLY after the word for this function 70 | #This case is mostly for the english "No." which can either be a sentence of its own, or 71 | #if followed by a number, a non-breaking prefix 72 | 73 | # Month name abbreviations 74 | jan #NUMERIC_ONLY# 75 | Jan #NUMERIC_ONLY# 76 | Feb #NUMERIC_ONLY# 77 | feb #NUMERIC_ONLY# 78 | márc #NUMERIC_ONLY# 79 | Márc #NUMERIC_ONLY# 80 | ápr #NUMERIC_ONLY# 81 | Ápr #NUMERIC_ONLY# 82 | máj #NUMERIC_ONLY# 83 | Máj #NUMERIC_ONLY# 84 | jún #NUMERIC_ONLY# 85 | Jún #NUMERIC_ONLY# 86 | Júl #NUMERIC_ONLY# 87 | júl #NUMERIC_ONLY# 88 | aug #NUMERIC_ONLY# 89 | Aug #NUMERIC_ONLY# 90 | Szept #NUMERIC_ONLY# 91 | szept #NUMERIC_ONLY# 92 | okt #NUMERIC_ONLY# 93 | Okt #NUMERIC_ONLY# 94 | nov #NUMERIC_ONLY# 95 | Nov #NUMERIC_ONLY# 96 | dec #NUMERIC_ONLY# 97 | Dec #NUMERIC_ONLY# 98 | 99 | # Other abbreviations 100 | tel #NUMERIC_ONLY# 101 | Tel #NUMERIC_ONLY# 102 | Fax #NUMERIC_ONLY# 103 | fax #NUMERIC_ONLY# 104 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.is: -------------------------------------------------------------------------------- 1 | no #NUMERIC_ONLY# 2 | No #NUMERIC_ONLY# 3 | nr #NUMERIC_ONLY# 4 | Nr #NUMERIC_ONLY# 5 | nR #NUMERIC_ONLY# 6 | NR #NUMERIC_ONLY# 7 | a 8 | b 9 | c 10 | d 11 | e 12 | f 13 | g 14 | h 15 | i 16 | j 17 | k 18 | l 19 | m 20 | n 21 | o 22 | p 23 | q 24 | r 25 | s 26 | t 27 | u 28 | v 29 | w 30 | x 31 | y 32 | z 33 | ^ 34 | í 35 | á 36 | ó 37 | æ 38 | A 39 | B 40 | C 41 | D 42 | E 43 | F 44 | G 45 | H 46 | I 47 | J 48 | K 49 | L 50 | M 51 | N 52 | O 53 | P 54 | Q 55 | R 56 | S 57 | T 58 | U 59 | V 60 | W 61 | X 62 | Y 63 | Z 64 | ab.fn 65 | a.fn 66 | afs 67 | al 68 | alm 69 | alg 70 | andh 71 | ath 72 | aths 73 | atr 74 | ao 75 | au 76 | aukaf 77 | áfn 78 | áhrl.s 79 | áhrs 80 | ákv.gr 81 | ákv 82 | bh 83 | bls 84 | dr 85 | e.Kr 86 | et 87 | ef 88 | efn 89 | ennfr 90 | eink 91 | end 92 | e.st 93 | erl 94 | fél 95 | fskj 96 | fh 97 | f.hl 98 | físl 99 | fl 100 | fn 101 | fo 102 | forl 103 | frb 104 | frl 105 | frh 106 | frt 107 | fsl 108 | fsh 109 | fs 110 | fsk 111 | fst 112 | f.Kr 113 | ft 114 | fv 115 | fyrrn 116 | fyrrv 117 | germ 118 | gm 119 | gr 120 | hdl 121 | hdr 122 | hf 123 | hl 124 | hlsk 125 | hljsk 126 | hljv 127 | hljóðv 128 | hr 129 | hv 130 | hvk 131 | holl 132 | Hos 133 | höf 134 | hk 135 | hrl 136 | ísl 137 | kaf 138 | kap 139 | Khöfn 140 | kk 141 | kg 142 | kk 143 | km 144 | kl 145 | klst 146 | kr 147 | kt 148 | kgúrsk 149 | kvk 150 | leturbr 151 | lh 152 | lh.nt 153 | lh.þt 154 | lo 155 | ltr 156 | mlja 157 | mljó 158 | millj 159 | mm 160 | mms 161 | m.fl 162 | miðm 163 | mgr 164 | mst 165 | mín 166 | nf 167 | nh 168 | nhm 169 | nl 170 | nk 171 | nmgr 172 | no 173 | núv 174 | nt 175 | o.áfr 176 | o.m.fl 177 | ohf 178 | o.fl 179 | o.s.frv 180 | ófn 181 | ób 182 | óákv.gr 183 | óákv 184 | pfn 185 | PR 186 | pr 187 | Ritstj 188 | Rvík 189 | Rvk 190 | samb 191 | samhlj 192 | samn 193 | samn 194 | sbr 195 | sek 196 | sérn 197 | sf 198 | sfn 199 | sh 200 | sfn 201 | sh 202 | s.hl 203 | sk 204 | skv 205 | sl 206 | sn 207 | so 208 | ss.us 209 | s.st 210 | samþ 211 | sbr 212 | shlj 213 | sign 214 | skál 215 | st 216 | st.s 217 | stk 218 | sþ 219 | teg 220 | tbl 221 | tfn 222 | tl 223 | tvíhlj 224 | tvt 225 | till 226 | to 227 | umr 228 | uh 229 | us 230 | uppl 231 | útg 232 | vb 233 | Vf 234 | vh 235 | vkf 236 | Vl 237 | vl 238 | vlf 239 | vmf 240 | 8vo 241 | vsk 
242 | vth 243 | þt 244 | þf 245 | þjs 246 | þgf 247 | þlt 248 | þolm 249 | þm 250 | þml 251 | þýð 252 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.it: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 34 | Adj 35 | Adm 36 | Adv 37 | Amn 38 | Arch 39 | Asst 40 | Avv 41 | Bart 42 | Bcc 43 | Bldg 44 | Brig 45 | Bros 46 | C.A.P 47 | C.P 48 | Capt 49 | Cc 50 | Cmdr 51 | Co 52 | Col 53 | Comdr 54 | Con 55 | Corp 56 | Cpl 57 | DR 58 | Dott 59 | Dr 60 | Drs 61 | Egr 62 | Ens 63 | Gen 64 | Geom 65 | Gov 66 | Hon 67 | Hosp 68 | Hr 69 | Id 70 | Ing 71 | Insp 72 | Lt 73 | MM 74 | MR 75 | MRS 76 | MS 77 | Maj 78 | Messrs 79 | Mlle 80 | Mme 81 | Mo 82 | Mons 83 | Mr 84 | Mrs 85 | Ms 86 | Msgr 87 | N.B 88 | Op 89 | Ord 90 | P.S 91 | P.T 92 | Pfc 93 | Ph 94 | Prof 95 | Pvt 96 | RP 97 | RSVP 98 | Rag 99 | Rep 100 | Reps 101 | Res 102 | Rev 103 | Rif 104 | Rt 105 | S.A 106 | S.B.F 107 | S.P.M 108 | S.p.A 109 | S.r.l 110 | Sen 111 | Sens 112 | Sfc 113 | Sgt 114 | Sig 115 | Sigg 116 | Soc 117 | Spett 118 | Sr 119 | St 120 | Supt 121 | Surg 122 | V.P 123 | 124 | # other 125 | a.c 126 | acc 127 | all 128 | banc 129 | c.a 130 | c.c.p 131 | c.m 132 | c.p 133 | c.s 134 | c.v 135 | corr 136 | dott 137 | e.p.c 138 | ecc 139 | es 140 | fatt 141 | gg 142 | int 143 | lett 144 | ogg 145 | on 146 | p.c 147 | p.c.c 148 | p.es 149 | p.f 150 | p.r 151 | p.v 152 | post 153 | pp 154 | racc 155 | ric 156 | s.n.c 157 | seg 158 | sgg 159 | ss 160 | tel 161 | u.s 162 | v.r 163 | v.s 164 | 165 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 166 | v 167 | vs 168 | i.e 169 | rev 170 | e.g 171 | 172 | #Numbers only. These should only induce breaks when followed by a numeric sequence 173 | # add NUMERIC_ONLY after the word for this function 174 | #This case is mostly for the english "No." which can either be a sentence of its own, or 175 | #if followed by a number, a non-breaking prefix 176 | No #NUMERIC_ONLY# 177 | Nos 178 | Art #NUMERIC_ONLY# 179 | Nr 180 | pp #NUMERIC_ONLY# 181 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.lv: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 
3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | Ā 8 | B 9 | C 10 | Č 11 | D 12 | E 13 | Ē 14 | F 15 | G 16 | Ģ 17 | H 18 | I 19 | Ī 20 | J 21 | K 22 | Ķ 23 | L 24 | Ļ 25 | M 26 | N 27 | Ņ 28 | O 29 | P 30 | Q 31 | R 32 | S 33 | Š 34 | T 35 | U 36 | Ū 37 | V 38 | W 39 | X 40 | Y 41 | Z 42 | Ž 43 | 44 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 45 | dr 46 | Dr 47 | med 48 | prof 49 | Prof 50 | inž 51 | Inž 52 | ist.loc 53 | Ist.loc 54 | kor.loc 55 | Kor.loc 56 | v.i 57 | vietn 58 | Vietn 59 | 60 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 61 | a.l 62 | t.p 63 | pārb 64 | Pārb 65 | vec 66 | Vec 67 | inv 68 | Inv 69 | sk 70 | Sk 71 | spec 72 | Spec 73 | vienk 74 | Vienk 75 | virz 76 | Virz 77 | māksl 78 | Māksl 79 | mūz 80 | Mūz 81 | akad 82 | Akad 83 | soc 84 | Soc 85 | galv 86 | Galv 87 | vad 88 | Vad 89 | sertif 90 | Sertif 91 | folkl 92 | Folkl 93 | hum 94 | Hum 95 | 96 | #Numbers only. These should only induce breaks when followed by a numeric sequence 97 | # add NUMERIC_ONLY after the word for this function 98 | #This case is mostly for the english "No." which can either be a sentence of its own, or 99 | #if followed by a number, a non-breaking prefix 100 | Nr #NUMERIC_ONLY# 101 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.nl: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | #Sources: http://nl.wikipedia.org/wiki/Lijst_van_afkortingen 4 | # http://nl.wikipedia.org/wiki/Aanspreekvorm 5 | # http://nl.wikipedia.org/wiki/Titulatuur_in_het_Nederlands_hoger_onderwijs 6 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 7 | #usually upper case letters are initials in a name 8 | A 9 | B 10 | C 11 | D 12 | E 13 | F 14 | G 15 | H 16 | I 17 | J 18 | K 19 | L 20 | M 21 | N 22 | O 23 | P 24 | Q 25 | R 26 | S 27 | T 28 | U 29 | V 30 | W 31 | X 32 | Y 33 | Z 34 | 35 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 36 | bacc 37 | bc 38 | bgen 39 | c.i 40 | dhr 41 | dr 42 | dr.h.c 43 | drs 44 | drs 45 | ds 46 | eint 47 | fa 48 | Fa 49 | fam 50 | gen 51 | genm 52 | ing 53 | ir 54 | jhr 55 | jkvr 56 | jr 57 | kand 58 | kol 59 | lgen 60 | lkol 61 | Lt 62 | maj 63 | Mej 64 | mevr 65 | Mme 66 | mr 67 | mr 68 | Mw 69 | o.b.s 70 | plv 71 | prof 72 | ritm 73 | tint 74 | Vz 75 | Z.D 76 | Z.D.H 77 | Z.E 78 | Z.Em 79 | Z.H 80 | Z.K.H 81 | Z.K.M 82 | Z.M 83 | z.v 84 | 85 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 86 | #we seem to have a lot of these in dutch i.e.: i.p.v - in plaats van (in stead of) never ends a sentence 87 | a.g.v 88 | bijv 89 | bijz 90 | bv 91 | d.w.z 92 | e.c 93 | e.g 94 | e.k 95 | ev 96 | i.p.v 97 | i.s.m 98 | i.t.t 99 | i.v.m 100 | m.a.w 101 | m.b.t 102 | m.b.v 103 | m.h.o 104 | m.i 105 | m.i.v 106 | v.w.t 107 | 108 | #Numbers only. 
These should only induce breaks when followed by a numeric sequence 109 | # add NUMERIC_ONLY after the word for this function 110 | #This case is mostly for the english "No." which can either be a sentence of its own, or 111 | #if followed by a number, a non-breaking prefix 112 | Nr #NUMERIC_ONLY# 113 | Nrs 114 | nrs 115 | nr #NUMERIC_ONLY# 116 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.pl: -------------------------------------------------------------------------------- 1 | adw 2 | afr 3 | akad 4 | al 5 | Al 6 | am 7 | amer 8 | arch 9 | art 10 | Art 11 | artyst 12 | astr 13 | austr 14 | bałt 15 | bdb 16 | bł 17 | bm 18 | br 19 | bryg 20 | bryt 21 | centr 22 | ces 23 | chem 24 | chiń 25 | chir 26 | c.k 27 | c.o 28 | cyg 29 | cyw 30 | cyt 31 | czes 32 | czw 33 | cd 34 | Cd 35 | czyt 36 | ćw 37 | ćwicz 38 | daw 39 | dcn 40 | dekl 41 | demokr 42 | det 43 | diec 44 | dł 45 | dn 46 | dot 47 | dol 48 | dop 49 | dost 50 | dosł 51 | h.c 52 | ds 53 | dst 54 | duszp 55 | dypl 56 | egz 57 | ekol 58 | ekon 59 | elektr 60 | em 61 | ew 62 | fab 63 | farm 64 | fot 65 | fr 66 | gat 67 | gastr 68 | geogr 69 | geol 70 | gimn 71 | głęb 72 | gm 73 | godz 74 | górn 75 | gosp 76 | gr 77 | gram 78 | hist 79 | hiszp 80 | hr 81 | Hr 82 | hot 83 | id 84 | in 85 | im 86 | iron 87 | jn 88 | kard 89 | kat 90 | katol 91 | k.k 92 | kk 93 | kol 94 | kl 95 | k.p.a 96 | kpc 97 | k.p.c 98 | kpt 99 | kr 100 | k.r 101 | krak 102 | k.r.o 103 | kryt 104 | kult 105 | laic 106 | łac 107 | niem 108 | woj 109 | nb 110 | np 111 | Nb 112 | Np 113 | pol 114 | pow 115 | m.in 116 | pt 117 | ps 118 | Pt 119 | Ps 120 | cdn 121 | jw 122 | ryc 123 | rys 124 | Ryc 125 | Rys 126 | tj 127 | tzw 128 | Tzw 129 | tzn 130 | zob 131 | ang 132 | ub 133 | ul 134 | pw 135 | pn 136 | pl 137 | al 138 | k 139 | n 140 | nr #NUMERIC_ONLY# 141 | Nr #NUMERIC_ONLY# 142 | ww 143 | wł 144 | ur 145 | zm 146 | żyd 147 | żarg 148 | żyw 149 | wył 150 | bp 151 | bp 152 | wyst 153 | tow 154 | Tow 155 | o 156 | sp 157 | Sp 158 | st 159 | spółdz 160 | Spółdz 161 | społ 162 | spółgł 163 | stoł 164 | stow 165 | Stoł 166 | Stow 167 | zn 168 | zew 169 | zewn 170 | zdr 171 | zazw 172 | zast 173 | zaw 174 | zał 175 | zal 176 | zam 177 | zak 178 | zakł 179 | zagr 180 | zach 181 | adw 182 | Adw 183 | lek 184 | Lek 185 | med 186 | mec 187 | Mec 188 | doc 189 | Doc 190 | dyw 191 | dyr 192 | Dyw 193 | Dyr 194 | inż 195 | Inż 196 | mgr 197 | Mgr 198 | dh 199 | dr 200 | Dh 201 | Dr 202 | p 203 | P 204 | red 205 | Red 206 | prof 207 | prok 208 | Prof 209 | Prok 210 | hab 211 | płk 212 | Płk 213 | nadkom 214 | Nadkom 215 | podkom 216 | Podkom 217 | ks 218 | Ks 219 | gen 220 | Gen 221 | por 222 | Por 223 | reż 224 | Reż 225 | przyp 226 | Przyp 227 | śp 228 | św 229 | śW 230 | Śp 231 | Św 232 | ŚW 233 | szer 234 | Szer 235 | pkt #NUMERIC_ONLY# 236 | str #NUMERIC_ONLY# 237 | tab #NUMERIC_ONLY# 238 | Tab #NUMERIC_ONLY# 239 | tel 240 | ust #NUMERIC_ONLY# 241 | par #NUMERIC_ONLY# 242 | poz 243 | pok 244 | oo 245 | oO 246 | Oo 247 | OO 248 | r #NUMERIC_ONLY# 249 | l #NUMERIC_ONLY# 250 | s #NUMERIC_ONLY# 251 | najśw 252 | Najśw 253 | A 254 | B 255 | C 256 | D 257 | E 258 | F 259 | G 260 | H 261 | I 262 | J 263 | K 264 | L 265 | M 266 | N 267 | O 268 | P 269 | Q 270 | R 271 | S 272 | T 273 | U 274 | V 275 | W 276 | X 277 | Y 278 | Z 279 | Ś 280 | Ć 281 | Ż 282 | Ź 283 | Dz 284 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.pt: 
-------------------------------------------------------------------------------- 1 | #File adapted for PT by H. Leal Fontes from the EN & DE versions published with moses-2009-04-13. Last update: 10.11.2009. 2 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 3 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 4 | 5 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 6 | #usually upper case letters are initials in a name 7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | 61 | #Roman Numerals. A dot after one of these is not a sentence break in Portuguese. 62 | I 63 | II 64 | III 65 | IV 66 | V 67 | VI 68 | VII 69 | VIII 70 | IX 71 | X 72 | XI 73 | XII 74 | XIII 75 | XIV 76 | XV 77 | XVI 78 | XVII 79 | XVIII 80 | XIX 81 | XX 82 | i 83 | ii 84 | iii 85 | iv 86 | v 87 | vi 88 | vii 89 | viii 90 | ix 91 | x 92 | xi 93 | xii 94 | xiii 95 | xiv 96 | xv 97 | xvi 98 | xvii 99 | xviii 100 | xix 101 | xx 102 | 103 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 104 | Adj 105 | Adm 106 | Adv 107 | Art 108 | Ca 109 | Capt 110 | Cmdr 111 | Col 112 | Comdr 113 | Con 114 | Corp 115 | Cpl 116 | DR 117 | DRA 118 | Dr 119 | Dra 120 | Dras 121 | Drs 122 | Eng 123 | Enga 124 | Engas 125 | Engos 126 | Ex 127 | Exo 128 | Exmo 129 | Fig 130 | Gen 131 | Hosp 132 | Insp 133 | Lda 134 | MM 135 | MR 136 | MRS 137 | MS 138 | Maj 139 | Mrs 140 | Ms 141 | Msgr 142 | Op 143 | Ord 144 | Pfc 145 | Ph 146 | Prof 147 | Pvt 148 | Rep 149 | Reps 150 | Res 151 | Rev 152 | Rt 153 | Sen 154 | Sens 155 | Sfc 156 | Sgt 157 | Sr 158 | Sra 159 | Sras 160 | Srs 161 | Sto 162 | Supt 163 | Surg 164 | adj 165 | adm 166 | adv 167 | art 168 | cit 169 | col 170 | con 171 | corp 172 | cpl 173 | dr 174 | dra 175 | dras 176 | drs 177 | eng 178 | enga 179 | engas 180 | engos 181 | ex 182 | exo 183 | exmo 184 | fig 185 | op 186 | prof 187 | sr 188 | sra 189 | sras 190 | srs 191 | sto 192 | 193 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 194 | v 195 | vs 196 | i.e 197 | rev 198 | e.g 199 | 200 | #Numbers only. These should only induce breaks when followed by a numeric sequence 201 | # add NUMERIC_ONLY after the word for this function 202 | #This case is mostly for the english "No." 
which can either be a sentence of its own, or 203 | #if followed by a number, a non-breaking prefix 204 | No #NUMERIC_ONLY# 205 | Nos 206 | Art #NUMERIC_ONLY# 207 | Nr 208 | p #NUMERIC_ONLY# 209 | pp #NUMERIC_ONLY# 210 | 211 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.ro: -------------------------------------------------------------------------------- 1 | A 2 | B 3 | C 4 | D 5 | E 6 | F 7 | G 8 | H 9 | I 10 | J 11 | K 12 | L 13 | M 14 | N 15 | O 16 | P 17 | Q 18 | R 19 | S 20 | T 21 | U 22 | V 23 | W 24 | X 25 | Y 26 | Z 27 | dpdv 28 | etc 29 | șamd 30 | M.Ap.N 31 | dl 32 | Dl 33 | d-na 34 | D-na 35 | dvs 36 | Dvs 37 | pt 38 | Pt 39 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.ru: -------------------------------------------------------------------------------- 1 | # added Cyrillic uppercase letters [А-Я] 2 | # removed 000D carriage return (this is not removed by chomp in tokenizer.perl, and prevents recognition of the prefixes) 3 | # edited by Kate Young (nspaceanalysis@earthlink.net) 21 May 2013 4 | А 5 | Б 6 | В 7 | Г 8 | Д 9 | Е 10 | Ж 11 | З 12 | И 13 | Й 14 | К 15 | Л 16 | М 17 | Н 18 | О 19 | П 20 | Р 21 | С 22 | Т 23 | У 24 | Ф 25 | Х 26 | Ц 27 | Ч 28 | Ш 29 | Щ 30 | Ъ 31 | Ы 32 | Ь 33 | Э 34 | Ю 35 | Я 36 | A 37 | B 38 | C 39 | D 40 | E 41 | F 42 | G 43 | H 44 | I 45 | J 46 | K 47 | L 48 | M 49 | N 50 | O 51 | P 52 | Q 53 | R 54 | S 55 | T 56 | U 57 | V 58 | W 59 | X 60 | Y 61 | Z 62 | 0гг 63 | 1гг 64 | 2гг 65 | 3гг 66 | 4гг 67 | 5гг 68 | 6гг 69 | 7гг 70 | 8гг 71 | 9гг 72 | 0г 73 | 1г 74 | 2г 75 | 3г 76 | 4г 77 | 5г 78 | 6г 79 | 7г 80 | 8г 81 | 9г 82 | Xвв 83 | Vвв 84 | Iвв 85 | Lвв 86 | Mвв 87 | Cвв 88 | Xв 89 | Vв 90 | Iв 91 | Lв 92 | Mв 93 | Cв 94 | 0м 95 | 1м 96 | 2м 97 | 3м 98 | 4м 99 | 5м 100 | 6м 101 | 7м 102 | 8м 103 | 9м 104 | 0мм 105 | 1мм 106 | 2мм 107 | 3мм 108 | 4мм 109 | 5мм 110 | 6мм 111 | 7мм 112 | 8мм 113 | 9мм 114 | 0см 115 | 1см 116 | 2см 117 | 3см 118 | 4см 119 | 5см 120 | 6см 121 | 7см 122 | 8см 123 | 9см 124 | 0дм 125 | 1дм 126 | 2дм 127 | 3дм 128 | 4дм 129 | 5дм 130 | 6дм 131 | 7дм 132 | 8дм 133 | 9дм 134 | 0л 135 | 1л 136 | 2л 137 | 3л 138 | 4л 139 | 5л 140 | 6л 141 | 7л 142 | 8л 143 | 9л 144 | 0км 145 | 1км 146 | 2км 147 | 3км 148 | 4км 149 | 5км 150 | 6км 151 | 7км 152 | 8км 153 | 9км 154 | 0га 155 | 1га 156 | 2га 157 | 3га 158 | 4га 159 | 5га 160 | 6га 161 | 7га 162 | 8га 163 | 9га 164 | 0кг 165 | 1кг 166 | 2кг 167 | 3кг 168 | 4кг 169 | 5кг 170 | 6кг 171 | 7кг 172 | 8кг 173 | 9кг 174 | 0т 175 | 1т 176 | 2т 177 | 3т 178 | 4т 179 | 5т 180 | 6т 181 | 7т 182 | 8т 183 | 9т 184 | 0г 185 | 1г 186 | 2г 187 | 3г 188 | 4г 189 | 5г 190 | 6г 191 | 7г 192 | 8г 193 | 9г 194 | 0мг 195 | 1мг 196 | 2мг 197 | 3мг 198 | 4мг 199 | 5мг 200 | 6мг 201 | 7мг 202 | 8мг 203 | 9мг 204 | бульв 205 | в 206 | вв 207 | г 208 | га 209 | гг 210 | гл 211 | гос 212 | д 213 | дм 214 | доп 215 | др 216 | е 217 | ед 218 | ед 219 | зам 220 | и 221 | инд 222 | исп 223 | Исп 224 | к 225 | кап 226 | кг 227 | кв 228 | кл 229 | км 230 | кол 231 | комн 232 | коп 233 | куб 234 | л 235 | лиц 236 | лл 237 | м 238 | макс 239 | мг 240 | мин 241 | мл 242 | млн 243 | млрд 244 | мм 245 | н 246 | наб 247 | нач 248 | неуд 249 | ном 250 | о 251 | обл 252 | обр 253 | общ 254 | ок 255 | ост 256 | отл 257 | п 258 | пер 259 | перераб 260 | пл 261 | пос 262 | пр 263 | просп 264 | проф 265 | р 266 | ред 267 | руб 268 | с 269 | сб 270 | св 271 | см 
272 | соч 273 | ср 274 | ст 275 | стр 276 | т 277 | тел 278 | Тел 279 | тех 280 | тт 281 | туп 282 | тыс 283 | уд 284 | ул 285 | уч 286 | физ 287 | х 288 | хор 289 | ч 290 | чел 291 | шт 292 | экз 293 | э 294 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.sk: -------------------------------------------------------------------------------- 1 | Bc 2 | Mgr 3 | RNDr 4 | PharmDr 5 | PhDr 6 | JUDr 7 | PaedDr 8 | ThDr 9 | Ing 10 | MUDr 11 | MDDr 12 | MVDr 13 | Dr 14 | ThLic 15 | PhD 16 | ArtD 17 | ThDr 18 | Dr 19 | DrSc 20 | CSs 21 | prof 22 | obr 23 | Obr 24 | Č 25 | č 26 | absol 27 | adj 28 | admin 29 | adr 30 | Adr 31 | adv 32 | advok 33 | afr 34 | ak 35 | akad 36 | akc 37 | akuz 38 | et 39 | al 40 | alch 41 | amer 42 | anat 43 | angl 44 | Angl 45 | anglosas 46 | anorg 47 | ap 48 | apod 49 | arch 50 | archeol 51 | archit 52 | arg 53 | art 54 | astr 55 | astrol 56 | astron 57 | atp 58 | atď 59 | austr 60 | Austr 61 | aut 62 | belg 63 | Belg 64 | bibl 65 | Bibl 66 | biol 67 | bot 68 | bud 69 | bás 70 | býv 71 | cest 72 | chem 73 | cirk 74 | csl 75 | čs 76 | Čs 77 | dat 78 | dep 79 | det 80 | dial 81 | diaľ 82 | dipl 83 | distrib 84 | dokl 85 | dosl 86 | dopr 87 | dram 88 | duš 89 | dv 90 | dvojčl 91 | dór 92 | ekol 93 | ekon 94 | el 95 | elektr 96 | elektrotech 97 | energet 98 | epic 99 | est 100 | etc 101 | etonym 102 | eufem 103 | európ 104 | Európ 105 | ev 106 | evid 107 | expr 108 | fa 109 | fam 110 | farm 111 | fem 112 | feud 113 | fil 114 | filat 115 | filoz 116 | fi 117 | fon 118 | form 119 | fot 120 | fr 121 | Fr 122 | franc 123 | Franc 124 | fraz 125 | fut 126 | fyz 127 | fyziol 128 | garb 129 | gen 130 | genet 131 | genpor 132 | geod 133 | geogr 134 | geol 135 | geom 136 | germ 137 | gr 138 | Gr 139 | gréc 140 | Gréc 141 | gréckokat 142 | hebr 143 | herald 144 | hist 145 | hlav 146 | hosp 147 | hromad 148 | hud 149 | hypok 150 | ident 151 | i.e 152 | ident 153 | imp 154 | impf 155 | indoeur 156 | inf 157 | inform 158 | instr 159 | int 160 | interj 161 | inšt 162 | inštr 163 | iron 164 | jap 165 | Jap 166 | jaz 167 | jedn 168 | juhoamer 169 | juhových 170 | juhozáp 171 | juž 172 | kanad 173 | Kanad 174 | kanc 175 | kapit 176 | kpt 177 | kart 178 | katastr 179 | knih 180 | kniž 181 | komp 182 | konj 183 | konkr 184 | kozmet 185 | krajč 186 | kresť 187 | kt 188 | kuch 189 | lat 190 | latinskoamer 191 | lek 192 | lex 193 | lingv 194 | lit 195 | litur 196 | log 197 | lok 198 | max 199 | Max 200 | maď 201 | Maď 202 | medzinár 203 | mest 204 | metr 205 | mil 206 | Mil 207 | min 208 | Min 209 | miner 210 | ml 211 | mld 212 | mn 213 | mod 214 | mytol 215 | napr 216 | nar 217 | Nar 218 | nasl 219 | nedok 220 | neg 221 | negat 222 | neklas 223 | nem 224 | Nem 225 | neodb 226 | neos 227 | neskl 228 | nesklon 229 | nespis 230 | nespráv 231 | neved 232 | než 233 | niekt 234 | niž 235 | nom 236 | náb 237 | nákl 238 | námor 239 | nár 240 | obch 241 | obj 242 | obv 243 | obyč 244 | obč 245 | občian 246 | odb 247 | odd 248 | ods 249 | ojed 250 | okr 251 | Okr 252 | opt 253 | opyt 254 | org 255 | os 256 | osob 257 | ot 258 | ovoc 259 | par 260 | part 261 | pejor 262 | pers 263 | pf 264 | Pf 265 | P.f 266 | p.f 267 | pl 268 | Plk 269 | pod 270 | podst 271 | pokl 272 | polit 273 | politol 274 | polygr 275 | pomn 276 | popl 277 | por 278 | porad 279 | porov 280 | posch 281 | potrav 282 | použ 283 | poz 284 | pozit 285 | poľ 286 | poľno 287 | poľnohosp 288 | poľov 289 | pošt 290 | pož 291 | prac 292 | predl 293 | pren 294 | 
prep 295 | preuk 296 | priezv 297 | Priezv 298 | privl 299 | prof 300 | práv 301 | príd 302 | príj 303 | prík 304 | príp 305 | prír 306 | prísl 307 | príslov 308 | príč 309 | psych 310 | publ 311 | pís 312 | písm 313 | pôv 314 | refl 315 | reg 316 | rep 317 | resp 318 | rozk 319 | rozlič 320 | rozpráv 321 | roč 322 | Roč 323 | ryb 324 | rádiotech 325 | rím 326 | samohl 327 | semest 328 | sev 329 | severoamer 330 | severových 331 | severozáp 332 | sg 333 | skr 334 | skup 335 | sl 336 | Sloven 337 | soc 338 | soch 339 | sociol 340 | sp 341 | spol 342 | Spol 343 | spoloč 344 | spoluhl 345 | správ 346 | spôs 347 | st 348 | star 349 | starogréc 350 | starorím 351 | s.r.o 352 | stol 353 | stor 354 | str 355 | stredoamer 356 | stredoškol 357 | subj 358 | subst 359 | superl 360 | sv 361 | sz 362 | súkr 363 | súp 364 | súvzť 365 | tal 366 | Tal 367 | tech 368 | tel 369 | Tel 370 | telef 371 | teles 372 | telev 373 | teol 374 | trans 375 | turist 376 | tuzem 377 | typogr 378 | tzn 379 | tzv 380 | ukaz 381 | ul 382 | Ul 383 | umel 384 | univ 385 | ust 386 | ved 387 | vedľ 388 | verb 389 | veter 390 | vin 391 | viď 392 | vl 393 | vod 394 | vodohosp 395 | pnl 396 | vulg 397 | vyj 398 | vys 399 | vysokoškol 400 | vzťaž 401 | vôb 402 | vých 403 | výd 404 | výrob 405 | výsk 406 | výsl 407 | výtv 408 | výtvar 409 | význ 410 | včel 411 | vš 412 | všeob 413 | zahr 414 | zar 415 | zariad 416 | zast 417 | zastar 418 | zastaráv 419 | zb 420 | zdravot 421 | združ 422 | zjemn 423 | zlat 424 | zn 425 | Zn 426 | zool 427 | zr 428 | zried 429 | zv 430 | záhr 431 | zák 432 | zákl 433 | zám 434 | záp 435 | západoeur 436 | zázn 437 | územ 438 | účt 439 | čast 440 | čes 441 | Čes 442 | čl 443 | čísl 444 | živ 445 | pr 446 | fak 447 | Kr 448 | p.n.l 449 | A 450 | B 451 | C 452 | D 453 | E 454 | F 455 | G 456 | H 457 | I 458 | J 459 | K 460 | L 461 | M 462 | N 463 | O 464 | P 465 | Q 466 | R 467 | S 468 | T 469 | U 470 | V 471 | W 472 | X 473 | Y 474 | Z 475 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.sl: -------------------------------------------------------------------------------- 1 | dr 2 | Dr 3 | itd 4 | itn 5 | št #NUMERIC_ONLY# 6 | Št #NUMERIC_ONLY# 7 | d 8 | jan 9 | Jan 10 | feb 11 | Feb 12 | mar 13 | Mar 14 | apr 15 | Apr 16 | jun 17 | Jun 18 | jul 19 | Jul 20 | avg 21 | Avg 22 | sept 23 | Sept 24 | sep 25 | Sep 26 | okt 27 | Okt 28 | nov 29 | Nov 30 | dec 31 | Dec 32 | tj 33 | Tj 34 | npr 35 | Npr 36 | sl 37 | Sl 38 | op 39 | Op 40 | gl 41 | Gl 42 | oz 43 | Oz 44 | prev 45 | dipl 46 | ing 47 | prim 48 | Prim 49 | cf 50 | Cf 51 | gl 52 | Gl 53 | A 54 | B 55 | C 56 | D 57 | E 58 | F 59 | G 60 | H 61 | I 62 | J 63 | K 64 | L 65 | M 66 | N 67 | O 68 | P 69 | Q 70 | R 71 | S 72 | T 73 | U 74 | V 75 | W 76 | X 77 | Y 78 | Z 79 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.sv: -------------------------------------------------------------------------------- 1 | #single upper case letter are usually initials 2 | A 3 | B 4 | C 5 | D 6 | E 7 | F 8 | G 9 | H 10 | I 11 | J 12 | K 13 | L 14 | M 15 | N 16 | O 17 | P 18 | Q 19 | R 20 | S 21 | T 22 | U 23 | V 24 | W 25 | X 26 | Y 27 | Z 28 | #misc abbreviations 29 | AB 30 | G 31 | VG 32 | dvs 33 | etc 34 | from 35 | iaf 36 | jfr 37 | kl 38 | kr 39 | mao 40 | mfl 41 | mm 42 | osv 43 | pga 44 | tex 45 | tom 46 | vs 47 | -------------------------------------------------------------------------------- 
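The `#NUMERIC_ONLY#` marker used throughout these prefix lists (see the comments in the `.lv`, `.nl`, `.pt` and `.ta` files) is straightforward to consume programmatically. The sketch below is illustrative only — it is not the logic of `tokenizer.perl`, and the helper names `load_prefixes` and `breaks_sentence` are hypothetical:

```python
# Minimal sketch of how a sentence splitter might use a nonbreaking_prefix.* file.
# Assumption: plain entries never allow a break after "<entry>."; entries marked
# #NUMERIC_ONLY# only suppress the break when the next token starts with a digit.

def load_prefixes(path):
    """Return {prefix: is_numeric_only} parsed from a nonbreaking-prefix file."""
    prefixes = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue  # skip blank lines and comments
            if "#NUMERIC_ONLY#" in line:
                prefixes[line.split("#")[0].strip()] = True
            else:
                prefixes[line] = False
    return prefixes

def breaks_sentence(token, next_token, prefixes):
    """Does the period ending `token` terminate a sentence?"""
    if not token.endswith("."):
        return False
    word = token[:-1]
    if word in prefixes:
        if not prefixes[word]:            # ordinary prefix: never a break
            return False
        if next_token[:1].isdigit():      # NUMERIC_ONLY: no break before a number
            return False
    return True
```

With the Dutch list, for instance, `breaks_sentence("Nr.", "5", prefixes)` returns `False`, while `breaks_sentence("Nr.", "Hij", prefixes)` returns `True`, matching the "No."-style behaviour described in the comments above.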
/data/nonbreaking_prefixes/nonbreaking_prefix.ta: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | அ 7 | ஆ 8 | இ 9 | ஈ 10 | உ 11 | ஊ 12 | எ 13 | ஏ 14 | ஐ 15 | ஒ 16 | ஓ 17 | ஔ 18 | ஃ 19 | க 20 | கா 21 | கி 22 | கீ 23 | கு 24 | கூ 25 | கெ 26 | கே 27 | கை 28 | கொ 29 | கோ 30 | கௌ 31 | க் 32 | ச 33 | சா 34 | சி 35 | சீ 36 | சு 37 | சூ 38 | செ 39 | சே 40 | சை 41 | சொ 42 | சோ 43 | சௌ 44 | ச் 45 | ட 46 | டா 47 | டி 48 | டீ 49 | டு 50 | டூ 51 | டெ 52 | டே 53 | டை 54 | டொ 55 | டோ 56 | டௌ 57 | ட் 58 | த 59 | தா 60 | தி 61 | தீ 62 | து 63 | தூ 64 | தெ 65 | தே 66 | தை 67 | தொ 68 | தோ 69 | தௌ 70 | த் 71 | ப 72 | பா 73 | பி 74 | பீ 75 | பு 76 | பூ 77 | பெ 78 | பே 79 | பை 80 | பொ 81 | போ 82 | பௌ 83 | ப் 84 | ற 85 | றா 86 | றி 87 | றீ 88 | று 89 | றூ 90 | றெ 91 | றே 92 | றை 93 | றொ 94 | றோ 95 | றௌ 96 | ற் 97 | ய 98 | யா 99 | யி 100 | யீ 101 | யு 102 | யூ 103 | யெ 104 | யே 105 | யை 106 | யொ 107 | யோ 108 | யௌ 109 | ய் 110 | ர 111 | ரா 112 | ரி 113 | ரீ 114 | ரு 115 | ரூ 116 | ரெ 117 | ரே 118 | ரை 119 | ரொ 120 | ரோ 121 | ரௌ 122 | ர் 123 | ல 124 | லா 125 | லி 126 | லீ 127 | லு 128 | லூ 129 | லெ 130 | லே 131 | லை 132 | லொ 133 | லோ 134 | லௌ 135 | ல் 136 | வ 137 | வா 138 | வி 139 | வீ 140 | வு 141 | வூ 142 | வெ 143 | வே 144 | வை 145 | வொ 146 | வோ 147 | வௌ 148 | வ் 149 | ள 150 | ளா 151 | ளி 152 | ளீ 153 | ளு 154 | ளூ 155 | ளெ 156 | ளே 157 | ளை 158 | ளொ 159 | ளோ 160 | ளௌ 161 | ள் 162 | ழ 163 | ழா 164 | ழி 165 | ழீ 166 | ழு 167 | ழூ 168 | ழெ 169 | ழே 170 | ழை 171 | ழொ 172 | ழோ 173 | ழௌ 174 | ழ் 175 | ங 176 | ஙா 177 | ஙி 178 | ஙீ 179 | ஙு 180 | ஙூ 181 | ஙெ 182 | ஙே 183 | ஙை 184 | ஙொ 185 | ஙோ 186 | ஙௌ 187 | ங் 188 | ஞ 189 | ஞா 190 | ஞி 191 | ஞீ 192 | ஞு 193 | ஞூ 194 | ஞெ 195 | ஞே 196 | ஞை 197 | ஞொ 198 | ஞோ 199 | ஞௌ 200 | ஞ் 201 | ண 202 | ணா 203 | ணி 204 | ணீ 205 | ணு 206 | ணூ 207 | ணெ 208 | ணே 209 | ணை 210 | ணொ 211 | ணோ 212 | ணௌ 213 | ண் 214 | ந 215 | நா 216 | நி 217 | நீ 218 | நு 219 | நூ 220 | நெ 221 | நே 222 | நை 223 | நொ 224 | நோ 225 | நௌ 226 | ந் 227 | ம 228 | மா 229 | மி 230 | மீ 231 | மு 232 | மூ 233 | மெ 234 | மே 235 | மை 236 | மொ 237 | மோ 238 | மௌ 239 | ம் 240 | ன 241 | னா 242 | னி 243 | னீ 244 | னு 245 | னூ 246 | னெ 247 | னே 248 | னை 249 | னொ 250 | னோ 251 | னௌ 252 | ன் 253 | 254 | 255 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 256 | திரு 257 | திருமதி 258 | வண 259 | கௌரவ 260 | 261 | 262 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 263 | உ.ம் 264 | #கா.ம் 265 | #எ.ம் 266 | 267 | 268 | #Numbers only. These should only induce breaks when followed by a numeric sequence 269 | # add NUMERIC_ONLY after the word for this function 270 | #This case is mostly for the english "No." 
which can either be a sentence of its own, or 271 | #if followed by a number, a non-breaking prefix 272 | No #NUMERIC_ONLY# 273 | Nos 274 | Art #NUMERIC_ONLY# 275 | Nr 276 | pp #NUMERIC_ONLY# 277 | -------------------------------------------------------------------------------- /data/postprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # merges subword units that were split by BPE 4 | 5 | sed -r 's/\@\@ //g' -------------------------------------------------------------------------------- /data/preprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | P=$1 4 | 5 | # source language (example: fr) 6 | S=$2 7 | # target language (example: en) 8 | T=$3 9 | 10 | # path to nematus/data 11 | P1=$4 12 | 13 | # path to subword NMT scripts (can be downloaded from https://github.com/rsennrich/subword-nmt) 14 | P2=$5 15 | 16 | # tokenize 17 | perl $P1/tokenizer.perl -threads 5 -l $S < {P}.${S} > {P}.${S}.tok 18 | perl $P1/tokenizer.perl -threads 5 -l $T < {P}.${T} > {P}.${T}.tok 19 | 20 | # learn BPE on joint vocabulary: 21 | cat {P}.${S}.tok {P}.${T}.tok | python $P2/learn_bpe.py -s 20000 > ${S}${T}.bpe 22 | 23 | python3 $P2/apply_bpe.py -c ${S}${T}.bpe < {P}.${S}.tok > {P}.${S}.tok.bpe 24 | python3 $P2/apply_bpe.py -c ${S}${T}.bpe < {P}.${T}.tok > {P}.${T}.tok.bpe 25 | 26 | # build dictionary 27 | python3 $P1/build_dictionary.py {P}.${S}.tok.bpe 28 | python3 $P1/build_dictionary.py {P}.${T}.tok.bpe 29 | 30 | -------------------------------------------------------------------------------- /data/shuffle.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import math 4 | import os 5 | import random 6 | import sys 7 | import tempfile 8 | 9 | 10 | # TODO Make CHUNK_SIZE user configurable? 11 | CHUNK_SIZE = 10000000 # Number of lines. 12 | 13 | def jointly_shuffle_files(files, temporary=False): 14 | """Randomly shuffle the given files, applying the same permutation to each. 15 | 16 | Since the same permutation is applied to all input files, they must 17 | contain the same number of input lines. 18 | 19 | If 'temporary' is True then the shuffled files are written to temporary 20 | files. Otherwise, the shuffled files are written to files with the same 21 | paths as the originals, but with the added suffix '.shuf'. 22 | 23 | In addition to shuffling the files, any leading or trailing whitespace is 24 | removed from each line. 25 | 26 | In order to handle large files, the input files are not read into memory 27 | in full, but instead are read in chunks of size CHUNK_SIZE. 28 | 29 | Args: 30 | files: a list of strings specifying the paths of the input files. 31 | temporary: a Boolean (see description above). 32 | 33 | Returns: 34 | A list containing a file object for each shuffled file, in the same 35 | order as the input files. Each file object is open and positioned at 36 | the start of the file. 37 | """ 38 | 39 | # Determine the number of lines (should be the same for all files). 40 | total_lines = 0 41 | for _ in open(files[0]): 42 | total_lines += 1 43 | 44 | # Randomly permute the list of line numbers. 45 | perm = list(range(total_lines)) 46 | random.shuffle(perm) 47 | 48 | # Convert the list of line numbers to a list of chunk indices and offsets. 49 | ordering = [(i // CHUNK_SIZE, i % CHUNK_SIZE) for i in perm] 50 | 51 | # Sort each file according to the generated ordering. 
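    # Illustration of the (chunk, offset) encoding above, using a hypothetical
    # CHUNK_SIZE of 3 instead of 10000000: a permutation [4, 0, 2, 1, 3] over five
    # lines becomes ordering = [(1, 1), (0, 0), (0, 2), (0, 1), (1, 0)], telling
    # _sort_file which chunk and offset each output line should be copied from.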
52 | return [_sort_file(path, ordering, temporary) for path in files] 53 | 54 | 55 | def _sort_file(path, ordering, temporary): 56 | 57 | # Open a temporary file for each chunk. 58 | 59 | num_chunks = math.ceil(len(ordering) / CHUNK_SIZE) 60 | dirname, filename = os.path.split(os.path.realpath(path)) 61 | chunk_files = [tempfile.TemporaryFile(prefix=filename+'.chunk'+str(i), 62 | dir=dirname, mode='w+', 63 | encoding="UTF-8") 64 | for i in range(num_chunks)] 65 | 66 | # Read one chunk at a time from path and write the lines to the temporary 67 | # files in the order specified by ordering. 68 | 69 | def _write_chunk_in_order(chunk, chunk_num, out_file): 70 | for i, j in ordering: 71 | if i == chunk_num: 72 | out_file.write(chunk[j] + '\n') 73 | 74 | chunk = [] 75 | chunk_num = 0 76 | for i, line in enumerate(open(path)): 77 | if i > 0 and (i % CHUNK_SIZE) == 0: 78 | _write_chunk_in_order(chunk, chunk_num, chunk_files[chunk_num]) 79 | chunk = [] 80 | chunk_num += 1 81 | chunk.append(line.strip()) 82 | if chunk: 83 | _write_chunk_in_order(chunk, chunk_num, chunk_files[chunk_num]) 84 | 85 | # Open the output file. 86 | if temporary: 87 | out_file = tempfile.TemporaryFile(prefix=filename+'.shuf', dir=dirname, 88 | mode='w+', encoding='UTF-8') 89 | else: 90 | out_file = open(path+'.shuf', mode='w', encoding='UTF-8') 91 | 92 | # Seek to the start of the chunk files. 93 | for chunk_file in chunk_files: 94 | chunk_file.seek(0) 95 | 96 | # Write the output. 97 | for i, _ in ordering: 98 | line = chunk_files[i].readline() 99 | out_file.write(line) 100 | 101 | # Seek to the start so that the file object is ready for reading. 102 | out_file.seek(0) 103 | 104 | return out_file 105 | 106 | 107 | if __name__ == '__main__': 108 | jointly_shuffle_files(sys.argv[1:]) 109 | -------------------------------------------------------------------------------- /data/strip_sgml.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import re 3 | 4 | 5 | def main(): 6 | fin = sys.stdin 7 | fout = sys.stdout 8 | for l in fin: 9 | line = l.strip() 10 | text = re.sub('<[^<]+>', "", line).strip() 11 | if len(text) == 0: 12 | continue 13 | print(text, file=fout) 14 | 15 | 16 | if __name__ == "__main__": 17 | main() 18 | 19 | -------------------------------------------------------------------------------- /doc/factored_neural_machine_translation.md: -------------------------------------------------------------------------------- 1 | FACTORED NEURAL MACHINE TRANSLATION 2 | ----------------------------------- 3 | 4 | Nematus supports arbitrary input features through factored representations, similar to factored models popularized with Moses. 5 | This can be used to add linguistic features such as lemmas, POS, or dependency labels, or potentially other types of information. 6 | The pipe symbol "|" serves as a factor separator and should not otherwise appear in the text. 7 | 8 | To use factored models, follow these steps: 9 | 10 | - preprocess the source side of the training, development and test data to include factors. Consider this example sentence, in an unfactored (or 1-factored) representation, and with 4 factors per word: 11 | 12 | Leonidas begged in the arena . 
13 | 14 | Leonidas|Leonidas|NNP|nsubj begged|beg|VBD|root in|in|IN|prep the|the|DT|det gladiatorial|gladiatorial|JJ|amod arena|arena|NN|pobj 15 | 16 | https://github.com/rsennrich/wmt16-scripts/tree/master/factored_sample provides sample scripts to produce a factored representation from a CoNLL file, and BPE-segmented text. 17 | 18 | - in the arguments to nematus.nmt.train, adjust the following options: 19 | - factors: the number of factors per word 20 | - dim_per_factor: the size of the embedding layer for each factor (a list of integers) 21 | - dim_word: the total size of the input embedding (must match the sum of dim_per_factor) 22 | - dictionaries: add a vocabulary file for each factor (in the order they appear), plus a vocabulary file for the target side 23 | 24 | an example config is shown at https://github.com/rsennrich/wmt16-scripts/blob/master/factored_sample/config.py 25 | 26 | - commands for training and running Nematus are otherwise identical to the non-factored version 27 | 28 | 29 | PUBLICATIONS 30 | ------------ 31 | 32 | factored neural machine translation is described in: 33 | 34 | Sennrich, Rico, Haddow, Barry (2016): Linguistic Input Features Improve Neural Machine Translation, Proc. of the First Conference on Machine Translation (WMT16). Berlin, Germany -------------------------------------------------------------------------------- /doc/multi_gpu_training.md: -------------------------------------------------------------------------------- 1 | Multi-GPU Training with Nematus 2 | ------------------------------- 3 | 4 | Nematus supports multi-GPU training; this shows how to make the best use of it. 5 | 6 | Controlling devices: 7 | -------------------- 8 | 9 | by default, Nematus will split training across all available devices. 10 | To control which device(s) to use for training, use `CUDA_VISIBLE_DEVICES`. 11 | 12 | For example, this command uses the first two devices: 13 | 14 | ``` 15 | CUDA_VISIBLE_DEVICES=0,1 python3 nematus/train.py 16 | ``` 17 | 18 | Update strategy and batch size: 19 | ------------------------------- 20 | 21 | Nematus will perform an update after a fixed number of sentences (`--batch_size`) or tokens (rounded down to full sentences; `--token_batch_size`). If both are defined, `--token_batch_size` takes priority. 22 | 23 | When training on multiple devices, Nematus uses Synchronous SGD, and sentences in a batch are split between GPUs. 24 | We choose this strategy for transparency. In principle, if training a model on the same data with the same command line parameters, 25 | you should get similar results (except for random variation), even if systems are trained on different (number of) GPUs. 26 | 27 | Generally, you should choose a large batch size to benefit from multi-GPU training and stabilize training of Transformers. 28 | Our [baseline configuration](https://github.com/EdinburghNLP/wmt17-transformer-scripts/blob/master/training/scripts/train.sh) uses a `token_batch_size` of 16384, 29 | and was tested on 4 GPUs with 12GB of memory each. 30 | 31 | If you want to train a model with a batch size between updates that exceeds the memory available on your devices (because you are limited in the size and/or number of GPUs), 32 | Nematus supports two ways of further splitting up the batch. 33 | 34 | - define `--max_sentences_per_device` or `--max_tokens_per_device`. This is the size of the batch that is processed on a single device at once. Batches are accumulated until reaching the total batch size. 
For example, defining `--max_tokens_per_device 4096` should ensure that the Transformer baseline will train successfully with 1-4 GPUs without running out of memory. 35 | - define `--gradient_aggregation_steps`. This will split the minibatch that is sent to a device into X steps, and the gradients from all steps are accumulated. For example, defining `--gradient_aggregation_steps 4` on a training run with 1 device should result in the same memory consumption as `--gradient_aggregation_steps 1` with 4 devices. 36 | -------------------------------------------------------------------------------- /nematus/.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.pyc 3 | -------------------------------------------------------------------------------- /nematus/__init__.py: -------------------------------------------------------------------------------- 1 | from nematus import * 2 | from . import rescore 3 | from . import translate 4 | -------------------------------------------------------------------------------- /nematus/exception.py: -------------------------------------------------------------------------------- 1 | class Error(Exception): 2 | def __init__(self, msg): 3 | self.msg = msg 4 | -------------------------------------------------------------------------------- /nematus/exponential_smoothing.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | # How often to update smoothed variables (in terms of training steps). 5 | DEFAULT_UPDATE_FREQUENCY = 5 6 | 7 | 8 | class ExponentialSmoothing(object): 9 | """Defines TensorFlow variables and operations for exponential smoothing. 10 | 11 | Following Marian [1], we maintain smoothed versions of all trainable 12 | variables. This class creates the smoothed variables (assuming that the 13 | model has already been initialized) and provides operations that can be 14 | run to update the variables and to interchange the values of the raw and 15 | the smoothed variables (which can be used to swap-in the smoothed versions 16 | for validation, for instance). 17 | 18 | Ideally, the smoothed variables would be updated after every training step, 19 | but in practice that introduces a noticeable overhead (around 20%) 20 | due to the need to transfer tensor values from GPU memory into CPU memory. 21 | Instead we allow updating after every N steps by increasing the smoothing 22 | factor accordingly. The default N=5 seems to be a good compromise. 23 | 24 | [1] 25 | "Marian: Fast Neural Machine Translation in C++", 26 | Junczys-Dowmunt et al., in Proceedings of ACL 2018, System Demonstrations. 27 | """ 28 | 29 | def __init__(self, smoothing_factor, 30 | update_frequency=DEFAULT_UPDATE_FREQUENCY): 31 | """Creates TF variables and operations. 32 | 33 | Args: 34 | smoothing_factor: float controlling weight of past vs new values. 35 | update_frequency: integer indicating how often updates will occur. 36 | """ 37 | self._update_frequency = update_frequency 38 | adjusted_smoothing_factor = smoothing_factor * update_frequency 39 | # Smoothed variables are stored in CPU memory to avoid eating into 40 | # valuable GPU memory. 41 | device_spec = tf.DeviceSpec(device_type="CPU", device_index=0) 42 | with tf.device(device_spec): 43 | # Create variables to hold the smoothed versions of all trainable 44 | # variables. 
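            # The update op built below implements
            #     s <- (1 - f*N) * s + (f*N) * v
            # with f = smoothing_factor and N = update_frequency. With purely
            # illustrative values f = 1e-4 and N = 5, every fifth step applies an
            # effective factor of 5e-4, approximating a per-step factor of 1e-4.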
45 | smooth_vars = {} 46 | for v in tf.compat.v1.trainable_variables(): 47 | assert v.name[-2:] == ":0" 48 | name = v.name[:-2] + "_smooth" 49 | s = tf.compat.v1.get_variable(name=name, 50 | initializer=tf.zeros_like(v), 51 | trainable=False, 52 | use_resource=True) 53 | smooth_vars[v.name] = s 54 | # Define the ops to update the smoothed variables. 55 | self._update_ops = [] 56 | for v in tf.compat.v1.trainable_variables(): 57 | s = smooth_vars[v.name] 58 | updated_s = (1 - adjusted_smoothing_factor) * s \ 59 | + adjusted_smoothing_factor * v 60 | self._update_ops += [tf.compat.v1.assign(s, updated_s)] 61 | # Define the ops to swap the raw and smoothed variables. 62 | self._swap_ops = [] 63 | for v in tf.compat.v1.trainable_variables(): 64 | s = smooth_vars[v.name] 65 | v_value = v.read_value() 66 | s_value = s.read_value() 67 | with tf.control_dependencies([v_value, s_value]): 68 | self._swap_ops += [v.assign(s_value)] 69 | self._swap_ops += [s.assign(v_value)] 70 | 71 | @property 72 | def update_ops(self): 73 | return self._update_ops 74 | 75 | @property 76 | def swap_ops(self): 77 | return self._swap_ops 78 | 79 | @property 80 | def update_frequency(self): 81 | return self._update_frequency 82 | -------------------------------------------------------------------------------- /nematus/initializers.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Parameter initializers 3 | ''' 4 | 5 | import numpy 6 | 7 | def ortho_weight(ndim): 8 | W = numpy.random.randn(ndim, ndim) 9 | u, s, v = numpy.linalg.svd(W) 10 | return u.astype('float32') 11 | 12 | def norm_weight(nin, nout=None, scale=0.01, ortho=True): 13 | if nout is None: 14 | nout = nin 15 | if nout == nin and ortho: 16 | W = ortho_weight(nin) 17 | else: 18 | W = scale * numpy.random.randn(nin, nout) 19 | return W.astype('float32') 20 | 21 | -------------------------------------------------------------------------------- /nematus/learning_schedule.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | class ConstantSchedule(object): 5 | """Implements a trivial learning schedule with a fixed learning rate.""" 6 | 7 | def __init__(self, learning_rate): 8 | """Builds TF graph nodes defining the learning rate function. 9 | 10 | Args: 11 | learning_rate: a float specifying the learning rate. 12 | """ 13 | self._learning_rate = tf.constant(learning_rate) 14 | 15 | @property 16 | def learning_rate(self): 17 | return self._learning_rate 18 | 19 | 20 | class TransformerSchedule(object): 21 | """Implements the learning schedule from the original Transformer paper. 22 | 23 | See Section 5.3 of "Attention Is All You Need" (Vaswani et al., 2017). 24 | """ 25 | 26 | def __init__(self, global_step, dim, warmup_steps): 27 | """Builds TF graph nodes defining the learning rate function. 28 | 29 | Args: 30 | global_step: a tf.Variable containing the current update step. 31 | dim: an integer specifying the model's hidden state size. 32 | warmup_steps: an integer specifying the number of warm-up steps. 33 | """ 34 | t = tf.cast(global_step+1, tf.float32) 35 | a = tf.pow(t, -0.5) 36 | b = t * (warmup_steps ** (-1.5)) 37 | self._learning_rate = dim ** (-0.5) * tf.minimum(a, b) 38 | 39 | @property 40 | def learning_rate(self): 41 | return self._learning_rate 42 | 43 | 44 | class WarmupPlateauDecaySchedule(object): 45 | """Implements a parameterized warm-up / plateau / decay learning schedule. 
46 | 47 | The schedule begins with a warm-up phase where the learning rate is 48 | linearly increased from zero to the peak learning rate. The rate is then 49 | held constant for a pre-defined period (possibly zero steps, making this 50 | phase optional). Finally the rate is decayed (currently according to an 51 | inverse square-root function, but this could be made configurable in the 52 | future). 53 | """ 54 | 55 | def __init__(self, global_step, peak_learning_rate, warmup_steps, 56 | plateau_steps): 57 | """Builds TF graph nodes defining the learning rate function. 58 | 59 | Args: 60 | global_step: a tf.Variable containing the current update step. 61 | peak_learning_rate: a float specifying the peak learning rate. 62 | warmup_steps: an integer specifying the number of warm-up steps. 63 | plateau_steps: an integer specifying the number of plateau steps. 64 | """ 65 | t = tf.cast(global_step+1, tf.float32) 66 | warmup_float = tf.cast(warmup_steps, tf.float32) 67 | # Function a: warmup 68 | a = (t / warmup_float) * peak_learning_rate 69 | # Function b: plateau 70 | b = peak_learning_rate 71 | # Function c: decay 72 | decay_start = warmup_float + plateau_steps 73 | c = (tf.sqrt(decay_start) / tf.sqrt(t)) * peak_learning_rate 74 | # Take the minimum of a, b, and c. This will be a for t < warmup_steps, 75 | # c for t > decay_start, and b in-between. 76 | self._learning_rate = tf.minimum(tf.minimum(a, b), c) 77 | 78 | @property 79 | def learning_rate(self): 80 | return self._learning_rate 81 | -------------------------------------------------------------------------------- /nematus/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdinburghNLP/nematus/49d050863bc9644b8c0a9d9ab6e54ccd30f927dd/nematus/metrics/__init__.py -------------------------------------------------------------------------------- /nematus/metrics/beer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import subprocess, threading 5 | 6 | # ModuleNotFoundError is new in 3.6; older versions will throw SystemError 7 | if sys.version_info < (3, 6): 8 | ModuleNotFoundError = SystemError 9 | 10 | try: 11 | from .scorer import Scorer 12 | from .reference import Reference 13 | except (ModuleNotFoundError, ImportError) as e: 14 | from metrics.scorer import Scorer 15 | from metrics.reference import Reference 16 | 17 | 18 | class BeerError(Exception): 19 | def __init__(self, value): 20 | self.value = value 21 | def __str__(self): 22 | return repr(self.value) 23 | 24 | class BeerScorer(Scorer): 25 | """ 26 | Python wrapper for the BEER metric. Starts a BEER process and keeps it alive, so that the model 27 | can be kept in memeory. Arguments are the BEER language abbreviation and the path to the BEER 28 | installation. They need to be specified as follows:"beer_language=lg,beer_path=path" (any order). 
29 | """ 30 | def __init__(self, argument_string): 31 | Scorer.__init__(self, argument_string) 32 | 33 | #Lock for the BEER process, which can only handle one request at a time: 34 | self.lock = threading.Lock() 35 | 36 | #Get necessary arguments for starting BEER from argument string parsed in Scorer.__init__() 37 | self._beer_language = self._arguments["beer_language"] 38 | self._beer_path = self._arguments["beer_path"] + "/" 39 | 40 | #Start a BEER process: 41 | command = self._beer_path+"beer -l "+self._beer_language+" --workingMode interactive " 42 | self.beer_process = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) 43 | 44 | def set_reference(self, reference_tokens): 45 | """ 46 | Construct a BeerReference from a sequence of tokens and make it the reference against which the scorer evaluates hypotheses. 47 | This can be done any time. 48 | """ 49 | self.lock.acquire() 50 | self._reference = BeerReference(reference_tokens, self) 51 | self.lock.release() 52 | 53 | def terminate_process(self): 54 | """ 55 | Waits for the current request to be processed and terminates the BEER process. 56 | """ 57 | self.lock.acquire() 58 | self.beer_process.terminate() 59 | self.lock.release() 60 | 61 | def kill_process(self): 62 | """ 63 | Kills the BEER process right away. 64 | """ 65 | self.beer_process.kill() 66 | 67 | class BeerReference(Reference): 68 | """ 69 | BEER reference object, against which hypotheses can be scored. 70 | """ 71 | def __init__(self, reference_tokens, beer_scorer): 72 | Reference.__init__(self, reference_tokens) 73 | 74 | #Construct reference string from tokens 75 | self._reference_string = " ".join(reference_tokens) 76 | self._beer_scorer = beer_scorer 77 | 78 | def score(self, hypothesis_tokens): 79 | 80 | #Construct hypothesis string from hypothesis tokens: 81 | hypothesis_string = " ".join(hypothesis_tokens) 82 | 83 | #Acquire lock to make sure BEER process is not in use: 84 | self._beer_scorer.lock.acquire() 85 | 86 | #Score hypothesis string against reference string 87 | try: 88 | self._beer_scorer.beer_process.stdin.write("EVAL ||| "+hypothesis_string+" ||| "+self._reference_string+"\n") 89 | except: 90 | raise BeerError("Beer returned the following error: "+ self._beer_scorer.beer_process.stderr.readline().strip()) 91 | 92 | #Read feature values from process output 93 | std_out = self._beer_scorer.beer_process.stdout.readline() 94 | #Release the process lock 95 | self._beer_scorer.lock.release() 96 | 97 | #Check if BEER returned a score: 98 | try: 99 | n = float(std_out) 100 | except: 101 | raise BeerError("Beer returned the following error: "+ self._beer_scorer.beer_process.stderr.readline().strip()) 102 | #Return final score 103 | return n 104 | -------------------------------------------------------------------------------- /nematus/metrics/chrf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | # ModuleNotFoundError is new in 3.6; older versions will throw SystemError 6 | if sys.version_info < (3, 6): 7 | ModuleNotFoundError = SystemError 8 | 9 | try: 10 | from .scorer import Scorer 11 | from .reference import Reference 12 | except (ModuleNotFoundError, ImportError) as e: 13 | from metrics.scorer import Scorer 14 | from metrics.reference import Reference 15 | 16 | class CharacterFScorer(Scorer): 17 | """ 18 | Scores CharacterFScoreReference objects. 
19 | """ 20 | 21 | def __init__(self, argument_string): 22 | """ 23 | Initialises metric-specific parameters. 24 | """ 25 | Scorer.__init__(self, argument_string) 26 | # use character n-gram order of 4 by default 27 | if not 'n' in list(self._arguments.keys()): 28 | self._arguments['n'] = 6 29 | # use beta = 1 by default (recommendation by Maja Popovic for generative modelling) 30 | if not 'beta' in list(self._arguments.keys()): 31 | self._arguments['beta'] = 1 32 | 33 | def set_reference(self, reference_tokens): 34 | """ 35 | Sets the reference against hypotheses are scored. 36 | """ 37 | self._reference = CharacterFScoreReference( 38 | reference_tokens, 39 | self._arguments['n'], 40 | self._arguments['beta'] 41 | ) 42 | 43 | class CharacterFScoreReference(Reference): 44 | """ 45 | References for Character F-Score, as proposed by Popovic (2015): http://www.statmt.org/wmt15/pdf/WMT49.pdf 46 | """ 47 | 48 | def __init__(self, reference_tokens, n=6, beta=1): 49 | """ 50 | @param reference the reference translation that hypotheses shall be 51 | scored against. 52 | @param n maximum character n-gram order to consider. 53 | @param beta algorithm paramater beta (interpolation weight, needs to be > 0). 54 | """ 55 | if beta <= 0: 56 | raise ValueError("Value of beta needs to be larger than zero!") 57 | 58 | Reference.__init__(self, reference_tokens) 59 | self.n = n 60 | self.max_order = n 61 | self.beta_squared = beta ** 2 62 | 63 | # The paper specifies that whitespace is ignored, but for a training objective, 64 | #it's perhaps better to leave it in. According to the paper, it makes no 65 | #difference in practise for scoring. 66 | self._reference_string = " ".join(reference_tokens).strip() 67 | 68 | # Get n-grams from reference: 69 | self._reference_ngrams = self._get_ngrams(self._reference_string, self.n) 70 | 71 | def _get_ngrams(self, tokens, n): 72 | """ 73 | Extracts all n-grams up to order @param n from a list of @param tokens. 74 | """ 75 | n_grams_dict = {} 76 | length = len(tokens) 77 | #If the reference is shorter than n characters, insist on an exact match: 78 | if len(tokens) < n: 79 | self.max_order = len(tokens) 80 | m = 1 81 | while m <= n: #n-gram order 82 | i = m 83 | n_grams_list = [] 84 | order_dict = {} 85 | while (i <= length): 86 | n_grams_list.append(tokens[i-m:i]) 87 | i += 1 88 | for ngr in n_grams_list: 89 | order_dict[ngr] = order_dict.setdefault(ngr,0) + 1 90 | n_grams_dict[m] = order_dict 91 | m += 1 92 | return n_grams_dict 93 | 94 | def score(self, hypothesis_tokens): 95 | """ 96 | Scores @param hypothesis against this reference. 97 | 98 | @return the sentence-level ChrF score: 1.0 is best, 0.0 worst. 99 | """ 100 | #See comment above on treating whitespace. 
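        # Worked example (mirrors test_chrf.test_half_right): reference "AB" vs.
        # hypothesis "AA" with n=6, beta=3 yields chrP = chrR = 0.25, so
        # (1 + beta^2) * chrP * chrR / (beta^2 * chrP + chrR) = 0.25.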
101 | hypothesis_string = " ".join(hypothesis_tokens).strip() 102 | 103 | #If the hypothesis or the reference is empty, insist on an exact match: 104 | if len(self._reference_string) < 1 or len(hypothesis_string) < 1: 105 | if hypothesis_string == self._reference_string: 106 | return 1.0 107 | else: 108 | return 0.0 109 | 110 | hypothesis_ngrams = self._get_ngrams(hypothesis_string, self.n) 111 | 112 | #Calculate character precision: 113 | chrP = 0.0 114 | chrR = 0.0 115 | for m in range(1,self.n+1): 116 | hyp_count = 0.0 117 | count_total = 0.0 118 | count_in = 0.0 119 | for ngr in hypothesis_ngrams[m]: 120 | hyp_count = hypothesis_ngrams[m][ngr] 121 | count_total += hyp_count 122 | if ngr in self._reference_ngrams[m]: 123 | count_in += min(hyp_count, self._reference_ngrams[m][ngr]) 124 | #Catch division by zero: 125 | if count_total == 0.0: 126 | chrP += 0.0 127 | else: 128 | chrP += count_in / count_total 129 | #average chrP over n-gram orders: 130 | chrP = chrP / float(self.max_order) 131 | 132 | #Calculate character recall: 133 | for m in range(1,self.n+1): 134 | ref_count = 0.0 135 | count_total = 0.0 136 | count_in = 0.0 137 | for ngr in self._reference_ngrams[m]: 138 | ref_count = self._reference_ngrams[m][ngr] 139 | count_total += ref_count 140 | if ngr in hypothesis_ngrams[m]: 141 | count_in += min(ref_count, hypothesis_ngrams[m][ngr]) 142 | #Catch division by zero: 143 | if count_total == 0.0: 144 | chrR += 0.0 145 | else: 146 | chrR += count_in/count_total 147 | #average chrR over n-gram orders: 148 | chrR = chrR / float(self.max_order) 149 | 150 | #Catch division by zero: 151 | if chrP == 0.0 and chrR == 0.0: 152 | return 0.0 153 | return (1 + self.beta_squared) * (chrP*chrR) / ((self.beta_squared * chrP) + chrR) 154 | -------------------------------------------------------------------------------- /nematus/metrics/meteor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import subprocess, threading 5 | 6 | # ModuleNotFoundError is new in 3.6; older versions will throw SystemError 7 | if sys.version_info < (3, 6): 8 | ModuleNotFoundError = SystemError 9 | 10 | try: 11 | from .scorer import Scorer 12 | from .reference import Reference 13 | except (ModuleNotFoundError, ImportError) as e: 14 | from metrics.scorer import Scorer 15 | from metrics.reference import Reference 16 | 17 | class MeteorError(Exception): 18 | def __init__(self, value): 19 | self.value = value 20 | def __str__(self): 21 | return repr(self.value) 22 | 23 | class MeteorScorer(Scorer): 24 | """ 25 | Python wrapper for the METEOR metric. Starts a METEOR process and keeps it alive, so that the model 26 | can be kept in memeory. Arguments are the meteor language abbreviation and the path to the METEOR 27 | installation. They need to be specified as follows:"meteor_language=lg,meteor_path=path" (any order). 
28 | """ 29 | def __init__(self, argument_string): 30 | Scorer.__init__(self, argument_string) 31 | 32 | #Lock for the METEOR process, which can only handle one request at a time: 33 | self.lock = threading.Lock() 34 | 35 | #Get necessary arguments for starting METEOR from argument string parsed in Scorer.__init__() 36 | self._meteor_language = self._arguments["meteor_language"] 37 | self._meteor_path = self._arguments["meteor_path"] + "/" 38 | 39 | #Start a METEOR process: 40 | command = "java -Xmx2G -jar "+self._meteor_path+"meteor-*.jar - - -l "+self._meteor_language+" -stdio" 41 | self.meteor_process = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) 42 | 43 | def set_reference(self, reference_tokens): 44 | """ 45 | Construct a MeteorReference from a sequence of tokens and make it the reference against which the scorer evaluates hypotheses. 46 | This can be done any time. 47 | """ 48 | self.lock.acquire() 49 | self._reference = MeteorReference(reference_tokens, self) 50 | self.lock.release() 51 | 52 | def terminate_process(self): 53 | """ 54 | Waits for the current request to be processed and terminates the METEOR process. 55 | """ 56 | self.lock.acquire() 57 | self.meteor_process.terminate() 58 | self.lock.release() 59 | 60 | def kill_process(self): 61 | """ 62 | Kills the METEOR process right away. 63 | """ 64 | self.meteor_process.kill() 65 | 66 | class MeteorReference(Reference): 67 | """ 68 | METEOR reference object, against which hypotheses can be scored. 69 | """ 70 | def __init__(self, reference_tokens, meteor_scorer): 71 | Reference.__init__(self, reference_tokens) 72 | 73 | #Construct reference string from tokens 74 | self._reference_string = " ".join(reference_tokens) 75 | self._meteor_scorer = meteor_scorer 76 | 77 | def score(self, hypothesis_tokens): 78 | 79 | #Construct hypothesis string from hypothesis tokens: 80 | hypothesis_string = " ".join(hypothesis_tokens) 81 | 82 | #Acquire lock to make sure METEOR process is not in use: 83 | self._meteor_scorer.lock.acquire() 84 | 85 | #Score hypothesis string against reference string 86 | try: 87 | self._meteor_scorer.meteor_process.stdin.write("SCORE ||| "+self._reference_string+" ||| "+hypothesis_string+"\n") 88 | except: 89 | raise MeteorError("Meteor returned the following error: "+ self._meteor_scorer.meteor_process.stderr.readline().strip()) 90 | 91 | #Read feature values from process output 92 | std_out = self._meteor_scorer.meteor_process.stdout.readline() 93 | 94 | #Pass feature values to METEOR process for computation of the final score 95 | try: 96 | self._meteor_scorer.meteor_process.stdin.write("EVAL ||| "+std_out) 97 | except: 98 | raise MeteorError("Meteor returned the following error: "+ self._meteor_scorer.meteor_process.stderr.readline().strip()) 99 | std_out = self._meteor_scorer.meteor_process.stdout.readline() 100 | 101 | #Release the process lock 102 | self._meteor_scorer.lock.release() 103 | 104 | #Check if Meteor returned a score: 105 | try: 106 | n = float(std_out) 107 | except: 108 | raise MeteorError("Meteor returned the following error: "+ self._meteor_scorer.meteor_process.stderr.readline().strip()) 109 | 110 | #Return final score 111 | return n 112 | -------------------------------------------------------------------------------- /nematus/metrics/reference.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from abc import ABCMeta, abstractmethod 4 | 5 | class 
Reference(metaclass=ABCMeta): 6 | """ 7 | Abstract base class for re-usable translation reference. Hypotheses can be 8 | scored against this reference through the evaluation metric implemented in 9 | its `score` function. 10 | """ 11 | 12 | def __init__(self, reference_tokens): 13 | """ 14 | @param reference the reference translation that hypotheses shall be 15 | scored against. 16 | """ 17 | self._reference_tokens = reference_tokens 18 | #additional (metric-specific) parameters to be defined in subclass 19 | 20 | @abstractmethod 21 | def score(self, hypothesis_tokens): 22 | """ 23 | Scores @param hypothesis against this reference. 24 | """ 25 | pass #to be implemented in sublcass 26 | 27 | def score_matrix(self, hypothesis_matrix): 28 | """ 29 | Scores every hypothesis in @param hypotheses against this reference. 30 | @param hypothesis_matrix an iterable of iterables of tokens. 31 | """ 32 | return [self.score(hypothesis_tokens) for hypothesis_tokens in hypothesis_matrix] 33 | -------------------------------------------------------------------------------- /nematus/metrics/scorer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from abc import ABCMeta, abstractmethod 4 | 5 | class Scorer(metaclass=ABCMeta): 6 | """ 7 | Abstract base class for MT evaluation metric. Can be passed on to a 8 | Reference for scoring translation hypotheses. 9 | """ 10 | 11 | def __init__(self, argument_string): 12 | """ 13 | @param argument_string the metric-specific parameters (such as n-gram 14 | order for BLEU, language for METEOR, etc.) 15 | """ 16 | # parse arguments 17 | self._reference = None # to be set via `self.set_reference()` 18 | self._arguments = {} 19 | if argument_string: 20 | argument_strings = argument_string.split(",") 21 | for a in argument_strings: 22 | argument, value = a.split("=") 23 | argument = argument.strip() 24 | value = value.strip() 25 | try: 26 | value = int(value) # change type to int if applicable 27 | except ValueError: 28 | value = value 29 | self._arguments[argument] = value 30 | 31 | @abstractmethod 32 | def set_reference(self, reference_tokens): 33 | """ 34 | Sets the reference against which one or many hypotheses can be scored 35 | via `self.score()` and `self.score_matrix()`. 36 | """ 37 | pass # instantiate a Reference object and store it at self._reference 38 | 39 | def score(self, hypothesis_tokens): 40 | """ 41 | Scores @param hypothesis against this reference. 42 | """ 43 | return self._reference.score(hypothesis_tokens) 44 | 45 | def score_matrix(self, hypothesis_matrix): 46 | """ 47 | Scores every hypothesis in @param hypotheses against this reference. 48 | @param hypothesis_matrix an iterable of iterables of tokens. 49 | """ 50 | return self._reference.score_matrix(hypothesis_matrix) 51 | -------------------------------------------------------------------------------- /nematus/metrics/scorer_interpolator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | # ModuleNotFoundError is new in 3.6; older versions will throw SystemError 6 | if sys.version_info < (3, 6): 7 | ModuleNotFoundError = SystemError 8 | 9 | try: 10 | from .scorer import Scorer 11 | from . 
import scorer_provider as sp 12 | except (ModuleNotFoundError, ImportError) as e: 13 | from metrics.scorer import Scorer 14 | from metrics import scorer_provider as sp 15 | 16 | class ScorerInterpolator(Scorer): 17 | """ 18 | Creates a scorer that interpolates scores from 1..n sub-scorers, e.g., 19 | 0.5 * SENTENCEBLEU + 0.5 * METEOR. 20 | """ 21 | 22 | def __init__(self, config_string): 23 | """ 24 | @param config_string example: 25 | `INTERPOLATE w=0.5,0.5; SENTENCEBLEU n=4; METEOR meteor_language=fr, meteor_path=/foo/bar/meteor` 26 | """ 27 | self._scorers = [] 28 | self._weights = [] 29 | # parse arguments 30 | scorers = config_string.split(";") 31 | scorers = [scorer.strip() for scorer in scorers] 32 | try: 33 | instruction, weights = scorers[0].split("w=") 34 | assert instruction.strip() == "INTERPOLATE" 35 | weights = [float(w) for w in weights.split(',')] 36 | scorers = [sp.ScorerProvider().get(s) for s in scorers[1:]] 37 | except: 38 | raise SyntaxError("Ill-formated interpolation of metrics. Example of valid definition: `INTERPOLATE w=0.5,0.5`.") 39 | # assertions 40 | assert len(weights) == len(scorers) 41 | assert sum(weights) == 1.0 42 | # init scorers 43 | for i, scorer in enumerate(scorers): 44 | self._scorers.append(scorer) 45 | self._weights.append(weights[i]) 46 | 47 | def set_reference(self, reference_tokens): 48 | """ 49 | Sets the reference against which one or many hypotheses can be scored 50 | via `self.score()` and `self.score_matrix()`. 51 | """ 52 | for scorer in self._scorers: 53 | scorer.set_reference(reference_tokens) 54 | 55 | def score(self, hypothesis_tokens): 56 | """ 57 | Scores @param hypothesis with all scorers added via `self.add_scorer` 58 | and interpolates the scores with the respective weights. 59 | """ 60 | return sum([s.score(hypothesis_tokens) * w for w, s in zip(self._weights, self._scorers)]) 61 | 62 | def score_matrix(self, hypothesis_matrix): 63 | """ 64 | Scores every hypothesis in @param hypotheses with all scorers added via 65 | `self.add_scorer` and interpolates the scores with the respective 66 | weights. 67 | """ 68 | return sum([s.score_matrix(hypothesis_matrix) * w for w, s in zip(self._weights, self._scorers)]) 69 | -------------------------------------------------------------------------------- /nematus/metrics/scorer_provider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | # ModuleNotFoundError is new in 3.6; older versions will throw SystemError 6 | if sys.version_info < (3, 6): 7 | ModuleNotFoundError = SystemError 8 | 9 | try: 10 | from . import scorer_interpolator as si 11 | from .sentence_bleu import SentenceBleuScorer 12 | from .meteor import MeteorScorer 13 | from .beer import BeerScorer 14 | from .chrf import CharacterFScorer 15 | except: 16 | from metrics import scorer_interpolator as si 17 | from metrics.sentence_bleu import SentenceBleuScorer 18 | from metrics.meteor import MeteorScorer 19 | from metrics.beer import BeerScorer 20 | from metrics.chrf import CharacterFScorer 21 | 22 | class ScorerProvider: 23 | """ 24 | Parses a config string and returns a matching scorer object with the given 25 | parameters. 26 | """ 27 | #from bleu import SentenceBleuScorer 28 | 29 | def __init__(self): 30 | pass 31 | 32 | def get(self, config_string): 33 | """ 34 | Returns a scorer matching the metric and parameters defined in @param 35 | config string. 
36 | 37 | Example: ScorerProvider.get("BLEU n=4") returns a SmoothedBleuScorer 38 | object that considers n-gram precision up to n=4. 39 | 40 | If more than one metrics are provided (separated by `;`), 41 | an interpolated scorer will be returned. 42 | 43 | Example: ScorerProvider.get("INTERPOLATE w=0.5,0.5; SENTENCEBLEU n=4; METEOR meteor_language=fr, meteor_path=/foo/bar/meteor") 44 | returns an InterpolatedScorer object that scores hypotheses 45 | using 0.5 * bleu_score + 0.5 * meteor_score. 46 | """ 47 | # interpolation 48 | if config_string.startswith("INTERPOLATE"): 49 | return si.ScorerInterpolator(config_string) 50 | try: 51 | scorer, arguments = config_string.split(" ", 1) 52 | except ValueError: 53 | scorer = config_string 54 | arguments = '' 55 | if scorer == 'SENTENCEBLEU': 56 | return SentenceBleuScorer(arguments) 57 | elif scorer == 'METEOR': 58 | return MeteorScorer(arguments) 59 | elif scorer == 'BEER': 60 | return BeerScorer(arguments) 61 | elif scorer == 'CHRF': 62 | return CharacterFScorer(arguments) 63 | # add other scorers here 64 | else: 65 | raise NotImplementedError("No such scorer: %s" % scorer) 66 | -------------------------------------------------------------------------------- /nematus/metrics/sentence_bleu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | from math import exp 5 | from operator import mul 6 | from collections import defaultdict 7 | from functools import reduce 8 | 9 | # ModuleNotFoundError is new in 3.6; older versions will throw SystemError 10 | if sys.version_info < (3, 6): 11 | ModuleNotFoundError = SystemError 12 | 13 | try: 14 | from .scorer import Scorer 15 | from .reference import Reference 16 | except (ModuleNotFoundError, ImportError) as e: 17 | from metrics.scorer import Scorer 18 | from metrics.reference import Reference 19 | 20 | class SentenceBleuScorer(Scorer): 21 | """ 22 | Scores SmoothedBleuReference objects. 23 | """ 24 | 25 | def __init__(self, argument_string): 26 | """ 27 | Initialises metric-specific parameters. 28 | """ 29 | Scorer.__init__(self, argument_string) 30 | # use n-gram order of 4 by default 31 | if not 'n' in list(self._arguments.keys()): 32 | self._arguments['n'] = 4 33 | 34 | def set_reference(self, reference_tokens): 35 | """ 36 | Sets the reference against hypotheses are scored. 37 | """ 38 | self._reference = SentenceBleuReference( 39 | reference_tokens, 40 | self._arguments['n'] 41 | ) 42 | 43 | class SentenceBleuReference(Reference): 44 | """ 45 | Smoothed sentence-level BLEU as as proposed by Lin and Och (2004). 46 | Implemented as described in (Chen and Cherry, 2014). 47 | """ 48 | 49 | def __init__(self, reference_tokens, n=4): 50 | """ 51 | @param reference the reference translation that hypotheses shall be 52 | scored against. Must be an iterable of tokens (any 53 | type). 54 | @param n maximum n-gram order to consider. 55 | """ 56 | Reference.__init__(self, reference_tokens) 57 | self.n = n 58 | # preprocess reference 59 | self._reference_length = len(self._reference_tokens) 60 | self._reference_ngrams = self._get_ngrams(self._reference_tokens, self.n) 61 | 62 | def _get_ngrams(self, tokens, max_n): 63 | """ 64 | Extracts all n-grams of order 1 up to (and including) @param max_n from 65 | a list of @param tokens. 
66 | """ 67 | n_grams = [] 68 | for n in range(1, max_n+1): 69 | n_grams.append(defaultdict(int)) 70 | for n_gram in zip(*[tokens[i:] for i in range(n)]): 71 | n_grams[n-1][n_gram] += 1 72 | return n_grams 73 | 74 | def score(self, hypothesis_tokens): 75 | """ 76 | Scores @param hypothesis against this reference. 77 | 78 | @return the smoothed sentence-level BLEU score: 1.0 is best, 0.0 worst. 79 | """ 80 | def product(iterable): 81 | return reduce(mul, iterable, 1) 82 | def ngram_precisions(ref_ngrams, hyp_ngrams): 83 | precisions = [] 84 | for n in range(1, self.n+1): 85 | overlap = 0 86 | for ref_ngram, ref_ngram_count in list(ref_ngrams[n-1].items()): 87 | if ref_ngram in hyp_ngrams[n-1]: 88 | overlap += min(ref_ngram_count, hyp_ngrams[n-1][ref_ngram]) 89 | hyp_length = max(0, len(hypothesis_tokens)-n+1) 90 | if n >= 2: 91 | # smoothing as proposed by Lin and Och (2004), 92 | # implemented as described in (Chen and Cherry, 2014) 93 | overlap += 1 94 | hyp_length += 1 95 | precisions.append(overlap/hyp_length if hyp_length > 0 else 0.0) 96 | return precisions 97 | def brevity_penalty(ref_length, hyp_length): 98 | return min(1.0, exp(1-(ref_length/hyp_length if hyp_length > 0 else 0.0))) 99 | # preprocess hypothesis 100 | hypothesis_length = len(hypothesis_tokens) 101 | hypothesis_ngrams = self._get_ngrams(hypothesis_tokens, self.n) 102 | # calculate n-gram precision for all orders 103 | np = ngram_precisions(self._reference_ngrams, hypothesis_ngrams) 104 | # calculate brevity penalty 105 | bp = brevity_penalty(self._reference_length, hypothesis_length) 106 | # compose final BLEU score 107 | return product(np)**(1/self.n) * bp 108 | -------------------------------------------------------------------------------- /nematus/metrics/test_chrf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import unittest 4 | 5 | from metrics.chrf import CharacterFScorer 6 | 7 | class TestCharacterFScoreReference(unittest.TestCase): 8 | """ 9 | Regression tests for SmoothedBleuReference 10 | """ 11 | @staticmethod 12 | def tokenize(sentence): 13 | return sentence.split(" ") 14 | def test_identical_segments(self): 15 | segment = self.tokenize("Consistency is the last refuge of the unimaginative") 16 | scorer = CharacterFScorer('n=6,beta=3') 17 | scorer.set_reference(segment) 18 | self.assertEqual(scorer.score(segment), 1.0) 19 | def test_completely_different_segments(self): 20 | segment_a = self.tokenize("AAAAAA") 21 | segment_b = self.tokenize("BBBB") 22 | scorer = CharacterFScorer('n=3,beta=3') 23 | scorer.set_reference(segment_a) 24 | self.assertEqual(scorer.score(segment_b), 0.0) 25 | def test_empty_string(self): 26 | segment_a = self.tokenize("") 27 | segment_b = self.tokenize("") 28 | scorer = CharacterFScorer('n=6,beta=3') 29 | scorer.set_reference(segment_a) 30 | self.assertEqual(scorer.score(segment_b), 1.0) 31 | def test_one_character_empty_string(self): 32 | segment_a = self.tokenize("A") 33 | segment_b = self.tokenize("") 34 | scorer = CharacterFScorer('n=6,beta=3') 35 | scorer.set_reference(segment_a) 36 | self.assertEqual(scorer.score(segment_b), 0.0) 37 | def test_empty_string_one_character(self): 38 | segment_a = self.tokenize("") 39 | segment_b = self.tokenize("A") 40 | scorer = CharacterFScorer('n=6,beta=3') 41 | scorer.set_reference(segment_a) 42 | self.assertEqual(scorer.score(segment_b), 0.0) 43 | def test_half_right(self): 44 | segment_a = self.tokenize("AB") 45 | segment_b = self.tokenize("AA") 46 | scorer = 
CharacterFScorer('n=6,beta=3') 47 | scorer.set_reference(segment_a) 48 | self.assertEqual(scorer.score(segment_b), 0.25) 49 | def test_one_character(self): 50 | segment_a = self.tokenize("A") 51 | segment_b = self.tokenize("A") 52 | scorer = CharacterFScorer('n=6,beta=3') 53 | scorer.set_reference(segment_a) 54 | self.assertEqual(scorer.score(segment_b), 1.0) 55 | def test_almost_correct(self): 56 | segment_a = self.tokenize("risk assessment has to be undertaken by those who are qualified and expert in that area - that is the scientists .") 57 | segment_b = self.tokenize(" risk assessment must be made of those who are qualified and expertise in the sector - these are the scientists .") 58 | scorer = CharacterFScorer('n=6,beta=3') 59 | scorer.set_reference(segment_a) 60 | self.assertEqual('{0:.12f}'.format(scorer.score(segment_b)), "0.652414427449") 61 | 62 | if __name__ == '__main__': 63 | unittest.main() 64 | -------------------------------------------------------------------------------- /nematus/metrics/test_scorer_provider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import unittest 4 | 5 | from metrics.scorer_provider import ScorerProvider 6 | from metrics.sentence_bleu import SentenceBleuScorer 7 | 8 | class TestScorerProvider(unittest.TestCase): 9 | """ 10 | Regression tests for ScorerProvider 11 | """ 12 | @staticmethod 13 | def tokenize(sentence): 14 | return sentence.split(" ") 15 | 16 | def test_single_metric(self): 17 | config_string = "SENTENCEBLEU n=4" 18 | segment = self.tokenize("Consistency is the last refuge of the unimaginative") 19 | reference_scorer = SentenceBleuScorer('n=4') 20 | provided_scorer = ScorerProvider().get(config_string) 21 | reference_scorer.set_reference(segment) 22 | provided_scorer.set_reference(segment) 23 | self.assertEqual( 24 | reference_scorer.score(segment), 25 | provided_scorer.score(segment) 26 | ) 27 | 28 | def test_interpolated_metrics(self): 29 | config_string = "INTERPOLATE w=0.3,0.7; SENTENCEBLEU n=4; SENTENCEBLEU n=4" 30 | segment = self.tokenize("Consistency is the last refuge of the unimaginative") 31 | reference_scorer = SentenceBleuScorer('n=4') 32 | provided_scorer = ScorerProvider().get(config_string) # interpolating BLEU with BLEU should obviously result in the same as just using a single BLEU scorer 33 | reference_scorer.set_reference(segment) 34 | provided_scorer.set_reference(segment) 35 | self.assertEqual( 36 | reference_scorer.score(segment), 37 | provided_scorer.score(segment) 38 | ) 39 | 40 | 41 | if __name__ == '__main__': 42 | unittest.main() 43 | -------------------------------------------------------------------------------- /nematus/metrics/test_sentence_bleu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import unittest 4 | 5 | from metrics.sentence_bleu import SentenceBleuScorer 6 | 7 | class TestSentenceBleuReference(unittest.TestCase): 8 | """ 9 | Regression tests for SmoothedBleuReference 10 | """ 11 | @staticmethod 12 | def tokenize(sentence): 13 | return sentence.split(" ") 14 | def test_identical_segments(self): 15 | segment = self.tokenize("Consistency is the last refuge of the unimaginative") 16 | scorer = SentenceBleuScorer('n=4') 17 | scorer.set_reference(segment) 18 | self.assertEqual(scorer.score(segment), 1.0) 19 | def test_completely_different_segments(self): 20 | segment_a = self.tokenize("A A A") 21 | segment_b = self.tokenize("B B B") 22 | scorer = 
SentenceBleuScorer('n=4') 23 | scorer.set_reference(segment_a) 24 | self.assertEqual(scorer.score(segment_b), 0.0) 25 | def test_clipping(self): 26 | segment_a = self.tokenize("The very nice man") 27 | segment_b = self.tokenize("man man man man") 28 | scorer = SentenceBleuScorer('n=1') 29 | scorer.set_reference(segment_a) 30 | self.assertNotEqual(scorer.score(segment_b), 1.0) 31 | 32 | if __name__ == '__main__': 33 | unittest.main() 34 | -------------------------------------------------------------------------------- /nematus/model_inputs.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | class ModelInputs(object): 5 | def __init__(self, config): 6 | # variable dimensions 7 | seq_len, batch_size, mrt_sampleN= None, None, None 8 | # mrt_sampleN = batch_size X sampleN 9 | 10 | self.x = tf.compat.v1.placeholder( 11 | name='x', 12 | shape=(config.factors, seq_len, batch_size), 13 | dtype=tf.int32) 14 | 15 | self.x_mask = tf.compat.v1.placeholder( 16 | name='x_mask', 17 | shape=(seq_len, batch_size), 18 | dtype=tf.float32) 19 | 20 | self.y = tf.compat.v1.placeholder( 21 | name='y', 22 | shape=(seq_len, batch_size), 23 | dtype=tf.int32) 24 | 25 | self.y_mask = tf.compat.v1.placeholder( 26 | name='y_mask', 27 | shape=(seq_len, batch_size), 28 | dtype=tf.float32) 29 | 30 | self.scores = tf.compat.v1.placeholder( 31 | name='scores', 32 | shape=(mrt_sampleN), 33 | dtype=tf.float32) 34 | 35 | self.index = tf.compat.v1.placeholder( 36 | name='index', 37 | shape=(mrt_sampleN), 38 | dtype=tf.int32) 39 | 40 | self.training = tf.compat.v1.placeholder_with_default( 41 | False, 42 | name='training', 43 | shape=()) 44 | -------------------------------------------------------------------------------- /nematus/nmt.py: -------------------------------------------------------------------------------- 1 | train.py -------------------------------------------------------------------------------- /nematus/rescore.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | ''' 3 | Rescoring an n-best list of translations using a translation model. 4 | ''' 5 | 6 | import sys 7 | import logging 8 | if __name__ == '__main__': 9 | # Parse console arguments. 10 | from settings import RescorerSettings 11 | rescorer_settings = RescorerSettings(from_console_arguments=True) 12 | # Set the logging level. This needs to be done before the tensorflow 13 | # module is imported. 
14 | level = logging.DEBUG if rescorer_settings.verbose else logging.INFO 15 | logging.basicConfig(level=level, format='%(levelname)s: %(message)s') 16 | 17 | from tempfile import NamedTemporaryFile 18 | 19 | # ModuleNotFoundError is new in 3.6; older versions will throw SystemError 20 | if sys.version_info < (3, 6): 21 | ModuleNotFoundError = SystemError 22 | 23 | try: 24 | from .config import load_config_from_json_file 25 | from .score import calc_scores 26 | except (ModuleNotFoundError, ImportError) as e: 27 | from config import load_config_from_json_file 28 | from score import calc_scores 29 | 30 | 31 | 32 | def rescore(source_file, nbest_file, output_file, rescorer_settings, options): 33 | 34 | lines = source_file.readlines() 35 | nbest_lines = nbest_file.readlines() 36 | 37 | # create plain text file for scoring 38 | with NamedTemporaryFile(mode='w+', prefix='rescore-tmpin') as tmp_in, \ 39 | NamedTemporaryFile(mode='w+', prefix='rescore-tmpout') as tmp_out: 40 | for line in nbest_lines: 41 | linesplit = line.split(' ||| ') 42 | # Get the source file index (zero-based). 43 | idx = int(linesplit[0]) 44 | tmp_in.write(lines[idx]) 45 | tmp_out.write(linesplit[1] + '\n') 46 | 47 | tmp_in.seek(0) 48 | tmp_out.seek(0) 49 | scores = calc_scores(tmp_in, tmp_out, rescorer_settings, options) 50 | 51 | for i, line in enumerate(nbest_lines): 52 | score_str = ' '.join([str(s[i]) for s in scores]) 53 | output_file.write('{0} {1}\n'.format(line.strip(), score_str)) 54 | 55 | 56 | def main(source_file, nbest_file, output_file, rescorer_settings): 57 | # load model model_options 58 | options = [] 59 | for model in rescorer_settings.models: 60 | config = load_config_from_json_file(model) 61 | setattr(config, 'reload', model) 62 | options.append(config) 63 | 64 | rescore(source_file, nbest_file, output_file, rescorer_settings, options) 65 | 66 | 67 | if __name__ == "__main__": 68 | main(source_file=rescorer_settings.source, 69 | nbest_file=rescorer_settings.input, 70 | output_file=rescorer_settings.output, 71 | rescorer_settings=rescorer_settings) 72 | -------------------------------------------------------------------------------- /nematus/sample_client.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import json 4 | import requests # use `pip install requests` if not available on your system 5 | 6 | SOURCE_SEGMENTS = { 7 | "de":"Die Wahrheit ist selten rein und nie einfach .".split(), 8 | "en":"The truth is rarely pure and never simple .".split() 9 | } 10 | 11 | class Client(object): 12 | """ 13 | A sample client for Nematus Server instances. 14 | 15 | Uses the Nematus API style, i.e., the server (`server.py`) must be started 16 | with `style=Nematus` to serve requests from this client. 17 | """ 18 | def __init__(self, host, port): 19 | self.host = host 20 | self.port = port 21 | self.headers = { 22 | 'content-type': 'application/json' 23 | } 24 | 25 | def _get_url(self, path='/'): 26 | return "http://{0}:{1}{2}".format(self.host, self.port, path) 27 | 28 | def translate(self, segment): 29 | """ 30 | Returns the translation of a list of segments. 31 | """ 32 | return self.translate_segments([segment])[0] 33 | 34 | def translate_segments(self, segments): 35 | """ 36 | Returns the translation of a single segment. 
37 | """ 38 | payload = json.dumps({'segments': segments}) 39 | url = self._get_url('/translate') 40 | response = requests.post(url, headers=self.headers, data=payload) 41 | return [segment['translation'] for segment in response.json()['data']] 42 | 43 | def print_server_status(self): 44 | """ 45 | Prints the server's status report. 46 | """ 47 | url = self._get_url('/status') 48 | response = requests.get(url, headers=self.headers) 49 | print((json.dumps(response.json(), indent=4))) 50 | 51 | 52 | if __name__ == "__main__": 53 | host = 'localhost' 54 | port = 8080 55 | client = Client(host, port) 56 | client.print_server_status() 57 | source_segment = SOURCE_SEGMENTS['de'] 58 | print(('Translating "{0}"'.format(source_segment))) 59 | target_segment = client.translate(source_segment) 60 | print(target_segment) 61 | -------------------------------------------------------------------------------- /nematus/sampler_inputs.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | class SamplerInputs: 5 | """Input placeholders for RandomSampler and BeamSearchSampler.""" 6 | 7 | def __init__(self): 8 | 9 | # Number of sentences in the input. When sampling, this is not 10 | # necessarily the same as the batch size, hence the modified name. The 11 | # actual batch size (i.e. as seen by the model) will vary: usually 12 | # it's batch_size_x * beam_size because we tile the input sentences, 13 | # but in the Transformer encoder it's just batch_size_x. 14 | self.batch_size_x = tf.compat.v1.placeholder( 15 | name='batch_size_x', 16 | shape=(), 17 | dtype=tf.int32) 18 | 19 | # Maximum translation length. 20 | self.max_translation_len = tf.compat.v1.placeholder( 21 | name='max_translation_len', 22 | shape=(), 23 | dtype=tf.int32) 24 | 25 | # Alpha parameter for length normalization. 26 | self.normalization_alpha = tf.compat.v1.placeholder( 27 | name='normalization_alpha', 28 | shape=(), 29 | dtype=tf.float32) 30 | -------------------------------------------------------------------------------- /nematus/sampling_utils.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import tensorflow as tf 3 | import logging 4 | 5 | class SamplingUtils(object): 6 | def __init__(self, config_or_settings_obj): 7 | self.sampling_temperature = config_or_settings_obj.sampling_temperature 8 | self.translation_strategy = config_or_settings_obj.translation_strategy 9 | 10 | def adjust_logits(self, logits): 11 | if self.sampling_temperature != 1.0: 12 | logging.debug("adjust temperature") 13 | logits = logits / tf.constant(self.sampling_temperature, dtype=tf.float32) 14 | 15 | return logits 16 | 17 | 18 | -------------------------------------------------------------------------------- /nematus/score.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Given a parallel corpus of sentence pairs: with one-to-one of target and source sentences, 4 | produce the score. 5 | """ 6 | 7 | import logging 8 | if __name__ == '__main__': 9 | # Parse console arguments. 10 | from settings import ScorerSettings 11 | scorer_settings = ScorerSettings(from_console_arguments=True) 12 | # Set the logging level. This needs to be done before the tensorflow 13 | # module is imported. 
14 | level = logging.DEBUG if scorer_settings.verbose else logging.INFO 15 | logging.basicConfig(level=level, format='%(levelname)s: %(message)s') 16 | 17 | import argparse 18 | import sys 19 | import tempfile 20 | 21 | import tensorflow as tf 22 | 23 | # ModuleNotFoundError is new in 3.6; older versions will throw SystemError 24 | if sys.version_info < (3, 6): 25 | ModuleNotFoundError = SystemError 26 | 27 | try: 28 | from .config import load_config_from_json_file 29 | from .data_iterator import TextIterator 30 | from .exponential_smoothing import ExponentialSmoothing 31 | from . import model_loader 32 | from . import rnn_model 33 | from . import train 34 | from . import transformer 35 | except (ModuleNotFoundError, ImportError) as e: 36 | from config import load_config_from_json_file 37 | from data_iterator import TextIterator 38 | from exponential_smoothing import ExponentialSmoothing 39 | import model_loader 40 | import rnn_model 41 | import train 42 | import transformer 43 | 44 | 45 | 46 | # FIXME pass in paths not file objects, since we need to know the paths anyway 47 | def calc_scores(source_file, target_file, scorer_settings, configs): 48 | """Calculates sentence pair scores using each of the specified models. 49 | 50 | By default (when scorer_settings.normalization_alpha is 0.0), the score 51 | is the sentence-level cross entropy, otherwise it's a normalized version. 52 | 53 | Args: 54 | source_file: file object for file containing source sentences. 55 | target_file: file object for file containing target sentences. 56 | scorer_settings: a ScorerSettings object. 57 | configs: a list of Namespace objects specifying the model configs. 58 | 59 | Returns: 60 | A list of lists of floats. The outer list contains one list for each 61 | model (in the same order given by configs). The inner list contains 62 | one score for each sentence pair. 63 | """ 64 | scores = [] 65 | for config in configs: 66 | g = tf.Graph() 67 | with g.as_default(): 68 | tf_config = tf.compat.v1.ConfigProto() 69 | tf_config.allow_soft_placement = True 70 | with tf.compat.v1.Session(config=tf_config) as sess: 71 | 72 | logging.info('Building model...') 73 | 74 | # Create the model graph. 75 | if config.model_type == 'transformer': 76 | model = transformer.Transformer(config) 77 | else: 78 | model = rnn_model.RNNModel(config) 79 | 80 | # Add smoothing variables (if the model was trained with 81 | # smoothing). 82 | if config.exponential_smoothing > 0.0: 83 | smoothing = ExponentialSmoothing( 84 | config.exponential_smoothing) 85 | 86 | # Restore the model variables. 87 | saver = model_loader.init_or_restore_variables(config, sess) 88 | 89 | # Swap-in the smoothed versions of the variables (if present). 
90 | if config.exponential_smoothing > 0.0: 91 | sess.run(fetches=smoothing.swap_ops) 92 | 93 | text_iterator = TextIterator( 94 | source=source_file.name, 95 | target=target_file.name, 96 | source_dicts=config.source_dicts, 97 | target_dict=config.target_dict, 98 | model_type=config.model_type, 99 | batch_size=scorer_settings.minibatch_size, 100 | maxlen=float('inf'), 101 | source_vocab_sizes=config.source_vocab_sizes, 102 | target_vocab_size=config.target_vocab_size, 103 | use_factor=(config.factors > 1), 104 | sort_by_length=False) 105 | 106 | ce_vals, _ = train.calc_cross_entropy_per_sentence( 107 | sess, 108 | model, 109 | config, 110 | text_iterator, 111 | normalization_alpha=scorer_settings.normalization_alpha) 112 | 113 | scores.append(ce_vals) 114 | return scores 115 | 116 | 117 | def write_scores(source_file, target_file, scores, output_file, scorer_settings): 118 | 119 | source_file.seek(0) 120 | target_file.seek(0) 121 | source_lines = source_file.readlines() 122 | target_lines = target_file.readlines() 123 | 124 | for i, line in enumerate(target_lines): 125 | score_str = ' '.join(map(str,[s[i] for s in scores])) 126 | if scorer_settings.verbose: 127 | output_file.write('{0} '.format(line.strip())) 128 | output_file.write('{0}\n'.format(score_str)) 129 | 130 | 131 | def main(source_file, target_file, output_file, scorer_settings): 132 | # load model model_options 133 | configs = [] 134 | for model in scorer_settings.models: 135 | config = load_config_from_json_file(model) 136 | setattr(config, 'reload', model) 137 | configs.append(config) 138 | 139 | scores = calc_scores(source_file, target_file, scorer_settings, configs) 140 | write_scores(source_file, target_file, scores, output_file, scorer_settings) 141 | 142 | 143 | if __name__ == "__main__": 144 | main(source_file=scorer_settings.source, 145 | target_file=scorer_settings.target, 146 | output_file=scorer_settings.output, 147 | scorer_settings=scorer_settings) 148 | -------------------------------------------------------------------------------- /nematus/server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Runs Nematus as a Web Server. 5 | """ 6 | 7 | import json 8 | import pkg_resources 9 | import logging 10 | 11 | from bottle import Bottle, request, response 12 | from bottle_log import LoggingPlugin 13 | 14 | from server.response import TranslationResponse 15 | from server.api.provider import request_provider, response_provider 16 | 17 | from settings import ServerSettings 18 | from server_translator import Translator 19 | 20 | class NematusServer(object): 21 | """ 22 | Keeps a Nematus model in memory to answer http translation requests. 23 | """ 24 | 25 | STATUS_LOADING = 'loading' 26 | STATUS_OK = 'ok' 27 | 28 | def __init__(self, server_settings): 29 | """ 30 | Loads a translation model and initialises the webserver. 
31 | 32 | @param server_settings: see `settings.py` 33 | """ 34 | self._style = server_settings.style 35 | self._host = server_settings.host 36 | self._port = server_settings.port 37 | self._threads = server_settings.threads 38 | self._debug = server_settings.verbose 39 | self._models = server_settings.models 40 | self._num_processes = server_settings.num_processes 41 | self._status = self.STATUS_LOADING 42 | # start webserver 43 | self._server = Bottle() 44 | self._server.config['logging.level'] = 'DEBUG' if server_settings.verbose else 'WARNING' 45 | self._server.config['logging.format'] = '%(levelname)s: %(message)s' 46 | self._server.install(LoggingPlugin(self._server.config)) 47 | logging.info("Starting Nematus Server") 48 | # start translation workers 49 | logging.info("Loading translation models") 50 | self._translator = Translator(server_settings) 51 | self._status = self.STATUS_OK 52 | 53 | def status(self): 54 | """ 55 | Reports on the status of this translation server. 56 | """ 57 | response_data = { 58 | 'status': self._status, 59 | 'models': self._models, 60 | 'version': pkg_resources.require("nematus")[0].version, 61 | 'service': 'nematus', 62 | } 63 | response.content_type = "application/json" 64 | return json.dumps(response_data) 65 | 66 | def translate(self): 67 | """ 68 | Processes a translation request. 69 | """ 70 | translation_request = request_provider(self._style, request) 71 | logging.debug("REQUEST - " + repr(translation_request)) 72 | 73 | translations = self._translator.translate( 74 | translation_request.segments, 75 | translation_request.settings 76 | ) 77 | response_data = { 78 | 'status': TranslationResponse.STATUS_OK, 79 | 'segments': [translation.target_words for translation in translations], 80 | } 81 | translation_response = response_provider(self._style, **response_data) 82 | logging.debug("RESPONSE - " + repr(translation_response)) 83 | 84 | response.content_type = translation_response.get_content_type() 85 | return repr(translation_response) 86 | 87 | def start(self): 88 | """ 89 | Starts the webserver. 90 | """ 91 | self._route() 92 | self._server.run(host=self._host, port=self._port, debug=self._debug, server='tornado', threads=self._threads) 93 | self._cleanup() 94 | 95 | def _cleanup(self): 96 | """ 97 | Graceful exit for components. 98 | """ 99 | self._translator.shutdown() 100 | 101 | def _route(self): 102 | """ 103 | Routes webserver paths to functions. 104 | """ 105 | self._server.route('/status', method="GET", callback=self.status) 106 | self._server.route('/translate', method="POST", callback=self.translate) 107 | 108 | 109 | if __name__ == "__main__": 110 | # parse console arguments 111 | server_settings = ServerSettings(from_console_arguments=True) 112 | server = NematusServer(server_settings) 113 | server.start() 114 | -------------------------------------------------------------------------------- /nematus/server/README.md: -------------------------------------------------------------------------------- 1 | # Nematus Server 2 | Runs Nematus as a web service. 3 | 4 | ## Basic Usage 5 | 6 | The command 7 | 8 | ```bash 9 | python3 server.py -m model.npz 10 | ``` 11 | 12 | will start Nematus Server at `localhost` on port 8080, using translation model `model.npz`. Once the model has been loaded, the server is ready to answer translation requests according to the API outlined below. 13 | 14 | ### Required Arguments 15 | 16 | Nematus Server needs at least one translation model, provided via the `-m` or `--models` parameter. 
Multiple models (for ensemble decoding) are delimited with spaces: 17 | 18 | ```bash 19 | python3 server.py -m model1.npz model2.npz model3.npz model4.npz 20 | ``` 21 | 22 | ### Optional Arguments 23 | 24 | | Argument | Default Value | Description | 25 | | --------------------|---------------| -------------------------| 26 | | `--host` | `localhost` | Host name | 27 | | `--port` | `8080` | Port | 28 | | `-p`, | `1` | Number of translation processes to start. Each process loads all models specified in `-m`/`--models`. | 29 | | `--device-list` | any | The devices to start translation processes on, e.g., `gpu0 gpu1 gpu6`. Defaults to any available device. | 30 | | `-v` | off | Verbose mode | 31 | 32 | 33 | ## API 34 | Nematus Server supports several API styles. 35 | 36 | ### Nematus Translation API 37 | 38 | #### Translation Request 39 | 40 | `POST http://host:port/translate` 41 | 42 | Content-Type: application/json 43 | 44 | ##### Query Parameters 45 | 46 | | Parameter | Type | Default Value | Description | 47 | | --------------------|-----------------------|-----------|-------------| 48 | | ``segments`` | ``list(list(str))`` | | The sentences to be translated (source language). Each sentence is a list of tokens. | 49 | | ``normalize`` | ``boolean`` | ``true`` | Normalise scores by sentence length. | 50 | | ``beam_width`` | ``int`` | ``5`` | The beam width to be used for decoding. | 51 | | ``character_level`` | ``boolean`` | ``false`` | Enables character- rather than subword-level translation. | 52 | | ``n_best`` | ``int`` | ``1`` | Return n best translations per segment. | 53 | | ``suppress_unk`` | ``boolean`` | ``false`` | Suppress hypotheses containing UNK. | 54 | 55 | Sample request: 56 | 57 | ```json 58 | { 59 | "segments": [ 60 | ["I", "can", "resist", "everything", "except", "temptation", "."], 61 | ["The", "truth", "is", "rarely", "pure", "and", "never", "simple", "."] 62 | ], 63 | } 64 | ``` 65 | 66 | ##### Response Body 67 | 68 | If successful, the response body contains a JSON object with the following structure: 69 | 70 | ```json 71 | { 72 | "status": "ok", 73 | "data": [ 74 | { 75 | "translation": ["ich", "kann", "dem", "alles", "außer", "Versuchung", "widerstehen", "."], 76 | }, 77 | { 78 | "translation": ["die", "Wahrheit", "ist", "selten", "rein", "und", "nie", "einfach", "."], 79 | } 80 | ] 81 | } 82 | ``` 83 | 84 | #### Status Request 85 | 86 | `GET http://host:port/status` 87 | 88 | ##### Response Body 89 | 90 | If successful, the response body contains a JSON object with the following structure: 91 | 92 | ```json 93 | { 94 | "status": "ok", 95 | "models": [ 96 | "wmt16-en-de-model1.npz", 97 | "wmt16-en-de-model2.npz", 98 | "wmt16-en-de-model3.npz", 99 | "wmt16-en-de-model4.npz", 100 | ], 101 | "version": "0.1.dev0", 102 | "service": "nematus" 103 | } 104 | ``` 105 | 106 | 107 | ## Sample Client 108 | 109 | A sample client, written in Python, is available in `sample_client.py`. 
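
For a quick end-to-end check without the sample client, a translation request can also be sent directly with the `requests` library. The snippet below is only an illustrative sketch: it assumes a server started as shown under "Basic Usage", listening on `localhost:8080` with the default Nematus API style, and input that is already tokenized in the same way as the training data.

```python
import json

import requests  # third-party HTTP client: pip install requests

# Assumed endpoint of a locally running Nematus Server (see "Basic Usage").
url = "http://localhost:8080/translate"

payload = {
    "segments": [
        ["The", "truth", "is", "rarely", "pure", "and", "never", "simple", "."]
    ],
    "beam_width": 5,
}

response = requests.post(
    url,
    headers={"content-type": "application/json"},
    data=json.dumps(payload),
)

# Each item in "data" holds one translated segment as a list of tokens.
for item in response.json()["data"]:
    print(" ".join(item["translation"]))
```

For anything beyond a smoke test, `sample_client.py` is the better starting point, since it wraps the same request and response handling in a small reusable `Client` class.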
110 | -------------------------------------------------------------------------------- /nematus/server/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdinburghNLP/nematus/49d050863bc9644b8c0a9d9ab6e54ccd30f927dd/nematus/server/__init__.py -------------------------------------------------------------------------------- /nematus/server/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EdinburghNLP/nematus/49d050863bc9644b8c0a9d9ab6e54ccd30f927dd/nematus/server/api/__init__.py -------------------------------------------------------------------------------- /nematus/server/api/nematus_style.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Defines the Nematus API for translation requests and responses. 5 | """ 6 | 7 | import json 8 | from ..request import TranslationRequest 9 | from ..response import TranslationResponse 10 | 11 | class TranslationRequestNematus(TranslationRequest): 12 | def _parse(self): 13 | # never produce search graph 14 | self.get_search_graph = False 15 | 16 | request = self._request.json 17 | if 'segments' in request: 18 | self.segments = [' '.join(tokens) for tokens in request['segments']] 19 | if 'beam_width' in request: 20 | self.settings.beam_width = request['beam_width'] 21 | if 'normalize' in request: 22 | self.settings.normalization_alpha = request['normalize'] 23 | if 'character_level' in request: 24 | self.settings.char_level = request['character_level'] 25 | if 'suppress_unk' in request: 26 | self.settings.suppress_unk = request['suppress_unk'] 27 | if 'return_word_alignment' in request: 28 | self.settings.get_alignment = request['return_word_alignment'] 29 | if 'return_word_probabilities' in request: 30 | self.settings.get_word_probs = request['return_word_probabilities'] 31 | 32 | def _format(self): 33 | request = { 34 | 'id': str(self.settings.request_id), 35 | 'data': [segment for segment in self.segments] 36 | } 37 | return json.dumps(request) 38 | 39 | class TranslationResponseNematus(TranslationResponse): 40 | def _format(self): 41 | response = { 42 | 'status': '', 43 | 'data': [], 44 | } 45 | if self._status == self.STATUS_OK: 46 | response['status'] = 'ok' 47 | for i, translation in enumerate(self._segments): 48 | segment = {'translation': translation} 49 | if self._word_alignments: 50 | segment['word_alignment'] = self._word_alignments[i] 51 | if self._word_probabilities: 52 | segment['word_probabilities'] = self._word_probabilities[i] 53 | response['data'].append(segment) 54 | else: 55 | response['status'] = 'error' 56 | return json.dumps(response) 57 | -------------------------------------------------------------------------------- /nematus/server/api/provider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Implements providors for TranslationRequest and TranslationResponse objects 5 | of a specific API style. 6 | """ 7 | 8 | def request_provider(style, request): 9 | """ 10 | Turns a raw request body into a TranslationRequest of a given API style 11 | @param style. 
12 | """ 13 | from .nematus_style import TranslationRequestNematus 14 | mapping = { 15 | 'Nematus': TranslationRequestNematus 16 | } 17 | try: 18 | return mapping[style](request) 19 | except KeyError: 20 | raise NotImplementedError("Invalid API style: {0}".format(style)) 21 | 22 | def response_provider(style, **response_args): 23 | """ 24 | Formats @param response_args as a TranslationResponse of a given API style 25 | @param style. 26 | """ 27 | from .nematus_style import TranslationResponseNematus 28 | mapping = { 29 | 'Nematus': TranslationResponseNematus 30 | } 31 | try: 32 | return mapping[style](**response_args) 33 | except KeyError: 34 | raise NotImplementedError("Invalid API style: {0}".format(style)) 35 | -------------------------------------------------------------------------------- /nematus/server/request.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Defines the abstract request format for Nematus server. 5 | """ 6 | 7 | from abc import ABCMeta, abstractmethod 8 | 9 | from settings import TranslationSettings 10 | 11 | class TranslationRequest(object, metaclass=ABCMeta): 12 | """ 13 | Abstract translation request base class. 14 | """ 15 | 16 | def __init__(self, request): 17 | """ 18 | Initialises a translation request. 19 | 20 | @type request: bottle.BaseRequest 21 | @param request: the POST request submitted to Nematus server. 22 | """ 23 | self._request = request 24 | self.segments = [] 25 | self.settings = TranslationSettings() # default values 26 | self._parse() 27 | 28 | @abstractmethod 29 | def _format(self): 30 | """ 31 | Formats this translation request. 32 | """ 33 | pass # to be implemented in subclasses 34 | 35 | def __repr__(self): 36 | """ 37 | Returns the raw body of this translation request. 38 | """ 39 | return self._format() 40 | 41 | @abstractmethod 42 | def _parse(self): 43 | """ 44 | Parses the request's raw body. Sets or overrides 45 | * self.segments 46 | * self.beam_width 47 | * self.normalize 48 | * self.character_level 49 | * self.n_best 50 | * self.suppress_unk 51 | * self.return_word_alignment 52 | * self.return_word_probabilities 53 | """ 54 | pass # to be implemented in subclasses 55 | -------------------------------------------------------------------------------- /nematus/server/response.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Defines the abstract response format for Nematus server. 5 | """ 6 | 7 | from abc import ABCMeta, abstractmethod 8 | 9 | class TranslationResponse(object, metaclass=ABCMeta): 10 | """ 11 | Abstract translation response base class. 12 | """ 13 | 14 | STATUS_OK = 0 15 | STATUS_ERROR = 1 16 | 17 | def __init__(self, status, segments, word_alignments=None, word_probabilities=None): 18 | """ 19 | Initialises a translation response. 20 | 21 | @type segments: list(str) 22 | @param segments: the translated segments to be included. 23 | """ 24 | self._content_type = "application/json" 25 | self._status = status 26 | self._segments = segments 27 | self._word_alignments = word_alignments 28 | self._word_probabilities = word_probabilities 29 | self._response = self._format() 30 | 31 | @abstractmethod 32 | def _format(self): 33 | """ 34 | Formats this translation response. 35 | """ 36 | pass # to be implemented in subclasses 37 | 38 | def __repr__(self): 39 | """ 40 | Returns the raw body of this translation response.
41 | """ 42 | return self._format() 43 | 44 | def get_content_type(self): 45 | return self._content_type 46 | -------------------------------------------------------------------------------- /nematus/shuffle.py: -------------------------------------------------------------------------------- 1 | ../data/shuffle.py -------------------------------------------------------------------------------- /nematus/tf_utils.py: -------------------------------------------------------------------------------- 1 | """TensorFlow-specific utility functions.""" 2 | 3 | import tensorflow as tf 4 | 5 | def assert_shapes(shapes): 6 | """Wrapper for tf.debugging.assert_shapes.""" 7 | 8 | # tf.debugging.assert_shapes is only supported in 1.14 and later, so 9 | # the call is wrapped in a try-except to allow Nematus to run on earlier 10 | # versions. 11 | try: 12 | assertion_op = tf.debugging.assert_shapes(shapes) 13 | with tf.control_dependencies([assertion_op]): 14 | pass 15 | except (AttributeError, TypeError) as e: 16 | pass 17 | 18 | 19 | def get_available_gpus(): 20 | """Returns a list of the identifiers of all visible GPUs. 21 | 22 | Source: https://stackoverflow.com/questions/38559755 23 | """ 24 | from tensorflow.python.client import device_lib 25 | local_device_protos = device_lib.list_local_devices() 26 | return [x.name for x in local_device_protos if x.device_type == 'GPU'] 27 | 28 | 29 | def get_shape_list(inputs): 30 | """Returns a list of input dimensions, statically where possible. 31 | 32 | TODO What is this useful for? 33 | 34 | Adopted from the tensor2tensor library. 35 | """ 36 | inputs = tf.convert_to_tensor(value=inputs) 37 | # If input's rank is unknown, return dynamic shape. 38 | if inputs.get_shape().dims is None: 39 | dims_list = tf.shape(input=inputs) 40 | else: 41 | static_dims_list = inputs.get_shape().as_list() 42 | dynamic_shape = tf.shape(input=inputs) 43 | # Replace the unspecified static dimensions with dynamic ones. 44 | dims_list = list() 45 | for i in range(len(static_dims_list)): 46 | dim = static_dims_list[i] 47 | if dim is None: 48 | dim = dynamic_shape[i] 49 | dims_list.append(dim) 50 | return dims_list 51 | -------------------------------------------------------------------------------- /nematus/training_progress.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Training progress 3 | ''' 4 | 5 | import json 6 | 7 | class TrainingProgress(object): 8 | ''' 9 | Object used to store, serialize and deserialize pure python variables that change during training and should be preserved in order to properly restart the training process 10 | ''' 11 | 12 | def load_from_json(self, file_name): 13 | with open(file_name, 'r', encoding='utf-8') as fh: 14 | self.__dict__.update(json.load(fh)) 15 | 16 | def save_to_json(self, file_name): 17 | with open(file_name, 'w', encoding='utf-8') as fh: 18 | # TODO ensure_ascii=False? 
19 | json.dump(self.__dict__, fh, indent=2) 20 | -------------------------------------------------------------------------------- /nematus/transformer_blocks.py: -------------------------------------------------------------------------------- 1 | """Adapted from Nematode: https://github.com/demelin/nematode """ 2 | 3 | import sys 4 | import tensorflow as tf 5 | 6 | # ModuleNotFoundError is new in 3.6; older versions will throw SystemError 7 | if sys.version_info < (3, 6): 8 | ModuleNotFoundError = SystemError 9 | 10 | try: 11 | from .transformer_attention_modules import MultiHeadAttentionLayer 12 | from .transformer_layers import \ 13 | ProcessingLayer, \ 14 | FeedForwardNetwork, \ 15 | LayerNormLayer, \ 16 | RMSNormLayer 17 | 18 | except (ModuleNotFoundError, ImportError) as e: 19 | from transformer_attention_modules import MultiHeadAttentionLayer 20 | from transformer_layers import \ 21 | ProcessingLayer, \ 22 | FeedForwardNetwork, \ 23 | LayerNormLayer, \ 24 | RMSNormLayer 25 | 26 | # from attention_modules import SingleHeadAttentionLayer, FineGrainedAttentionLayer 27 | 28 | 29 | class AttentionBlock(object): 30 | """ Defines a single attention block (referred to as 'sub-layer' in the paper) comprising of a single multi-head 31 | attention layer preceded by a pre-processing layer and followed by a post-processing layer. """ 32 | 33 | def __init__(self, 34 | config, 35 | float_dtype, 36 | self_attention, 37 | training, 38 | from_rnn=False, 39 | tie_attention=False): 40 | # Set attributes 41 | self.self_attention = self_attention 42 | if not tie_attention: 43 | if self_attention: 44 | attn_name = 'self_attn' 45 | else: 46 | attn_name = 'cross_attn' 47 | else: 48 | attn_name = 'tied_attn' 49 | 50 | memory_size = config.state_size 51 | if from_rnn: 52 | memory_size *= 2 53 | 54 | if config.layer_normalization_type == 'layernorm': 55 | layernorm = LayerNormLayer 56 | elif config.layer_normalization_type == 'rmsnorm': 57 | layernorm = RMSNormLayer 58 | 59 | # Build layers 60 | self.pre_attn = ProcessingLayer(config.state_size, 61 | use_layer_norm=layernorm, 62 | dropout_rate=0., 63 | training=training, 64 | name='pre_{:s}_sublayer'.format(attn_name)) 65 | 66 | self.attn = MultiHeadAttentionLayer(memory_size, 67 | config.state_size, 68 | config.state_size, 69 | config.state_size, 70 | config.state_size, 71 | config.transformer_num_heads, 72 | float_dtype, 73 | dropout_attn=config.transformer_dropout_attn, 74 | drophead=config.transformer_drophead, 75 | training=training, 76 | name='{:s}_sublayer'.format(attn_name)) 77 | 78 | self.post_attn = ProcessingLayer(config.state_size, 79 | use_layer_norm=False, 80 | dropout_rate=config.transformer_dropout_residual, 81 | training=training, 82 | name='post_{:s}_sublayer'.format(attn_name)) 83 | 84 | def forward(self, inputs, memory_context, attn_mask, layer_memories=None): 85 | """ Propagates input data through the block. """ 86 | if not self.self_attention: 87 | assert (memory_context is not None), \ 88 | 'Encoder memories have to be provided for encoder-decoder attention computation.' 
89 | attn_inputs = self.pre_attn.forward(inputs) 90 | attn_outputs, layer_memories = self.attn.forward(attn_inputs, memory_context, attn_mask, layer_memories) 91 | block_out = self.post_attn.forward(attn_outputs, residual_inputs=inputs) 92 | return block_out, layer_memories 93 | 94 | 95 | class FFNBlock(object): 96 | """ Defines a single feed-forward network block (referred to as 'sub-layer' in the transformer paper) comprising of 97 | a single feed-forward network preceded by a pre-processing layer and followed by a post-processing layer. """ 98 | 99 | def __init__(self, 100 | config, 101 | ffn_dims, 102 | float_dtype, 103 | is_final, 104 | training): 105 | # Set attributes 106 | self.is_final = is_final 107 | 108 | if config.layer_normalization_type == 'layernorm': 109 | layernorm = LayerNormLayer 110 | elif config.layer_normalization_type == 'rmsnorm': 111 | layernorm = RMSNormLayer 112 | 113 | # Build layers 114 | self.pre_ffn = ProcessingLayer(config.state_size, 115 | use_layer_norm=layernorm, 116 | dropout_rate=0., 117 | training=training, 118 | name='pre_ffn_sublayer') 119 | self.ffn = FeedForwardNetwork(ffn_dims, 120 | float_dtype, 121 | use_bias=True, 122 | activation=tf.nn.relu, 123 | use_layer_norm=False, 124 | dropout_rate=config.transformer_dropout_relu, 125 | training=training, 126 | name='ffn_sublayer') 127 | self.post_ffn = ProcessingLayer(config.state_size, 128 | use_layer_norm=False, 129 | dropout_rate=config.transformer_dropout_residual, 130 | training=training, 131 | name='post_ffn_sublayer') 132 | if is_final: 133 | self.pre_final = ProcessingLayer(config.state_size, 134 | use_layer_norm=layernorm, 135 | dropout_rate=0., 136 | training=training, 137 | name='final_transform') 138 | 139 | def forward(self, inputs): 140 | """ Propagates input data through the block. """ 141 | ffn_inputs = self.pre_ffn.forward(inputs) 142 | ffn_outputs = self.ffn.forward(ffn_inputs) 143 | block_out = self.post_ffn.forward(ffn_outputs, residual_inputs=inputs) 144 | if self.is_final: 145 | block_out = self.pre_final.forward(block_out) 146 | return block_out 147 | -------------------------------------------------------------------------------- /nematus/translate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """Translates a source file using a translation model (or ensemble).""" 4 | 5 | import sys 6 | import logging 7 | if __name__ == '__main__': 8 | # Parse console arguments. 9 | from settings import TranslationSettings 10 | settings = TranslationSettings(from_console_arguments=True) 11 | # Set the logging level. This needs to be done before the tensorflow 12 | # module is imported. 13 | level = logging.DEBUG if settings.verbose else logging.INFO 14 | logging.basicConfig(level=level, format='%(levelname)s: %(message)s') 15 | 16 | import argparse 17 | 18 | import tensorflow as tf 19 | 20 | # ModuleNotFoundError is new in 3.6; older versions will throw SystemError 21 | if sys.version_info < (3, 6): 22 | ModuleNotFoundError = SystemError 23 | 24 | try: 25 | from .beam_search_sampler import BeamSearchSampler 26 | from .config import load_config_from_json_file 27 | from .exponential_smoothing import ExponentialSmoothing 28 | from . import model_loader 29 | from .random_sampler import RandomSampler 30 | from . import rnn_model 31 | from .sampling_utils import SamplingUtils 32 | from .transformer import Transformer as TransformerModel 33 | from . 
import translate_utils 34 | except (ModuleNotFoundError, ImportError) as e: 35 | from beam_search_sampler import BeamSearchSampler 36 | from config import load_config_from_json_file 37 | from exponential_smoothing import ExponentialSmoothing 38 | import model_loader 39 | from random_sampler import RandomSampler 40 | import rnn_model 41 | from sampling_utils import SamplingUtils 42 | from transformer import Transformer as TransformerModel 43 | import translate_utils 44 | 45 | 46 | def main(settings): 47 | """ 48 | Translates a source language file (or STDIN) into a target language file 49 | (or STDOUT). 50 | """ 51 | # Create the TensorFlow session. 52 | g = tf.Graph() 53 | with g.as_default(): 54 | tf_config = tf.compat.v1.ConfigProto() 55 | tf_config.allow_soft_placement = True 56 | session = tf.compat.v1.Session(config=tf_config) 57 | 58 | # Load config file for each model. 59 | configs = [] 60 | for model in settings.models: 61 | config = load_config_from_json_file(model) 62 | setattr(config, 'reload', model) 63 | setattr(config, 'translation_maxlen', settings.translation_maxlen) 64 | configs.append(config) 65 | 66 | # Create the model graphs. 67 | logging.debug("Loading models\n") 68 | models = [] 69 | for i, config in enumerate(configs): 70 | with tf.compat.v1.variable_scope("model%d" % i) as scope: 71 | if config.model_type == "transformer": 72 | model = TransformerModel(config) 73 | else: 74 | model = rnn_model.RNNModel(config) 75 | model.sampling_utils = SamplingUtils(settings) 76 | models.append(model) 77 | 78 | # Add smoothing variables (if the models were trained with smoothing). 79 | #FIXME Assumes either all models were trained with smoothing or none were. 80 | if configs[0].exponential_smoothing > 0.0: 81 | smoothing = ExponentialSmoothing(configs[0].exponential_smoothing) 82 | 83 | # Restore the model variables. 84 | for i, config in enumerate(configs): 85 | with tf.compat.v1.variable_scope("model%d" % i) as scope: 86 | _ = model_loader.init_or_restore_variables(config, session, 87 | ensemble_scope=scope) 88 | 89 | # Swap-in the smoothed versions of the variables. 90 | if configs[0].exponential_smoothing > 0.0: 91 | session.run(fetches=smoothing.swap_ops) 92 | 93 | max_translation_len = settings.translation_maxlen 94 | 95 | # Create a BeamSearchSampler / RandomSampler. 96 | if settings.translation_strategy == 'beam_search': 97 | sampler = BeamSearchSampler(models, configs, settings.beam_size) 98 | else: 99 | assert settings.translation_strategy == 'sampling' 100 | sampler = RandomSampler(models, configs, settings.beam_size) 101 | 102 | # Warn about the change from neg log probs to log probs for the RNN. 103 | if settings.n_best: 104 | model_types = [config.model_type for config in configs] 105 | if 'rnn' in model_types: 106 | logging.warn('n-best scores for RNN models have changed from ' 107 | 'positive to negative (as of commit 95793196...). ' 108 | 'If you are using the scores for reranking etc, then ' 109 | 'you may need to update your scripts.') 110 | 111 | # Translate the source file. 
112 | translate_utils.translate_file( 113 | input_file=settings.input, 114 | output_file=settings.output, 115 | session=session, 116 | sampler=sampler, 117 | config=configs[0], 118 | max_translation_len=max_translation_len, 119 | normalization_alpha=settings.normalization_alpha, 120 | nbest=settings.n_best, 121 | minibatch_size=settings.minibatch_size, 122 | maxibatch_size=settings.maxibatch_size) 123 | 124 | 125 | if __name__ == "__main__": 126 | main(settings) 127 | -------------------------------------------------------------------------------- /nematus/translate_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | import time 4 | 5 | import numpy 6 | import tensorflow as tf 7 | 8 | # ModuleNotFoundError is new in 3.6; older versions will throw SystemError 9 | if sys.version_info < (3, 6): 10 | ModuleNotFoundError = SystemError 11 | 12 | try: 13 | from . import exception 14 | from . import util 15 | except (ModuleNotFoundError, ImportError) as e: 16 | import exception 17 | import util 18 | 19 | 20 | def translate_batch(session, sampler, x, x_mask, max_translation_len, 21 | normalization_alpha): 22 | """Translate a batch using a RandomSampler or BeamSearchSampler. 23 | 24 | Args: 25 | session: a TensorFlow session. 26 | sampler: a BeamSearchSampler or RandomSampler object. 27 | x: input Tensor with shape (factors, max_seq_len, batch_size). 28 | x_mask: mask Tensor for x with shape (max_seq_len, batch_size). 29 | max_translation_len: integer specifying maximum translation length. 30 | normalization_alpha: float specifying alpha parameter for length 31 | normalization. 32 | 33 | Returns: 34 | A list of lists of (translation, score) pairs. The outer list contains 35 | one list for each input sentence in the batch. The inner lists contain 36 | k elements (where k is the beam size), sorted by score in best-first 37 | order. 38 | """ 39 | 40 | x_tiled = numpy.tile(x, reps=[1, 1, sampler.beam_size]) 41 | x_mask_tiled = numpy.tile(x_mask, reps=[1, sampler.beam_size]) 42 | 43 | feed_dict = {} 44 | 45 | # Feed inputs to the models. 46 | for model, config in zip(sampler.models, sampler.configs): 47 | if config.model_type == 'rnn': 48 | feed_dict[model.inputs.x] = x_tiled 49 | feed_dict[model.inputs.x_mask] = x_mask_tiled 50 | else: 51 | assert config.model_type == 'transformer' 52 | # Inputs don't need to be tiled in the Transformer because it 53 | # checks for different batch sizes in the encoder and decoder and 54 | # does its own tiling internally at the connection points. 55 | feed_dict[model.inputs.x] = x 56 | feed_dict[model.inputs.x_mask] = x_mask 57 | feed_dict[model.inputs.training] = False 58 | 59 | # Feed inputs to the sampler. 60 | feed_dict[sampler.inputs.batch_size_x] = x.shape[-1] 61 | feed_dict[sampler.inputs.max_translation_len] = max_translation_len 62 | feed_dict[sampler.inputs.normalization_alpha] = normalization_alpha 63 | 64 | # Run the sampler. 65 | translations, scores = session.run(sampler.outputs, feed_dict=feed_dict) 66 | 67 | assert len(translations) == x.shape[-1] 68 | assert len(scores) == x.shape[-1] 69 | 70 | # Sort the translations by score. The scores are (optionally normalized) 71 | # log probs so higher values are better. 
72 | beams = [] 73 | for i in range(len(translations)): 74 | pairs = zip(translations[i], scores[i]) 75 | beams.append(sorted(pairs, key=lambda pair: pair[1], reverse=True)) 76 | 77 | return beams 78 | 79 | 80 | def translate_file(input_file, output_file, session, sampler, config, 81 | max_translation_len, normalization_alpha, nbest=False, 82 | minibatch_size=80, maxibatch_size=20): 83 | """Translates a source file using a RandomSampler or BeamSearchSampler. 84 | 85 | Args: 86 | input_file: file object from which source sentences will be read. 87 | output_file: file object to which translations will be written. 88 | session: TensorFlow session. 89 | sampler: BeamSearchSampler or RandomSampler object. 90 | config: model config. 91 | max_translation_len: integer specifying maximum translation length. 92 | normalization_alpha: float specifying alpha parameter for length 93 | normalization. 94 | nbest: if True, produce n-best output with scores; otherwise 1-best. 95 | minibatch_size: minibatch size in sentences. 96 | maxibatch_size: number of minibatches to read and sort, pre-translation. 97 | """ 98 | 99 | def translate_maxibatch(maxibatch, num_to_target, num_prev_translated): 100 | """Translates an individual maxibatch. 101 | 102 | Args: 103 | maxibatch: a list of sentences. 104 | num_to_target: dictionary mapping target vocabulary IDs to strings. 105 | num_prev_translated: the number of previously translated sentences. 106 | """ 107 | 108 | # Sort the maxibatch by length and split into minibatches. 109 | try: 110 | minibatches, idxs = util.read_all_lines(config, maxibatch, 111 | minibatch_size) 112 | except exception.Error as x: 113 | logging.error(x.msg) 114 | sys.exit(1) 115 | 116 | # Translate the minibatches and store the resulting beam (i.e. 117 | # translations and scores) for each sentence. 118 | beams = [] 119 | for x in minibatches: 120 | y_dummy = numpy.zeros(shape=(len(x),1)) 121 | x, x_mask, _, _ = util.prepare_data(x, y_dummy, config.factors, 122 | maxlen=None) 123 | sample = translate_batch(session, sampler, x, x_mask, 124 | max_translation_len, normalization_alpha) 125 | beams.extend(sample) 126 | num_translated = num_prev_translated + len(beams) 127 | logging.info('Translated {} sents'.format(num_translated)) 128 | 129 | # Put beams into the same order as the input maxibatch. 130 | tmp = numpy.array(beams, dtype=numpy.object) 131 | ordered_beams = tmp[idxs.argsort()] 132 | 133 | # Write the translations to the output file. 
134 | for i, beam in enumerate(ordered_beams): 135 | if nbest: 136 | num = num_prev_translated + i 137 | for sent, cost in beam: 138 | translation = util.seq2words(sent, num_to_target) 139 | line = "{} ||| {} ||| {}\n".format(num, translation, 140 | str(cost)) 141 | output_file.write(line) 142 | else: 143 | best_hypo, cost = beam[0] 144 | line = util.seq2words(best_hypo, num_to_target) + '\n' 145 | output_file.write(line) 146 | 147 | _, _, _, num_to_target = util.load_dictionaries(config) 148 | 149 | logging.info("NOTE: Length of translations is capped to {}".format( 150 | max_translation_len)) 151 | 152 | start_time = time.time() 153 | 154 | num_translated = 0 155 | maxibatch = [] 156 | while True: 157 | line = input_file.readline() 158 | if line == "": 159 | if len(maxibatch) > 0: 160 | translate_maxibatch(maxibatch, num_to_target, num_translated) 161 | num_translated += len(maxibatch) 162 | break 163 | maxibatch.append(line) 164 | if len(maxibatch) == (maxibatch_size * minibatch_size): 165 | translate_maxibatch(maxibatch, num_to_target, num_translated) 166 | num_translated += len(maxibatch) 167 | maxibatch = [] 168 | 169 | duration = time.time() - start_time 170 | logging.info('Translated {} sents in {} sec. Speed {} sents/sec'.format( 171 | num_translated, duration, num_translated/duration)) 172 | -------------------------------------------------------------------------------- /nematus/util.py: -------------------------------------------------------------------------------- 1 | """Utility functions.""" 2 | 3 | import pickle as pkl 4 | import json 5 | import logging 6 | import numpy 7 | import sys 8 | 9 | # ModuleNotFoundError is new in 3.6; older versions will throw SystemError 10 | if sys.version_info < (3, 6): 11 | ModuleNotFoundError = SystemError 12 | 13 | try: 14 | from . import exception 15 | except (ModuleNotFoundError, ImportError) as e: 16 | import exception 17 | 18 | # batch preparation 19 | def prepare_data(seqs_x, seqs_y, n_factors, maxlen=None): 20 | # x: a list of sentences 21 | lengths_x = [len(s) for s in seqs_x] 22 | lengths_y = [len(s) for s in seqs_y] 23 | 24 | if maxlen is not None: 25 | new_seqs_x = [] 26 | new_seqs_y = [] 27 | new_lengths_x = [] 28 | new_lengths_y = [] 29 | for l_x, s_x, l_y, s_y in zip(lengths_x, seqs_x, lengths_y, seqs_y): 30 | if l_x < maxlen and l_y < maxlen: 31 | new_seqs_x.append(s_x) 32 | new_lengths_x.append(l_x) 33 | new_seqs_y.append(s_y) 34 | new_lengths_y.append(l_y) 35 | lengths_x = new_lengths_x 36 | seqs_x = new_seqs_x 37 | lengths_y = new_lengths_y 38 | seqs_y = new_seqs_y 39 | 40 | if len(lengths_x) < 1 or len(lengths_y) < 1: 41 | return None, None, None, None 42 | 43 | n_samples = len(seqs_x) 44 | maxlen_x = numpy.max(lengths_x) + 1 45 | maxlen_y = numpy.max(lengths_y) + 1 46 | 47 | x = numpy.zeros((n_factors, maxlen_x, n_samples)).astype('int64') 48 | y = numpy.zeros((maxlen_y, n_samples)).astype('int64') 49 | x_mask = numpy.zeros((maxlen_x, n_samples)).astype('float32') 50 | y_mask = numpy.zeros((maxlen_y, n_samples)).astype('float32') 51 | for idx, [s_x, s_y] in enumerate(zip(seqs_x, seqs_y)): 52 | x[:, :lengths_x[idx], idx] = list(zip(*s_x)) 53 | x_mask[:lengths_x[idx]+1, idx] = 1. 54 | y[:lengths_y[idx], idx] = s_y 55 | y_mask[:lengths_y[idx]+1, idx] = 1. 56 | 57 | return x, x_mask, y, y_mask 58 | 59 | 60 | def load_dict(filename, model_type): 61 | try: 62 | # build_dictionary.py writes JSON files as UTF-8 so assume that here. 
63 | with open(filename, 'r', encoding='utf-8') as f: 64 | d = json.load(f) 65 | except: 66 | # Fall back to a pickled dictionary; pickle.load needs the file opened in binary mode. 67 | with open(filename, 'rb') as f: 68 | d = pkl.load(f) 69 | 70 | # The transformer model requires vocab dictionaries to use the new style 71 | # special symbols. If the dictionary looks like an old one then tell the 72 | # user to update it. 73 | if model_type == 'transformer' and ("<GO>" not in d or d["<GO>"] != 1): 74 | logging.error('you must update \'{}\' for use with the ' 75 | '\'transformer\' model type. Please re-run ' 76 | 'build_dictionary.py to generate a new vocabulary ' 77 | 'dictionary.'.format(filename)) 78 | sys.exit(1) 79 | 80 | return d 81 | 82 | 83 | def seq2words(seq, inverse_dictionary, join=True): 84 | seq = numpy.array(seq, dtype='int64') 85 | assert len(seq.shape) == 1 86 | return factoredseq2words(seq.reshape([seq.shape[0], 1]), 87 | [inverse_dictionary], 88 | join) 89 | 90 | def factoredseq2words(seq, inverse_dictionaries, join=True): 91 | assert len(seq.shape) == 2 92 | assert len(inverse_dictionaries) == seq.shape[1] 93 | words = [] 94 | eos_reached = False 95 | for i, w in enumerate(seq): 96 | if eos_reached: 97 | break 98 | factors = [] 99 | for j, f in enumerate(w): 100 | if f == 0: 101 | eos_reached = True 102 | break 103 | # This assert has been commented out because it's possible for 104 | # non-zero values to follow zero values for Transformer models. 105 | # TODO Check why this happens 106 | #assert (i == len(seq) - 1) or (seq[i+1][j] == 0), \ 107 | # ('Zero not at the end of sequence', seq) 108 | elif f in inverse_dictionaries[j]: 109 | factors.append(inverse_dictionaries[j][f]) 110 | else: 111 | factors.append('UNK') 112 | word = '|'.join(factors) 113 | words.append(word) 114 | return ' '.join(words) if join else words 115 | 116 | def reverse_dict(dictt): 117 | keys, values = list(zip(*list(dictt.items()))) 118 | r_dictt = dict(list(zip(values, keys))) 119 | return r_dictt 120 | 121 | 122 | def load_dictionaries(config): 123 | model_type = config.model_type 124 | source_to_num = [load_dict(d, model_type) for d in config.source_dicts] 125 | target_to_num = load_dict(config.target_dict, model_type) 126 | num_to_source = [reverse_dict(d) for d in source_to_num] 127 | num_to_target = reverse_dict(target_to_num) 128 | return source_to_num, target_to_num, num_to_source, num_to_target 129 | 130 | 131 | def read_all_lines(config, sentences, batch_size): 132 | source_to_num, _, _, _ = load_dictionaries(config) 133 | 134 | if config.source_vocab_sizes != None: 135 | assert len(config.source_vocab_sizes) == len(source_to_num) 136 | for d, vocab_size in zip(source_to_num, config.source_vocab_sizes): 137 | if vocab_size != None and vocab_size > 0: 138 | for key, idx in list(d.items()): 139 | if idx >= vocab_size: 140 | del d[key] 141 | 142 | lines = [] 143 | for sent in sentences: 144 | line = [] 145 | for w in sent.strip().split(): 146 | if config.factors == 1: 147 | w = [source_to_num[0][w] if w in source_to_num[0] else 2] 148 | else: 149 | w = [source_to_num[i][f] if f in source_to_num[i] else 2 150 | for (i,f) in enumerate(w.split('|'))] 151 | if len(w) != config.factors: 152 | raise exception.Error( 153 | 'Expected {0} factors, but input word has {1}\n'.format( 154 | config.factors, len(w))) 155 | line.append(w) 156 | lines.append(line) 157 | lines = numpy.array(lines) 158 | lengths = numpy.array([len(l) for l in lines]) 159 | idxs = lengths.argsort() 160 | lines = lines[idxs] 161 | 162 | #merge into batches 163 | 
batches = [] 164 | for i in range(0, len(lines), batch_size): 165 | batch = lines[i:i+batch_size] 166 | batches.append(batch) 167 | 168 | return batches, idxs 169 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import io 5 | import setuptools 6 | 7 | setuptools.setup( 8 | name='nematus', 9 | version='0.5', 10 | description='Neural machine translation tools on top of Tensorflow', 11 | long_description=io.open(os.path.join(os.path.dirname( 12 | os.path.abspath(__file__)), 'README.md'),encoding='UTF-8').read(), 13 | license='BSD 3-clause', 14 | url='http://github.com/EdinburghNLP/nematus', 15 | install_requires=['numpy', 16 | 'tensorflow'], 17 | classifiers=['Development Status :: 3 - Alpha', 18 | 'Intended Audience :: Science/Research', 19 | 'License :: OSI Approved :: BSD License', 20 | 'Operating System :: OS Independent', 21 | 'Topic :: Scientific/Engineering'], 22 | packages = ['nematus', 'nematus.metrics'], 23 | ) 24 | -------------------------------------------------------------------------------- /test/README.md: -------------------------------------------------------------------------------- 1 | Testing Nematus 2 | --------------- 3 | 4 | To test translation (on GPU 0), execute 5 | 6 | CUDA_VISIBLE_DEVICES=0 python3 test_translate.py 7 | 8 | To test scoring (on GPU 0), execute 9 | 10 | CUDA_VISIBLE_DEVICES=0 python3 test_score.py 11 | 12 | more sample models (including scripts for pre- and postprocessing) 13 | are provided at: http://statmt.org/rsennrich/wmt16_systems/ 14 | 15 | to test training (on GPU 0), execute 16 | 17 | CUDA_VISIBLE_DEVICES=0 ./test_train.sh 18 | 19 | note that the training script is just a toy setup to make sure the scripts run, 20 | and to allow for speed comparisons. For instructions to train a 21 | real-scale system, check the instructions at https://github.com/rsennrich/wmt16-scripts 22 | -------------------------------------------------------------------------------- /test/en-de/in: -------------------------------------------------------------------------------- 1 | a Republican strategy to counter the re-election of Obama 2 | Republican leaders justified their policy by the need to combat electoral fraud . 3 | however , the Brenn@@ an Centre considers this a myth , stating that electoral fraud is rar@@ er in the United States than the number of people killed by lightning . 4 | indeed , Republican lawyers identified only 300 cases of electoral fraud in the United States in a decade . 5 | one thing is certain : these new provisions will have a negative impact on voter turn@@ -out . 6 | -------------------------------------------------------------------------------- /test/en-de/ref: -------------------------------------------------------------------------------- 1 | eine republi@@ kanische Strategie gegen die Wiederwahl Obamas 2 | 0.977844655514 0.90209954977 0.927412986755 0.984532177448 0.183520868421 0.907861471176 0.994144678116 0.917708992958 0.990146577358 3 | die republi@@ kanische Führung begründet ihre Politik mit der Notwendigkeit , Wahl@@ betrug zu bekämpfen . 
4 | 0.624975204468 0.467659324408 0.895200014114 0.922666728497 0.332508355379 0.962346553802 0.985188066959 0.511733949184 0.702501058578 0.733234107494 0.834280848503 0.298875242472 0.978177785873 0.962297916412 0.991670489311 0.998888731003 0.999692261219 5 | das Brenn@@ an Zentrum hält dies aber für einen Mythos , der besagt , dass Wahl@@ betrug in den USA seltener ist als die Zahl der getö@@ teten Menschen . 6 | 0.153531059623 0.871728599072 0.346277505159 0.747219443321 0.871806800365 0.120552673936 0.37667247653 0.782940626144 0.822250068188 0.98460739851 0.73440104723 0.481711357832 0.311930894852 0.961221635342 0.896834015846 0.427923560143 0.903929233551 0.673036038876 0.992655754089 0.739101171494 0.754340946674 0.522766292095 0.916598856449 0.96203070879 0.791576385498 0.890906095505 0.162579834461 0.99129909277 0.765361487865 0.619172334671 0.999593555927 7 | tatsächlich wurden in den USA in einem Jahrzehnt nur 300 Fälle von Wahl@@ betrug in den USA festgestellt . 8 | 0.874663293362 0.193072125316 0.830588340759 0.950349152088 0.536000072956 0.732309579849 0.601523339748 0.985651493073 0.771518468857 0.963857293129 0.582112908363 0.782780885696 0.960188984871 0.962329685688 0.735553085804 0.973220407963 0.69519174099 0.764474630356 0.998193442822 0.999425113201 9 | eines ist sicher : diese neuen Bestimmungen werden negative Auswirkungen auf die Wahlbeteiligung haben . 10 | 0.634134709835 0.78360158205 0.81129103899 0.985949218273 0.919415593147 0.925939559937 0.844495713711 0.82704269886 0.344317674637 0.952615022659 0.954769909382 0.629434704781 0.463058054447 0.923200011253 0.998686730862 0.999255955219 11 | -------------------------------------------------------------------------------- /test/en-de/ref2: -------------------------------------------------------------------------------- 1 | eine republi@@ kanische Strategie gegen die Wiederwahl Obamas 2 | die republi@@ kanische Führung begründet ihre Politik mit der Notwendigkeit , Wahl@@ betrug zu bekämpfen . 3 | das Brenn@@ an Zentrum hält dies aber für einen Mythos , der besagt , dass Wahl@@ betrug in den USA seltener ist als die Zahl der getö@@ teten Menschen . 4 | tatsächlich wurden in den USA in einem Jahrzehnt nur 300 Fälle von Wahl@@ betrug in den USA festgestellt . 5 | eines ist sicher : diese neuen Bestimmungen werden negative Auswirkungen auf die Wahlbeteiligung haben . 6 | -------------------------------------------------------------------------------- /test/en-de/ref_score: -------------------------------------------------------------------------------- 1 | eine republi@@ kanische Strategie , um der Wiederwahl von Obama entgegenzutreten 0.688558 2 | die Führungskräfte der Republikaner rechtfertigen ihre Politik mit der Notwendigkeit , den Wahl@@ betrug zu bekämpfen . 1.18311 3 | allerdings hält das Brenn@@ an Center letzteres für einen Mythos , indem es bekräftigt , dass der Wahl@@ betrug in den USA seltener ist als die Anzahl der vom Blitz@@ schlag getö@@ teten Menschen . 1.44055 4 | die Rechtsanwälte der Republikaner haben in 10 Jahren in den USA übrigens nur 300 Fälle von Wahl@@ betrug verzeichnet . 2.32595 5 | eins ist sicher : diese neuen Bestimmungen werden sich negativ auf die Wahlbeteiligung auswirken . 
0.40967 6 | -------------------------------------------------------------------------------- /test/en-de/references: -------------------------------------------------------------------------------- 1 | eine republi@@ kanische Strategie , um der Wiederwahl von Obama entgegenzutreten 2 | die Führungskräfte der Republikaner rechtfertigen ihre Politik mit der Notwendigkeit , den Wahl@@ betrug zu bekämpfen . 3 | allerdings hält das Brenn@@ an Center letzteres für einen Mythos , indem es bekräftigt , dass der Wahl@@ betrug in den USA seltener ist als die Anzahl der vom Blitz@@ schlag getö@@ teten Menschen . 4 | die Rechtsanwälte der Republikaner haben in 10 Jahren in den USA übrigens nur 300 Fälle von Wahl@@ betrug verzeichnet . 5 | eins ist sicher : diese neuen Bestimmungen werden sich negativ auf die Wahlbeteiligung auswirken . 6 | -------------------------------------------------------------------------------- /test/en-ro/in: -------------------------------------------------------------------------------- 1 | the European Commission decided on Tuesday to resume payments for Romania under the " Economic competitiveness " and " Environment " programs , both interrupted in early April 2015 . 2 | the judge did not rule on whether L@@ M@@ FAO 's song itself was an un@@ authorized copy of " H@@ ust@@ lin ' . " 3 | the Romanian national team is part of Group D in the World Cup in England , along with France , Ireland , Canada and Italy . 4 | it sends a message : your country does not value you becoming a parent . 5 | the round@@ about will be made at the appropriate time , we must consider the trams traffic in the area , and we also need an approval from the National Roads . 6 | -------------------------------------------------------------------------------- /test/en-ro/ref: -------------------------------------------------------------------------------- 1 | Comisia Europeană a decis , marți , să reia plățile pentru România în cadrul programelor " competitivitate economică " și " Mediu " , ambele întrerupte la începutul lunii aprilie 2015 . 2 | 0.995251238346 0.554548621178 0.986067473888 0.977536916733 0.471415698528 0.965951085091 0.991383254528 0.735538363457 0.99354493618 0.959721267223 0.960633397102 0.987248241901 0.73650187254 0.958207905293 0.329731225967 0.941679000854 0.48397654295 0.872097313404 0.995552778244 0.99405169487 0.820243418217 0.72900468111 0.978062391281 0.980996310711 0.959786713123 0.870699226856 0.956985473633 0.989414513111 0.948426306248 0.996526777744 0.996653676033 0.995466053486 0.999979257584 3 | judecătorul nu a exclus dacă melodia L@@ M@@ FAO în sine a fost o copie ne@@ autorizată a " H@@ ust@@ lin " " . 4 | 0.748930931091 0.976350605488 0.90377175808 0.238382071257 0.800515711308 0.51756888628 0.782619535923 0.955519676208 0.894009530544 0.183243229985 0.996174514294 0.782620131969 0.927685260773 0.802042484283 0.788843691349 0.390572547913 0.356075167656 0.823610961437 0.785067260265 0.941457808018 0.976138412952 0.979526996613 0.859899282455 0.516458272934 0.989753842354 0.999218225479 5 | naționala României face parte din Grupa D în Cupa Mondială din Anglia , alături de Franța , Irlanda , Canada și Italia . 
6 | 0.336522132158 0.97390460968 0.485618531704 0.998266816139 0.977845489979 0.972954690456 0.995464265347 0.582527756691 0.900587379932 0.904148697853 0.926693975925 0.990065574646 0.982615590096 0.970086634159 0.995798170567 0.985046744347 0.999237596989 0.992471039295 0.998591423035 0.994875609875 0.995780050755 0.996373534203 0.996117174625 0.99995225668 7 | transmite un mesaj : țara dumneavoastră nu apreciază că devine părinte . 8 | 0.343225359917 0.930076539516 0.998842597008 0.99683535099 0.859772562981 0.548672556877 0.990485429764 0.126094281673 0.79455691576 0.418934345245 0.782570242882 0.974693894386 0.999907135963 9 | discu@@ tia despre care se va face la momentul oportun , trebuie sa avem in vedere traficul de tramvaie din zona si avem nevoie si de o aprobare de la Compania Nationala de auto@@ str@@ azi . 10 | 0.0200441926718 0.983767747879 0.254417777061 0.626059830189 0.914376199245 0.6536039114 0.598313570023 0.473640501499 0.660739302635 0.727347791195 0.59266859293 0.936970472336 0.982369661331 0.226246803999 0.963698983192 0.996792733669 0.877014875412 0.466113090515 0.902705550194 0.558049559593 0.952391505241 0.783247053623 0.856589257717 0.994170725346 0.818028509617 0.973481237888 0.627151310444 0.944025158882 0.787506222725 0.954702436924 0.380503386259 0.954006671906 0.737255275249 0.340464830399 0.983342587948 0.980352401733 0.994605183601 0.99982637167 11 | -------------------------------------------------------------------------------- /test/en-ro/ref_score: -------------------------------------------------------------------------------- 1 | Comisia Europeana a luat marti decizia de a relua pl@@ atile pentru Romania în cadrul programelor " Competitivitate Econom@@ ica " și " Mediu " , ambele intre@@ rupte la inceputul lunii aprilie 2015 . 1.10127 2 | judecătoarea nu a hotărât dacă melodia trupei L@@ M@@ FAO este o copie neautorizată a lui " H@@ ust@@ lin ' " . 1.43826 3 | nationala " tricol@@ ora " face parte din Grupa D la Mondi@@ alul din Anglia , alaturi de Franta , Irlanda , Canada și Italia . 1.16586 4 | trimite un mesaj : țara ta nu pune vreo valoare pe faptul că vei deveni părinte . 2.04865 5 | Gir@@ ația va fi făcută la momentul potrivit , trebuie să ținem cont de circulația tramv@@ aielor în zonă , trebuie un aviz și de la Drumuri Naționale . 2.03933 6 | -------------------------------------------------------------------------------- /test/en-ro/references: -------------------------------------------------------------------------------- 1 | Comisia Europeana a luat marti decizia de a relua pl@@ atile pentru Romania în cadrul programelor " Competitivitate Econom@@ ica " și " Mediu " , ambele intre@@ rupte la inceputul lunii aprilie 2015 . 2 | judecătoarea nu a hotărât dacă melodia trupei L@@ M@@ FAO este o copie neautorizată a lui " H@@ ust@@ lin ' " . 3 | nationala " tricol@@ ora " face parte din Grupa D la Mondi@@ alul din Anglia , alaturi de Franta , Irlanda , Canada și Italia . 4 | trimite un mesaj : țara ta nu pune vreo valoare pe faptul că vei deveni părinte . 5 | Gir@@ ația va fi făcută la momentul potrivit , trebuie să ținem cont de circulația tramv@@ aielor în zonă , trebuie un aviz și de la Drumuri Naționale . 
6 | -------------------------------------------------------------------------------- /test/models/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | build 3 | dist 4 | nmt.egg-info 5 | -------------------------------------------------------------------------------- /test/test_score.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import os 5 | import unittest 6 | import logging 7 | 8 | sys.path.append(os.path.abspath('../nematus')) 9 | from score import main as score 10 | from settings import ScorerSettings 11 | from test_utils import load_wmt16_model 12 | 13 | level = logging.DEBUG 14 | logging.basicConfig(level=level, format='%(levelname)s: %(message)s') 15 | 16 | class TestScore(unittest.TestCase): 17 | """ 18 | Regression tests for scoring with WMT16 models 19 | """ 20 | 21 | def setUp(self): 22 | """ 23 | Download pre-trained models 24 | """ 25 | load_wmt16_model('en','de') 26 | 27 | def scoreEqual(self, output1, output2): 28 | """Given two files with translation scores, check that probabilities 29 | are equal within rounding error. 30 | """ 31 | with open(output1, 'r', encoding='utf-8') as out1, \ 32 | open(output2, 'r', encoding='utf-8') as out2: 33 | for (line1, line2) in zip(out1.readlines(), out2.readlines()): 34 | score1 = float(line1.split()[-1]) 35 | score2 = float(line2.split()[-1]) 36 | self.assertAlmostEqual(score1, score2, places=5) 37 | 38 | # English-German WMT16 system, no dropout 39 | def test_ende(self): 40 | os.chdir('models/en-de/') 41 | with open('../../en-de/in', 'r', encoding='utf-8') as in_file, \ 42 | open('../../en-de/references', 'r', encoding='utf-8') as ref_file, \ 43 | open('../../en-de/out_score', 'w', encoding='utf-8') as score_file: 44 | settings = ScorerSettings() 45 | settings.models = ['model.npz'] 46 | settings.minibatch_size = 80 47 | settings.normalization_alpha = 1.0 48 | score(in_file, ref_file, score_file, settings) 49 | os.chdir('../..') 50 | self.scoreEqual('en-de/ref_score', 'en-de/out_score') 51 | 52 | 53 | if __name__ == '__main__': 54 | unittest.main() 55 | -------------------------------------------------------------------------------- /test/test_train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # warning: this test is useful to check if training fails, and what speed you can achieve 4 | # the toy datasets are too small to obtain useful translation results, 5 | # and hyperparameters are chosen for speed, not for quality. 
6 | # For a setup that preprocesses and trains a larger data set, 7 | # check https://github.com/rsennrich/wmt16-scripts/tree/master/sample 8 | 9 | ../nematus/train.py \ 10 | --model models/model.npz \ 11 | --datasets data/corpus.en data/corpus.de \ 12 | --dictionaries data/vocab.en.json data/vocab.de.json \ 13 | --dim_word 256 \ 14 | --dim 512 \ 15 | --n_words_src 30000 \ 16 | --n_words 30000 \ 17 | --maxlen 50 \ 18 | --optimizer adam \ 19 | --lrate 0.0001 \ 20 | --batch_size 40 \ 21 | --no_shuffle \ 22 | --dispFreq 500 \ 23 | --finish_after 500 24 | -------------------------------------------------------------------------------- /test/test_train_l2_loss.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # warning: this test is useful to check if training fails, and what speed you can achieve 4 | # the toy datasets are too small to obtain useful translation results, 5 | # and hyperparameters are chosen for speed, not for quality. 6 | # For a setup that preprocesses and trains a larger data set, 7 | # check https://github.com/rsennrich/wmt16-scripts/tree/master/sample 8 | 9 | ../nematus/train.py \ 10 | --model models/model.npz \ 11 | --datasets data/corpus.en data/corpus.de \ 12 | --dictionaries data/vocab.en.json data/vocab.de.json \ 13 | --dim_word 256 \ 14 | --dim 512 \ 15 | --n_words_src 30000 \ 16 | --n_words 30000 \ 17 | --maxlen 50 \ 18 | --optimizer adam \ 19 | --lrate 0.0001 \ 20 | --batch_size 40 \ 21 | --no_shuffle \ 22 | --dispFreq 500 \ 23 | --finish_after 500 \ 24 | --decay_c 0.0001 25 | -------------------------------------------------------------------------------- /test/test_train_mapl2_loss.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # warning: this test is useful to check if training fails, and what speed you can achieve 4 | # the toy datasets are too small to obtain useful translation results, 5 | # and hyperparameters are chosen for speed, not for quality. 6 | # For a setup that preprocesses and trains a larger data set, 7 | # check https://github.com/rsennrich/wmt16-scripts/tree/master/sample 8 | 9 | ../nematus/train.py \ 10 | --model models/model.npz \ 11 | --datasets data/corpus.en data/corpus.de \ 12 | --dictionaries data/vocab.en.json data/vocab.de.json \ 13 | --dim_word 256 \ 14 | --dim 512 \ 15 | --n_words_src 30000 \ 16 | --n_words 30000 \ 17 | --maxlen 50 \ 18 | --optimizer adam \ 19 | --lrate 0.0001 \ 20 | --batch_size 40 \ 21 | --no_shuffle \ 22 | --dispFreq 500 \ 23 | --finish_after 500 \ 24 | --map_decay_c 0.0001 \ 25 | --prior_model models/model.npz-500 \ 26 | --reload latest_checkpoint 27 | -------------------------------------------------------------------------------- /test/test_train_outputactivations.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # warning: this test is useful to check if training fails, and what speed you can achieve 4 | # the toy datasets are too small to obtain useful translation results, 5 | # and hyperparameters are chosen for speed, not for quality. 
6 | # For a setup that preprocesses and trains a larger data set, 7 | # check https://github.com/rsennrich/wmt16-scripts/tree/master/sample 8 | 9 | ../nematus/train.py \ 10 | --model models/model.npz \ 11 | --datasets data/corpus.en data/corpus.de \ 12 | --dictionaries data/vocab.en.json data/vocab.de.json \ 13 | --dim_word 256 \ 14 | --dim 512 \ 15 | --n_words_src 30000 \ 16 | --n_words 30000 \ 17 | --maxlen 50 \ 18 | --optimizer adam \ 19 | --lrate 0.0001 \ 20 | --batch_size 40 \ 21 | --no_shuffle \ 22 | --dispFreq 500 \ 23 | --finish_after 500 \ 24 | --output_hidden_activation relu 25 | -------------------------------------------------------------------------------- /test/test_train_reload.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # warning: this test is useful to check if training fails, and what speed you can achieve 4 | # the toy datasets are too small to obtain useful translation results, 5 | # and hyperparameters are chosen for speed, not for quality. 6 | # For a setup that preprocesses and trains a larger data set, 7 | # check https://github.com/rsennrich/wmt16-scripts/tree/master/sample 8 | 9 | ../nematus/train.py \ 10 | --model models/model.npz \ 11 | --datasets data/corpus.en data/corpus.de \ 12 | --dictionaries data/vocab.en.json data/vocab.de.json \ 13 | --dim_word 256 \ 14 | --dim 512 \ 15 | --n_words_src 30000 \ 16 | --n_words 30000 \ 17 | --maxlen 50 \ 18 | --optimizer adam \ 19 | --lrate 0.0001 \ 20 | --batch_size 40 \ 21 | --no_shuffle \ 22 | --dispFreq 500 \ 23 | --finish_after 500 \ 24 | --reload latest_checkpoint 25 | -------------------------------------------------------------------------------- /test/test_train_summaries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # warning: this test is useful to check if training fails, and what speed you can achieve 4 | # the toy datasets are too small to obtain useful translation results, 5 | # and hyperparameters are chosen for speed, not for quality. 6 | # For a setup that preprocesses and trains a larger data set, 7 | # check https://github.com/rsennrich/wmt16-scripts/tree/master/sample 8 | 9 | ../nematus/train.py \ 10 | --model models/model.npz \ 11 | --datasets data/corpus.en data/corpus.de \ 12 | --dictionaries data/vocab.en.json data/vocab.de.json \ 13 | --dim_word 256 \ 14 | --dim 512 \ 15 | --n_words_src 30000 \ 16 | --n_words 30000 \ 17 | --maxlen 50 \ 18 | --optimizer adam \ 19 | --lrate 0.0001 \ 20 | --batch_size 40 \ 21 | --no_shuffle \ 22 | --dispFreq 500 \ 23 | --finish_after 500 \ 24 | --reload latest_checkpoint \ 25 | --summaryFreq 30000 26 | -------------------------------------------------------------------------------- /test/test_train_transformer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # warning: this test is useful to check if training fails, and what speed you can achieve 4 | # the toy datasets are too small to obtain useful translation results, 5 | # and hyperparameters are chosen for speed, not for quality. 
6 | # For a setup that preprocesses and trains a larger data set, 7 | # check https://github.com/rsennrich/wmt16-scripts/tree/master/sample 8 | 9 | ../nematus/train.py \ 10 | --model models/model.npz \ 11 | --datasets data/corpus.en data/corpus.de \ 12 | --dictionaries data/vocab.json data/vocab.json \ 13 | --n_words_src 10000 \ 14 | --n_words 10000 \ 15 | --model_type transformer \ 16 | --embedding_size 128 \ 17 | --tie_encoder_decoder_embeddings \ 18 | --tie_decoder_embeddings \ 19 | --state_size 128 \ 20 | --transformer_enc_depth 2 \ 21 | --transformer_dec_depth 2 \ 22 | --transformer_ffn_hidden_size 256 \ 23 | --loss_function per-token-cross-entropy \ 24 | --clip_c 0.0 \ 25 | --label_smoothing 0.1 \ 26 | --optimizer adam \ 27 | --adam_beta1 0.9 \ 28 | --adam_beta2 0.98 \ 29 | --adam_epsilon 1e-09 \ 30 | --learning_schedule transformer \ 31 | --warmup_steps 4000 \ 32 | --maxlen 100 \ 33 | --batch_size 300 \ 34 | --token_batch_size 3000 \ 35 | --disp_freq 500 \ 36 | --finish_after 500 37 | -------------------------------------------------------------------------------- /test/test_translate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import os 5 | import unittest 6 | import logging 7 | 8 | sys.path.append(os.path.abspath('../nematus')) 9 | from translate import main as translate 10 | from settings import TranslationSettings 11 | from test_utils import load_wmt16_model 12 | 13 | level = logging.DEBUG 14 | logging.basicConfig(level=level, format='%(levelname)s: %(message)s') 15 | 16 | class TestTranslate(unittest.TestCase): 17 | """ 18 | Regression tests for translation with WMT16 models 19 | """ 20 | 21 | def setUp(self): 22 | """ 23 | Download pre-trained models 24 | """ 25 | load_wmt16_model('en','de') 26 | 27 | def outputEqual(self, output1, output2): 28 | """given two translation outputs, check that output string is identical 29 | """ 30 | with open(output1, 'r', encoding='utf-8') as out1, \ 31 | open(output2, 'r', encoding='utf-8') as out2: 32 | for (line1, line2) in zip(out1.readlines(), out2.readlines()): 33 | self.assertEqual(line1.strip(), line2.strip()) 34 | 35 | # English-German WMT16 system, no dropout 36 | def test_ende(self): 37 | with open('en-de/in', 'r', encoding='utf-8') as in_file, \ 38 | open('en-de/out', 'w', encoding='utf-8') as out_file: 39 | os.chdir('models/en-de/') 40 | settings = TranslationSettings() 41 | settings.input = in_file 42 | settings.output = out_file 43 | settings.models = ["model.npz"] 44 | settings.beam_size = 12 45 | settings.normalization_alpha = 1.0 46 | translate(settings=settings) 47 | os.chdir('../..') 48 | self.outputEqual('en-de/ref2','en-de/out') 49 | 50 | 51 | if __name__ == '__main__': 52 | unittest.main() 53 | -------------------------------------------------------------------------------- /test/test_translate_sampling.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import os 5 | import unittest 6 | import logging 7 | 8 | sys.path.append(os.path.abspath('../nematus')) 9 | from translate import main as translate 10 | from settings import TranslationSettings 11 | from test_utils import load_wmt16_model 12 | 13 | level = logging.DEBUG 14 | logging.basicConfig(level=level, format='%(levelname)s: %(message)s') 15 | 16 | class TestTranslate(unittest.TestCase): 17 | """ 18 | Regression tests for translation with WMT16 models 19 | """ 20 | 21 | def setUp(self): 22 | """ 
23 | Download pre-trained models 24 | """ 25 | load_wmt16_model('en','de') 26 | 27 | def outputEqual(self, output1, output2): 28 | """given two translation outputs, check that output string is identical 29 | """ 30 | with open(output1, 'r', encoding='utf-8') as out1, \ 31 | open(output2, 'r', encoding='utf-8') as out2: 32 | for (line1, line2) in zip(out1.readlines(), out2.readlines()): 33 | self.assertEqual(line1.strip(), line2.strip()) 34 | 35 | # English-German WMT16 system, no dropout 36 | def test_ende(self): 37 | with open('en-de/in', 'r', encoding='utf-8') as in_file, \ 38 | open('en-de/out', 'w', encoding='utf-8') as out_file: 39 | os.chdir('models/en-de/') 40 | settings = TranslationSettings() 41 | settings.input = in_file 42 | settings.output = out_file 43 | settings.models = ["model.npz"] 44 | settings.beam_size = 12 45 | settings.normalization_alpha = 1.0 46 | settings.translation_strategy = 'sampling' 47 | settings.sampling_temperature = 0.4 48 | translate(settings=settings) 49 | os.chdir('../..') 50 | self.outputEqual('en-de/ref2','en-de/out') 51 | 52 | 53 | if __name__ == '__main__': 54 | unittest.main() 55 | -------------------------------------------------------------------------------- /test/test_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import os 5 | import requests 6 | from shutil import copyfile 7 | 8 | sys.path.append(os.path.abspath('../nematus')) 9 | from theano_tf_convert import theano_to_tensorflow_model 10 | 11 | def load_wmt16_model(src, target): 12 | path = os.path.join('models', '{0}-{1}'.format(src,target)) 13 | try: 14 | os.makedirs(path) 15 | except OSError: 16 | pass 17 | for filename in ['model.npz.json', 'model.npz', 'vocab.{0}.json'.format(src), 'vocab.{0}.json'.format(target)]: 18 | if not os.path.exists(os.path.join(path, filename)): 19 | if filename == 'model.npz' and os.path.exists(os.path.join(path, 'model.npz.index')): 20 | continue 21 | r = requests.get('http://data.statmt.org/rsennrich/wmt16_systems/{0}-{1}/'.format(src,target) + filename, stream=True) 22 | with open(os.path.join(path, filename), 'wb') as f: 23 | for chunk in r.iter_content(1024**2): 24 | f.write(chunk) 25 | 26 | # regression test is based on Theano model - convert to TF names 27 | if filename == 'model.npz.json' and not os.path.exists(os.path.join(path, 'model.npz.index')): 28 | copyfile(os.path.join(path, 'model.npz.json'), os.path.join(path, 'model-theano.npz.json')) 29 | elif filename == 'model.npz' and not os.path.exists(os.path.join(path, 'model.npz.index')): 30 | os.rename(os.path.join(path, 'model.npz'), os.path.join(path, 'model-theano.npz')) 31 | theano_to_tensorflow_model(os.path.join(path, 'model-theano.npz'), os.path.join(path, 'model.npz')) 32 | -------------------------------------------------------------------------------- /utils/copy_unknown_words.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This script is to replace the unknown words in target sentences with their aligned words in source sentences. 
3 | Args: 4 | - input: a text file (json format), each line 5 | including a full alignment matrix, a pair of source and target sentences 6 | - output (optional): updated text file (json format) 7 | - unknown word token (optional): a string, default="UNK" 8 | To use: 9 | python copy_unknown_words.py -i translation.txt -o updated_translation.txt -u 'UNK' 10 | ''' 11 | 12 | import json 13 | import numpy 14 | import argparse 15 | import sys 16 | 17 | ''' 18 | Example input file: 19 | {"id": 0, "prob": 0, "target_sent": "Obama empfängt Netanjahu", "matrix": [[0.9239920377731323, 0.04680762067437172, 0.003626488381996751, 0.02343202754855156, 0.0021418146789073944], [0.009942686185240746, 0.4995519518852234, 0.44341862201690674, 0.02077348716557026, 0.026313267648220062], [0.01032756082713604, 0.6475557088851929, 0.029476342722773552, 0.27724361419677734, 0.035396818071603775], [0.0010026689851656556, 0.35200807452201843, 0.06362949311733246, 0.4778701961040497, 0.1054895892739296]], "source_sent": "Obama kindly receives Netanjahu"} 20 | ''' 21 | 22 | def copy_unknown_words(filename, out_filename, unk_token): 23 | for line in filename: 24 | sent_pair = json.loads(line) 25 | # print "Translation:" 26 | # print sent_pair 27 | source_sent = sent_pair["source_sent"] 28 | target_sent = sent_pair["target_sent"] 29 | # matrix dimension: (len(target_sent) + 1) * (len(source_sent) + 1) 30 | # sum of values in a row = 1 31 | full_alignment = sent_pair["matrix"] 32 | source_words = source_sent.split() 33 | target_words = target_sent.split() 34 | # get the indices of maximum values in each row 35 | # (best alignment for each target word) 36 | hard_alignment = numpy.argmax(full_alignment, axis=1) 37 | # print hard_alignment 38 | 39 | updated_target_words = [] 40 | for j in range(len(target_words)): 41 | if target_words[j] == unk_token: 42 | unk_source = source_words[hard_alignment[j]] 43 | updated_target_words.append(unk_source) 44 | else: 45 | updated_target_words.append(target_words[j]) 46 | 47 | sent_pair["target_sent"] = " ".join(updated_target_words) 48 | # print "Updated translation:" 49 | # print sent_pair 50 | sent_pair = json.dumps(sent_pair).decode('unicode-escape').encode('utf8') 51 | print(sent_pair, file=out_filename) 52 | 53 | if __name__ == "__main__": 54 | parser = argparse.ArgumentParser() 55 | parser.add_argument('--input', '-i', type=argparse.FileType('r'), 56 | metavar='PATH', required=True, 57 | help='''Input text file in json format including alignment matrix, 58 | source sentences, target sentences''') 59 | parser.add_argument('--output', '-o', type=argparse.FileType('w'), 60 | default=sys.stdout, metavar='PATH', 61 | help="Output file (default: standard output)") 62 | parser.add_argument('--unknown', '-u', type=str, nargs = '?', default="UNK", 63 | help='Unknown token to be replaced (default: "UNK")') 64 | 65 | args = parser.parse_args() 66 | 67 | copy_unknown_words(args.input, args.output, args.unk) 68 | 69 | 70 | -------------------------------------------------------------------------------- /utils/plot_heatmap.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import matplotlib.pyplot as plt 3 | import sys 4 | import json 5 | import argparse 6 | 7 | # input: 8 | # alignment matrix - numpy array 9 | # shape (target tokens + eos, number of hidden source states = source tokens +eos) 10 | # one line correpsonds to one decoding step producing one target token 11 | # each line has the attention model weights corresponding to that 
decoding step 12 | # each float on a line is the attention model weight for a corresponding source state. 13 | # plot: a heat map of the alignment matrix 14 | # x axis are the source tokens (alignment is to source hidden state that roughly corresponds to a source token) 15 | # y axis are the target tokens 16 | 17 | # http://stackoverflow.com/questions/14391959/heatmap-in-matplotlib-with-pcolor 18 | def plot_head_map(mma, target_labels, source_labels): 19 | fig, ax = plt.subplots() 20 | heatmap = ax.pcolor(mma, cmap=plt.cm.Blues) 21 | 22 | # put the major ticks at the middle of each cell 23 | ax.set_xticks(numpy.arange(mma.shape[1])+0.5, minor=False) 24 | ax.set_yticks(numpy.arange(mma.shape[0])+0.5, minor=False) 25 | 26 | # without this I get some extra columns rows 27 | # http://stackoverflow.com/questions/31601351/why-does-this-matplotlib-heatmap-have-an-extra-blank-column 28 | ax.set_xlim(0, int(mma.shape[1])) 29 | ax.set_ylim(0, int(mma.shape[0])) 30 | 31 | # want a more natural, table-like display 32 | ax.invert_yaxis() 33 | ax.xaxis.tick_top() 34 | 35 | # source words -> column labels 36 | ax.set_xticklabels(source_labels, minor=False) 37 | # target words -> row labels 38 | ax.set_yticklabels(target_labels, minor=False) 39 | 40 | plt.xticks(rotation=45) 41 | 42 | #plt.tight_layout() 43 | plt.show() 44 | 45 | # column labels -> target words 46 | # row labels -> source words 47 | 48 | def read_alignment_matrix(f): 49 | header = f.readline().strip().split('|||') 50 | if header[0] == '': 51 | return None, None, None, None 52 | sid = int(header[0].strip()) 53 | # number of tokens in source and translation +1 for eos 54 | src_count, trg_count = map(int,header[-1].split()) 55 | # source words 56 | source_labels = header[3].decode('UTF-8').split() 57 | source_labels.append('') 58 | # target words 59 | target_labels = header[1].decode('UTF-8').split() 60 | target_labels.append('') 61 | 62 | mm = [] 63 | for r in range(trg_count): 64 | alignment = map(float,f.readline().strip().split()) 65 | mm.append(alignment) 66 | mma = numpy.array(mm) 67 | return sid,mma, target_labels, source_labels 68 | 69 | 70 | def read_plot_alignment_matrices(f, n): 71 | while(f): 72 | sid, mma, target_labels, source_labels = read_alignment_matrix(f) 73 | if mma is None: 74 | return 75 | if sid >n: 76 | return 77 | plot_head_map(mma, target_labels, source_labels) 78 | # empty line separating the matrices 79 | f.readline() 80 | 81 | 82 | """ 83 | Adding functions to read the json format. 84 | """ 85 | 86 | def read_plot_alignment_json(file, n): 87 | while (file): 88 | sid, mma, target_labels, source_labels = read_alignment_json(file) 89 | if mma is None: 90 | return 91 | if sid > n: 92 | return 93 | plot_head_map(mma, target_labels, source_labels) 94 | 95 | def read_alignment_json(file): 96 | data = file.readline() ##one line containing the json object. 97 | if len(data.strip()) == 0: 98 | return None, None, None, None 99 | jdata = json.loads(data) 100 | ## messy json encodings... 
TODO: make this better 101 | # (the decode/encode round-trip used under Python 2 is unnecessary in Python 3) 102 | #print jdata 103 | sid = int(jdata["id"]) 104 | mma = numpy.array(jdata["matrix"]) 105 | ##target words 106 | target_labels = jdata["target_sent"].split() 107 | target_labels.append('</s>') 108 | ##source words 109 | source_labels = jdata["source_sent"].split() 110 | source_labels.append('</s>') 111 | return sid,mma, target_labels, source_labels 112 | 113 | if __name__ == "__main__": 114 | 115 | parser = argparse.ArgumentParser() 116 | # '/Users/mnadejde/Documents/workspace/MTMA2016/models/wmt16_systems/en-de/test.alignment' 117 | parser.add_argument('--input', '-i', type=argparse.FileType('r'), 118 | default='/Users/mnadejde/Documents/workspace/MTMA2016/models/wmt16_systems/ro-en/newstest2016-roen-src.ro.alignment', metavar='PATH', 119 | help="Input file (default: standard input)") 120 | 121 | parser.add_argument('--json', '-j', required = False,action="store_true", 122 | help="If this option is used, then read alignment matrix from a Json formatted file.") 123 | args = parser.parse_args() 124 | 125 | if args.json: 126 | read_plot_alignment_json(args.input, 10) ##n is the maximum number of sentences to process. 127 | else: 128 | read_plot_alignment_matrices(args.input,10) 129 | 130 | -------------------------------------------------------------------------------- /utils/visualize_probs.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | 4 | # given a source sentence, a target sentence, and a sequence of probabilities (one per target word, plus an end-of-sentence probability), 5 | # visualize the probability of each target word via HTML output. 6 | # black fields indicate high confidence, light fields low confidence. 7 | # example input: 8 | """ 9 | Unsere digitalen Leben haben die Notwendigkeit, stark, lebenslustig und erfolgreich zu erscheinen, verdoppelt. 10 | Our digital lives have doubled the need to appear strong, lifel... ike and successful . 11 | 0.882218956947 0.989946246147 0.793388187885 0.790167689323 0.768674969673 0.941913545132 0.955783545971 0.777168631554 0.266917765141 0.909709095955 0.990240097046 0.341023534536 0.828059256077 0.854399263859 0.906807541847 0.960786998272 0.997184157372""" 12 | 13 | html_text = """ 15 | 16 | 17 | 18 | Results page 19 | 20 | 35 | 36 | \n 37 | \n 38 | 39 | 40 | {0} 41 | 
42 | 43 | 44 | 45 | """ 46 | 47 | 48 | def print_probdist(infile, outfile): 49 | 50 | entries = [] 51 | 52 | for i, line in enumerate(infile): 53 | if i % 3 == 0: 54 | #words = line.split() 55 | entry = "" 56 | #for w in words: 57 | #entry += "" + w + "\n" 58 | entry = "" + line + "\n" 59 | entries.append(entry) 60 | 61 | if i % 3 == 1: 62 | words = line.split() 63 | words.append('</s>') 64 | elif i % 3 == 2: 65 | probs = list(map(float, line.split())) 66 | entry = "" 67 | for w,p in zip(words, probs): 68 | color = '#%02x%02x%02x' % (int((1-p)*255), int((1-p)*255), int((1-p)*255)) 69 | entry += "{1}".format(color, w) 70 | entry = "" + entry + "\n" 71 | entries.append(entry) 72 | 73 | 74 | outfile.write(html_text.format('\n'.join(entries))) 75 | 76 | 77 | parser = argparse.ArgumentParser() 78 | parser.add_argument('--input', '-i', type=argparse.FileType('r'), 79 | default=sys.stdin, metavar='PATH', 80 | help="Input file (default: standard input)") 81 | parser.add_argument('--output', '-o', type=argparse.FileType('w'), 82 | default=sys.stdout, metavar='PATH', 83 | help="Output file (default: standard output)") 84 | 85 | args = parser.parse_args() 86 | 87 | print_probdist(args.input, args.output) --------------------------------------------------------------------------------