├── eval
│   ├── hyp
│   │   └── readme.md
│   └── eval.sh
├── models
│   └── readme.md
├── data
│   ├── corpus
│   │   └── readme.md
│   ├── postprocess.sh
│   ├── nonbreaking_prefixes
│   │   ├── README.txt
│   │   ├── nonbreaking_prefix.ro
│   │   ├── nonbreaking_prefix.sv
│   │   ├── nonbreaking_prefix.ca
│   │   ├── nonbreaking_prefix.sl
│   │   ├── nonbreaking_prefix.es
│   │   ├── nonbreaking_prefix.lv
│   │   ├── nonbreaking_prefix.fr
│   │   ├── nonbreaking_prefix.en
│   │   ├── nonbreaking_prefix.fi
│   │   ├── nonbreaking_prefix.hu
│   │   ├── nonbreaking_prefix.nl
│   │   ├── nonbreaking_prefix.is
│   │   ├── nonbreaking_prefix.it
│   │   ├── nonbreaking_prefix.ru
│   │   ├── nonbreaking_prefix.pl
│   │   ├── nonbreaking_prefix.pt
│   │   ├── nonbreaking_prefix.ta
│   │   ├── nonbreaking_prefix.de
│   │   ├── nonbreaking_prefix.cs
│   │   └── nonbreaking_prefix.sk
│   ├── length.py
│   ├── strip_sgml.py
│   ├── merge.sh
│   ├── preprocess.sh
│   ├── build_dictionary.py
│   ├── shuffle.py
│   └── multi-bleu.perl
├── nematus
│   ├── metrics
│   │   ├── __init__.py
│   │   ├── reference.py
│   │   ├── test_sentence_bleu.py
│   │   ├── test_scorer_provider.py
│   │   ├── scorer.py
│   │   ├── scorer_provider.py
│   │   ├── scorer_interpolator.py
│   │   ├── test_chrf.py
│   │   ├── beer.py
│   │   ├── sentence_bleu.py
│   │   ├── meteor.py
│   │   └── chrf.py
│   ├── shuffle.py
│   ├── __init__.py
│   ├── training_progress.py
│   ├── initializers.py
│   ├── util.py
│   ├── compat.py
│   ├── theano_util.py
│   ├── hypgraph.py
│   ├── data_iterator.py
│   ├── score.py
│   ├── rescore.py
│   ├── optimizers.py
│   ├── domain_interpolation_data_iterator.py
│   └── alignment_util.py
├── test
│   ├── .gitignore
│   ├── en-de
│   │   ├── in
│   │   ├── references
│   │   ├── ref_score
│   │   └── ref
│   ├── en-ro
│   │   ├── in
│   │   ├── references
│   │   ├── ref_score
│   │   └── ref
│   ├── README.md
│   ├── test_train.sh
│   ├── test_train_bigmem.sh
│   ├── test_train_verybigmem.sh
│   ├── test_train_domaininterpolation.sh
│   ├── test_score.py
│   ├── test_translate.py
│   └── data
│       ├── indomain-dev.en
│       └── indomain-dev.de
├── .gitignore
├── train_reverse_model.sh
├── runnmt.sh
├── runnmt_l8-fce.sh
├── runnmt_l8-fce-giga.sh
├── Dockerfile.cpu
├── setup.py
├── Dockerfile.gpu
├── LICENSE
├── doc
│   └── factored_neural_machine_translation.md
└── utils
    ├── copy_unknown_words.py
    ├── visualize_probs.py
    ├── plot_heatmap.py
    └── attention_web.php

/eval/hyp/readme.md:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/models/readme.md:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/data/corpus/readme.md:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/nematus/metrics/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/nematus/shuffle.py:
--------------------------------------------------------------------------------
../data/shuffle.py
--------------------------------------------------------------------------------
/test/.gitignore:
--------------------------------------------------------------------------------
*/out*
models
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
build
dist
nmt.egg-info
--------------------------------------------------------------------------------
/nematus/__init__.py:
--------------------------------------------------------------------------------
from nematus import *
import rescore
import translate
--------------------------------------------------------------------------------
/data/postprocess.sh:
--------------------------------------------------------------------------------
#!/bin/sh

# merges subword units that were split by BPE

sed -r 's/\@\@ //g'
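
# the sed above reads stdin and writes stdout; a typical invocation
# (file names are only illustrative) would be:
#   ./postprocess.sh < hyp.bpe > hyp.txt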
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/README.txt:
--------------------------------------------------------------------------------
The language suffix can be found here:

http://www.loc.gov/standards/iso639-2/php/code_list.php

This code includes data from Daniel Naber's Language Tools (Czech abbreviations).
This code includes data from the Czech Wiktionary (also Czech abbreviations).


--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.ro:
--------------------------------------------------------------------------------
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
dpdv
etc
șamd
M.Ap.N
dl
Dl
d-na
D-na
dvs
Dvs
pt
Pt
--------------------------------------------------------------------------------
/data/length.py:
--------------------------------------------------------------------------------
import numpy
import sys

# print the max/min/average sentence length (in tokens) for each input file
for name in sys.argv[1:]:
    lens = []
    with open(name, 'r') as f:
        for ll in f:
            lens.append(len(ll.strip().split(' ')))
    print name, ' max ', numpy.max(lens), ' min ', numpy.min(lens), ' average ', numpy.mean(lens)
--------------------------------------------------------------------------------
/data/strip_sgml.py:
--------------------------------------------------------------------------------
import sys
import re


def main():
    fin = sys.stdin
    fout = sys.stdout
    for l in fin:
        line = l.strip()
        text = re.sub('<[^<]+>', "", line).strip()
        if len(text) == 0:
            continue
        print >>fout, text


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.sv:
--------------------------------------------------------------------------------
#single upper-case letters are usually initials
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
#misc abbreviations
AB
G
VG
dvs
etc
from
iaf
jfr
kl
kr
mao
mfl
mm
osv
pga
tex
tom
vs
--------------------------------------------------------------------------------
/eval/eval.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# translate
THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=gpu1 python nematus/translate.py --models models/mle-l8-fce-giga.npz --input data/corpus/dev.src > hyp/hyp.tmp

# recaser

# capitalize the first char (if necessary)

# copy src sentences when longer than 50 words (do nothing)

# run GLEU.
python2 ../jfleg/eval/gleu.py -r ../jfleg/dev/dev.ref[0-3] -s ../jfleg/dev/dev.src --hyp ./hyp/hyp.tmp
--------------------------------------------------------------------------------
/data/merge.sh:
--------------------------------------------------------------------------------
#!/bin/bash


SRC=$1
TRG=$2

FSRC=all_${1}-${2}.${1}
FTRG=all_${1}-${2}.${2}

# truncate the output file rather than seeding it with an empty line,
# which would otherwise end up in the merged corpus
> $FSRC
for F in *${1}-${2}.${1}
do
    if [ "$F" = "$FSRC" ]; then
        echo "pass"
    else
        cat $F >> $FSRC
    fi
done


> $FTRG
for F in *${1}-${2}.${2}
do
    if [ "$F" = "$FTRG" ]; then
        echo "pass"
    else
        cat $F >> $FTRG
    fi
done
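
# a hypothetical invocation, run in a directory containing files such as
# europarl.fr-en.fr / europarl.fr-en.en:
#   ../merge.sh fr en
# concatenates every *fr-en.fr into all_fr-en.fr and every *fr-en.en into
# all_fr-en.en, keeping the two sides parallel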
--------------------------------------------------------------------------------
/test/en-de/in:
--------------------------------------------------------------------------------
a Republican strategy to counter the re-election of Obama
Republican leaders justified their policy by the need to combat electoral fraud .
however , the Brenn@@ an Centre considers this a myth , stating that electoral fraud is rar@@ er in the United States than the number of people killed by lightning .
indeed , Republican lawyers identified only 300 cases of electoral fraud in the United States in a decade .
one thing is certain : these new provisions will have a negative impact on voter turn@@ -out .
--------------------------------------------------------------------------------
/train_reverse_model.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# train from trg to src!
THEANO_FLAGS=mode=FAST_RUN,floatX=float32,optimizer_including=cudnn,device=gpu0 python nematus/nmt.py \
    --datasets ./data/corpus/train-esl.trg ./data/corpus/train-esl.src \
    --dictionaries ./data/corpus/train-esl.trg.json ./data/corpus/train-esl.src.json \
    --valid_datasets ./data/corpus/dev.trg ./data/corpus/dev.src \
    --model models/reverse.npz \
    --use_dropout \
    --maxlen 50 \
    --optimizer adam \
    --lrate 0.0001 \
    --batch_size 40 \
    --n_words_src 35000 \
    --n_words 35000
--------------------------------------------------------------------------------
/runnmt.sh:
--------------------------------------------------------------------------------
#!/bin/bash

#THEANO_FLAGS=mode=FAST_RUN,floatX=float32,optimizer_including=cudnn,device=gpu0 python nematus/nmt.py \
THEANO_FLAGS=mode=FAST_RUN,floatX=float32,optimizer_including=cudnn,device=gpu python nematus/nmt.py \
    --datasets data/corpus/train.lang8.src.shuf data/corpus/train.lang8.trg.shuf \
    --dictionaries data/corpus/train.lang8.src.shuf.json data/corpus/train.lang8.trg.shuf.json \
    --model models/mle.npz \
    --use_dropout \
    --maxlen 50 \
    --optimizer adam \
    --lrate 0.0001 \
    --batch_size 40 \
    --n_words_src 35000 \
    --n_words 35000
--------------------------------------------------------------------------------
/nematus/training_progress.py:
--------------------------------------------------------------------------------
'''
Training progress
'''

import sys
import json

import util

class TrainingProgress(object):
    '''
    Object used to store, serialize and deserialize pure python variables that change during training
    and should be preserved in order to properly restart the training process
    '''

    def load_from_json(self, file_name):
        self.__dict__.update(util.unicode_to_utf8(json.load(open(file_name, 'rb'))))

    def save_to_json(self, file_name):
        json.dump(self.__dict__, open(file_name, 'wb'), indent=2)
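
# a minimal usage sketch; the attribute names below are illustrative, since
# the training loop is free to store whatever fields it needs in __dict__:
#   progress = TrainingProgress()
#   progress.uidx = 0
#   progress.save_to_json('model.npz.progress.json')
#   progress.load_from_json('model.npz.progress.json')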
--------------------------------------------------------------------------------
/runnmt_l8-fce.sh:
--------------------------------------------------------------------------------
#!/bin/bash

#THEANO_FLAGS=mode=FAST_RUN,floatX=float32,optimizer_including=cudnn,device=gpu0 python nematus/nmt.py \
THEANO_FLAGS=mode=FAST_RUN,floatX=float32,optimizer_including=cudnn,device=gpu python nematus/nmt.py \
    --datasets data/corpus/train.l8-fce.src data/corpus/train.l8-fce.trg \
    --dictionaries data/corpus/train.l8-fce.src.json data/corpus/train.l8-fce.trg.json \
    --model models/mle-l8-fce.npz \
    --reload \
    --use_dropout \
    --maxlen 50 \
    --optimizer adam \
    --lrate 0.0001 \
    --batch_size 40 \
    --n_words_src 35000 \
    --n_words 35000
--------------------------------------------------------------------------------
/test/en-de/references:
--------------------------------------------------------------------------------
eine republi@@ kanische Strategie , um der Wiederwahl von Obama entgegenzutreten
die Führungskräfte der Republikaner rechtfertigen ihre Politik mit der Notwendigkeit , den Wahl@@ betrug zu bekämpfen .
allerdings hält das Brenn@@ an Center letzteres für einen Mythos , indem es bekräftigt , dass der Wahl@@ betrug in den USA seltener ist als die Anzahl der vom Blitz@@ schlag getö@@ teten Menschen .
die Rechtsanwälte der Republikaner haben in 10 Jahren in den USA übrigens nur 300 Fälle von Wahl@@ betrug verzeichnet .
eins ist sicher : diese neuen Bestimmungen werden sich negativ auf die Wahlbeteiligung auswirken .
--------------------------------------------------------------------------------
/nematus/initializers.py:
--------------------------------------------------------------------------------
'''
Parameter initializers
'''

import numpy

import theano
import theano.tensor as tensor
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

def ortho_weight(ndim):
    # the SVD of a random Gaussian matrix yields an orthogonal matrix
    W = numpy.random.randn(ndim, ndim)
    u, s, v = numpy.linalg.svd(W)
    return u.astype('float32')

def norm_weight(nin, nout=None, scale=0.01, ortho=True):
    # scaled Gaussian initialization; orthogonal for square matrices if requested
    if nout is None:
        nout = nin
    if nout == nin and ortho:
        W = ortho_weight(nin)
    else:
        W = scale * numpy.random.randn(nin, nout)
    return W.astype('float32')
--------------------------------------------------------------------------------
/runnmt_l8-fce-giga.sh:
--------------------------------------------------------------------------------
#!/bin/bash

#THEANO_FLAGS=mode=FAST_RUN,floatX=float32,optimizer_including=cudnn,device=gpu0 python nematus/nmt.py \
THEANO_FLAGS=mode=FAST_RUN,floatX=float32,optimizer_including=cudnn,device=gpu0 python nematus/nmt.py \
    --datasets data/corpus/train.l8-fce-giga.src data/corpus/train.l8-fce-giga.trg \
    --dictionaries data/corpus/train.l8-fce-giga.src.json data/corpus/train.l8-fce-giga.trg.json \
    --model models/mle-l8-fce-giga.npz \
    --reload \
    --use_dropout \
    --maxlen 50 \
    --optimizer adam \
    --lrate 0.0001 \
    --batch_size 40 \
    --n_words_src 35000 \
    --n_words 35000
--------------------------------------------------------------------------------
/test/en-de/ref_score:
--------------------------------------------------------------------------------
eine republi@@ kanische Strategie , um der Wiederwahl von Obama entgegenzutreten 0.688558
die Führungskräfte der Republikaner rechtfertigen ihre Politik mit der Notwendigkeit , den Wahl@@ betrug zu bekämpfen . 1.18311
allerdings hält das Brenn@@ an Center letzteres für einen Mythos , indem es bekräftigt , dass der Wahl@@ betrug in den USA seltener ist als die Anzahl der vom Blitz@@ schlag getö@@ teten Menschen . 1.44055
die Rechtsanwälte der Republikaner haben in 10 Jahren in den USA übrigens nur 300 Fälle von Wahl@@ betrug verzeichnet . 2.32595
eins ist sicher : diese neuen Bestimmungen werden sich negativ auf die Wahlbeteiligung auswirken . 0.40967
--------------------------------------------------------------------------------
/test/en-ro/in:
--------------------------------------------------------------------------------
the European Commission decided on Tuesday to resume payments for Romania under the " Economic competitiveness " and " Environment " programs , both interrupted in early April 2015 .
the judge did not rule on whether L@@ M@@ FAO 's song itself was an un@@ authorized copy of " H@@ ust@@ lin ' . "
the Romanian national team is part of Group D in the World Cup in England , along with France , Ireland , Canada and Italy .
it sends a message : your country does not value you becoming a parent .
the round@@ about will be made at the appropriate time , we must consider the trams traffic in the area , and we also need an approval from the National Roads .
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.ca:
--------------------------------------------------------------------------------
Dr
Dra
pàg
p
c
av
Sr
Sra
adm
esq
Prof
S.A
S.L
p.e
ptes
Sta
St
pl
màx
cast
dir
nre
fra
admdora
Emm
Excma
espf
dc
admdor
tel
angl
aprox
ca
dept
dj
dl
dt
ds
dg
dv
ed
entl
al
i.e
maj
smin
n
núm
pta
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
--------------------------------------------------------------------------------
/test/en-ro/references:
--------------------------------------------------------------------------------
Comisia Europeana a luat marti decizia de a relua pl@@ atile pentru Romania în cadrul programelor " Competitivitate Econom@@ ica " și " Mediu " , ambele intre@@ rupte la inceputul lunii aprilie 2015 .
judecătoarea nu a hotărât dacă melodia trupei L@@ M@@ FAO este o copie neautorizată a lui " H@@ ust@@ lin ' " .
nationala " tricol@@ ora " face parte din Grupa D la Mondi@@ alul din Anglia , alaturi de Franta , Irlanda , Canada și Italia .
trimite un mesaj : țara ta nu pune vreo valoare pe faptul că vei deveni părinte .
Gir@@ ația va fi făcută la momentul potrivit , trebuie să ținem cont de circulația tramv@@ aielor în zonă , trebuie un aviz și de la Drumuri Naționale .
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.sl:
--------------------------------------------------------------------------------
dr
Dr
itd
itn
št #NUMERIC_ONLY#
Št #NUMERIC_ONLY#
d
jan
Jan
feb
Feb
mar
Mar
apr
Apr
jun
Jun
jul
Jul
avg
Avg
sept
Sept
sep
Sep
okt
Okt
nov
Nov
dec
Dec
tj
Tj
npr
Npr
sl
Sl
op
Op
gl
Gl
oz
Oz
prev
dipl
ing
prim
Prim
cf
Cf
gl
Gl
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
--------------------------------------------------------------------------------
/test/en-ro/ref_score:
--------------------------------------------------------------------------------
Comisia Europeana a luat marti decizia de a relua pl@@ atile pentru Romania în cadrul programelor " Competitivitate Econom@@ ica " și " Mediu " , ambele intre@@ rupte la inceputul lunii aprilie 2015 . 1.10127
judecătoarea nu a hotărât dacă melodia trupei L@@ M@@ FAO este o copie neautorizată a lui " H@@ ust@@ lin ' " . 1.43826
nationala " tricol@@ ora " face parte din Grupa D la Mondi@@ alul din Anglia , alaturi de Franta , Irlanda , Canada și Italia . 1.16586
trimite un mesaj : țara ta nu pune vreo valoare pe faptul că vei deveni părinte . 2.04865
Gir@@ ația va fi făcută la momentul potrivit , trebuie să ținem cont de circulația tramv@@ aielor în zonă , trebuie un aviz și de la Drumuri Naționale . 2.03933
--------------------------------------------------------------------------------
/test/README.md:
--------------------------------------------------------------------------------
Testing Nematus
---------------

To test translation, execute

    THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=cpu python test_translate.py

To test scoring, execute

    THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=cpu python test_score.py

More sample models (including scripts for pre- and postprocessing)
are provided at: http://statmt.org/rsennrich/wmt16_systems/

To test training, execute

    THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=cpu ./test_train.sh

Note that the training script is just a toy setup to make sure the scripts run,
and to allow for speed comparisons. For instructions on training a
real-scale system, check https://github.com/rsennrich/wmt16-scripts
--------------------------------------------------------------------------------
/test/test_train.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# warning: this test is useful to check if training fails, and what speed you can achieve
# the toy datasets are too small to obtain useful translation results,
# and hyperparameters are chosen for speed, not for quality.
# For a setup that preprocesses and trains a larger data set,
# check https://github.com/rsennrich/wmt16-scripts/tree/master/sample

mkdir -p models

../nematus/nmt.py \
    --model models/model.npz \
    --datasets data/corpus.en data/corpus.de \
    --dictionaries data/vocab.en.json data/vocab.de.json \
    --dim_word 256 \
    --dim 512 \
    --n_words_src 30000 \
    --n_words 30000 \
    --maxlen 50 \
    --optimizer adam \
    --lrate 0.0001 \
    --batch_size 40 \
    --no_shuffle \
    --dispFreq 500 \
    --finish_after 500
--------------------------------------------------------------------------------
/test/test_train_bigmem.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# warning: this test is useful to check if training fails, and what speed you can achieve
# the toy datasets are too small to obtain useful translation results,
# and hyperparameters are chosen for speed, not for quality.
# For a setup that preprocesses and trains a larger data set,
# check https://github.com/rsennrich/wmt16-scripts/tree/master/sample

mkdir -p models

../nematus/nmt.py \
    --model models/model.npz \
    --datasets data/corpus.en data/corpus.de \
    --dictionaries data/vocab.en.json data/vocab.de.json \
    --dim_word 500 \
    --dim 1024 \
    --n_words_src 30000 \
    --n_words 30000 \
    --maxlen 50 \
    --optimizer adam \
    --lrate 0.0001 \
    --batch_size 80 \
    --no_shuffle \
    --dispFreq 500 \
    --finish_after 500
--------------------------------------------------------------------------------
/test/test_train_verybigmem.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# warning: this test is useful to check if training fails, and what speed you can achieve
# the toy datasets are too small to obtain useful translation results,
# and hyperparameters are chosen for speed, not for quality.
# For a setup that preprocesses and trains a larger data set,
# check https://github.com/rsennrich/wmt16-scripts/tree/master/sample

mkdir -p models

../nematus/nmt.py \
    --model models/model.npz \
    --datasets data/corpus.en data/corpus.de \
    --dictionaries data/vocab.en.json data/vocab.de.json \
    --dim_word 500 \
    --dim 2048 \
    --n_words_src 30000 \
    --n_words 30000 \
    --maxlen 50 \
    --optimizer adam \
    --lrate 0.0001 \
    --batch_size 80 \
    --no_shuffle \
    --dispFreq 500 \
    --finish_after 500
--------------------------------------------------------------------------------
/data/preprocess.sh:
--------------------------------------------------------------------------------
#!/bin/bash

P=$1

# source language (example: fr)
S=$2
# target language (example: en)
T=$3

# path to nematus/data
P1=$4

# path to subword NMT scripts (can be downloaded from https://github.com/rsennrich/subword-nmt)
P2=$5

# tokenize
perl $P1/tokenizer.perl -threads 5 -l $S < ${P}.${S} > ${P}.${S}.tok
perl $P1/tokenizer.perl -threads 5 -l $T < ${P}.${T} > ${P}.${T}.tok

# learn BPE on joint vocabulary:
cat ${P}.${S}.tok ${P}.${T}.tok | python $P2/learn_bpe.py -s 20000 > ${S}${T}.bpe

python $P2/apply_bpe.py -c ${S}${T}.bpe < ${P}.${S}.tok > ${P}.${S}.tok.bpe
python $P2/apply_bpe.py -c ${S}${T}.bpe < ${P}.${T}.tok > ${P}.${T}.tok.bpe

# build dictionary
python $P1/build_dictionary.py ${P}.${S}.tok.bpe
python $P1/build_dictionary.py ${P}.${T}.tok.bpe
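
# a hypothetical end-to-end run, assuming corpus.fr/corpus.en in the current
# directory, tokenizer.perl under nematus/data (as the script assumes), and a
# checkout of subword-nmt:
#   ./preprocess.sh corpus fr en /path/to/nematus/data /path/to/subword-nmt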
--------------------------------------------------------------------------------
/Dockerfile.cpu:
--------------------------------------------------------------------------------
FROM ubuntu:16.04
MAINTAINER Tom Kocmi

RUN apt-get update && apt-get install -y \
    cmake \
    git \
    python \
    python3 \
    vim \
    nano \
    python-dev \
    python-pip \
    python-pygraphviz \
    xml-twig-tools

RUN pip install --upgrade pip

RUN pip install numpy numexpr cython theano ipdb

RUN mkdir -p /path/to
WORKDIR /path/to/

# Install mosesdecoder
RUN git clone https://github.com/moses-smt/mosesdecoder

# Install subwords
RUN git clone https://github.com/rsennrich/subword-nmt

# Install nematus
COPY . /path/to/nematus
WORKDIR /path/to/nematus
RUN python setup.py install

WORKDIR /

# playground will contain user defined scripts, it should be run as:
# docker run -v `pwd`:/playground -it nematus-docker
RUN mkdir playground
WORKDIR /playground
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

import os
import setuptools

setuptools.setup(
    name='nematus',
    version='0.1dev',
    description='Neural machine translation tools on top of Theano',
    long_description=open(os.path.join(os.path.dirname(
        os.path.abspath(__file__)), 'README.md')).read(),
    license='BSD 3-clause',
    url='http://github.com/rsennrich/nematus',
    install_requires=['numpy',
                      'Theano',
                      'ipdb'],
    dependency_links=['git+http://github.com/Theano/Theano.git#egg=Theano',],
    classifiers=['Development Status :: 3 - Alpha',
                 'Intended Audience :: Science/Research',
                 'License :: OSI Approved :: BSD License',
                 'Operating System :: OS Independent',
                 'Topic :: Scientific/Engineering'],
    packages = ['nematus', 'nematus.metrics'],
)
--------------------------------------------------------------------------------
/data/build_dictionary.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

import numpy
import json

import sys
import fileinput

from collections import OrderedDict

def main():
    for filename in sys.argv[1:]:
        print 'Processing', filename
        word_freqs = OrderedDict()
        with open(filename, 'r') as f:
            for line in f:
                words_in = line.strip().split(' ')
                for w in words_in:
                    if w not in word_freqs:
                        word_freqs[w] = 0
                    word_freqs[w] += 1
        words = word_freqs.keys()
        freqs = word_freqs.values()

        sorted_idx = numpy.argsort(freqs)
        sorted_words = [words[ii] for ii in sorted_idx[::-1]]

        worddict = OrderedDict()
        worddict['eos'] = 0
        worddict['UNK'] = 1
        for ii, ww in enumerate(sorted_words):
            worddict[ww] = ii+2

        with open('%s.json'%filename, 'wb') as f:
            json.dump(worddict, f, indent=2, ensure_ascii=False)

    print 'Done'

if __name__ == '__main__':
    main()
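
# example (hypothetical file name): running
#   python build_dictionary.py train.en.tok.bpe
# writes train.en.tok.bpe.json, mapping 'eos' to 0, 'UNK' to 1, and the
# remaining vocabulary to ids in order of descending frequency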
27 | """ 28 | pass #to be implemented in sublcass 29 | 30 | def score_matrix(self, hypothesis_matrix): 31 | """ 32 | Scores every hypothesis in @param hypotheses against this reference. 33 | @param hypothesis_matrix an iterable of iterables of tokens. 34 | """ 35 | return [self.score(hypothesis_tokens) for hypothesis_tokens in hypothesis_matrix] 36 | -------------------------------------------------------------------------------- /nematus/util.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Utility functions 3 | ''' 4 | 5 | import sys 6 | import json 7 | import cPickle as pkl 8 | 9 | #json loads strings as unicode; we currently still work with Python 2 strings, and need conversion 10 | def unicode_to_utf8(d): 11 | return dict((key.encode("UTF-8"), value) for (key,value) in d.items()) 12 | 13 | def load_dict(filename): 14 | try: 15 | with open(filename, 'rb') as f: 16 | return unicode_to_utf8(json.load(f)) 17 | except: 18 | with open(filename, 'rb') as f: 19 | return pkl.load(f) 20 | 21 | 22 | def load_config(basename): 23 | try: 24 | with open('%s.json' % basename, 'rb') as f: 25 | return json.load(f) 26 | except: 27 | try: 28 | with open('%s.pkl' % basename, 'rb') as f: 29 | return pkl.load(f) 30 | except: 31 | sys.stderr.write('Error: config file {0}.json is missing\n'.format(basename)) 32 | sys.exit(1) 33 | 34 | 35 | def seqs2words(seq, inverse_target_dictionary): 36 | words = [] 37 | for w in seq: 38 | if w == 0: 39 | break 40 | if w in inverse_target_dictionary: 41 | words.append(inverse_target_dictionary[w]) 42 | else: 43 | words.append('UNK') 44 | return ' '.join(words) -------------------------------------------------------------------------------- /data/shuffle.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import random 4 | 5 | import tempfile 6 | from subprocess import call 7 | 8 | 9 | 10 | def main(files, temporary=False): 11 | 12 | tf_os, tpath = tempfile.mkstemp() 13 | tf = open(tpath, 'w') 14 | 15 | fds = [open(ff) for ff in files] 16 | 17 | for l in fds[0]: 18 | lines = [l.strip()] + [ff.readline().strip() for ff in fds[1:]] 19 | print >>tf, "|||".join(lines) 20 | 21 | [ff.close() for ff in fds] 22 | tf.close() 23 | 24 | lines = open(tpath, 'r').readlines() 25 | random.shuffle(lines) 26 | 27 | if temporary: 28 | fds = [] 29 | for ff in files: 30 | path, filename = os.path.split(os.path.realpath(ff)) 31 | fds.append(tempfile.TemporaryFile(prefix=filename+'.shuf', dir=path)) 32 | else: 33 | fds = [open(ff+'.shuf','w') for ff in files] 34 | 35 | for l in lines: 36 | s = l.strip().split('|||') 37 | for ii, fd in enumerate(fds): 38 | print >>fd, s[ii] 39 | 40 | if temporary: 41 | [ff.seek(0) for ff in fds] 42 | else: 43 | [ff.close() for ff in fds] 44 | 45 | os.close(tf_os) 46 | os.remove(tpath) 47 | 48 | return fds 49 | 50 | if __name__ == '__main__': 51 | main(sys.argv[1:]) 52 | 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /test/test_train_domaininterpolation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # warning: this test is useful to check if training fails, and what speed you can achieve 4 | # the toy datasets are too small to obtain useful translation results, 5 | # and hyperparameters are chosen for speed, not for quality. 
--------------------------------------------------------------------------------
/test/test_train_domaininterpolation.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# warning: this test is useful to check if training fails, and what speed you can achieve
# the toy datasets are too small to obtain useful translation results,
# and hyperparameters are chosen for speed, not for quality.
# For a setup that preprocesses and trains a larger data set,
# check https://github.com/rsennrich/wmt16-scripts/tree/master/sample

mkdir -p models

../nematus/nmt.py \
    --model models/model_domainadapt.npz \
    --datasets data/corpus.en data/corpus.de \
    --dictionaries data/vocab.en.json data/vocab.de.json \
    --dim_word 256 \
    --dim 512 \
    --n_words_src 30000 \
    --n_words 30000 \
    --maxlen 50 \
    --optimizer adam \
    --lrate 0.0001 \
    --batch_size 40 \
    --no_shuffle \
    --dispFreq 100 \
    --finish_after 50000 \
    --domain_interpolation_indomain_datasets data/indomain-corpus.en data/indomain-corpus.de \
    --domain_interpolation_min 0.5 \
    --domain_interpolation_max 1.0 \
    --domain_interpolation_inc 0.2 \
    --saveFreq 100 \
    --valid_datasets data/indomain-dev.en data/indomain-dev.de \
    --valid_batch_size 20 \
    --validFreq 100 \
    --patience 3 \
    --use_domain_interpolation \
#    --reload
--------------------------------------------------------------------------------
/nematus/metrics/test_sentence_bleu.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import unittest

from sentence_bleu import SentenceBleuScorer

class TestSentenceBleuReference(unittest.TestCase):
    """
    Regression tests for SmoothedBleuReference
    """
    @staticmethod
    def tokenize(sentence):
        return sentence.split(" ")
    def test_identical_segments(self):
        segment = self.tokenize("Consistency is the last refuge of the unimaginative")
        scorer = SentenceBleuScorer('n=4')
        scorer.set_reference(segment)
        self.assertEqual(scorer.score(segment), 1.0)
    def test_completely_different_segments(self):
        segment_a = self.tokenize("A A A")
        segment_b = self.tokenize("B B B")
        scorer = SentenceBleuScorer('n=4')
        scorer.set_reference(segment_a)
        self.assertEqual(scorer.score(segment_b), 0.0)
    def test_clipping(self):
        segment_a = self.tokenize("The very nice man")
        segment_b = self.tokenize("man man man man")
        scorer = SentenceBleuScorer('n=1')
        scorer.set_reference(segment_a)
        self.assertNotEqual(scorer.score(segment_b), 1.0)

if __name__ == '__main__':
    unittest.main()
--------------------------------------------------------------------------------
/Dockerfile.gpu:
--------------------------------------------------------------------------------
FROM nvidia/cuda:8.0-cudnn5-devel
MAINTAINER Tom Kocmi

# Install git, wget, python-dev, pip and other dependencies
RUN apt-get update && apt-get install -y \
    git \
    wget \
    cmake \
    vim \
    nano \
    python3 \
    libopenblas-dev \
    python-dev \
    python-pip \
    python-nose \
    python-numpy \
    python-scipy \
    python-pygraphviz \
    xml-twig-tools

RUN pip install --upgrade pip
RUN pip install -U setuptools
RUN pip install numexpr cython ipdb

# Set CUDA_ROOT
ENV CUDA_ROOT /usr/local/cuda/bin
# Install bleeding-edge Theano
RUN pip install --upgrade --no-deps theano
# Set up .theanorc for CUDA
RUN echo "[global]\ndevice=gpu\nfloatX=float32\noptimizer_including=cudnn\n[lib]\ncnmem=0.1\n[nvcc]\nfastmath=True" > /root/.theanorc


RUN mkdir -p /path/to
WORKDIR /path/to/

# Install mosesdecoder
RUN git clone https://github.com/moses-smt/mosesdecoder

# Install subwords
RUN git clone https://github.com/rsennrich/subword-nmt

# Install nematus
COPY . /path/to/nematus
WORKDIR /path/to/nematus
RUN python setup.py install

WORKDIR /

# playground will contain user defined scripts, it should be run as:
# nvidia-docker run -v `pwd`:/playground -it nematus-docker
RUN mkdir playground
WORKDIR /playground
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.es:
--------------------------------------------------------------------------------
#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
#Special cases are included for prefixes that ONLY appear before 0-9 numbers.

#any single upper case letter followed by a period is not a sentence ender
#usually upper case letters are initials in a name
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z

# Period-final abbreviation list from http://www.ctspanish.com/words/abbreviations.htm

A.C
Apdo
Av
Bco
CC.AA
Da
Dep
Dn
Dr
Dra
EE.UU
Excmo
FF.CC
Fil
Gral
J.C
Let
Lic
N.B
P.D
P.V.P
Prof
Pts
Rte
S.A
S.A.R
S.E
S.L
S.R.C
Sr
Sra
Srta
Sta
Sto
T.V.E
Tel
Ud
Uds
V.B
V.E
Vd
Vds
a/c
adj
admón
afmo
apdo
av
c
c.f
c.g
cap
cm
cta
dcha
doc
ej
entlo
esq
etc
f.c
gr
grs
izq
kg
km
mg
mm
núm
núm
p
p.a
p.ej
ptas
pág
págs
pág
págs
q.e.g.e
q.e.s.m
s
s.s.s
vid
vol
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright (c) 2015--2017 Nematus Development Team
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of Nematus nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

--------------------------------------------------------------------------------
/nematus/metrics/test_scorer_provider.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import unittest

from scorer_provider import ScorerProvider
from sentence_bleu import SentenceBleuScorer

class TestScorerProvider(unittest.TestCase):
    """
    Regression tests for ScorerProvider
    """
    @staticmethod
    def tokenize(sentence):
        return sentence.split(" ")

    def test_single_metric(self):
        config_string = "SENTENCEBLEU n=4"
        segment = self.tokenize("Consistency is the last refuge of the unimaginative")
        reference_scorer = SentenceBleuScorer('n=4')
        provided_scorer = ScorerProvider().get(config_string)
        reference_scorer.set_reference(segment)
        provided_scorer.set_reference(segment)
        self.assertEqual(
            reference_scorer.score(segment),
            provided_scorer.score(segment)
        )

    def test_interpolated_metrics(self):
        config_string = "INTERPOLATE w=0.3,0.7; SENTENCEBLEU n=4; SENTENCEBLEU n=4"
        segment = self.tokenize("Consistency is the last refuge of the unimaginative")
        reference_scorer = SentenceBleuScorer('n=4')
        # interpolating BLEU with BLEU should obviously result in the same as just using a single BLEU scorer
        provided_scorer = ScorerProvider().get(config_string)
        reference_scorer.set_reference(segment)
        provided_scorer.set_reference(segment)
        self.assertEqual(
            reference_scorer.score(segment),
            provided_scorer.score(segment)
        )


if __name__ == '__main__':
    unittest.main()
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.lv:
--------------------------------------------------------------------------------
#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
#Special cases are included for prefixes that ONLY appear before 0-9 numbers.

#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
#usually upper case letters are initials in a name
A
Ā
B
C
Č
D
E
Ē
F
G
Ģ
H
I
Ī
J
K
Ķ
L
Ļ
M
N
Ņ
O
P
Q
R
S
Š
T
U
Ū
V
W
X
Y
Z
Ž

#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
dr
Dr
med
prof
Prof
inž
Inž
ist.loc
Ist.loc
kor.loc
Kor.loc
v.i
vietn
Vietn

#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
a.l
t.p
pārb
Pārb
vec
Vec
inv
Inv
sk
Sk
spec
Spec
vienk
Vienk
virz
Virz
māksl
Māksl
mūz
Mūz
akad
Akad
soc
Soc
galv
Galv
vad
Vad
sertif
Sertif
folkl
Folkl
hum
Hum

#Numbers only. These should only induce breaks when followed by a numeric sequence
# add NUMERIC_ONLY after the word for this function
#This case is mostly for the english "No." which can either be a sentence of its own, or
#if followed by a number, a non-breaking prefix
Nr #NUMERIC_ONLY#
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.fr:
--------------------------------------------------------------------------------
#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
#
#any single upper case letter followed by a period is not a sentence ender
#usually upper case letters are initials in a name
#no French words end in single lower-case letters, so we throw those in too?
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z

# Period-final abbreviation list for French
A.C.N
A.M
art
ann
apr
av
auj
lib
B.P
boul
ca
c.-à-d
cf
ch.-l
chap
contr
C.P.I
C.Q.F.D
C.N
C.N.S
C.S
dir
éd
e.g
env
al
etc
E.V
ex
fasc
fém
fig
fr
hab
ibid
id
i.e
inf
LL.AA
LL.AA.II
LL.AA.RR
LL.AA.SS
L.D
LL.EE
LL.MM
LL.MM.II.RR
loc.cit
masc
MM
ms
N.B
N.D.A
N.D.L.R
N.D.T
n/réf
NN.SS
N.S
N.D
N.P.A.I
p.c.c
pl
pp
p.ex
p.j
P.S
R.A.S
R.-V
R.P
R.I.P
SS
S.S
S.A
S.A.I
S.A.R
S.A.S
S.E
sec
sect
sing
S.M
S.M.I.R
sq
sqq
suiv
sup
suppl
tél
T.S.V.P
vb
vol
vs
X.O
Z.I
--------------------------------------------------------------------------------
/test/en-de/ref:
--------------------------------------------------------------------------------
eine republi@@ kanische Strategie gegen die Wiederwahl Obamas
0.977844655514 0.90209954977 0.927412986755 0.984532177448 0.183520868421 0.907861471176 0.994144678116 0.917708992958 0.990146577358
die republi@@ kanische Führung begründet ihre Politik mit der Notwendigkeit , Wahl@@ betrug zu bekämpfen .
0.624975204468 0.467659324408 0.895200014114 0.922666728497 0.332508355379 0.962346553802 0.985188066959 0.511733949184 0.702501058578 0.733234107494 0.834280848503 0.298875242472 0.978177785873 0.962297916412 0.991670489311 0.998888731003 0.999692261219
das Brenn@@ an Zentrum hält dies aber für einen Mythos , der besagt , dass Wahl@@ betrug in den USA seltener ist als die Zahl der getö@@ teten Menschen .
0.153531059623 0.871728599072 0.346277505159 0.747219443321 0.871806800365 0.120552673936 0.37667247653 0.782940626144 0.822250068188 0.98460739851 0.73440104723 0.481711357832 0.311930894852 0.961221635342 0.896834015846 0.427923560143 0.903929233551 0.673036038876 0.992655754089 0.739101171494 0.754340946674 0.522766292095 0.916598856449 0.96203070879 0.791576385498 0.890906095505 0.162579834461 0.99129909277 0.765361487865 0.619172334671 0.999593555927
tatsächlich wurden in den USA in einem Jahrzehnt nur 300 Fälle von Wahl@@ betrug in den USA festgestellt .
0.874663293362 0.193072125316 0.830588340759 0.950349152088 0.536000072956 0.732309579849 0.601523339748 0.985651493073 0.771518468857 0.963857293129 0.582112908363 0.782780885696 0.960188984871 0.962329685688 0.735553085804 0.973220407963 0.69519174099 0.764474630356 0.998193442822 0.999425113201
eines ist sicher : diese neuen Bestimmungen werden negative Auswirkungen auf die Wahlbeteiligung haben .
0.634134709835 0.78360158205 0.81129103899 0.985949218273 0.919415593147 0.925939559937 0.844495713711 0.82704269886 0.344317674637 0.952615022659 0.954769909382 0.629434704781 0.463058054447 0.923200011253 0.998686730862 0.999255955219
--------------------------------------------------------------------------------
/nematus/compat.py:
--------------------------------------------------------------------------------
'''
Default options for backward compatibility
'''

#hacks for using old models with missing options (dict is modified in-place)
def fill_options(options):
    if not 'dropout_embedding' in options:
        options['dropout_embedding'] = 0
    if not 'dropout_hidden' in options:
        options['dropout_hidden'] = 0
    if not 'dropout_source' in options:
        options['dropout_source'] = 0
    if not 'dropout_target' in options:
        options['dropout_target'] = 0
    if not 'factors' in options:
        options['factors'] = 1
    if not 'dim_per_factor' in options:
        options['dim_per_factor'] = [options['dim_word']]
    if not 'model_version' in options:
        options['model_version'] = 0
    if not 'tie_encoder_decoder_embeddings' in options:
        options['tie_encoder_decoder_embeddings'] = False
    if not 'tie_decoder_embeddings' in options:
        options['tie_decoder_embeddings'] = False
    if not 'encoder_truncate_gradient' in options:
        options['encoder_truncate_gradient'] = -1
    if not 'decoder_truncate_gradient' in options:
        options['decoder_truncate_gradient'] = -1
    if not 'reload_training_progress' in options:
        options['reload_training_progress'] = True
    if not 'use_domain_interpolation' in options:
        options['use_domain_interpolation'] = False
    if not 'domain_interpolation_min' in options:
        options['domain_interpolation_min'] = 0.1
    if not 'domain_interpolation_max' in options:
        options['domain_interpolation_max'] = 1.0
    if not 'domain_interpolation_inc' in options:
        options['domain_interpolation_inc'] = 0.1
    if not 'domain_interpolation_indomain_datasets' in options:
        options['domain_interpolation_indomain_datasets'] = ['indomain.en', 'indomain.fr']
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.en:
--------------------------------------------------------------------------------
#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
#Special cases are included for prefixes that ONLY appear before 0-9 numbers.

#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
#usually upper case letters are initials in a name
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z

#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
Adj
Adm
Adv
Asst
Bart
Bldg
Brig
Bros
Capt
Cmdr
Col
Comdr
Con
Corp
Cpl
DR
Dr
Drs
Ens
Gen
Gov
Hon
Hr
Hosp
Insp
Lt
MM
MR
MRS
MS
Maj
Messrs
Mlle
Mme
Mr
Mrs
Ms
Msgr
Op
Ord
Pfc
Ph
Prof
Pvt
Rep
Reps
Res
Rev
Rt
Sen
Sens
Sfc
Sgt
Sr
St
Supt
Surg

#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
v
vs
i.e
rev
e.g

#Numbers only. These should only induce breaks when followed by a numeric sequence
# add NUMERIC_ONLY after the word for this function
#This case is mostly for the english "No." which can either be a sentence of its own, or
#if followed by a number, a non-breaking prefix
No #NUMERIC_ONLY#
Nos
Art #NUMERIC_ONLY#
Nr
pp #NUMERIC_ONLY#

#month abbreviations
Jan
Feb
Mar
Apr
#May is a full word
Jun
Jul
Aug
Sep
Oct
Nov
Dec
18 | """ 19 | # parse arguments 20 | self._reference = None # to be set via `self.set_reference()` 21 | self._arguments = {} 22 | if argument_string: 23 | argument_strings = argument_string.split(",") 24 | for a in argument_strings: 25 | argument, value = a.split("=") 26 | argument = argument.strip() 27 | value = value.strip() 28 | try: 29 | value = int(value) # change type to int if applicable 30 | except ValueError: 31 | value = value 32 | self._arguments[argument] = value 33 | 34 | @abstractmethod 35 | def set_reference(self, reference_tokens): 36 | """ 37 | Sets the reference against which one or many hypotheses can be scored 38 | via `self.score()` and `self.score_matrix()`. 39 | """ 40 | pass # instantiate a Reference object and store it at self._reference 41 | 42 | def score(self, hypothesis_tokens): 43 | """ 44 | Scores @param hypothesis against this reference. 45 | """ 46 | return self._reference.score(hypothesis_tokens) 47 | 48 | def score_matrix(self, hypothesis_matrix): 49 | """ 50 | Scores every hypothesis in @param hypotheses against this reference. 51 | @param hypothesis_matrix an iterable of iterables of tokens. 52 | """ 53 | return self._reference.score_matrix(hypothesis_matrix) 54 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.fi: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT 2 | #indicate an end-of-sentence marker. Special cases are included for prefixes 3 | #that ONLY appear before 0-9 numbers. 4 | 5 | #This list is compiled from omorfi database 6 | #by Tommi A Pirinen. 7 | 8 | 9 | #any single upper case letter followed by a period is not a sentence ender 10 | A 11 | B 12 | C 13 | D 14 | E 15 | F 16 | G 17 | H 18 | I 19 | J 20 | K 21 | L 22 | M 23 | N 24 | O 25 | P 26 | Q 27 | R 28 | S 29 | T 30 | U 31 | V 32 | W 33 | X 34 | Y 35 | Z 36 | Å 37 | Ä 38 | Ö 39 | 40 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 41 | alik 42 | alil 43 | amir 44 | apul 45 | apul.prof 46 | arkkit 47 | ass 48 | assist 49 | dipl 50 | dipl.arkkit 51 | dipl.ekon 52 | dipl.ins 53 | dipl.kielenk 54 | dipl.kirjeenv 55 | dipl.kosm 56 | dipl.urk 57 | dos 58 | erikoiseläinl 59 | erikoishammasl 60 | erikoisl 61 | erikoist 62 | ev.luutn 63 | evp 64 | fil 65 | ft 66 | hallinton 67 | hallintot 68 | hammaslääket 69 | jatk 70 | jääk 71 | kansaned 72 | kapt 73 | kapt.luutn 74 | kenr 75 | kenr.luutn 76 | kenr.maj 77 | kers 78 | kirjeenv 79 | kom 80 | kom.kapt 81 | komm 82 | konst 83 | korpr 84 | luutn 85 | maist 86 | maj 87 | Mr 88 | Mrs 89 | Ms 90 | M.Sc 91 | neuv 92 | nimim 93 | Ph.D 94 | prof 95 | puh.joht 96 | pääll 97 | res 98 | san 99 | siht 100 | suom 101 | sähköp 102 | säv 103 | toht 104 | toim 105 | toim.apul 106 | toim.joht 107 | toim.siht 108 | tuom 109 | ups 110 | vänr 111 | vääp 112 | ye.ups 113 | ylik 114 | ylil 115 | ylim 116 | ylimatr 117 | yliop 118 | yliopp 119 | ylip 120 | yliv 121 | 122 | #misc - odd period-ending items that NEVER indicate breaks (p.m. 
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.fi:
--------------------------------------------------------------------------------
#Anything in this file, followed by a period (and an upper-case word), does NOT
#indicate an end-of-sentence marker. Special cases are included for prefixes
#that ONLY appear before 0-9 numbers.

#This list is compiled from omorfi database
#by Tommi A Pirinen.


#any single upper case letter followed by a period is not a sentence ender
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
Å
Ä
Ö

#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
alik
alil
amir
apul
apul.prof
arkkit
ass
assist
dipl
dipl.arkkit
dipl.ekon
dipl.ins
dipl.kielenk
dipl.kirjeenv
dipl.kosm
dipl.urk
dos
erikoiseläinl
erikoishammasl
erikoisl
erikoist
ev.luutn
evp
fil
ft
hallinton
hallintot
hammaslääket
jatk
jääk
kansaned
kapt
kapt.luutn
kenr
kenr.luutn
kenr.maj
kers
kirjeenv
kom
kom.kapt
komm
konst
korpr
luutn
maist
maj
Mr
Mrs
Ms
M.Sc
neuv
nimim
Ph.D
prof
puh.joht
pääll
res
san
siht
suom
sähköp
säv
toht
toim
toim.apul
toim.joht
toim.siht
tuom
ups
vänr
vääp
ye.ups
ylik
ylil
ylim
ylimatr
yliop
yliopp
ylip
yliv

#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall
#into this category - it sometimes ends a sentence)
e.g
ent
esim
huom
i.e
ilm
l
mm
myöh
nk
nyk
par
po
t
v
--------------------------------------------------------------------------------
/doc/factored_neural_machine_translation.md:
--------------------------------------------------------------------------------
FACTORED NEURAL MACHINE TRANSLATION
-----------------------------------

Nematus supports arbitrary input features through factored representations, similar to the factored models popularized with Moses.
This can be used to add linguistic features such as lemmas, POS tags, or dependency labels, or potentially other types of information.
The pipe symbol "|" serves as a factor separator and should not otherwise appear in the text.

To use factored models, follow these steps:

- preprocess the source side of the training, development and test data to include factors. Consider this example sentence, in an unfactored (or 1-factored) representation, and with 4 factors per word:

    Leonidas begged in the gladiatorial arena .

    Leonidas|Leonidas|NNP|nsubj begged|beg|VBD|root in|in|IN|prep the|the|DT|det gladiatorial|gladiatorial|JJ|amod arena|arena|NN|pobj .|.|.|punct

  https://github.com/rsennrich/wmt16-scripts/tree/master/factored_sample provides sample scripts to produce a factored representation from a CoNLL file, and BPE-segmented text.

- in the arguments to nematus.nmt.train, adjust the following options (see the sketch after this list):
  - factors: the number of factors per word
  - dim_per_factor: the size of the embedding layer for each factor (a list of integers)
  - dim_word: the total size of the input embedding (must match the sum of dim_per_factor)
  - dictionaries: add a vocabulary file for each factor (in the order they appear), plus a vocabulary file for the target side

  an example config is shown at https://github.com/rsennrich/wmt16-scripts/blob/master/factored_sample/config.py

- commands for training and running Nematus are otherwise identical to the non-factored version
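
as an illustration, the 4-factor representation above could be trained with settings along these lines (the sizes and vocabulary file names are made up; the one hard constraint is that dim_word equals the sum of dim_per_factor):

    # hypothetical values: 500 = 400 + 50 + 25 + 25
    factored_options = dict(
        factors=4,
        dim_per_factor=[400, 50, 25, 25],
        dim_word=500,
        dictionaries=['vocab.words.json', 'vocab.lemmas.json',
                      'vocab.pos.json', 'vocab.deps.json',
                      'vocab.target.json'])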
These are often followed by upper-case names, but do not indicate sentence breaks 43 | Dr 44 | dr 45 | kb 46 | Kb 47 | vö 48 | Vö 49 | pl 50 | Pl 51 | ca 52 | Ca 53 | min 54 | Min 55 | max 56 | Max 57 | ún 58 | Ún 59 | prof 60 | Prof 61 | de 62 | De 63 | du 64 | Du 65 | Szt 66 | St 67 | 68 | #Numbers only. These should only induce breaks when followed by a numeric sequence 69 | # add NUMERIC_ONLY after the word for this function 70 | #This case is mostly for the english "No." which can either be a sentence of its own, or 71 | #if followed by a number, a non-breaking prefix 72 | 73 | # Month name abbreviations 74 | jan #NUMERIC_ONLY# 75 | Jan #NUMERIC_ONLY# 76 | Feb #NUMERIC_ONLY# 77 | feb #NUMERIC_ONLY# 78 | márc #NUMERIC_ONLY# 79 | Márc #NUMERIC_ONLY# 80 | ápr #NUMERIC_ONLY# 81 | Ápr #NUMERIC_ONLY# 82 | máj #NUMERIC_ONLY# 83 | Máj #NUMERIC_ONLY# 84 | jún #NUMERIC_ONLY# 85 | Jún #NUMERIC_ONLY# 86 | Júl #NUMERIC_ONLY# 87 | júl #NUMERIC_ONLY# 88 | aug #NUMERIC_ONLY# 89 | Aug #NUMERIC_ONLY# 90 | Szept #NUMERIC_ONLY# 91 | szept #NUMERIC_ONLY# 92 | okt #NUMERIC_ONLY# 93 | Okt #NUMERIC_ONLY# 94 | nov #NUMERIC_ONLY# 95 | Nov #NUMERIC_ONLY# 96 | dec #NUMERIC_ONLY# 97 | Dec #NUMERIC_ONLY# 98 | 99 | # Other abbreviations 100 | tel #NUMERIC_ONLY# 101 | Tel #NUMERIC_ONLY# 102 | Fax #NUMERIC_ONLY# 103 | fax #NUMERIC_ONLY# 104 | -------------------------------------------------------------------------------- /nematus/metrics/scorer_provider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import scorer_interpolator as si 5 | 6 | from sentence_bleu import SentenceBleuScorer 7 | from meteor import MeteorScorer 8 | from beer import BeerScorer 9 | from chrf import CharacterFScorer 10 | 11 | class ScorerProvider: 12 | """ 13 | Parses a config string and returns a matching scorer object with the given 14 | parameters. 15 | """ 16 | #from bleu import SentenceBleuScorer 17 | 18 | def __init__(self): 19 | pass 20 | 21 | def get(self, config_string): 22 | """ 23 | Returns a scorer matching the metric and parameters defined in @param 24 | config_string. 25 | 26 | Example: ScorerProvider.get("SENTENCEBLEU n=4") returns a SentenceBleuScorer 27 | object that considers n-gram precision up to n=4. 28 | 29 | If more than one metric is provided (separated by `;`), 30 | an interpolated scorer will be returned. 31 | 32 | Example: ScorerProvider.get("INTERPOLATE w=0.5,0.5; SENTENCEBLEU n=4; METEOR meteor_language=fr, meteor_path=/foo/bar/meteor") 33 | returns a ScorerInterpolator object that scores hypotheses 34 | using 0.5 * bleu_score + 0.5 * meteor_score.
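
        A minimal usage sketch (the token sequences are illustrative only;
        SENTENCEBLEU requires no external tools, so it is the simplest
        metric to try):

            scorer = ScorerProvider().get("SENTENCEBLEU n=4")
            scorer.set_reference("the cat sat on the mat".split())
            bleu = scorer.score("a cat sat on the mat".split())  # float between 0.0 and 1.0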
35 | """ 36 | # interpolation 37 | if config_string.startswith("INTERPOLATE"): 38 | return si.ScorerInterpolator(config_string) 39 | try: 40 | scorer, arguments = config_string.split(" ", 1) 41 | except ValueError: 42 | scorer = config_string 43 | arguments = '' 44 | if scorer == 'SENTENCEBLEU': 45 | return SentenceBleuScorer(arguments) 46 | elif scorer == 'METEOR': 47 | return MeteorScorer(arguments) 48 | elif scorer == 'BEER': 49 | return BeerScorer(arguments) 50 | elif scorer == 'CHRF': 51 | return CharacterFScorer(arguments) 52 | # add other scorers here 53 | else: 54 | raise NotImplementedError("No such scorer: %s" % scorer) 55 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.nl: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | #Sources: http://nl.wikipedia.org/wiki/Lijst_van_afkortingen 4 | # http://nl.wikipedia.org/wiki/Aanspreekvorm 5 | # http://nl.wikipedia.org/wiki/Titulatuur_in_het_Nederlands_hoger_onderwijs 6 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 7 | #usually upper case letters are initials in a name 8 | A 9 | B 10 | C 11 | D 12 | E 13 | F 14 | G 15 | H 16 | I 17 | J 18 | K 19 | L 20 | M 21 | N 22 | O 23 | P 24 | Q 25 | R 26 | S 27 | T 28 | U 29 | V 30 | W 31 | X 32 | Y 33 | Z 34 | 35 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 36 | bacc 37 | bc 38 | bgen 39 | c.i 40 | dhr 41 | dr 42 | dr.h.c 43 | drs 44 | drs 45 | ds 46 | eint 47 | fa 48 | Fa 49 | fam 50 | gen 51 | genm 52 | ing 53 | ir 54 | jhr 55 | jkvr 56 | jr 57 | kand 58 | kol 59 | lgen 60 | lkol 61 | Lt 62 | maj 63 | Mej 64 | mevr 65 | Mme 66 | mr 67 | mr 68 | Mw 69 | o.b.s 70 | plv 71 | prof 72 | ritm 73 | tint 74 | Vz 75 | Z.D 76 | Z.D.H 77 | Z.E 78 | Z.Em 79 | Z.H 80 | Z.K.H 81 | Z.K.M 82 | Z.M 83 | z.v 84 | 85 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 86 | #we seem to have a lot of these in dutch i.e.: i.p.v - in plaats van (in stead of) never ends a sentence 87 | a.g.v 88 | bijv 89 | bijz 90 | bv 91 | d.w.z 92 | e.c 93 | e.g 94 | e.k 95 | ev 96 | i.p.v 97 | i.s.m 98 | i.t.t 99 | i.v.m 100 | m.a.w 101 | m.b.t 102 | m.b.v 103 | m.h.o 104 | m.i 105 | m.i.v 106 | v.w.t 107 | 108 | #Numbers only. These should only induce breaks when followed by a numeric sequence 109 | # add NUMERIC_ONLY after the word for this function 110 | #This case is mostly for the english "No." 
which can either be a sentence of its own, or 111 | #if followed by a number, a non-breaking prefix 112 | Nr #NUMERIC_ONLY# 113 | Nrs 114 | nrs 115 | nr #NUMERIC_ONLY# 116 | -------------------------------------------------------------------------------- /test/test_score.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import os 6 | import unittest 7 | import requests 8 | 9 | sys.path.append(os.path.abspath('../nematus')) 10 | from score import main as score 11 | 12 | 13 | def load_wmt16_model(src, target): 14 | path = os.path.join('models', '{0}-{1}'.format(src,target)) 15 | try: 16 | os.makedirs(path) 17 | except OSError: 18 | pass 19 | for filename in ['model.npz', 'model.npz.json', 'vocab.{0}.json'.format(src), 'vocab.{0}.json'.format(target)]: 20 | if not os.path.exists(os.path.join(path, filename)): 21 | r = requests.get('http://data.statmt.org/rsennrich/wmt16_systems/{0}-{1}/'.format(src,target) + filename, stream=True) 22 | with open(os.path.join(path, filename), 'wb') as f: 23 | for chunk in r.iter_content(1024**2): 24 | f.write(chunk) 25 | 26 | class TestTranslate(unittest.TestCase): 27 | """ 28 | Regression tests for translation with WMT16 models 29 | """ 30 | 31 | def setUp(self): 32 | """ 33 | Download pre-trained models 34 | """ 35 | load_wmt16_model('en','de') 36 | load_wmt16_model('en','ro') 37 | 38 | def scoreEqual(self, output1, output2): 39 | """given two files with translation scores, check that probabilities are equal within rounding error. 40 | """ 41 | for i, (line, line2) in enumerate(zip(open(output1).readlines(), open(output2).readlines())): 42 | self.assertAlmostEqual(float(line.split()[-1]), float(line2.split()[-1]), 5) 43 | 44 | # English-German WMT16 system, no dropout 45 | def test_ende(self): 46 | os.chdir('models/en-de/') 47 | score(['model.npz'], open('../../en-de/in'), open('../../en-de/references'), open('../../en-de/out_score','w'), normalize=True) 48 | os.chdir('../..') 49 | self.scoreEqual('en-de/ref_score', 'en-de/out_score') 50 | 51 | # English-Romanian WMT16 system, dropout 52 | def test_enro(self): 53 | os.chdir('models/en-ro/') 54 | score(['model.npz'], open('../../en-ro/in'), open('../../en-ro/references'), open('../../en-ro/out_score','w'), normalize=True) 55 | os.chdir('../..') 56 | self.scoreEqual('en-ro/ref_score', 'en-ro/out_score') 57 | 58 | 59 | 60 | if __name__ == '__main__': 61 | unittest.main() 62 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.is: -------------------------------------------------------------------------------- 1 | no #NUMERIC_ONLY# 2 | No #NUMERIC_ONLY# 3 | nr #NUMERIC_ONLY# 4 | Nr #NUMERIC_ONLY# 5 | nR #NUMERIC_ONLY# 6 | NR #NUMERIC_ONLY# 7 | a 8 | b 9 | c 10 | d 11 | e 12 | f 13 | g 14 | h 15 | i 16 | j 17 | k 18 | l 19 | m 20 | n 21 | o 22 | p 23 | q 24 | r 25 | s 26 | t 27 | u 28 | v 29 | w 30 | x 31 | y 32 | z 33 | ^ 34 | í 35 | á 36 | ó 37 | æ 38 | A 39 | B 40 | C 41 | D 42 | E 43 | F 44 | G 45 | H 46 | I 47 | J 48 | K 49 | L 50 | M 51 | N 52 | O 53 | P 54 | Q 55 | R 56 | S 57 | T 58 | U 59 | V 60 | W 61 | X 62 | Y 63 | Z 64 | ab.fn 65 | a.fn 66 | afs 67 | al 68 | alm 69 | alg 70 | andh 71 | ath 72 | aths 73 | atr 74 | ao 75 | au 76 | aukaf 77 | áfn 78 | áhrl.s 79 | áhrs 80 | ákv.gr 81 | ákv 82 | bh 83 | bls 84 | dr 85 | e.Kr 86 | et 87 | ef 88 | efn 89 | ennfr 90 | eink 91 | end 92 | e.st 93 | erl 94 | fél 95 | fskj 96 | 
fh 97 | f.hl 98 | físl 99 | fl 100 | fn 101 | fo 102 | forl 103 | frb 104 | frl 105 | frh 106 | frt 107 | fsl 108 | fsh 109 | fs 110 | fsk 111 | fst 112 | f.Kr 113 | ft 114 | fv 115 | fyrrn 116 | fyrrv 117 | germ 118 | gm 119 | gr 120 | hdl 121 | hdr 122 | hf 123 | hl 124 | hlsk 125 | hljsk 126 | hljv 127 | hljóðv 128 | hr 129 | hv 130 | hvk 131 | holl 132 | Hos 133 | höf 134 | hk 135 | hrl 136 | ísl 137 | kaf 138 | kap 139 | Khöfn 140 | kk 141 | kg 142 | kk 143 | km 144 | kl 145 | klst 146 | kr 147 | kt 148 | kgúrsk 149 | kvk 150 | leturbr 151 | lh 152 | lh.nt 153 | lh.þt 154 | lo 155 | ltr 156 | mlja 157 | mljó 158 | millj 159 | mm 160 | mms 161 | m.fl 162 | miðm 163 | mgr 164 | mst 165 | mín 166 | nf 167 | nh 168 | nhm 169 | nl 170 | nk 171 | nmgr 172 | no 173 | núv 174 | nt 175 | o.áfr 176 | o.m.fl 177 | ohf 178 | o.fl 179 | o.s.frv 180 | ófn 181 | ób 182 | óákv.gr 183 | óákv 184 | pfn 185 | PR 186 | pr 187 | Ritstj 188 | Rvík 189 | Rvk 190 | samb 191 | samhlj 192 | samn 193 | samn 194 | sbr 195 | sek 196 | sérn 197 | sf 198 | sfn 199 | sh 200 | sfn 201 | sh 202 | s.hl 203 | sk 204 | skv 205 | sl 206 | sn 207 | so 208 | ss.us 209 | s.st 210 | samþ 211 | sbr 212 | shlj 213 | sign 214 | skál 215 | st 216 | st.s 217 | stk 218 | sþ 219 | teg 220 | tbl 221 | tfn 222 | tl 223 | tvíhlj 224 | tvt 225 | till 226 | to 227 | umr 228 | uh 229 | us 230 | uppl 231 | útg 232 | vb 233 | Vf 234 | vh 235 | vkf 236 | Vl 237 | vl 238 | vlf 239 | vmf 240 | 8vo 241 | vsk 242 | vth 243 | þt 244 | þf 245 | þjs 246 | þgf 247 | þlt 248 | þolm 249 | þm 250 | þml 251 | þýð 252 | -------------------------------------------------------------------------------- /nematus/metrics/scorer_interpolator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from scorer import Scorer 5 | import scorer_provider as sp 6 | 7 | class ScorerInterpolator(Scorer): 8 | """ 9 | Creates a scorer that interpolates scores from 1..n sub-scorers, e.g., 10 | 0.5 * SENTENCEBLEU + 0.5 * METEOR. 11 | """ 12 | 13 | def __init__(self, config_string): 14 | """ 15 | @param config_string example: 16 | `INTERPOLATE w=0.5,0.5; SENTENCEBLEU n=4; METEOR meteor_language=fr, meteor_path=/foo/bar/meteor` 17 | """ 18 | self._scorers = [] 19 | self._weights = [] 20 | # parse arguments 21 | scorers = config_string.split(";") 22 | scorers = [scorer.strip() for scorer in scorers] 23 | try: 24 | instruction, weights = scorers[0].split("w=") 25 | assert instruction.strip() == "INTERPOLATE" 26 | weights = [float(w) for w in weights.split(',')] 27 | scorers = [sp.ScorerProvider().get(s) for s in scorers[1:]] 28 | except: 29 | raise SyntaxError("Ill-formated interpolation of metrics. Example of valid definition: `INTERPOLATE w=0.5,0.5`.") 30 | # assertions 31 | assert len(weights) == len(scorers) 32 | assert sum(weights) == 1.0 33 | # init scorers 34 | for i, scorer in enumerate(scorers): 35 | self._scorers.append(scorer) 36 | self._weights.append(weights[i]) 37 | 38 | def set_reference(self, reference_tokens): 39 | """ 40 | Sets the reference against which one or many hypotheses can be scored 41 | via `self.score()` and `self.score_matrix()`. 42 | """ 43 | for scorer in self._scorers: 44 | scorer.set_reference(reference_tokens) 45 | 46 | def score(self, hypothesis_tokens): 47 | """ 48 | Scores @param hypothesis with all scorers added via `self.add_scorer` 49 | and interpolates the scores with the respective weights. 
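
        As a sketch, the sum below computes

            score(h) = w_1 * scorer_1.score(h) + ... + w_n * scorer_n.score(h)

        which is a convex combination of the sub-scorer outputs, since
        `__init__` asserts that the weights sum to 1.0.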
50 | """ 51 | return sum([s.score(hypothesis_tokens) * w for w, s in zip(self._weights, self._scorers)]) 52 | 53 | def score_matrix(self, hypothesis_matrix): 54 | """ 55 | Scores every hypothesis in @param hypotheses with all scorers added via 56 | `self.add_scorer` and interpolates the scores with the respective 57 | weights. 58 | """ 59 | return sum([s.score_matrix(hypothesis_matrix) * w for w, s in zip(self._weights, self._scorers)]) 60 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.it: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 34 | Adj 35 | Adm 36 | Adv 37 | Amn 38 | Arch 39 | Asst 40 | Avv 41 | Bart 42 | Bcc 43 | Bldg 44 | Brig 45 | Bros 46 | C.A.P 47 | C.P 48 | Capt 49 | Cc 50 | Cmdr 51 | Co 52 | Col 53 | Comdr 54 | Con 55 | Corp 56 | Cpl 57 | DR 58 | Dott 59 | Dr 60 | Drs 61 | Egr 62 | Ens 63 | Gen 64 | Geom 65 | Gov 66 | Hon 67 | Hosp 68 | Hr 69 | Id 70 | Ing 71 | Insp 72 | Lt 73 | MM 74 | MR 75 | MRS 76 | MS 77 | Maj 78 | Messrs 79 | Mlle 80 | Mme 81 | Mo 82 | Mons 83 | Mr 84 | Mrs 85 | Ms 86 | Msgr 87 | N.B 88 | Op 89 | Ord 90 | P.S 91 | P.T 92 | Pfc 93 | Ph 94 | Prof 95 | Pvt 96 | RP 97 | RSVP 98 | Rag 99 | Rep 100 | Reps 101 | Res 102 | Rev 103 | Rif 104 | Rt 105 | S.A 106 | S.B.F 107 | S.P.M 108 | S.p.A 109 | S.r.l 110 | Sen 111 | Sens 112 | Sfc 113 | Sgt 114 | Sig 115 | Sigg 116 | Soc 117 | Spett 118 | Sr 119 | St 120 | Supt 121 | Surg 122 | V.P 123 | 124 | # other 125 | a.c 126 | acc 127 | all 128 | banc 129 | c.a 130 | c.c.p 131 | c.m 132 | c.p 133 | c.s 134 | c.v 135 | corr 136 | dott 137 | e.p.c 138 | ecc 139 | es 140 | fatt 141 | gg 142 | int 143 | lett 144 | ogg 145 | on 146 | p.c 147 | p.c.c 148 | p.es 149 | p.f 150 | p.r 151 | p.v 152 | post 153 | pp 154 | racc 155 | ric 156 | s.n.c 157 | seg 158 | sgg 159 | ss 160 | tel 161 | u.s 162 | v.r 163 | v.s 164 | 165 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 166 | v 167 | vs 168 | i.e 169 | rev 170 | e.g 171 | 172 | #Numbers only. These should only induce breaks when followed by a numeric sequence 173 | # add NUMERIC_ONLY after the word for this function 174 | #This case is mostly for the english "No." 
which can either be a sentence of its own, or 175 | #if followed by a number, a non-breaking prefix 176 | No #NUMERIC_ONLY# 177 | Nos 178 | Art #NUMERIC_ONLY# 179 | Nr 180 | pp #NUMERIC_ONLY# 181 | -------------------------------------------------------------------------------- /test/en-ro/ref: -------------------------------------------------------------------------------- 1 | Comisia Europeană a decis , marți , să reia plățile pentru România în cadrul programelor " competitivitate economică " și " Mediu " , ambele întrerupte la începutul lunii aprilie 2015 . 2 | 0.995251238346 0.554548621178 0.986067473888 0.977536916733 0.471415698528 0.965951085091 0.991383254528 0.735538363457 0.99354493618 0.959721267223 0.960633397102 0.987248241901 0.73650187254 0.958207905293 0.329731225967 0.941679000854 0.48397654295 0.872097313404 0.995552778244 0.99405169487 0.820243418217 0.72900468111 0.978062391281 0.980996310711 0.959786713123 0.870699226856 0.956985473633 0.989414513111 0.948426306248 0.996526777744 0.996653676033 0.995466053486 0.999979257584 3 | judecătorul nu a exclus dacă melodia L@@ M@@ FAO în sine a fost o copie ne@@ autorizată a " H@@ ust@@ lin " " . 4 | 0.748930931091 0.976350605488 0.90377175808 0.238382071257 0.800515711308 0.51756888628 0.782619535923 0.955519676208 0.894009530544 0.183243229985 0.996174514294 0.782620131969 0.927685260773 0.802042484283 0.788843691349 0.390572547913 0.356075167656 0.823610961437 0.785067260265 0.941457808018 0.976138412952 0.979526996613 0.859899282455 0.516458272934 0.989753842354 0.999218225479 5 | naționala României face parte din Grupa D în Cupa Mondială din Anglia , alături de Franța , Irlanda , Canada și Italia . 6 | 0.336522132158 0.97390460968 0.485618531704 0.998266816139 0.977845489979 0.972954690456 0.995464265347 0.582527756691 0.900587379932 0.904148697853 0.926693975925 0.990065574646 0.982615590096 0.970086634159 0.995798170567 0.985046744347 0.999237596989 0.992471039295 0.998591423035 0.994875609875 0.995780050755 0.996373534203 0.996117174625 0.99995225668 7 | transmite un mesaj : țara dumneavoastră nu apreciază că devine părinte . 8 | 0.343225359917 0.930076539516 0.998842597008 0.99683535099 0.859772562981 0.548672556877 0.990485429764 0.126094281673 0.79455691576 0.418934345245 0.782570242882 0.974693894386 0.999907135963 9 | discu@@ tia despre care se va face la momentul oportun , trebuie sa avem in vedere traficul de tramvaie din zona si avem nevoie si de o aprobare de la Compania Nationala de auto@@ str@@ azi . 10 | 0.0200441926718 0.983767747879 0.254417777061 0.626059830189 0.914376199245 0.6536039114 0.598313570023 0.473640501499 0.660739302635 0.727347791195 0.59266859293 0.936970472336 0.982369661331 0.226246803999 0.963698983192 0.996792733669 0.877014875412 0.466113090515 0.902705550194 0.558049559593 0.952391505241 0.783247053623 0.856589257717 0.994170725346 0.818028509617 0.973481237888 0.627151310444 0.944025158882 0.787506222725 0.954702436924 0.380503386259 0.954006671906 0.737255275249 0.340464830399 0.983342587948 0.980352401733 0.994605183601 0.99982637167 11 | -------------------------------------------------------------------------------- /utils/copy_unknown_words.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | ''' 3 | This script is to replace the unknown words in target sentences with their aligned words in source sentences. 
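For each target word, the aligned source word is the one with the highest attention weight, i.e. the argmax over that target word's row of the soft-alignment matrix (see `copy_unknown_words()` below).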
4 | Args: 5 | - input: an alignment file produced by translating with the option '--output_alignment' 6 | - output: output text file 7 | - unknown word token (optional): a string, default="UNK" 8 | To use: 9 | python copy_unknown_words.py -i translation.txt -o updated_translation.txt -u 'UNK' 10 | ''' 11 | 12 | import json 13 | import numpy 14 | import argparse 15 | import sys 16 | 17 | ''' 18 | Example input file: 19 | 0 ||| das ist ein Test . ||| 0 ||| this is a UNK . ||| 6 6 20 | 0.723781 0.0561881 0.0652739 0.0888658 0.0159646 0.0499262 21 | 0.0250772 0.728351 0.105699 0.0764411 0.0245384 0.0398933 22 | 0.0257915 0.0667947 0.543118 0.177978 0.020311 0.166007 23 | 0.000306134 0.0161435 0.025201 0.937249 0.00364889 0.0174515 24 | 0.0116866 0.195885 0.0383414 0.0331976 0.437992 0.282897 25 | 0.0121966 0.00570636 0.00524746 0.014052 0.0325562 0.930241 26 | ''' 27 | 28 | def copy_unknown_words(filename, out_filename, unk_token): 29 | for line in filename: 30 | items = line.split(' ||| ') 31 | if len(items) > 1: 32 | src = items[1].split() 33 | target = items[3].split() 34 | alignments = [] 35 | elif line.strip(): 36 | alignment = map(float,line.split()) 37 | hard_alignment = numpy.argmax(alignment, axis=0) 38 | alignments.append(hard_alignment) 39 | elif line == '\n': 40 | print alignments 41 | for i, word in enumerate(target): 42 | if word == unk_token: 43 | target[i] = src[alignments[i]] 44 | out_filename.write(' '.join(target) + '\n') 45 | 46 | 47 | if __name__ == "__main__": 48 | parser = argparse.ArgumentParser() 49 | parser.add_argument('--input', '-i', type=argparse.FileType('r'), 50 | metavar='PATH', default=sys.stdin, 51 | help='''Input text file (produced by decoding with \'--output_alignment\')''') 52 | parser.add_argument('--output', '-o', type=argparse.FileType('w'), 53 | default=sys.stdout, metavar='PATH', 54 | help="Output file (default: standard output)") 55 | parser.add_argument('--unknown', '-u', type=str, nargs = '?', default="UNK", 56 | help='Unknown token to be replaced (default: "UNK")') 57 | 58 | args = parser.parse_args() 59 | 60 | copy_unknown_words(args.input, args.output, args.unknown) -------------------------------------------------------------------------------- /test/test_translate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import os 6 | import unittest 7 | import requests 8 | 9 | sys.path.append(os.path.abspath('../nematus')) 10 | from translate import main as translate 11 | 12 | 13 | def load_wmt16_model(src, target): 14 | path = os.path.join('models', '{0}-{1}'.format(src,target)) 15 | try: 16 | os.makedirs(path) 17 | except OSError: 18 | pass 19 | for filename in ['model.npz', 'model.npz.json', 'vocab.{0}.json'.format(src), 'vocab.{0}.json'.format(target)]: 20 | if not os.path.exists(os.path.join(path, filename)): 21 | r = requests.get('http://data.statmt.org/rsennrich/wmt16_systems/{0}-{1}/'.format(src,target) + filename, stream=True) 22 | with open(os.path.join(path, filename), 'wb') as f: 23 | for chunk in r.iter_content(1024**2): 24 | f.write(chunk) 25 | 26 | class TestTranslate(unittest.TestCase): 27 | """ 28 | Regression tests for translation with WMT16 models 29 | """ 30 | 31 | def setUp(self): 32 | """ 33 | Download pre-trained models 34 | """ 35 | load_wmt16_model('en','de') 36 | load_wmt16_model('en','ro') 37 | 38 | def outputEqual(self, output1, output2): 39 | """given two translation outputs, check that output string is 
identical, 40 | and probabilities are equal within rounding error. 41 | """ 42 | for i, (line, line2) in enumerate(zip(open(output1).readlines(), open(output2).readlines())): 43 | if not i % 2: 44 | self.assertEqual(line, line2) 45 | else: 46 | probs = map(float, line.split()) 47 | probs2 = map(float, line2.split()) 48 | for p, p2 in zip(probs, probs2): 49 | self.assertAlmostEqual(p, p2, 5) 50 | 51 | # English-German WMT16 system, no dropout 52 | def test_ende(self): 53 | os.chdir('models/en-de/') 54 | translate(['model.npz'], open('../../en-de/in'), open('../../en-de/out','w'), k=12, normalize=True, n_process=1, suppress_unk=True, print_word_probabilities=True) 55 | os.chdir('../..') 56 | self.outputEqual('en-de/ref','en-de/out') 57 | 58 | # English-Romanian WMT16 system, dropout 59 | def test_enro(self): 60 | os.chdir('models/en-ro/') 61 | translate(['model.npz'], open('../../en-ro/in'), open('../../en-ro/out','w'), k=12, normalize=True, n_process=1, suppress_unk=True, print_word_probabilities=True) 62 | os.chdir('../..') 63 | self.outputEqual('en-ro/ref','en-ro/out') 64 | 65 | 66 | 67 | if __name__ == '__main__': 68 | unittest.main() 69 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.ru: -------------------------------------------------------------------------------- 1 | # added Cyrillic uppercase letters [А-Я] 2 | # removed 000D carriage return (this is not removed by chomp in tokenizer.perl, and prevents recognition of the prefixes) 3 | # edited by Kate Young (nspaceanalysis@earthlink.net) 21 May 2013 4 | А 5 | Б 6 | В 7 | Г 8 | Д 9 | Е 10 | Ж 11 | З 12 | И 13 | Й 14 | К 15 | Л 16 | М 17 | Н 18 | О 19 | П 20 | Р 21 | С 22 | Т 23 | У 24 | Ф 25 | Х 26 | Ц 27 | Ч 28 | Ш 29 | Щ 30 | Ъ 31 | Ы 32 | Ь 33 | Э 34 | Ю 35 | Я 36 | A 37 | B 38 | C 39 | D 40 | E 41 | F 42 | G 43 | H 44 | I 45 | J 46 | K 47 | L 48 | M 49 | N 50 | O 51 | P 52 | Q 53 | R 54 | S 55 | T 56 | U 57 | V 58 | W 59 | X 60 | Y 61 | Z 62 | 0гг 63 | 1гг 64 | 2гг 65 | 3гг 66 | 4гг 67 | 5гг 68 | 6гг 69 | 7гг 70 | 8гг 71 | 9гг 72 | 0г 73 | 1г 74 | 2г 75 | 3г 76 | 4г 77 | 5г 78 | 6г 79 | 7г 80 | 8г 81 | 9г 82 | Xвв 83 | Vвв 84 | Iвв 85 | Lвв 86 | Mвв 87 | Cвв 88 | Xв 89 | Vв 90 | Iв 91 | Lв 92 | Mв 93 | Cв 94 | 0м 95 | 1м 96 | 2м 97 | 3м 98 | 4м 99 | 5м 100 | 6м 101 | 7м 102 | 8м 103 | 9м 104 | 0мм 105 | 1мм 106 | 2мм 107 | 3мм 108 | 4мм 109 | 5мм 110 | 6мм 111 | 7мм 112 | 8мм 113 | 9мм 114 | 0см 115 | 1см 116 | 2см 117 | 3см 118 | 4см 119 | 5см 120 | 6см 121 | 7см 122 | 8см 123 | 9см 124 | 0дм 125 | 1дм 126 | 2дм 127 | 3дм 128 | 4дм 129 | 5дм 130 | 6дм 131 | 7дм 132 | 8дм 133 | 9дм 134 | 0л 135 | 1л 136 | 2л 137 | 3л 138 | 4л 139 | 5л 140 | 6л 141 | 7л 142 | 8л 143 | 9л 144 | 0км 145 | 1км 146 | 2км 147 | 3км 148 | 4км 149 | 5км 150 | 6км 151 | 7км 152 | 8км 153 | 9км 154 | 0га 155 | 1га 156 | 2га 157 | 3га 158 | 4га 159 | 5га 160 | 6га 161 | 7га 162 | 8га 163 | 9га 164 | 0кг 165 | 1кг 166 | 2кг 167 | 3кг 168 | 4кг 169 | 5кг 170 | 6кг 171 | 7кг 172 | 8кг 173 | 9кг 174 | 0т 175 | 1т 176 | 2т 177 | 3т 178 | 4т 179 | 5т 180 | 6т 181 | 7т 182 | 8т 183 | 9т 184 | 0г 185 | 1г 186 | 2г 187 | 3г 188 | 4г 189 | 5г 190 | 6г 191 | 7г 192 | 8г 193 | 9г 194 | 0мг 195 | 1мг 196 | 2мг 197 | 3мг 198 | 4мг 199 | 5мг 200 | 6мг 201 | 7мг 202 | 8мг 203 | 9мг 204 | бульв 205 | в 206 | вв 207 | г 208 | га 209 | гг 210 | гл 211 | гос 212 | д 213 | дм 214 | доп 215 | др 216 | е 217 | ед 218 | ед 219 | зам 220 | и 221 | инд 222 | исп 223 | Исп 224 | к 225 | кап 226 | кг 227 |
кв 228 | кл 229 | км 230 | кол 231 | комн 232 | коп 233 | куб 234 | л 235 | лиц 236 | лл 237 | м 238 | макс 239 | мг 240 | мин 241 | мл 242 | млн 243 | млрд 244 | мм 245 | н 246 | наб 247 | нач 248 | неуд 249 | ном 250 | о 251 | обл 252 | обр 253 | общ 254 | ок 255 | ост 256 | отл 257 | п 258 | пер 259 | перераб 260 | пл 261 | пос 262 | пр 263 | просп 264 | проф 265 | р 266 | ред 267 | руб 268 | с 269 | сб 270 | св 271 | см 272 | соч 273 | ср 274 | ст 275 | стр 276 | т 277 | тел 278 | Тел 279 | тех 280 | тт 281 | туп 282 | тыс 283 | уд 284 | ул 285 | уч 286 | физ 287 | х 288 | хор 289 | ч 290 | чел 291 | шт 292 | экз 293 | э 294 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.pl: -------------------------------------------------------------------------------- 1 | adw 2 | afr 3 | akad 4 | al 5 | Al 6 | am 7 | amer 8 | arch 9 | art 10 | Art 11 | artyst 12 | astr 13 | austr 14 | bałt 15 | bdb 16 | bł 17 | bm 18 | br 19 | bryg 20 | bryt 21 | centr 22 | ces 23 | chem 24 | chiń 25 | chir 26 | c.k 27 | c.o 28 | cyg 29 | cyw 30 | cyt 31 | czes 32 | czw 33 | cd 34 | Cd 35 | czyt 36 | ćw 37 | ćwicz 38 | daw 39 | dcn 40 | dekl 41 | demokr 42 | det 43 | diec 44 | dł 45 | dn 46 | dot 47 | dol 48 | dop 49 | dost 50 | dosł 51 | h.c 52 | ds 53 | dst 54 | duszp 55 | dypl 56 | egz 57 | ekol 58 | ekon 59 | elektr 60 | em 61 | ew 62 | fab 63 | farm 64 | fot 65 | fr 66 | gat 67 | gastr 68 | geogr 69 | geol 70 | gimn 71 | głęb 72 | gm 73 | godz 74 | górn 75 | gosp 76 | gr 77 | gram 78 | hist 79 | hiszp 80 | hr 81 | Hr 82 | hot 83 | id 84 | in 85 | im 86 | iron 87 | jn 88 | kard 89 | kat 90 | katol 91 | k.k 92 | kk 93 | kol 94 | kl 95 | k.p.a 96 | kpc 97 | k.p.c 98 | kpt 99 | kr 100 | k.r 101 | krak 102 | k.r.o 103 | kryt 104 | kult 105 | laic 106 | łac 107 | niem 108 | woj 109 | nb 110 | np 111 | Nb 112 | Np 113 | pol 114 | pow 115 | m.in 116 | pt 117 | ps 118 | Pt 119 | Ps 120 | cdn 121 | jw 122 | ryc 123 | rys 124 | Ryc 125 | Rys 126 | tj 127 | tzw 128 | Tzw 129 | tzn 130 | zob 131 | ang 132 | ub 133 | ul 134 | pw 135 | pn 136 | pl 137 | al 138 | k 139 | n 140 | nr #NUMERIC_ONLY# 141 | Nr #NUMERIC_ONLY# 142 | ww 143 | wł 144 | ur 145 | zm 146 | żyd 147 | żarg 148 | żyw 149 | wył 150 | bp 151 | bp 152 | wyst 153 | tow 154 | Tow 155 | o 156 | sp 157 | Sp 158 | st 159 | spółdz 160 | Spółdz 161 | społ 162 | spółgł 163 | stoł 164 | stow 165 | Stoł 166 | Stow 167 | zn 168 | zew 169 | zewn 170 | zdr 171 | zazw 172 | zast 173 | zaw 174 | zał 175 | zal 176 | zam 177 | zak 178 | zakł 179 | zagr 180 | zach 181 | adw 182 | Adw 183 | lek 184 | Lek 185 | med 186 | mec 187 | Mec 188 | doc 189 | Doc 190 | dyw 191 | dyr 192 | Dyw 193 | Dyr 194 | inż 195 | Inż 196 | mgr 197 | Mgr 198 | dh 199 | dr 200 | Dh 201 | Dr 202 | p 203 | P 204 | red 205 | Red 206 | prof 207 | prok 208 | Prof 209 | Prok 210 | hab 211 | płk 212 | Płk 213 | nadkom 214 | Nadkom 215 | podkom 216 | Podkom 217 | ks 218 | Ks 219 | gen 220 | Gen 221 | por 222 | Por 223 | reż 224 | Reż 225 | przyp 226 | Przyp 227 | śp 228 | św 229 | śW 230 | Śp 231 | Św 232 | ŚW 233 | szer 234 | Szer 235 | pkt #NUMERIC_ONLY# 236 | str #NUMERIC_ONLY# 237 | tab #NUMERIC_ONLY# 238 | Tab #NUMERIC_ONLY# 239 | tel 240 | ust #NUMERIC_ONLY# 241 | par #NUMERIC_ONLY# 242 | poz 243 | pok 244 | oo 245 | oO 246 | Oo 247 | OO 248 | r #NUMERIC_ONLY# 249 | l #NUMERIC_ONLY# 250 | s #NUMERIC_ONLY# 251 | najśw 252 | Najśw 253 | A 254 | B 255 | C 256 | D 257 | E 258 | F 259 | G 260 | H 261 | I 262 | J 263 | K 264 | L 
265 | M 266 | N 267 | O 268 | P 269 | Q 270 | R 271 | S 272 | T 273 | U 274 | V 275 | W 276 | X 277 | Y 278 | Z 279 | Ś 280 | Ć 281 | Ż 282 | Ź 283 | Dz 284 | -------------------------------------------------------------------------------- /nematus/metrics/test_chrf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import unittest 5 | 6 | from chrf import CharacterFScorer 7 | 8 | class TestCharacterFScoreReference(unittest.TestCase): 9 | """ 10 | Regression tests for CharacterFScorer 11 | """ 12 | @staticmethod 13 | def tokenize(sentence): 14 | return sentence.split(" ") 15 | def test_identical_segments(self): 16 | segment = self.tokenize("Consistency is the last refuge of the unimaginative") 17 | scorer = CharacterFScorer('n=6,beta=3') 18 | scorer.set_reference(segment) 19 | self.assertEqual(scorer.score(segment), 1.0) 20 | def test_completely_different_segments(self): 21 | segment_a = self.tokenize("AAAAAA") 22 | segment_b = self.tokenize("BBBB") 23 | scorer = CharacterFScorer('n=3,beta=3') 24 | scorer.set_reference(segment_a) 25 | self.assertEqual(scorer.score(segment_b), 0.0) 26 | def test_empty_string(self): 27 | segment_a = self.tokenize("") 28 | segment_b = self.tokenize("") 29 | scorer = CharacterFScorer('n=6,beta=3') 30 | scorer.set_reference(segment_a) 31 | self.assertEqual(scorer.score(segment_b), 1.0) 32 | def test_one_character_empty_string(self): 33 | segment_a = self.tokenize("A") 34 | segment_b = self.tokenize("") 35 | scorer = CharacterFScorer('n=6,beta=3') 36 | scorer.set_reference(segment_a) 37 | self.assertEqual(scorer.score(segment_b), 0.0) 38 | def test_empty_string_one_character(self): 39 | segment_a = self.tokenize("") 40 | segment_b = self.tokenize("A") 41 | scorer = CharacterFScorer('n=6,beta=3') 42 | scorer.set_reference(segment_a) 43 | self.assertEqual(scorer.score(segment_b), 0.0) 44 | def test_half_right(self): 45 | segment_a = self.tokenize("AB") 46 | segment_b = self.tokenize("AA") 47 | scorer = CharacterFScorer('n=6,beta=3') 48 | scorer.set_reference(segment_a) 49 | self.assertEqual(scorer.score(segment_b), 0.25) 50 | def test_one_character(self): 51 | segment_a = self.tokenize("A") 52 | segment_b = self.tokenize("A") 53 | scorer = CharacterFScorer('n=6,beta=3') 54 | scorer.set_reference(segment_a) 55 | self.assertEqual(scorer.score(segment_b), 1.0) 56 | def test_almost_correct(self): 57 | segment_a = self.tokenize("risk assessment has to be undertaken by those who are qualified and expert in that area - that is the scientists .") 58 | segment_b = self.tokenize(" risk assessment must be made of those who are qualified and expertise in the sector - these are the scientists .") 59 | scorer = CharacterFScorer('n=6,beta=3') 60 | scorer.set_reference(segment_a) 61 | self.assertEqual('{0:.12f}'.format(scorer.score(segment_b)), "0.652414427449") 62 | 63 | if __name__ == '__main__': 64 | unittest.main() 65 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.pt: -------------------------------------------------------------------------------- 1 | #File adapted for PT by H. Leal Fontes from the EN & DE versions published with moses-2009-04-13. Last update: 10.11.2009. 2 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 3 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
4 | 5 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 6 | #usually upper case letters are initials in a name 7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | 61 | #Roman Numerals. A dot after one of these is not a sentence break in Portuguese. 62 | I 63 | II 64 | III 65 | IV 66 | V 67 | VI 68 | VII 69 | VIII 70 | IX 71 | X 72 | XI 73 | XII 74 | XIII 75 | XIV 76 | XV 77 | XVI 78 | XVII 79 | XVIII 80 | XIX 81 | XX 82 | i 83 | ii 84 | iii 85 | iv 86 | v 87 | vi 88 | vii 89 | viii 90 | ix 91 | x 92 | xi 93 | xii 94 | xiii 95 | xiv 96 | xv 97 | xvi 98 | xvii 99 | xviii 100 | xix 101 | xx 102 | 103 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 104 | Adj 105 | Adm 106 | Adv 107 | Art 108 | Ca 109 | Capt 110 | Cmdr 111 | Col 112 | Comdr 113 | Con 114 | Corp 115 | Cpl 116 | DR 117 | DRA 118 | Dr 119 | Dra 120 | Dras 121 | Drs 122 | Eng 123 | Enga 124 | Engas 125 | Engos 126 | Ex 127 | Exo 128 | Exmo 129 | Fig 130 | Gen 131 | Hosp 132 | Insp 133 | Lda 134 | MM 135 | MR 136 | MRS 137 | MS 138 | Maj 139 | Mrs 140 | Ms 141 | Msgr 142 | Op 143 | Ord 144 | Pfc 145 | Ph 146 | Prof 147 | Pvt 148 | Rep 149 | Reps 150 | Res 151 | Rev 152 | Rt 153 | Sen 154 | Sens 155 | Sfc 156 | Sgt 157 | Sr 158 | Sra 159 | Sras 160 | Srs 161 | Sto 162 | Supt 163 | Surg 164 | adj 165 | adm 166 | adv 167 | art 168 | cit 169 | col 170 | con 171 | corp 172 | cpl 173 | dr 174 | dra 175 | dras 176 | drs 177 | eng 178 | enga 179 | engas 180 | engos 181 | ex 182 | exo 183 | exmo 184 | fig 185 | op 186 | prof 187 | sr 188 | sra 189 | sras 190 | srs 191 | sto 192 | 193 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 194 | v 195 | vs 196 | i.e 197 | rev 198 | e.g 199 | 200 | #Numbers only. These should only induce breaks when followed by a numeric sequence 201 | # add NUMERIC_ONLY after the word for this function 202 | #This case is mostly for the english "No." which can either be a sentence of its own, or 203 | #if followed by a number, a non-breaking prefix 204 | No #NUMERIC_ONLY# 205 | Nos 206 | Art #NUMERIC_ONLY# 207 | Nr 208 | p #NUMERIC_ONLY# 209 | pp #NUMERIC_ONLY# 210 | 211 | -------------------------------------------------------------------------------- /utils/visualize_probs.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | 4 | # given a source sentence, a target sentence, and a sequence of probabilities (one per target word, plus an end-of-sentence probability), 5 | # visualize the probability of each target word via HTML output. 6 | # black fields indicate high confidence, light fields low confidence. 7 | # example input: 8 | """ 9 | Unsere digitalen Leben haben die Notwendigkeit, stark, lebenslustig und erfolgreich zu erscheinen, verdoppelt. 10 | Our digital lives have doubled the need to appear strong, lifel... ike and successful . 
11 | 0.882218956947 0.989946246147 0.793388187885 0.790167689323 0.768674969673 0.941913545132 0.955783545971 0.777168631554 0.266917765141 0.909709095955 0.990240097046 0.341023534536 0.828059256077 0.854399263859 0.906807541847 0.960786998272 0.997184157372""" 12 | 13 | html_text = """ 15 | 16 | 17 | 18 | Results page 19 | 20 | 35 | 36 | \n 37 | \n 38 | 39 | 40 | {0} 41 |
42 | 43 | 44 | 45 | """ 46 | 47 | 48 | def print_probdist(infile, outfile): 49 | 50 | entries = [] 51 | 52 | for i, line in enumerate(infile): 53 | if i % 3 == 0: 54 | #words = line.split() 55 | entry = "" 56 | #for w in words: 57 | #entry += "" + w + "\n" 58 | entry = "" + line + "\n" 59 | entries.append(entry) 60 | 61 | if i % 3 == 1: 62 | words = line.split() 63 | words.append('</s>') 64 | elif i % 3 == 2: 65 | probs = map(float, line.split()) 66 | entry = "" 67 | for w,p in zip(words, probs): 68 | color = '#%02x%02x%02x' % (int((1-p)*255), int((1-p)*255), int((1-p)*255)) 69 | entry += "{1}".format(color, w) 70 | entry = "" + entry + "\n" 71 | entries.append(entry) 72 | 73 | 74 | outfile.write(html_text.format('\n'.join(entries))) 75 | 76 | 77 | parser = argparse.ArgumentParser() 78 | parser.add_argument('--input', '-i', type=argparse.FileType('r'), 79 | default=sys.stdin, metavar='PATH', 80 | help="Input file (default: standard input)") 81 | parser.add_argument('--output', '-o', type=argparse.FileType('w'), 82 | default=sys.stdout, metavar='PATH', 83 | help="Output file (default: standard output)") 84 | 85 | args = parser.parse_args() 86 | 87 | print_probdist(args.input, args.output) -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.ta: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | அ 7 | ஆ 8 | இ 9 | ஈ 10 | உ 11 | ஊ 12 | எ 13 | ஏ 14 | ஐ 15 | ஒ 16 | ஓ 17 | ஔ 18 | ஃ 19 | க 20 | கா 21 | கி 22 | கீ 23 | கு 24 | கூ 25 | கெ 26 | கே 27 | கை 28 | கொ 29 | கோ 30 | கௌ 31 | க் 32 | ச 33 | சா 34 | சி 35 | சீ 36 | சு 37 | சூ 38 | செ 39 | சே 40 | சை 41 | சொ 42 | சோ 43 | சௌ 44 | ச் 45 | ட 46 | டா 47 | டி 48 | டீ 49 | டு 50 | டூ 51 | டெ 52 | டே 53 | டை 54 | டொ 55 | டோ 56 | டௌ 57 | ட் 58 | த 59 | தா 60 | தி 61 | தீ 62 | து 63 | தூ 64 | தெ 65 | தே 66 | தை 67 | தொ 68 | தோ 69 | தௌ 70 | த் 71 | ப 72 | பா 73 | பி 74 | பீ 75 | பு 76 | பூ 77 | பெ 78 | பே 79 | பை 80 | பொ 81 | போ 82 | பௌ 83 | ப் 84 | ற 85 | றா 86 | றி 87 | றீ 88 | று 89 | றூ 90 | றெ 91 | றே 92 | றை 93 | றொ 94 | றோ 95 | றௌ 96 | ற் 97 | ய 98 | யா 99 | யி 100 | யீ 101 | யு 102 | யூ 103 | யெ 104 | யே 105 | யை 106 | யொ 107 | யோ 108 | யௌ 109 | ய் 110 | ர 111 | ரா 112 | ரி 113 | ரீ 114 | ரு 115 | ரூ 116 | ரெ 117 | ரே 118 | ரை 119 | ரொ 120 | ரோ 121 | ரௌ 122 | ர் 123 | ல 124 | லா 125 | லி 126 | லீ 127 | லு 128 | லூ 129 | லெ 130 | லே 131 | லை 132 | லொ 133 | லோ 134 | லௌ 135 | ல் 136 | வ 137 | வா 138 | வி 139 | வீ 140 | வு 141 | வூ 142 | வெ 143 | வே 144 | வை 145 | வொ 146 | வோ 147 | வௌ 148 | வ் 149 | ள 150 | ளா 151 | ளி 152 | ளீ 153 | ளு 154 | ளூ 155 | ளெ 156 | ளே 157 | ளை 158 | ளொ 159 | ளோ 160 | ளௌ 161 | ள் 162 | ழ 163 | ழா 164 | ழி 165 | ழீ 166 | ழு 167 | ழூ 168 | ழெ 169 | ழே 170 | ழை 171 | ழொ 172 | ழோ 173 | ழௌ 174 | ழ் 175 | ங 176 | ஙா 177 | ஙி 178 | ஙீ 179 | ஙு 180 | ஙூ 181 | ஙெ 182 | ஙே 183 | ஙை 184 | ஙொ 185 | ஙோ 186 | ஙௌ 187 | ங் 188 | ஞ 189 | ஞா 190 | ஞி 191 | ஞீ 192 | ஞு 193 | ஞூ 194 | ஞெ 195 | ஞே 196 | ஞை 197 | ஞொ 198 | ஞோ 199 | ஞௌ 200 | ஞ் 201 | ண 202 | ணா 203 | ணி 204 | ணீ 205 | ணு 206 | ணூ 207 | ணெ 208 | ணே 209 | ணை 210 | ணொ 211 | ணோ 212 | ணௌ 213 | ண் 
214 | ந 215 | நா 216 | நி 217 | நீ 218 | நு 219 | நூ 220 | நெ 221 | நே 222 | நை 223 | நொ 224 | நோ 225 | நௌ 226 | ந் 227 | ம 228 | மா 229 | மி 230 | மீ 231 | மு 232 | மூ 233 | மெ 234 | மே 235 | மை 236 | மொ 237 | மோ 238 | மௌ 239 | ம் 240 | ன 241 | னா 242 | னி 243 | னீ 244 | னு 245 | னூ 246 | னெ 247 | னே 248 | னை 249 | னொ 250 | னோ 251 | னௌ 252 | ன் 253 | 254 | 255 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 256 | திரு 257 | திருமதி 258 | வண 259 | கௌரவ 260 | 261 | 262 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 263 | உ.ம் 264 | #கா.ம் 265 | #எ.ம் 266 | 267 | 268 | #Numbers only. These should only induce breaks when followed by a numeric sequence 269 | # add NUMERIC_ONLY after the word for this function 270 | #This case is mostly for the english "No." which can either be a sentence of its own, or 271 | #if followed by a number, a non-breaking prefix 272 | No #NUMERIC_ONLY# 273 | Nos 274 | Art #NUMERIC_ONLY# 275 | Nr 276 | pp #NUMERIC_ONLY# 277 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.de: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | #no german words end in single lower-case letters, so we throw those in too. 7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | 61 | #Roman Numerals. A dot after one of these is not a sentence break in German. 
62 | I 63 | II 64 | III 65 | IV 66 | V 67 | VI 68 | VII 69 | VIII 70 | IX 71 | X 72 | XI 73 | XII 74 | XIII 75 | XIV 76 | XV 77 | XVI 78 | XVII 79 | XVIII 80 | XIX 81 | XX 82 | i 83 | ii 84 | iii 85 | iv 86 | v 87 | vi 88 | vii 89 | viii 90 | ix 91 | x 92 | xi 93 | xii 94 | xiii 95 | xiv 96 | xv 97 | xvi 98 | xvii 99 | xviii 100 | xix 101 | xx 102 | 103 | #Titles and Honorifics 104 | Adj 105 | Adm 106 | Adv 107 | Asst 108 | Bart 109 | Bldg 110 | Brig 111 | Bros 112 | Capt 113 | Cmdr 114 | Col 115 | Comdr 116 | Con 117 | Corp 118 | Cpl 119 | DR 120 | Dr 121 | Ens 122 | Gen 123 | Gov 124 | Hon 125 | Hosp 126 | Insp 127 | Lt 128 | MM 129 | MR 130 | MRS 131 | MS 132 | Maj 133 | Messrs 134 | Mlle 135 | Mme 136 | Mr 137 | Mrs 138 | Ms 139 | Msgr 140 | Op 141 | Ord 142 | Pfc 143 | Ph 144 | Prof 145 | Pvt 146 | Rep 147 | Reps 148 | Res 149 | Rev 150 | Rt 151 | Sen 152 | Sens 153 | Sfc 154 | Sgt 155 | Sr 156 | St 157 | Supt 158 | Surg 159 | 160 | #Misc symbols 161 | Mio 162 | Mrd 163 | bzw 164 | v 165 | vs 166 | usw 167 | d.h 168 | z.B 169 | u.a 170 | etc 171 | Mrd 172 | MwSt 173 | ggf 174 | d.J 175 | D.h 176 | m.E 177 | vgl 178 | I.F 179 | z.T 180 | sogen 181 | ff 182 | u.E 183 | g.U 184 | g.g.A 185 | c.-à-d 186 | Buchst 187 | u.s.w 188 | sog 189 | u.ä 190 | Std 191 | evtl 192 | Zt 193 | Chr 194 | u.U 195 | o.ä 196 | Ltd 197 | b.A 198 | z.Zt 199 | spp 200 | sen 201 | SA 202 | k.o 203 | jun 204 | i.H.v 205 | dgl 206 | dergl 207 | Co 208 | zzt 209 | usf 210 | s.p.a 211 | Dkr 212 | Corp 213 | bzgl 214 | BSE 215 | 216 | #Number indicators 217 | # add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it 218 | No 219 | Nos 220 | Art 221 | Nr 222 | pp 223 | ca 224 | Ca 225 | 226 | #Ordinals are done with . in German - "1." = "1st" in English 227 | 1 228 | 2 229 | 3 230 | 4 231 | 5 232 | 6 233 | 7 234 | 8 235 | 9 236 | 10 237 | 11 238 | 12 239 | 13 240 | 14 241 | 15 242 | 16 243 | 17 244 | 18 245 | 19 246 | 20 247 | 21 248 | 22 249 | 23 250 | 24 251 | 25 252 | 26 253 | 27 254 | 28 255 | 29 256 | 30 257 | 31 258 | 32 259 | 33 260 | 34 261 | 35 262 | 36 263 | 37 264 | 38 265 | 39 266 | 40 267 | 41 268 | 42 269 | 43 270 | 44 271 | 45 272 | 46 273 | 47 274 | 48 275 | 49 276 | 50 277 | 51 278 | 52 279 | 53 280 | 54 281 | 55 282 | 56 283 | 57 284 | 58 285 | 59 286 | 60 287 | 61 288 | 62 289 | 63 290 | 64 291 | 65 292 | 66 293 | 67 294 | 68 295 | 69 296 | 70 297 | 71 298 | 72 299 | 73 300 | 74 301 | 75 302 | 76 303 | 77 304 | 78 305 | 79 306 | 80 307 | 81 308 | 82 309 | 83 310 | 84 311 | 85 312 | 86 313 | 87 314 | 88 315 | 89 316 | 90 317 | 91 318 | 92 319 | 93 320 | 94 321 | 95 322 | 96 323 | 97 324 | 98 325 | 99 326 | -------------------------------------------------------------------------------- /nematus/metrics/beer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import subprocess, threading 5 | from scorer import Scorer 6 | from reference import Reference 7 | 8 | class BeerError(Exception): 9 | def __init__(self, value): 10 | self.value = value 11 | def __str__(self): 12 | return repr(self.value) 13 | 14 | class BeerScorer(Scorer): 15 | """ 16 | Python wrapper for the BEER metric. Starts a BEER process and keeps it alive, so that the model 17 | can be kept in memeory. Arguments are the BEER language abbreviation and the path to the BEER 18 | installation. They need to be specified as follows:"beer_language=lg,beer_path=path" (any order). 
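    Example (the installation path is hypothetical; `reference_tokens` and
    `hypothesis_tokens` stand for lists of token strings):

        scorer = ScorerProvider().get("BEER beer_language=en,beer_path=/path/to/beer")
        scorer.set_reference(reference_tokens)
        score = scorer.score(hypothesis_tokens)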
19 | """ 20 | def __init__(self, argument_string): 21 | Scorer.__init__(self, argument_string) 22 | 23 | #Lock for the BEER process, which can only handle one request at a time: 24 | self.lock = threading.Lock() 25 | 26 | #Get necessary arguments for starting BEER from argument string parsed in Scorer.__init__() 27 | self._beer_language = self._arguments["beer_language"] 28 | self._beer_path = self._arguments["beer_path"] + "/" 29 | 30 | #Start a BEER process: 31 | command = self._beer_path+"beer -l "+self._beer_language+" --workingMode interactive " 32 | self.beer_process = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) 33 | 34 | def set_reference(self, reference_tokens): 35 | """ 36 | Construct a BeerReference from a sequence of tokens and make it the reference against which the scorer evaluates hypotheses. 37 | This can be done any time. 38 | """ 39 | self.lock.acquire() 40 | self._reference = BeerReference(reference_tokens, self) 41 | self.lock.release() 42 | 43 | def terminate_process(self): 44 | """ 45 | Waits for the current request to be processed and terminates the BEER process. 46 | """ 47 | self.lock.acquire() 48 | self.beer_process.terminate() 49 | self.lock.release() 50 | 51 | def kill_process(self): 52 | """ 53 | Kills the BEER process right away. 54 | """ 55 | self.beer_process.kill() 56 | 57 | class BeerReference(Reference): 58 | """ 59 | BEER reference object, against which hypotheses can be scored. 60 | """ 61 | def __init__(self, reference_tokens, beer_scorer): 62 | Reference.__init__(self, reference_tokens) 63 | 64 | #Construct reference string from tokens 65 | self._reference_string = " ".join(reference_tokens) 66 | self._beer_scorer = beer_scorer 67 | 68 | def score(self, hypothesis_tokens): 69 | 70 | #Construct hypothesis string from hypothesis tokens: 71 | hypothesis_string = " ".join(hypothesis_tokens) 72 | 73 | #Acquire lock to make sure BEER process is not in use: 74 | self._beer_scorer.lock.acquire() 75 | 76 | #Score hypothesis string against reference string 77 | try: 78 | self._beer_scorer.beer_process.stdin.write("EVAL ||| "+hypothesis_string+" ||| "+self._reference_string+"\n") 79 | except: 80 | raise BeerError("Beer returned the following error: "+ self._beer_scorer.beer_process.stderr.readline().strip()) 81 | 82 | #Read feature values from process output 83 | std_out = self._beer_scorer.beer_process.stdout.readline() 84 | #Release the process lock 85 | self._beer_scorer.lock.release() 86 | 87 | #Check if BEER returned a score: 88 | try: 89 | n = float(std_out) 90 | except: 91 | raise BeerError("Beer returned the following error: "+ self._beer_scorer.beer_process.stderr.readline().strip()) 92 | #Return final score 93 | return n -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.cs: -------------------------------------------------------------------------------- 1 | Bc 2 | BcA 3 | Ing 4 | Ing.arch 5 | MUDr 6 | MVDr 7 | MgA 8 | Mgr 9 | JUDr 10 | PhDr 11 | RNDr 12 | PharmDr 13 | ThLic 14 | ThDr 15 | Ph.D 16 | Th.D 17 | prof 18 | doc 19 | CSc 20 | DrSc 21 | dr. h. 
c 22 | PaedDr 23 | Dr 24 | PhMr 25 | DiS 26 | abt 27 | ad 28 | a.i 29 | aj 30 | angl 31 | anon 32 | apod 33 | atd 34 | atp 35 | aut 36 | bd 37 | biogr 38 | b.m 39 | b.p 40 | b.r 41 | cca 42 | cit 43 | cizojaz 44 | c.k 45 | col 46 | čes 47 | čín 48 | čj 49 | ed 50 | facs 51 | fasc 52 | fol 53 | fot 54 | franc 55 | h.c 56 | hist 57 | hl 58 | hrsg 59 | ibid 60 | il 61 | ind 62 | inv.č 63 | jap 64 | jhdt 65 | jv 66 | koed 67 | kol 68 | korej 69 | kl 70 | krit 71 | lat 72 | lit 73 | m.a 74 | maď 75 | mj 76 | mp 77 | násl 78 | např 79 | nepubl 80 | něm 81 | no 82 | nr 83 | n.s 84 | okr 85 | odd 86 | odp 87 | obr 88 | opr 89 | orig 90 | phil 91 | pl 92 | pokrač 93 | pol 94 | port 95 | pozn 96 | př.kr 97 | př.n.l 98 | přel 99 | přeprac 100 | příl 101 | pseud 102 | pt 103 | red 104 | repr 105 | resp 106 | revid 107 | rkp 108 | roč 109 | roz 110 | rozš 111 | samost 112 | sect 113 | sest 114 | seš 115 | sign 116 | sl 117 | srv 118 | stol 119 | sv 120 | šk 121 | šk.ro 122 | špan 123 | tab 124 | t.č 125 | tis 126 | tj 127 | tř 128 | tzv 129 | univ 130 | uspoř 131 | vol 132 | vl.jm 133 | vs 134 | vyd 135 | vyobr 136 | zal 137 | zejm 138 | zkr 139 | zprac 140 | zvl 141 | n.p 142 | např 143 | než 144 | MUDr 145 | abl 146 | absol 147 | adj 148 | adv 149 | ak 150 | ak. sl 151 | akt 152 | alch 153 | amer 154 | anat 155 | angl 156 | anglosas 157 | arab 158 | arch 159 | archit 160 | arg 161 | astr 162 | astrol 163 | att 164 | bás 165 | belg 166 | bibl 167 | biol 168 | boh 169 | bot 170 | bulh 171 | círk 172 | csl 173 | č 174 | čas 175 | čes 176 | dat 177 | děj 178 | dep 179 | dět 180 | dial 181 | dór 182 | dopr 183 | dosl 184 | ekon 185 | epic 186 | etnonym 187 | eufem 188 | f 189 | fam 190 | fem 191 | fil 192 | film 193 | form 194 | fot 195 | fr 196 | fut 197 | fyz 198 | gen 199 | geogr 200 | geol 201 | geom 202 | germ 203 | gram 204 | hebr 205 | herald 206 | hist 207 | hl 208 | hovor 209 | hud 210 | hut 211 | chcsl 212 | chem 213 | ie 214 | imp 215 | impf 216 | ind 217 | indoevr 218 | inf 219 | instr 220 | interj 221 | ión 222 | iron 223 | it 224 | kanad 225 | katalán 226 | klas 227 | kniž 228 | komp 229 | konj 230 | 231 | konkr 232 | kř 233 | kuch 234 | lat 235 | lék 236 | les 237 | lid 238 | lit 239 | liturg 240 | lok 241 | log 242 | m 243 | mat 244 | meteor 245 | metr 246 | mod 247 | ms 248 | mysl 249 | n 250 | náb 251 | námoř 252 | neklas 253 | něm 254 | nesklon 255 | nom 256 | ob 257 | obch 258 | obyč 259 | ojed 260 | opt 261 | part 262 | pas 263 | pejor 264 | pers 265 | pf 266 | pl 267 | plpf 268 | 269 | práv 270 | prep 271 | předl 272 | přivl 273 | r 274 | rcsl 275 | refl 276 | reg 277 | rkp 278 | ř 279 | řec 280 | s 281 | samohl 282 | sg 283 | sl 284 | souhl 285 | spec 286 | srov 287 | stfr 288 | střv 289 | stsl 290 | subj 291 | subst 292 | superl 293 | sv 294 | sz 295 | táz 296 | tech 297 | telev 298 | teol 299 | trans 300 | typogr 301 | var 302 | vedl 303 | verb 304 | vl. jm 305 | voj 306 | vok 307 | vůb 308 | vulg 309 | výtv 310 | vztaž 311 | zahr 312 | zájm 313 | zast 314 | zejm 315 | 316 | zeměd 317 | zkr 318 | zř 319 | mj 320 | dl 321 | atp 322 | sport 323 | Mgr 324 | horn 325 | MVDr 326 | JUDr 327 | RSDr 328 | Bc 329 | PhDr 330 | ThDr 331 | Ing 332 | aj 333 | apod 334 | PharmDr 335 | pomn 336 | ev 337 | slang 338 | nprap 339 | odp 340 | dop 341 | pol 342 | st 343 | stol 344 | p. n. l 345 | před n. l 346 | n. l 347 | př. Kr 348 | po Kr 349 | př. n. l 350 | odd 351 | RNDr 352 | tzv 353 | atd 354 | tzn 355 | resp 356 | tj 357 | p 358 | br 359 | č. j 360 | čj 361 | č. p 362 | čp 363 | a. 
s 364 | s. r. o 365 | spol. s r. o 366 | p. o 367 | s. p 368 | v. o. s 369 | k. s 370 | o. p. s 371 | o. s 372 | v. r 373 | v z 374 | ml 375 | vč 376 | kr 377 | mld 378 | hod 379 | popř 380 | ap 381 | event 382 | rus 383 | slov 384 | rum 385 | švýc 386 | P. T 387 | zvl 388 | hor 389 | dol 390 | S.O.S -------------------------------------------------------------------------------- /nematus/metrics/sentence_bleu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from __future__ import division 5 | 6 | from math import exp 7 | from operator import mul 8 | from collections import defaultdict 9 | 10 | from scorer import Scorer 11 | from reference import Reference 12 | 13 | class SentenceBleuScorer(Scorer): 14 | """ 15 | Scores SentenceBleuReference objects. 16 | """ 17 | 18 | def __init__(self, argument_string): 19 | """ 20 | Initialises metric-specific parameters. 21 | """ 22 | Scorer.__init__(self, argument_string) 23 | # use n-gram order of 4 by default 24 | if 'n' not in self._arguments: 25 | self._arguments['n'] = 4 26 | 27 | def set_reference(self, reference_tokens): 28 | """ 29 | Sets the reference against which hypotheses are scored. 30 | """ 31 | self._reference = SentenceBleuReference( 32 | reference_tokens, 33 | self._arguments['n'] 34 | ) 35 | 36 | class SentenceBleuReference(Reference): 37 | """ 38 | Smoothed sentence-level BLEU as proposed by Lin and Och (2004). 39 | Implemented as described in (Chen and Cherry, 2014). 40 | """ 41 | 42 | def __init__(self, reference_tokens, n=4): 43 | """ 44 | @param reference the reference translation that hypotheses shall be 45 | scored against. Must be an iterable of tokens (any 46 | type). 47 | @param n maximum n-gram order to consider. 48 | """ 49 | Reference.__init__(self, reference_tokens) 50 | self.n = n 51 | # preprocess reference 52 | self._reference_length = len(self._reference_tokens) 53 | self._reference_ngrams = self._get_ngrams(self._reference_tokens, self.n) 54 | 55 | def _get_ngrams(self, tokens, max_n): 56 | """ 57 | Extracts all n-grams of order 1 up to (and including) @param max_n from 58 | a list of @param tokens. 59 | """ 60 | n_grams = [] 61 | for n in range(1, max_n+1): 62 | n_grams.append(defaultdict(int)) 63 | for n_gram in zip(*[tokens[i:] for i in range(n)]): 64 | n_grams[n-1][n_gram] += 1 65 | return n_grams 66 | 67 | def score(self, hypothesis_tokens): 68 | """ 69 | Scores @param hypothesis against this reference. 70 | 71 | @return the smoothed sentence-level BLEU score: 1.0 is best, 0.0 worst.
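
        A sketch of what the code below computes, for maximum order N = self.n:

            BLEU(hyp) = BP * (p_1 * p_2 * ... * p_N) ** (1/N)
            BP        = min(1.0, exp(1 - len(ref)/len(hyp)))

        where p_n is the n-gram precision, add-one smoothed for n >= 2.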
72 | """ 73 | def product(iterable): 74 | return reduce(mul, iterable, 1) 75 | def ngram_precisions(ref_ngrams, hyp_ngrams): 76 | precisions = [] 77 | for n in range(1, self.n+1): 78 | overlap = 0 79 | for ref_ngram, ref_ngram_count in ref_ngrams[n-1].iteritems(): 80 | if ref_ngram in hyp_ngrams[n-1]: 81 | overlap += min(ref_ngram_count, hyp_ngrams[n-1][ref_ngram]) 82 | hyp_length = max(0, len(hypothesis_tokens)-n+1) 83 | if n >= 2: 84 | # smoothing as proposed by Lin and Och (2004), 85 | # implemented as described in (Chen and Cherry, 2014) 86 | overlap += 1 87 | hyp_length += 1 88 | precisions.append(overlap/hyp_length if hyp_length > 0 else 0.0) 89 | return precisions 90 | def brevity_penalty(ref_length, hyp_length): 91 | return min(1.0, exp(1-(ref_length/hyp_length if hyp_length > 0 else 0.0))) 92 | # preprocess hypothesis 93 | hypothesis_length = len(hypothesis_tokens) 94 | hypothesis_ngrams = self._get_ngrams(hypothesis_tokens, self.n) 95 | # calculate n-gram precision for all orders 96 | np = ngram_precisions(self._reference_ngrams, hypothesis_ngrams) 97 | # calculate brevity penalty 98 | bp = brevity_penalty(self._reference_length, hypothesis_length) 99 | # compose final BLEU score 100 | return product(np)**(1/self.n) * bp 101 | -------------------------------------------------------------------------------- /nematus/theano_util.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Theano utility functions 3 | ''' 4 | 5 | import json 6 | import cPickle as pkl 7 | import numpy 8 | from collections import OrderedDict 9 | import warnings 10 | import theano 11 | import theano.tensor as tensor 12 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 13 | 14 | # push parameters to Theano shared variables 15 | def zip_to_theano(params, tparams): 16 | for kk, vv in params.iteritems(): 17 | tparams[kk].set_value(vv) 18 | 19 | 20 | # pull parameters from Theano shared variables 21 | def unzip_from_theano(zipped, excluding_prefix=None): 22 | new_params = OrderedDict() 23 | for kk, vv in zipped.iteritems(): 24 | if excluding_prefix and (kk.startswith(excluding_prefix)): 25 | continue 26 | new_params[kk] = vv.get_value() 27 | return new_params 28 | 29 | 30 | # get the list of parameters: Note that tparams must be OrderedDict 31 | def itemlist(tparams): 32 | return [vv for kk, vv in tparams.iteritems()] 33 | 34 | # make prefix-appended name 35 | def pp(pp, name): 36 | return '%s_%s' % (pp, name) 37 | 38 | # initialize Theano shared variables according to the initial parameters 39 | def init_theano_params(params): 40 | tparams = OrderedDict() 41 | for kk, pp in params.iteritems(): 42 | tparams[kk] = theano.shared(params[kk], name=kk) 43 | return tparams 44 | 45 | 46 | # load parameters 47 | def load_params(path, params, with_prefix=''): 48 | pp = numpy.load(path) 49 | new_params = OrderedDict() 50 | for kk, vv in params.iteritems(): 51 | if kk not in pp: 52 | warnings.warn('%s is not in the archive' % kk) 53 | continue 54 | new_params[with_prefix+kk] = pp[kk] 55 | 56 | params.update(new_params) 57 | return params 58 | 59 | # load parameters of the optimizer 60 | def load_optimizer_params(path, optimizer_name): 61 | params = {} 62 | pp = numpy.load(path) 63 | for kk in pp: 64 | if kk.startswith(optimizer_name): 65 | params[kk] = pp[kk] 66 | return params 67 | 68 | def tanh(x): 69 | return tensor.tanh(x) 70 | 71 | 72 | def linear(x): 73 | return x 74 | 75 | 76 | def concatenate(tensor_list, axis=0): 77 | """ 78 | Alternative implementation of
/nematus/theano_util.py:
--------------------------------------------------------------------------------
1 | '''
2 | Theano utility functions
3 | '''
4 | import warnings
5 | import json
6 | import cPickle as pkl
7 | import numpy
8 | from collections import OrderedDict
9 | 
10 | import theano
11 | import theano.tensor as tensor
12 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
13 | 
14 | # push parameters to Theano shared variables
15 | def zip_to_theano(params, tparams):
16 |     for kk, vv in params.iteritems():
17 |         tparams[kk].set_value(vv)
18 | 
19 | 
20 | # pull parameters from Theano shared variables
21 | def unzip_from_theano(zipped, excluding_prefix=None):
22 |     new_params = OrderedDict()
23 |     for kk, vv in zipped.iteritems():
24 |         if excluding_prefix and (kk.startswith(excluding_prefix)):
25 |             continue
26 |         new_params[kk] = vv.get_value()
27 |     return new_params
28 | 
29 | 
30 | # get the list of parameters: Note that tparams must be OrderedDict
31 | def itemlist(tparams):
32 |     return [vv for kk, vv in tparams.iteritems()]
33 | 
34 | # make prefix-appended name
35 | def pp(pp, name):
36 |     return '%s_%s' % (pp, name)
37 | 
38 | # initialize Theano shared variables according to the initial parameters
39 | def init_theano_params(params):
40 |     tparams = OrderedDict()
41 |     for kk, pp in params.iteritems():
42 |         tparams[kk] = theano.shared(params[kk], name=kk)
43 |     return tparams
44 | 
45 | 
46 | # load parameters
47 | def load_params(path, params, with_prefix=''):
48 |     pp = numpy.load(path)
49 |     new_params = OrderedDict()
50 |     for kk, vv in params.iteritems():
51 |         if kk not in pp:
52 |             warnings.warn('%s is not in the archive' % kk)
53 |             continue
54 |         new_params[with_prefix+kk] = pp[kk]
55 | 
56 |     params.update(new_params)
57 |     return params
58 | 
59 | # load parameters of the optimizer
60 | def load_optimizer_params(path, optimizer_name):
61 |     params = {}
62 |     pp = numpy.load(path)
63 |     for kk in pp:
64 |         if kk.startswith(optimizer_name):
65 |             params[kk] = pp[kk]
66 |     return params
67 | 
68 | def tanh(x):
69 |     return tensor.tanh(x)
70 | 
71 | 
72 | def linear(x):
73 |     return x
74 | 
75 | 
76 | def concatenate(tensor_list, axis=0):
77 |     """
78 |     Alternative implementation of `theano.tensor.concatenate`.
79 |     This function does exactly the same thing, but contrary to Theano's own
80 |     implementation, the gradient is implemented on the GPU.
81 |     Backpropagating through `theano.tensor.concatenate` yields slowdowns
82 |     because the inverse operation (splitting) needs to be done on the CPU.
83 |     This implementation does not have that problem.
84 |     :usage:
85 |         >>> x, y = theano.tensor.matrices('x', 'y')
86 |         >>> c = concatenate([x, y], axis=1)
87 |     :parameters:
88 |         - tensor_list : list
89 |             list of Theano tensor expressions that should be concatenated.
90 |         - axis : int
91 |             the tensors will be joined along this axis.
92 |     :returns:
93 |         - out : tensor
94 |             the concatenated tensor expression.
95 |     """
96 |     concat_size = sum(tt.shape[axis] for tt in tensor_list)
97 | 
98 |     output_shape = ()
99 |     for k in range(axis):
100 |         output_shape += (tensor_list[0].shape[k],)
101 |     output_shape += (concat_size,)
102 |     for k in range(axis + 1, tensor_list[0].ndim):
103 |         output_shape += (tensor_list[0].shape[k],)
104 | 
105 |     out = tensor.zeros(output_shape)
106 |     offset = 0
107 |     for tt in tensor_list:
108 |         indices = ()
109 |         for k in range(axis):
110 |             indices += (slice(None),)
111 |         indices += (slice(offset, offset + tt.shape[axis]),)
112 |         for k in range(axis + 1, tensor_list[0].ndim):
113 |             indices += (slice(None),)
114 | 
115 |         out = tensor.set_subtensor(out[indices], tt)
116 |         offset += tt.shape[axis]
117 | 
118 |     return out
119 | 
120 | # return name of word embedding for factor i
121 | # special handling of factor 0 for backward compatibility
122 | def embedding_name(i):
123 |     if i == 0:
124 |         return 'Wemb'
125 |     else:
126 |         return 'Wemb'+str(i)
127 | 
128 | # Zero out all parameters
129 | def zero_all(params):
130 |     for kk, vv in params.iteritems():
131 |         vv[:] = numpy.zeros_like(vv)
132 | 
--------------------------------------------------------------------------------
/nematus/metrics/meteor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | import subprocess, threading
5 | from scorer import Scorer
6 | from reference import Reference
7 | 
8 | class MeteorError(Exception):
9 |     def __init__(self, value):
10 |         self.value = value
11 |     def __str__(self):
12 |         return repr(self.value)
13 | 
14 | class MeteorScorer(Scorer):
15 |     """
16 |     Python wrapper for the METEOR metric. Starts a METEOR process and keeps it alive, so that the model
17 |     can be kept in memory. Arguments are the METEOR language abbreviation and the path to the METEOR
18 |     installation. They need to be specified as follows: "meteor_language=lg,meteor_path=path" (any order).
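A usage sketch for this wrapper (requires Java and a local METEOR installation; the language code and path below are placeholders). Scoring here goes through the reference object directly, since the public interface of the Scorer base class is not part of this listing:

# Usage sketch for MeteorScorer; /path/to/meteor is a placeholder.
from meteor import MeteorScorer

scorer = MeteorScorer("meteor_language=en,meteor_path=/path/to/meteor")
scorer.set_reference("the cat sat on the mat".split())
print(scorer._reference.score("the cat sat on a mat".split()))
scorer.terminate_process()  # shut down the long-running Java process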
19 | """ 20 | def __init__(self, argument_string): 21 | Scorer.__init__(self, argument_string) 22 | 23 | #Lock for the METEOR process, which can only handle one request at a time: 24 | self.lock = threading.Lock() 25 | 26 | #Get necessary arguments for starting METEOR from argument string parsed in Scorer.__init__() 27 | self._meteor_language = self._arguments["meteor_language"] 28 | self._meteor_path = self._arguments["meteor_path"] + "/" 29 | 30 | #Start a METEOR process: 31 | command = "java -Xmx2G -jar "+self._meteor_path+"meteor-*.jar - - -l "+self._meteor_language+" -stdio" 32 | self.meteor_process = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) 33 | 34 | def set_reference(self, reference_tokens): 35 | """ 36 | Construct a MeteorReference from a sequence of tokens and make it the reference against which the scorer evaluates hypotheses. 37 | This can be done any time. 38 | """ 39 | self.lock.acquire() 40 | self._reference = MeteorReference(reference_tokens, self) 41 | self.lock.release() 42 | 43 | def terminate_process(self): 44 | """ 45 | Waits for the current request to be processed and terminates the METEOR process. 46 | """ 47 | self.lock.acquire() 48 | self.meteor_process.terminate() 49 | self.lock.release() 50 | 51 | def kill_process(self): 52 | """ 53 | Kills the METEOR process right away. 54 | """ 55 | self.meteor_process.kill() 56 | 57 | class MeteorReference(Reference): 58 | """ 59 | METEOR reference object, against which hypotheses can be scored. 60 | """ 61 | def __init__(self, reference_tokens, meteor_scorer): 62 | Reference.__init__(self, reference_tokens) 63 | 64 | #Construct reference string from tokens 65 | self._reference_string = " ".join(reference_tokens) 66 | self._meteor_scorer = meteor_scorer 67 | 68 | def score(self, hypothesis_tokens): 69 | 70 | #Construct hypothesis string from hypothesis tokens: 71 | hypothesis_string = " ".join(hypothesis_tokens) 72 | 73 | #Acquire lock to make sure METEOR process is not in use: 74 | self._meteor_scorer.lock.acquire() 75 | 76 | #Score hypothesis string against reference string 77 | try: 78 | self._meteor_scorer.meteor_process.stdin.write("SCORE ||| "+self._reference_string+" ||| "+hypothesis_string+"\n") 79 | except: 80 | raise MeteorError("Meteor returned the following error: "+ self._meteor_scorer.meteor_process.stderr.readline().strip()) 81 | 82 | #Read feature values from process output 83 | std_out = self._meteor_scorer.meteor_process.stdout.readline() 84 | 85 | #Pass feature values to METEOR process for computation of the final score 86 | try: 87 | self._meteor_scorer.meteor_process.stdin.write("EVAL ||| "+std_out) 88 | except: 89 | raise MeteorError("Meteor returned the following error: "+ self._meteor_scorer.meteor_process.stderr.readline().strip()) 90 | std_out = self._meteor_scorer.meteor_process.stdout.readline() 91 | 92 | #Release the process lock 93 | self._meteor_scorer.lock.release() 94 | 95 | #Check if Meteor returned a score: 96 | try: 97 | n = float(std_out) 98 | except: 99 | raise MeteorError("Meteor returned the following error: "+ self._meteor_scorer.meteor_process.stderr.readline().strip()) 100 | 101 | #Return final score 102 | return n -------------------------------------------------------------------------------- /nematus/hypgraph.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from collections import defaultdict 5 | 6 | class 
HypGraph(object): 7 | 8 | def __init__(self): 9 | self.nodes = defaultdict(str) # {id = label} 10 | self.edges = [] # (parent_node_id, child_node_id) 11 | self.costs = defaultdict(float) # {node_id = cost} 12 | self.word_probs = defaultdict(float) # {node_id = word_prob} 13 | 14 | def get_id(self, word, history): 15 | if history == []: 16 | return str(word) 17 | history = '-'.join([str(h) for h in reversed(history)]) 18 | return '%s-%s' % (word, history) 19 | 20 | def get_ids(self, words): 21 | ids = [] 22 | for i, w in enumerate(words): 23 | history = words[:i] 24 | ids.append(self.get_id(w, history)) 25 | return ids 26 | 27 | def add(self, word, history, word_prob=None, cost=None): 28 | history_labels = [0] + history 29 | history_ids = self.get_ids(history_labels) 30 | word_label = word 31 | word_id = self.get_id(word_label, history_labels) 32 | # store 33 | self.nodes[word_id] = word_label 34 | self.edges.append((history_ids[-1], word_id)) 35 | if word_prob != None: 36 | self.word_probs[word_id] = word_prob 37 | if cost != None: 38 | self.costs[word_id] = cost 39 | 40 | class HypGraphRenderer(object): 41 | 42 | def __init__(self, hyp_graph): 43 | self.nodes = hyp_graph.nodes 44 | self.edges = hyp_graph.edges 45 | self.costs = hyp_graph.costs 46 | self.word_probs = hyp_graph.word_probs 47 | # constants 48 | self.BOS_SYMBOLS = ['0'] 49 | self.EOS_SYMBOLS = [''] 50 | 51 | def _escape_label(self, label): 52 | replacements = { 53 | '<': '\<', 54 | '>': '\>', 55 | } 56 | for original, replacement in replacements.iteritems(): 57 | label = label.replace(original, replacement) 58 | return label 59 | 60 | def _render(self, costs=False, word_probs=False, highlight_best=False): 61 | from pygraphviz import AGraph 62 | graph = AGraph(directed=True) 63 | for node_id, node_label in self.nodes.iteritems(): 64 | attributes = self._node_attr(node_id, costs=costs, word_probs=word_probs) 65 | graph.add_node(node_id, **attributes) 66 | for (parent_node_id, child_node_id) in self.edges: 67 | graph.add_edge(parent_node_id, child_node_id) 68 | self.graph = graph 69 | if highlight_best: 70 | self._highlight_best() 71 | 72 | def _node_attr(self, node_id, costs=False, word_probs=False): 73 | word = self.nodes[node_id].decode('utf-8') 74 | cost = self.costs[node_id] 75 | prob = self.word_probs[node_id] 76 | attr = {} 77 | if costs and word_probs: 78 | attr['shape'] = "record" 79 | attr['label'] = "{{%s|%.3f}|%.3f}" % (word, prob, cost) 80 | elif costs: 81 | attr['shape'] = "record" 82 | attr['label'] = "{{%s}|%.3f}" % (word, cost) 83 | elif word_probs: 84 | attr['shape'] = "record" 85 | attr['label'] = "{{%s|%.3f}}" % (word, prob) 86 | else: 87 | attr['label'] = word 88 | attr['label'] = self._escape_label(attr['label']) 89 | return attr 90 | 91 | def _highlight_best(self): 92 | best_hyp_bg_color = '#CDE9EC' 93 | best_hyp_cost = None 94 | best_hyp_leaf_node_id = None 95 | for node_id, label in self.nodes.iteritems(): 96 | if label in self.EOS_SYMBOLS: 97 | if best_hyp_cost == None or self.costs[node_id] < best_hyp_cost: 98 | best_hyp_leaf_node_id = node_id 99 | best_hyp_cost = self.costs[node_id] 100 | if best_hyp_leaf_node_id: 101 | best_hyp_leaf_node = self.graph.get_node(best_hyp_leaf_node_id) 102 | current_node = best_hyp_leaf_node 103 | while current_node != []: 104 | current_node.attr['style'] = 'filled' 105 | current_node.attr['fillcolor'] = best_hyp_bg_color 106 | try: 107 | current_node = self.graph.predecessors(current_node)[0] 108 | except IndexError: 109 | break 110 | 111 | def wordify(self, word_dict): 
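Taken together, the two classes can be exercised with a few invented beam-search expansions (Python 2, like the module; pygraphviz is required for rendering, and all words, probabilities, and costs below are made up):

# Sketch: build a tiny hypothesis graph and render it as a PNG.
from hypgraph import HypGraph, HypGraphRenderer

g = HypGraph()
g.add('the', [], word_prob=0.6, cost=0.5)
g.add('cat', ['the'], word_prob=0.3, cost=1.7)
g.add('dog', ['the'], word_prob=0.2, cost=2.1)
g.add('sat', ['the', 'cat'], word_prob=0.9, cost=1.8)

renderer = HypGraphRenderer(g)
renderer.save_png('beam.png', detailed=True, highlight_best=True)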
112 | """ 113 | Replace node labels (usually integers) with words, subwords, or 114 | characters. 115 | """ 116 | for node_id, label in self.nodes.iteritems(): 117 | self.nodes[node_id] = word_dict[label] 118 | 119 | def save_png(self, filepath, detailed=False, highlight_best=False): 120 | """ 121 | Renders the graph as PNG image. 122 | 123 | @param filepath the taget file 124 | @param detailed whether to include word probabilities and 125 | hypothesis costs. 126 | @param highlight_best whether to highlight the best hypothesis. 127 | """ 128 | costs = True if detailed else False 129 | word_probs = True if detailed else False 130 | self._render(costs=costs, word_probs=word_probs, highlight_best=highlight_best) 131 | self.graph.draw(filepath, prog="dot") 132 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.sk: -------------------------------------------------------------------------------- 1 | Bc 2 | Mgr 3 | RNDr 4 | PharmDr 5 | PhDr 6 | JUDr 7 | PaedDr 8 | ThDr 9 | Ing 10 | MUDr 11 | MDDr 12 | MVDr 13 | Dr 14 | ThLic 15 | PhD 16 | ArtD 17 | ThDr 18 | Dr 19 | DrSc 20 | CSs 21 | prof 22 | obr 23 | Obr 24 | Č 25 | č 26 | absol 27 | adj 28 | admin 29 | adr 30 | Adr 31 | adv 32 | advok 33 | afr 34 | ak 35 | akad 36 | akc 37 | akuz 38 | et 39 | al 40 | alch 41 | amer 42 | anat 43 | angl 44 | Angl 45 | anglosas 46 | anorg 47 | ap 48 | apod 49 | arch 50 | archeol 51 | archit 52 | arg 53 | art 54 | astr 55 | astrol 56 | astron 57 | atp 58 | atď 59 | austr 60 | Austr 61 | aut 62 | belg 63 | Belg 64 | bibl 65 | Bibl 66 | biol 67 | bot 68 | bud 69 | bás 70 | býv 71 | cest 72 | chem 73 | cirk 74 | csl 75 | čs 76 | Čs 77 | dat 78 | dep 79 | det 80 | dial 81 | diaľ 82 | dipl 83 | distrib 84 | dokl 85 | dosl 86 | dopr 87 | dram 88 | duš 89 | dv 90 | dvojčl 91 | dór 92 | ekol 93 | ekon 94 | el 95 | elektr 96 | elektrotech 97 | energet 98 | epic 99 | est 100 | etc 101 | etonym 102 | eufem 103 | európ 104 | Európ 105 | ev 106 | evid 107 | expr 108 | fa 109 | fam 110 | farm 111 | fem 112 | feud 113 | fil 114 | filat 115 | filoz 116 | fi 117 | fon 118 | form 119 | fot 120 | fr 121 | Fr 122 | franc 123 | Franc 124 | fraz 125 | fut 126 | fyz 127 | fyziol 128 | garb 129 | gen 130 | genet 131 | genpor 132 | geod 133 | geogr 134 | geol 135 | geom 136 | germ 137 | gr 138 | Gr 139 | gréc 140 | Gréc 141 | gréckokat 142 | hebr 143 | herald 144 | hist 145 | hlav 146 | hosp 147 | hromad 148 | hud 149 | hypok 150 | ident 151 | i.e 152 | ident 153 | imp 154 | impf 155 | indoeur 156 | inf 157 | inform 158 | instr 159 | int 160 | interj 161 | inšt 162 | inštr 163 | iron 164 | jap 165 | Jap 166 | jaz 167 | jedn 168 | juhoamer 169 | juhových 170 | juhozáp 171 | juž 172 | kanad 173 | Kanad 174 | kanc 175 | kapit 176 | kpt 177 | kart 178 | katastr 179 | knih 180 | kniž 181 | komp 182 | konj 183 | konkr 184 | kozmet 185 | krajč 186 | kresť 187 | kt 188 | kuch 189 | lat 190 | latinskoamer 191 | lek 192 | lex 193 | lingv 194 | lit 195 | litur 196 | log 197 | lok 198 | max 199 | Max 200 | maď 201 | Maď 202 | medzinár 203 | mest 204 | metr 205 | mil 206 | Mil 207 | min 208 | Min 209 | miner 210 | ml 211 | mld 212 | mn 213 | mod 214 | mytol 215 | napr 216 | nar 217 | Nar 218 | nasl 219 | nedok 220 | neg 221 | negat 222 | neklas 223 | nem 224 | Nem 225 | neodb 226 | neos 227 | neskl 228 | nesklon 229 | nespis 230 | nespráv 231 | neved 232 | než 233 | niekt 234 | niž 235 | nom 236 | náb 237 | nákl 238 | námor 239 | nár 240 | obch 241 | obj 242 | obv 243 | 
obyč 244 | obč 245 | občian 246 | odb 247 | odd 248 | ods 249 | ojed 250 | okr 251 | Okr 252 | opt 253 | opyt 254 | org 255 | os 256 | osob 257 | ot 258 | ovoc 259 | par 260 | part 261 | pejor 262 | pers 263 | pf 264 | Pf 265 | P.f 266 | p.f 267 | pl 268 | Plk 269 | pod 270 | podst 271 | pokl 272 | polit 273 | politol 274 | polygr 275 | pomn 276 | popl 277 | por 278 | porad 279 | porov 280 | posch 281 | potrav 282 | použ 283 | poz 284 | pozit 285 | poľ 286 | poľno 287 | poľnohosp 288 | poľov 289 | pošt 290 | pož 291 | prac 292 | predl 293 | pren 294 | prep 295 | preuk 296 | priezv 297 | Priezv 298 | privl 299 | prof 300 | práv 301 | príd 302 | príj 303 | prík 304 | príp 305 | prír 306 | prísl 307 | príslov 308 | príč 309 | psych 310 | publ 311 | pís 312 | písm 313 | pôv 314 | refl 315 | reg 316 | rep 317 | resp 318 | rozk 319 | rozlič 320 | rozpráv 321 | roč 322 | Roč 323 | ryb 324 | rádiotech 325 | rím 326 | samohl 327 | semest 328 | sev 329 | severoamer 330 | severových 331 | severozáp 332 | sg 333 | skr 334 | skup 335 | sl 336 | Sloven 337 | soc 338 | soch 339 | sociol 340 | sp 341 | spol 342 | Spol 343 | spoloč 344 | spoluhl 345 | správ 346 | spôs 347 | st 348 | star 349 | starogréc 350 | starorím 351 | s.r.o 352 | stol 353 | stor 354 | str 355 | stredoamer 356 | stredoškol 357 | subj 358 | subst 359 | superl 360 | sv 361 | sz 362 | súkr 363 | súp 364 | súvzť 365 | tal 366 | Tal 367 | tech 368 | tel 369 | Tel 370 | telef 371 | teles 372 | telev 373 | teol 374 | trans 375 | turist 376 | tuzem 377 | typogr 378 | tzn 379 | tzv 380 | ukaz 381 | ul 382 | Ul 383 | umel 384 | univ 385 | ust 386 | ved 387 | vedľ 388 | verb 389 | veter 390 | vin 391 | viď 392 | vl 393 | vod 394 | vodohosp 395 | pnl 396 | vulg 397 | vyj 398 | vys 399 | vysokoškol 400 | vzťaž 401 | vôb 402 | vých 403 | výd 404 | výrob 405 | výsk 406 | výsl 407 | výtv 408 | výtvar 409 | význ 410 | včel 411 | vš 412 | všeob 413 | zahr 414 | zar 415 | zariad 416 | zast 417 | zastar 418 | zastaráv 419 | zb 420 | zdravot 421 | združ 422 | zjemn 423 | zlat 424 | zn 425 | Zn 426 | zool 427 | zr 428 | zried 429 | zv 430 | záhr 431 | zák 432 | zákl 433 | zám 434 | záp 435 | západoeur 436 | zázn 437 | územ 438 | účt 439 | čast 440 | čes 441 | Čes 442 | čl 443 | čísl 444 | živ 445 | pr 446 | fak 447 | Kr 448 | p.n.l 449 | A 450 | B 451 | C 452 | D 453 | E 454 | F 455 | G 456 | H 457 | I 458 | J 459 | K 460 | L 461 | M 462 | N 463 | O 464 | P 465 | Q 466 | R 467 | S 468 | T 469 | U 470 | V 471 | W 472 | X 473 | Y 474 | Z 475 | -------------------------------------------------------------------------------- /utils/plot_heatmap.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import matplotlib.pyplot as plt 3 | import sys 4 | import json 5 | import argparse 6 | 7 | # input: 8 | # alignment matrix - numpy array 9 | # shape (target tokens + eos, number of hidden source states = source tokens +eos) 10 | # one line correpsonds to one decoding step producing one target token 11 | # each line has the attention model weights corresponding to that decoding step 12 | # each float on a line is the attention model weight for a corresponding source state. 
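The text format that read_alignment_matrix below expects is, as implied by the parsing code: a header of five '|||'-separated fields (sentence id, target sentence, score, source sentence, and "src_count trg_count" including the end-of-sentence position), followed by trg_count rows of attention weights and a blank separator line. The score field is ignored by the parser. A sketch with an invented block (Python 2, like the rest of this file; matplotlib must be installed for the import to succeed):

# Parse one invented alignment block; StringIO stands in for the input file.
from StringIO import StringIO
from plot_heatmap import read_alignment_matrix

block = ("0 ||| a house ||| 0.0 ||| ein Haus ||| 3 3\n"
         "0.9 0.05 0.05\n"
         "0.1 0.8 0.1\n"
         "0.05 0.05 0.9\n"
         "\n")
sid, mma, target_labels, source_labels = read_alignment_matrix(StringIO(block))
print(sid, mma.shape)  # 0, (3, 3)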
13 | # plot: a heat map of the alignment matrix 14 | # x axis are the source tokens (alignment is to source hidden state that roughly corresponds to a source token) 15 | # y axis are the target tokens 16 | 17 | # http://stackoverflow.com/questions/14391959/heatmap-in-matplotlib-with-pcolor 18 | def plot_head_map(mma, target_labels, source_labels): 19 | fig, ax = plt.subplots() 20 | heatmap = ax.pcolor(mma, cmap=plt.cm.Blues) 21 | 22 | # put the major ticks at the middle of each cell 23 | ax.set_xticks(numpy.arange(mma.shape[1])+0.5, minor=False) 24 | ax.set_yticks(numpy.arange(mma.shape[0])+0.5, minor=False) 25 | 26 | # without this I get some extra columns rows 27 | # http://stackoverflow.com/questions/31601351/why-does-this-matplotlib-heatmap-have-an-extra-blank-column 28 | ax.set_xlim(0, int(mma.shape[1])) 29 | ax.set_ylim(0, int(mma.shape[0])) 30 | 31 | # want a more natural, table-like display 32 | ax.invert_yaxis() 33 | ax.xaxis.tick_top() 34 | 35 | # source words -> column labels 36 | ax.set_xticklabels(source_labels, minor=False) 37 | # target words -> row labels 38 | ax.set_yticklabels(target_labels, minor=False) 39 | 40 | plt.xticks(rotation=45) 41 | 42 | #plt.tight_layout() 43 | plt.show() 44 | 45 | # column labels -> target words 46 | # row labels -> source words 47 | 48 | def read_alignment_matrix(f): 49 | header = f.readline().strip().split('|||') 50 | if header[0] == '': 51 | return None, None, None, None 52 | sid = int(header[0].strip()) 53 | # number of tokens in source and translation +1 for eos 54 | src_count, trg_count = map(int,header[-1].split()) 55 | # source words 56 | source_labels = header[3].decode('UTF-8').split() 57 | source_labels.append('') 58 | # target words 59 | target_labels = header[1].decode('UTF-8').split() 60 | target_labels.append('') 61 | 62 | mm = [] 63 | for r in range(trg_count): 64 | alignment = map(float,f.readline().strip().split()) 65 | mm.append(alignment) 66 | mma = numpy.array(mm) 67 | return sid,mma, target_labels, source_labels 68 | 69 | 70 | def read_plot_alignment_matrices(f, n): 71 | while(f): 72 | sid, mma, target_labels, source_labels = read_alignment_matrix(f) 73 | if mma is None: 74 | return 75 | if sid >n: 76 | return 77 | plot_head_map(mma, target_labels, source_labels) 78 | # empty line separating the matrices 79 | f.readline() 80 | 81 | 82 | """ 83 | Adding functions to read the json format. 84 | """ 85 | 86 | def read_plot_alignment_json(file, n): 87 | while (file): 88 | sid, mma, target_labels, source_labels = read_alignment_json(file) 89 | if mma is None: 90 | return 91 | if sid > n: 92 | return 93 | plot_head_map(mma, target_labels, source_labels) 94 | 95 | def read_alignment_json(file): 96 | data = file.readline() ##one line containing the json object. 97 | if len(data.strip()) == 0: 98 | return None, None, None, None 99 | jdata = json.loads(data) 100 | ## messy json encodings... 
TODO: make this better
101 |     jdata = json.loads(json.dumps(jdata).decode('unicode-escape').encode('utf8'))
102 |     #print jdata
103 |     sid = int(jdata["id"])
104 |     mma = numpy.array(jdata["matrix"])
105 |     ##target words
106 |     target_labels = jdata["target_sent"].split()
107 |     target_labels.append('')
108 |     ##source words
109 |     source_labels = jdata["source_sent"].split()
110 |     source_labels.append('')
111 |     return sid,mma, target_labels, source_labels
112 | 
113 | if __name__ == "__main__":
114 | 
115 |     parser = argparse.ArgumentParser()
116 |     # '/Users/mnadejde/Documents/workspace/MTMA2016/models/wmt16_systems/en-de/test.alignment'
117 |     parser.add_argument('--input', '-i', type=argparse.FileType('r'),
118 |                         default='/Users/mnadejde/Documents/workspace/MTMA2016/models/wmt16_systems/ro-en/newstest2016-roen-src.ro.alignment', metavar='PATH',
119 |                         help="Input file (default: standard input)")
120 | 
121 |     parser.add_argument('--json', '-j', required = False,action="store_true",
122 |                         help="If this option is used, then read alignment matrix from a Json formatted file.")
123 |     args = parser.parse_args()
124 | 
125 |     if args.json:
126 |         read_plot_alignment_json(args.input, 10) ##n is the maximum number of sentences to process.
127 |     else:
128 |         read_plot_alignment_matrices(args.input,10)
129 | 
--------------------------------------------------------------------------------
/data/multi-bleu.perl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | #
3 | # This file is part of moses.  Its use is licensed under the GNU Lesser General
4 | # Public License version 2.1 or, at your option, any later version.
5 | 
6 | # $Id$
7 | use warnings;
8 | use strict;
9 | 
10 | my $lowercase = 0;
11 | if ($ARGV[0] eq "-lc") {
12 |   $lowercase = 1;
13 |   shift;
14 | }
15 | 
16 | my $stem = $ARGV[0];
17 | if (!defined $stem) {
18 |   print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n";
19 |   print STDERR "Reads the references from reference or reference0, reference1, ...\n";
20 |   exit(1);
21 | }
22 | 
23 | $stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0";
24 | 
25 | my @REF;
26 | my $ref=0;
27 | while(-e "$stem$ref") {
28 |   &add_to_ref("$stem$ref",\@REF);
29 |   $ref++;
30 | }
31 | &add_to_ref($stem,\@REF) if -e $stem;
32 | die("ERROR: could not find reference file $stem") unless scalar @REF;
33 | 
34 | sub add_to_ref {
35 |   my ($file,$REF) = @_;
36 |   my $s=0;
37 |   open(REF,$file) or die "Can't read $file";
38 |   while(<REF>) {
39 |     chop;
40 |     push @{$$REF[$s++]}, $_;
41 |   }
42 |   close(REF);
43 | }
44 | 
45 | my(@CORRECT,@TOTAL,$length_translation,$length_reference);
46 | my $s=0;
47 | while(<STDIN>) {
48 |   chop;
49 |   $_ = lc if $lowercase;
50 |   my @WORD = split;
51 |   my %REF_NGRAM = ();
52 |   my $length_translation_this_sentence = scalar(@WORD);
53 |   my ($closest_diff,$closest_length) = (9999,9999);
54 |   foreach my $reference (@{$REF[$s]}) {
55 |     # print "$s $_ <=> $reference\n";
56 |     $reference = lc($reference) if $lowercase;
57 |     my @WORD = split(' ',$reference);
58 |     my $length = scalar(@WORD);
59 |     my $diff = abs($length_translation_this_sentence-$length);
60 |     if ($diff < $closest_diff) {
61 |       $closest_diff = $diff;
62 |       $closest_length = $length;
63 |       # print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)."
= abs($length_translation_this_sentence-$length), setting len: $closest_length\n"; 64 | } elsif ($diff == $closest_diff) { 65 | $closest_length = $length if $length < $closest_length; 66 | # from two references with the same closeness to me 67 | # take the *shorter* into account, not the "first" one. 68 | } 69 | for(my $n=1;$n<=4;$n++) { 70 | my %REF_NGRAM_N = (); 71 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 72 | my $ngram = "$n"; 73 | for(my $w=0;$w<$n;$w++) { 74 | $ngram .= " ".$WORD[$start+$w]; 75 | } 76 | $REF_NGRAM_N{$ngram}++; 77 | } 78 | foreach my $ngram (keys %REF_NGRAM_N) { 79 | if (!defined($REF_NGRAM{$ngram}) || 80 | $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) { 81 | $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram}; 82 | # print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}
\n"; 83 | } 84 | } 85 | } 86 | } 87 | $length_translation += $length_translation_this_sentence; 88 | $length_reference += $closest_length; 89 | for(my $n=1;$n<=4;$n++) { 90 | my %T_NGRAM = (); 91 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 92 | my $ngram = "$n"; 93 | for(my $w=0;$w<$n;$w++) { 94 | $ngram .= " ".$WORD[$start+$w]; 95 | } 96 | $T_NGRAM{$ngram}++; 97 | } 98 | foreach my $ngram (keys %T_NGRAM) { 99 | $ngram =~ /^(\d+) /; 100 | my $n = $1; 101 | # my $corr = 0; 102 | # print "$i e $ngram $T_NGRAM{$ngram}
\n"; 103 | $TOTAL[$n] += $T_NGRAM{$ngram}; 104 | if (defined($REF_NGRAM{$ngram})) { 105 | if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) { 106 | $CORRECT[$n] += $T_NGRAM{$ngram}; 107 | # $corr = $T_NGRAM{$ngram}; 108 | # print "$i e correct1 $T_NGRAM{$ngram}
\n"; 109 | } 110 | else { 111 | $CORRECT[$n] += $REF_NGRAM{$ngram}; 112 | # $corr = $REF_NGRAM{$ngram}; 113 | # print "$i e correct2 $REF_NGRAM{$ngram}
\n"; 114 | } 115 | } 116 | # $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram}; 117 | # print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n" 118 | } 119 | } 120 | $s++; 121 | } 122 | my $brevity_penalty = 1; 123 | my $bleu = 0; 124 | 125 | my @bleu=(); 126 | 127 | for(my $n=1;$n<=4;$n++) { 128 | if (defined ($TOTAL[$n])){ 129 | $bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0; 130 | # print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n"; 131 | }else{ 132 | $bleu[$n]=0; 133 | } 134 | } 135 | 136 | if ($length_reference==0){ 137 | printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n"; 138 | exit(1); 139 | } 140 | 141 | if ($length_translation<$length_reference) { 142 | $brevity_penalty = exp(1-$length_reference/$length_translation); 143 | } 144 | $bleu = $brevity_penalty * exp((my_log( $bleu[1] ) + 145 | my_log( $bleu[2] ) + 146 | my_log( $bleu[3] ) + 147 | my_log( $bleu[4] ) ) / 4) ; 148 | printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n", 149 | 100*$bleu, 150 | 100*$bleu[1], 151 | 100*$bleu[2], 152 | 100*$bleu[3], 153 | 100*$bleu[4], 154 | $brevity_penalty, 155 | $length_translation / $length_reference, 156 | $length_translation, 157 | $length_reference; 158 | 159 | sub my_log { 160 | return -9999999999 unless $_[0]; 161 | return log($_[0]); 162 | } 163 | -------------------------------------------------------------------------------- /nematus/data_iterator.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | import gzip 4 | 5 | import shuffle 6 | from util import load_dict 7 | 8 | def fopen(filename, mode='r'): 9 | if filename.endswith('.gz'): 10 | return gzip.open(filename, mode) 11 | return open(filename, mode) 12 | 13 | class TextIterator: 14 | """Simple Bitext iterator.""" 15 | def __init__(self, source, target, 16 | source_dicts, target_dict, 17 | batch_size=128, 18 | maxlen=100, 19 | n_words_source=-1, 20 | n_words_target=-1, 21 | skip_empty=False, 22 | shuffle_each_epoch=False, 23 | sort_by_length=True, 24 | maxibatch_size=20): 25 | if shuffle_each_epoch: 26 | self.source_orig = source 27 | self.target_orig = target 28 | self.source, self.target = shuffle.main([self.source_orig, self.target_orig], temporary=True) 29 | else: 30 | self.source = fopen(source, 'r') 31 | self.target = fopen(target, 'r') 32 | self.source_dicts = [] 33 | for source_dict in source_dicts: 34 | self.source_dicts.append(load_dict(source_dict)) 35 | self.target_dict = load_dict(target_dict) 36 | 37 | self.batch_size = batch_size 38 | self.maxlen = maxlen 39 | self.skip_empty = skip_empty 40 | 41 | self.n_words_source = n_words_source 42 | self.n_words_target = n_words_target 43 | 44 | if self.n_words_source > 0: 45 | for d in self.source_dicts: 46 | for key, idx in d.items(): 47 | if idx >= self.n_words_source: 48 | del d[key] 49 | 50 | if self.n_words_target > 0: 51 | for key, idx in self.target_dict.items(): 52 | if idx >= self.n_words_target: 53 | del self.target_dict[key] 54 | 55 | self.shuffle = shuffle_each_epoch 56 | self.sort_by_length = sort_by_length 57 | 58 | self.source_buffer = [] 59 | self.target_buffer = [] 60 | self.k = batch_size * maxibatch_size 61 | 62 | 63 | self.end_of_data = False 64 | 65 | def __iter__(self): 66 | return self 67 | 68 | def reset(self): 69 | if self.shuffle: 70 | self.source, self.target = shuffle.main([self.source_orig, self.target_orig], temporary=True) 71 | else: 72 | self.source.seek(0) 73 | self.target.seek(0) 74 | 75 | def 
next(self):
76 |         if self.end_of_data:
77 |             self.end_of_data = False
78 |             self.reset()
79 |             raise StopIteration
80 | 
81 |         source = []
82 |         target = []
83 | 
84 |         # fill buffer, if it's empty
85 |         assert len(self.source_buffer) == len(self.target_buffer), 'Buffer size mismatch!'
86 | 
87 |         if len(self.source_buffer) == 0:
88 |             for k_ in xrange(self.k):
89 |                 ss = self.source.readline()
90 |                 if ss == "":
91 |                     break
92 |                 tt = self.target.readline()
93 |                 if tt == "":
94 |                     break
95 | 
96 |                 self.source_buffer.append(ss.strip().split())
97 |                 self.target_buffer.append(tt.strip().split())
98 | 
99 |             # sort by target buffer
100 |             if self.sort_by_length:
101 |                 tlen = numpy.array([len(t) for t in self.target_buffer])
102 |                 tidx = tlen.argsort()
103 | 
104 |                 _sbuf = [self.source_buffer[i] for i in tidx]
105 |                 _tbuf = [self.target_buffer[i] for i in tidx]
106 | 
107 |                 self.source_buffer = _sbuf
108 |                 self.target_buffer = _tbuf
109 | 
110 |             else:
111 |                 self.source_buffer.reverse()
112 |                 self.target_buffer.reverse()
113 | 
114 |             if len(self.source_buffer) == 0 or len(self.target_buffer) == 0:
115 |                 self.end_of_data = False
116 |                 self.reset()
117 |                 raise StopIteration
118 | 
119 |         try:
120 | 
121 |             # actual work here
122 |             while True:
123 | 
124 |                 # read from source file and map to word index
125 |                 try:
126 |                     ss = self.source_buffer.pop()
127 |                 except IndexError:
128 |                     break
129 |                 tmp = []
130 |                 for w in ss:
131 |                     w = [self.source_dicts[i][f] if f in self.source_dicts[i] else 1 for (i,f) in enumerate(w.split('|'))]
132 |                     tmp.append(w)
133 |                 ss = tmp
134 | 
135 |                 # read from target file and map to word index
136 |                 tt = self.target_buffer.pop()
137 |                 tt = [self.target_dict[w] if w in self.target_dict else 1
138 |                       for w in tt]
139 |                 if self.n_words_target > 0:
140 |                     tt = [w if w < self.n_words_target else 1 for w in tt]
141 | 
142 |                 if len(ss) > self.maxlen and len(tt) > self.maxlen:
143 |                     continue
144 |                 if self.skip_empty and (not ss or not tt):
145 |                     continue
146 | 
147 |                 source.append(ss)
148 |                 target.append(tt)
149 | 
150 |                 if len(source) >= self.batch_size or \
151 |                     len(target) >= self.batch_size:
152 |                     break
153 |         except IOError:
154 |             self.end_of_data = True
155 | 
156 |         # all sentence pairs in maxibatch filtered out because of length
157 |         if len(source) == 0 or len(target) == 0:
158 |             source, target = self.next()
159 | 
160 |         return source, target
161 | 
--------------------------------------------------------------------------------
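The buffering scheme above is the part worth internalizing: the iterator reads batch_size * maxibatch_size sentence pairs at once, sorts that maxibatch by target length so minibatches are nearly length-uniform (less padding waste), then pops minibatches off the end. A self-contained sketch of the same idea on toy data (no files or dictionaries involved):

# Standalone sketch of the maxibatch sort-then-pop strategy used by TextIterator.
pairs = [("a b c", "x y"), ("a", "x"), ("a b", "x y z w"), ("a b c d", "x y z")]
batch_size, maxibatch_size = 2, 2
k = batch_size * maxibatch_size                            # maxibatch size

buf = sorted(pairs[:k], key=lambda p: len(p[1].split()))   # sort by target length
while buf:
    batch = buf[-batch_size:]                              # pop a minibatch off the end
    buf = buf[:-batch_size]
    print([t for (_, t) in batch])                         # targets in a batch have similar length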
4 | """ 5 | 6 | import sys 7 | import argparse 8 | import tempfile 9 | 10 | import numpy 11 | import json 12 | 13 | from data_iterator import TextIterator 14 | from util import load_dict, load_config 15 | from alignment_util import * 16 | from compat import fill_options 17 | 18 | from theano_util import (load_params, init_theano_params) 19 | from nmt import (pred_probs, build_model, prepare_data, init_params) 20 | 21 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 22 | import theano 23 | 24 | def rescore_model(source_file, target_file, saveto, models, options, b, normalize, verbose, alignweights): 25 | 26 | trng = RandomStreams(1234) 27 | 28 | fs_log_probs = [] 29 | 30 | for model, option in zip(models, options): 31 | 32 | # load model parameters and set theano shared variables 33 | param_list = numpy.load(model).files 34 | param_list = dict.fromkeys([key for key in param_list if not key.startswith('adam_')], 0) 35 | params = load_params(model, param_list) 36 | tparams = init_theano_params(params) 37 | 38 | trng, use_noise, \ 39 | x, x_mask, y, y_mask, \ 40 | opt_ret, \ 41 | cost = \ 42 | build_model(tparams, option) 43 | inps = [x, x_mask, y, y_mask] 44 | use_noise.set_value(0.) 45 | 46 | if alignweights: 47 | sys.stderr.write("\t*** Save weight mode ON, alignment matrix will be saved.\n") 48 | outputs = [cost, opt_ret['dec_alphas']] 49 | f_log_probs = theano.function(inps, outputs) 50 | else: 51 | f_log_probs = theano.function(inps, cost) 52 | 53 | fs_log_probs.append(f_log_probs) 54 | 55 | def _score(pairs, alignweights=False): 56 | # sample given an input sequence and obtain scores 57 | scores = [] 58 | alignments = [] 59 | for i, f_log_probs in enumerate(fs_log_probs): 60 | score, alignment = pred_probs(f_log_probs, prepare_data, options[i], pairs, normalize=normalize, alignweights = alignweights) 61 | scores.append(score) 62 | alignments.append(alignment) 63 | 64 | return scores, alignments 65 | 66 | pairs = TextIterator(source_file.name, target_file.name, 67 | options[0]['dictionaries'][:-1], options[0]['dictionaries'][1], 68 | n_words_source=options[0]['n_words_src'], n_words_target=options[0]['n_words'], 69 | batch_size=b, 70 | maxlen=float('inf'), 71 | sort_by_length=False) #TODO: sorting by length could be more efficient, but we'd want to resort after 72 | 73 | scores, alignments = _score(pairs, alignweights) 74 | 75 | source_file.seek(0) 76 | target_file.seek(0) 77 | source_lines = source_file.readlines() 78 | target_lines = target_file.readlines() 79 | 80 | for i, line in enumerate(target_lines): 81 | score_str = ' '.join(map(str,[s[i] for s in scores])) 82 | if verbose: 83 | saveto.write('{0} '.format(line.strip())) 84 | saveto.write('{0}\n'.format(score_str)) 85 | 86 | ### optional save weights mode. 87 | if alignweights: 88 | ### writing out the alignments. 89 | temp_name = saveto.name + ".json" 90 | with tempfile.NamedTemporaryFile(prefix=temp_name) as align_OUT: 91 | for line in all_alignments: 92 | align_OUT.write(line + "\n") 93 | ### combining the actual source and target words. 
94 |             combine_source_target_text_1to1(source_file, target_file, saveto.name, align_OUT)
95 | 
96 | def main(models, source_file, target_file, saveto, b=80,
97 |          normalize=False, verbose=False, alignweights=False):
98 | 
99 |     # load model options
100 |     options = []
101 |     for model in models:
102 |         options.append(load_config(model))
103 | 
104 |         fill_options(options[-1])
105 | 
106 |     rescore_model(source_file, target_file, saveto, models, options, b, normalize, verbose, alignweights)
107 | 
108 | if __name__ == "__main__":
109 |     parser = argparse.ArgumentParser()
110 |     parser.add_argument('-b', type=int, default=80,
111 |                         help="Minibatch size (default: %(default)s)")
112 |     parser.add_argument('-n', action="store_true",
113 |                         help="Normalize scores by sentence length")
114 |     parser.add_argument('-v', action="store_true", help="verbose mode.")
115 |     parser.add_argument('--models', '-m', type=str, nargs = '+', required=True,
116 |                         help="model to use. Provide multiple models (with same vocabulary) for ensemble decoding")
117 |     parser.add_argument('--source', '-s', type=argparse.FileType('r'),
118 |                         required=True, metavar='PATH',
119 |                         help="Source text file")
120 |     parser.add_argument('--target', '-t', type=argparse.FileType('r'),
121 |                         required=True, metavar='PATH',
122 |                         help="Target text file")
123 |     parser.add_argument('--output', '-o', type=argparse.FileType('w'),
124 |                         default=sys.stdout, metavar='PATH',
125 |                         help="Output file (default: standard output)")
126 |     parser.add_argument('--walign', '-w',required = False,action="store_true",
127 |                         help="Whether to store the alignment weights or not. If specified, weights will be saved in <target>.alignment")
128 | 
129 |     args = parser.parse_args()
130 | 
131 |     main(args.models, args.source, args.target,
132 |          args.output, b=args.b, normalize=args.n, verbose=args.v, alignweights=args.walign)
--------------------------------------------------------------------------------
/nematus/metrics/chrf.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | from scorer import Scorer
5 | from reference import Reference
6 | 
7 | class CharacterFScorer(Scorer):
8 |     """
9 |     Scores CharacterFScoreReference objects.
10 |     """
11 | 
12 |     def __init__(self, argument_string):
13 |         """
14 |         Initialises metric-specific parameters.
15 |         """
16 |         Scorer.__init__(self, argument_string)
17 |         # use character n-gram order of 6 by default
18 |         if not 'n' in self._arguments.keys():
19 |             self._arguments['n'] = 6
20 |         # use beta = 1 by default (recommendation by Maja Popovic for generative modelling)
21 |         if not 'beta' in self._arguments.keys():
22 |             self._arguments['beta'] = 1
23 | 
24 |     def set_reference(self, reference_tokens):
25 |         """
26 |         Sets the reference against which hypotheses are scored.
27 |         """
28 |         self._reference = CharacterFScoreReference(
29 |             reference_tokens,
30 |             self._arguments['n'],
31 |             self._arguments['beta']
32 |         )
33 | 
34 | class CharacterFScoreReference(Reference):
35 |     """
36 |     References for Character F-Score, as proposed by Popovic (2015): http://www.statmt.org/wmt15/pdf/WMT49.pdf
37 |     """
38 | 
39 |     def __init__(self, reference_tokens, n=6, beta=1):
40 |         """
41 |         @param reference the reference translation that hypotheses shall be
42 |                          scored against.
43 |         @param n         maximum character n-gram order to consider.
44 |         @param beta      algorithm parameter beta (interpolation weight, needs to be > 0).
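Given the argument parser in score.py above, a typical invocation looks like the following (model and corpus paths are placeholders; -n normalizes the log-probabilities by sentence length):

python nematus/score.py -m models/model.npz -s data/test.src -t data/test.trg -o test.scores -b 80 -n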
45 | """ 46 | if beta <= 0: 47 | raise ValueError("Value of beta needs to be larger than zero!") 48 | 49 | Reference.__init__(self, reference_tokens) 50 | self.n = n 51 | self.max_order = n 52 | self.beta_squared = beta ** 2 53 | 54 | # The paper specifies that whitespace is ignored, but for a training objective, 55 | #it's perhaps better to leave it in. According to the paper, it makes no 56 | #difference in practise for scoring. 57 | self._reference_string = " ".join(reference_tokens).strip() 58 | 59 | # Get n-grams from reference: 60 | self._reference_ngrams = self._get_ngrams(self._reference_string, self.n) 61 | 62 | def _get_ngrams(self, tokens, n): 63 | """ 64 | Extracts all n-grams up to order @param n from a list of @param tokens. 65 | """ 66 | n_grams_dict = {} 67 | length = len(tokens) 68 | #If the reference is shorter than n characters, insist on an exact match: 69 | if len(tokens) < n: 70 | self.max_order = len(tokens) 71 | m = 1 72 | while m <= n: #n-gram order 73 | i = m 74 | n_grams_list = [] 75 | order_dict = {} 76 | while (i <= length): 77 | n_grams_list.append(tokens[i-m:i]) 78 | i += 1 79 | for ngr in n_grams_list: 80 | order_dict[ngr] = order_dict.setdefault(ngr,0) + 1 81 | n_grams_dict[m] = order_dict 82 | m += 1 83 | return n_grams_dict 84 | 85 | def score(self, hypothesis_tokens): 86 | """ 87 | Scores @param hypothesis against this reference. 88 | 89 | @return the sentence-level ChrF score: 1.0 is best, 0.0 worst. 90 | """ 91 | #See comment above on treating whitespace. 92 | hypothesis_string = " ".join(hypothesis_tokens).strip() 93 | 94 | #If the hypothesis or the reference is empty, insist on an exact match: 95 | if len(self._reference_string) < 1 or len(hypothesis_string) < 1: 96 | if hypothesis_string == self._reference_string: 97 | return 1.0 98 | else: 99 | return 0.0 100 | 101 | hypothesis_ngrams = self._get_ngrams(hypothesis_string, self.n) 102 | 103 | #Calculate character precision: 104 | chrP = 0.0 105 | chrR = 0.0 106 | for m in range(1,self.n+1): 107 | hyp_count = 0.0 108 | count_total = 0.0 109 | count_in = 0.0 110 | for ngr in hypothesis_ngrams[m]: 111 | hyp_count = hypothesis_ngrams[m][ngr] 112 | count_total += hyp_count 113 | if ngr in self._reference_ngrams[m]: 114 | count_in += min(hyp_count, self._reference_ngrams[m][ngr]) 115 | #Catch division by zero: 116 | if count_total == 0.0: 117 | chrP += 0.0 118 | else: 119 | chrP += count_in / count_total 120 | #average chrP over n-gram orders: 121 | chrP = chrP / float(self.max_order) 122 | 123 | #Calculate character recall: 124 | for m in range(1,self.n+1): 125 | ref_count = 0.0 126 | count_total = 0.0 127 | count_in = 0.0 128 | for ngr in self._reference_ngrams[m]: 129 | ref_count = self._reference_ngrams[m][ngr] 130 | count_total += ref_count 131 | if ngr in hypothesis_ngrams[m]: 132 | count_in += min(ref_count, hypothesis_ngrams[m][ngr]) 133 | #Catch division by zero: 134 | if count_total == 0.0: 135 | chrR += 0.0 136 | else: 137 | chrR += count_in/count_total 138 | #average chrR over n-gram orders: 139 | chrR = chrR / float(self.max_order) 140 | 141 | #Catch division by zero: 142 | if chrP == 0.0 and chrR == 0.0: 143 | return 0.0 144 | return (1 + self.beta_squared) * (chrP*chrR) / ((self.beta_squared * chrP) + chrR) -------------------------------------------------------------------------------- /nematus/rescore.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Rescoring an n-best list of translations using a translation model. 
3 | ''' 4 | import sys 5 | import argparse 6 | import tempfile 7 | 8 | import numpy 9 | import json 10 | 11 | from data_iterator import TextIterator 12 | from util import load_dict, load_config 13 | from alignment_util import * 14 | from compat import fill_options 15 | 16 | from theano_util import (load_params, init_theano_params) 17 | from nmt import (pred_probs, build_model, prepare_data, init_params) 18 | 19 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 20 | import theano 21 | 22 | def rescore_model(source_file, nbest_file, saveto, models, options, b, normalize, verbose, alignweights): 23 | 24 | trng = RandomStreams(1234) 25 | 26 | fs_log_probs = [] 27 | 28 | for model, option in zip(models, options): 29 | 30 | # load model parameters and set theano shared variables 31 | param_list = numpy.load(model).files 32 | param_list = dict.fromkeys([key for key in param_list if not key.startswith('adam_')], 0) 33 | params = load_params(model, param_list) 34 | tparams = init_theano_params(params) 35 | 36 | trng, use_noise, \ 37 | x, x_mask, y, y_mask, \ 38 | opt_ret, \ 39 | cost = \ 40 | build_model(tparams, option) 41 | inps = [x, x_mask, y, y_mask] 42 | use_noise.set_value(0.) 43 | 44 | if alignweights: 45 | sys.stderr.write("\t*** Save weight mode ON, alignment matrix will be saved.\n") 46 | outputs = [cost, opt_ret['dec_alphas']] 47 | f_log_probs = theano.function(inps, outputs) 48 | else: 49 | f_log_probs = theano.function(inps, cost) 50 | 51 | fs_log_probs.append(f_log_probs) 52 | 53 | def _score(pairs, alignweights=False): 54 | # sample given an input sequence and obtain scores 55 | scores = [] 56 | alignments = [] 57 | for i, f_log_probs in enumerate(fs_log_probs): 58 | score, alignment = pred_probs(f_log_probs, prepare_data, options[i], pairs, normalize=normalize, alignweights = alignweights) 59 | scores.append(score) 60 | alignments.append(alignment) 61 | 62 | return scores, alignments 63 | 64 | lines = source_file.readlines() 65 | nbest_lines = nbest_file.readlines() 66 | 67 | if alignweights: ### opening the temporary file. 68 | temp_name = saveto.name + ".json" 69 | align_OUT = tempfile.NamedTemporaryFile(prefix=temp_name) 70 | 71 | with tempfile.NamedTemporaryFile(prefix='rescore-tmpin') as tmp_in, tempfile.NamedTemporaryFile(prefix='rescore-tmpout') as tmp_out: 72 | for line in nbest_lines: 73 | linesplit = line.split(' ||| ') 74 | idx = int(linesplit[0]) ##index from the source file. Starting from 0. 75 | tmp_in.write(lines[idx]) 76 | tmp_out.write(linesplit[1] + '\n') 77 | 78 | tmp_in.seek(0) 79 | tmp_out.seek(0) 80 | pairs = TextIterator(tmp_in.name, tmp_out.name, 81 | options[0]['dictionaries'][:-1], options[0]['dictionaries'][1], 82 | n_words_source=options[0]['n_words_src'], n_words_target=options[0]['n_words'], 83 | batch_size=b, 84 | maxlen=float('inf'), 85 | sort_by_length=False) #TODO: sorting by length could be more efficient, but we'd have to synchronize scores with n-best list after 86 | 87 | 88 | scores, alignments = _score(pairs, alignweights) 89 | 90 | for i, line in enumerate(nbest_lines): 91 | score_str = ' '.join(map(str,[s[i] for s in scores])) 92 | saveto.write('{0} {1}\n'.format(line.strip(), score_str)) 93 | 94 | ### optional save weights mode. 
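The n-best list consumed by rescore.py follows the usual Moses convention, "sentence-id ||| hypothesis ||| ...", where the zero-based id indexes into the source file; only the first two fields are used in the loop above. A minimal sketch of that field handling (the example line is invented):

# Sketch of the n-best fields rescore_model relies on.
line = "0 ||| this is a translation ||| F0= -4.2 ||| -1.37"
fields = line.split(' ||| ')
idx, hyp = int(fields[0]), fields[1]   # source-sentence index and hypothesis
print(idx, hyp)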
95 | if alignweights: 96 | for line in alignments: 97 | align_OUT.write(line + "\n") 98 | if alignweights: 99 | combine_source_target_text(source_file, nbest_file, saveto.name, align_OUT) 100 | align_OUT.close() 101 | 102 | def main(models, source_file, nbest_file, saveto, b=80, 103 | normalize=False, verbose=False, alignweights=False): 104 | 105 | # load model model_options 106 | options = [] 107 | for model in models: 108 | options.append(load_config(model)) 109 | 110 | fill_options(options[-1]) 111 | 112 | rescore_model(source_file, nbest_file, saveto, models, options, b, normalize, verbose, alignweights) 113 | 114 | if __name__ == "__main__": 115 | parser = argparse.ArgumentParser() 116 | parser.add_argument('-b', type=int, default=80, 117 | help="Minibatch size (default: %(default)s))") 118 | parser.add_argument('-n', action="store_true", 119 | help="Normalize scores by sentence length") 120 | parser.add_argument('-v', action="store_true", help="verbose mode.") 121 | parser.add_argument('--models', '-m', type=str, nargs = '+', required=True, 122 | help="model to use. Provide multiple models (with same vocabulary) for ensemble decoding") 123 | parser.add_argument('--source', '-s', type=argparse.FileType('r'), 124 | required=True, metavar='PATH', 125 | help="Source text file") 126 | parser.add_argument('--input', '-i', type=argparse.FileType('r'), 127 | default=sys.stdin, metavar='PATH', 128 | help="Input n-best list file (default: standard input)") 129 | parser.add_argument('--output', '-o', type=argparse.FileType('w'), 130 | default=sys.stdout, metavar='PATH', 131 | help="Output file (default: standard output)") 132 | parser.add_argument('--walign', '-w',required = False,action="store_true", 133 | help="Whether to store the alignment weights or not. If specified, weights will be saved in .alignment") 134 | 135 | args = parser.parse_args() 136 | 137 | main(args.models, args.source, args.input, 138 | args.output, b=args.b, normalize=args.n, verbose=args.v, alignweights=args.walign) 139 | -------------------------------------------------------------------------------- /nematus/optimizers.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Optimizers 3 | ''' 4 | 5 | import numpy 6 | from collections import OrderedDict 7 | 8 | import theano 9 | import theano.tensor as tensor 10 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 11 | 12 | from util import * 13 | from theano_util import * 14 | 15 | # Calling convention: 16 | # f_grad_shared, f_update = name(hyperp, tparams, grads, inputs (list), cost) 17 | # with profile as an optional argument 18 | 19 | def adam(lr, tparams, grads, inp, cost, beta1=0.9, beta2=0.999, e=1e-8, optimizer_params={}, profile=False): 20 | PREFIX='adam_' 21 | 22 | gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k) 23 | for k, p in tparams.iteritems()] 24 | gsup = [(gs, g) for gs, g in zip(gshared, grads)] 25 | 26 | f_grad_shared = theano.function(inp, cost, updates=gsup, profile=profile) 27 | 28 | updates = [] 29 | optimizer_tparams = {} 30 | 31 | t_prev_name = PREFIX + 't_prev' 32 | if t_prev_name in optimizer_params: 33 | t_prev_init = optimizer_params[t_prev_name] 34 | else: 35 | t_prev_init = 0. 36 | t_prev = theano.shared(numpy.float32(t_prev_init), t_prev_name) 37 | optimizer_tparams[t_prev_name] = t_prev 38 | 39 | t = t_prev + 1. 40 | lr_t = lr * tensor.sqrt(1. - beta2**t) / (1. 
- beta1**t) 41 | 42 | for p, g in zip(tparams.values(), gshared): 43 | # Create/Load variable for first moment 44 | m_name = PREFIX + p.name + '_mean' 45 | if m_name in optimizer_params: 46 | m_init = optimizer_params[m_name] 47 | else: 48 | m_init = p.get_value() * 0. 49 | m = theano.shared(m_init, m_name) 50 | optimizer_tparams[m_name] = m 51 | 52 | # Create/Load variable for second moment 53 | v_name = PREFIX + p.name + '_variance' 54 | if v_name in optimizer_params: 55 | v_init = optimizer_params[v_name] 56 | else: 57 | v_init = p.get_value() * 0. 58 | v = theano.shared(v_init, v_name) 59 | optimizer_tparams[v_name] = v 60 | 61 | # Define updates on shared vars 62 | m_t = beta1 * m + (1. - beta1) * g 63 | v_t = beta2 * v + (1. - beta2) * g**2 64 | step = lr_t * m_t / (tensor.sqrt(v_t) + e) 65 | p_t = p - step 66 | updates.append((m, m_t)) 67 | updates.append((v, v_t)) 68 | updates.append((p, p_t)) 69 | updates.append((t_prev, t)) 70 | 71 | f_update = theano.function([lr], [], updates=updates, 72 | on_unused_input='ignore', profile=profile) 73 | 74 | return f_grad_shared, f_update, optimizer_tparams 75 | 76 | def adadelta(lr, tparams, grads, inp, cost, optimizer_params={}, profile=False): 77 | zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), 78 | name='%s_grad' % k) 79 | for k, p in tparams.iteritems()] 80 | running_up2 = [theano.shared(p.get_value() * numpy.float32(0.), 81 | name='%s_rup2' % k) 82 | for k, p in tparams.iteritems()] 83 | running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.), 84 | name='%s_rgrad2' % k) 85 | for k, p in tparams.iteritems()] 86 | 87 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 88 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 89 | for rg2, g in zip(running_grads2, grads)] 90 | 91 | f_grad_shared = theano.function(inp, cost, updates=zgup+rg2up, 92 | profile=profile) 93 | 94 | updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg 95 | for zg, ru2, rg2 in zip(zipped_grads, running_up2, 96 | running_grads2)] 97 | ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2)) 98 | for ru2, ud in zip(running_up2, updir)] 99 | param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)] 100 | 101 | f_update = theano.function([lr], [], updates=ru2up+param_up, 102 | on_unused_input='ignore', profile=profile) 103 | 104 | # TODO: third return value should be a dict of name->shared var used by optimizer 105 | return f_grad_shared, f_update, {} 106 | 107 | 108 | def rmsprop(lr, tparams, grads, inp, cost, optimizer_params={}, profile=False): 109 | zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), 110 | name='%s_grad' % k) 111 | for k, p in tparams.iteritems()] 112 | running_grads = [theano.shared(p.get_value() * numpy.float32(0.), 113 | name='%s_rgrad' % k) 114 | for k, p in tparams.iteritems()] 115 | running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.), 116 | name='%s_rgrad2' % k) 117 | for k, p in tparams.iteritems()] 118 | 119 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 120 | rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)] 121 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 122 | for rg2, g in zip(running_grads2, grads)] 123 | 124 | f_grad_shared = theano.function(inp, cost, updates=zgup+rgup+rg2up, 125 | profile=profile) 126 | 127 | updir = [theano.shared(p.get_value() * numpy.float32(0.), 128 | name='%s_updir' % k) 129 | for k, p in tparams.iteritems()] 130 | updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4)) 131 | for ud, zg, rg, rg2 
in zip(updir, zipped_grads, running_grads, 132 | running_grads2)] 133 | param_up = [(p, p + udn[1]) 134 | for p, udn in zip(itemlist(tparams), updir_new)] 135 | f_update = theano.function([lr], [], updates=updir_new+param_up, 136 | on_unused_input='ignore', profile=profile) 137 | 138 | # TODO: third return value should be a dict of name->shared var used by optimizer 139 | return f_grad_shared, f_update, {} 140 | 141 | 142 | def sgd(lr, tparams, grads, inp, cost, optimizer_params=None, profile=False): 143 | gshared = [theano.shared(p.get_value() * 0., 144 | name='%s_grad' % k) 145 | for k, p in tparams.iteritems()] 146 | gsup = [(gs, g) for gs, g in zip(gshared, grads)] 147 | 148 | f_grad_shared = theano.function(inp, cost, updates=gsup, 149 | profile=profile) 150 | 151 | pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)] 152 | f_update = theano.function([lr], [], updates=pup, profile=profile) 153 | 154 | return f_grad_shared, f_update, {} 155 | 156 | -------------------------------------------------------------------------------- /nematus/domain_interpolation_data_iterator.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | import gzip 4 | 5 | import shuffle 6 | from util import load_dict 7 | 8 | import math 9 | 10 | def fopen(filename, mode='r'): 11 | if filename.endswith('.gz'): 12 | return gzip.open(filename, mode) 13 | return open(filename, mode) 14 | 15 | 16 | class DomainInterpolatorTextIterator: 17 | """Bitext iterator with domain interpolation.""" 18 | def __init__(self, source, target, 19 | source_dicts, target_dict, 20 | batch_size=128, 21 | maxlen=100, 22 | n_words_source=-1, 23 | n_words_target=-1, 24 | skip_empty=False, 25 | shuffle_each_epoch=False, 26 | sort_by_length=True, 27 | indomain_source='', indomain_target='', 28 | interpolation_rate=0.1, 29 | maxibatch_size=20): 30 | if shuffle_each_epoch: 31 | self.source_orig = source 32 | self.target_orig = target 33 | self.source, self.target = shuffle.main([self.source_orig, self.target_orig], temporary=True) 34 | self.indomain_source_orig = indomain_source 35 | self.indomain_target_orig = indomain_target 36 | self.indomain_source, self.indomain_target = shuffle.main([self.indomain_source_orig, self.indomain_target_orig], temporary=True) 37 | else: 38 | self.source = fopen(source, 'r') 39 | self.target = fopen(target, 'r') 40 | self.indomain_source = fopen(indomain_source, 'r') 41 | self.indomain_target = fopen(indomain_target, 'r') 42 | self.source_dicts = [] 43 | for source_dict in source_dicts: 44 | self.source_dicts.append(load_dict(source_dict)) 45 | self.target_dict = load_dict(target_dict) 46 | 47 | self.batch_size = batch_size 48 | self.maxlen = maxlen 49 | self.skip_empty = skip_empty 50 | 51 | self.n_words_source = n_words_source 52 | self.n_words_target = n_words_target 53 | 54 | if self.n_words_source > 0: 55 | for d in self.source_dicts: 56 | for key, idx in d.items(): 57 | if idx >= self.n_words_source: 58 | del d[key] 59 | 60 | if self.n_words_target > 0: 61 | for key, idx in self.target_dict.items(): 62 | if idx >= self.n_words_target: 63 | del self.target_dict[key] 64 | 65 | self.shuffle = shuffle_each_epoch 66 | self.sort_by_length = sort_by_length 67 | 68 | self.source_buffer = [] 69 | self.target_buffer = [] 70 | self.k = batch_size * maxibatch_size 71 | 72 | self.end_of_data = False 73 | 74 | self.interpolation_rate = interpolation_rate 75 | self.cur_interpolation_rate = self.interpolation_rate 76 | self.indomain_k = 
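Written out, the Adam update implemented in the adam() function above is: m <- beta1*m + (1-beta1)*g, v <- beta2*v + (1-beta2)*g^2, lr_t = lr*sqrt(1-beta2^t)/(1-beta1^t), p <- p - lr_t*m/(sqrt(v)+e). A plain numpy transcription of the same update on a made-up quadratic objective (no Theano needed):

# Numpy sketch of the Adam update above, minimizing sum(p**2).
import numpy

p = numpy.array([5.0, -3.0])
m, v = numpy.zeros_like(p), numpy.zeros_like(p)
lr, beta1, beta2, e = 0.1, 0.9, 0.999, 1e-8

for t in range(1, 201):
    g = 2.0 * p                                   # gradient of the toy objective
    m = beta1 * m + (1. - beta1) * g
    v = beta2 * v + (1. - beta2) * g**2
    lr_t = lr * numpy.sqrt(1. - beta2**t) / (1. - beta1**t)
    p = p - lr_t * m / (numpy.sqrt(v) + e)

print(p)   # approaches [0. 0.]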
int(math.ceil(self.cur_interpolation_rate * self.k)) 77 | self.outdomain_k = self.k - self.indomain_k 78 | 79 | def __iter__(self): 80 | return self 81 | 82 | def reset(self): 83 | if self.shuffle: 84 | self.source, self.target = shuffle.main([self.source_orig, self.target_orig], temporary=True) 85 | else: 86 | self.source.seek(0) 87 | self.target.seek(0) 88 | 89 | def indomain_reset(self): 90 | if self.shuffle: 91 | self.indomain_source, self.indomain_target = shuffle.main([self.indomain_source_orig, self.indomain_target_orig], temporary=True) 92 | else: 93 | self.indomain_source.seek(0) 94 | self.indomain_target.seek(0) 95 | 96 | def adjust_domain_interpolation_rate(self, interpolation_rate): 97 | # discard sentences in buffers 98 | self.source_buffer = [] 99 | self.target_buffer = [] 100 | # adjust rate 101 | self.cur_interpolation_rate = interpolation_rate 102 | self.indomain_k = int(math.ceil(self.cur_interpolation_rate * self.k)) 103 | self.outdomain_k = self.k - self.indomain_k 104 | 105 | def next(self): 106 | if self.end_of_data: 107 | self.end_of_data = False 108 | self.reset() 109 | #raise StopIteration 110 | 111 | source = [] 112 | target = [] 113 | 114 | # fill buffer, if it's empty 115 | assert len(self.source_buffer) == len(self.target_buffer), 'Buffer size mismatch!' 116 | 117 | if len(self.source_buffer) == 0: 118 | for k_ in xrange(self.outdomain_k): 119 | ss = self.source.readline() 120 | if ss == "": 121 | break 122 | tt = self.target.readline() 123 | if tt == "": 124 | break 125 | self.source_buffer.append(ss.strip().split()) 126 | self.target_buffer.append(tt.strip().split()) 127 | for k_ in xrange(self.indomain_k): 128 | indomain_error = False 129 | try: 130 | ss = self.indomain_source.readline() 131 | tt = self.indomain_target.readline() 132 | except IOError: 133 | indomain_error = True 134 | if (ss == "") or (tt == "") or indomain_error: 135 | self.indomain_reset() 136 | raise StopIteration 137 | self.source_buffer.append(ss.strip().split()) 138 | self.target_buffer.append(tt.strip().split()) 139 | 140 | # sort by target buffer 141 | if self.sort_by_length: 142 | tlen = numpy.array([len(t) for t in self.target_buffer]) 143 | tidx = tlen.argsort() 144 | 145 | _sbuf = [self.source_buffer[i] for i in tidx] 146 | _tbuf = [self.target_buffer[i] for i in tidx] 147 | 148 | self.source_buffer = _sbuf 149 | self.target_buffer = _tbuf 150 | 151 | else: 152 | self.source_buffer.reverse() 153 | self.target_buffer.reverse() 154 | 155 | if len(self.source_buffer) == 0 or len(self.target_buffer) == 0: 156 | self.end_of_data = False 157 | self.reset() 158 | #raise StopIteration 159 | 160 | try: 161 | 162 | # actual work here 163 | while True: 164 | 165 | # read from source file and map to word index 166 | try: 167 | ss = self.source_buffer.pop() 168 | except IndexError: 169 | break 170 | tmp = [] 171 | for w in ss: 172 | w = [self.source_dicts[i][f] if f in self.source_dicts[i] else 1 for (i,f) in enumerate(w.split('|'))] 173 | tmp.append(w) 174 | ss = tmp 175 | 176 | # read from source file and map to word index 177 | tt = self.target_buffer.pop() 178 | tt = [self.target_dict[w] if w in self.target_dict else 1 179 | for w in tt] 180 | if self.n_words_target > 0: 181 | tt = [w if w < self.n_words_target else 1 for w in tt] 182 | 183 | if len(ss) > self.maxlen and len(tt) > self.maxlen: 184 | continue 185 | if self.skip_empty and (not ss or not tt): 186 | continue 187 | 188 | source.append(ss) 189 | target.append(tt) 190 | 191 | if len(source) >= self.batch_size or \ 192 | 
105 |     def next(self):
106 |         if self.end_of_data:
107 |             self.end_of_data = False
108 |             self.reset()
109 |             #raise StopIteration
110 | 
111 |         source = []
112 |         target = []
113 | 
114 |         # fill buffer, if it's empty
115 |         assert len(self.source_buffer) == len(self.target_buffer), 'Buffer size mismatch!'
116 | 
117 |         if len(self.source_buffer) == 0:
118 |             for k_ in xrange(self.outdomain_k):
119 |                 ss = self.source.readline()
120 |                 if ss == "":
121 |                     break
122 |                 tt = self.target.readline()
123 |                 if tt == "":
124 |                     break
125 |                 self.source_buffer.append(ss.strip().split())
126 |                 self.target_buffer.append(tt.strip().split())
127 |             for k_ in xrange(self.indomain_k):
128 |                 indomain_error = False
129 |                 try:
130 |                     ss = self.indomain_source.readline()
131 |                     tt = self.indomain_target.readline()
132 |                 except IOError:
133 |                     indomain_error = True
134 |                 if (ss == "") or (tt == "") or indomain_error:
135 |                     self.indomain_reset()
136 |                     raise StopIteration
137 |                 self.source_buffer.append(ss.strip().split())
138 |                 self.target_buffer.append(tt.strip().split())
139 | 
140 |             # sort by target buffer
141 |             if self.sort_by_length:
142 |                 tlen = numpy.array([len(t) for t in self.target_buffer])
143 |                 tidx = tlen.argsort()
144 | 
145 |                 _sbuf = [self.source_buffer[i] for i in tidx]
146 |                 _tbuf = [self.target_buffer[i] for i in tidx]
147 | 
148 |                 self.source_buffer = _sbuf
149 |                 self.target_buffer = _tbuf
150 | 
151 |             else:
152 |                 self.source_buffer.reverse()
153 |                 self.target_buffer.reverse()
154 | 
155 |         if len(self.source_buffer) == 0 or len(self.target_buffer) == 0:
156 |             self.end_of_data = False
157 |             self.reset()
158 |             #raise StopIteration
159 | 
160 |         try:
161 | 
162 |             # actual work here
163 |             while True:
164 | 
165 |                 # read from source file and map to word index
166 |                 try:
167 |                     ss = self.source_buffer.pop()
168 |                 except IndexError:
169 |                     break
170 |                 tmp = []
171 |                 for w in ss:
172 |                     w = [self.source_dicts[i][f] if f in self.source_dicts[i] else 1 for (i,f) in enumerate(w.split('|'))]
173 |                     tmp.append(w)
174 |                 ss = tmp
175 | 
176 |                 # read from target file and map to word index
177 |                 tt = self.target_buffer.pop()
178 |                 tt = [self.target_dict[w] if w in self.target_dict else 1
179 |                       for w in tt]
180 |                 if self.n_words_target > 0:
181 |                     tt = [w if w < self.n_words_target else 1 for w in tt]
182 | 
183 |                 if len(ss) > self.maxlen and len(tt) > self.maxlen:  # note: a pair is skipped only if *both* sides exceed maxlen
184 |                     continue
185 |                 if self.skip_empty and (not ss or not tt):
186 |                     continue
187 | 
188 |                 source.append(ss)
189 |                 target.append(tt)
190 | 
191 |                 if len(source) >= self.batch_size or \
192 |                         len(target) >= self.batch_size:
193 |                     break
194 |         except IOError:
195 |             self.end_of_data = True
196 | 
197 |         # all sentence pairs in maxibatch filtered out because of length
198 |         if len(source) == 0 or len(target) == 0:
199 |             source, target = self.next()
200 | 
201 |         return source, target
202 | 
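# A minimal usage sketch (an editor's illustration; the file names below are
# hypothetical). Each refill of the internal buffer draws
# k = batch_size * maxibatch_size sentence pairs: ceil(interpolation_rate * k)
# from the in-domain bitext and the remainder from the out-of-domain bitext.
if __name__ == '__main__':
    train = DomainInterpolatorTextIterator(
        'corpus.bpe.src', 'corpus.bpe.trg',        # out-of-domain bitext
        ['vocab.src.json'], 'vocab.trg.json',      # source dict(s) and target dict
        batch_size=80, maxibatch_size=20,
        indomain_source='indomain.bpe.src', indomain_target='indomain.bpe.trg',
        interpolation_rate=0.1)                    # ~10% in-domain per maxibatch
    for source_batch, target_batch in train:       # lists of word-index lists
        print len(source_batch), 'sentence pairs in batch'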
--------------------------------------------------------------------------------
/test/data/indomain-dev.en:
--------------------------------------------------------------------------------
1 | one day , Los Angeles Times colum@@ n@@ ist Steve Lopez was walking along the streets of downtown Los Angeles when he heard beautiful music .
2 | and the source was a man , an African-@@ American man , charming , rugged , homeless , playing a violin that only had two strings .
3 | and I 'm telling a story that many of you know , because Steve 's columns became the basis for a book , which was turned into a movie , with Robert Do@@ w@@ ney Jr. acting as Steve Lopez , and Jamie Fo@@ xx as Nath@@ an@@ iel Anthony A@@ yers , the Ju@@ illi@@ ard-@@ trained double b@@ assist whose promising career was cut short by a tragic afflic@@ tion with paranoid schizophren@@ ia .
4 | Nath@@ an@@ iel dropped out of Ju@@ illi@@ ard , he suffered a complete breakdown , and 30 years later he was living homeless on the streets of Sk@@ id Ro@@ w in downtown Los Angeles .
5 | I encourage all of you to read Steve 's book or to watch the movie to understand not only the beautiful bond that formed between these two men , but how music helped shape that bond , and ultimately was instrumental -- if you 'll pardon the p@@ un -- in helping Nath@@ an@@ iel get off the streets .
6 | I met Mr. A@@ yers in 2008 , two years ago , at Walt Disney Concert Hall .
7 | he had just heard a performance of Beethoven 's First and Fourth symph@@ onies , and came back@@ stage and introduced himself .
8 | he was speaking in a very jo@@ vial and greg@@ arious way about Y@@ o-@@ Y@@ o Ma and Hillary Clinton and how the Dod@@ gers were never going to make the World Series , all because of the tre@@ acher@@ ous first violin passage work in the last movement of Beethoven 's Fourth Symphony .
9 | and we got talking about music , and I got an email from Steve a few days later saying that Nath@@ an@@ iel was interested in a violin lesson with me .
10 | now , I should mention that Nath@@ an@@ iel refuses treatment because when he was treated it was with shock therapy and Thor@@ az@@ ine and hand@@ cu@@ ffs , and that scar has stayed with him for his entire life .
11 | but as a result now , he is prone to these schizophren@@ ic episodes , the worst of which can manifest themselves as and then disappearing for days , wandering the streets of Sk@@ id Ro@@ w , exposed to its horrors , with the tor@@ ment of his own mind unleashed upon him .
12 | and Nath@@ an@@ iel was in such a state of ag@@ itation when we started our first lesson at Walt Disney Concert Hall -- he had a kind of man@@ ic g@@ lin@@ t in his eyes , he was lost .
13 | and he was talking about invisible demons and smoke , and how someone was poisoning him in his sleep .
14 | and I was afraid , not for myself , but I was afraid that I was going to lose him , that he was going to sink into one of his states , and that I would ruin his relationship with the violin if I started talking about scales and ar@@ peg@@ gi@@ os and other exciting forms of didac@@ tic violin pedagog@@ y .
15 | so , I just started playing .
16 | and I played the first movement of the Beethoven Viol@@ in Concerto .
17 | and as I played , I understood that there was a profound change occurring in Nath@@ an@@ iel 's eyes .
18 | it was as if he was in the grip of some invisible pharmaceutical , a chemical reaction , for which my playing the music was its catalyst .
19 | and Nath@@ an@@ iel 's man@@ ic rage was transformed into understanding , a quiet curiosity and grace .
20 | and in a miracle , he lifted his own violin and he started playing , by ear , certain sni@@ ppets of violin concer@@ tos which he then asked me to complete -- Mendelssohn , T@@ ch@@ ai@@ kovsky , Si@@ bel@@ ius .
21 | and we started talking about music , from Bach to Beethoven and Brahms , Bruck@@ ner , all the B 's , from Bart@@ ó@@ k , all the way up to E@@ sa-@@ Pek@@ ka Sal@@ onen .
22 | and I understood that he not only had an en@@ cyclop@@ edic knowledge of music , but he related to this music at a personal level .
23 | he spoke about it with the kind of passion and understanding that I share with my colleagues in the Los Angeles Philharmonic .
24 | and through playing music and talking about music , this man had transformed from the paranoid , disturbed man that had just come from walking the streets of downtown Los Angeles to the charming , eru@@ dite , brilliant , Ju@@ illi@@ ard-@@ trained musician .
25 | music is medicine . music changes us .
26 | and for Nath@@ an@@ iel , music is san@@ ity .
27 | because music allows him to take his thoughts and delu@@ sions and shape them through his imagination and his creativity , into reality .
28 | and that is an escape from his tor@@ mented state .
29 | and I understood that this was the very essence of art .
30 | this was the very reason why we made music , that we take something that exists within all of us at our very fundamental core , our emotions , and through our artistic lens , through our creativity , we 're able to shape those emotions into reality .
31 | and the reality of that expression reaches all of us and moves us , inspires and unites us .
32 | and for Nath@@ an@@ iel , music brought him back into a fold of friends .
33 | the rede@@ mp@@ tive power of music brought him back into a family of musicians that understood him , that recognized his talents and respected him .
34 | and I will always make music with Nath@@ an@@ iel , whether we 're at Walt Disney Concert Hall or on Sk@@ id Ro@@ w , because he reminds me why I became a musician .
35 | thank you .
36 | Bruno Gi@@ uss@@ ani : thank you . thanks .
37 | Robert G@@ up@@ ta .
38 | Robert G@@ up@@ ta : I 'm going to play something that I sham@@ elessly st@@ ole from cell@@ ists .
39 | so , please forgive me .
40 | so , I 've known a lot of fish in my life .
41 | I 've loved only two .
42 | that first one , it was more like a passionate affair .
43 | it was a beautiful fish : fla@@ vor@@ ful , tex@@ tured , me@@ aty , a bes@@ tseller on the menu .
44 | what a fish .
45 | even better , it was far@@ m-@@ raised to the supposed highest standards of sustainability .
46 | so you could feel good about selling it .
47 | I was in a relationship with this beauty for several months .
48 | one day , the head of the company called and asked if I 'd speak at an event about the farm 's sustainability .
49 | `` Absol@@ utely , '' I said .
50 | here was a company trying to solve what 's become this unimaginable problem for us chefs : how do we keep fish on our menus ?
51 | for the past 50 years , we 've been fishing the seas like we clear-cut forests .
52 | it 's hard to over@@ state the destruction .
53 | nin@@ ety percent of large fish , the ones we love -- the tun@@ as , the hal@@ i@@ bu@@ ts , the sal@@ mons , s@@ word@@ fish -- they 've collapsed .
54 | there 's almost nothing left .
55 | so , for better or for worse , aquaculture , fish farming , is going to be a part of our future .
56 | a lot of arguments against it : fish farms pollute -- most of them do anyway -- and they 're inefficient . take tuna , a major drawback .
57 | it 's got a feed conversion ratio of 15 to one .
58 | that means it takes fifteen pounds of wild fish to get you one pound of farm tuna .
59 | not very sustainable .
60 | it does n't taste very good either .
61 | so here , finally , was a company trying to do it right .
62 | I wanted to support them .
63 | the day before the event , I called the head of P.@@ R. for the company .
64 | let 's call him Don .
65 | `` Don , '' I said , `` just to get the facts straight , you guys are famous for farming so far out to sea , you do n't pollute . ''
66 | `` That 's right , '' he said . `` We 're so far out , the waste from our fish gets distributed , not concentrated . ''
67 | and then he added , `` We 're basically a world unto ourselves .
68 | that feed conversion ratio ? 2.5 to one , '' he said .
69 | `` Best in the business . ''
70 | 2.5 to one , great .
71 | `` 2.5 what ? what are you feeding ? ''
72 | `` Sustainable proteins , '' he said .
73 | `` Great , '' I said . got off the phone .
74 | and that night , I was lying in bed , and I thought : what the hell is a sustainable protein ?
75 | so the next day , just before the event , I called Don .
76 | I said , `` Don , what are some examples of sustainable proteins ? ''
77 | he said he did n't know . he would ask around .
78 | well , I got on the phone with a few people in the company ; no one could give me a straight answer until finally , I got on the phone with the head bi@@ ologist .
79 | let 's call him Don too .
80 | `` Don , '' I said , `` what are some examples of sustainable proteins ? ''
81 | well , he mentioned some al@@ ga@@ es and some fish meals , and then he said chicken pellets .
82 | I said , `` Ch@@ icken pellets ? ''
83 | he said , `` Ye@@ ah , feathers , skin , bone meal , scra@@ ps , dried and processed into feed . ''
84 | I said , `` What percentage of your feed is chicken ? ''
85 | thinking , you know , two percent .
86 | `` Well , it 's about 30 percent , '' he said .
87 | I said , `` Don , what 's sustainable about feeding chicken to fish ? ''
88 | there was a long pause on the line , and he said , `` There 's just too much chicken in the world . ''
89 | I fell out of love with this fish .
90 | no , not because I 'm some self-@@ righteous , good@@ y-@@ two shoes Foo@@ die .
91 | I actually am .
92 | no , I actually fell out of love with this fish because , I swe@@ ar to God , after that conversation , the fish tasted like chicken .
93 | this second fish , it 's a different kind of love story .
94 | it 's the romantic kind , the kind where the more you get to know your fish , you love the fish .
95 | I first ate it at a restaurant in southern Spain .
96 | a journalist friend had been talking about this fish for a long time .
97 | she kind of set us up .
98 | it came to the table a bright , almost sh@@ immer@@ ing , white color .
99 | the chef had over@@ cooked it .
100 | like twice over .
101 | 
--------------------------------------------------------------------------------
/nematus/alignment_util.py:
--------------------------------------------------------------------------------
1 | __author__ = 'canliu'
2 | """
3 | Save the alignment matrix in XML format. Like the following:
4 | 
5 | <sentence id="1">
6 | <alignment>
7 | <matrix>
8 | x,x,x...
9 | x,x,x...
10 | </matrix>
11 | </alignment>
12 | </sentence>
13 | The number of rows is equal to the number of target_words + 1.
14 | The number of columns is equal to the number of source_words + 1.
15 | """
16 | import json
17 | import sys
18 | import codecs
19 | 
20 | def get_alignments(attention, x_mask, y_mask):
21 |     #print "\nPrinting Attention..."
22 |     #print attention
23 |     #print "\nPrinting x_mask, need to figure out how to use it"
24 |     #print x_mask
25 |     #print "\nPrinting y_mask, need to figure out how to use it"
26 |     #print y_mask
27 | 
28 |     n_rows, n_cols = y_mask.shape  ### n_cols corresponds to the number of sentences.
29 |     #print "Number of rows and number of columns: \n\n", n_rows, n_cols
30 | 
31 |     for target_sent_index in range(n_cols):
32 |         #print "\n\n","*" * 40
33 |         print "Going through sentence", target_sent_index
34 |         #source_sent_index = source_indexes[target_sent_index]
35 |         target_length = y_mask[:,target_sent_index].tolist().count(1)
36 |         source_length = x_mask[:,target_sent_index].tolist().count(1)
37 |         # #print "STEP1: The attention matrix that is relevant for this sentence",
38 |         temp_attention = attention[range(target_length),:,:]
39 |         #print "STEP2: The attention matrix that is particular to just this sentence\n",
40 |         this_attention = temp_attention[:,target_sent_index,range(source_length)]
41 | 
42 |         jdata = {}
43 |         jdata['matrix'] = this_attention.tolist()
44 |         jdata = json.dumps(jdata)
45 |         #print "\t\tJSON Data"
46 |         #print "\t\t",jdata
47 |         yield jdata
48 | 
49 | def combine_source_target_text(source_IN, nbest_IN, saveto, alignment_IN):
50 |     """
51 |     There can be multiple target sentences aligned to the same source sentence.
52 |     """
53 |     source_IN.seek(0)
54 |     nbest_IN.seek(0)
55 |     alignment_IN.seek(0)
56 | 
57 |     with open(saveto + "_withwords.json", "w") as alignment_OUT:
58 |         all_matrixes = alignment_IN.readlines()
59 |         nbest_lines = nbest_IN.readlines()
60 |         source_lines = source_IN.readlines()
61 |         assert len(all_matrixes) == len(nbest_lines), "The number of lines does not match!"
62 | 
63 |         for target_index in range(len(all_matrixes)):
64 |             jdata = json.loads(all_matrixes[target_index])
65 |             target_line = nbest_lines[target_index]
66 |             elements = target_line.strip().split("|||")
67 |             refer_index = int(elements[0].strip())
68 |             source_sent = source_lines[refer_index].strip()
69 |             target_sent = elements[1].strip()
70 | 
71 |             jdata["source_sent"] = source_sent
72 |             jdata["target_sent"] = target_sent
73 |             jdata["id"] = refer_index
74 |             jdata["prob"] = 0 #float(elements[2].strip().split()[1])
75 | 
76 |             #jdata = json.dumps(jdata)
77 |             jdata = json.dumps(jdata).decode('unicode-escape').encode('utf8')
78 |             alignment_OUT.write(jdata + "\n")
79 | 
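# The shape of each line that combine_source_target_text() above writes to
# "<saveto>_withwords.json", sketched with hypothetical values (an editor's
# illustration; the n-best input is expected in Moses-style format,
# "id ||| translation ||| scores"):
#
#     {"matrix": [[0.9, 0.1], [0.2, 0.8]],  # (target_words+1) x (source_words+1) attention weights
#      "source_sent": "ein Beispiel",
#      "target_sent": "an example",
#      "id": 0,
#      "prob": 0}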
83 | """ 84 | source_IN.seek(0) 85 | target_IN.seek(0) 86 | alignment_IN.seek(0) 87 | with open(saveto + "_withwords.json", "w") as alignment_OUT: 88 | 89 | all_matrixes = alignment_IN.readlines() 90 | target_lines = target_IN.readlines() 91 | source_lines = source_IN.readlines() 92 | assert len(all_matrixes) == len(target_lines), "The number of lines does not match with each other!" 93 | 94 | for target_index in range(len(all_matrixes)): 95 | jdata = json.loads(all_matrixes[target_index]) 96 | 97 | jdata["source_sent"] = source_lines[target_index].strip() 98 | jdata["target_sent"] = target_lines[target_index].strip() 99 | jdata["id"] = target_index 100 | jdata["prob"] = 0 #float(elements[2].strip().split()[1]) 101 | 102 | #jdata = json.dumps(jdata) 103 | jdata = json.dumps(jdata).decode('unicode-escape').encode('utf8') 104 | alignment_OUT.write(jdata + "\n") 105 | 106 | 107 | def convert_to_nodes_edges_v1(filename): 108 | """ 109 | Take as input the aligned file with file names ".withtext", and convert this into a file with nodes and edges. 110 | Which will later used for Visualization. 111 | """ 112 | with open(filename, "r") as IN: 113 | with open(filename + ".forweb" , "w") as OUT: 114 | in_lines = IN.readlines() 115 | for data in in_lines: 116 | data4web = convert_to_nodes_edges_each_v1(data) 117 | OUT.write(data4web + "\n") 118 | 119 | def convert_to_nodes_edges_each_v1(data): 120 | """ 121 | give a single data object string, convert it into a json data string that is compatible with the Web interface. 122 | """ 123 | jdata = json.loads(data) 124 | web_data = {} 125 | source_words = jdata["source_sent"].strip().split() 126 | target_words = jdata["target_sent"].strip().split() 127 | 128 | ###make the data for source and target words 129 | web_data["nodes"] = [] 130 | for word in source_words: 131 | web_data["nodes"].append({"name":word, "group": 1}) 132 | web_data["nodes"].append({"name":" 2 | 3 | 4 | 5 | 6 | Visualization Demo 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 25 | 26 | 34 | 37 | 38 | 52 | 53 |
--------------------------------------------------------------------------------
/utils/attention_web.php:
--------------------------------------------------------------------------------
[HTML/JavaScript visualization page (~215 lines); its markup was stripped when this dump was generated, and only the page title "Visualization Demo" survives]
--------------------------------------------------------------------------------