├── .gitignore ├── .htaccess ├── HEADER.md ├── README.md ├── cs-en ├── STATUS ├── postprocess.sh ├── preprocess.sh ├── rerank_normalize.py ├── tf-translate-ensemble.sh ├── tf-translate-reranked.sh ├── tf-translate-single.sh ├── translate-ensemble.sh ├── translate-reranked.sh ├── translate-single.sh └── vars ├── de-en ├── postprocess.sh ├── preprocess.sh ├── tf-translate-ensemble.sh ├── tf-translate-reranked.sh ├── tf-translate-single.sh ├── translate-ensemble.sh ├── translate-reranked.sh ├── translate-single.sh └── vars ├── en-cs ├── STATUS ├── postprocess.sh ├── preprocess.sh ├── rerank_normalize.py ├── test.sh ├── tf-translate-ensemble.sh ├── tf-translate-reranked.sh ├── tf-translate-single.sh ├── translate-ensemble.sh ├── translate-reranked.sh ├── translate-single.sh └── vars ├── en-de ├── postprocess.sh ├── preprocess.sh ├── tf-translate-ensemble.sh ├── tf-translate-reranked.sh ├── tf-translate-single.sh ├── translate-ensemble.sh ├── translate-reranked.sh ├── translate-single.sh └── vars ├── en-ru ├── postprocess.sh ├── preprocess.sh ├── tf-translate-ensemble.sh ├── tf-translate-single.sh ├── translate-ensemble.sh ├── translate-single.sh └── vars ├── en-tr ├── postprocess.sh ├── preprocess.sh ├── tf-translate-ensemble.sh ├── tf-translate-reranked.sh ├── tf-translate-single.sh ├── translate-ensemble.sh ├── translate-reranked.sh └── translate-single.sh ├── en-zh ├── STATUS ├── deseg.py ├── postprocess.sh ├── preprocess.sh ├── rerank_normalize.py ├── tf-translate-ensemble.sh ├── tf-translate-reranked.sh ├── tf-translate-single.sh ├── translate-ensemble.sh ├── translate-reranked.sh ├── translate-single.sh └── vars ├── lv-en ├── postprocess.sh ├── preprocess.sh ├── tf-translate-ensemble.sh ├── tf-translate-reranked.sh ├── tf-translate-single.sh ├── translate-ensemble.sh ├── translate-reranked.sh ├── translate-single.sh └── vars ├── ru-en ├── postprocess.sh ├── preprocess.sh ├── tf-translate-ensemble.sh ├── tf-translate-reranked.sh ├── tf-translate-single.sh ├── 
translate-ensemble.sh ├── translate-reranked.sh ├── translate-single.sh └── vars ├── scripts ├── rerank.py ├── rerank_normalize.py ├── reverse.py └── reverse_nbest.py ├── tr-en ├── postprocess.sh ├── preprocess.sh ├── tf-translate-ensemble.sh ├── tf-translate-reranked.sh ├── tf-translate-single.sh ├── translate-ensemble.sh ├── translate-reranked.sh └── translate-single.sh ├── training ├── .htaccess ├── README.md ├── data │ └── .gitignore ├── downloads │ └── .gitignore ├── model │ └── .gitignore ├── scripts.tensorflow │ ├── download_files.sh │ ├── evaluate.sh │ ├── postprocess.sh │ ├── preprocess.sh │ ├── train.sh │ └── validate.sh ├── scripts │ ├── download_files.sh │ ├── evaluate.sh │ ├── postprocess.sh │ ├── preprocess.sh │ ├── train.sh │ └── validate.sh └── vars ├── vars └── zh-en ├── STATUS ├── postprocess.sh ├── preprocess.sh ├── rerank_normalize.py ├── test.sh ├── tf-translate-ensemble.sh ├── tf-translate-reranked.sh ├── tf-translate-single.sh ├── translate-ensemble.sh ├── translate-reranked.sh ├── translate-single.sh └── vars /.gitignore: -------------------------------------------------------------------------------- 1 | *.npz 2 | *.json 3 | vocab* 4 | truecase-model* 5 | bpe.model* 6 | .NOTES 7 | *~ 8 | *.index 9 | *.meta 10 | *.data-00000-of-00001 11 | -------------------------------------------------------------------------------- /.htaccess: -------------------------------------------------------------------------------- 1 | AddHandler markdown .md 2 | AddType text/html .md 3 | ReadmeName README.md 4 | HeaderName HEADER.md 5 | IndexIgnore *.md README 6 | # DirectoryIndex README.md 7 | # MarkdownCss markdown.css 8 | -------------------------------------------------------------------------------- /HEADER.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | # THE UNIVERSITY OF EDINBURGH'S WMT17 SYSTEMS 2 | ------------------------------------------- 3 | 4 | This directory contains some of the University of Edinburgh's 5 | submissions to the WMT17 shared translation task, and a 'training' 6 | directory with scripts to preprocess and train your own model. 7 | 8 | If you are accessing this through a git repository, it will contain all scripts and documentation, 9 | but no model files - the models are accessible at http://data.statmt.org/wmt17_systems 10 | 11 | Use the git repository to keep track of changes to this directory: https://github.com/EdinburghNLP/wmt17-scripts 12 | 13 | REQUIREMENTS 14 | ------------ 15 | 16 | The models use the following software: 17 | 18 | - moses decoder (scripts only; no compilation required) https://github.com/moses-smt/mosesdecoder 19 | - nematus: https://github.com/EdinburghNLP/nematus 20 | - subword-nmt https://github.com/rsennrich/subword-nmt 21 | 22 | Please set the appropriate paths in the 'vars' file. 
23 | 24 | 25 | DOWNLOAD INSTRUCTIONS 26 | --------------------- 27 | 28 | You can download all files in this directory with this command: 29 | 30 | ``` 31 | wget -r -e robots=off -nH -np -R index.html* http://data.statmt.org/wmt17_systems/ 32 | ``` 33 | 34 | To download just one language pair (such as en-de), execute: 35 | 36 | ``` 37 | wget -r -e robots=off -nH -np -R index.html* http://data.statmt.org/wmt17_systems/en-de/ 38 | ``` 39 | 40 | To download just a single model (approx. 2GB) and the corresponding translation scripts, ignoring ensembles, execute: 41 | 42 | ``` 43 | wget -r -e robots=off -nH -np -R *ens2* -R *ens3* -R *ens4* -R *r2l* -R tf-translate-single.sh -R tf-translate-ensemble.sh -R tf-translate-reranked.sh -R index.html* http://data.statmt.org/wmt17_systems/en-de/ 44 | ``` 45 | 46 | If you only download selected language pairs or models, you should also download these shared files: 47 | 48 | ``` 49 | wget -r -e robots=off -nH -np -R index.html* http://data.statmt.org/wmt17_systems/scripts/ http://data.statmt.org/wmt17_systems/vars 50 | ``` 51 | 52 | 53 | USAGE INSTRUCTIONS: PRE-TRAINED MODELS 54 | -------------------------------------- 55 | 56 | First, ensure that all requirements are present, and that the path names in the 'vars' file are up-to-date. 57 | If you want to decode on a GPU, you can also update the 'device' variable in that file. 58 | 59 | Each subdirectory comes with several tf-translate-*.sh scripts. 60 | 61 | For translation with a single model, execute: 62 | 63 | ``` 64 | ./tf-translate-single.sh < your_input_file > your_output_file 65 | ``` 66 | 67 | The input should be UTF-8 plain text in the source language, one sentence per line. 
68 | 69 | We also provide ensembles of left-to-right models: 70 | 71 | ``` 72 | ./tf-translate-ensemble.sh < your_input_file > your_output_file 73 | ``` 74 | 75 | For some language pairs, we built models that use right-to-left models for reranking: 76 | 77 | ``` 78 | ./tf-translate-reranked.sh < your_input_file > your_output_file 79 | ``` 80 | 81 | We used systems that include ensembles and right-to-left reranking for 82 | our official submissions; results may vary slightly from the official 83 | submissions due to post-submission improvements - see the shared task 84 | description for more details. 85 | 86 | USAGE INSTRUCTIONS: TRAINING SCRIPTS 87 | ------------------------------------ 88 | 89 | For training your own models, follow the instructions in `training/README.md`. 90 | 91 | 92 | LEGACY MODELS: THEANO 93 | --------------------- 94 | 95 | All models for WMT17 were trained with a legacy version of Nematus, based on Theano. 96 | They have been converted to run with the current TensorFlow codebase of Nematus. 97 | 98 | To run the original Theano files, install the Theano version of Nematus 99 | and set the corresponding $nematus_home path in the 'vars' file: 100 | 101 | https://github.com/EdinburghNLP/nematus/tree/theano 102 | 103 | The translate scripts ('translate-*') without the 'tf-' prefix can be used to 104 | translate with the Theano models and codebase. 105 | 106 | 107 | LICENSE 108 | ------- 109 | 110 | All scripts in this directory are distributed under the MIT license. 111 | 112 | The use of the models provided in this directory is permitted under 113 | the Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported 114 | license (CC BY-NC-SA 3.0): 115 | https://creativecommons.org/licenses/by-nc-sa/3.0/ 116 | 117 | Attribution - You must give appropriate credit [please use the 118 | citation below], provide a link to the license, and indicate if 119 | changes were made. 
You may do so in any reasonable manner, but not in 120 | any way that suggests the licensor endorses you or your use. 121 | 122 | NonCommercial - You may not use the material for commercial purposes. 123 | 124 | ShareAlike - If you remix, transform, or build upon the material, you 125 | must distribute your contributions under the same license as the 126 | original. 127 | 128 | 129 | REFERENCE 130 | --------- 131 | 132 | The models are described in the following publication: 133 | 134 | Rico Sennrich, Alexandra Birch, Anna Currey, Ulrich Germann, Barry Haddow, Kenneth Heafield, Antonio Valerio Miceli Barone, and Philip Williams (2017). 135 | "The University of Edinburgh’s Neural MT Systems for WMT17". 136 | In: _Proceedings of the Second Conference on Machine Translation, Volume 2: Shared Task Papers_. 137 | Copenhagen, Denmark. 138 | 139 |
140 | @inproceedings{uedin-nmt:2017, 141 | address = "Copenhagen, Denmark", 142 | author = "Sennrich, Rico and Birch, Alexandra and Currey, Anna and 143 | Germann, Ulrich and Haddow, Barry and Heafield, Kenneth and 144 | {Miceli Barone}, Antonio Valerio and Williams, Philip", 145 | booktitle = "{Proceedings of the Second Conference on Machine Translation, 146 | Volume 2: Shared Task Papers}", 147 | title = "{The University of Edinburgh's Neural MT Systems for WMT17}", 148 | year = "2017" 149 | } 150 |151 | -------------------------------------------------------------------------------- /cs-en/STATUS: -------------------------------------------------------------------------------- 1 | Copied models, test single and ensemble. Both give expected outputs. 2 | Tested reranked. Also gives expected output. 3 | 4 | Note that the models here were used in the submissions, but those in the 5 | paper had been trained slightly longer. 6 | -------------------------------------------------------------------------------- /cs-en/postprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . $model_dir/vars 10 | 11 | sed 's/\@\@ //g' | \ 12 | $moses_scripts/recaser/detruecase.perl | \ 13 | $moses_scripts/tokenizer/detokenizer.perl -l $trg 14 | -------------------------------------------------------------------------------- /cs-en/preprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . 
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Author: Rico Sennrich
# Distributed under MIT license

"""Select the best hypothesis per sentence from a scored n-best list.

Usage: rerank_normalize.py K ALPHA < nbest > best

Each stdin line has the form ``NUM ||| SENTENCE ||| SCORES`` where SCORES is a
whitespace-separated list of numbers. For every sentence number, the first K
non-empty hypotheses are compared by ``sum(SCORES) / len(tokens)**ALPHA`` and
the one with the lowest value is written to stdout, one line per sentence.

The script runs unchanged under Python 2 and Python 3: output goes through
``sys.stdout.write`` instead of the Python-2-only ``print`` statement.
"""

import sys
from collections import defaultdict


def select_best(lines, k, alpha):
    """Yield the best hypothesis for each sentence of an n-best list.

    Args:
        lines: iterable of ``NUM ||| SENTENCE ||| SCORES`` strings, grouped by
            NUM in increasing order.
        k: only the first ``k`` non-empty hypotheses of a sentence are
            considered (empty hypotheses are skipped and do not count).
        alpha: exponent of the length normalisation; the summed scores are
            divided by ``token_count ** alpha``.

    Yields:
        The stripped text of the lowest-scoring hypothesis per sentence. An
        empty string is yielded for a sentence without usable hypotheses, and
        one final item is always yielded at end of input (so empty input
        yields a single empty string, matching the original script's output).

    Raises:
        ValueError: if a line does not consist of exactly three
            `` ||| ``-separated fields, or NUM / SCORES are not numeric.
    """
    cur = 0
    best_score = float('inf')
    best_sent = ''
    idx = 0
    for line in lines:
        num, sent, scores = line.split(' ||| ')
        num = int(num)

        # new input sentence: emit best translation of previous sentence, and reset stats
        if num > cur:
            yield best_sent
            cur = num
            best_score = float('inf')
            best_sent = ''
            idx = 0

        # only consider k-best hypotheses
        if idx >= k:
            continue

        tokens = sent.split()
        # an empty hypothesis cannot be length-normalised (division by zero)
        if not tokens:
            continue

        # lower is better: the comparison below keeps the minimum
        score = sum(map(float, scores.split())) / len(tokens) ** alpha
        if score < best_score:
            best_score = score
            best_sent = sent.strip()

        idx += 1

    # end of input; emit best translation of last sentence
    yield best_sent


if __name__ == '__main__':

    if len(sys.argv) != 3:
        sys.stderr.write("usage: %s K ALPHA\n" % sys.argv[0])
        sys.exit(1)
    k = float(sys.argv[1])
    alpha = float(sys.argv[2])

    for best in select_best(sys.stdin, k, alpha):
        sys.stdout.write(best + '\n')
#language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . $model_dir/vars 10 | 11 | $model_dir/preprocess.sh | \ 12 | CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/translate.py \ 13 | -m $model_dir/model.l2r.ens{1,2,3,4}.npz \ 14 | -k 12 -n | \ 15 | $model_dir/postprocess.sh 16 | -------------------------------------------------------------------------------- /cs-en/tf-translate-reranked.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Running on $HOSTNAME" 4 | 5 | model_dir=`dirname $0` 6 | 7 | #language-independent variables (toolkit locations) 8 | . $model_dir/../vars 9 | 10 | #language-dependent variables (source and target language) 11 | . $model_dir/vars 12 | 13 | # temporary files 14 | tmpfile_src=`mktemp -p $model_dir` 15 | tmpfile_nbest=`mktemp -p $model_dir` 16 | 17 | $model_dir/preprocess.sh > $tmpfile_src 18 | 19 | #left-to-right n-best list 20 | CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/translate.py < $tmpfile_src \ 21 | -m $model_dir/model.l2r.ens{1,2,3,4}.npz \ 22 | -k 50 --n-best > $tmpfile_nbest 23 | 24 | #rescoring 25 | $model_dir/../scripts/reverse_nbest.py < $tmpfile_nbest | \ 26 | CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/rescore.py \ 27 | -m $model_dir/model.r2l.ens{1,2,3,4}.npz \ 28 | -b 50 -s $tmpfile_src | \ 29 | $model_dir/rerank_normalize.py 12 1 | \ 30 | $model_dir/../scripts/reverse.py | \ 31 | $model_dir/postprocess.sh 32 | 33 | rm $tmpfile_src 34 | rm $tmpfile_nbest 35 | -------------------------------------------------------------------------------- /cs-en/tf-translate-single.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . 
$model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . $model_dir/vars 10 | 11 | $model_dir/preprocess.sh | \ 12 | CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/translate.py \ 13 | -m $model_dir/model.l2r.ens1.npz \ 14 | -k 12 -n | \ 15 | $model_dir/postprocess.sh 16 | -------------------------------------------------------------------------------- /cs-en/translate-ensemble.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . $model_dir/vars 10 | 11 | $model_dir/preprocess.sh | \ 12 | THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/translate.py \ 13 | -m $model_dir/model.l2r.ens{1,2,3,4}.npz \ 14 | -k 12 -n -p 1 | \ 15 | $model_dir/postprocess.sh 16 | -------------------------------------------------------------------------------- /cs-en/translate-reranked.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Running on $HOSTNAME" 4 | 5 | model_dir=`dirname $0` 6 | 7 | #language-independent variables (toolkit locations) 8 | . $model_dir/../vars 9 | 10 | #language-dependent variables (source and target language) 11 | . 
$model_dir/vars 12 | 13 | # temporary files 14 | tmpfile_src=`mktemp -p $model_dir` 15 | tmpfile_nbest=`mktemp -p $model_dir` 16 | 17 | $model_dir/preprocess.sh > $tmpfile_src 18 | 19 | #left-to-right n-best list 20 | THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/translate.py < $tmpfile_src \ 21 | -m $model_dir/model.l2r.ens{1,2,3,4}.npz \ 22 | -k 50 -p 2 --n-best > $tmpfile_nbest 23 | 24 | #rescoring 25 | $model_dir/../scripts/reverse_nbest.py < $tmpfile_nbest | \ 26 | THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/rescore.py \ 27 | -m $model_dir/model.r2l.ens{1,2,3,4}.npz \ 28 | -b 50 -s $tmpfile_src | \ 29 | $model_dir/rerank_normalize.py 12 1 | \ 30 | $model_dir/../scripts/reverse.py | \ 31 | $model_dir/postprocess.sh 32 | 33 | rm $tmpfile_src 34 | rm $tmpfile_nbest 35 | -------------------------------------------------------------------------------- /cs-en/translate-single.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . 
$model_dir/vars 10 | 11 | $model_dir/preprocess.sh | \ 12 | THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/translate.py \ 13 | -m $model_dir/model.l2r.ens1.npz \ 14 | -k 12 -n -p 1 | \ 15 | $model_dir/postprocess.sh 16 | -------------------------------------------------------------------------------- /cs-en/vars: -------------------------------------------------------------------------------- 1 | src=cs 2 | trg=en 3 | #nematus_home=/home/bhaddow/code/nematus/github 4 | #device=gpu0 5 | -------------------------------------------------------------------------------- /de-en/postprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . $model_dir/vars 10 | 11 | sed 's/\@\@ //g' | \ 12 | $moses_scripts/recaser/detruecase.perl | \ 13 | $moses_scripts/tokenizer/detokenizer.perl -l $trg 14 | -------------------------------------------------------------------------------- /de-en/preprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . 
$model_dir/vars 10 | 11 | $moses_scripts/tokenizer/normalize-punctuation.perl -l $src | \ 12 | $moses_scripts/tokenizer/tokenizer.perl -a -l $src | \ 13 | $moses_scripts/recaser/truecase.perl -model $model_dir/truecase-model.$src | \ 14 | $bpe_scripts/apply_bpe.py --vocabulary $model_dir/vocab.$src --vocabulary-threshold 50 -c $model_dir/bpe.model.$src 15 | -------------------------------------------------------------------------------- /de-en/tf-translate-ensemble.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . $model_dir/vars 10 | 11 | $model_dir/preprocess.sh | \ 12 | CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/translate.py \ 13 | -m $model_dir/model.l2r.ens{1,2,3,4}.npz \ 14 | -k 12 -n | \ 15 | $model_dir/postprocess.sh 16 | -------------------------------------------------------------------------------- /de-en/tf-translate-reranked.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . 
$model_dir/vars 10 | 11 | # temporary files 12 | tmpfile_src=`mktemp` 13 | tmpfile_nbest=`mktemp` 14 | 15 | $model_dir/preprocess.sh > $tmpfile_src 16 | 17 | #left-to-right n-best list 18 | CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/translate.py < $tmpfile_src \ 19 | -m $model_dir/model.l2r.ens{1,2,3,4}.npz \ 20 | -k 50 --n-best > $tmpfile_nbest 21 | 22 | #rescoring 23 | $model_dir/../scripts/reverse_nbest.py < $tmpfile_nbest | \ 24 | CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/rescore.py \ 25 | -m $model_dir/model.r2l.ens{1,2,3,4}.npz \ 26 | -b 80 -s $tmpfile_src | \ 27 | $model_dir/../scripts/rerank_normalize.py 50 1 | \ 28 | $model_dir/../scripts/reverse.py | \ 29 | $model_dir/postprocess.sh 30 | 31 | rm $tmpfile_src 32 | rm $tmpfile_nbest 33 | -------------------------------------------------------------------------------- /de-en/tf-translate-single.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . $model_dir/vars 10 | 11 | $model_dir/preprocess.sh | \ 12 | CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/translate.py \ 13 | -m $model_dir/model.l2r.ens1.npz \ 14 | -k 12 -n | \ 15 | $model_dir/postprocess.sh 16 | -------------------------------------------------------------------------------- /de-en/translate-ensemble.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . 
$model_dir/vars 10 | 11 | $model_dir/preprocess.sh | \ 12 | THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/translate.py \ 13 | -m $model_dir/model.l2r.ens{1,2,3,4}.npz \ 14 | -k 12 -n -p 1 --suppress-unk | \ 15 | $model_dir/postprocess.sh 16 | -------------------------------------------------------------------------------- /de-en/translate-reranked.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . $model_dir/vars 10 | 11 | # temporary files 12 | tmpfile_src=`mktemp` 13 | tmpfile_nbest=`mktemp` 14 | 15 | $model_dir/preprocess.sh > $tmpfile_src 16 | 17 | #left-to-right n-best list 18 | THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/translate.py < $tmpfile_src \ 19 | -m $model_dir/model.l2r.ens{1,2,3,4}.npz \ 20 | -k 50 -p 1 --n-best --suppress-unk > $tmpfile_nbest 21 | 22 | #rescoring 23 | $model_dir/../scripts/reverse_nbest.py < $tmpfile_nbest | \ 24 | THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/rescore.py \ 25 | -m $model_dir/model.r2l.ens{1,2,3,4}.npz \ 26 | -b 80 -s $tmpfile_src | \ 27 | $model_dir/../scripts/rerank_normalize.py 50 1 | \ 28 | $model_dir/../scripts/reverse.py | \ 29 | $model_dir/postprocess.sh 30 | 31 | rm $tmpfile_src 32 | rm $tmpfile_nbest 33 | -------------------------------------------------------------------------------- /de-en/translate-single.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . 
$model_dir/vars 10 | 11 | $model_dir/preprocess.sh | \ 12 | THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/translate.py \ 13 | -m $model_dir/model.l2r.ens1.npz \ 14 | -k 12 -n -p 1 --suppress-unk | \ 15 | $model_dir/postprocess.sh 16 | -------------------------------------------------------------------------------- /de-en/vars: -------------------------------------------------------------------------------- 1 | src=de 2 | trg=en 3 | -------------------------------------------------------------------------------- /en-cs/STATUS: -------------------------------------------------------------------------------- 1 | (Note that published newstest2016 results appear to have used wrong bpe, so testing on newstest2017) 2 | Single shows slight differences (4/3000) to previous run. 3 | Ensemble is exactly the same. 4 | Testing rerank. 5 | -------------------------------------------------------------------------------- /en-cs/postprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . $model_dir/vars 10 | 11 | sed 's/\@\@ //g' | \ 12 | $moses_scripts/recaser/detruecase.perl | \ 13 | $moses_scripts/tokenizer/detokenizer.perl -l $trg 14 | -------------------------------------------------------------------------------- /en-cs/preprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . 
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Author: Rico Sennrich
# Distributed under MIT license

"""Select the best hypothesis per sentence from a scored n-best list.

Usage: rerank_normalize.py K ALPHA < nbest > best

Each stdin line has the form ``NUM ||| SENTENCE ||| SCORES`` where SCORES is a
whitespace-separated list of numbers. For every sentence number, the first K
non-empty hypotheses are compared by ``sum(SCORES) / len(tokens)**ALPHA`` and
the one with the lowest value is written to stdout, one line per sentence.

The script runs unchanged under Python 2 and Python 3: output goes through
``sys.stdout.write`` instead of the Python-2-only ``print`` statement.
"""

import sys
from collections import defaultdict


def select_best(lines, k, alpha):
    """Yield the best hypothesis for each sentence of an n-best list.

    Args:
        lines: iterable of ``NUM ||| SENTENCE ||| SCORES`` strings, grouped by
            NUM in increasing order.
        k: only the first ``k`` non-empty hypotheses of a sentence are
            considered (empty hypotheses are skipped and do not count).
        alpha: exponent of the length normalisation; the summed scores are
            divided by ``token_count ** alpha``.

    Yields:
        The stripped text of the lowest-scoring hypothesis per sentence. An
        empty string is yielded for a sentence without usable hypotheses, and
        one final item is always yielded at end of input (so empty input
        yields a single empty string, matching the original script's output).

    Raises:
        ValueError: if a line does not consist of exactly three
            `` ||| ``-separated fields, or NUM / SCORES are not numeric.
    """
    cur = 0
    best_score = float('inf')
    best_sent = ''
    idx = 0
    for line in lines:
        num, sent, scores = line.split(' ||| ')
        num = int(num)

        # new input sentence: emit best translation of previous sentence, and reset stats
        if num > cur:
            yield best_sent
            cur = num
            best_score = float('inf')
            best_sent = ''
            idx = 0

        # only consider k-best hypotheses
        if idx >= k:
            continue

        tokens = sent.split()
        # an empty hypothesis cannot be length-normalised (division by zero)
        if not tokens:
            continue

        # lower is better: the comparison below keeps the minimum
        score = sum(map(float, scores.split())) / len(tokens) ** alpha
        if score < best_score:
            best_score = score
            best_sent = sent.strip()

        idx += 1

    # end of input; emit best translation of last sentence
    yield best_sent


if __name__ == '__main__':

    if len(sys.argv) != 3:
        sys.stderr.write("usage: %s K ALPHA\n" % sys.argv[0])
        sys.exit(1)
    k = float(sys.argv[1])
    alpha = float(sys.argv[2])

    for best in select_best(sys.stdin, k, alpha):
        sys.stdout.write(best + '\n')
#language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . $model_dir/vars 10 | 11 | $model_dir/preprocess.sh | \ 12 | THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/translate.py \ 13 | -m /home/bhaddow/experiments/wmt17/cs-en/translate/model.lr.0.fixed.npz \ 14 | -k 12 -n -p 1 | \ 15 | $model_dir/postprocess.sh 16 | -------------------------------------------------------------------------------- /en-cs/tf-translate-ensemble.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . $model_dir/vars 10 | 11 | $model_dir/preprocess.sh | \ 12 | CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/translate.py \ 13 | -m $model_dir/model.l2r.ens{1,2,3,4}.npz \ 14 | -k 12 -n | \ 15 | $model_dir/postprocess.sh 16 | -------------------------------------------------------------------------------- /en-cs/tf-translate-reranked.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . 
$model_dir/vars 10 | 11 | # temporary files 12 | tmpfile_src=`mktemp` 13 | tmpfile_nbest=`mktemp` 14 | 15 | $model_dir/preprocess.sh > $tmpfile_src 16 | 17 | #left-to-right n-best list 18 | CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/translate.py < $tmpfile_src \ 19 | -m $model_dir/model.l2r.ens{1,2,3,4}.npz \ 20 | -k 50 --n-best > $tmpfile_nbest 21 | 22 | #rescoring 23 | $model_dir/../scripts/reverse_nbest.py < $tmpfile_nbest | \ 24 | CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/rescore.py \ 25 | -m $model_dir/model.r2l.ens{1,2,3,4}.npz \ 26 | -b 80 -s $tmpfile_src | \ 27 | $model_dir/../scripts/rerank_normalize.py 12 1 | \ 28 | $model_dir/../scripts/reverse.py | \ 29 | $model_dir/postprocess.sh 30 | 31 | rm $tmpfile_src 32 | rm $tmpfile_nbest 33 | -------------------------------------------------------------------------------- /en-cs/tf-translate-single.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . $model_dir/vars 10 | 11 | $model_dir/preprocess.sh | \ 12 | CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/translate.py \ 13 | -m $model_dir/model.l2r.ens1.npz \ 14 | -k 12 -n | \ 15 | $model_dir/postprocess.sh 16 | -------------------------------------------------------------------------------- /en-cs/translate-ensemble.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . 
$model_dir/vars 10 | 11 | $model_dir/preprocess.sh | \ 12 | THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/translate.py \ 13 | -m $model_dir/model.l2r.ens{1,2,3,4}.npz --suppress-unk \ 14 | -k 12 -n -p 1 | \ 15 | $model_dir/postprocess.sh 16 | -------------------------------------------------------------------------------- /en-cs/translate-reranked.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . $model_dir/vars 10 | 11 | # temporary files 12 | tmpfile_src=`mktemp` 13 | tmpfile_nbest=`mktemp` 14 | #echo $tmpfile_src $tmpfile_nbest 15 | 16 | $model_dir/preprocess.sh > $tmpfile_src 17 | 18 | #left-to-right n-best list 19 | THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/translate.py < $tmpfile_src \ 20 | -m $model_dir/model.l2r.ens{1,2,3,4}.npz \ 21 | -k 50 -p 2 --n-best > $tmpfile_nbest 22 | 23 | #rescoring 24 | $model_dir/../scripts/reverse_nbest.py < $tmpfile_nbest | \ 25 | THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/rescore.py \ 26 | -m $model_dir/model.r2l.ens{1,2,3,4}.npz \ 27 | -b 40 -s $tmpfile_src | \ 28 | $model_dir/rerank_normalize.py 12 1 | \ 29 | $model_dir/../scripts/reverse.py | \ 30 | $model_dir/postprocess.sh 31 | 32 | rm $tmpfile_src 33 | rm $tmpfile_nbest 34 | -------------------------------------------------------------------------------- /en-cs/translate-single.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . 
$model_dir/vars 10 | 11 | $model_dir/preprocess.sh | \ 12 | THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/translate.py \ 13 | -m $model_dir/model.l2r.ens1.npz --suppress-unk \ 14 | -k 12 -n -p 1 | \ 15 | $model_dir/postprocess.sh 16 | -------------------------------------------------------------------------------- /en-cs/vars: -------------------------------------------------------------------------------- 1 | src=en 2 | trg=cs 3 | #nematus_home=/home/bhaddow/code/nematus/github 4 | nematus_home=/home/bhaddow/code/nematus/deep-d9a13ef 5 | device=cuda1 6 | -------------------------------------------------------------------------------- /en-de/postprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . $model_dir/vars 10 | 11 | sed 's/\@\@ //g' | \ 12 | $moses_scripts/recaser/detruecase.perl | \ 13 | $moses_scripts/tokenizer/detokenizer.perl -l $trg 14 | -------------------------------------------------------------------------------- /en-de/preprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . 
$model_dir/vars 10 | 11 | $moses_scripts/tokenizer/normalize-punctuation.perl -l $src | \ 12 | $moses_scripts/tokenizer/tokenizer.perl -a -l $src | \ 13 | $moses_scripts/recaser/truecase.perl -model $model_dir/truecase-model.$src | \ 14 | $bpe_scripts/apply_bpe.py --vocabulary $model_dir/vocab.$src --vocabulary-threshold 50 -c $model_dir/bpe.model.$src 15 | -------------------------------------------------------------------------------- /en-de/tf-translate-ensemble.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . $model_dir/vars 10 | 11 | $model_dir/preprocess.sh | \ 12 | CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/translate.py \ 13 | -m $model_dir/model.l2r.ens{1,2,3,4}.npz \ 14 | -k 12 -n | \ 15 | $model_dir/postprocess.sh 16 | -------------------------------------------------------------------------------- /en-de/tf-translate-reranked.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . 
$model_dir/vars 10 | 11 | # temporary files 12 | tmpfile_src=`mktemp` 13 | tmpfile_nbest=`mktemp` 14 | 15 | $model_dir/preprocess.sh > $tmpfile_src 16 | 17 | #left-to-right n-best list 18 | CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/translate.py < $tmpfile_src \ 19 | -m $model_dir/model.l2r.ens{1,2,3,4}.npz \ 20 | -k 50 --n-best > $tmpfile_nbest 21 | 22 | #rescoring 23 | $model_dir/../scripts/reverse_nbest.py < $tmpfile_nbest | \ 24 | CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/rescore.py \ 25 | -m $model_dir/model.r2l.ens{1,2,3,4}.npz \ 26 | -b 80 -s $tmpfile_src | \ 27 | $model_dir/../scripts/rerank_normalize.py 50 1 | \ 28 | $model_dir/../scripts/reverse.py | \ 29 | $model_dir/postprocess.sh 30 | 31 | rm $tmpfile_src 32 | rm $tmpfile_nbest 33 | -------------------------------------------------------------------------------- /en-de/tf-translate-single.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . $model_dir/vars 10 | 11 | $model_dir/preprocess.sh | \ 12 | CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/translate.py \ 13 | -m $model_dir/model.l2r.ens1.npz \ 14 | -k 12 -n | \ 15 | $model_dir/postprocess.sh 16 | -------------------------------------------------------------------------------- /en-de/translate-ensemble.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . 
$model_dir/vars 10 | 11 | $model_dir/preprocess.sh | \ 12 | THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/translate.py \ 13 | -m $model_dir/model.l2r.ens{1,2,3,4}.npz \ 14 | -k 12 -n -p 1 --suppress-unk | \ 15 | $model_dir/postprocess.sh 16 | -------------------------------------------------------------------------------- /en-de/translate-reranked.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . $model_dir/vars 10 | 11 | # temporary files 12 | tmpfile_src=`mktemp` 13 | tmpfile_nbest=`mktemp` 14 | 15 | $model_dir/preprocess.sh > $tmpfile_src 16 | 17 | #left-to-right n-best list 18 | THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/translate.py < $tmpfile_src \ 19 | -m $model_dir/model.l2r.ens{1,2,3,4}.npz \ 20 | -k 50 -p 1 --n-best --suppress-unk > $tmpfile_nbest 21 | 22 | #rescoring 23 | $model_dir/../scripts/reverse_nbest.py < $tmpfile_nbest | \ 24 | THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/rescore.py \ 25 | -m $model_dir/model.r2l.ens{1,2,3,4}.npz \ 26 | -b 40 -s $tmpfile_src | \ 27 | $model_dir/../scripts/rerank_normalize.py 50 1 | \ 28 | $model_dir/../scripts/reverse.py | \ 29 | $model_dir/postprocess.sh 30 | 31 | rm $tmpfile_src 32 | rm $tmpfile_nbest 33 | -------------------------------------------------------------------------------- /en-de/translate-single.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . 
$model_dir/vars 10 | 11 | $model_dir/preprocess.sh | \ 12 | THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/translate.py \ 13 | -m $model_dir/model.l2r.ens1.npz \ 14 | -k 12 -n -p 1 --suppress-unk | \ 15 | $model_dir/postprocess.sh 16 | -------------------------------------------------------------------------------- /en-de/vars: -------------------------------------------------------------------------------- 1 | src=en 2 | trg=de 3 | -------------------------------------------------------------------------------- /en-ru/postprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . $model_dir/vars 10 | 11 | sed 's/\@\@ //g' | \ 12 | $moses_scripts/recaser/detruecase.perl | \ 13 | $moses_scripts/tokenizer/detokenizer.perl -l $trg 14 | -------------------------------------------------------------------------------- /en-ru/preprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . 
$model_dir/vars 10 | 11 | $moses_scripts/tokenizer/normalize-punctuation.perl -l $src | \ 12 | $moses_scripts/tokenizer/tokenizer.perl -a -l $src | \ 13 | $moses_scripts/recaser/truecase.perl -model $model_dir/truecase-model.$src | \ 14 | $bpe_scripts/apply_bpe.py --vocabulary $model_dir/vocab.$src --vocabulary-threshold 50 -c $model_dir/bpe.model.$src 15 | -------------------------------------------------------------------------------- /en-ru/tf-translate-ensemble.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . $model_dir/vars 10 | 11 | $model_dir/preprocess.sh | \ 12 | CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/translate.py \ 13 | -m $model_dir/model.l2r.ens{1,2,3,4}.npz \ 14 | -k 12 -n | \ 15 | $model_dir/postprocess.sh 16 | -------------------------------------------------------------------------------- /en-ru/tf-translate-single.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . $model_dir/vars 10 | 11 | $model_dir/preprocess.sh | \ 12 | CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/translate.py \ 13 | -m $model_dir/model.l2r.ens1.npz \ 14 | -k 12 -n | \ 15 | $model_dir/postprocess.sh 16 | -------------------------------------------------------------------------------- /en-ru/translate-ensemble.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . 
$model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . $model_dir/vars 10 | 11 | $model_dir/preprocess.sh | \ 12 | THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/translate.py \ 13 | -m $model_dir/model.l2r.ens{1,2,3,4}.npz \ 14 | -k 12 -n -p 1 --suppress-unk | \ 15 | $model_dir/postprocess.sh 16 | -------------------------------------------------------------------------------- /en-ru/translate-single.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . $model_dir/vars 10 | 11 | $model_dir/preprocess.sh | \ 12 | THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/translate.py \ 13 | -m $model_dir/model.l2r.ens1.npz \ 14 | -k 12 -n -p 1 --suppress-unk | \ 15 | $model_dir/postprocess.sh 16 | -------------------------------------------------------------------------------- /en-ru/vars: -------------------------------------------------------------------------------- 1 | src=en 2 | trg=ru 3 | -------------------------------------------------------------------------------- /en-tr/postprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . 
$model_dir/../vars 7 | 8 | # src and trg language 9 | src=en 10 | trg=tr 11 | 12 | # remove BPE, detruecase, detokenize 13 | sed 's/\@\@ //g' | \ 14 | $moses_scripts/recaser/detruecase.perl | \ 15 | $moses_scripts/tokenizer/detokenizer.perl -l $trg 16 | -------------------------------------------------------------------------------- /en-tr/preprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | # src and trg language 9 | src=en 10 | trg=tr 11 | # normalize punctuation, tokenize, truecase, apply BPE; model files are read from $model_dir, not the current directory 12 | $moses_scripts/tokenizer/normalize-punctuation.perl -l $src | \ 13 | $moses_scripts/tokenizer/tokenizer.perl -a -l $src | \ 14 | $moses_scripts/recaser/truecase.perl -model $model_dir/truecase-model.$src | \ 15 | $bpe_scripts/apply_bpe.py -c $model_dir/$src$trg.bpe 16 | -------------------------------------------------------------------------------- /en-tr/tf-translate-ensemble.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . $model_dir/vars 10 | 11 | $model_dir/preprocess.sh | \ 12 | CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/translate.py \ 13 | -m $model_dir/model.l2r.ens{1,2,3,4}.npz \ 14 | -k 12 -n | \ 15 | $model_dir/postprocess.sh 16 | -------------------------------------------------------------------------------- /en-tr/tf-translate-reranked.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . 
$model_dir/vars 10 | 11 | # temporary files 12 | tmpfile_src=`mktemp` 13 | tmpfile_nbest=`mktemp` 14 | 15 | $model_dir/preprocess.sh > $tmpfile_src 16 | 17 | #left-to-right n-best list 18 | CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/translate.py < $tmpfile_src \ 19 | -m $model_dir/model.l2r.ens{1,2,3,4}.npz \ 20 | -k 50 --n-best > $tmpfile_nbest 21 | 22 | #rescoring 23 | $model_dir/../scripts/reverse_nbest.py < $tmpfile_nbest | \ 24 | CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/rescore.py \ 25 | -m $model_dir/model.r2l.ens{1,2,3,4}.npz \ 26 | -b 80 -s $tmpfile_src | \ 27 | $model_dir/../scripts/rerank_normalize.py 50 1 | \ 28 | $model_dir/../scripts/reverse.py | \ 29 | $model_dir/postprocess.sh 30 | 31 | rm $tmpfile_src 32 | rm $tmpfile_nbest 33 | -------------------------------------------------------------------------------- /en-tr/tf-translate-single.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . $model_dir/vars 10 | 11 | $model_dir/preprocess.sh | \ 12 | CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/translate.py \ 13 | -m $model_dir/model.l2r.ens1.npz \ 14 | -k 12 -n | \ 15 | $model_dir/postprocess.sh 16 | -------------------------------------------------------------------------------- /en-tr/translate-ensemble.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . 
$model_dir/../vars 7 | # helper scripts and model files are addressed via $model_dir rather than the current directory 8 | $model_dir/preprocess.sh | \ 9 | THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/translate.py \ 10 | -m $model_dir/model.l2r.ens{1,2,3,4}.npz \ 11 | -k 12 -n -p 1 --suppress-unk | \ 12 | $model_dir/postprocess.sh 13 | -------------------------------------------------------------------------------- /en-tr/translate-reranked.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | # temporary files 9 | tmpfile_src=`mktemp` 10 | tmpfile_nbest=`mktemp` 11 | tmpfile_reverse=`mktemp` 12 | # helper scripts and model files are addressed via $model_dir rather than the current directory 13 | $model_dir/preprocess.sh > $tmpfile_src 14 | 15 | #left-to-right n-best list 16 | THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/translate.py < $tmpfile_src \ 17 | -m $model_dir/model.l2r.ens{1,2,3,4}.npz \ 18 | -k 50 -p 2 --n-best --suppress-unk > $tmpfile_nbest 19 | 20 | #need to reverse the source file 21 | $model_dir/../scripts/reverse.py < $tmpfile_src > $tmpfile_reverse 22 | 23 | #rescoring 24 | $model_dir/../scripts/reverse_nbest.py < $tmpfile_nbest | \ 25 | THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/rescore.py \ 26 | -m $model_dir/model.r2l.ens{1,2,3,4}.npz \ 27 | -b 20 -s $tmpfile_reverse | \ 28 | $model_dir/../scripts/rerank_normalize.py 50 1 | \ 29 | $model_dir/../scripts/reverse.py | \ 30 | $model_dir/postprocess.sh 31 | 32 | rm $tmpfile_src 33 | rm $tmpfile_nbest 34 | rm $tmpfile_reverse 35 | -------------------------------------------------------------------------------- /en-tr/translate-single.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . 
$model_dir/../vars 7 | # helper scripts and model files are addressed via $model_dir rather than the current directory 8 | $model_dir/preprocess.sh | \ 9 | THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/translate.py \ 10 | -m $model_dir/model.l2r.ens1.npz \ 11 | -k 12 -n -p 1 --suppress-unk | \ 12 | $model_dir/postprocess.sh 13 | -------------------------------------------------------------------------------- /en-zh/STATUS: -------------------------------------------------------------------------------- 1 | Checking against runs in paper 2 | 3 | single: Checked, 3 diffs in 2000 sentences 4 | ensemble: Checked: 1 diff in 2000 sentences 5 | reranked: Checked: 3 diffs in 2000 sentences 6 | -------------------------------------------------------------------------------- /en-zh/deseg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import re 4 | import sys 5 | 6 | re_space = re.compile(r"(? cur: 26 | print best_sent 27 | #print best_score 28 | cur = int(num) 29 | best_score = float('inf') 30 | best_sent = '' 31 | idx = 0 32 | 33 | #only consider k-best hypotheses 34 | if idx >= k: 35 | continue 36 | 37 | if len(sent.split()) == 0: 38 | continue 39 | 40 | score = sum(map(float, scores.split())) / (len(sent.split()))**alpha 41 | if score < best_score: 42 | best_score = score 43 | best_sent = sent.strip() 44 | 45 | idx += 1 46 | 47 | # end of file; print best translation of last sentence 48 | print best_sent 49 | # print best_score 50 | 51 | -------------------------------------------------------------------------------- /en-zh/tf-translate-ensemble.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . 
$model_dir/vars 10 | 11 | $model_dir/preprocess.sh | \ 12 | CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/translate.py \ 13 | -m $model_dir/model.l2r.ens{1,2,3,4}.npz \ 14 | -k 12 -n | \ 15 | $model_dir/postprocess.sh 16 | -------------------------------------------------------------------------------- /en-zh/tf-translate-reranked.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . $model_dir/vars 10 | 11 | # temporary files 12 | tmpfile_src=`mktemp` 13 | tmpfile_nbest=`mktemp` 14 | 15 | $model_dir/preprocess.sh > $tmpfile_src 16 | 17 | #left-to-right n-best list 18 | CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/translate.py < $tmpfile_src \ 19 | -m $model_dir/model.l2r.ens{1,2,3,4}.npz \ 20 | -k 50 --n-best > $tmpfile_nbest 21 | 22 | #rescoring 23 | $model_dir/../scripts/reverse_nbest.py < $tmpfile_nbest | \ 24 | CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/rescore.py \ 25 | -m $model_dir/model.r2l.ens{1,2,3,4}.npz \ 26 | -b 80 -s $tmpfile_src | \ 27 | $model_dir/../scripts/rerank_normalize.py 50 1 | \ 28 | $model_dir/../scripts/reverse.py | \ 29 | $model_dir/postprocess.sh 30 | 31 | rm $tmpfile_src 32 | rm $tmpfile_nbest 33 | -------------------------------------------------------------------------------- /en-zh/tf-translate-single.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . 
$model_dir/vars 10 | 11 | $model_dir/preprocess.sh | \ 12 | CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/translate.py \ 13 | -m $model_dir/model.l2r.ens1.npz \ 14 | -k 12 -n | \ 15 | $model_dir/postprocess.sh 16 | -------------------------------------------------------------------------------- /en-zh/translate-ensemble.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . $model_dir/vars 10 | 11 | $model_dir/preprocess.sh | \ 12 | THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/translate.py \ 13 | -m $model_dir/model.l2r.ens{1,2,3,4}.npz --suppress-unk \ 14 | -k 12 -n -p 1 | \ 15 | $model_dir/postprocess.sh 16 | -------------------------------------------------------------------------------- /en-zh/translate-reranked.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . 
$model_dir/vars 10 | 11 | # temporary files 12 | tmpfile_src=`mktemp` 13 | tmpfile_nbest=`mktemp` 14 | 15 | $model_dir/preprocess.sh > $tmpfile_src 16 | 17 | #left-to-right n-best list 18 | THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/translate.py < $tmpfile_src \ 19 | -m $model_dir/model.l2r.ens{1,2,3,4}.npz \ 20 | -k 50 -p 2 --n-best > $tmpfile_nbest 21 | 22 | #rescoring 23 | $model_dir/../scripts/reverse_nbest.py < $tmpfile_nbest | \ 24 | GPUARRAY_FORCE_CUDA_DRIVER_LOAD=True THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device,gpuarray.preallocate=0.8 python $nematus_home/nematus/rescore.py \ 25 | -m $model_dir/model.r2l.ens{1,2,3,4}.npz \ 26 | -b 40 -s $tmpfile_src | \ 27 | $model_dir/rerank_normalize.py 50 1 | \ 28 | $model_dir/../scripts/reverse.py | \ 29 | $model_dir/postprocess.sh 30 | 31 | rm $tmpfile_src 32 | rm $tmpfile_nbest 33 | -------------------------------------------------------------------------------- /en-zh/translate-single.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . 
$model_dir/vars 10 | 11 | $model_dir/preprocess.sh | \ 12 | THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/translate.py \ 13 | -m $model_dir/model.l2r.ens1.npz --suppress-unk \ 14 | -k 12 -n -p 1 | \ 15 | $model_dir/postprocess.sh 16 | -------------------------------------------------------------------------------- /en-zh/vars: -------------------------------------------------------------------------------- 1 | src=en 2 | trg=zh 3 | nematus_home=/home/bhaddow/code/nematus/github 4 | #nematus_home=/home/bhaddow/code/nematus/deep-d9a13ef 5 | device=cuda0 6 | -------------------------------------------------------------------------------- /lv-en/postprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . $model_dir/vars 10 | 11 | sed 's/\@\@ //g' | \ 12 | $moses_scripts/recaser/detruecase.perl | \ 13 | $moses_scripts/tokenizer/detokenizer.perl -l $trg 14 | -------------------------------------------------------------------------------- /lv-en/preprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . 
$model_dir/vars 10 | 11 | $moses_scripts/tokenizer/normalize-punctuation.perl -l $src | \ 12 | $moses_scripts/tokenizer/tokenizer.perl -a -l $src | \ 13 | $moses_scripts/recaser/truecase.perl -model $model_dir/truecase-model.$src | \ 14 | $bpe_scripts/apply_bpe.py --vocabulary $model_dir/vocab.$src --vocabulary-threshold 50 -c $model_dir/bpe.model.$src 15 | -------------------------------------------------------------------------------- /lv-en/tf-translate-ensemble.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . $model_dir/vars 10 | 11 | $model_dir/preprocess.sh | \ 12 | CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/translate.py \ 13 | -m $model_dir/model.l2r.ens{1,2,3,4}.npz \ 14 | -k 12 -n | \ 15 | $model_dir/postprocess.sh 16 | -------------------------------------------------------------------------------- /lv-en/tf-translate-reranked.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . 
$model_dir/vars 10 | 11 | # temporary files 12 | tmpfile_src=`mktemp` 13 | tmpfile_nbest=`mktemp` 14 | 15 | $model_dir/preprocess.sh > $tmpfile_src 16 | 17 | #left-to-right n-best list 18 | CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/translate.py < $tmpfile_src \ 19 | -m $model_dir/model.l2r.ens{1,2,3,4}.npz \ 20 | -k 50 --n-best > $tmpfile_nbest 21 | 22 | #rescoring 23 | $model_dir/../scripts/reverse_nbest.py < $tmpfile_nbest | \ 24 | CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/rescore.py \ 25 | -m $model_dir/model.r2l.ens{1,2,3,4}.npz \ 26 | -b 80 -s $tmpfile_src | \ 27 | $model_dir/../scripts/rerank_normalize.py 50 1 | \ 28 | $model_dir/../scripts/reverse.py | \ 29 | $model_dir/postprocess.sh 30 | 31 | rm $tmpfile_src 32 | rm $tmpfile_nbest 33 | -------------------------------------------------------------------------------- /lv-en/tf-translate-single.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . $model_dir/vars 10 | 11 | $model_dir/preprocess.sh | \ 12 | CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/translate.py \ 13 | -m $model_dir/model.l2r.ens1.npz \ 14 | -k 12 -n | \ 15 | $model_dir/postprocess.sh 16 | -------------------------------------------------------------------------------- /lv-en/translate-ensemble.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . 
$model_dir/vars 10 | 11 | $model_dir/preprocess.sh | \ 12 | THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/translate.py \ 13 | -m $model_dir/model.l2r.ens{1,2,3,4}.npz \ 14 | -k 12 -n -p 1 --suppress-unk | \ 15 | $model_dir/postprocess.sh 16 | -------------------------------------------------------------------------------- /lv-en/translate-reranked.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . $model_dir/vars 10 | 11 | # temporary files 12 | tmpfile_src=`mktemp` 13 | tmpfile_nbest=`mktemp` 14 | 15 | $model_dir/preprocess.sh > $tmpfile_src 16 | 17 | #left-to-right n-best list 18 | THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/translate.py < $tmpfile_src \ 19 | -m $model_dir/model.l2r.ens{1,2,3,4}.npz \ 20 | -k 50 -p 1 --n-best --suppress-unk > $tmpfile_nbest 21 | 22 | #rescoring 23 | $model_dir/../scripts/reverse_nbest.py < $tmpfile_nbest | \ 24 | THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/rescore.py \ 25 | -m $model_dir/model.r2l.ens{1,2,3,4}.npz \ 26 | -b 80 -s $tmpfile_src | \ 27 | $model_dir/../scripts/rerank_normalize.py 50 1 | \ 28 | $model_dir/../scripts/reverse.py | \ 29 | $model_dir/postprocess.sh 30 | 31 | rm $tmpfile_src 32 | rm $tmpfile_nbest 33 | -------------------------------------------------------------------------------- /lv-en/translate-single.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . 
$model_dir/vars 10 | 11 | $model_dir/preprocess.sh | \ 12 | THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/translate.py \ 13 | -m $model_dir/model.l2r.ens1.npz \ 14 | -k 12 -n -p 1 --suppress-unk | \ 15 | $model_dir/postprocess.sh 16 | -------------------------------------------------------------------------------- /lv-en/vars: -------------------------------------------------------------------------------- 1 | src=lv 2 | trg=en 3 | -------------------------------------------------------------------------------- /ru-en/postprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . $model_dir/vars 10 | 11 | sed 's/\@\@ //g' | \ 12 | $moses_scripts/recaser/detruecase.perl | \ 13 | $moses_scripts/tokenizer/detokenizer.perl -l $trg 14 | -------------------------------------------------------------------------------- /ru-en/preprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . 
$model_dir/vars 10 | 11 | $moses_scripts/tokenizer/normalize-punctuation.perl -l $src | \ 12 | $moses_scripts/tokenizer/tokenizer.perl -a -l $src | \ 13 | $moses_scripts/recaser/truecase.perl -model $model_dir/truecase-model.$src | \ 14 | $bpe_scripts/apply_bpe.py --vocabulary $model_dir/vocab.$src --vocabulary-threshold 50 -c $model_dir/bpe.model 15 | -------------------------------------------------------------------------------- /ru-en/tf-translate-ensemble.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . $model_dir/vars 10 | 11 | $model_dir/preprocess.sh | \ 12 | CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/translate.py \ 13 | -m $model_dir/model.l2r.ens{1,2,3,4}.npz \ 14 | -k 12 -n | \ 15 | $model_dir/postprocess.sh 16 | -------------------------------------------------------------------------------- /ru-en/tf-translate-reranked.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . 
$model_dir/vars 10 | 11 | # temporary files 12 | tmpfile_src=`mktemp` 13 | tmpfile_nbest=`mktemp` 14 | 15 | $model_dir/preprocess.sh > $tmpfile_src 16 | 17 | #left-to-right n-best list 18 | CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/translate.py < $tmpfile_src \ 19 | -m $model_dir/model.l2r.ens{1,2,3,4}.npz \ 20 | -k 50 --n-best > $tmpfile_nbest 21 | 22 | #rescoring 23 | $model_dir/../scripts/reverse_nbest.py < $tmpfile_nbest | \ 24 | CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/rescore.py \ 25 | -m $model_dir/model.r2l.ens{1,2,3,4}.npz \ 26 | -b 80 -s $tmpfile_src | \ 27 | $model_dir/../scripts/rerank_normalize.py 12 1 | \ 28 | $model_dir/../scripts/reverse.py | \ 29 | $model_dir/postprocess.sh 30 | 31 | rm $tmpfile_src 32 | rm $tmpfile_nbest 33 | -------------------------------------------------------------------------------- /ru-en/tf-translate-single.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . $model_dir/vars 10 | 11 | $model_dir/preprocess.sh | \ 12 | CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/translate.py \ 13 | -m $model_dir/model.l2r.ens1.npz \ 14 | -k 12 -n | \ 15 | $model_dir/postprocess.sh 16 | -------------------------------------------------------------------------------- /ru-en/translate-ensemble.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . 
$model_dir/vars
10 |
11 | $model_dir/preprocess.sh | \
12 |     THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/translate.py \
13 |     -m $model_dir/model.l2r.ens{1,2,3,4}.npz \
14 |     -k 12 -n -p 1 | \
15 | $model_dir/postprocess.sh
16 |
--------------------------------------------------------------------------------
/ru-en/translate-reranked.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Translates stdin with the left-to-right ensemble (50-best list), rescores
4 | # the reversed hypotheses with the right-to-left ensemble, picks the best
5 | # hypothesis per sentence and writes the postprocessed result to stdout.
6 |
7 | # diagnostics go to stderr: stdout carries the translations
8 | echo "Running on $HOSTNAME" >&2
9 |
10 | model_dir=`dirname $0`
11 |
12 | #language-independent variables (toolkit locations)
13 | . $model_dir/../vars
14 |
15 | #language-dependent variables (source and target language)
16 | . $model_dir/vars
17 |
18 | # temporary files
19 | tmpfile_src=`mktemp -p $model_dir`
20 | tmpfile_nbest=`mktemp -p $model_dir`
21 | echo $tmpfile_src $tmpfile_nbest >&2
22 |
23 | $model_dir/preprocess.sh > $tmpfile_src
24 |
25 | #left-to-right n-best list
26 | THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/translate.py < $tmpfile_src \
27 |     -m $model_dir/model.l2r.ens{1,2,3,4}.npz \
28 |     -k 50 -p 1 --n-best > $tmpfile_nbest
29 |
30 | #rescoring
31 | # (rerank_normalize.py lives in ../scripts; its arguments are K and ALPHA)
32 | $model_dir/../scripts/reverse_nbest.py < $tmpfile_nbest | \
33 |     THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/rescore.py \
34 |     -m $model_dir/model.r2l.ens{1,2,3,4}.npz \
35 |     -b 50 -s $tmpfile_src | \
36 | $model_dir/../scripts/rerank_normalize.py 12 1 | \
37 | $model_dir/../scripts/reverse.py | \
38 | $model_dir/postprocess.sh
39 |
40 | # clean up, as the other *-reranked.sh scripts do
41 | rm $tmpfile_src
42 | rm $tmpfile_nbest
43 |
--------------------------------------------------------------------------------
/ru-en/translate-single.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | model_dir=`dirname $0`
4 |
5 | #language-independent variables (toolkit locations)
6 | .
$model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . $model_dir/vars 10 | 11 | $model_dir/preprocess.sh | \ 12 | THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/translate.py \ 13 | -m $model_dir/model.l2r.ens1.npz \ 14 | -k 12 -n -p 1 | \ 15 | $model_dir/postprocess.sh 16 | -------------------------------------------------------------------------------- /ru-en/vars: -------------------------------------------------------------------------------- 1 | src=ru 2 | trg=en 3 | -------------------------------------------------------------------------------- /scripts/rerank.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | # Distributed under MIT license 5 | 6 | import sys 7 | from collections import defaultdict 8 | 9 | if __name__ == '__main__': 10 | 11 | if len(sys.argv) > 1: 12 | k = int(sys.argv[1]) 13 | else: 14 | k = float('inf') 15 | 16 | cur = 0 17 | best_score = float('inf') 18 | best_sent = '' 19 | idx = 0 20 | for line in sys.stdin: 21 | num, sent, scores = line.split(' ||| ') 22 | 23 | # new input sentence: print best translation of previous sentence, and reset stats 24 | if int(num) > cur: 25 | print best_sent 26 | #print best_score 27 | cur = int(num) 28 | best_score = float('inf') 29 | best_sent = '' 30 | idx = 0 31 | 32 | #only consider k-best hypotheses 33 | if idx >= k: 34 | continue 35 | 36 | score = sum(map(float, scores.split())) 37 | if score < best_score: 38 | best_score = score 39 | best_sent = sent.strip() 40 | 41 | idx += 1 42 | 43 | # end of file; print best translation of last sentence 44 | print best_sent 45 | # print best_score 46 | -------------------------------------------------------------------------------- /scripts/rerank_normalize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- 
coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | # Distributed under MIT license 5 | 6 | import sys 7 | from collections import defaultdict 8 | 9 | if __name__ == '__main__': 10 | 11 | if len(sys.argv) != 3: 12 | sys.stderr.write("usage: %s K ALPHA\n" % sys.argv[0]) 13 | sys.exit(1) 14 | k = float(sys.argv[1]) 15 | alpha = float(sys.argv[2]) 16 | 17 | cur = 0 18 | best_score = float('inf') 19 | best_sent = '' 20 | idx = 0 21 | for line in sys.stdin: 22 | num, sent, scores = line.split(' ||| ') 23 | 24 | # new input sentence: print best translation of previous sentence, and reset stats 25 | if int(num) > cur: 26 | print best_sent 27 | #print best_score 28 | cur = int(num) 29 | best_score = float('inf') 30 | best_sent = '' 31 | idx = 0 32 | 33 | #only consider k-best hypotheses 34 | if idx >= k: 35 | continue 36 | 37 | score = sum(map(float, scores.split())) / (len(sent.split()) + 1)**alpha 38 | if score < best_score: 39 | best_score = score 40 | best_sent = sent.strip() 41 | 42 | idx += 1 43 | 44 | # end of file; print best translation of last sentence 45 | print best_sent 46 | # print best_score 47 | 48 | -------------------------------------------------------------------------------- /scripts/reverse.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | # Distributed under MIT license 5 | 6 | import sys 7 | 8 | for line in sys.stdin: 9 | sys.stdout.write(' '.join(reversed(line.split())) + '\n') 10 | -------------------------------------------------------------------------------- /scripts/reverse_nbest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | # Distributed under MIT license 5 | 6 | import sys 7 | 8 | for line in sys.stdin: 9 | linesplit = line.split(' ||| ') 10 | linesplit[1] = ' '.join(reversed(linesplit[1].split())) 11 | 
sys.stdout.write(' ||| '.join(linesplit))
12 |
--------------------------------------------------------------------------------
/tr-en/postprocess.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | model_dir=`dirname $0`
4 |
5 | #language-independent variables (toolkit locations)
6 | . $model_dir/../vars
7 |
8 | # src and trg language
9 | src=tr
10 | trg=en
11 |
12 | # remove BPE, detruecase, detokenize
13 | sed 's/\@\@ //g' | \
14 | $moses_scripts/recaser/detruecase.perl | \
15 | $moses_scripts/tokenizer/detokenizer.perl -l $trg
16 |
--------------------------------------------------------------------------------
/tr-en/preprocess.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | model_dir=`dirname $0`
4 |
5 | #language-independent variables (toolkit locations)
6 | . $model_dir/../vars
7 |
8 | # src and trg language
9 | src=tr
10 | trg=en
11 |
12 | # normalize punctuation, tokenize, truecase, apply BPE (stdin -> stdout).
13 | # The truecase model and the BPE codes are looked up next to this script,
14 | # so the script can be called from any working directory.
15 | $moses_scripts/tokenizer/normalize-punctuation.perl -l $src | \
16 | $moses_scripts/tokenizer/tokenizer.perl -a -l $src | \
17 | $moses_scripts/recaser/truecase.perl -model $model_dir/truecase-model.$src | \
18 | $bpe_scripts/apply_bpe.py -c $model_dir/$src$trg.bpe
19 |
--------------------------------------------------------------------------------
/tr-en/tf-translate-ensemble.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | model_dir=`dirname $0`
4 |
5 | #language-independent variables (toolkit locations)
6 | . $model_dir/../vars
7 |
8 | #language-dependent variables (source and target language)
9 | .
$model_dir/vars 10 | 11 | $model_dir/preprocess.sh | \ 12 | CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/translate.py \ 13 | -m $model_dir/model.l2r.ens{1,2,3,4}.npz \ 14 | -k 12 -n | \ 15 | $model_dir/postprocess.sh 16 | -------------------------------------------------------------------------------- /tr-en/tf-translate-reranked.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . $model_dir/vars 10 | 11 | # temporary files 12 | tmpfile_src=`mktemp` 13 | tmpfile_nbest=`mktemp` 14 | 15 | $model_dir/preprocess.sh > $tmpfile_src 16 | 17 | #left-to-right n-best list 18 | CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/translate.py < $tmpfile_src \ 19 | -m $model_dir/model.l2r.ens{1,2,3,4}.npz \ 20 | -k 50 --n-best > $tmpfile_nbest 21 | 22 | #rescoring 23 | $model_dir/../scripts/reverse_nbest.py < $tmpfile_nbest | \ 24 | CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/rescore.py \ 25 | -m $model_dir/model.r2l.ens{1,2,3,4}.npz \ 26 | -b 80 -s $tmpfile_src | \ 27 | $model_dir/../scripts/rerank_normalize.py 50 1 | \ 28 | $model_dir/../scripts/reverse.py | \ 29 | $model_dir/postprocess.sh 30 | 31 | rm $tmpfile_src 32 | rm $tmpfile_nbest 33 | -------------------------------------------------------------------------------- /tr-en/tf-translate-single.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . 
$model_dir/vars
10 |
11 | $model_dir/preprocess.sh | \
12 |     CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/translate.py \
13 |     -m $model_dir/model.l2r.ens1.npz \
14 |     -k 12 -n | \
15 | $model_dir/postprocess.sh
16 |
--------------------------------------------------------------------------------
/tr-en/translate-ensemble.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | model_dir=`dirname $0`
4 |
5 | #language-independent variables (toolkit locations)
6 | . $model_dir/../vars
7 |
8 | # translate stdin with the ensemble of four left-to-right models.
9 | # All paths are anchored at $model_dir so the script does not depend on the
10 | # current working directory.
11 | $model_dir/preprocess.sh | \
12 |     THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/translate.py \
13 |     -m $model_dir/model.l2r.ens{1,2,3,4}.npz \
14 |     -k 12 -n -p 1 --suppress-unk | \
15 | $model_dir/postprocess.sh
16 |
--------------------------------------------------------------------------------
/tr-en/translate-reranked.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | model_dir=`dirname $0`
4 |
5 | #language-independent variables (toolkit locations)
6 | .
$model_dir/../vars
7 |
8 | # Translates stdin with the left-to-right ensemble (50-best list), rescores
9 | # the reversed hypotheses against the reversed source with the right-to-left
10 | # ensemble, and writes the best hypothesis per sentence to stdout.
11 | # All paths are anchored at $model_dir so the script does not depend on the
12 | # current working directory.
13 |
14 | # temporary files
15 | tmpfile_src=`mktemp`
16 | tmpfile_nbest=`mktemp`
17 | tmpfile_reverse=`mktemp`
18 |
19 | $model_dir/preprocess.sh > $tmpfile_src
20 |
21 | #left-to-right n-best list
22 | THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/translate.py < $tmpfile_src \
23 |     -m $model_dir/model.l2r.ens{1,2,3,4}.npz \
24 |     -k 50 -p 2 --n-best --suppress-unk > $tmpfile_nbest
25 |
26 | #need to reverse the source file
27 | $model_dir/../scripts/reverse.py < $tmpfile_src > $tmpfile_reverse
28 |
29 | #rescoring
30 | $model_dir/../scripts/reverse_nbest.py < $tmpfile_nbest | \
31 |     THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/rescore.py \
32 |     -m $model_dir/model.r2l.ens{1,2,3,4}.npz \
33 |     -b 10 -s $tmpfile_reverse | \
34 | $model_dir/../scripts/rerank_normalize.py 50 1 | \
35 | $model_dir/../scripts/reverse.py | \
36 | $model_dir/postprocess.sh
37 |
38 | rm $tmpfile_src
39 | rm $tmpfile_nbest
40 | rm $tmpfile_reverse
41 |
--------------------------------------------------------------------------------
/tr-en/translate-single.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | model_dir=`dirname $0`
4 |
5 | #language-independent variables (toolkit locations)
6 | .
$model_dir/../vars
7 |
8 | # translate stdin with a single left-to-right model.
9 | # All paths are anchored at $model_dir so the script does not depend on the
10 | # current working directory.
11 | $model_dir/preprocess.sh | \
12 |     THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/translate.py \
13 |     -m $model_dir/model.l2r.ens1.npz \
14 |     -k 12 -n -p 1 --suppress-unk | \
15 | $model_dir/postprocess.sh
16 |
--------------------------------------------------------------------------------
/training/.htaccess:
--------------------------------------------------------------------------------
1 | AddHandler markdown .md
2 | AddType text/html .md
3 | ReadmeName README.md
4 | IndexIgnore *.md README
5 |
--------------------------------------------------------------------------------
/training/README.md:
--------------------------------------------------------------------------------
1 | WMT17 TRAINING SCRIPTS
2 | ----------------------
3 |
4 | We used various different approaches for preprocessing
5 | and data augmentation for monolingual data for different
6 | languages. Check the system description for more detail.
7 |
8 | In this directory, we provide a sample configuration for
9 | preprocessing and training for English->German. Please note
10 | that this script will not reproduce our WMT17 results, which also
11 | rely on the use of back-translated monolingual data, and
12 | combination of multiple models. Please also have a look at last year's
13 | accompanying scripts and sample configurations; among others,
14 | there is documentation for right-to-left reranking:
15 | https://github.com/rsennrich/wmt16-scripts
16 |
17 | Note: since the WMT17 models were developed, Nematus has switched
18 | from using a Theano back-end to using TensorFlow. The scripts provided in
19 | the ```scripts``` directory are for use with the Theano version; updated
20 | scripts for use with the current TensorFlow version can be found in
21 | ```scripts.tensorflow```.
22 |
23 |
24 | USAGE INSTRUCTIONS
25 | ------------------
26 |
27 | 1. ensure that all requirements are present, and that the path names in the ‘vars’ file (and in `../vars`) are up-to-date.
If you want to train on GPUs, you should also update the ‘device’ variable in that file. 28 | 29 | 2. download sample files (WMT17 parallel training data, dev and test sets): 30 | 31 | ``` 32 | scripts/download_files.sh 33 | ``` 34 | 35 | 3. preprocess the training, development and test corpora: 36 | 37 | ``` 38 | scripts/preprocess.sh 39 | ``` 40 | 41 | 4. train a Nematus model: 42 | 43 | ``` 44 | scripts/train.sh 45 | ``` 46 | 47 | 5. evaluate your model: 48 | 49 | ``` 50 | scripts/evaluate.sh 51 | ``` 52 | 53 | -------------------------------------------------------------------------------- /training/data/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | -------------------------------------------------------------------------------- /training/downloads/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | -------------------------------------------------------------------------------- /training/model/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | -------------------------------------------------------------------------------- /training/scripts.tensorflow/download_files.sh: -------------------------------------------------------------------------------- 1 | ../scripts/download_files.sh -------------------------------------------------------------------------------- /training/scripts.tensorflow/evaluate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Distributed under MIT license 3 | 4 | # this script evaluates the best model (according to BLEU early stopping) 5 | # on newstest2017, using detokenized BLEU (equivalent to evaluation with 6 | # mteval-v13a.pl) 7 | 8 | script_dir=`dirname $0` 9 | main_dir=$script_dir/../ 10 | data_dir=$main_dir/data 11 | working_dir=$main_dir/model 12 | 13 | #language-independent variables (toolkit locations) 14 | . 
$main_dir/../vars 15 | 16 | #language-dependent variables (source and target language) 17 | . $main_dir/vars 18 | 19 | test_prefix=newstest2017 20 | test=$test_prefix.bpe.$src 21 | ref=$test_prefix.$trg 22 | model=$working_dir/model.best-valid-script 23 | 24 | # decode 25 | CUDA_VISIBLE_DEVICES=$device python $nematus_home/nematus/translate.py \ 26 | -m $model \ 27 | -i $data_dir/$test \ 28 | -o $working_dir/$test.output.dev \ 29 | -k 12 \ 30 | -n 31 | 32 | # postprocess 33 | $script_dir/postprocess.sh < $working_dir/$test.output.dev > $working_dir/$test.output.postprocessed.dev 34 | 35 | # evaluate with detokenized BLEU (same as mteval-v13a.pl) 36 | $nematus_home/data/multi-bleu-detok.perl $data_dir/$ref < $working_dir/$test.output.postprocessed.dev 37 | -------------------------------------------------------------------------------- /training/scripts.tensorflow/postprocess.sh: -------------------------------------------------------------------------------- 1 | ../scripts/postprocess.sh -------------------------------------------------------------------------------- /training/scripts.tensorflow/preprocess.sh: -------------------------------------------------------------------------------- 1 | ../scripts/preprocess.sh -------------------------------------------------------------------------------- /training/scripts.tensorflow/train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Distributed under MIT license 3 | 4 | script_dir=`dirname $0` 5 | main_dir=$script_dir/../ 6 | data_dir=$main_dir/data 7 | working_dir=$main_dir/model 8 | 9 | #language-independent variables (toolkit locations) 10 | . $main_dir/../vars 11 | 12 | #language-dependent variables (source and target language) 13 | . 
$main_dir/vars 14 | 15 | CUDA_VISIBLE_DEVICES=$device python $nematus_home/nematus/train.py \ 16 | --model $working_dir/model \ 17 | --datasets $data_dir/corpus.bpe.$src $data_dir/corpus.bpe.$trg \ 18 | --valid_datasets $data_dir/newstest2013.bpe.$src $data_dir/newstest2013.bpe.$trg \ 19 | --dictionaries $data_dir/corpus.bpe.$src.json $data_dir/corpus.bpe.$trg.json \ 20 | --valid_script $script_dir/validate.sh \ 21 | --reload latest_checkpoint \ 22 | --dim_word 512 \ 23 | --dim 1024 \ 24 | --lrate 0.0001 \ 25 | --optimizer adam \ 26 | --maxlen 50 \ 27 | --batch_size 80 \ 28 | --valid_batch_size 40 \ 29 | --validFreq 10000 \ 30 | --dispFreq 1000 \ 31 | --saveFreq 30000 \ 32 | --sampleFreq 10000 \ 33 | --tie_decoder_embeddings \ 34 | --layer_normalisation \ 35 | --dec_base_recurrence_transition_depth 8 \ 36 | --enc_recurrence_transition_depth 4 37 | 38 | -------------------------------------------------------------------------------- /training/scripts.tensorflow/validate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Distributed under MIT license 3 | 4 | # this script evaluates translations of the newstest2013 test set 5 | # using detokenized BLEU (equivalent to evaluation with mteval-v13a.pl). 6 | 7 | translations=$1 8 | 9 | script_dir=`dirname $0` 10 | main_dir=$script_dir/../ 11 | data_dir=$main_dir/data 12 | 13 | #language-independent variables (toolkit locations) 14 | . $main_dir/../vars 15 | 16 | #language-dependent variables (source and target language) 17 | . 
$main_dir/vars 18 | 19 | dev_prefix=newstest2013 20 | ref=$dev_prefix.$trg 21 | 22 | # evaluate translations and write BLEU score to standard output (for 23 | # use by nmt.py) 24 | $script_dir/postprocess.sh < $translations | \ 25 | $nematus_home/data/multi-bleu-detok.perl $data_dir/$ref | \ 26 | cut -f 3 -d ' ' | \ 27 | cut -f 1 -d ',' 28 | -------------------------------------------------------------------------------- /training/scripts/download_files.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Downloads WMT17 training and test data for EN-DE 3 | # Distributed under MIT license 4 | 5 | script_dir=`dirname $0` 6 | main_dir=$script_dir/.. 7 | 8 | #language-independent variables (toolkit locations) 9 | . $main_dir/../vars 10 | 11 | # get EN-DE training data for WMT17 12 | 13 | if [ ! -f $main_dir/downloads/de-en.tgz ]; 14 | then 15 | wget http://www.statmt.org/europarl/v7/de-en.tgz -O $main_dir/downloads/de-en.tgz 16 | tar -xf $main_dir/downloads/de-en.tgz -C $main_dir/downloads 17 | fi 18 | 19 | if [ ! -f $main_dir/downloads/training-parallel-commoncrawl.tgz ]; 20 | then 21 | wget http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz -O $main_dir/downloads/training-parallel-commoncrawl.tgz 22 | tar -xf $main_dir/downloads/training-parallel-commoncrawl.tgz -C $main_dir/downloads 23 | fi 24 | 25 | if [ ! -f $main_dir/downloads/training-parallel-nc-v12.tgz ]; 26 | then 27 | wget http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12.tgz -O $main_dir/downloads/training-parallel-nc-v12.tgz 28 | tar -xf $main_dir/downloads/training-parallel-nc-v12.tgz -C $main_dir/downloads 29 | fi 30 | 31 | if [ ! -f $main_dir/downloads/rapid2016.tgz ]; 32 | then 33 | wget http://data.statmt.org/wmt17/translation-task/rapid2016.tgz -O $main_dir/downloads/rapid2016.tgz 34 | tar -xf $main_dir/downloads/rapid2016.tgz -C $main_dir/downloads 35 | fi 36 | 37 | if [ ! 
-f $main_dir/downloads/dev.tgz ]; 38 | then 39 | wget http://data.statmt.org/wmt17/translation-task/dev.tgz -O $main_dir/downloads/dev.tgz 40 | tar -xf $main_dir/downloads/dev.tgz -C $main_dir/downloads 41 | fi 42 | 43 | if [ ! -f $main_dir/downloads/test.tgz ]; 44 | then 45 | wget http://data.statmt.org/wmt17/translation-task/test.tgz -O $main_dir/downloads/test.tgz 46 | tar -xf $main_dir/downloads/test.tgz -C $main_dir/downloads 47 | fi 48 | 49 | 50 | # concatenate all training corpora 51 | cat $main_dir/downloads/europarl-v7.de-en.en $main_dir/downloads/commoncrawl.de-en.en $main_dir/downloads/rapid2016.de-en.en $main_dir/downloads/training/news-commentary-v12.de-en.en > $main_dir/data/corpus.en 52 | cat $main_dir/downloads/europarl-v7.de-en.de $main_dir/downloads/commoncrawl.de-en.de $main_dir/downloads/rapid2016.de-en.de $main_dir/downloads/training/news-commentary-v12.de-en.de > $main_dir/data/corpus.de 53 | 54 | for year in 2013; 55 | do 56 | $moses_scripts/ems/support/input-from-sgm.perl < $main_dir/downloads/dev/newstest${year}-ref.de.sgm > $main_dir/data/newstest$year.de 57 | $moses_scripts/ems/support/input-from-sgm.perl < $main_dir/downloads/dev/newstest${year}-src.en.sgm > $main_dir/data/newstest$year.en 58 | done 59 | 60 | for year in 2014; 61 | do 62 | $moses_scripts/ems/support/input-from-sgm.perl < $main_dir/downloads/dev/newstest${year}-deen-ref.de.sgm > $main_dir/data/newstest$year.de 63 | $moses_scripts/ems/support/input-from-sgm.perl < $main_dir/downloads/dev/newstest${year}-deen-src.en.sgm > $main_dir/data/newstest$year.en 64 | done 65 | 66 | for year in {2015,2016}; 67 | do 68 | $moses_scripts/ems/support/input-from-sgm.perl < $main_dir/downloads/dev/newstest${year}-ende-ref.de.sgm > $main_dir/data/newstest$year.de 69 | $moses_scripts/ems/support/input-from-sgm.perl < $main_dir/downloads/dev/newstest${year}-ende-src.en.sgm > $main_dir/data/newstest$year.en 70 | done 71 | 72 | for year in 2017; 73 | do 74 | 
$moses_scripts/ems/support/input-from-sgm.perl < $main_dir/downloads/test/newstest${year}-ende-ref.de.sgm > $main_dir/data/newstest$year.de 75 | $moses_scripts/ems/support/input-from-sgm.perl < $main_dir/downloads/test/newstest${year}-ende-src.en.sgm > $main_dir/data/newstest$year.en 76 | done 77 | 78 | 79 | cd .. 80 | -------------------------------------------------------------------------------- /training/scripts/evaluate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Distributed under MIT license 3 | 4 | # this script evaluates the best model (according to BLEU early stopping) 5 | # on newstest2017, using detokenized BLEU (equivalent to evaluation with 6 | # mteval-v13a.pl) 7 | 8 | script_dir=`dirname $0` 9 | main_dir=$script_dir/../ 10 | data_dir=$main_dir/data 11 | working_dir=$main_dir/model 12 | 13 | #language-independent variables (toolkit locations) 14 | . $main_dir/../vars 15 | 16 | #language-dependent variables (source and target language) 17 | . 
$main_dir/vars 18 | 19 | test_prefix=newstest2017 20 | test=$test_prefix.bpe.$src 21 | ref=$test_prefix.$trg 22 | model=$working_dir/model.npz.best_bleu 23 | 24 | 25 | # decode 26 | 27 | # for new Tensorflow backend, use a command like this: 28 | # CUDA_VISIBLE_DEVICES=$device python $nematus_home/nematus/translate.py \ 29 | 30 | THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device,gpuarray.preallocate=0.1 time python $nematus_home/nematus/translate.py \ 31 | -m $model \ 32 | -i $data_dir/$test -o $working_dir/$test.output.dev -k 12 -n -p 1 --suppress-unk 33 | 34 | # postprocess 35 | $script_dir/postprocess.sh < $working_dir/$test.output.dev > $working_dir/$test.output.postprocessed.dev 36 | 37 | # evaluate with detokenized BLEU (same as mteval-v13a.pl) 38 | $nematus_home/data/multi-bleu-detok.perl $data_dir/$ref < $working_dir/$test.output.postprocessed.dev 39 | -------------------------------------------------------------------------------- /training/scripts/postprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Distributed under MIT license 3 | 4 | # this sample script postprocesses the MT output, 5 | # including merging of BPE subword units, 6 | # detruecasing, and detokenization 7 | 8 | script_dir=`dirname $0` 9 | main_dir=$script_dir/../ 10 | 11 | #language-independent variables (toolkit locations) 12 | . $main_dir/../vars 13 | 14 | #language-dependent variables (source and target language) 15 | . 
$main_dir/vars 16 | 17 | sed -r 's/\@\@ //g' | 18 | $moses_scripts/recaser/detruecase.perl | 19 | $moses_scripts/tokenizer/detokenizer.perl -l $trg 20 | -------------------------------------------------------------------------------- /training/scripts/preprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Distributed under MIT license 3 | 4 | # this sample script preprocesses a sample corpus, including tokenization, 5 | # truecasing, and subword segmentation. 6 | # for application to a different language pair, 7 | # change source and target prefix, optionally the number of BPE operations, 8 | 9 | script_dir=`dirname $0` 10 | main_dir=$script_dir/.. 11 | data_dir=$main_dir/data 12 | model_dir=$main_dir/model 13 | 14 | #language-independent variables (toolkit locations) 15 | . $main_dir/../vars 16 | 17 | #language-dependent variables (source and target language) 18 | . $main_dir/vars 19 | 20 | # number of merge operations. Network vocabulary should be slightly larger (to include characters), 21 | # or smaller if the operations are learned on the joint vocabulary 22 | bpe_operations=90000 23 | 24 | #minimum number of times we need to have seen a character sequence in the training text before we merge it into one unit 25 | #this is applied to each training text independently, even with joint BPE 26 | bpe_threshold=50 27 | 28 | # tokenize 29 | for prefix in corpus newstest2013 newstest2014 newstest2015 newstest2016 newstest2017 30 | do 31 | cat $data_dir/$prefix.$src | \ 32 | $moses_scripts/tokenizer/normalize-punctuation.perl -l $src | \ 33 | $moses_scripts/tokenizer/tokenizer.perl -a -l $src > $data_dir/$prefix.tok.$src 34 | 35 | cat $data_dir/$prefix.$trg | \ 36 | $moses_scripts/tokenizer/normalize-punctuation.perl -l $trg | \ 37 | $moses_scripts/tokenizer/tokenizer.perl -a -l $trg > $data_dir/$prefix.tok.$trg 38 | 39 | done 40 | 41 | # clean empty and long sentences, and sentences with high source-target 
ratio (training corpus only) 42 | $moses_scripts/training/clean-corpus-n.perl $data_dir/corpus.tok $src $trg $data_dir/corpus.tok.clean 1 80 43 | 44 | # train truecaser 45 | $moses_scripts/recaser/train-truecaser.perl -corpus $data_dir/corpus.tok.clean.$src -model $model_dir/truecase-model.$src 46 | $moses_scripts/recaser/train-truecaser.perl -corpus $data_dir/corpus.tok.clean.$trg -model $model_dir/truecase-model.$trg 47 | 48 | # apply truecaser (cleaned training corpus) 49 | for prefix in corpus 50 | do 51 | $moses_scripts/recaser/truecase.perl -model $model_dir/truecase-model.$src < $data_dir/$prefix.tok.clean.$src > $data_dir/$prefix.tc.$src 52 | $moses_scripts/recaser/truecase.perl -model $model_dir/truecase-model.$trg < $data_dir/$prefix.tok.clean.$trg > $data_dir/$prefix.tc.$trg 53 | done 54 | 55 | # apply truecaser (dev/test files) 56 | for prefix in newstest2013 newstest2014 newstest2015 newstest2016 newstest2017 57 | do 58 | $moses_scripts/recaser/truecase.perl -model $model_dir/truecase-model.$src < $data_dir/$prefix.tok.$src > $data_dir/$prefix.tc.$src 59 | $moses_scripts/recaser/truecase.perl -model $model_dir/truecase-model.$trg < $data_dir/$prefix.tok.$trg > $data_dir/$prefix.tc.$trg 60 | done 61 | 62 | # train BPE 63 | $bpe_scripts/learn_joint_bpe_and_vocab.py -i $data_dir/corpus.tc.$src $data_dir/corpus.tc.$trg --write-vocabulary $data_dir/vocab.$src $data_dir/vocab.$trg -s $bpe_operations -o $model_dir/$src$trg.bpe 64 | 65 | # apply BPE 66 | 67 | for prefix in corpus newstest2013 newstest2014 newstest2015 newstest2016 newstest2017 68 | do 69 | $bpe_scripts/apply_bpe.py -c $model_dir/$src$trg.bpe --vocabulary $data_dir/vocab.$src --vocabulary-threshold $bpe_threshold < $data_dir/$prefix.tc.$src > $data_dir/$prefix.bpe.$src 70 | $bpe_scripts/apply_bpe.py -c $model_dir/$src$trg.bpe --vocabulary $data_dir/vocab.$trg --vocabulary-threshold $bpe_threshold < $data_dir/$prefix.tc.$trg > $data_dir/$prefix.bpe.$trg 71 | done 72 | 73 | # build network 
dictionary 74 | $nematus_home/data/build_dictionary.py $data_dir/corpus.bpe.$src $data_dir/corpus.bpe.$trg 75 | 76 | -------------------------------------------------------------------------------- /training/scripts/train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Distributed under MIT license 3 | 4 | script_dir=`dirname $0` 5 | main_dir=$script_dir/../ 6 | data_dir=$main_dir/data 7 | working_dir=$main_dir/model 8 | 9 | #language-independent variables (toolkit locations) 10 | . $main_dir/../vars 11 | 12 | #language-dependent variables (source and target language) 13 | . $main_dir/vars 14 | 15 | # for new Tensorflow backend, use a command like this: 16 | # CUDA_VISIBLE_DEVICES=$device python $nematus_home/nematus/nmt.py \ 17 | 18 | THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device,gpuarray.preallocate=0.8 python $nematus_home/nematus/nmt.py \ 19 | --model $working_dir/model.npz \ 20 | --datasets $data_dir/corpus.bpe.$src $data_dir/corpus.bpe.$trg \ 21 | --valid_datasets $data_dir/newstest2013.bpe.$src $data_dir/newstest2013.bpe.$trg \ 22 | --dictionaries $data_dir/corpus.bpe.$src.json $data_dir/corpus.bpe.$trg.json \ 23 | --external_validation_script $script_dir/validate.sh \ 24 | --dim_word 512 \ 25 | --dim 1024 \ 26 | --lrate 0.0001 \ 27 | --optimizer adam \ 28 | --maxlen 50 \ 29 | --batch_size 80 \ 30 | --valid_batch_size 40 \ 31 | --validFreq 10000 \ 32 | --dispFreq 1000 \ 33 | --saveFreq 30000 \ 34 | --sampleFreq 10000 \ 35 | --tie_decoder_embeddings \ 36 | --layer_normalisation \ 37 | --dec_base_recurrence_transition_depth 8 \ 38 | --enc_recurrence_transition_depth 4 39 | 40 | -------------------------------------------------------------------------------- /training/scripts/validate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Distributed under MIT license 3 | 4 | # this script evaluates the current model on newstest2013, 5 | # using 
detokenized BLEU (equivalent to evaluation with
6 | # mteval-v13a.pl).
7 |
8 | # If BLEU improves, the model is copied to model.npz.best_bleu
9 |
10 | script_dir=`dirname $0`
11 | main_dir=$script_dir/../
12 | data_dir=$main_dir/data
13 | working_dir=$main_dir/model
14 |
15 | #language-independent variables (toolkit locations)
16 | . $main_dir/../vars
17 |
18 | #language-dependent variables (source and target language)
19 | . $main_dir/vars
20 |
21 | dev_prefix=newstest2013
22 | dev=$dev_prefix.bpe.$src
23 | ref=$dev_prefix.$trg
24 | prefix=$working_dir/model.npz
25 |
26 |
27 | # decode
28 | # for new Tensorflow backend, use a command like this:
29 | # CUDA_VISIBLE_DEVICES=$device python $nematus_home/nematus/translate.py \
30 |
31 | THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device,gpuarray.preallocate=0.1 time python $nematus_home/nematus/translate.py \
32 |     -m $prefix.dev.npz \
33 |     -i $data_dir/$dev -o $working_dir/$dev.output.dev -k 5 -n -p 1 --suppress-unk
34 |
35 |
36 | $script_dir/postprocess.sh < $working_dir/$dev.output.dev > $working_dir/$dev.output.postprocessed.dev
37 |
38 |
39 | ## get BLEU
40 | # best score so far; falls back to 0 when no score has been recorded yet
41 | BEST=`cat ${prefix}_best_bleu || echo 0`
42 | # run the scorer once; its output is both appended to the score log and parsed below
43 | SCORE_LINE=`$nematus_home/data/multi-bleu-detok.perl $data_dir/$ref < $working_dir/$dev.output.postprocessed.dev`
44 | printf '%s\n' "$SCORE_LINE" >> ${prefix}_bleu_scores
45 | # third space-separated field of the scorer output, with the trailing comma removed
46 | BLEU=`printf '%s\n' "$SCORE_LINE" | cut -f 3 -d ' ' | cut -f 1 -d ','`
47 | BETTER=`echo "$BLEU > $BEST" | bc`
48 |
49 | echo "BLEU = $BLEU"
50 |
51 | # keep a copy of the current model (and its .json file) when it beats the previous best
52 | if [ "$BETTER" = "1" ]; then
53 |   echo "new best; saving"
54 |   echo $BLEU > ${prefix}_best_bleu
55 |   cp ${prefix}.dev.npz ${prefix}.best_bleu
56 |   cp ${prefix}.dev.npz.json ${prefix}.best_bleu.json
57 | fi
58 |
--------------------------------------------------------------------------------
/training/vars:
--------------------------------------------------------------------------------
1 | src=en
2 | trg=de
3 |
-------------------------------------------------------------------------------- /vars: -------------------------------------------------------------------------------- 1 | # scripts directory of moses decoder: http://www.statmt.org/moses/ 2 | # you do not need to compile moses; a simple download is sufficient 3 | moses_scripts=/home/bhaddow/moses.new/dist/977e8ea/scripts 4 | 5 | #scripts for subword segmentation: https://github.com/rsennrich/subword-nmt 6 | bpe_scripts=/home/bhaddow/tools/subword-nmt 7 | 8 | #nematus (theano version): https://github.com/EdinburghNLP/nematus/tree/theano 9 | nematus_home=/mnt/gna0/rsennrich/tools/nematus/ 10 | 11 | #jieba word segmentation utility: https://pypi.python.org/pypi/jieba/ 12 | #this is only required for Chinese 13 | zh_segment_home=/mnt/baldur0/tramooc/tools/jieba 14 | 15 | # Theano/TensorFlow device; change this to execute Nematus on GPU 16 | # 17 | # For Theano, a typical value is 'cuda' 18 | # 19 | # For TensorFlow, the value will be passed to CUDA_VISIBLE_DEVICES. It should 20 | # be a list of GPU identifiers. For example, '1' or '0,1,3' 21 | device=0 22 | -------------------------------------------------------------------------------- /zh-en/STATUS: -------------------------------------------------------------------------------- 1 | Comparing the outputs now against those created for the paper .. 2 | 3 | Checked translate-single: shows 4 diffs in 2000 sentences 4 | Checked translate-ensemble: shows no diffs 5 | Checked translate-reranked: 3 diffs in 2000 sentences 6 | -------------------------------------------------------------------------------- /zh-en/postprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . 
$model_dir/vars 10 | 11 | sed 's/\@\@ //g' | \ 12 | $moses_scripts/recaser/detruecase.perl | \ 13 | $moses_scripts/tokenizer/detokenizer.perl -l $trg 14 | -------------------------------------------------------------------------------- /zh-en/preprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . $model_dir/vars 10 | 11 | export PYTHONPATH=$jieba_home 12 | python -m jieba -d | \ 13 | $bpe_scripts/apply_bpe.py -c $model_dir/bpe.model.$src 14 | -------------------------------------------------------------------------------- /zh-en/rerank_normalize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | # Distributed under MIT license 5 | 6 | import sys 7 | from collections import defaultdict 8 | 9 | if __name__ == '__main__': 10 | 11 | if len(sys.argv) != 3: 12 | sys.stderr.write("usage: %s K ALPHA\n" % sys.argv[0]) 13 | sys.exit(1) 14 | k = float(sys.argv[1]) 15 | alpha = float(sys.argv[2]) 16 | 17 | cur = 0 18 | best_score = float('inf') 19 | best_sent = '' 20 | idx = 0 21 | for line in sys.stdin: 22 | num, sent, scores = line.split(' ||| ') 23 | 24 | # new input sentence: print best translation of previous sentence, and reset stats 25 | if int(num) > cur: 26 | print best_sent 27 | #print best_score 28 | cur = int(num) 29 | best_score = float('inf') 30 | best_sent = '' 31 | idx = 0 32 | 33 | #only consider k-best hypotheses 34 | if idx >= k: 35 | continue 36 | 37 | if len(sent.split()) == 0: 38 | continue 39 | 40 | score = sum(map(float, scores.split())) / (len(sent.split()))**alpha 41 | if score < best_score: 42 | best_score = score 43 | best_sent = sent.strip() 44 | 45 | idx += 1 46 | 47 | # end of file; print 
best translation of last sentence 48 | print best_sent 49 | # print best_score 50 | 51 | -------------------------------------------------------------------------------- /zh-en/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . $model_dir/vars 10 | 11 | $model_dir/preprocess.sh | \ 12 | THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/translate.py \ 13 | -m /home/bhaddow/experiments/wmt17/cs-en/translate/model.lr.0.fixed.npz \ 14 | -k 12 -n -p 1 | \ 15 | $model_dir/postprocess.sh 16 | -------------------------------------------------------------------------------- /zh-en/tf-translate-ensemble.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . $model_dir/vars 10 | 11 | $model_dir/preprocess.sh | \ 12 | CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/translate.py \ 13 | -m $model_dir/model.l2r.ens{1,2,3,4}.npz \ 14 | -k 12 -n | \ 15 | $model_dir/postprocess.sh 16 | -------------------------------------------------------------------------------- /zh-en/tf-translate-reranked.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=`dirname $0` 4 | 5 | #language-independent variables (toolkit locations) 6 | . $model_dir/../vars 7 | 8 | #language-dependent variables (source and target language) 9 | . 
$model_dir/vars
10 |
11 | # temporary files; the EXIT trap removes them whenever the script exits, including after a failed step
12 | tmpfile_src=`mktemp`
13 | tmpfile_nbest=`mktemp`
14 | trap 'rm -f "$tmpfile_src" "$tmpfile_nbest"' EXIT
15 |
16 | $model_dir/preprocess.sh > $tmpfile_src
17 |
18 | #left-to-right n-best list
19 | CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/translate.py < $tmpfile_src \
20 |     -m $model_dir/model.l2r.ens{1,2,3,4}.npz \
21 |     -k 50 --n-best > $tmpfile_nbest
22 |
23 | #rescoring
24 | $model_dir/../scripts/reverse_nbest.py < $tmpfile_nbest | \
25 |     CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/rescore.py \
26 |     -m $model_dir/model.r2l.ens{1,2,3,4}.npz \
27 |     -b 80 -s $tmpfile_src | \
28 |     $model_dir/../scripts/rerank_normalize.py 50 1 | \
29 |     $model_dir/../scripts/reverse.py | \
30 |     $model_dir/postprocess.sh
31 |
--------------------------------------------------------------------------------
/zh-en/tf-translate-single.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | model_dir=`dirname $0`
4 |
5 | #language-independent variables (toolkit locations)
6 | . $model_dir/../vars
7 |
8 | #language-dependent variables (source and target language)
9 | . $model_dir/vars
10 |
11 | $model_dir/preprocess.sh | \
12 |     CUDA_VISIBLE_DEVICES=$device python3 $nematus_home/nematus/translate.py \
13 |     -m $model_dir/model.l2r.ens1.npz \
14 |     -k 12 -n | \
15 |     $model_dir/postprocess.sh
16 |
--------------------------------------------------------------------------------
/zh-en/translate-ensemble.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | model_dir=`dirname $0`
4 |
5 | #language-independent variables (toolkit locations)
6 | . $model_dir/../vars
7 |
8 | #language-dependent variables (source and target language)
9 | . $model_dir/vars
10 |
11 | $model_dir/preprocess.sh | \
12 |     THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/translate.py \
13 |     -m $model_dir/model.l2r.ens{1,2,3}.npz --suppress-unk \
14 |     -k 12 -n -p 1 | \
15 |     $model_dir/postprocess.sh
16 |
--------------------------------------------------------------------------------
/zh-en/translate-reranked.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | model_dir=`dirname $0`
4 |
5 | #language-independent variables (toolkit locations)
6 | . $model_dir/../vars
7 |
8 | #language-dependent variables (source and target language)
9 | . $model_dir/vars
10 |
11 | # temporary files (created in the current directory); the EXIT trap removes them
12 | # whenever the script exits, including after a failed step
13 | tmpfile_src=`mktemp -p .`
14 | tmpfile_nbest=`mktemp -p .`
15 | trap 'rm -f "$tmpfile_src" "$tmpfile_nbest"' EXIT
16 |
17 | $model_dir/preprocess.sh > $tmpfile_src
18 |
19 | #left-to-right n-best list
20 | THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/translate.py < $tmpfile_src \
21 |     -m $model_dir/model.l2r.ens{1,2,3}.npz \
22 |     -k 50 -p 2 --n-best > $tmpfile_nbest
23 |
24 | #rescoring
25 | $model_dir/../scripts/reverse_nbest.py < $tmpfile_nbest | \
26 |     THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/rescore.py \
27 |     -m $model_dir/model.r2l.ens{1,2,3}.npz \
28 |     -b 80 -s $tmpfile_src | \
29 |     $model_dir/rerank_normalize.py 50 1 | \
30 |     $model_dir/../scripts/reverse.py | \
31 |     $model_dir/postprocess.sh
32 |
--------------------------------------------------------------------------------
/zh-en/translate-single.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | model_dir=`dirname $0`
4 |
5 | #language-independent variables (toolkit locations)
6 | . $model_dir/../vars
7 |
8 | #language-dependent variables (source and target language)
9 | . $model_dir/vars
10 |
11 | $model_dir/preprocess.sh | \
12 |     THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=$device python $nematus_home/nematus/translate.py \
13 |     -m $model_dir/model.l2r.ens1.npz --suppress-unk \
14 |     -k 12 -n -p 1 | \
15 |     $model_dir/postprocess.sh
16 |
--------------------------------------------------------------------------------
/zh-en/vars:
--------------------------------------------------------------------------------
1 | src=zh
2 | trg=en
3 | jieba_home=/mnt/baldur0/tramooc/tools/jieba
4 | nematus_home=/home/bhaddow/code/nematus/github
5 | device=cuda1
6 |
--------------------------------------------------------------------------------