├── .gitignore ├── LICENSE ├── README.md ├── baseline-1M-enfr ├── local │ └── parse_options.sh ├── results └── run.sh ├── baseline-small ├── local │ └── parse_options.sh └── run.sh └── romance-multi-way ├── local └── parse_options.sh └── run.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Lua sources 2 | luac.out 3 | 4 | # luarocks build files 5 | *.src.rock 6 | *.zip 7 | *.tar.gz 8 | 9 | # Object files 10 | *.o 11 | *.os 12 | *.ko 13 | *.obj 14 | *.elf 15 | 16 | # Precompiled Headers 17 | *.gch 18 | *.pch 19 | 20 | # Libraries 21 | *.lib 22 | *.a 23 | *.la 24 | *.lo 25 | *.def 26 | *.exp 27 | 28 | # Shared objects (inc. Windows DLLs) 29 | *.dll 30 | *.so 31 | *.so.* 32 | *.dylib 33 | 34 | # Executables 35 | *.exe 36 | *.out 37 | *.app 38 | *.i*86 39 | *.x86_64 40 | *.hex 41 | 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 OpenNMT 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Recipes 2 | Recipes for training OpenNMT systems 3 | 4 | 5 | You will find here some "recipes" which basically script the end-to-end data preparation, preprocessing, training and evaluation. 6 | 7 | ## Requirements 8 | 9 | * You do need OpenNMT - see [here](http://opennmt.net/OpenNMT/installation/). If you clone Recipes.git repo at the same level as OpenNMT.git on your local computer, you don't need to update the PATH 10 | in the scripts. Otherwise update the line `OPENNMT_PATH=../../OpenNMT` 11 | * for evaluation scripts, you do need the perl `XML::Twig` module (`perl -MCPAN -e 'install XML::Twig'`) 12 | 13 | ## The recipes 14 | 15 | ### Baseline-1M-enfr 16 | Train a baseline English-French model, use case feature and onmt reversible tokenization. GPU highly recommended. Training takes 75 minutes per epoch on a single GTX 1080. Parameters: 2x500 layers, 13 epochs. See script for the details. Data: set of 1 million parallel sentences (extract of Europarl, Newscommentaries, ..) See the results file for the evaluation. 21 | 22 | ### Romance Multi-way 23 | See http://forum.opennmt.net/t/training-romance-multi-way-model/86 GPU highly recommended. Training takes 4 1/2 hours per epoch on a single GTX 1080. Parameters: 2x500 layers, 13 epochs. See script for the details.
25 | -------------------------------------------------------------------------------- /baseline-1M-enfr/local/parse_options.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2012 Johns Hopkins University (Author: Daniel Povey); 4 | # Arnab Ghoshal, Karel Vesely 5 | 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 14 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 15 | # MERCHANTABLITY OR NON-INFRINGEMENT. 16 | # See the Apache 2 License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | 20 | # Parse command-line options. 21 | # To be sourced by another script (as in ". parse_options.sh"). 22 | # Option format is: --option-name arg 23 | # and shell variable "option_name" gets set to value "arg." 24 | # The exception is --help, which takes no arguments, but prints the 25 | # $help_message variable (if defined). 26 | 27 | 28 | ### 29 | ### The --config file options have lower priority to command line 30 | ### options, so we need to import them first... 31 | ### 32 | 33 | # Now import all the configs specified by command-line, in left-to-right order 34 | for ((argpos=1; argpos<$#; argpos++)); do 35 | if [ "${!argpos}" == "--config" ]; then 36 | argpos_plus1=$((argpos+1)) 37 | config=${!argpos_plus1} 38 | [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1 39 | . $config # source the config file. 
40 | fi 41 | done 42 | 43 | 44 | ### 45 | ### No we process the command line options 46 | ### 47 | while true; do 48 | [ -z "${1:-}" ] && break; # break if there are no arguments 49 | case "$1" in 50 | # If the enclosing script is called with --help option, print the help 51 | # message and exit. Scripts should put help messages in $help_message 52 | --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2; 53 | else printf "$help_message\n" 1>&2 ; fi; 54 | exit 0 ;; 55 | --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" 56 | exit 1 ;; 57 | # If the first command-line argument begins with "--" (e.g. --foo-bar), 58 | # then work out the variable name as $name, which will equal "foo_bar". 59 | --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; 60 | # Next we test whether the variable in question is undefned-- if so it's 61 | # an invalid option and we die. Note: $0 evaluates to the name of the 62 | # enclosing script. 63 | # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar 64 | # is undefined. We then have to wrap this test inside "eval" because 65 | # foo_bar is itself inside a variable ($name). 66 | eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; 67 | 68 | oldval="`eval echo \\$$name`"; 69 | # Work out whether we seem to be expecting a Boolean argument. 70 | if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then 71 | was_bool=true; 72 | else 73 | was_bool=false; 74 | fi 75 | 76 | # Set the variable to the right value-- the escaped quotes make it work if 77 | # the option had spaces, like --cmd "queue.pl -sync y" 78 | eval $name=\"$2\"; 79 | 80 | # Check that Boolean-valued arguments are really Boolean. 
81 | if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then 82 | echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 83 | exit 1; 84 | fi 85 | shift 2; 86 | ;; 87 | *) break; 88 | esac 89 | done 90 | 91 | 92 | # Check for an empty argument to the --cmd option, which can easily occur as a 93 | # result of scripting errors. 94 | [ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; 95 | 96 | 97 | true; # so this script returns exit code 0. 98 | -------------------------------------------------------------------------------- /baseline-1M-enfr/results: -------------------------------------------------------------------------------- 1 | 2 | MT evaluation scorer began on 2017 Jan 12 at 15:05:37 3 | command line: tools/mteval-v13a.pl -r data/testsets-enfr/News/newstest2014-fren-ref.fr.sgm -s data/testsets-enfr/News/newstest2014-fren-src.en.sgm -t exp/newstest2014-fren-tgt.trans.fr.sgm -c 4 | Evaluation of any-to-fr translation using: 5 | src set "newstest2014" (176 docs, 3003 segs) 6 | ref set "newstest2014" (1 refs) 7 | tst set "newstest2014" (1 systems) 8 | 9 | length ratio: 0.946537138126407 (73173/77306), penalty (log): -0.0564825823732797 10 | NIST score = 7.2576 BLEU score = 0.2614 for system "tst" 11 | 12 | # ------------------------------------------------------------------------ 13 | 14 | Individual N-gram scoring 15 | 1-gram 2-gram 3-gram 4-gram 5-gram 6-gram 7-gram 8-gram 9-gram 16 | ------ ------ ------ ------ ------ ------ ------ ------ ------ 17 | NIST: 5.3558 1.5459 0.3097 0.0399 0.0062 0.0009 0.0004 0.0001 0.0001 "tst" 18 | 19 | BLEU: 0.5852 0.3372 0.2146 0.1382 0.0913 0.0614 0.0416 0.0282 0.0191 "tst" 20 | 21 | # ------------------------------------------------------------------------ 22 | Cumulative N-gram scoring 23 | 1-gram 2-gram 3-gram 4-gram 5-gram 6-gram 7-gram 8-gram 9-gram 24 | ------ ------ ------ ------ ------ ------ ------ ------ ------ 25 | NIST: 5.3558 6.9018 7.2115 7.2514 7.2576 7.2585 
7.2589 7.2590 7.2591 "tst" 26 | 27 | BLEU: 0.5530 0.4198 0.3294 0.2614 0.2094 0.1691 0.1372 0.1118 0.0913 "tst" 28 | MT evaluation scorer ended on 2017 Jan 12 at 15:05:46 29 | 30 | 31 | 32 | For information, Multi Bleu score on generic test set: 33 | BLEU = 31.01, 58.4/37.6/26.1/18.5 (BP=0.967, ratio=0.967, hyp_len=42238, ref_len=43658) 34 | -------------------------------------------------------------------------------- /baseline-1M-enfr/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2017 Ubiqus (Author: Vincent Nguyen) 4 | # Systran (Author: Jean Senellart) 5 | # License MIT 6 | # 7 | # This recipe shows how to build an openNMT translation model from English to French 8 | # based on a limited resource (1 Mio segments) 9 | # 10 | # Based on the tuto from the OpenNMT forum 11 | 12 | 13 | # TODO test is GPU is present or not 14 | CUDA_VISIBLE_DEVICES=0 15 | decode_cpu=false 16 | 17 | # Make symlinks to access OpenNMT scripts - change this line if needed 18 | OPENNMT_PATH=../../OpenNMT 19 | [ ! -h tools ] && ln -s $OPENNMT_PATH/tools tools 20 | [ ! -h preprocess.lua ] && ln -s $OPENNMT_PATH/preprocess.lua preprocess.lua 21 | [ ! -h train.lua ] && ln -s $OPENNMT_PATH/train.lua train.lua 22 | [ ! -h translate.lua ] && ln -s $OPENNMT_PATH/translate.lua translate.lua 23 | [ ! -h onmt ] && ln -s $OPENNMT_PATH/onmt onmt 24 | 25 | # this is usefull to skip some stages during step by step execution 26 | stage=0 27 | 28 | # if you want to run without training and use an existing model in the "exp" folder set notrain to true 29 | notrain=false 30 | 31 | # making these variables to make replication easier for other languages 32 | sl=en 33 | tl=fr 34 | 35 | # training corpus - baseline-1M/baseline-2M available 36 | corpus=baseline-1M 37 | 38 | # At the moment only "stage" option is available anyway 39 | . 
local/parse_options.sh 40 | 41 | # Data download and preparation 42 | 43 | if [ $stage -le 0 ]; then 44 | # TODO put this part in a local/download_data.sh script ? 45 | mkdir -p data 46 | cd data 47 | if [ ! -f $corpus-$sl$tl.tgz ]; then 48 | echo "$0: downloading the baseline corpus from amazon s3" 49 | wget https://s3.amazonaws.com/opennmt-trainingdata/$corpus-$sl$tl.tgz 50 | tar xzfv $corpus-$sl$tl.tgz 51 | fi 52 | if [ ! -f testsets-$sl$tl.tgz ]; then 53 | echo "$0: downloading the baseline corpus from amazon s3" 54 | wget https://s3.amazonaws.com/opennmt-tests/testsets-$sl$tl.tgz 55 | tar xzfv testsets-$sl$tl.tgz 56 | fi 57 | cd ../local 58 | if [ ! -f mteval-v13a.pl ]; then 59 | wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/generic/mteval-v13a.pl 60 | fi 61 | if [ ! -f input-from-sgm.perl ]; then 62 | wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/ems/support/input-from-sgm.perl 63 | fi 64 | if [ ! -f wrap-xml.perl ]; then 65 | wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/ems/support/wrap-xml.perl 66 | fi 67 | if [ ! -f multi-bleu.perl ]; then 68 | wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/generic/multi-bleu.perl 69 | fi 70 | cd .. 71 | fi 72 | 73 | # Tokenize the Corpus 74 | if [ $stage -le 1 ]; then 75 | echo "$0: tokenizing corpus and test sets" 76 | for f in data/$corpus-$sl$tl/*.?? 
; do th tools/tokenize.lua -case_feature -joiner_annotate < $f > $f.tok ; done 77 | fi 78 | 79 | # Preprocess the data - decide here the vocabulary size 50000 default value 80 | if [ $stage -le 2 ]; then 81 | mkdir -p exp 82 | echo "$0: preprocessing corpus" 83 | th preprocess.lua -src_vocab_size 50000 -tgt_vocab_size 50000 \ 84 | -train_src data/$corpus-$sl$tl/*_train.$sl.tok \ 85 | -train_tgt data/$corpus-$sl$tl/*_train.$tl.tok \ 86 | -valid_src data/$corpus-$sl$tl/*_valid.$sl.tok \ 87 | -valid_tgt data/$corpus-$sl$tl/*_valid.$tl.tok -save_data exp/data-$corpus-$sl$tl 88 | fi 89 | 90 | # Train the model !!!! even if OS cuda device ID is 0 you need -gpuid=1 91 | # Decide here the number of epochs, learning rate, which epoch to start decay, decay rate 92 | # if you change number of epochs do not forget to change the model name too 93 | if [ $stage -le 3 ]; then 94 | if [ $notrain = false ]; then 95 | echo "$0: training starting, will take a while." 96 | th train.lua -data exp/data-$corpus-$sl$tl-train.t7 \ 97 | -save_model exp/model-$corpus-$sl$tl \ 98 | -end_epoch 13 -start_decay_at 5 -learning_rate_decay 0.65 -gpuid 1 99 | cp -f exp/model-$corpus-$sl$tl"_epoch13_"*".t7" exp/model-$corpus-$sl$tl"_final.t7" 100 | else 101 | echo "$0: using an existing model" 102 | if [ ! 
-f exp/model-$corpus-$sl$tl"_final.t7" ]; then 103 | echo "$0: mode file does not exist" 104 | exit 1 105 | fi 106 | fi 107 | fi 108 | 109 | # Deploy model for CPU usage 110 | if [ $stage -le 4 ]; then 111 | if [ $decode_cpu = true ]; then 112 | th tools/release_model.lua -force -model exp/model-$corpus-$sl$tl"_final.t7" -output_model exp/model-$corpus-$sl$tl"_cpu.t7" -gpuid 1 113 | fi 114 | fi 115 | 116 | # Translate using gpu 117 | # you can change this by changing the model name from _final to _cpu and remove -gpuid 1 118 | if [ $stage -le 5 ]; then 119 | [ $decode_cpu = true ] && dec_opt="" || dec_opt="-gpuid 1" 120 | th translate.lua -replace_unk -model exp/model-$corpus-$sl$tl"_final.t7" \ 121 | -src data/$corpus-$sl$tl/*_test.$sl.tok -output exp/${corpus}_test.hyp.$tl.tok $dec_opt 122 | fi 123 | 124 | # Evaluate the generic test set with multi-bleu 125 | if [ $stage -le 6 ]; then 126 | th tools/detokenize.lua -case_feature < exp/${corpus}_test.hyp.$tl.tok > exp/${corpus}_test.hyp.$tl.detok 127 | perl local/multi-bleu.perl data/$corpus-$sl$tl/*_test.$tl \ 128 | < exp/${corpus}_test.hyp.$tl.detok > exp/${corpus}_test_multibleu.txt 129 | fi 130 | 131 | ############################### 132 | #### Newstest Evaluation 133 | #### 134 | 135 | if [ $stage -le 7 ]; then 136 | 137 | testset=newstest2014-$sl$tl 138 | 139 | perl local/input-from-sgm.perl < data/testsets-$sl$tl/News/$testset-src.$sl.sgm \ 140 | > data/testsets-$sl$tl/News/$testset-src.$sl 141 | 142 | th tools/tokenize.lua -case_feature -joiner_annotate < data/testsets-$sl$tl/News/$testset-src.$sl \ 143 | > data/testsets-$sl$tl/News/$testset-src.$sl.tok 144 | 145 | [ $decode_cpu = true ] && dec_opt="" || dec_opt="-gpuid 1" 146 | 147 | th translate.lua -replace_unk -model exp/model-$corpus-$sl$tl"_final"*.t7 \ 148 | -src data/testsets-$sl$tl/News/$testset-src.$sl.tok \ 149 | -output exp/$testset-tgt.trans.$tl.tok $dec_opt 150 | 151 | th tools/detokenize.lua -case_feature < exp/$testset-tgt.trans.$tl.tok \ 
152 | > exp/$testset-tgt.trans.$tl 153 | 154 | # Wrap-xml to convert to sgm 155 | perl local/wrap-xml.perl $tl data/testsets-$sl$tl/News/$testset-src.$sl.sgm tst \ 156 | < exp/$testset-tgt.trans.$tl \ 157 | > exp/$testset-tgt.trans.$tl.sgm 158 | 159 | perl local/mteval-v13a.pl -r data/testsets-$sl$tl/News/$testset-ref.$tl.sgm \ 160 | -s data/testsets-$sl$tl/News/$testset-src.$sl.sgm -t exp/$testset-tgt.trans.$tl.sgm \ 161 | -c > exp/nist-bleu-$testset 162 | fi 163 | 164 | -------------------------------------------------------------------------------- /baseline-small/local/parse_options.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2012 Johns Hopkins University (Author: Daniel Povey); 4 | # Arnab Ghoshal, Karel Vesely 5 | 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 14 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 15 | # MERCHANTABLITY OR NON-INFRINGEMENT. 16 | # See the Apache 2 License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | 20 | # Parse command-line options. 21 | # To be sourced by another script (as in ". parse_options.sh"). 22 | # Option format is: --option-name arg 23 | # and shell variable "option_name" gets set to value "arg." 24 | # The exception is --help, which takes no arguments, but prints the 25 | # $help_message variable (if defined). 26 | 27 | 28 | ### 29 | ### The --config file options have lower priority to command line 30 | ### options, so we need to import them first... 
31 | ### 32 | 33 | # Now import all the configs specified by command-line, in left-to-right order 34 | for ((argpos=1; argpos<$#; argpos++)); do 35 | if [ "${!argpos}" == "--config" ]; then 36 | argpos_plus1=$((argpos+1)) 37 | config=${!argpos_plus1} 38 | [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1 39 | . $config # source the config file. 40 | fi 41 | done 42 | 43 | 44 | ### 45 | ### No we process the command line options 46 | ### 47 | while true; do 48 | [ -z "${1:-}" ] && break; # break if there are no arguments 49 | case "$1" in 50 | # If the enclosing script is called with --help option, print the help 51 | # message and exit. Scripts should put help messages in $help_message 52 | --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2; 53 | else printf "$help_message\n" 1>&2 ; fi; 54 | exit 0 ;; 55 | --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" 56 | exit 1 ;; 57 | # If the first command-line argument begins with "--" (e.g. --foo-bar), 58 | # then work out the variable name as $name, which will equal "foo_bar". 59 | --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; 60 | # Next we test whether the variable in question is undefned-- if so it's 61 | # an invalid option and we die. Note: $0 evaluates to the name of the 62 | # enclosing script. 63 | # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar 64 | # is undefined. We then have to wrap this test inside "eval" because 65 | # foo_bar is itself inside a variable ($name). 66 | eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; 67 | 68 | oldval="`eval echo \\$$name`"; 69 | # Work out whether we seem to be expecting a Boolean argument. 
70 | if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then 71 | was_bool=true; 72 | else 73 | was_bool=false; 74 | fi 75 | 76 | # Set the variable to the right value-- the escaped quotes make it work if 77 | # the option had spaces, like --cmd "queue.pl -sync y" 78 | eval $name=\"$2\"; 79 | 80 | # Check that Boolean-valued arguments are really Boolean. 81 | if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then 82 | echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 83 | exit 1; 84 | fi 85 | shift 2; 86 | ;; 87 | *) break; 88 | esac 89 | done 90 | 91 | 92 | # Check for an empty argument to the --cmd option, which can easily occur as a 93 | # result of scripting errors. 94 | [ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; 95 | 96 | 97 | true; # so this script returns exit code 0. 98 | -------------------------------------------------------------------------------- /baseline-small/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2017 Ubiqus (Author: Vincent Nguyen) 4 | # 5 | # License MIT 6 | # 7 | # This recipe shows how to build an openNMT translation model from French to English 8 | # based on 9 | # Global Voices 10 | # News Commentary v11 11 | # This script does not download the datasets, you need to drop the files in data/public 12 | # same for the test set newstest2014 13 | # making these variables to make replication easier for other languages 14 | 15 | sl=fr 16 | tl=en 17 | corpus[1]=data/public/News-Commentary11.en-fr.clean 18 | corpus[2]=data/public/GlobalVoices.en-fr.clean 19 | 20 | vocab_size=50000 21 | seq_len=50 22 | 23 | testset=newstest2014-fren 24 | 25 | use_bpe=false 26 | bpe_size=32000 27 | [ $use_bpe = false ] && bpe_model="" || bpe_model="-bpe_model data/train-$sl$tl.bpe32000" 28 | use_case=false 29 | [ $use_case = false ] && case_feat="" || case_feat="-case_feature" 30 | 31 | # TODO test is GPU is 
present or not 32 | CUDA_VISIBLE_DEVICES=0 33 | decode_cpu=false 34 | 35 | # Make symlinks to access OpenNMT scripts - change this line if needed 36 | OPENNMT_PATH=../../OpenNMT 37 | [ ! -h tools ] && ln -s $OPENNMT_PATH/tools tools 38 | [ ! -h preprocess.lua ] && ln -s $OPENNMT_PATH/preprocess.lua preprocess.lua 39 | [ ! -h train.lua ] && ln -s $OPENNMT_PATH/train.lua train.lua 40 | [ ! -h translate.lua ] && ln -s $OPENNMT_PATH/translate.lua translate.lua 41 | [ ! -h onmt ] && ln -s $OPENNMT_PATH/onmt onmt 42 | 43 | # this is usefull to skip some stages during step by step execution 44 | stage=0 45 | 46 | # if you want to run without training and use an existing model in the "exp" folder set notrain to true 47 | notrain=false 48 | 49 | # At the moment only "stage" option is available anyway 50 | . local/parse_options.sh 51 | 52 | function score_epoch { 53 | # convert sgm input into text file 54 | local/input-from-sgm.perl < data/public/test/$testset-src.$sl.sgm > data/$testset-src.$sl 55 | # tokenize the text file 56 | th tools/tokenize.lua $case_feat -mode aggressive -joiner_annotate $bpe_model < data/$testset-src.$sl > data/$testset-src.$sl.tok 57 | # translate the test set 58 | [ $decode_cpu = true ] && dec_opt="" || dec_opt="-gpuid 1" 59 | th translate.lua -replace_unk -disable_logs -model exp/model-$sl$tl"_epoch"$1"_"*.t7 \ 60 | -src data/$testset-src.$sl.tok \ 61 | -output exp/$testset-tgt.trans.$tl.tok $dec_opt 62 | # detokenize 63 | th tools/detokenize.lua $case_feat < exp/$testset-tgt.trans.$tl.tok \ 64 | > exp/$testset-tgt.trans.$tl 65 | # Wrap-xml to convert to sgm the translated text 66 | local/wrap-xml.perl $tl data/public/test/$testset-src.$sl.sgm tst \ 67 | < exp/$testset-tgt.trans.$tl > exp/$testset-tgt.trans.$tl.sgm 68 | # compute the bleu score 69 | local/mteval-v13a.pl -r data/public/test/$testset-ref.$tl.sgm \ 70 | -s data/public/test/$testset-src.$sl.sgm -t exp/$testset-tgt.trans.$tl.sgm \ 71 | -c > exp/nist-bleu-$testset-epoch-$1 72 | 73 | [ 
$decode_cpu = true ] && dec_opt="" || dec_opt="-gpuid 1" 74 | th translate.lua -replace_unk -disable_logs -model exp/model-$sl$tl"_epoch"$1"_"*".t7" \ 75 | -src data/valid.$sl.tok -output exp/valid.hyp.$tl.tok $dec_opt 76 | 77 | th tools/detokenize.lua $case_feat < exp/valid.hyp.$tl.tok > exp/valid.hyp.$tl.detok 78 | th tools/detokenize.lua $case_feat < data/valid.$tl.tok > exp/valid.$tl.detok 79 | local/multi-bleu.perl exp/valid.$tl.detok \ 80 | < exp/valid.hyp.$tl.detok > exp/generic_test_multibleu-detok-epoch$1.txt 81 | local/multi-bleu.perl data/valid.$tl.tok \ 82 | < exp/valid.hyp.$tl.tok > exp/generic_test_multibleu-tok-epoch$1.txt 83 | } 84 | 85 | 86 | if [ $stage -le 0 ]; then 87 | cd local 88 | if [ ! -f mteval-v13a.pl ]; then 89 | wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/generic/mteval-v13a.pl 90 | fi 91 | if [ ! -f input-from-sgm.perl ]; then 92 | wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/ems/support/input-from-sgm.perl 93 | fi 94 | if [ ! -f wrap-xml.perl ]; then 95 | wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/ems/support/wrap-xml.perl 96 | fi 97 | if [ ! -f multi-bleu.perl ]; then 98 | wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/generic/multi-bleu.perl 99 | fi 100 | if [ ! -f learn_bpe.py ]; then 101 | wget https://raw.githubusercontent.com/rsennrich/subword-nmt/master/learn_bpe.py 102 | fi 103 | cd .. 
104 | fi 105 | 106 | 107 | # Prepare Corpus, build BPE model, build dictionary 108 | if [ $stage -le 1 ]; then 109 | 110 | if $use_bpe; then 111 | echo "$0: tokenizing corpus for BPE modelling" 112 | for ((i=1; i<= ${#corpus[@]}; i++)) 113 | do 114 | for f in ${corpus[$i]}.$sl ${corpus[$i]}.$tl 115 | do 116 | file=$(basename $f) 117 | th tools/tokenize.lua -mode aggressive -nparallel 6 < $f > data/$file.rawtok 118 | done 119 | done 120 | cat data/*.rawtok | python local/learn_bpe.py -s $bpe_size > data/train-$sl$tl.bpe$bpe_size 121 | rm data/*.rawtok 122 | fi 123 | 124 | echo "$0: tokenizing corpus" 125 | for ((i=1; i<= ${#corpus[@]}; i++)) 126 | do 127 | for f in ${corpus[$i]}.$sl ${corpus[$i]}.$tl 128 | do 129 | file=$(basename $f) 130 | th tools/tokenize.lua -mode aggressive $case_feat -joiner_annotate -nparallel 6 \ 131 | $bpe_model < $f > data/$file.tok 132 | done 133 | done 134 | 135 | echo "$0: building dictionaries based on public and private data" 136 | cat data/*.$sl.tok > data/tempo.$sl.tok 137 | cat data/*.$tl.tok > data/tempo.$tl.tok 138 | th tools/build_vocab.lua -data data/tempo.$sl.tok -save_vocab data/dict.$sl -vocab_size $vocab_size 139 | th tools/build_vocab.lua -data data/tempo.$tl.tok -save_vocab data/dict.$tl -vocab_size $vocab_size 140 | rm data/tempo.??.tok 141 | 142 | echo "$0: preparing public and private training sets" 143 | for ((i=1; i<= ${#corpus[@]}; i++)) 144 | do 145 | file=$(basename ${corpus[$i]}.$sl) 146 | cat data/$file.tok >> data/train-full.$sl.tok 147 | file=$(basename ${corpus[$i]}.$tl) 148 | cat data/$file.tok >> data/train-full.$tl.tok 149 | done 150 | 151 | local/testset.pl -n 2000 -o data/valid.$sl.tok -h data/train.$sl.tok < data/train-full.$sl.tok > lines-tmp.txt 152 | local/lineextract.pl lines-tmp.txt < data/train-full.$tl.tok > data/valid.$tl.tok 153 | local/heldextract.pl lines-tmp.txt < data/train-full.$tl.tok > data/train.$tl.tok 154 | rm data/train-full.*.tok 155 | rm lines-tmp.txt 156 | 157 | fi 158 | 159 | 160 
| # Preprocess the data - decide here the vocabulary size 50000 default value 161 | if [ $stage -le 2 ]; then 162 | mkdir -p exp 163 | echo "$0: preprocessing corpus" 164 | th preprocess.lua -src_vocab_size $vocab_size -tgt_vocab_size $vocab_size \ 165 | -src_seq_length $seq_len -tgt_seq_length $seq_len \ 166 | -train_src data/train.$sl.tok -train_tgt data/train.$tl.tok \ 167 | -valid_src data/valid.$sl.tok -valid_tgt data/valid.$tl.tok \ 168 | -src_vocab data/dict.$sl.dict -tgt_vocab data/dict.$tl.dict \ 169 | -save_data exp/data-$sl$tl 170 | fi 171 | 172 | # Train the model !!!! even if OS cuda device ID is 0 you need -gpuid=1 173 | # Decide here the number of epochs, learning rate, which epoch to start decay, decay rate 174 | # if you change number of epochs do not forget to change the model name too 175 | 176 | # Train on corpus 177 | 178 | if [ $stage -le 3 ]; then 179 | learning_rate=1 180 | start_decay_at=6 181 | learning_rate_decay=0.5 182 | echo "$0: training public corpus starting, will take a while." 
183 | # train first epoch 184 | th train.lua -layers 2 -rnn_size 512 -data exp/data-$sl$tl-train.t7 \ 185 | -save_model exp/model-$sl$tl -dropout 0.3 -report_every 500 -word_vec_size 512 \ 186 | -start_epoch 1 -end_epoch 1 -max_batch_size 32 \ 187 | -learning_rate $learning_rate -start_decay_at $start_decay_at \ 188 | -learning_rate_decay $learning_rate_decay -gpuid 1 189 | # score it -sample 50000 -sample_tgt_vocab -sample_type partition 190 | score_epoch 1 191 | # th tools/release_model.lua -force -model exp/model-$sl$tl"_epoch1_"*".t7" \ 192 | # -output_model exp/modelcpu-$sl$tl"_epoch1.t7" -gpuid 1 193 | 194 | for epoch in 2 3 4 5 6 7 8 9 10 195 | do 196 | prev_epoch=$(expr $epoch - 1) 197 | [ $epoch -ge $start_decay_at ] && \ 198 | learning_rate=`awk 'BEGIN{printf("%0.4f", '$learning_rate' * '$learning_rate_decay')}'` 199 | th train.lua -rnn_size 512 -train_from exp/model-$sl$tl"_epoch"$prev_epoch"_"*".t7" \ 200 | -data exp/data-$sl$tl-train.t7 \ 201 | -save_model exp/model-$sl$tl -report_every 500 -word_vec_size 512 \ 202 | -start_epoch $epoch -end_epoch $epoch -max_batch_size 32 \ 203 | -learning_rate $learning_rate -start_decay_at $start_decay_at \ 204 | -learning_rate_decay $learning_rate_decay -gpuid 1 205 | # score it 206 | score_epoch $epoch 207 | # th tools/release_model.lua -force -model exp/model-$sl$tl"_epoch"$epoch"_"*".t7" \ 208 | # -output_model exp/modelcpu-$sl$tl"_epoch"$epoch".t7" -gpuid 1 209 | done 210 | fi 211 | -------------------------------------------------------------------------------- /romance-multi-way/local/parse_options.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2012 Johns Hopkins University (Author: Daniel Povey); 4 | # Arnab Ghoshal, Karel Vesely 5 | 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 14 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 15 | # MERCHANTABLITY OR NON-INFRINGEMENT. 16 | # See the Apache 2 License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | 20 | # Parse command-line options. 21 | # To be sourced by another script (as in ". parse_options.sh"). 22 | # Option format is: --option-name arg 23 | # and shell variable "option_name" gets set to value "arg." 24 | # The exception is --help, which takes no arguments, but prints the 25 | # $help_message variable (if defined). 26 | 27 | 28 | ### 29 | ### The --config file options have lower priority to command line 30 | ### options, so we need to import them first... 31 | ### 32 | 33 | # Now import all the configs specified by command-line, in left-to-right order 34 | for ((argpos=1; argpos<$#; argpos++)); do 35 | if [ "${!argpos}" == "--config" ]; then 36 | argpos_plus1=$((argpos+1)) 37 | config=${!argpos_plus1} 38 | [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1 39 | . $config # source the config file. 40 | fi 41 | done 42 | 43 | 44 | ### 45 | ### No we process the command line options 46 | ### 47 | while true; do 48 | [ -z "${1:-}" ] && break; # break if there are no arguments 49 | case "$1" in 50 | # If the enclosing script is called with --help option, print the help 51 | # message and exit. Scripts should put help messages in $help_message 52 | --help|-h) if [ -z "$help_message" ]; then echo "No help found." 
1>&2; 53 | else printf "$help_message\n" 1>&2 ; fi; 54 | exit 0 ;; 55 | --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" 56 | exit 1 ;; 57 | # If the first command-line argument begins with "--" (e.g. --foo-bar), 58 | # then work out the variable name as $name, which will equal "foo_bar". 59 | --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; 60 | # Next we test whether the variable in question is undefned-- if so it's 61 | # an invalid option and we die. Note: $0 evaluates to the name of the 62 | # enclosing script. 63 | # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar 64 | # is undefined. We then have to wrap this test inside "eval" because 65 | # foo_bar is itself inside a variable ($name). 66 | eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; 67 | 68 | oldval="`eval echo \\$$name`"; 69 | # Work out whether we seem to be expecting a Boolean argument. 70 | if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then 71 | was_bool=true; 72 | else 73 | was_bool=false; 74 | fi 75 | 76 | # Set the variable to the right value-- the escaped quotes make it work if 77 | # the option had spaces, like --cmd "queue.pl -sync y" 78 | eval $name=\"$2\"; 79 | 80 | # Check that Boolean-valued arguments are really Boolean. 81 | if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then 82 | echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 83 | exit 1; 84 | fi 85 | shift 2; 86 | ;; 87 | *) break; 88 | esac 89 | done 90 | 91 | 92 | # Check for an empty argument to the --cmd option, which can easily occur as a 93 | # result of scripting errors. 94 | [ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; 95 | 96 | 97 | true; # so this script returns exit code 0. 
-------------------------------------------------------------------------------- /romance-multi-way/run.sh: --------------------------------------------------------------------------------
#!/bin/bash
#
# Copyright 2017 Ubiqus (Author: Vincent Nguyen)
#                Systran (Author: Jean Senellart)
# License MIT
#
# This recipe shows how to build an OpenNMT translation model for Romance
# multi-way languages, based on 200 000 parallel sentences for each pair.
#
# Based on the tuto from the OpenNMT forum.


# TODO: test whether a GPU is present or not
CUDA_VISIBLE_DEVICES=0
decode_cpu=false

# Make symlinks to access the OpenNMT scripts - change this line if needed
OPENNMT_PATH=../../OpenNMT
[ ! -h tools ] && ln -s "$OPENNMT_PATH/tools" tools
[ ! -h preprocess.lua ] && ln -s "$OPENNMT_PATH/preprocess.lua" preprocess.lua
[ ! -h train.lua ] && ln -s "$OPENNMT_PATH/train.lua" train.lua
[ ! -h translate.lua ] && ln -s "$OPENNMT_PATH/translate.lua" translate.lua
[ ! -h onmt ] && ln -s "$OPENNMT_PATH/onmt" onmt

# This is useful to skip some stages during step-by-step execution.
stage=0

# If you want to run without training and use an existing model in the "exp"
# folder, set notrain to true.
notrain=false

# At the moment only the "stage" option is available anyway.
. local/parse_options.sh

# Data download and preparation

if [ "$stage" -le 0 ]; then
  # TODO: put this part in a local/download_data.sh script?
  mkdir -p data
  cd data
  if [ ! -f multi-esfritptro-parallel.tgz ]; then
    echo "$0: downloading the baseline corpus from amazon s3"
    # Abort early if the download fails, rather than letting tar fail on a
    # missing archive.
    wget https://s3.amazonaws.com/opennmt-trainingdata/multi-esfritptro-parallel.tgz \
      || { echo "$0: corpus download failed" 1>&2; exit 1; }
    tar xzfv multi-esfritptro-parallel.tgz
  fi
  cd ../local
  # Fetch the Moses evaluation/wrapping helpers and the subword-nmt BPE
  # trainer, only if not already present.
  if [ ! -f mteval-v13a.pl ]; then
    wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/generic/mteval-v13a.pl
  fi
  if [ ! -f input-from-sgm.perl ]; then
    wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/ems/support/input-from-sgm.perl
  fi
  if [ ! -f wrap-xml.perl ]; then
    wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/ems/support/wrap-xml.perl
  fi
  if [ ! -f multi-bleu.perl ]; then
    wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/generic/multi-bleu.perl
  fi
  if [ ! -f learn_bpe.py ]; then
    wget https://raw.githubusercontent.com/rsennrich/subword-nmt/master/learn_bpe.py
  fi
  cd ..
fi

# Tokenize and prepare the corpus
if [ "$stage" -le 1 ]; then
  echo "$0: tokenizing corpus"
  for f in data/train*.?? ; do th tools/tokenize.lua < "$f" > "$f.rawtok" ; done
  cat data/train*.rawtok | python local/learn_bpe.py -s 32000 > data/esfritptro.bpe32000
  # Re-tokenize everything with case features, joiner annotation and the BPE
  # model learned above.  NOTE: the original flag was misspelled
  # "-nparrallel"; tokenize.lua's option is "-nparallel".
  for f in data/*-????.?? ; do
    th tools/tokenize.lua -case_feature -joiner_annotate -nparallel 4 \
      -bpe_model data/esfritptro.bpe32000 < "$f" > "$f.tok"
  done
  # -f: the multi-way files do not exist yet on a fresh run.
  for set in train valid test ; do rm -f data/$set-multi.???.tok ; done
  for src in es fr it pt ro ; do
    for tgt in es fr it pt ro ; do
      # Prefix every source sentence with a target-language token so one
      # model can translate into any of the five languages.
      [ "$src" != "$tgt" ] && perl -i.bak -pe "s//__opt_tgt_$tgt\xEF\xBF\xA8N /" data/*-$src$tgt.$src.tok
      for set in train valid test ; do
        [ "$src" != "$tgt" ] && cat data/$set-$src$tgt.$src.tok >> data/$set-multi.src.tok
        [ "$src" != "$tgt" ] && cat data/$set-$src$tgt.$tgt.tok >> data/$set-multi.tgt.tok
      done
    done
  done
  # Shuffle source/target jointly and keep 2000 validation pairs.
  paste data/valid-multi.src.tok data/valid-multi.tgt.tok | shuf > data/valid-multi.srctgt.tok
  head -2000 data/valid-multi.srctgt.tok | cut -f1 > data/valid-multi2000.src.tok
  head -2000 data/valid-multi.srctgt.tok | cut -f2 > data/valid-multi2000.tgt.tok
fi

# Preprocess the data - decide here the vocabulary size, 50000 default value
if [ "$stage" -le 2 ]; then
  mkdir -p exp
  echo "$0: preprocessing corpus"
  th preprocess.lua -src_vocab_size 50000 -tgt_vocab_size 50000 \
    -train_src data/train-multi.src.tok -train_tgt data/train-multi.tgt.tok \
    -valid_src data/valid-multi2000.src.tok -valid_tgt data/valid-multi2000.tgt.tok \
    -save_data exp/model-multi
fi

# Train the model !!!! even if the OS CUDA device ID is 0 you need -gpuid 1
# Decide here the number of epochs, learning rate, which epoch to start decay,
# decay rate.  If you change the number of epochs, do not forget to change the
# model name too.
# This example has a smaller topology compared to the tuto for faster training
# (worse results).
if [ "$stage" -le 3 ]; then
  if [ "$notrain" = false ]; then
    echo "$0: training starting, will take a while."
    th train.lua -layers 2 -rnn_size 500 -brnn -word_vec_size 600 \
      -end_epoch 13 -learning_rate 1 -start_decay_at 5 -learning_rate_decay 0.65 \
      -data exp/model-multi-train.t7 -save_model exp/model-multi-2-500-600 -gpuid 1
    cp -f exp/model-multi-2-500-600"_epoch13_"*".t7" exp/model-multi-2-500-600"_final.t7"
  else
    echo "$0: using an existing model"
    if [ ! -f exp/model-multi-2-500-600"_final.t7" ]; then
      # Fixed message: was "mode file does not exist".
      echo "$0: model file does not exist" 1>&2
      exit 1
    fi
  fi
fi

# Deploy model for CPU usage
if [ "$stage" -le 4 ]; then
  if [ "$decode_cpu" = true ]; then
    th tools/release_model.lua -force -model exp/model-multi-2-500-600"_final.t7" \
      -output_model exp/model-multi-2-500-600"_cpu.t7" -gpuid 1
  fi
fi

# Translate using gpu
# You can change this by changing the model name from _final to _cpu and
# removing "-gpuid 1".
if [ "$stage" -le 5 ]; then
  [ "$decode_cpu" = true ] && dec_opt="" || dec_opt="-gpuid 1"
  for src in es fr it pt ro ; do
    for tgt in es fr it pt ro ; do
      [ "$src" != "$tgt" ] && th translate.lua -replace_unk -model exp/model-multi-2-500-600"_final"*".t7" \
        -src data/test-$src$tgt.$src.tok -output exp/test-$src$tgt.hyp.$tgt.tok $dec_opt
    done
  done
fi

# Evaluate the generic test set with multi-bleu
if [ "$stage" -le 6 ]; then
  for src in es fr it pt ro ; do
    for tgt in es fr it pt ro ; do
      [ "$src" != "$tgt" ] && local/multi-bleu.perl data/test-$src$tgt.$tgt.tok \
        < exp/test-$src$tgt.hyp.$tgt.tok > exp/test-$src$tgt"_multibleu".txt
    done
  done
  grep BLEU exp/*multibleu.txt
fi


--------------------------------------------------------------------------------