├── .gitignore ├── LICENSE ├── PhD-thesis.pdf ├── README.md ├── config ├── AMU │ ├── XL.yaml │ ├── XXL.yaml │ ├── avg.py │ ├── eval.sh │ ├── large.yaml │ ├── medium.yaml │ ├── post-process.sh │ ├── prepare.sh │ └── small.yaml ├── APE │ ├── clean-raw-data.sh │ ├── eval.sh │ ├── large │ │ ├── chained.yaml │ │ ├── forced.yaml │ │ ├── global.yaml │ │ ├── multi-global.yaml │ │ └── multi.yaml │ ├── medium │ │ ├── chained.yaml │ │ ├── forced.yaml │ │ ├── global.yaml │ │ ├── multi-global.yaml │ │ └── multi.yaml │ ├── prepare.sh │ └── small │ │ ├── chained.yaml │ │ ├── forced.yaml │ │ ├── global.yaml │ │ ├── multi-global.yaml │ │ └── multi.yaml ├── BTEC │ ├── ASR.yaml │ ├── AST.yaml │ ├── MT.yaml │ ├── Multi-Task-joint.yaml │ ├── Multi-Task.yaml │ ├── README.md │ ├── prepare.sh │ └── voxygen │ │ ├── convert-to-audio.sh │ │ └── wsclient.py ├── IWSLT14 │ ├── BPE-TED.yaml │ ├── BPE.yaml │ ├── BPE2char-TED.yaml │ ├── BPE2char.yaml │ ├── Back-Translation │ │ ├── baseline-TED.yaml │ │ ├── char-level-TED.yaml │ │ ├── decode.sh │ │ ├── eval.sh │ │ ├── prepare.sh │ │ ├── split.sh │ │ ├── subwords-TED.yaml │ │ └── train.sh │ ├── prepare-TED.sh │ ├── prepare-lexicon.sh │ ├── prepare-mixer.sh │ ├── prepare.sh │ └── train-SMT.sh ├── LibriSpeech │ ├── ASR.yaml │ ├── AST.yaml │ ├── MT.yaml │ ├── Multi-Task.yaml │ ├── README.md │ ├── model-outputs.tar.xz │ ├── prepare-raw.sh │ └── prepare.sh ├── WMT14 │ ├── RNNsearch-Adam.yaml │ ├── RNNsearch-BPE.yaml │ ├── RNNsearch.yaml │ ├── download.sh │ ├── prepare-lexicon.sh │ └── prepare.sh └── default.yaml ├── install.sh ├── run-tests.py ├── scripts ├── bpe │ ├── apply_bpe.py │ ├── bpe_toy.py │ ├── chrF.py │ ├── concat-bpe.py │ ├── get_vocab.py │ ├── learn_bpe.py │ ├── learn_joint_bpe_and_vocab.py │ └── segment-char-ngrams.py ├── config-diff.sh ├── copy-model.py ├── coverage.py ├── decode-moses.sh ├── extract-lexicon.py ├── get-best-score.py ├── join.py ├── moses │ ├── clean-corpus-n.perl │ ├── deescape-special-chars.perl │ ├── detokenizer.perl │ ├── detruecase.perl │ ├── escape-special-chars.perl │ ├── lowercase.perl │ ├── multi-bleu.perl │ ├── nonbreaking_prefixes │ │ ├── nonbreaking_prefix.de │ │ ├── nonbreaking_prefix.el │ │ ├── nonbreaking_prefix.en │ │ ├── nonbreaking_prefix.es │ │ └── nonbreaking_prefix.fr │ ├── normalize-punctuation.perl │ ├── strip-xml.perl │ ├── tokenizer.perl │ ├── train-truecaser.perl │ ├── truecase.perl │ └── wrap-xml.perl ├── multi-print.py ├── paired-eval.py ├── plot-loss.py ├── plot-score-per-length.py ├── post_editing │ ├── apply-edits.py │ ├── extract-edits.py │ ├── extract-ter-vectors.py │ ├── noisify.py │ ├── plot-ops.py │ ├── plot-ter.py │ ├── reverse-edits.py │ ├── select-by-index.py │ ├── select-by-length.py │ ├── select-by-ter.py │ ├── stats-TER.py │ ├── ter-stats.py │ ├── to-sgm.py │ └── well-formed.py ├── prepare-data.py ├── reverse.py ├── score.py ├── shuf-corpus.py ├── speech │ ├── cat.py │ ├── convert.py │ ├── extract-new.py │ ├── extract.py │ ├── head.py │ ├── python_speech_features │ │ ├── __init__.py │ │ ├── base.py │ │ └── sigproc.py │ └── shuf.py ├── split-corpus.py ├── stats-bleu.py ├── stats.py ├── tercom.jar ├── train-moses.sh └── vocab-stats.py ├── seq2seq.sh └── translate ├── __init__.py ├── __main__.py ├── beam_search.py ├── conv_lstm.py ├── evaluation.py ├── models.py ├── multitask_model.py ├── rnn.py ├── seq2seq_model.py ├── translation_model.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | 
*.py[cod] 4 | *$py.class 5 | *.so 6 | 7 | # Distribution / packaging 8 | .Python 9 | env/ 10 | build/ 11 | develop-eggs/ 12 | dist/ 13 | downloads/ 14 | eggs/ 15 | .eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | *.log 26 | .ipynb_checkpoints 27 | 28 | data_* 29 | data 30 | tests 31 | raw_data 32 | tmp/ 33 | .idea/ 34 | models 35 | model 36 | .spyderproject 37 | wsclient.cred 38 | *.svg 39 | *.png 40 | -------------------------------------------------------------------------------- /PhD-thesis.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alex-berard/seq2seq/1b5c6bf19a39ef27c059811b85a061f20d68ac32/PhD-thesis.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # seq2seq 2 | Attention-based sequence to sequence learning 3 | 4 | ## Dependencies 5 | 6 | * [TensorFlow 1.2+ for Python 3](https://www.tensorflow.org/get_started/os_setup.html) 7 | * YAML and Matplotlib modules for Python 3: `sudo apt-get install python3-yaml python3-matplotlib` 8 | * A recent NVIDIA GPU 9 | 10 | ## How to use 11 | 12 | 13 | Train a model (CONFIG is a YAML configuration file, such as `config/default.yaml`): 14 | 15 | ./seq2seq.sh CONFIG --train -v 16 | 17 | 18 | Translate text using an existing model: 19 | 20 | ./seq2seq.sh CONFIG --decode FILE_TO_TRANSLATE --output OUTPUT_FILE 21 | or for interactive decoding: 22 | 23 | ./seq2seq.sh CONFIG --decode 24 | 25 | #### Example English→French model 26 | This is the same model and dataset as [Bahdanau et al. 2015](https://arxiv.org/abs/1409.0473). 27 | 28 | config/WMT14/download.sh # download WMT14 data into raw_data/WMT14 29 | config/WMT14/prepare.sh # preprocess the data, and copy the files to data/WMT14 30 | ./seq2seq.sh config/WMT14/baseline.yaml --train -v # train a baseline model on this data 31 | 32 | You should get similar BLEU scores as these (our model was trained on a single Titan X I for about 4 days). 33 | 34 | | Dev | Test | +beam | Steps | Time | 35 | |:-----:|:-----:|:-----:|:-----:|:----:| 36 | | 25.04 | 28.64 | 29.22 | 240k | 60h | 37 | | 25.25 | 28.67 | 29.28 | 330k | 80h | 38 | 39 | Download this model [here](https://drive.google.com/file/d/1Qe4yZTYSTF-mlRlP_NTFGwXgacZnBwdp/view?usp=sharing). To use this model, just extract the archive into the `seq2seq/models` folder, and run: 40 | 41 | ./seq2seq.sh models/WMT14/config.yaml --decode -v 42 | 43 | #### Example German→English model 44 | This is the same dataset as [Ranzato et al. 2015](https://arxiv.org/abs/1511.06732). 45 | 46 | config/IWSLT14/prepare.sh 47 | ./seq2seq.sh config/IWSLT14/baseline.yaml --train -v 48 | 49 | | Dev | Test | +beam | Steps | 50 | |:-----:|:-----:|:-----:|:-----:| 51 | | 28.32 | 25.33 | 26.74 | 44k | 52 | 53 | The model is available for download [here](https://drive.google.com/file/d/1qCL3ZRxZ13fC45f74Nt6qiQ8tVAYFF9H/view?usp=sharing). 54 | 55 | ## Audio pre-processing 56 | If you want to use the toolkit for Automatic Speech Recognition (ASR) or Automatic Speech Translation (AST), then you'll need to pre-process your audio files accordingly. 57 | This [README](https://github.com/eske/seq2seq/tree/master/config/BTEC) details how it can be done. You'll need to install the **Yaafe** library, and use `scripts/speech/extract-audio-features.py` to extract MFCCs from a set of wav files. 
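As a rough illustration of what that feature-extraction step produces, here is a minimal sketch that computes MFCC features with the `python_speech_features` package bundled under `scripts/speech/`, and stores one array per utterance in a single `.npz` archive. This is not the Yaafe-based `extract-audio-features.py` pipeline: the 41-dimensional features (chosen only to mirror `embedding_size: 41` in the BTEC configs) and the `.npz` layout are assumptions, and the authoritative format is whatever `scripts/speech/extract.py` and its companions produce.

~~~
#!/usr/bin/env python3
"""Minimal MFCC-extraction sketch -- NOT the project's extract-audio-features.py.

Assumptions: 16 kHz mono wav files, 40 cepstral coefficients plus log-energy
(41 dimensions, mirroring `embedding_size: 41` in config/BTEC/ASR.yaml), and
one float32 array per utterance saved into a single .npz archive.
"""
import glob
import os
import sys

import numpy as np
from scipy.io import wavfile
from python_speech_features import mfcc  # bundled under scripts/speech/


def extract(wav_path):
    rate, signal = wavfile.read(wav_path)
    # 25 ms analysis windows with a 10 ms shift: one feature vector every 10 ms
    return mfcc(signal, samplerate=rate, winlen=0.025, winstep=0.01,
                numcep=41, nfilt=41, appendEnergy=True).astype(np.float32)


if __name__ == '__main__':
    wav_dir, output_npz = sys.argv[1], sys.argv[2]
    features = {os.path.splitext(os.path.basename(path))[0]: extract(path)
                for path in sorted(glob.glob(os.path.join(wav_dir, '*.wav')))}
    np.savez(output_npz, **features)
~~~

An invocation like `python3 extract-mfcc-sketch.py raw_data/BTEC/wav data/BTEC/train.concat.npz` is purely hypothetical; check the scripts under `scripts/speech/` for the format the trainer actually expects.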
58 | 59 | ## Features 60 | * **YAML configuration files** 61 | * **Beam-search decoder** 62 | * **Ensemble decoding** 63 | * **Multiple encoders** 64 | * **Hierarchical encoder** 65 | * **Bidirectional encoder** 66 | * **Local attention model** 67 | * **Convolutional attention model** 68 | * **Detailed logging** 69 | * **Periodic BLEU evaluation** 70 | * **Periodic checkpoints** 71 | * **Multi-task training:** train on several tasks at once (e.g. French->English and German->English MT) 72 | * **Subword training and decoding** 73 | * **Input binary features instead of text** 74 | * **Pre-processing script:** we provide a fully-featured Python script for data pre-processing (vocabulary creation, lowercasing, tokenizing, splitting, etc.) 75 | * **Dynamic RNNs:** we use symbolic loops instead of statically unrolled RNNs. This means that we don't need to manually configure bucket sizes, and that model creation is much faster. 76 | 77 | ## Credits 78 | 79 | * This project is based on [TensorFlow's reference implementation](https://www.tensorflow.org/tutorials/seq2seq) 80 | * We include some of the pre-processing scripts from [Moses](http://www.statmt.org/moses/) 81 | * The scripts for subword units come from [github.com/rsennrich/subword-nmt](https://github.com/rsennrich/subword-nmt) 82 | -------------------------------------------------------------------------------- /config/AMU/XL.yaml: -------------------------------------------------------------------------------- 1 | label: "ENCDEC-MCGRU" 2 | description: "AMU 2017 Multi-Encoder Cond-GRU Model - 4M + 500k + 12k train set" 3 | 4 | cell_size: 1024 5 | attn_size: 2048 6 | embedding_size: 512 7 | cell_type: GRU 8 | 9 | data_dir: data/AMU 10 | max_len: 50 11 | model_dir: models/AMU/XL 12 | train_prefix: train.XL 13 | vocab_prefix: vocab.XL 14 | dev_prefix: dev.XL 15 | 16 | steps_per_checkpoint: 10000 17 | steps_per_eval: 10000 18 | score_function: corpus_scores_ter 19 | keep_best: 8 20 | 21 | optimizer: adam 22 | learning_rate: 0.0001 23 | batch_size: 64 24 | batch_mode: standard 25 | read_ahead: 100 26 | max_gradient_norm: 1.0 27 | max_epochs: 12 28 | 29 | attention_type: global 30 | final_state: average 31 | 32 | weight_scale: 0.01 33 | 34 | use_dropout: True 35 | pervasive_dropout: True 36 | rnn_input_dropout: 0.2 37 | rnn_output_dropout: 0.2 38 | attn_dropout: 0.2 39 | word_dropout: 0.2 40 | initial_state_dropout: 0.2 41 | 42 | train_initial_states: False 43 | 44 | encoders: 45 | - name: mt 46 | - name: src 47 | 48 | decoders: 49 | - name: pe 50 | conditional_rnn: True 51 | pred_deep_layer: True 52 | 53 | ref_ext: pe.ref 54 | 55 | post_process_script: config/AMU/post-process.sh 56 | -------------------------------------------------------------------------------- /config/AMU/XXL.yaml: -------------------------------------------------------------------------------- 1 | label: "ENCDEC-MCGRU" 2 | description: "AMU 2017 Multi-Encoder Cond-GRU Model - 4M + 500k + 23k train set" 3 | 4 | cell_size: 1024 5 | attn_size: 2048 6 | embedding_size: 512 7 | cell_type: GRU 8 | 9 | data_dir: data/AMU 10 | max_len: 50 11 | model_dir: models/AMU/XXL 12 | train_prefix: train.XXL 13 | vocab_prefix: vocab.XXL 14 | dev_prefix: dev.XL 15 | 16 | steps_per_checkpoint: 10000 17 | steps_per_eval: 10000 18 | score_function: corpus_scores_ter 19 | keep_best: 8 20 | 21 | optimizer: adam 22 | learning_rate: 0.0001 23 | batch_size: 64 24 | batch_mode: standard 25 | read_ahead: 100 26 | max_gradient_norm: 1.0 27 | max_epochs: 12 28 | 29 | attention_type: global 30 | 
final_state: average 31 | 32 | weight_scale: 0.01 33 | 34 | use_dropout: True 35 | pervasive_dropout: True 36 | rnn_input_dropout: 0.2 37 | rnn_output_dropout: 0.2 38 | attn_dropout: 0.2 39 | word_dropout: 0.2 40 | initial_state_dropout: 0.2 41 | 42 | train_initial_states: False 43 | 44 | encoders: 45 | - name: mt 46 | - name: src 47 | 48 | decoders: 49 | - name: pe 50 | conditional_rnn: True 51 | pred_deep_layer: True 52 | 53 | ref_ext: pe.ref 54 | 55 | post_process_script: config/AMU/post-process.sh 56 | -------------------------------------------------------------------------------- /config/AMU/avg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import re 5 | import math 6 | from collections import defaultdict 7 | 8 | stats = defaultdict(list) 9 | 10 | for line in sys.stdin: 11 | for s in re.findall(r'[^\s]*=\d+\.?\d*', line): 12 | key, value = s.split('=') 13 | stats[key].append(float(value)) 14 | 15 | keys = ['ter', 'bleu', 'bleu1', 'wer'] 16 | def sort_key(item): 17 | key, _ = item 18 | if key in keys: 19 | return keys.index(key) 20 | else: 21 | return len(keys) 22 | 23 | new_stats = [] 24 | for key, values in sorted(stats.items(), key=sort_key): 25 | mean = sum(values) / len(values) 26 | stdev = math.sqrt(sum((x - mean) ** 2 for x in values) / (len(values) - 1)) 27 | new_stats.append((key, mean, stdev)) 28 | 29 | print('\n'.join('{:<7} {:6.2f} ({:.2f})'.format(*data) for data in new_stats)) 30 | -------------------------------------------------------------------------------- /config/AMU/eval.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | size=$1 4 | gpu_id=$2 5 | 6 | root_dir=models/AMU 7 | eval_dir=${root_dir}/eval_${size} 8 | log_file=${eval_dir}/log.txt 9 | 10 | rm -f ${log_file} 11 | rm -rf ${eval_dir} 12 | mkdir -p ${eval_dir} 13 | 14 | for index in 1 2 3 4 15 | do 16 | model=${size}.${index} 17 | model_dir=${root_dir}/${model} 18 | rm -rf ${model_dir}.avg 19 | checkpoints=`find ${model_dir}/checkpoints/best-* -printf "%f\n" | cut -d'.' 
-f1,1 | sort | uniq | cut -d'-' -f2,2 | xargs printf " %s|" | sed s/\|$//` 20 | checkpoints=`cat ${model_dir}/checkpoints/scores.txt | grep -P "${checkpoints}" | sed s/-// | sort -g | head -n4 | cut -d' ' -f2,2 | xargs printf "${model_dir}/checkpoints/best-%s "` 21 | echo ${checkpoints} 22 | ./seq2seq.sh ${model_dir}/config.yaml --average --checkpoints ${checkpoints} --save --model-dir ${model_dir}.avg --no-gpu >/dev/null 2>&1 23 | rename "s/translate-[0-9]*/average/" ${model_dir}.avg/checkpoints/translate-* 24 | mv ${model_dir}.avg/checkpoints/average.* ${model_dir}/checkpoints/ 25 | rm -rf ${model_dir}.avg 26 | done 27 | 28 | function header { 29 | printf "%s %-40s" `date +"%H:%M:%S"` $1 >> ${log_file} 30 | } 31 | 32 | function filter { 33 | tail -n1 | grep -Po "(ter|bleu1|bleu|wer|penalty|ratio)=[0-9]*.?[0-9]*" | xargs printf "%s " | sed "s/ $/\n/" >> ${log_file} 34 | } 35 | 36 | for beam_size in 12 1 37 | do 38 | for corpus in dev test test.2017 39 | do 40 | if [ ${size} = medium ] 41 | then 42 | eval_corpus=${corpus} 43 | else 44 | eval_corpus=${corpus}.${size} 45 | fi 46 | 47 | for index in 1 2 3 4 48 | do 49 | model=${size}.${index} 50 | model_dir=${root_dir}/${model} 51 | 52 | output=${corpus}.${model}.beam${beam_size} 53 | header ${output} 54 | ./seq2seq.sh ${model_dir}/config.yaml --eval ${eval_corpus} --beam-size ${beam_size} --gpu-id ${gpu_id} --raw-output --output ${eval_dir}/${output}.raw 2>&1 | filter 55 | config/AMU/post-process.sh < ${eval_dir}/${output}.raw > ${eval_dir}/${output}.out 56 | 57 | output=${corpus}.${model}.avg.beam${beam_size} 58 | header ${output} 59 | ./seq2seq.sh ${model_dir}/config.yaml --eval ${eval_corpus} --beam-size ${beam_size} --checkpoints ${model_dir}/checkpoints/average --gpu-id ${gpu_id} --raw-output --output ${eval_dir}/${output}.raw 2>&1 | filter 60 | config/AMU/post-process.sh < ${eval_dir}/${output}.raw > ${eval_dir}/${output}.out 61 | done 62 | 63 | output=${corpus}.${size}.ensemble.beam${beam_size} 64 | header ${output} 65 | ./seq2seq.sh ${root_dir}/${size}.1/config.yaml --eval ${eval_corpus} --beam-size ${beam_size} --ensemble --checkpoints ${root_dir}/${size}.{1,2,3,4}/checkpoints/best --gpu-id ${gpu_id} --raw-output --output ${eval_dir}/${output}.raw 2>&1 | filter 66 | config/AMU/post-process.sh < ${eval_dir}/${output}.raw > ${eval_dir}/${output}.out 67 | 68 | output=${corpus}.${size}.ensemble.avg.beam${beam_size} 69 | header ${output} 70 | ./seq2seq.sh ${root_dir}/${size}.1/config.yaml --eval ${eval_corpus} --beam-size ${beam_size} --ensemble --checkpoints ${root_dir}/${size}.{1,2,3,4}/checkpoints/average --gpu-id ${gpu_id} --raw-output --output ${eval_dir}/${output}.raw 2>&1 | filter 71 | config/AMU/post-process.sh < ${eval_dir}/${output}.raw > ${eval_dir}/${output}.out 72 | done 73 | done 74 | -------------------------------------------------------------------------------- /config/AMU/large.yaml: -------------------------------------------------------------------------------- 1 | label: "ENCDEC-MCGRU LARGE" 2 | description: "AMU 2017 Multi-Encoder Cond-GRU Model - 500k + 23k train set" 3 | 4 | cell_size: 512 5 | attn_size: 1024 6 | embedding_size: 256 7 | cell_type: GRU 8 | 9 | data_dir: data/AMU 10 | max_len: 50 11 | model_dir: models/AMU/large 12 | train_prefix: train.large 13 | vocab_prefix: vocab.large 14 | dev_prefix: dev.large 15 | 16 | steps_per_checkpoint: 1000 17 | steps_per_eval: 1000 18 | keep_best: 4 19 | score_function: corpus_scores_ter 20 | 21 | batch_size: 32 22 | max_gradient_norm: 1.0 23 | max_steps: 150000 24 | 25 
| attention_type: global 26 | final_state: average 27 | 28 | weight_scale: 0.01 29 | 30 | use_dropout: True 31 | pervasive_dropout: True 32 | rnn_input_dropout: 0.4 33 | rnn_output_dropout: 0.4 34 | word_dropout: 0.2 35 | 36 | train_initial_states: False 37 | 38 | encoders: 39 | - name: de 40 | ext: mt 41 | - name: src 42 | 43 | decoders: 44 | - name: de 45 | ext: pe 46 | conditional_rnn: True 47 | pred_deep_layer: False 48 | pred_embed_proj: False 49 | tie_embeddings: False 50 | 51 | ref_ext: pe.ref 52 | 53 | post_process_script: config/AMU/post-process.sh 54 | -------------------------------------------------------------------------------- /config/AMU/medium.yaml: -------------------------------------------------------------------------------- 1 | label: "ENCDEC-MCGRU MEDIUM" 2 | description: "AMU 2017 Multi-Encoder Cond-GRU Model - 23k train set" 3 | 4 | cell_size: 256 5 | attn_size: 512 6 | embedding_size: 128 7 | cell_type: GRU 8 | 9 | data_dir: data/AMU 10 | max_len: 50 11 | model_dir: models/AMU/medium 12 | 13 | steps_per_checkpoint: 1000 14 | steps_per_eval: 1000 15 | keep_best: 4 16 | score_function: corpus_scores_ter 17 | 18 | batch_size: 32 19 | max_gradient_norm: 1.0 20 | max_steps: 150000 21 | 22 | attention_type: global 23 | final_state: average 24 | 25 | weight_scale: 0.01 26 | 27 | use_dropout: True 28 | pervasive_dropout: True 29 | rnn_input_dropout: 0.4 30 | rnn_output_dropout: 0.4 31 | word_dropout: 0.2 32 | 33 | train_initial_states: False 34 | 35 | encoders: 36 | - name: de 37 | ext: mt 38 | - name: src 39 | 40 | decoders: 41 | - name: de 42 | ext: pe 43 | conditional_rnn: True 44 | pred_deep_layer: False 45 | pred_embed_proj: True 46 | tie_embeddings: True 47 | 48 | ref_ext: pe.ref 49 | 50 | post_process_script: config/AMU/post-process.sh 51 | -------------------------------------------------------------------------------- /config/AMU/post-process.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cat "${1:-/dev/stdin}" | sed "s/@@ //g" | scripts/moses/detruecase.perl | scripts/moses/deescape-special-chars.perl 4 | -------------------------------------------------------------------------------- /config/AMU/prepare.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | raw_data=raw_data/APE 4 | data_dir=data/AMU 5 | 6 | mkdir -p ${data_dir} 7 | 8 | for ext in src mt pe 9 | do 10 | if [ ${ext} = "src" ] 11 | then 12 | lang=en 13 | else 14 | lang=de 15 | fi 16 | 17 | for corpus in train train.2017 500K 4M dev test test.2017 18 | do 19 | cat ${raw_data}/${corpus}.${ext} | scripts/moses/escape-special-chars.perl | scripts/moses/truecase.perl --model ${raw_data}/true.${lang} > ${data_dir}/${corpus}.true.${ext} 20 | cat ${data_dir}/${corpus}.true.${ext} | scripts/bpe/apply_bpe.py -c ${raw_data}/${lang}.bpe > ${data_dir}/${corpus}.tmp.${ext} 21 | done 22 | 23 | mv ${data_dir}/dev.tmp.${ext} ${data_dir}/dev.XL.${ext} 24 | mv ${data_dir}/test.tmp.${ext} ${data_dir}/test.XL.${ext} 25 | mv ${data_dir}/test.2017.tmp.${ext} ${data_dir}/test.2017.XL.${ext} 26 | 27 | cat ${data_dir}/train.tmp.${ext} | scripts/bpe/get_vocab.py > ${data_dir}/bpe-vocab.small.${ext} 28 | cat ${data_dir}/{train,train.2017}.tmp.${ext} | scripts/bpe/get_vocab.py > ${data_dir}/bpe-vocab.medium.${ext} 29 | cat ${data_dir}/{train,train.2017,500K}.tmp.${ext} | scripts/bpe/get_vocab.py > ${data_dir}/bpe-vocab.large.${ext} 30 | 31 | cat ${data_dir}/{4M,500K}.tmp.${ext} > 
${data_dir}/train.XL.${ext} 32 | rm ${data_dir}/{4M,500K}.tmp.${ext} 33 | cp ${data_dir}/train.XL.${ext} ${data_dir}/train.XXL.${ext} 34 | 35 | for i in {1..20}; do 36 | cat ${data_dir}/train.tmp.${ext} >> ${data_dir}/train.XL.${ext} 37 | cat ${data_dir}/{train,train.2017}.tmp.${ext} >> ${data_dir}/train.XXL.${ext} 38 | done 39 | rm ${data_dir}/{train,train.2017}.tmp.${ext} 40 | 41 | for size in small medium large 42 | do 43 | for corpus in train train.2017 500K dev test test.2017 44 | do 45 | cat ${data_dir}/${corpus}.true.${ext} | scripts/bpe/apply_bpe.py -c ${raw_data}/${lang}.bpe --vocabulary-threshold 5 --vocabulary ${data_dir}/bpe-vocab.${size}.${ext} > ${data_dir}/${corpus}.${size}.${ext} 46 | done 47 | done 48 | rm -f ${data_dir}/*.tmp.* ${data_dir}/*.true.* 49 | cat ${data_dir}/train.2017.medium.${ext} >> ${data_dir}/train.medium.${ext} 50 | for i in {1..20}; do 51 | cat ${data_dir}/{train,train.2017}.large.${ext} >> ${data_dir}/500K.large.${ext} 52 | done 53 | mv ${data_dir}/500K.large.${ext} ${data_dir}/train.large.${ext} 54 | rm -f ${data_dir}/{train.2017,500K}.{small,medium,large}.${ext} 55 | done 56 | 57 | for size in small medium large XL 58 | do 59 | cp ${raw_data}/dev.pe ${data_dir}/dev.${size}.pe.ref 60 | cp ${raw_data}/test.pe ${data_dir}/test.${size}.pe.ref 61 | cp ${raw_data}/test.2017.pe ${data_dir}/test.2017.${size}.pe.ref 62 | done 63 | 64 | for size in small medium 65 | do 66 | cat ${data_dir}/train.${size}.{src,mt,pe} > ${data_dir}/train.${size}.all 67 | scripts/prepare-data.py ${data_dir}/train.${size} all ${data_dir} --mode vocab --vocab-size 0 --vocab-prefix vocab.${size} 68 | cp ${data_dir}/vocab.${size}.all ${data_dir}/vocab.${size}.mt 69 | cp ${data_dir}/vocab.${size}.all ${data_dir}/vocab.${size}.pe 70 | mv ${data_dir}/vocab.${size}.all ${data_dir}/vocab.${size}.src 71 | done 72 | for size in large XL 73 | do 74 | cat ${data_dir}/train.${size}.{mt,pe} > ${data_dir}/train.${size}.de 75 | scripts/prepare-data.py ${data_dir}/train.${size} src de ${data_dir} --mode vocab --vocab-size 0 --vocab-prefix vocab.${size} 76 | cp ${data_dir}/vocab.${size}.de ${data_dir}/vocab.${size}.mt 77 | cp ${data_dir}/vocab.${size}.de ${data_dir}/vocab.${size}.pe 78 | rm ${data_dir}/train.${size}.de ${data_dir}/vocab.${size}.de 79 | done 80 | 81 | scripts/prepare-data.py ${data_dir}/train.XXL src mt pe ${data_dir} --mode vocab --vocab-size 0 --vocab-prefix vocab.XXL 82 | rename s/\.medium// ${data_dir}/* 83 | 84 | -------------------------------------------------------------------------------- /config/AMU/small.yaml: -------------------------------------------------------------------------------- 1 | label: "ENCDEC-MCGRU SMALL" 2 | description: "AMU 2017 Multi-Encoder Cond-GRU Model - 12k train set" 3 | 4 | cell_size: 256 5 | attn_size: 512 6 | embedding_size: 128 7 | cell_type: GRU 8 | 9 | data_dir: data/AMU 10 | max_len: 60 11 | model_dir: models/AMU/small 12 | train_prefix: train.small 13 | vocab_prefix: vocab.small 14 | dev_prefix: dev.small 15 | 16 | steps_per_checkpoint: 1000 17 | steps_per_eval: 1000 18 | keep_best: 4 19 | score_function: corpus_scores_ter 20 | 21 | batch_size: 32 22 | max_gradient_norm: 1.0 23 | max_steps: 75000 24 | 25 | attention_type: global 26 | final_state: average 27 | 28 | weight_scale: 0.01 29 | 30 | use_dropout: True 31 | pervasive_dropout: True 32 | rnn_input_dropout: 0.4 33 | rnn_output_dropout: 0.4 34 | word_dropout: 0.2 35 | 36 | train_initial_states: False 37 | 38 | encoders: 39 | - name: de 40 | ext: mt 41 | - name: src 42 | 43 | decoders: 
44 | - name: de 45 | ext: pe 46 | conditional_rnn: True 47 | pred_deep_layer: False 48 | pred_embed_proj: True 49 | tie_embeddings: True 50 | 51 | ref_ext: pe.ref 52 | 53 | post_process_script: config/AMU/post-process.sh 54 | -------------------------------------------------------------------------------- /config/APE/clean-raw-data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # start by downloading all the data files from "http://www.statmt.org/wmt17/ape-task.html", "EN-DE" language pair 4 | # extract all the text files into the same "raw_data/APE" directory (no sub-directories) 5 | # also copy the "true.{en,de}" and "{en,de}.bpe" files 6 | # then run the following commands 7 | raw_data=raw_data/APE 8 | cur_dir=`pwd` 9 | cd ${raw_data} 10 | 11 | for ext in src mt pe 12 | do 13 | mv en-de.train.${ext} train.2017.${ext} 14 | mv en-de.${ext}.test.2017 test.2017.${ext} 15 | done 16 | cd ${cur_dir} 17 | # then run the pre-processing scripts "config/{APE,AMU}/prepare.sh" 18 | -------------------------------------------------------------------------------- /config/APE/eval.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | root_dir=models/APE/$1 4 | size=$2 5 | gpu_id=$3 6 | eval_dir=${root_dir}/eval 7 | log_file=${eval_dir}/${size}.log 8 | 9 | rm -f ${log_file} 10 | mkdir -p ${eval_dir} 11 | 12 | for index in 1 2 3 4 13 | do 14 | model=${size}.${index} 15 | model_dir=${root_dir}/${model} 16 | rm -rf ${model_dir}.avg 17 | checkpoints=`find ${model_dir}/checkpoints/best-* -printf "%f\n" | cut -d'.' -f1,1 | sort | uniq | cut -d'-' -f2,2 | xargs printf " %s|" | sed s/\|$//` 18 | checkpoints=`cat ${model_dir}/checkpoints/scores.txt | grep -P "${checkpoints}" | sed s/-// | sort -g | head -n4 | cut -d' ' -f2,2 | xargs printf "${model_dir}/checkpoints/best-%s "` 19 | echo ${checkpoints} 20 | ./seq2seq.sh ${model_dir}/config.yaml --average --checkpoints ${checkpoints} --save --model-dir ${model_dir}.avg --no-gpu >/dev/null 2>&1 21 | rename "s/translate-[0-9]*/average/" ${model_dir}.avg/checkpoints/translate-* 22 | mv ${model_dir}.avg/checkpoints/average.* ${model_dir}/checkpoints/ 23 | rm -rf ${model_dir}.avg 24 | done 25 | 26 | function header { 27 | printf "%s %-40s" `date +"%H:%M:%S"` $1 >> ${log_file} 28 | } 29 | 30 | function filter { 31 | tail -n1 | grep -Po "(ter|bleu1|bleu|wer|penalty|ratio)=[0-9]*.?[0-9]*" | xargs printf "%s " | sed "s/ $/\n/" >> ${log_file} 32 | } 33 | 34 | for beam_size in 1 6 35 | do 36 | for corpus in dev test test.2017 37 | do 38 | for index in 1 2 3 4 39 | do 40 | model=${size}.${index} 41 | model_dir=${root_dir}/${model} 42 | 43 | output=${corpus}.${model}.beam${beam_size} 44 | header ${output} 45 | ./seq2seq.sh ${model_dir}/config.yaml --eval ${corpus} --beam-size ${beam_size} --gpu-id ${gpu_id} --raw-output --output ${eval_dir}/${output}.raw 2>&1 | filter 46 | scripts/post_editing/reverse-edits.py data/APE/${corpus}.mt ${eval_dir}/${output}.raw > ${eval_dir}/${output}.out 47 | 48 | output=${corpus}.${model}.avg.beam${beam_size} 49 | header ${output} 50 | ./seq2seq.sh ${model_dir}/config.yaml --eval ${corpus} --beam-size ${beam_size} --checkpoints ${model_dir}/checkpoints/average --gpu-id ${gpu_id} --raw-output --output ${eval_dir}/${output}.raw 2>&1 | filter 51 | scripts/post_editing/reverse-edits.py data/APE/${corpus}.mt ${eval_dir}/${output}.raw > ${eval_dir}/${output}.out 52 | done 53 | 54 | 
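# Ensemble decoding: combine the four independently trained ${size} models at
# decoding time (--ensemble), first with their single best checkpoints and then
# with the averaged checkpoints created by the loop above; like the single-model
# runs, the raw edit sequences are converted back to post-edited text with
# scripts/post_editing/reverse-edits.py.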
output=${corpus}.${size}.ensemble.beam${beam_size} 55 | header ${output} 56 | ./seq2seq.sh ${root_dir}/${size}.1/config.yaml --eval ${corpus} --beam-size ${beam_size} --ensemble --checkpoints ${root_dir}/${size}.{1,2,3,4}/checkpoints/best --gpu-id ${gpu_id} --raw-output --output ${eval_dir}/${output}.raw 2>&1 | filter 57 | scripts/post_editing/reverse-edits.py data/APE/${corpus}.mt ${eval_dir}/${output}.raw > ${eval_dir}/${output}.out 58 | 59 | output=${corpus}.${size}.ensemble.avg.beam${beam_size} 60 | header ${output} 61 | ./seq2seq.sh ${root_dir}/${size}.1/config.yaml --eval ${corpus} --beam-size ${beam_size} --ensemble --checkpoints ${root_dir}/${size}.{1,2,3,4}/checkpoints/average --gpu-id ${gpu_id} --raw-output --output ${eval_dir}/${output}.raw 2>&1 | filter 62 | scripts/post_editing/reverse-edits.py data/APE/${corpus}.mt ${eval_dir}/${output}.raw > ${eval_dir}/${output}.out 63 | done 64 | done 65 | -------------------------------------------------------------------------------- /config/APE/large/chained.yaml: -------------------------------------------------------------------------------- 1 | 2 | cell_size: 128 3 | attn_size: 128 4 | embedding_size: 128 5 | cell_type: LSTM 6 | 7 | weight_scale: 0.1 8 | 9 | data_dir: data/APE 10 | model_dir: models/APE/large/chained 11 | train_prefix: train.large 12 | vocab_prefix: vocab.large 13 | 14 | batch_size: 32 15 | 16 | optimizer: sgd 17 | learning_rate: 1.0 18 | learning_rate_decay_factor: 0.8 19 | decay_every_n_epoch: 0.5 20 | decay_after_n_epoch: 1 21 | 22 | steps_per_checkpoint: 1000 23 | steps_per_eval: 1000 24 | score_function: corpus_scores_ter 25 | 26 | max_gradient_norm: 1.0 27 | max_steps: 200000 28 | 29 | final_state: average 30 | pred_embed_proj: False 31 | 32 | encoders: 33 | - name: mt 34 | attention_type: local 35 | attn_window_size: 0 36 | max_len: 40 37 | 38 | - name: src 39 | attention_type: global 40 | max_len: 40 41 | 42 | decoders: 43 | - name: edits 44 | max_len: 50 45 | 46 | pred_edits: True 47 | ref_ext: pe 48 | 49 | use_dropout: True 50 | pervasive_dropout: True 51 | rnn_input_dropout: 0.5 52 | initial_state_dropout: 0.5 53 | 54 | chained_encoders: True 55 | chaining_strategy: map_attns 56 | chaining_non_linearity: True 57 | chaining_loss_ratio: 0.5 58 | chaining_stop_gradient: False 59 | -------------------------------------------------------------------------------- /config/APE/large/forced.yaml: -------------------------------------------------------------------------------- 1 | 2 | cell_size: 128 3 | attn_size: 128 4 | embedding_size: 128 5 | cell_type: LSTM 6 | 7 | weight_scale: 0.1 8 | 9 | data_dir: data/APE 10 | model_dir: models/APE/large/forced 11 | train_prefix: train.large 12 | vocab_prefix: vocab.large 13 | 14 | batch_size: 32 15 | 16 | optimizer: sgd 17 | learning_rate: 1.0 18 | learning_rate_decay_factor: 0.8 19 | decay_every_n_epoch: 0.5 20 | decay_after_n_epoch: 1 21 | 22 | steps_per_checkpoint: 1000 23 | steps_per_eval: 1000 24 | score_function: corpus_scores_ter 25 | 26 | max_gradient_norm: 1.0 27 | max_steps: 200000 28 | 29 | final_state: average 30 | pred_embed_proj: False 31 | 32 | encoders: 33 | - name: mt 34 | attention_type: local 35 | attn_window_size: 0 36 | max_len: 40 37 | 38 | decoders: 39 | - name: edits 40 | max_len: 50 41 | 42 | pred_edits: True 43 | ref_ext: pe 44 | 45 | use_dropout: True 46 | pervasive_dropout: True 47 | rnn_input_dropout: 0.5 48 | initial_state_dropout: 0.5 49 | -------------------------------------------------------------------------------- 
/config/APE/large/global.yaml: -------------------------------------------------------------------------------- 1 | 2 | cell_size: 128 3 | attn_size: 128 4 | embedding_size: 128 5 | cell_type: LSTM 6 | 7 | weight_scale: 0.1 8 | 9 | data_dir: data/APE 10 | model_dir: models/APE/large/global 11 | train_prefix: train.large 12 | vocab_prefix: vocab.large 13 | 14 | batch_size: 32 15 | 16 | optimizer: sgd 17 | learning_rate: 1.0 18 | learning_rate_decay_factor: 0.8 19 | decay_every_n_epoch: 0.5 20 | decay_after_n_epoch: 1 21 | 22 | steps_per_checkpoint: 1000 23 | steps_per_eval: 1000 24 | score_function: corpus_scores_ter 25 | 26 | max_gradient_norm: 1.0 27 | max_steps: 200000 28 | 29 | final_state: average 30 | pred_embed_proj: False 31 | 32 | encoders: 33 | - name: mt 34 | attention_type: global 35 | max_len: 40 36 | 37 | decoders: 38 | - name: edits 39 | max_len: 50 40 | 41 | pred_edits: True 42 | ref_ext: pe 43 | 44 | use_dropout: True 45 | pervasive_dropout: True 46 | rnn_input_dropout: 0.5 47 | initial_state_dropout: 0.5 48 | -------------------------------------------------------------------------------- /config/APE/large/multi-global.yaml: -------------------------------------------------------------------------------- 1 | 2 | cell_size: 128 3 | attn_size: 128 4 | embedding_size: 128 5 | cell_type: LSTM 6 | 7 | weight_scale: 0.1 8 | 9 | data_dir: data/APE 10 | model_dir: models/APE/large/multi_global 11 | train_prefix: train.large 12 | vocab_prefix: vocab.large 13 | 14 | batch_size: 32 15 | 16 | optimizer: sgd 17 | learning_rate: 1.0 18 | learning_rate_decay_factor: 0.8 19 | decay_every_n_epoch: 0.5 20 | decay_after_n_epoch: 1 21 | 22 | steps_per_checkpoint: 1000 23 | steps_per_eval: 1000 24 | score_function: corpus_scores_ter 25 | 26 | max_gradient_norm: 1.0 27 | max_steps: 200000 28 | 29 | final_state: average 30 | pred_embed_proj: False 31 | 32 | encoders: 33 | - name: mt 34 | attention_type: global 35 | max_len: 40 36 | - name: src 37 | attention_type: global 38 | max_len: 40 39 | 40 | decoders: 41 | - name: edits 42 | max_len: 50 43 | 44 | pred_edits: True 45 | ref_ext: pe 46 | 47 | use_dropout: True 48 | pervasive_dropout: True 49 | rnn_input_dropout: 0.5 50 | initial_state_dropout: 0.5 51 | -------------------------------------------------------------------------------- /config/APE/large/multi.yaml: -------------------------------------------------------------------------------- 1 | 2 | cell_size: 128 3 | attn_size: 128 4 | embedding_size: 128 5 | cell_type: LSTM 6 | 7 | weight_scale: 0.1 8 | 9 | data_dir: data/APE 10 | model_dir: models/APE/large/multi 11 | train_prefix: train.large 12 | vocab_prefix: vocab.large 13 | 14 | batch_size: 32 15 | 16 | optimizer: sgd 17 | learning_rate: 1.0 18 | learning_rate_decay_factor: 0.8 19 | decay_every_n_epoch: 0.5 20 | decay_after_n_epoch: 1 21 | 22 | steps_per_checkpoint: 1000 23 | steps_per_eval: 1000 24 | score_function: corpus_scores_ter 25 | 26 | max_gradient_norm: 1.0 27 | max_steps: 200000 28 | 29 | final_state: average 30 | pred_embed_proj: False 31 | 32 | encoders: 33 | - name: mt 34 | attention_type: local 35 | attn_window_size: 0 36 | max_len: 40 37 | - name: src 38 | attention_type: global 39 | max_len: 40 40 | 41 | decoders: 42 | - name: edits 43 | max_len: 50 44 | 45 | pred_edits: True 46 | ref_ext: pe 47 | 48 | use_dropout: True 49 | pervasive_dropout: True 50 | rnn_input_dropout: 0.5 51 | initial_state_dropout: 0.5 52 | -------------------------------------------------------------------------------- 
/config/APE/medium/chained.yaml: -------------------------------------------------------------------------------- 1 | 2 | cell_size: 128 3 | attn_size: 128 4 | embedding_size: 128 5 | cell_type: LSTM 6 | 7 | weight_scale: 0.1 8 | 9 | data_dir: data/APE 10 | model_dir: models/APE/medium/chained 11 | 12 | batch_size: 32 13 | 14 | optimizer: sgd 15 | learning_rate: 1.0 16 | learning_rate_decay_factor: 0.95 17 | decay_every_n_epoch: 1 18 | decay_after_n_epoch: 3 19 | 20 | steps_per_checkpoint: 1000 21 | steps_per_eval: 1000 22 | score_function: corpus_scores_ter 23 | 24 | max_gradient_norm: 1.0 25 | max_steps: 60000 26 | 27 | final_state: average 28 | pred_embed_proj: False 29 | 30 | encoders: 31 | - name: mt 32 | attention_type: local 33 | attn_window_size: 0 34 | max_len: 37 35 | 36 | - name: src 37 | attention_type: global 38 | max_len: 33 39 | 40 | decoders: 41 | - name: edits 42 | max_len: 45 43 | 44 | pred_edits: True 45 | ref_ext: pe 46 | 47 | use_dropout: True 48 | pervasive_dropout: True 49 | rnn_input_dropout: 0.5 50 | initial_state_dropout: 0.5 51 | 52 | chained_encoders: True 53 | chaining_strategy: map_attns 54 | chaining_non_linearity: True 55 | chaining_loss_ratio: 0.5 56 | chaining_stop_gradient: False 57 | -------------------------------------------------------------------------------- /config/APE/medium/forced.yaml: -------------------------------------------------------------------------------- 1 | 2 | cell_size: 128 3 | attn_size: 128 4 | embedding_size: 128 5 | cell_type: LSTM 6 | 7 | weight_scale: 0.1 8 | 9 | data_dir: data/APE 10 | model_dir: models/APE/medium/forced 11 | 12 | batch_size: 32 13 | 14 | optimizer: sgd 15 | learning_rate: 1.0 16 | learning_rate_decay_factor: 0.95 17 | decay_every_n_epoch: 1 18 | decay_after_n_epoch: 3 19 | 20 | steps_per_checkpoint: 1000 21 | steps_per_eval: 1000 22 | score_function: corpus_scores_ter 23 | 24 | max_gradient_norm: 1.0 25 | max_steps: 60000 26 | 27 | final_state: average 28 | pred_embed_proj: False 29 | 30 | encoders: 31 | - name: mt 32 | attention_type: local 33 | attn_window_size: 0 34 | max_len: 37 35 | 36 | decoders: 37 | - name: edits 38 | max_len: 45 39 | 40 | pred_edits: True 41 | ref_ext: pe 42 | 43 | use_dropout: True 44 | pervasive_dropout: True 45 | rnn_input_dropout: 0.5 46 | initial_state_dropout: 0.5 47 | -------------------------------------------------------------------------------- /config/APE/medium/global.yaml: -------------------------------------------------------------------------------- 1 | 2 | cell_size: 128 3 | attn_size: 128 4 | embedding_size: 128 5 | cell_type: LSTM 6 | 7 | weight_scale: 0.1 8 | 9 | data_dir: data/APE 10 | model_dir: models/APE/medium/global 11 | 12 | batch_size: 32 13 | 14 | optimizer: sgd 15 | learning_rate: 1.0 16 | learning_rate_decay_factor: 0.95 17 | decay_every_n_epoch: 1 18 | decay_after_n_epoch: 3 19 | 20 | steps_per_checkpoint: 1000 21 | steps_per_eval: 1000 22 | score_function: corpus_scores_ter 23 | 24 | max_gradient_norm: 1.0 25 | max_steps: 60000 26 | 27 | final_state: average 28 | pred_embed_proj: False 29 | 30 | encoders: 31 | - name: mt 32 | attention_type: global 33 | max_len: 37 34 | 35 | decoders: 36 | - name: edits 37 | max_len: 45 38 | 39 | pred_edits: True 40 | ref_ext: pe 41 | 42 | use_dropout: True 43 | pervasive_dropout: True 44 | rnn_input_dropout: 0.5 45 | initial_state_dropout: 0.5 46 | -------------------------------------------------------------------------------- /config/APE/medium/multi-global.yaml: 
-------------------------------------------------------------------------------- 1 | 2 | cell_size: 128 3 | attn_size: 128 4 | embedding_size: 128 5 | cell_type: LSTM 6 | 7 | weight_scale: 0.1 8 | 9 | data_dir: data/APE 10 | model_dir: models/APE/medium/multi_global 11 | 12 | batch_size: 32 13 | 14 | optimizer: sgd 15 | learning_rate: 1.0 16 | learning_rate_decay_factor: 0.95 17 | decay_every_n_epoch: 1 18 | decay_after_n_epoch: 3 19 | 20 | steps_per_checkpoint: 1000 21 | steps_per_eval: 1000 22 | score_function: corpus_scores_ter 23 | 24 | max_gradient_norm: 1.0 25 | max_steps: 60000 26 | 27 | final_state: average 28 | pred_embed_proj: False 29 | 30 | encoders: 31 | - name: mt 32 | attention_type: global 33 | max_len: 37 34 | - name: src 35 | attention_type: global 36 | max_len: 33 37 | 38 | decoders: 39 | - name: edits 40 | max_len: 45 41 | 42 | pred_edits: True 43 | ref_ext: pe 44 | 45 | use_dropout: True 46 | pervasive_dropout: True 47 | rnn_input_dropout: 0.5 48 | initial_state_dropout: 0.5 49 | -------------------------------------------------------------------------------- /config/APE/medium/multi.yaml: -------------------------------------------------------------------------------- 1 | 2 | cell_size: 128 3 | attn_size: 128 4 | embedding_size: 128 5 | cell_type: LSTM 6 | 7 | weight_scale: 0.1 8 | 9 | data_dir: data/APE 10 | model_dir: models/APE/medium/multi 11 | 12 | batch_size: 32 13 | 14 | optimizer: sgd 15 | learning_rate: 1.0 16 | learning_rate_decay_factor: 0.95 17 | decay_every_n_epoch: 1 18 | decay_after_n_epoch: 3 19 | 20 | steps_per_checkpoint: 1000 21 | steps_per_eval: 1000 22 | score_function: corpus_scores_ter 23 | 24 | max_gradient_norm: 1.0 25 | max_steps: 60000 26 | 27 | final_state: average 28 | pred_embed_proj: False 29 | 30 | encoders: 31 | - name: mt 32 | attention_type: local 33 | attn_window_size: 0 34 | max_len: 37 35 | - name: src 36 | attention_type: global 37 | max_len: 33 38 | 39 | decoders: 40 | - name: edits 41 | max_len: 45 42 | 43 | pred_edits: True 44 | ref_ext: pe 45 | 46 | use_dropout: True 47 | pervasive_dropout: True 48 | rnn_input_dropout: 0.5 49 | initial_state_dropout: 0.5 50 | -------------------------------------------------------------------------------- /config/APE/prepare.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | raw_data=raw_data/APE 4 | data_dir=data/APE 5 | 6 | max_vocab_size=30000 7 | 8 | rm -rf ${data_dir} 9 | mkdir -p ${data_dir} 10 | 11 | for ext in mt pe src 12 | do 13 | cat ${raw_data}/train.${ext} > ${data_dir}/train.small.${ext} 14 | cat ${raw_data}/{train,train.2017}.${ext} > ${data_dir}/train.${ext} 15 | cat ${raw_data}/500K.${ext} > ${data_dir}/train.large.${ext} 16 | for i in {1..10} # oversample PE data 17 | do 18 | cat ${raw_data}/{train,train.2017}.${ext} >> ${data_dir}/train.large.${ext} 19 | done 20 | 21 | cp ${raw_data}/dev.${ext} ${data_dir}/dev.${ext} 22 | cp ${raw_data}/test.${ext} ${data_dir}/test.${ext} 23 | cp ${raw_data}/test.2017.${ext} ${data_dir}/test.2017.${ext} 24 | done 25 | 26 | for corpus in train.small train train.large dev test test.2017 27 | do 28 | scripts/post_editing/extract-edits.py ${data_dir}/${corpus}.{mt,pe} > ${data_dir}/${corpus}.edits 29 | done 30 | 31 | cat ${data_dir}/train.small.{mt,pe} > ${data_dir}/train.small.de 32 | cat ${data_dir}/train.{mt,pe} > ${data_dir}/train.de 33 | cat ${data_dir}/train.large.{mt,pe} > ${data_dir}/train.large.de 34 | 35 | scripts/prepare-data.py ${data_dir}/train.small src de edits 
${data_dir} --mode vocab --vocab-size 0 --vocab-prefix vocab.small 36 | scripts/prepare-data.py ${data_dir}/train src de edits ${data_dir} --mode vocab --vocab-size 0 37 | scripts/prepare-data.py ${data_dir}/train.large src de edits ${data_dir} --mode vocab --vocab-size 0 --vocab-prefix vocab.large --vocab-size ${max_vocab_size} 38 | 39 | for vocab in vocab vocab.small vocab.large # joint vocabularies 40 | do 41 | cp ${data_dir}/${vocab}.de ${data_dir}/${vocab}.mt 42 | cp ${data_dir}/${vocab}.de ${data_dir}/${vocab}.pe 43 | done 44 | rm ${data_dir}/*.de 45 | -------------------------------------------------------------------------------- /config/APE/small/chained.yaml: -------------------------------------------------------------------------------- 1 | 2 | cell_size: 128 3 | attn_size: 128 4 | embedding_size: 128 5 | cell_type: LSTM 6 | 7 | weight_scale: 0.1 8 | 9 | data_dir: data/APE 10 | model_dir: models/APE/small/chained 11 | train_prefix: train.small 12 | vocab_prefix: vocab.small 13 | 14 | batch_size: 32 15 | 16 | optimizer: sgd 17 | learning_rate: 1.0 18 | learning_rate_decay_factor: 0.95 19 | decay_every_n_epoch: 1 20 | decay_after_n_epoch: 3 21 | 22 | steps_per_checkpoint: 1000 23 | steps_per_eval: 1000 24 | score_function: corpus_scores_ter 25 | 26 | max_gradient_norm: 1.0 27 | max_steps: 40000 28 | 29 | final_state: average 30 | pred_embed_proj: False 31 | 32 | encoders: 33 | - name: mt 34 | attention_type: local 35 | attn_window_size: 0 36 | max_len: 37 37 | 38 | - name: src 39 | attention_type: global 40 | max_len: 33 41 | 42 | decoders: 43 | - name: edits 44 | max_len: 45 45 | 46 | pred_edits: True 47 | ref_ext: pe 48 | 49 | use_dropout: True 50 | pervasive_dropout: True 51 | rnn_input_dropout: 0.5 52 | initial_state_dropout: 0.5 53 | 54 | chained_encoders: True 55 | chaining_strategy: map_attns 56 | chaining_non_linearity: True 57 | chaining_loss_ratio: 0.5 58 | chaining_stop_gradient: False 59 | -------------------------------------------------------------------------------- /config/APE/small/forced.yaml: -------------------------------------------------------------------------------- 1 | 2 | cell_size: 128 3 | attn_size: 128 4 | embedding_size: 128 5 | cell_type: LSTM 6 | 7 | weight_scale: 0.1 8 | 9 | data_dir: data/APE 10 | model_dir: models/APE/small/forced 11 | train_prefix: train.small 12 | vocab_prefix: vocab.small 13 | 14 | batch_size: 32 15 | 16 | optimizer: sgd 17 | learning_rate: 1.0 18 | learning_rate_decay_factor: 0.95 19 | decay_every_n_epoch: 1 20 | decay_after_n_epoch: 3 21 | 22 | steps_per_checkpoint: 1000 23 | steps_per_eval: 1000 24 | score_function: corpus_scores_ter 25 | 26 | max_gradient_norm: 1.0 27 | max_steps: 40000 28 | 29 | final_state: average 30 | pred_embed_proj: False 31 | 32 | encoders: 33 | - name: mt 34 | attention_type: local 35 | attn_window_size: 0 36 | max_len: 37 37 | 38 | decoders: 39 | - name: edits 40 | max_len: 45 41 | 42 | pred_edits: True 43 | ref_ext: pe 44 | 45 | use_dropout: True 46 | pervasive_dropout: True 47 | rnn_input_dropout: 0.5 48 | initial_state_dropout: 0.5 49 | -------------------------------------------------------------------------------- /config/APE/small/global.yaml: -------------------------------------------------------------------------------- 1 | 2 | cell_size: 128 3 | attn_size: 128 4 | embedding_size: 128 5 | cell_type: LSTM 6 | 7 | weight_scale: 0.1 8 | 9 | data_dir: data/APE 10 | model_dir: models/APE/small/global 11 | train_prefix: train.small 12 | vocab_prefix: vocab.small 13 | 14 | batch_size: 32 15 | 
16 | optimizer: sgd 17 | learning_rate: 1.0 18 | learning_rate_decay_factor: 0.95 19 | decay_every_n_epoch: 1 20 | decay_after_n_epoch: 3 21 | 22 | steps_per_checkpoint: 1000 23 | steps_per_eval: 1000 24 | score_function: corpus_scores_ter 25 | 26 | max_gradient_norm: 1.0 27 | max_steps: 40000 28 | 29 | final_state: average 30 | pred_embed_proj: False 31 | 32 | encoders: 33 | - name: mt 34 | attention_type: global 35 | max_len: 37 36 | 37 | decoders: 38 | - name: edits 39 | max_len: 45 40 | 41 | pred_edits: True 42 | ref_ext: pe 43 | 44 | use_dropout: True 45 | pervasive_dropout: True 46 | rnn_input_dropout: 0.5 47 | initial_state_dropout: 0.5 48 | -------------------------------------------------------------------------------- /config/APE/small/multi-global.yaml: -------------------------------------------------------------------------------- 1 | 2 | cell_size: 128 3 | attn_size: 128 4 | embedding_size: 128 5 | cell_type: LSTM 6 | 7 | weight_scale: 0.1 8 | 9 | data_dir: data/APE 10 | model_dir: models/APE/small/multi_global 11 | train_prefix: train.small 12 | vocab_prefix: vocab.small 13 | 14 | batch_size: 32 15 | 16 | optimizer: sgd 17 | learning_rate: 1.0 18 | learning_rate_decay_factor: 0.95 19 | decay_every_n_epoch: 1 20 | decay_after_n_epoch: 3 21 | 22 | steps_per_checkpoint: 1000 23 | steps_per_eval: 1000 24 | score_function: corpus_scores_ter 25 | 26 | max_gradient_norm: 1.0 27 | max_steps: 40000 28 | 29 | final_state: average 30 | pred_embed_proj: False 31 | 32 | encoders: 33 | - name: mt 34 | attention_type: global 35 | max_len: 37 36 | - name: src 37 | attention_type: global 38 | max_len: 33 39 | 40 | decoders: 41 | - name: edits 42 | max_len: 45 43 | 44 | pred_edits: True 45 | ref_ext: pe 46 | 47 | use_dropout: True 48 | pervasive_dropout: True 49 | rnn_input_dropout: 0.5 50 | initial_state_dropout: 0.5 51 | -------------------------------------------------------------------------------- /config/APE/small/multi.yaml: -------------------------------------------------------------------------------- 1 | 2 | cell_size: 128 3 | attn_size: 128 4 | embedding_size: 128 5 | cell_type: LSTM 6 | 7 | weight_scale: 0.1 8 | 9 | data_dir: data/APE 10 | model_dir: models/APE/small/multi 11 | train_prefix: train.small 12 | vocab_prefix: vocab.small 13 | 14 | batch_size: 32 15 | 16 | optimizer: sgd 17 | learning_rate: 1.0 18 | learning_rate_decay_factor: 0.95 19 | decay_every_n_epoch: 1 20 | decay_after_n_epoch: 3 21 | 22 | steps_per_checkpoint: 1000 23 | steps_per_eval: 1000 24 | score_function: corpus_scores_ter 25 | 26 | max_gradient_norm: 1.0 27 | max_steps: 40000 28 | 29 | final_state: average 30 | pred_embed_proj: False 31 | 32 | encoders: 33 | - name: mt 34 | attention_type: local 35 | attn_window_size: 0 36 | max_len: 37 37 | - name: src 38 | attention_type: global 39 | max_len: 33 40 | 41 | decoders: 42 | - name: edits 43 | max_len: 45 44 | 45 | pred_edits: True 46 | ref_ext: pe 47 | 48 | use_dropout: True 49 | pervasive_dropout: True 50 | rnn_input_dropout: 0.5 51 | initial_state_dropout: 0.5 52 | -------------------------------------------------------------------------------- /config/BTEC/ASR.yaml: -------------------------------------------------------------------------------- 1 | label: 'BTEC ASR' 2 | 3 | data_dir: data/BTEC 4 | model_dir: models/BTEC/ASR 5 | train_prefix: train.concat 6 | max_train_size: 40000 7 | 8 | batch_size: 64 9 | weight_scale: null 10 | 11 | steps_per_checkpoint: 1000 12 | steps_per_eval: 1000 13 | max_steps: 60000 14 | score_function: corpus_scores_wer 15 | 
16 | cell_size: 256 17 | attn_size: 256 18 | cell_type: LSTM 19 | 20 | encoders: 21 | - name: speech.fr 22 | ext: npz 23 | embedding_size: 41 24 | layers: 3 25 | conv_filters: [16, 16] 26 | conv_size: [3, 3] 27 | conv_strides: [2, 2] 28 | conv_activation: null 29 | binary: True 30 | max_len: 600 31 | input_layers: [256, 128] 32 | bidir_projection: True 33 | final_state: concat_last 34 | train_initial_states: False 35 | input_layer_dropout: 0.4 36 | 37 | decoders: 38 | - name: char.fr 39 | layers: 2 40 | embedding_size: 64 41 | max_len: 140 42 | pred_maxout_layer: False 43 | use_previous_word: False 44 | pred_embed_proj: False 45 | character_level: True 46 | 47 | use_dropout: True 48 | pervasive_dropout: True 49 | attn_dropout: 0.4 50 | rnn_input_dropout: 0.4 51 | initial_state_dropout: 0.4 52 | -------------------------------------------------------------------------------- /config/BTEC/AST.yaml: -------------------------------------------------------------------------------- 1 | label: 'BTEC AST' 2 | 3 | data_dir: data/BTEC 4 | model_dir: models/BTEC/AST 5 | train_prefix: train.concat 6 | max_train_size: 40000 7 | 8 | batch_size: 64 9 | weight_scale: null 10 | 11 | steps_per_checkpoint: 1000 12 | steps_per_eval: 1000 13 | max_steps: 100000 14 | score_function: corpus_scores 15 | 16 | cell_size: 256 17 | attn_size: 256 18 | cell_type: LSTM 19 | 20 | encoders: 21 | - name: speech.fr 22 | ext: npz 23 | embedding_size: 41 24 | layers: 3 25 | conv_filters: [16, 16] 26 | conv_size: [3, 3] 27 | conv_strides: [2, 2] 28 | conv_activation: null 29 | binary: True 30 | max_len: 600 31 | input_layers: [256, 128] 32 | bidir_projection: True 33 | final_state: concat_last 34 | train_initial_states: False 35 | input_layer_dropout: 0.4 36 | 37 | decoders: 38 | - name: char.en 39 | embedding_size: 64 40 | max_len: 120 41 | conditional_rnn: True 42 | pred_maxout_layer: False 43 | use_previous_word: False 44 | pred_embed_proj: False 45 | character_level: True 46 | 47 | use_dropout: True 48 | pervasive_dropout: True 49 | attn_dropout: 0.4 50 | rnn_input_dropout: 0.4 51 | initial_state_dropout: 0.4 52 | -------------------------------------------------------------------------------- /config/BTEC/MT.yaml: -------------------------------------------------------------------------------- 1 | label: 'BTEC MT' 2 | 3 | data_dir: data/BTEC 4 | model_dir: models/BTEC/MT 5 | 6 | batch_size: 64 7 | weight_scale: null 8 | embedding_weight_scale: 0.1 9 | embedding_initializer: uniform 10 | 11 | steps_per_checkpoint: 1000 12 | steps_per_eval: 1000 13 | max_steps: 100000 14 | score_function: corpus_scores 15 | 16 | cell_size: 256 17 | attn_size: 256 18 | cell_type: LSTM 19 | 20 | encoders: 21 | - name: fr 22 | embedding_size: 128 23 | max_len: 25 24 | bidir_projection: True 25 | final_state: average 26 | train_initial_states: False 27 | embedding_dropout: 0.2 28 | 29 | decoders: 30 | - name: char.en 31 | embedding_size: 64 32 | max_len: 120 33 | conditional_rnn: True 34 | pred_maxout_layer: False 35 | use_previous_word: False 36 | pred_embed_proj: False 37 | character_level: True 38 | word_dropout: 0.2 39 | 40 | use_dropout: True 41 | attn_dropout: 0.2 42 | rnn_input_dropout: 0.2 43 | initial_state_dropout: 0.2 44 | rnn_output_dropout: 0.2 45 | -------------------------------------------------------------------------------- /config/BTEC/Multi-Task-joint.yaml: -------------------------------------------------------------------------------- 1 | label: 'BTEC Multi-Task' 2 | description: "Multi-Task training of AST, MT and ASR 
models on BTEC, with a joint training loss" 3 | 4 | data_dir: data/BTEC 5 | model_dir: models/BTEC/AST_multitask_joint 6 | train_prefix: train.concat 7 | max_train_size: 40000 8 | 9 | batch_size: 64 10 | weight_scale: null 11 | 12 | steps_per_checkpoint: 1000 13 | steps_per_eval: 1000 14 | max_steps: 100000 15 | score_function: corpus_scores 16 | 17 | train_initial_states: False 18 | bidir_projection: True 19 | pred_embed_proj: False 20 | use_previous_word: False 21 | pred_deep_layer: False 22 | pred_maxout_layer: False 23 | 24 | cell_size: 256 25 | attn_size: 256 26 | cell_type: LSTM 27 | embedding_size: 64 28 | 29 | multi_task: True 30 | task_ratios: [0.6, 0.2, 0.2, 0] # (0,0) (0,1) (1,0) (1,1) 31 | 32 | encoders: 33 | - name: speech.fr 34 | ext: npz 35 | embedding_size: 41 36 | layers: 3 37 | binary: True 38 | final_state: concat_last 39 | conv_filters: [16, 16] 40 | conv_size: [3, 3] 41 | conv_strides: [2, 2] 42 | conv_activation: null 43 | input_layers: [256, 128] 44 | input_layer_activation: tanh 45 | max_len: 600 46 | - name: fr 47 | embedding_size: 128 48 | conv_filters: null 49 | input_layers: null 50 | max_len: 25 51 | final_state: average 52 | decoders: 53 | - name: char.en 54 | character_level: True 55 | conditional_rnn: True 56 | max_len: 120 57 | - name: char.fr 58 | layers: 2 59 | character_level: True 60 | max_len: 140 61 | 62 | use_dropout: True 63 | pervasive_dropout: True 64 | attn_dropout: 0.4 65 | rnn_input_dropout: 0.4 66 | initial_state_dropout: 0.4 67 | input_layer_dropout: 0.4 68 | -------------------------------------------------------------------------------- /config/BTEC/Multi-Task.yaml: -------------------------------------------------------------------------------- 1 | label: 'BTEC Multi-Task' 2 | description: "Multi-Task training of AST, MT and ASR models on BTEC" 3 | 4 | data_dir: data/BTEC 5 | model_dir: models/BTEC/AST_multitask 6 | train_prefix: train.concat 7 | max_train_size: 40000 8 | 9 | batch_size: 64 10 | weight_scale: null 11 | 12 | steps_per_checkpoint: 500 13 | steps_per_eval: 500 14 | max_steps: 100000 15 | score_function: corpus_scores 16 | 17 | train_initial_states: False 18 | bidir_projection: True 19 | pred_embed_proj: False 20 | use_previous_word: False 21 | pred_deep_layer: False 22 | pred_maxout_layer: False 23 | conditional_rnn: True 24 | conv_filters: [16, 16] 25 | conv_size: [3, 3] 26 | conv_strides: [2, 2] 27 | conv_activation: null 28 | input_layers: [256, 128] 29 | input_layer_activation: tanh 30 | final_state: concat_last 31 | max_len: 600 32 | 33 | cell_size: 256 34 | attn_size: 256 35 | cell_type: LSTM 36 | embedding_size: 64 37 | 38 | tasks: 39 | - name: AST 40 | ratio: 0.6 41 | encoders: 42 | - name: speech.fr 43 | ext: npz 44 | embedding_size: 41 45 | layers: 3 46 | binary: True 47 | 48 | decoders: 49 | - name: char.en 50 | layers: 1 51 | character_level: True 52 | max_len: 120 53 | 54 | - name: ASR 55 | ratio: 0.2 56 | encoders: 57 | - name: speech.fr 58 | ext: npz 59 | embedding_size: 41 60 | layers: 3 61 | binary: True 62 | 63 | decoders: 64 | - name: char.fr 65 | layers: 2 66 | character_level: True 67 | conditional_rnn: False 68 | max_len: 140 69 | 70 | - name: MT 71 | ratio: 0.2 72 | train_prefix: train 73 | encoders: 74 | - name: fr 75 | embedding_size: 128 76 | conv_filters: null 77 | input_layers: null 78 | max_len: 25 79 | final_state: average 80 | 81 | decoders: 82 | - name: char.en 83 | layers: 1 84 | character_level: True 85 | max_len: 120 86 | 87 | use_dropout: True 88 | pervasive_dropout: True 89 | attn_dropout: 
0.4 90 | rnn_input_dropout: 0.4 91 | initial_state_dropout: 0.4 92 | input_layer_dropout: 0.4 93 | -------------------------------------------------------------------------------- /config/BTEC/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Speech processing 4 | 5 | ## Install Yaafe 6 | 7 | ~~~ 8 | sudo apt-get install cmake cmake-curses-gui libargtable2-0 libargtable2-dev \ 9 | libsndfile1 libsndfile1-dev libmpg123-0 libmpg123-dev libfftw3-3 libfftw3-dev \ 10 | liblapack-dev libhdf5-serial-dev gcc-4.8 g++-4.8 11 | 12 | wget https://sourceforge.net/projects/yaafe/files/yaafe-v0.64.tgz/download -O yaafe-v0.64.tgz 13 | 14 | tar xzf yaafe-v0.64.tgz 15 | cd yaafe-v0.64 16 | 17 | # fix bug in the official release 18 | cat src_cpp/yaafe-core/Ports.h | sed "s/\tpush_back/\tthis->push_back/g" > src_cpp/yaafe-core/Ports.h.fixed 19 | mv src_cpp/yaafe-core/Ports.h.fixed src_cpp/yaafe-core/Ports.h 20 | 21 | mkdir build 22 | cd build 23 | export CC=/usr/bin/gcc-4.8 24 | export CXX=/usr/bin/g++-4.8 25 | cmake .. 26 | make 27 | sudo make install 28 | 29 | echo "export PYTHONPATH=/usr/local/python_packages/:\$PYTHONPATH" >> ~/.bashrc 30 | echo "export LD_LIBRARY_PATH=/usr/local/lib/:\$LD_LIBRARY_PATH" >> ~/.bashrc 31 | echo "export YAAFE_PATH=/usr/local/yaafe_extensions" >> ~/.bashrc 32 | ~~~ 33 | 34 | ## Configuration files 35 | 36 | Examples of configuration files for ASR and AST are: `config/BTEC/ASR.yaml` and `config/BTEC/AST.yaml`. 37 | You'll need to modify the `data_dir`, `model_dir`, `train_prefix` and `vocab_prefix` parameters. Also, you should set the right `name` for the `encoders` and `decoders` parameters (it should be the same as the source and target extensions). 38 | 39 | A very important parameter for ASR and AST is the `max_len` parameter (specific to each encoder and decoder). It defines the maximum length of the input and output sequences. Training time and memory usage depend on this limit. Because audio sequences are very long (1 frame every 10 ms), training can take a lot of memory. 
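To make the frame-rate arithmetic behind `max_len` concrete, here is a tiny sketch (an illustration only, assuming the 10 ms frame shift mentioned above; the helper names are made up):

~~~
# Illustration only: convert between audio duration and encoder time steps,
# assuming one feature frame every 10 ms.
FRAME_SHIFT_S = 0.01

def frames_for_duration(seconds):
    """Encoder time steps needed for an utterance of the given duration."""
    return int(round(seconds / FRAME_SHIFT_S))

def max_duration(max_len):
    """Longest utterance (in seconds) that fits under a given encoder max_len."""
    return max_len * FRAME_SHIFT_S

print(frames_for_duration(6.0))  # 600, the speech encoder max_len in config/BTEC/ASR.yaml
print(max_duration(600))         # 6.0 seconds
~~~

For a character-level decoder (`character_level: True`), `max_len` counts characters rather than words, which is why `config/BTEC/ASR.yaml` uses `max_len: 140` for the `char.fr` decoder.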
40 | 41 | -------------------------------------------------------------------------------- /config/BTEC/prepare.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # speech data preparation script 4 | # this script assumes that the BTEC raw files are in the ${raw_data} dir 5 | raw_data=raw_data/BTEC 6 | data_dir=data/BTEC # output directory for the processed files (text and audio features) 7 | 8 | rm -rf ${data_dir} 9 | mkdir -p ${data_dir} 10 | 11 | scripts/speech/extract.py ${raw_data}/train-{Fabienne,Helene,Loic,Marion,Michel,Philippe}.tar ${data_dir}/train.concat.npz 12 | scripts/speech/extract.py ${raw_data}/dev-Agnes.tar ${data_dir}/dev.npz 13 | scripts/speech/extract.py ${raw_data}/test-Agnes.tar ${data_dir}/test.npz 14 | 15 | rm -f ${data_dir}/train.raw.{fr,en} 16 | for i in {1..6} 17 | do 18 | cat ${raw_data}/train.fr >> ${data_dir}/train.raw.fr 19 | cat ${raw_data}/train.en >> ${data_dir}/train.raw.en 20 | done 21 | 22 | scripts/prepare-data.py ${data_dir}/train.raw fr en ${data_dir} --lowercase --output train.concat --mode prepare 23 | scripts/prepare-data.py ${raw_data}/dev fr en ${data_dir} --lowercase --output dev --mode prepare 24 | scripts/prepare-data.py ${raw_data}/test fr en ${data_dir} --lowercase --output test --mode prepare 25 | scripts/prepare-data.py ${raw_data}/train fr en ${data_dir} --lowercase 26 | 27 | scripts/prepare-data.py ${raw_data}/dev mref.en ${data_dir} --lowercase --output dev --mode prepare --lang en 28 | scripts/prepare-data.py ${raw_data}/test mref.en ${data_dir} --lowercase --output test --mode prepare --lang en 29 | 30 | scripts/speech/shuf.py ${data_dir}/train.concat.npz --input-txt ${data_dir}/train.concat.{fr,en} 31 | 32 | scripts/prepare-data.py ${data_dir}/train fr en ${data_dir} --mode vocab --character-level --no-tokenize --vocab-prefix vocab.char 33 | 34 | for corpus in train.concat train dev test 35 | do 36 | cp ${data_dir}/${corpus}.fr ${data_dir}/${corpus}.char.fr 37 | cp ${data_dir}/${corpus}.en ${data_dir}/${corpus}.char.en 38 | done 39 | 40 | -------------------------------------------------------------------------------- /config/BTEC/voxygen/convert-to-audio.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | filename=$1 5 | dir=$2 6 | voice=$3 # Agnes, Fabienne, Helene, Loic, Marion, Michel, Philippe (default = Agnes) 7 | 8 | mkdir -p ${dir} 9 | lines=`wc -l ${filename} | cut -d' ' -f1` 10 | digits=$((`echo ${lines} | wc -c` - 1)) 11 | 12 | for i in `seq 1 ${lines}`; 13 | do 14 | num=`printf "%0${digits}d" ${i}` 15 | cat ${filename} | sed -n "${i},${i}p" > ${dir}/${num}.txt 16 | config/BTEC/voxygen/wsclient.py -i ${dir}/${num}.txt -o ${dir}/${num}.wav header=wav-header frequency=16000 coding=lin voice=${voice} 17 | rm ${dir}/${num}.txt 18 | done 19 | 20 | -------------------------------------------------------------------------------- /config/IWSLT14/BPE-TED.yaml: -------------------------------------------------------------------------------- 1 | label: 'IWSLT14 BPE Model + Monolingual data' 2 | 3 | cell_size: 512 4 | attn_size: 512 5 | embedding_size: 128 6 | 7 | cell_type: LSTM 8 | weight_scale: 0.1 9 | 10 | data_dir: data/IWSLT14 11 | model_dir: models/IWSLT14/BPE_TED 12 | train_prefix: train.TED 13 | vocab_prefix: vocab.joint 14 | 15 | batch_size: 32 16 | 17 | steps_per_checkpoint: 4000 18 | steps_per_eval: 4000 19 | score_function: corpus_scores 20 | 21 | max_gradient_norm: 1.0 22 | batch_mode: 
standard 23 | read_ahead: 20 24 | max_steps: 800000 25 | keep_best: 4 26 | 27 | encoders: 28 | - name: joint 29 | ext: jsub.de 30 | max_len: 52 31 | train_initial_states: False 32 | 33 | decoders: 34 | - name: joint 35 | ext: jsub.en 36 | max_len: 50 37 | conditional_rnn: True 38 | pred_deep_layer: True 39 | use_previous_word: False 40 | initial_state: zero 41 | 42 | use_dropout: True 43 | rnn_input_dropout: 0.4 44 | word_dropout: 0.2 45 | -------------------------------------------------------------------------------- /config/IWSLT14/BPE.yaml: -------------------------------------------------------------------------------- 1 | label: 'IWSLT14 BPE Model' 2 | 3 | cell_size: 512 4 | attn_size: 512 5 | embedding_size: 128 6 | 7 | cell_type: LSTM 8 | weight_scale: 0.1 9 | 10 | data_dir: data/IWSLT14 11 | model_dir: models/IWSLT14/BPE 12 | vocab_prefix: vocab.joint 13 | 14 | batch_size: 32 15 | 16 | steps_per_checkpoint: 4000 17 | steps_per_eval: 4000 18 | score_function: corpus_scores 19 | 20 | max_gradient_norm: 1.0 21 | batch_mode: standard 22 | read_ahead: 20 23 | max_steps: 400000 24 | keep_best: 4 25 | 26 | encoders: 27 | - name: joint 28 | ext: jsub.de 29 | max_len: 52 30 | train_initial_states: False 31 | 32 | decoders: 33 | - name: joint 34 | ext: jsub.en 35 | max_len: 50 36 | conditional_rnn: True 37 | pred_deep_layer: True 38 | use_previous_word: False 39 | initial_state: zero 40 | 41 | use_dropout: True 42 | rnn_input_dropout: 0.4 43 | word_dropout: 0.2 44 | -------------------------------------------------------------------------------- /config/IWSLT14/BPE2char-TED.yaml: -------------------------------------------------------------------------------- 1 | label: 'IWSLT14 BPE to character Model + Monolingual data' 2 | 3 | cell_size: 512 4 | attn_size: 512 5 | embedding_size: 128 6 | 7 | cell_type: LSTM 8 | weight_scale: 0.1 9 | 10 | data_dir: data/IWSLT14 11 | model_dir: models/IWSLT14/BPE2char_TED 12 | train_prefix: train.TED 13 | 14 | batch_size: 32 15 | 16 | steps_per_checkpoint: 4000 17 | steps_per_eval: 4000 18 | score_function: corpus_scores 19 | 20 | max_gradient_norm: 1.0 21 | batch_mode: standard 22 | read_ahead: 20 23 | max_steps: 800000 24 | keep_best: 4 25 | 26 | encoders: 27 | - name: jsub.de 28 | max_len: 52 29 | train_initial_states: False 30 | 31 | decoders: 32 | - name: char.en 33 | max_len: 239 34 | character_level: True 35 | conditional_rnn: True 36 | pred_deep_layer: True 37 | use_previous_word: False 38 | initial_state: zero 39 | 40 | use_dropout: True 41 | rnn_input_dropout: 0.4 42 | word_dropout: 0.2 43 | -------------------------------------------------------------------------------- /config/IWSLT14/BPE2char.yaml: -------------------------------------------------------------------------------- 1 | label: 'IWSLT14 BPE to character Model' 2 | 3 | cell_size: 512 4 | attn_size: 512 5 | embedding_size: 128 6 | 7 | cell_type: LSTM 8 | weight_scale: 0.1 9 | 10 | data_dir: data/IWSLT14 11 | model_dir: models/IWSLT14/BPE2char 12 | 13 | batch_size: 32 14 | 15 | steps_per_checkpoint: 4000 16 | steps_per_eval: 4000 17 | score_function: corpus_scores 18 | 19 | max_gradient_norm: 1.0 20 | batch_mode: standard 21 | read_ahead: 20 22 | max_steps: 800000 23 | keep_best: 4 24 | 25 | encoders: 26 | - name: jsub.de 27 | max_len: 52 28 | train_initial_states: False 29 | 30 | decoders: 31 | - name: char.en 32 | max_len: 239 33 | character_level: True 34 | conditional_rnn: True 35 | pred_deep_layer: True 36 | use_previous_word: False 37 | initial_state: zero 38 | 39 | use_dropout: 
True 40 | rnn_input_dropout: 0.4 41 | word_dropout: 0.2 42 | -------------------------------------------------------------------------------- /config/IWSLT14/Back-Translation/baseline-TED.yaml: -------------------------------------------------------------------------------- 1 | label: 'IWSLT14 Baseline' 2 | description: "IWSLT14 new baseline" 3 | 4 | cell_size: 256 5 | attn_size: 256 6 | embedding_size: 128 7 | 8 | bidir: True 9 | cell_type: LSTM 10 | weight_scale: 0.1 11 | 12 | data_dir: data/IWSLT14 13 | model_dir: models/IWSLT14/baseline_TED 14 | train_prefix: train.TED 15 | vocab_prefix: vocab.TED 16 | batch_size: 32 17 | 18 | optimizer: adam 19 | learning_rate: 0.001 20 | learning_rate_decay_factor: 0.5 21 | decay_every_n_epoch: 1 22 | 23 | steps_per_checkpoint: 2000 24 | steps_per_eval: 2000 25 | 26 | max_gradient_norm: 1.0 27 | batch_mode: standard 28 | read_ahead: 20 29 | max_epochs: 4 30 | 31 | encoders: 32 | - name: de 33 | max_len: 45 34 | final_state: last_both 35 | 36 | decoders: 37 | - name: en 38 | max_len: 47 39 | conditional_rnn: True 40 | pred_deep_layer: True 41 | 42 | use_dropout: True 43 | pervasive_dropout: True 44 | rnn_input_dropout: 0.2 45 | attn_dropout: 0.2 46 | word_dropout: 0.2 47 | initial_state_dropout: 0.2 48 | -------------------------------------------------------------------------------- /config/IWSLT14/Back-Translation/char-level-TED.yaml: -------------------------------------------------------------------------------- 1 | label: 'IWSLT14 Char-level' 2 | description: "IWSLT14 subwords to characters" 3 | 4 | cell_size: 256 5 | attn_size: 256 6 | embedding_size: 128 7 | 8 | bidir: True 9 | cell_type: LSTM 10 | weight_scale: 0.1 11 | 12 | data_dir: data/IWSLT14 13 | model_dir: models/IWSLT14/char_level_TED 14 | train_prefix: train.TED 15 | vocab_prefix: vocab.TED 16 | batch_size: 32 17 | 18 | optimizer: adam 19 | learning_rate: 0.001 20 | learning_rate_decay_factor: 0.5 21 | decay_every_n_epoch: 1 22 | 23 | steps_per_checkpoint: 2000 24 | steps_per_eval: 2000 25 | 26 | max_gradient_norm: 1.0 27 | batch_mode: standard 28 | read_ahead: 20 29 | max_epochs: 4 30 | 31 | encoders: 32 | - name: jsub.de 33 | max_len: 51 34 | final_state: last_both 35 | 36 | decoders: 37 | - name: char.en 38 | max_len: 200 39 | conditional_rnn: True 40 | pred_deep_layer: True 41 | character_level: True 42 | 43 | use_dropout: True 44 | pervasive_dropout: True 45 | rnn_input_dropout: 0.2 46 | attn_dropout: 0.2 47 | word_dropout: 0.2 48 | initial_state_dropout: 0.2 49 | -------------------------------------------------------------------------------- /config/IWSLT14/Back-Translation/decode.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | data_dir=data/IWSLT14 6 | model_dir=models/IWSLT14/Back_Translation_LM 7 | 8 | file_id=$1 9 | 10 | input_filename=${model_dir}/data/${file_id} 11 | output_filename=${model_dir}/output/${file_id} 12 | 13 | new_dir=`mktemp -d` 14 | tmp_dir=${new_dir}/moses 15 | scripts/decode-moses.sh ${model_dir}/moses.tuned.ini ${tmp_dir} ${input_filename} ${output_filename} 1>/dev/null 2>/dev/null 16 | rm -rf ${new_dir} 17 | -------------------------------------------------------------------------------- /config/IWSLT14/Back-Translation/eval.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | data_dir=data/IWSLT14 6 | model_dir=models/IWSLT14 7 | 8 | if [ -z ${MOSES} ] 9 | then 10 | echo "variable MOSES 
undefined" 11 | exit 0 12 | fi 13 | 14 | new_dir=`mktemp -d` 15 | tmp_dir=${new_dir}/moses 16 | 17 | scripts/decode-moses.sh ${model_dir}/Back_Translation/moses.tuned.ini ${tmp_dir} ${data_dir}/test.en ${model_dir}/Back_Translation/test.mt 1>/dev/null 2>/dev/null 18 | scripts/score.py ${model_dir}/Back_Translation/test.mt ${data_dir}/test.de --bleu 19 | 20 | scripts/decode-moses.sh ${model_dir}/Back_Translation_LM/moses.tuned.ini ${tmp_dir} ${data_dir}/test.en ${model_dir}/Back_Translation_LM/test.mt 1>/dev/null 2>/dev/null 21 | scripts/score.py ${model_dir}/Back_Translation_LM/test.mt ${data_dir}/test.de --bleu 22 | 23 | rm -rf ${new_dir} 24 | -------------------------------------------------------------------------------- /config/IWSLT14/Back-Translation/prepare.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | data_dir=data/IWSLT14 4 | 5 | cat ${data_dir}/TED.de > ${data_dir}/train.TED.de 6 | cat ${data_dir}/TED.en > ${data_dir}/train.TED.en 7 | 8 | for i in {1..10} 9 | do 10 | cat ${data_dir}/train.de >> ${data_dir}/train.TED.de 11 | cat ${data_dir}/train.en >> ${data_dir}/train.TED.en 12 | done 13 | 14 | scripts/prepare-data.py ${data_dir}/train.TED de en ${data_dir} --mode vocab --vocab-size 30000 --vocab-prefix vocab.TED 15 | 16 | scripts/prepare-data.py ${data_dir}/train.TED de en ${data_dir} --subwords --bpe-path ${data_dir}/bpe.joint \ 17 | --output train.TED.jsub --vocab-size 0 --vocab-prefix vocab.TED.jsub --no-tokenize 18 | 19 | cp ${data_dir}/train.TED.de ${data_dir}/train.TED.char.de 20 | cp ${data_dir}/train.TED.en ${data_dir}/train.TED.char.en 21 | 22 | cp ${data_dir}/vocab.char.de ${data_dir}/vocab.TED.char.de 23 | cp ${data_dir}/vocab.char.en ${data_dir}/vocab.TED.char.en 24 | -------------------------------------------------------------------------------- /config/IWSLT14/Back-Translation/split.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | data_dir=data/IWSLT14 4 | model_dir=models/IWSLT14/Back_Translation_LM 5 | scripts/split-corpus.py ${data_dir}/TED.en ${model_dir}/data --splits 12 --tokens 6 | mkdir -p ${model_dir}/output 7 | -------------------------------------------------------------------------------- /config/IWSLT14/Back-Translation/subwords-TED.yaml: -------------------------------------------------------------------------------- 1 | label: 'IWSLT14 Subwords' 2 | description: "IWSLT14 Joint subwords" 3 | 4 | cell_size: 256 5 | attn_size: 256 6 | embedding_size: 128 7 | 8 | bidir: True 9 | cell_type: LSTM 10 | weight_scale: 0.1 11 | 12 | data_dir: data/IWSLT14 13 | model_dir: models/IWSLT14/subwords_TED 14 | train_prefix: train.TED 15 | vocab_prefix: vocab.TED 16 | batch_size: 32 17 | 18 | optimizer: adam 19 | learning_rate: 0.001 20 | learning_rate_decay_factor: 0.5 21 | decay_every_n_epoch: 1 22 | 23 | steps_per_checkpoint: 2000 24 | steps_per_eval: 2000 25 | 26 | max_gradient_norm: 1.0 27 | batch_mode: standard 28 | read_ahead: 20 29 | max_epochs: 4 30 | 31 | encoders: 32 | - name: jsub.de 33 | max_len: 51 34 | final_state: last_both 35 | 36 | decoders: 37 | - name: jsub.en 38 | max_len: 50 39 | conditional_rnn: True 40 | pred_deep_layer: True 41 | 42 | use_dropout: True 43 | pervasive_dropout: True 44 | rnn_input_dropout: 0.2 45 | attn_dropout: 0.2 46 | word_dropout: 0.2 47 | initial_state_dropout: 0.2 48 | -------------------------------------------------------------------------------- 
/config/IWSLT14/Back-Translation/train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | data_dir=data/IWSLT14 4 | model_dir=models/IWSLT14 5 | train_script=scripts/train-moses.sh 6 | 7 | # model_dir data_dir corpus dev_corpus src_ext trg_ext lm_corpus lm_order 8 | ${train_script} ${model_dir}/Back_Translation ${data_dir} train dev en de train 3 9 | cat ${data_dir}/{train,OpenSubtitles}.de > ${data_dir}/train+OpenSubtitles.de 10 | ${train_script} ${model_dir}/Back_Translation_LM ${data_dir} train dev en de train+OpenSubtitles 3 11 | -------------------------------------------------------------------------------- /config/IWSLT14/prepare-TED.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | data_dir=data/IWSLT14 4 | 5 | cat ${data_dir}/TED.de > ${data_dir}/train.TED.de 6 | cat ${data_dir}/TED.en > ${data_dir}/train.TED.en 7 | 8 | for i in {1..10} 9 | do 10 | cat ${data_dir}/train.de >> ${data_dir}/train.TED.de 11 | cat ${data_dir}/train.en >> ${data_dir}/train.TED.en 12 | done 13 | 14 | scripts/prepare-data.py ${data_dir}/train.TED de en ${data_dir} --mode vocab --vocab-size 30000 --vocab-prefix vocab.TED 15 | 16 | for ext in de en 17 | do 18 | scripts/bpe/apply_bpe.py -c ${data_dir}/bpe.joint.${ext} --vocabulary ${data_dir}/bpe-vocab.${ext} --vocabulary-threshold 10 < ${data_dir}/train.TED.${ext} > ${data_dir}/train.TED.jsub.${ext} 19 | done 20 | 21 | cp ${data_dir}/train.TED.de ${data_dir}/train.TED.char.de 22 | cp ${data_dir}/train.TED.en ${data_dir}/train.TED.char.en 23 | cp ${data_dir}/vocab.char.de ${data_dir}/vocab.TED.char.de 24 | cp ${data_dir}/vocab.char.en ${data_dir}/vocab.TED.char.en 25 | 26 | -------------------------------------------------------------------------------- /config/IWSLT14/prepare-lexicon.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | data_dir=data/IWSLT14 6 | 7 | rm -rf fast_align-master 8 | 9 | wget https://github.com/clab/fast_align/archive/master.zip 10 | unzip master.zip 11 | rm master.zip 12 | cd fast_align-master 13 | mkdir build 14 | cd build 15 | cmake .. 16 | make 17 | cd ../.. 
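# below: align the training corpus with fast_align in both directions, symmetrize the two
# word alignments with atools (grow-diag-final-and), then extract a bilingual lexicon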
18 | 19 | corpus=train 20 | fast_align=fast_align-master/build 21 | 22 | scripts/join.py ${data_dir}/${corpus}.{de,en} > ${data_dir}/${corpus}.de-en 23 | ${fast_align}/fast_align -i ${data_dir}/${corpus}.de-en -d -o -v > ${data_dir}/${corpus}.forward.align 24 | ${fast_align}/fast_align -i ${data_dir}/${corpus}.de-en -d -o -v -r > ${data_dir}/${corpus}.reverse.align 25 | ${fast_align}/atools -i ${data_dir}/${corpus}.forward.align -j ${data_dir}/${corpus}.reverse.align -c grow-diag-final-and > ${data_dir}/${corpus}.align 26 | 27 | scripts/extract-lexicon.py ${data_dir}/${corpus}.{de,en,align} > ${data_dir}/${corpus}.lexicon 28 | python3 -c "print('\n'.join(line.rstrip() for line in open('${data_dir}/${corpus}.lexicon') if not line[0].isupper() and not line.split()[0] == line.split()[1]))" > ${data_dir}/${corpus}.lexicon.purged 29 | 30 | rm -rf fast_align-master 31 | rm ${data_dir}/${corpus}.de-en 32 | rm ${data_dir}/*.align 33 | -------------------------------------------------------------------------------- /config/IWSLT14/prepare-mixer.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Script downloaded from: https://github.com/facebookresearch/MIXER 4 | 5 | TOKENIZER=scripts/moses/tokenizer.perl 6 | UNESCAPE=scripts/moses/unescape-special-chars.perl 7 | LC=scripts/moses/lowercase.perl 8 | CLEAN=scripts/moses/clean-corpus-n.perl 9 | 10 | URL="http://wit3.fbk.eu/archive/2014-01/texts/de/en/de-en.tgz" 11 | GZ=de-en.tgz 12 | 13 | src=de 14 | tgt=en 15 | lang=de-en 16 | prep=prep 17 | tmp=prep/tmp 18 | orig=orig 19 | 20 | mkdir -p $orig $tmp $prep 21 | 22 | echo "Downloading data from ${URL}..." 23 | cd $orig 24 | wget "$URL" 25 | 26 | if [ -f $GZ ]; then 27 | echo "Data successfully downloaded." 28 | else 29 | echo "Data not successfully downloaded." 30 | exit 31 | fi 32 | 33 | tar zxvf $GZ 34 | cd .. 35 | 36 | echo "pre-processing train data..." 37 | for l in $src $tgt; do 38 | f=train.tags.$lang.$l 39 | tok=train.tags.$lang.tok.$l 40 | 41 | cat $orig/$lang/$f | \ 42 | grep -v '<url>' | \ 43 | grep -v '<talkid>' | \ 44 | grep -v '<keywords>' | \ 45 | sed -e 's/<title>//g' | \ 46 | sed -e 's/<\/title>//g' | \ 47 | sed -e 's/<description>//g' | \ 48 | sed -e 's/<\/description>//g' | \ 49 | # perl $UNESCAPE | \ 50 | perl $TOKENIZER -threads 8 -l $l > $tmp/$tok 51 | echo "" 52 | done 53 | perl $CLEAN -ratio 1.5 $tmp/train.tags.$lang.tok $src $tgt $tmp/train.tags.$lang.clean 1 50 54 | for l in $src $tgt; do 55 | perl $LC < $tmp/train.tags.$lang.clean.$l > $tmp/train.tags.$lang.$l 56 | done 57 | 58 | echo "pre-processing valid/test data..." 59 | for l in $src $tgt; do 60 | for o in `ls $orig/$lang/IWSLT14.TED*.$l.xml`; do 61 | fname=${o##*/} 62 | f=$tmp/${fname%.*} 63 | echo $o $f 64 | grep '<seg id' $o | \ 65 | sed -e 's/<seg id="[0-9]*">\s*//g' | \ 66 | sed -e 's/\s*<\/seg>\s*//g' | \ 67 | sed -e "s/\’/\'/g" | \ 68 | # perl $UNESCAPE | \ 69 | perl $TOKENIZER -threads 8 -l $l | \ 70 | perl $LC > $f 71 | echo "" 72 | done 73 | done 74 | 75 | echo "creating train, valid, test..."
76 | for l in $src $tgt; do 77 | awk '{if (NR%23 == 0) print $0; }' $tmp/train.tags.de-en.$l > $prep/valid.de-en.$l 78 | awk '{if (NR%23 != 0) print $0; }' $tmp/train.tags.de-en.$l > $prep/train.de-en.$l 79 | 80 | cat $tmp/IWSLT14.TED.dev2010.de-en.$l \ 81 | $tmp/IWSLT14.TEDX.dev2012.de-en.$l \ 82 | $tmp/IWSLT14.TED.tst2010.de-en.$l \ 83 | $tmp/IWSLT14.TED.tst2011.de-en.$l \ 84 | $tmp/IWSLT14.TED.tst2012.de-en.$l \ 85 | > $prep/test.de-en.$l 86 | done 87 | 88 | -------------------------------------------------------------------------------- /config/IWSLT14/prepare.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | data_dir=data/IWSLT14 4 | mkdir -p ${data_dir} 5 | 6 | config/IWSLT14/prepare-mixer.sh 7 | mv prep/*.{en,de} ${data_dir} 8 | rename s/.de-en// ${data_dir}/* 9 | rename s/valid/dev/ ${data_dir}/* 10 | rm -rf prep orig 11 | 12 | scripts/prepare-data.py ${data_dir}/train de en ${data_dir} --mode vocab --vocab-size 30000 13 | 14 | scripts/bpe/learn_joint_bpe_and_vocab.py --input ${data_dir}/train.{de,en} -s 30000 -o ${data_dir}/bpe.joint.en --write-vocabulary ${data_dir}/bpe-vocab.de ${data_dir}/bpe-vocab.en 15 | cp ${data_dir}/bpe.joint.en ${data_dir}/bpe.joint.de 16 | 17 | cat ${data_dir}/train.{de,en} > ${data_dir}/train.concat 18 | scripts/prepare-data.py ${data_dir}/train concat ${data_dir} --mode vocab --vocab-size 0 --character-level 19 | mv ${data_dir}/vocab.concat ${data_dir}/vocab.char.en 20 | cp ${data_dir}/vocab.char.en ${data_dir}/vocab.char.de 21 | rm ${data_dir}/train.concat 22 | 23 | for ext in de en 24 | do 25 | for corpus in train dev test 26 | do 27 | scripts/bpe/apply_bpe.py -c ${data_dir}/bpe.joint.${ext} --vocabulary ${data_dir}/bpe-vocab.${ext} --vocabulary-threshold 10 < ${data_dir}/${corpus}.${ext} > ${data_dir}/${corpus}.jsub.${ext} 28 | done 29 | done 30 | 31 | cat ${data_dir}/train.jsub.{en,de} > ${data_dir}/train.jsub.concat 32 | scripts/prepare-data.py ${data_dir}/train jsub.en jsub.de ${data_dir} --mode vocab --vocab-size 0 33 | scripts/prepare-data.py ${data_dir}/train.jsub concat ${data_dir} --mode vocab --vocab-size 0 34 | mv ${data_dir}/vocab.concat ${data_dir}/vocab.joint.jsub.en 35 | cp ${data_dir}/vocab.joint.jsub.{en,de} 36 | rm ${data_dir}/train.jsub.concat 37 | 38 | cp ${data_dir}/train.en ${data_dir}/train.char.en 39 | cp ${data_dir}/train.de ${data_dir}/train.char.de 40 | cp ${data_dir}/dev.en ${data_dir}/dev.char.en 41 | cp ${data_dir}/dev.de ${data_dir}/dev.char.de 42 | 43 | wget http://opus.nlpl.eu/download/TED2013/mono/TED2013.en.gz -O ${data_dir}/TED2013.en.gz 44 | #wget http://opus.nlpl.eu/download/OpenSubtitles2018/mono/OpenSubtitles2018.de.gz -O ${data_dir}/OpenSubtitles2018.de.gz 45 | #wget http://opus.nlpl.eu/download/OpenSubtitles2018/mono/OpenSubtitles2018.en.gz -O ${data_dir}/OpenSubtitles2018.en.gz 46 | 47 | function filter { 48 | filename=`mktemp` 49 | cat > ${filename} << EOF 50 | import sys 51 | lines = set(list(open('${data_dir}/dev.$1')) + list(open('${data_dir}/test.$1'))) 52 | for line in sys.stdin: 53 | if line not in lines: 54 | sys.stdout.write(line) 55 | EOF 56 | python3 ${filename} 57 | rm ${filename} 58 | } 59 | 60 | gunzip ${data_dir}/TED2013.en.gz --stdout | scripts/moses/lowercase.perl | filter en > ${data_dir}/TED.en 61 | rm ${data_dir}/TED2013.en.gz 62 | #gunzip ${data_dir}/OpenSubtitles2018.de.gz --stdout | scripts/moses/lowercase.perl | filter de > ${data_dir}/OpenSubtitles.de 63 | #gunzip ${data_dir}/OpenSubtitles2018.en.gz --stdout | 
scripts/moses/lowercase.perl | filter en > ${data_dir}/OpenSubtitles.en 64 | -------------------------------------------------------------------------------- /config/IWSLT14/train-SMT.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | data_dir=data/IWSLT14 6 | model_dir=models/IWSLT14 7 | train_script=scripts/train-moses.sh 8 | 9 | # model_dir data_dir corpus dev_corpus src_ext trg_ext lm_corpus lm_order 10 | ${train_script} ${model_dir}/SMT ${data_dir} train dev de en train 3 11 | ${train_script} ${model_dir}/SMT_subwords ${data_dir} train.jsub dev.jsub de en train.jsub 3 12 | 13 | cat ${data_dir}/{train,TED}.en > ${data_dir}/train+TED.en 14 | ${train_script} ${model_dir}/SMT_LM ${data_dir} train dev de en train+TED 3 15 | cat ${data_dir}/{train,TED}.jsub.en > ${data_dir}/train+TED.jsub.en 16 | ${train_script} ${model_dir}/SMT_LM_subwords ${data_dir} train.jsub dev.jsub de en train+TED.jsub 3 17 | 18 | ${train_script} ${model_dir}/SMT_huge_LM ${data_dir} train dev de en OpenSubtitles 5 19 | 20 | -------------------------------------------------------------------------------- /config/LibriSpeech/ASR.yaml: -------------------------------------------------------------------------------- 1 | label: 'LibriSpeech ASR' 2 | description: "Character-Level Automatic Speech Recognition on LibriSpeech" 3 | 4 | data_dir: data/LibriSpeech 5 | model_dir: models/LibriSpeech/ASR 6 | max_train_size: 20000 7 | 8 | batch_size: 32 9 | weight_scale: null 10 | 11 | steps_per_checkpoint: 1000 12 | steps_per_eval: 1000 13 | max_steps: 500000 14 | score_function: corpus_scores_wer 15 | 16 | cell_size: 256 17 | attn_size: 256 18 | cell_type: LSTM 19 | 20 | encoders: 21 | - name: speech.en 22 | ext: npz 23 | embedding_size: 41 24 | layers: 3 25 | conv_filters: [16, 16] 26 | conv_size: [3, 3] 27 | conv_strides: [2, 2] 28 | conv_activation: null 29 | binary: True 30 | max_len: 1400 31 | input_layers: [256, 128] 32 | bidir_projection: True 33 | final_state: average 34 | train_initial_states: False 35 | input_layer_dropout: 0.2 36 | 37 | decoders: 38 | - name: char.en 39 | conditional_rnn: True 40 | pred_deep_layer: True 41 | character_level: True 42 | use_previous_word: True 43 | embedding_size: 128 44 | max_len: 300 45 | 46 | use_dropout: True 47 | pervasive_dropout: True 48 | attn_dropout: 0.2 49 | rnn_input_dropout: 0.2 50 | initial_state_dropout: 0.2 51 | -------------------------------------------------------------------------------- /config/LibriSpeech/AST.yaml: -------------------------------------------------------------------------------- 1 | label: 'LibriSpeech AST' 2 | description: "Character-Level Automatic Speech Translation on LibriSpeech" 3 | 4 | data_dir: data/LibriSpeech 5 | model_dir: models/LibriSpeech/AST 6 | max_train_size: 20000 7 | train_prefix: train+google 8 | 9 | batch_size: 32 10 | weight_scale: null 11 | 12 | steps_per_checkpoint: 1000 13 | steps_per_eval: 1000 14 | max_steps: 500000 15 | score_function: corpus_scores_bleu 16 | 17 | cell_size: 512 18 | attn_size: 512 19 | cell_type: LSTM 20 | 21 | encoders: 22 | - name: speech.en 23 | ext: npz 24 | embedding_size: 41 25 | layers: 3 26 | cell_size: 256 27 | conv_filters: [16, 16] 28 | conv_size: [3, 3] 29 | conv_strides: [2, 2] 30 | conv_activation: null 31 | binary: True 32 | max_len: 1400 33 | input_layers: [256, 128] 34 | bidir_projection: False 35 | final_state: average 36 | train_initial_states: False 37 | input_layer_dropout: 0.2 38 | 39 | decoders: 40 | - 
name: char.fr 41 | conditional_rnn: True 42 | pred_deep_layer: True 43 | character_level: True 44 | use_previous_word: True 45 | embedding_size: 128 46 | max_len: 300 47 | 48 | use_dropout: True 49 | pervasive_dropout: True 50 | attn_dropout: 0.2 51 | rnn_input_dropout: 0.2 52 | initial_state_dropout: 0.2 53 | -------------------------------------------------------------------------------- /config/LibriSpeech/MT.yaml: -------------------------------------------------------------------------------- 1 | label: 'LibriSpeech MT' 2 | description: "Character-Level Machine Translation on LibriSpeech" 3 | 4 | data_dir: data/LibriSpeech 5 | model_dir: models/LibriSpeech/MT 6 | max_train_size: 20000 7 | train_prefix: train+google 8 | 9 | batch_size: 64 10 | weight_scale: null 11 | 12 | steps_per_checkpoint: 1000 13 | steps_per_eval: 1000 14 | max_steps: 100000 15 | score_function: corpus_scores_bleu 16 | 17 | cell_size: 512 18 | attn_size: 512 19 | cell_type: LSTM 20 | 21 | encoders: 22 | - name: sub.en 23 | embedding_size: 256 24 | max_len: 60 25 | bidir_projection: True 26 | final_state: average 27 | train_initial_states: True 28 | embedding_dropout: 0.2 29 | 30 | decoders: 31 | - name: char.fr 32 | conditional_rnn: True 33 | pred_deep_layer: True 34 | character_level: True 35 | use_previous_word: True 36 | embedding_size: 128 37 | max_len: 400 38 | word_dropout: 0.2 39 | 40 | use_dropout: True 41 | pervasive_dropout: True 42 | attn_dropout: 0.2 43 | rnn_input_dropout: 0.2 44 | rnn_output_dropout: 0.2 45 | initial_state_dropout: 0.2 46 | -------------------------------------------------------------------------------- /config/LibriSpeech/Multi-Task.yaml: -------------------------------------------------------------------------------- 1 | label: 'LibriSpeech Multi-Task' 2 | description: "Multi-Task training of AST, MT and ASR models on LibriSpeech" 3 | 4 | data_dir: data/LibriSpeech 5 | model_dir: models/LibriSpeech/AST_multitask 6 | max_train_size: 10000 7 | 8 | batch_size: 32 9 | weight_scale: null 10 | 11 | steps_per_checkpoint: 500 12 | steps_per_eval: 500 13 | max_steps: 500000 14 | 15 | cell_size: 512 16 | attn_size: 512 17 | cell_type: LSTM 18 | 19 | conv_filters: [16, 16] 20 | conv_size: [3, 3] 21 | conv_strides: [2, 2] 22 | conv_activation: null 23 | final_state: average 24 | train_initial_states: False 25 | bidir_projection: False 26 | input_layers: [256, 128] 27 | 28 | conditional_rnn: True 29 | pred_deep_layer: True 30 | use_previous_word: True 31 | embedding_size: 128 32 | 33 | tasks: 34 | - name: AST 35 | score_function: corpus_scores_bleu 36 | train_prefix: train+google 37 | ratio: 0.6 38 | 39 | encoders: 40 | - name: speech.en 41 | ext: npz 42 | embedding_size: 41 43 | layers: 3 44 | cell_size: 256 45 | binary: True 46 | max_len: 1400 47 | decoders: 48 | - name: char.fr 49 | character_level: True 50 | max_len: 300 51 | 52 | - name: ASR 53 | score_function: corpus_scores_wer 54 | train_prefix: train 55 | ratio: 0.2 56 | 57 | encoders: 58 | - name: speech.en 59 | ext: npz 60 | embedding_size: 41 61 | layers: 3 62 | cell_size: 256 63 | attn_size: 256 64 | binary: True 65 | bidir_projection: True 66 | max_len: 1400 67 | decoders: 68 | - name: char.en 69 | cell_size: 256 70 | character_level: True 71 | max_len: 300 72 | 73 | - name: MT 74 | score_function: corpus_scores_bleu 75 | train_prefix: train+google 76 | ratio: 0.2 77 | 78 | encoders: 79 | - name: sub.en 80 | embedding_size: 256 81 | bidir_projection: True 82 | train_initial_states: True 83 | input_layers: null 84 | conv_filters: 
null 85 | max_len: 60 86 | decoders: 87 | - name: char.fr 88 | character_level: True 89 | max_len: 300 90 | 91 | 92 | use_dropout: True 93 | pervasive_dropout: True 94 | attn_dropout: 0.2 95 | rnn_input_dropout: 0.2 96 | initial_state_dropout: 0.2 97 | input_layer_dropout: 0.2 98 | -------------------------------------------------------------------------------- /config/LibriSpeech/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Augmented LibriSpeech 3 | 4 | The raw corpus can be downloaded [here](https://persyval-platform.univ-grenoble-alpes.fr/DS91/detaildataset). It consists of an automatic alignment of the [LibriSpeech ASR corpus](http://www.openslr.org/12/) (English audio with transcriptions) with [Project Gutenberg](https://www.gutenberg.org/), which distributes public domain e-books in many languages. 5 | The scripts that were used for the alignment are freely available [here](https://github.com/alicank/Translation-Augmented-LibriSpeech-Corpus). 6 | 7 | The pre-processed corpus (with MFCCs) is available [here](https://drive.google.com/open?id=15ZwzXe_FEx-K7yn6ZVksrUc0QWV072Xt). If you want to use it to train new models, you should extract it as `data/LibriSpeech`. Then, you can train a new model using the configuration files inside `config/LibriSpeech`. For example: 8 | 9 | ./seq2seq.sh config/LibriSpeech/AST.yaml --train -v --purge 10 | 11 | If you want to do your own pre-processing, you can use [this corpus](https://drive.google.com/open?id=1n6r-gkTPooK8oEWjllv1i5vO3ZWHkRNe). The audio files are grouped into tar archives for convenience. The scripts `scripts/speech/extract.py` and `scripts/speech/extract-new.py` directly take such a tar archive as input, and output a numpy binary file containing the extracted features. The text files are not pre-processed and should be tokenized and optionally lowercased before training (a short sketch is given at the end of this README). 12 | 13 | ## Trained models 14 | 15 | You can download some pre-trained models on Augmented LibriSpeech [here](https://drive.google.com/open?id=1QUS7VjaaFouBX7HNAl05vzKLzlzkZvcY). 16 | This archive should be extracted inside `models/`. Then, to decode the test set using a model, e.g., `AST.1`, do: 17 | 18 | ./seq2seq.sh models/LibriSpeech/AST.1/config.yaml --decode models/LibriSpeech/data/test.npz 19 | 20 | The directory `models/LibriSpeech/eval-outputs` contains all the outputs of our pre-trained models on the test and dev sets. The `models/LibriSpeech/eval.log` file contains the commands that were used for the evaluation, along with the obtained scores. Each model has a `config.yaml` file that can be used to run it or re-train it. The config files of the most important models are also available inside `config/LibriSpeech/`.
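As mentioned above, here is a rough sketch of this pre-processing step (the paths below are placeholders; the full preparation pipeline is in `config/LibriSpeech/prepare.sh`):

    # extract speech features (MFCCs) from a tar archive of wav files into a numpy archive
    scripts/speech/extract.py raw_data/LibriSpeech/train.tar data/LibriSpeech/train.npz
    # tokenize and lowercase the corresponding text files
    scripts/prepare-data.py raw_data/LibriSpeech/train fr en data/LibriSpeech --lowercase --mode prepare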
21 | -------------------------------------------------------------------------------- /config/LibriSpeech/model-outputs.tar.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alex-berard/seq2seq/1b5c6bf19a39ef27c059811b85a061f20d68ac32/config/LibriSpeech/model-outputs.tar.xz -------------------------------------------------------------------------------- /config/LibriSpeech/prepare-raw.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # first download the Augmented LibriSpeech zip files inside `archive_dir` 4 | archive_dir=raw_data/LibriSpeech/archives 5 | raw_data=raw_data/LibriSpeech 6 | mkdir -p ${raw_data} 7 | 8 | #unzip -q ${archive_dir}/dev.zip -d ${raw_data} 9 | #unzip -q ${archive_dir}/test.zip -d ${raw_data} 10 | #unzip -q ${archive_dir}/train_100h.zip -d ${raw_data} 11 | #unzip -q ${archive_dir}/train_130h_additional.zip -d ${raw_data} 12 | #unzip -q ${archive_dir}/database.zip -d ${raw_data} 13 | 14 | function clean-dash { 15 | perl -pe 's/^(-+)([^\s-])/$1 $2/g' 16 | } 17 | function clean-quotes { 18 | perl -pe "s/\"\s*\"/\"/g" 19 | } 20 | 21 | for corpus in dev test train other 22 | do 23 | cp ${raw_data}/${corpus}/${corpus}.en ${raw_data} 24 | if [ ${corpus} = train ] || [ ${corpus} = other ] 25 | then 26 | cat ${raw_data}/${corpus}/${corpus}.fr | clean-quotes | clean-dash > ${raw_data}/${corpus}.fr 27 | else 28 | cat ${raw_data}/${corpus}/${corpus}.fr | clean-dash > ${raw_data}/${corpus}.fr 29 | fi 30 | cat ${raw_data}/${corpus}/${corpus}_gtranslate.fr | clean-quotes > ${raw_data}/${corpus}.google.fr 31 | 32 | rm -f ${raw_data}/${corpus}.orig.en 33 | 34 | alignments=${raw_data}/${corpus}/alignments.meta 35 | database=${raw_data}/TA-LibriSpeechCorpus.db 36 | var=1 37 | lines=`tail -n+2 ${alignments} | wc -l` 38 | len=`python -c "import math; print(1 + int(math.log10(${lines})))"` 39 | python3 -c "import sqlite3; conn = sqlite3.connect('${database}'); c = conn.cursor(); c.execute('SELECT audio_filename, source_segment FROM alignments'); d = dict(c.fetchall()); print('\n'.join(d[x.split()[-2]].strip() for x in open('${alignments}').readlines()[1:]))" | clean-quotes > ${raw_data}/${corpus}.orig.en 40 | 41 | for filename in `tail -n+2 ${alignments} | cut -f5,5` 42 | do 43 | name=`printf "%0${len}d" ${var}` 44 | #cp ${raw_data}/${corpus}/audiofiles/${filename}.wav ${raw_data}/${corpus}/${name}.wav 45 | ((var++)); 46 | done 47 | 48 | #rm -r ${raw_data}/${corpus}/audiofiles 49 | find raw_data/LibriSpeech/${corpus}/ -maxdepth 1 -name "*.wav" > /tmp/files.txt 50 | tar -cf ${raw_data}/${corpus}.tar -T /tmp/files.txt 51 | #rm -r ${raw_data}/${corpus} 52 | done 53 | 54 | sed -i '1743,1744d' ${raw_data}/test.fr 55 | sed -i '1743,1744d' ${raw_data}/test.en 56 | -------------------------------------------------------------------------------- /config/LibriSpeech/prepare.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | data_dir=data/LibriSpeech 4 | raw_data=raw_data/LibriSpeech 5 | mkdir -p ${data_dir} 6 | 7 | scripts/prepare-data.py ${raw_data}/train fr en google.fr ${data_dir} --lowercase --no-tokenize en \ 8 | --dev-corpus ${raw_data}/dev --test-corpus ${raw_data}/test --normalize-punk fr google.fr --lang fr en fr \ 9 | --mode prepare 10 | 11 | scripts/speech/extract.py ${raw_data}/train.tar ${data_dir}/train.npz 12 | scripts/speech/extract.py ${raw_data}/dev.tar ${data_dir}/dev.npz 13 | 
scripts/speech/extract.py ${raw_data}/test.tar ${data_dir}/test.npz 14 | scripts/speech/extract.py ${raw_data}/other.tar ${data_dir}/other.npz 15 | 16 | cat ${data_dir}/{train,train.google}.fr > ${data_dir}/train+google.fr 17 | cat ${data_dir}/{train,train}.en > ${data_dir}/train+google.en 18 | scripts/speech/cat.py ${data_dir}/{train,train}.npz ${data_dir}/train+google.npz 19 | scripts/speech/shuf.py ${data_dir}/train+google.npz --input-txt ${data_dir}/train+google.{fr,en} 20 | scripts/speech/shuf.py ${data_dir}/train.npz --input-txt ${data_dir}/train.{fr,en,google.fr} 21 | 22 | # prepare BPE 23 | scripts/bpe/learn_bpe.py -i ${data_dir}/train.en -s 30000 -o ${data_dir}/bpe.en 24 | 25 | # apply BPE 26 | scripts/prepare-data.py ${data_dir}/train en ${data_dir} --no-tokenize --subwords --bpe-path ${data_dir}/bpe \ 27 | --output train.sub --dev-prefix dev.sub --test-prefix test.sub --vocab-prefix vocab.sub \ 28 | --dev-corpus ${data_dir}/dev --test-corpus ${data_dir}/test 29 | 30 | scripts/prepare-data.py ${data_dir}/train+google en ${data_dir} --no-tokenize --subwords \ 31 | --bpe-path ${data_dir}/bpe --output train+google.sub --mode prepare 32 | 33 | # prepare word-level vocabs 34 | scripts/prepare-data.py ${data_dir}/train+google fr en ${data_dir} --mode vocab --vocab-size 30000 35 | 36 | # prepare character-level vocabs 37 | scripts/prepare-data.py ${data_dir}/train+google fr en ${data_dir} --mode vocab --character-level \ 38 | --vocab-size 200 --vocab-prefix vocab.char 39 | 40 | for corpus in train+google train dev test 41 | do 42 | cp ${data_dir}/${corpus}.fr ${data_dir}/${corpus}.char.fr 43 | cp ${data_dir}/${corpus}.en ${data_dir}/${corpus}.char.en 44 | done 45 | 46 | -------------------------------------------------------------------------------- /config/WMT14/RNNsearch-Adam.yaml: -------------------------------------------------------------------------------- 1 | label: "RNNsearch + Adam" 2 | description: "Same config as RNNsearch (Bahdanau 2014), with Adam instead of AdaDelta" 3 | 4 | cell_size: 1000 5 | attn_size: 1000 6 | embedding_size: 620 7 | cell_type: GRU 8 | 9 | data_dir: data/WMT14 10 | max_len: 50 11 | model_dir: models/WMT14/RNNsearch_Adam 12 | max_train_size: 1000000 13 | 14 | steps_per_checkpoint: 10000 15 | steps_per_eval: 10000 16 | keep_best: 1 17 | max_to_keep: 1 18 | score_function: corpus_bleu 19 | 20 | optimizer: adam 21 | learning_rate: 0.0002 22 | batch_size: 80 23 | batch_mode: standard 24 | shuffle: False 25 | read_ahead: 20 26 | max_gradient_norm: 1.0 27 | max_epochs: 10 28 | learning_rate_decay_factor: 0.5 29 | decay_every_n_epoch: 0.5 30 | 31 | attention_type: global 32 | final_state: last 33 | 34 | weight_scale: 0.01 35 | 36 | encoders: 37 | - name: en 38 | train_initial_states: False 39 | 40 | decoders: 41 | - name: fr 42 | 43 | generate_first: False 44 | orthogonal_init: True 45 | -------------------------------------------------------------------------------- /config/WMT14/RNNsearch-BPE.yaml: -------------------------------------------------------------------------------- 1 | label: "RNNsearch + BPE" 2 | description: "Same config as RNNsearch (Bahdanau 2014), with Adam, a Cond-GRU decoder and BPE units" 3 | 4 | cell_size: 1000 5 | attn_size: 1000 6 | embedding_size: 620 7 | cell_type: GRU 8 | 9 | data_dir: data/WMT14 10 | max_len: 60 11 | model_dir: models/WMT14/RNNsearch_BPE 12 | max_train_size: 1000000 13 | 14 | steps_per_checkpoint: 10000 15 | steps_per_eval: 10000 16 | keep_best: 1 17 | max_to_keep: 1 18 | score_function: corpus_bleu 19 | 20 | 
optimizer: adam 21 | learning_rate: 0.0002 22 | batch_size: 80 23 | batch_mode: standard 24 | read_ahead: 20 25 | max_gradient_norm: 1.0 26 | max_epochs: 5 27 | learning_rate_decay_factor: 0.5 28 | decay_every_n_epoch: 0.5 29 | 30 | attention_type: global 31 | final_state: last_both 32 | 33 | weight_scale: 0.01 34 | 35 | encoders: 36 | - name: joint 37 | ext: jsub.en 38 | 39 | decoders: 40 | - name: joint 41 | ext: jsub.fr 42 | conditional_rnn: True 43 | pred_deep_layer: True 44 | 45 | orthogonal_init: True 46 | -------------------------------------------------------------------------------- /config/WMT14/RNNsearch.yaml: -------------------------------------------------------------------------------- 1 | label: "RNNsearch" 2 | description: "Baseline WMT14 model, exact same config as Bahdanau et al. 2014" 3 | 4 | cell_size: 1000 5 | attn_size: 1000 6 | embedding_size: 620 7 | cell_type: GRU 8 | 9 | data_dir: data/WMT14 10 | max_len: 50 11 | model_dir: models/WMT14/RNNsearch 12 | max_train_size: 1000000 13 | 14 | steps_per_checkpoint: 10000 15 | steps_per_eval: 10000 16 | keep_best: 1 17 | max_to_keep: 1 18 | score_function: corpus_bleu 19 | 20 | optimizer: adadelta 21 | learning_rate: 1.0 22 | batch_size: 80 23 | batch_mode: standard 24 | shuffle: False 25 | read_ahead: 20 26 | max_gradient_norm: 1.0 27 | max_epochs: 10 28 | 29 | attention_type: global 30 | final_state: last 31 | 32 | weight_scale: 0.01 33 | 34 | encoders: 35 | - name: en 36 | train_initial_states: False 37 | 38 | decoders: 39 | - name: fr 40 | 41 | generate_first: False 42 | orthogonal_init: True 43 | -------------------------------------------------------------------------------- /config/WMT14/download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | raw_data=raw_data/WMT14 4 | 5 | mkdir -p ${raw_data} 6 | cur_dir=`pwd` 7 | cd ${raw_data} 8 | 9 | wget "http://www-lium.univ-lemans.fr/~schwenk/nnmt-shared-task/data/bitexts.tgz" 10 | tar xzf bitexts.tgz 11 | gunzip bitexts.selected/* 12 | cat bitexts.selected/{ep7_pc45,nc9,dev08_11,crawl,ccb2_pc30,un2000_pc34}.en > WMT14.fr-en.en 13 | cat bitexts.selected/{ep7_pc45,nc9,dev08_11,crawl,ccb2_pc30,un2000_pc34}.fr > WMT14.fr-en.fr 14 | rm -rf bitexts.selected 15 | 16 | wget "http://www-lium.univ-lemans.fr/~schwenk/nnmt-shared-task/data/dev+test.tgz" 17 | tar xzf dev+test.tgz 18 | rename s@dev/ntst1213@ntst1213.fr-en@ dev/* 19 | rename s@dev/ntst14@ntst14.fr-en@ dev/* 20 | rmdir dev 21 | 22 | rm bitexts.tgz dev+test.tgz 23 | 24 | cd ${cur_dir} 25 | -------------------------------------------------------------------------------- /config/WMT14/prepare-lexicon.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | data_dir=data/WMT14 6 | 7 | rm -rf fast_align-master 8 | 9 | wget https://github.com/clab/fast_align/archive/master.zip 10 | unzip master.zip 11 | rm master.zip 12 | cd fast_align-master 13 | mkdir build 14 | cd build 15 | cmake .. 16 | make 17 | cd ../.. 
18 | 19 | corpus=train 20 | fast_align=fast_align-master/build 21 | 22 | scripts/join.py ${data_dir}/${corpus}.{en,fr} > ${data_dir}/${corpus}.en-fr 23 | ${fast_align}/fast_align -i ${data_dir}/${corpus}.en-fr -d -o -v > ${data_dir}/${corpus}.forward.align 24 | ${fast_align}/fast_align -i ${data_dir}/${corpus}.en-fr -d -o -v -r > ${data_dir}/${corpus}.reverse.align 25 | ${fast_align}/atools -i ${data_dir}/${corpus}.forward.align -j ${data_dir}/${corpus}.reverse.align -c grow-diag-final-and > ${data_dir}/${corpus}.align 26 | 27 | scripts/extract-lexicon.py ${data_dir}/${corpus}.{en,fr,align} > ${data_dir}/${corpus}.lexicon 28 | python3 -c "print('\n'.join(line.rstrip() for line in open('${data_dir}/${corpus}.lexicon') if not line[0].isupper() and not line.split()[0] == line.split()[1]))" > ${data_dir}/${corpus}.lexicon.purged 29 | 30 | rm -rf fast_align-master 31 | rm ${data_dir}/${corpus}.en-fr 32 | rm ${data_dir}/*.align 33 | -------------------------------------------------------------------------------- /config/WMT14/prepare.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Filtered WMT14 data, available on http://www-lium.univ-lemans.fr/~schwenk/nnmt-shared-task/ 4 | 5 | raw_data=raw_data/WMT14 6 | data_dir=data/WMT14 7 | 8 | rm -rf ${data_dir} 9 | mkdir -p ${data_dir} 10 | 11 | scripts/prepare-data.py ${raw_data}/WMT14.fr-en fr en ${data_dir} --no-tokenize \ 12 | --dev-corpus ${raw_data}/ntst1213.fr-en \ 13 | --test-corpus ${raw_data}/ntst14.fr-en \ 14 | --vocab-size 30000 --shuffle --seed 1234 15 | 16 | cat ${raw_data}/WMT14.fr-en.{fr,en} > ${data_dir}/train.concat 17 | scripts/bpe/learn_bpe.py -i ${data_dir}/train.concat -o ${data_dir}/bpe.joint -s 30000 18 | cp ${data_dir}/bpe.joint ${data_dir}/bpe.joint.fr 19 | cp ${data_dir}/bpe.joint ${data_dir}/bpe.joint.en 20 | rm ${data_dir}/train.concat 21 | 22 | scripts/prepare-data.py ${raw_data}/WMT14.fr-en fr en ${data_dir} --no-tokenize \ 23 | --subwords --bpe-path ${data_dir}/bpe.joint \ 24 | --dev-corpus ${raw_data}/ntst1213.fr-en --dev-prefix dev.jsub \ 25 | --test-corpus ${raw_data}/ntst14.fr-en --test-prefix test.jsub \ 26 | --shuffle --seed 1234 --output train.jsub --mode prepare 27 | 28 | cat ${data_dir}/train.jsub.{fr,en} > ${data_dir}/train.concat.jsub 29 | scripts/prepare-data.py ${data_dir}/train concat.jsub ${data_dir} --vocab-size 0 --mode vocab 30 | cp ${data_dir}/vocab.concat.jsub ${data_dir}/vocab.jsub.fr 31 | cp ${data_dir}/vocab.concat.jsub ${data_dir}/vocab.jsub.en 32 | rm ${data_dir}/*.concat.* 33 | 34 | -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | root_dir=`pwd` 4 | 5 | /usr/bin/env pip3 install tensorflow-gpu python-dateutil pyyaml matplotlib --user --upgrade 6 | 7 | cat >>~/.bashrc << EOL 8 | export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/cuda/lib64 9 | alias get-best-score=${root_dir}/scripts/get-best-score.py 10 | alias plot-loss=${root_dir}/scripts/plot-loss.py 11 | alias multi-print=${root_dir}/scripts/multi-print.py 12 | alias copy-model=${root_dir}/scripts/copy-model.py 13 | EOL 14 | -------------------------------------------------------------------------------- /run-tests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import subprocess 3 | import shlex 4 | import re 5 | import os 6 | import argparse 7 | 8 | 
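# ANSI escape codes used to colorize test results in the terminal (green for success, red for failure)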
OKGREEN = '\033[92m' 9 | FAIL = '\033[91m' 10 | ENDC = '\033[0m' 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--no-gpu', action='store_true') 14 | parser.add_argument('--gpu-id', type=int) 15 | parser.add_argument('dirs', nargs='*') 16 | args = parser.parse_args() 17 | 18 | extra_params = [] 19 | if args.no_gpu: 20 | extra_params.append('--no-gpu') 21 | if args.gpu_id is not None: 22 | extra_params += ['--gpu-id', str(args.gpu_id)] 23 | 24 | def failure(message): 25 | print('{}failure: {}{}'.format(FAIL, message, ENDC)) 26 | def success(message): 27 | print('{}success: {}{}'.format(OKGREEN, message, ENDC)) 28 | 29 | log_file = os.path.join('tests', 'log.txt') 30 | 31 | try: 32 | os.remove(log_file) 33 | except FileNotFoundError: 34 | pass 35 | 36 | 37 | def get_best_score(log_file): 38 | scores = [] 39 | with open(log_file) as f: 40 | for line in f: 41 | score_ = re.search(r' (score|bleu|ter|loss|cer|wer|bleu1)=(.*?) ', line + ' ') 42 | 43 | if score_: 44 | scores.append(float(score_.group(2))) 45 | 46 | if len(scores) == 0: 47 | return None 48 | elif len(scores) == 1: 49 | return scores[0] 50 | elif scores[0] <= scores[-1]: 51 | return max(scores) 52 | else: 53 | return min(scores) 54 | 55 | 56 | def run(dir_, score=None): 57 | config_file = os.path.join(dir_, 'config.yaml') 58 | log_file_ = os.path.join(dir_, 'log.txt') 59 | name = os.path.basename(dir_) 60 | 61 | if score is None: 62 | try: 63 | score = get_best_score(log_file_) 64 | except: 65 | pass 66 | 67 | print('Running {}'.format(name)) 68 | 69 | try: 70 | output = subprocess.check_output(['./seq2seq.sh', config_file, '--eval'] + extra_params, 71 | stderr=subprocess.STDOUT).decode() 72 | except subprocess.CalledProcessError as e: 73 | output = e.output.decode() 74 | 75 | scores = output.strip().split('\n')[-1] + ' ' 76 | score_ = re.search(r' (score|bleu|ter|loss|cer|wer|bleu1)=(.*?) ', scores) 77 | 78 | with open(log_file, 'a') as f: 79 | f.write(output) 80 | 81 | if not score_: 82 | failure('unable to run test (see log file)') 83 | else: 84 | score_ = float(score_.group(2)) 85 | if score is None: 86 | success('obtained {}'.format(score_)) 87 | elif score_ == score: 88 | success('scores matching ({})'.format(score_)) 89 | else: 90 | failure('obtained {}, expected {}'.format(score_, score)) 91 | 92 | 93 | if not args.dirs: 94 | dirs = [os.path.join('tests', name) for name in os.listdir('tests')] 95 | else: 96 | dirs = args.dirs 97 | 98 | for path in dirs: 99 | if os.path.isdir(path): 100 | run(path) 101 | 102 | -------------------------------------------------------------------------------- /scripts/bpe/bpe_toy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | """Use byte pair encoding (BPE) to learn a variable-length encoding of the vocabulary in a text. 6 | Unlike the original BPE, it does not compress the plain text, but can be used to reduce the vocabulary 7 | of a text to a configurable number of symbols, with only a small increase in the number of tokens. 8 | This is an (inefficient) toy implementation that shows the algorithm. For processing large datasets, 9 | indexing and incremental updates can be used to speed up the implementation (see learn_bpe.py). 10 | 11 | Reference: 12 | Rico Sennrich, Barry Haddow and Alexandra Birch (2016). Neural Machine Translation of Rare Words with Subword Units. 
13 | Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany. 14 | """ 15 | 16 | 17 | import re 18 | import sys 19 | import collections 20 | 21 | def get_stats(vocab): 22 | pairs = collections.defaultdict(int) 23 | for word, freq in vocab.items(): 24 | symbols = word.split() 25 | for i in range(len(symbols)-1): 26 | pairs[symbols[i],symbols[i+1]] += freq 27 | return pairs 28 | 29 | def merge_vocab(pair, v_in): 30 | v_out = {} 31 | bigram_pattern = re.escape(' '.join(pair)) 32 | p = re.compile(r'(?<!\S)' + bigram_pattern + r'(?!\S)') 33 | for word in v_in: 34 | w_out = p.sub(''.join(pair), word) 35 | v_out[w_out] = v_in[word] 36 | return v_out 37 | 38 | vocab = {'l o w</w>' : 5, 'l o w e r</w>' : 2, 39 | 'n e w e s t</w>' : 6, 'w i d e s t</w>' : 3} 40 | num_merges = 15 41 | for i in range(num_merges): 42 | pairs = get_stats(vocab) 43 | try: 44 | best = max(pairs, key=pairs.get) 45 | except ValueError: 46 | break 47 | if pairs[best] < 2: 48 | sys.stderr.write('no pair has frequency > 1. Stopping\n') 49 | break 50 | vocab = merge_vocab(best, vocab) 51 | print(best) 52 | -------------------------------------------------------------------------------- /scripts/bpe/chrF.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | """Compute chrF3 for machine translation evaluation 6 | 7 | Reference: 8 | Maja Popović (2015). chrF: character n-gram F-score for automatic MT evaluation. In Proceedings of the Tenth Workshop on Statistical Machine Translationn, pages 392–395, Lisbon, Portugal. 9 | """ 10 | 11 | from __future__ import print_function, unicode_literals, division 12 | import sys 13 | import codecs 14 | import io 15 | import argparse 16 | from collections import defaultdict 17 | 18 | # hack for python2/3 compatibility 19 | from io import open 20 | argparse.open = open 21 | 22 | # python 2/3 compatibility 23 | if sys.version_info < (3, 0): 24 | sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) 25 | sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) 26 | sys.stdin = codecs.getreader('UTF-8')(sys.stdin) 27 | 28 | 29 | def create_parser(): 30 | parser = argparse.ArgumentParser( 31 | formatter_class=argparse.RawDescriptionHelpFormatter, 32 | description="learn BPE-based word segmentation") 33 | 34 | parser.add_argument( 35 | '--ref', '-r', type=argparse.FileType('r'), required=True, 36 | metavar='PATH', 37 | help="Reference file") 38 | parser.add_argument( 39 | '--hyp', type=argparse.FileType('r'), metavar='PATH', 40 | default=sys.stdin, 41 | help="Hypothesis file (default: stdin).") 42 | parser.add_argument( 43 | '--beta', '-b', type=float, default=3, 44 | metavar='FLOAT', 45 | help="beta parameter (default: '%(default)s')") 46 | parser.add_argument( 47 | '--ngram', '-n', type=int, default=6, 48 | metavar='INT', 49 | help="ngram order (default: '%(default)s')") 50 | parser.add_argument( 51 | '--space', '-s', action='store_true', 52 | help="take spaces into account (default: '%(default)s')") 53 | parser.add_argument( 54 | '--precision', action='store_true', 55 | help="report precision (default: '%(default)s')") 56 | parser.add_argument( 57 | '--recall', action='store_true', 58 | help="report recall (default: '%(default)s')") 59 | 60 | return parser 61 | 62 | def extract_ngrams(words, max_length=4, spaces=False): 63 | 64 | if not spaces: 65 | words = ''.join(words.split()) 66 | else: 67 | words = words.strip() 68 | 69 | 
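    # results[n] maps each character (n+1)-gram (a tuple of characters) to its count in this sentence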
results = defaultdict(lambda: defaultdict(int)) 70 | for length in range(max_length): 71 | for start_pos in range(len(words)): 72 | end_pos = start_pos + length + 1 73 | if end_pos <= len(words): 74 | results[length][tuple(words[start_pos: end_pos])] += 1 75 | return results 76 | 77 | 78 | def get_correct(ngrams_ref, ngrams_test, correct, total): 79 | 80 | for rank in ngrams_test: 81 | for chain in ngrams_test[rank]: 82 | total[rank] += ngrams_test[rank][chain] 83 | if chain in ngrams_ref[rank]: 84 | correct[rank] += min(ngrams_test[rank][chain], ngrams_ref[rank][chain]) 85 | 86 | return correct, total 87 | 88 | 89 | def f1(correct, total_hyp, total_ref, max_length, beta=3, smooth=0): 90 | 91 | precision = 0 92 | recall = 0 93 | 94 | for i in range(max_length): 95 | if total_hyp[i] + smooth and total_ref[i] + smooth: 96 | precision += (correct[i] + smooth) / (total_hyp[i] + smooth) 97 | recall += (correct[i] + smooth) / (total_ref[i] + smooth) 98 | 99 | precision /= max_length 100 | recall /= max_length 101 | 102 | return (1 + beta**2) * (precision*recall) / ((beta**2 * precision) + recall), precision, recall 103 | 104 | def main(args): 105 | 106 | correct = [0]*args.ngram 107 | total = [0]*args.ngram 108 | total_ref = [0]*args.ngram 109 | for line in args.ref: 110 | line2 = args.hyp.readline() 111 | 112 | ngrams_ref = extract_ngrams(line, max_length=args.ngram, spaces=args.space) 113 | ngrams_test = extract_ngrams(line2, max_length=args.ngram, spaces=args.space) 114 | 115 | get_correct(ngrams_ref, ngrams_test, correct, total) 116 | 117 | for rank in ngrams_ref: 118 | for chain in ngrams_ref[rank]: 119 | total_ref[rank] += ngrams_ref[rank][chain] 120 | 121 | chrf, precision, recall = f1(correct, total, total_ref, args.ngram, args.beta) 122 | 123 | print('chrF3: {0:.4f}'.format(chrf)) 124 | if args.precision: 125 | print('chrPrec: {0:.4f}'.format(precision)) 126 | if args.recall: 127 | print('chrRec: {0:.4f}'.format(recall)) 128 | 129 | if __name__ == '__main__': 130 | 131 | parser = create_parser() 132 | args = parser.parse_args() 133 | 134 | main(args) 135 | -------------------------------------------------------------------------------- /scripts/bpe/concat-bpe.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument('vocab') 6 | parser.add_argument('bpe') 7 | 8 | 9 | def build_vocab(bpe_pairs): 10 | vocab = set() 11 | for a, b in bpe_pairs: 12 | words = [a, b, a + b] 13 | for word in words: 14 | if word.endswith('</w>'): 15 | vocab.add(word[:-4]) 16 | else: 17 | vocab.add(word + '@@') 18 | vocab.add(word) 19 | return vocab 20 | 21 | 22 | if __name__ == '__main__': 23 | args = parser.parse_args() 24 | 25 | with open(args.bpe) as bpe_file, open(args.vocab) as vocab_file: 26 | bpe_pairs = [line.split() for line in bpe_file] 27 | vocab = [line.strip() for line in vocab_file] 28 | 29 | bpe_vocab = build_vocab(bpe_pairs) 30 | 31 | for w in vocab: 32 | print(w) 33 | 34 | vocab = set(vocab) 35 | for w in bpe_vocab: 36 | if w not in vocab: 37 | print(w) 38 | -------------------------------------------------------------------------------- /scripts/bpe/get_vocab.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | from __future__ import print_function 3 | import sys 4 | from collections import Counter 5 | 6 | c = Counter() 7 | 8 | for line in sys.stdin: 9 | for word in line.split(): 10 | c[word] += 1 11 | 12 | for key,f in sorted(c.items(), key=lambda x: x[1], reverse=True): 13 | print(key+" "+ str(f)) 14 | -------------------------------------------------------------------------------- /scripts/bpe/learn_joint_bpe_and_vocab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | """Use byte pair encoding (BPE) to learn a variable-length encoding of the vocabulary in a text. 6 | This script learns BPE jointly on a concatenation of a list of texts (typically the source and target side of a parallel corpus, 7 | applies the learned operation to each and (optionally) returns the resulting vocabulary of each text. 8 | The vocabulary can be used in apply_bpe.py to avoid producing symbols that are rare or OOV in a training text. 9 | 10 | Reference: 11 | Rico Sennrich, Barry Haddow and Alexandra Birch (2016). Neural Machine Translation of Rare Words with Subword Units. 12 | Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany. 13 | """ 14 | 15 | from __future__ import unicode_literals 16 | 17 | import sys 18 | import os 19 | import codecs 20 | import argparse 21 | import tempfile 22 | from collections import Counter 23 | 24 | import learn_bpe 25 | import apply_bpe 26 | 27 | # hack for python2/3 compatibility 28 | from io import open 29 | argparse.open = open 30 | 31 | def create_parser(): 32 | parser = argparse.ArgumentParser( 33 | formatter_class=argparse.RawDescriptionHelpFormatter, 34 | description="learn BPE-based word segmentation") 35 | 36 | parser.add_argument( 37 | '--input', '-i', type=argparse.FileType('r'), required=True, nargs = '+', 38 | metavar='PATH', 39 | help="Input texts (multiple allowed).") 40 | parser.add_argument( 41 | '--output', '-o', type=argparse.FileType('w'), required=True, 42 | metavar='PATH', 43 | help="Output file for BPE codes.") 44 | parser.add_argument( 45 | '--symbols', '-s', type=int, default=10000, 46 | help="Create this many new symbols (each representing a character n-gram) (default: %(default)s))") 47 | parser.add_argument( 48 | '--separator', type=str, default='@@', metavar='STR', 49 | help="Separator between non-final subword units (default: '%(default)s'))") 50 | parser.add_argument( 51 | '--write-vocabulary', type=argparse.FileType('w'), nargs = '+', default=None, 52 | metavar='PATH', dest='vocab', 53 | help='Write to these vocabulary files after applying BPE. One per input text. 
Used for filtering in apply_bpe.py') 54 | parser.add_argument( 55 | '--min-frequency', type=int, default=2, metavar='FREQ', 56 | help='Stop if no symbol pair has frequency >= FREQ (default: %(default)s))') 57 | parser.add_argument( 58 | '--verbose', '-v', action="store_true", 59 | help="verbose mode.") 60 | 61 | return parser 62 | 63 | 64 | 65 | if __name__ == '__main__': 66 | 67 | # python 2/3 compatibility 68 | if sys.version_info < (3, 0): 69 | sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) 70 | sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) 71 | sys.stdin = codecs.getreader('UTF-8')(sys.stdin) 72 | else: 73 | sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer) 74 | sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer) 75 | sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer) 76 | 77 | parser = create_parser() 78 | args = parser.parse_args() 79 | 80 | if args.vocab and len(args.input) != len(args.vocab): 81 | sys.stderr.write('Error: number of input files and vocabulary files must match\n') 82 | sys.exit(1) 83 | 84 | # read/write files as UTF-8 85 | args.input = [codecs.open(f.name, encoding='UTF-8') for f in args.input] 86 | args.vocab = [codecs.open(f.name, 'w', encoding='UTF-8') for f in args.vocab] 87 | 88 | # get combined vocabulary of all input texts 89 | full_vocab = Counter() 90 | for f in args.input: 91 | full_vocab += learn_bpe.get_vocabulary(f) 92 | f.seek(0) 93 | 94 | vocab_list = ['{0} {1}'.format(key, freq) for (key, freq) in full_vocab.items()] 95 | 96 | # learn BPE on combined vocabulary 97 | with codecs.open(args.output.name, 'w', encoding='UTF-8') as output: 98 | learn_bpe.main(vocab_list, output, args.symbols, args.min_frequency, args.verbose, is_dict=True) 99 | 100 | with codecs.open(args.output.name, encoding='UTF-8') as codes: 101 | bpe = apply_bpe.BPE(codes, separator=args.separator) 102 | 103 | # apply BPE to each training corpus and get vocabulary 104 | for train_file, vocab_file in zip(args.input, args.vocab): 105 | 106 | tmp = tempfile.NamedTemporaryFile(delete=False) 107 | tmp.close() 108 | 109 | tmpout = codecs.open(tmp.name, 'w', encoding='UTF-8') 110 | 111 | train_file.seek(0) 112 | for line in train_file: 113 | tmpout.write(bpe.segment(line).strip()) 114 | tmpout.write('\n') 115 | 116 | tmpout.close() 117 | tmpin = codecs.open(tmp.name, encoding='UTF-8') 118 | 119 | vocab = learn_bpe.get_vocabulary(tmpin) 120 | tmpin.close() 121 | os.remove(tmp.name) 122 | 123 | for key, freq in sorted(vocab.items(), key=lambda x: x[1], reverse=True): 124 | vocab_file.write("{0} {1}\n".format(key, freq)) 125 | vocab_file.close() 126 | -------------------------------------------------------------------------------- /scripts/bpe/segment-char-ngrams.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | from __future__ import unicode_literals, division 6 | 7 | import sys 8 | import codecs 9 | import argparse 10 | 11 | # hack for python2/3 compatibility 12 | from io import open 13 | argparse.open = open 14 | 15 | def create_parser(): 16 | parser = argparse.ArgumentParser( 17 | formatter_class=argparse.RawDescriptionHelpFormatter, 18 | description="segment rare words into character n-grams") 19 | 20 | parser.add_argument( 21 | '--input', '-i', type=argparse.FileType('r'), default=sys.stdin, 22 | metavar='PATH', 23 | help="Input file (default: standard input).") 24 | parser.add_argument( 25 | '--vocab', type=argparse.FileType('r'), 
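# Rough usage note for this script: words that are not among the --shortlist most
# frequent entries of --vocab are cut into fixed-size character n-grams joined by the
# separator, so with -n 2 and the default separator a rare word such as 'lower' should
# come out roughly as 'lo@@ we@@ r'.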
metavar='PATH', 26 | required=True, 27 | help="Vocabulary file.") 28 | parser.add_argument( 29 | '--shortlist', type=int, metavar='INT', default=0, 30 | help="do not segment INT most frequent words in vocabulary (default: '%(default)s')).") 31 | parser.add_argument( 32 | '-n', type=int, metavar='INT', default=2, 33 | help="segment rare words into character n-grams of size INT (default: '%(default)s')).") 34 | parser.add_argument( 35 | '--output', '-o', type=argparse.FileType('w'), default=sys.stdout, 36 | metavar='PATH', 37 | help="Output file (default: standard output)") 38 | parser.add_argument( 39 | '--separator', '-s', type=str, default='@@', metavar='STR', 40 | help="Separator between non-final subword units (default: '%(default)s'))") 41 | 42 | return parser 43 | 44 | 45 | if __name__ == '__main__': 46 | 47 | # python 2/3 compatibility 48 | if sys.version_info < (3, 0): 49 | sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) 50 | sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) 51 | sys.stdin = codecs.getreader('UTF-8')(sys.stdin) 52 | else: 53 | sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer) 54 | sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer) 55 | sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer) 56 | 57 | parser = create_parser() 58 | args = parser.parse_args() 59 | 60 | # read/write files as UTF-8 61 | args.vocab = codecs.open(args.vocab.name, encoding='utf-8') 62 | if args.input.name != '<stdin>': 63 | args.input = codecs.open(args.input.name, encoding='utf-8') 64 | if args.output.name != '<stdout>': 65 | args.output = codecs.open(args.output.name, 'w', encoding='utf-8') 66 | 67 | vocab = [line.split()[0] for line in args.vocab if len(line.split()) == 2] 68 | vocab = dict((y,x) for (x,y) in enumerate(vocab)) 69 | 70 | for line in args.input: 71 | for word in line.split(): 72 | if word not in vocab or vocab[word] > args.shortlist: 73 | i = 0 74 | while i*args.n < len(word): 75 | args.output.write(word[i*args.n:i*args.n+args.n]) 76 | i += 1 77 | if i*args.n < len(word): 78 | args.output.write(args.separator) 79 | args.output.write(' ') 80 | else: 81 | args.output.write(word + ' ') 82 | args.output.write('\n') 83 | -------------------------------------------------------------------------------- /scripts/config-diff.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | function sort_config { 6 | tmp=`mktemp` 7 | grep -Pv "^$|model_dir|data_dir|label|description" $1 | sed "s/^\\s\+-\?\\s*/ /" | grep -Pv "^\\s*#" > ${tmp} 8 | output=`mktemp` 9 | grep -Pv "encoders|decoders|reverse_mapping|^[\s]" ${tmp} | sort > ${output} 10 | echo "decoders:" >> ${output} 11 | sed -n -e "/encoders/,/^[^ ]/p" ${tmp} | grep "^\s\+" | sort >> ${output} 12 | echo "encoders:" >> ${output} 13 | sed -n -e "/decoders/,/^[^ ]/p" ${tmp} | grep "^\s\+" | sort >> ${output} 14 | rm -f ${tmp} 15 | echo ${output} 16 | } 17 | 18 | filename1=`sort_config $1` 19 | filename2=`sort_config $2` 20 | 21 | sdiff -dbBWZs ${filename1} ${filename2} 22 | rm -f ${filename1} ${filename2} 23 | 24 | -------------------------------------------------------------------------------- /scripts/copy-model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import subprocess 4 | import os 5 | import shutil 6 | import re 7 | import sys 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('model_dir') 11 | parser.add_argument('dest_dir') 12 
| parser.add_argument('--move', action='store_true') 13 | parser.add_argument('--copy-data', action='store_true') 14 | parser.add_argument('--compact', action='store_true') 15 | parser.add_argument('--force', action='store_true') 16 | 17 | args = parser.parse_args() 18 | 19 | 20 | if os.path.exists(args.dest_dir): 21 | if args.force and os.path.isdir(args.dest_dir): 22 | shutil.rmtree(args.dest_dir) 23 | else: 24 | raise Exception 25 | if not os.path.isdir(args.model_dir): 26 | raise Exception 27 | 28 | config_dir = os.path.realpath(args.dest_dir) 29 | root_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 30 | if config_dir.startswith(root_dir): 31 | config_dir = config_dir[len(root_dir):] 32 | else: 33 | config_dir = args.dest_dir 34 | 35 | if args.compact: 36 | os.makedirs(os.path.join(args.dest_dir, 'checkpoints')) 37 | 38 | files = ['config.yaml', 'default.yaml', 'log.txt', 'code.tar.gz'] 39 | dirs = ['data'] 40 | 41 | for filename in files: 42 | shutil.copy(os.path.join(args.model_dir, filename), args.dest_dir) 43 | for dirname in dirs: 44 | try: 45 | shutil.copytree(os.path.join(args.model_dir, dirname), os.path.join(args.dest_dir, dirname)) 46 | except: 47 | pass 48 | 49 | checkpoint_dir = os.path.join(args.model_dir, 'checkpoints') 50 | for filename in os.listdir(checkpoint_dir): 51 | if filename.startswith('best.') or filename.startswith('average.') or filename in ('vars.pkl', 'scores.txt'): 52 | shutil.copy(os.path.join(checkpoint_dir, filename), 53 | os.path.join(args.dest_dir, 'checkpoints')) 54 | 55 | if args.move: # delete 56 | shutil.rmtree(args.model_dir) 57 | elif args.move: 58 | shutil.move(args.model_dir, args.dest_dir) 59 | else: 60 | shutil.copytree(args.model_dir, args.dest_dir) 61 | 62 | 63 | config_filename = os.path.join(args.dest_dir, 'config.yaml') 64 | with open(config_filename) as f: 65 | content = f.read() 66 | 67 | content = re.sub(r'model_dir:.*?\n', 'model_dir: {}\n'.format(args.dest_dir), content, flags=re.MULTILINE) 68 | with open(config_filename, 'w') as f: 69 | f.write(content) 70 | 71 | if args.copy_data: 72 | data_dir = re.search(r'data_dir:\s*(.*)\s*\n', content, flags=re.MULTILINE).group(1) 73 | 74 | content = re.sub(r'data_dir:.*?\n', 'data_dir: {}/data\n'.format(args.dest_dir), content, flags=re.MULTILINE) 75 | with open(config_filename, 'w') as f: 76 | f.write(content) 77 | 78 | for filename in os.listdir(data_dir): 79 | if filename.startswith('dev') or filename.startswith('test'): 80 | shutil.copy(os.path.join(data_dir, filename), os.path.join(args.dest_dir, 'data', filename)) 81 | -------------------------------------------------------------------------------- /scripts/coverage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | from collections import Counter 5 | from itertools import starmap 6 | 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('filename') 9 | parser.add_argument('vocab') 10 | 11 | 12 | if __name__ == '__main__': 13 | args = parser.parse_args() 14 | with open(args.filename) as f, open(args.vocab) as vocab_file: 15 | vocab = set(line.strip() for line in vocab_file) 16 | 17 | true_vocab = Counter(w for line in f for w in line.split()) 18 | 19 | unk_words = Counter({w: c for w, c in true_vocab.items() if w not in vocab}) 20 | 21 | print('Unknown words:') 22 | print('\n'.join(starmap(' {:20} {}'.format, unk_words.most_common()[::-1]))) 23 | 24 | print('{:22} {} ({:.2f}%)'.format('Unknown words:', len(unk_words), 
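# Worked example of the statistics printed here: for a corpus whose token counts are
# {'a': 3, 'b': 1} and a vocabulary file containing only 'a', this line reports 1
# unknown word type out of 2 (50.00%), and the 'Total count' line below reports 1
# unknown token out of 4 (25.00%).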
100 * len(unk_words) / len(true_vocab))) 25 | 26 | total_unk_words = sum(unk_words.values()) 27 | total_words = sum(true_vocab.values()) 28 | print('{:22} {} ({:.2f}%)'.format('Total count:', total_unk_words, 100 * total_unk_words / total_words)) 29 | -------------------------------------------------------------------------------- /scripts/decode-moses.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | if [[ $# -lt 4 ]] 6 | then 7 | echo "wrong number of arguments supplied: $#" 8 | exit 0 9 | fi 10 | 11 | if [ -z ${MOSES} ] 12 | then 13 | echo "variable MOSES undefined" 14 | exit 0 15 | fi 16 | 17 | config_file=`readlink -f $1` 18 | temp_dir=`readlink -f $2` 19 | filename=`readlink -f $3` 20 | output_filename=$4 21 | cores=`lscpu | grep -Po "^(CPU\(s\)|Processeur\(s\)).?:\s+\K\d+$"` 22 | 23 | if [ -d "${temp_dir}" ] 24 | then 25 | echo "directory ${temp_dir} already exists" 26 | exit 0 27 | fi 28 | 29 | mkdir -p ${temp_dir} 30 | printf "started: "; date 31 | ${MOSES}/scripts/training/filter-model-given-input.pl ${temp_dir}/model ${config_file} ${filename} >/dev/null 2>/dev/null 32 | cat ${filename} | sed "s/|//g" | ${MOSES}/bin/moses -f ${temp_dir}/model/moses.ini -threads ${cores} > ${output_filename} 2>/dev/null 33 | rm -rf ${temp_dir} 34 | printf "finished: "; date 35 | 36 | -------------------------------------------------------------------------------- /scripts/extract-lexicon.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import argparse 4 | from collections import defaultdict, OrderedDict 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('source_file') 8 | parser.add_argument('target_file') 9 | parser.add_argument('align_file') 10 | 11 | args = parser.parse_args() 12 | 13 | src_vocab = OrderedDict() 14 | trg_vocab = OrderedDict() 15 | 16 | counts = defaultdict(dict) 17 | 18 | with open(args.source_file) as src_file, open(args.target_file) as trg_file, open(args.align_file) as align_file: 19 | for src, trg, align in zip(src_file, trg_file, align_file): 20 | src = src.split() 21 | trg = trg.split() 22 | align = align.split() 23 | for i, j in map(lambda p: map(int, p.split('-')), align): 24 | src_ = src[i] 25 | trg_ = trg[j] 26 | 27 | src_id = src_vocab.setdefault(src_, len(src_vocab)) 28 | trg_id = trg_vocab.setdefault(trg_, len(trg_vocab)) 29 | 30 | #src_counts[src_id] = src_counts.get(src_id, 0) + 1 31 | #trg_counts[trg_id] = trg_counts.get(trg_id, 0) + 1 32 | #pair_counts((src_id, trg_id)) = pair_counts.get((src_id, trg_id), 0) + 1 33 | 34 | counts[src_id][trg_id] = counts[src_id].get(trg_id, 0) + 1 35 | 36 | src_vocab = list(src_vocab.keys()) 37 | trg_vocab = list(trg_vocab.keys()) 38 | 39 | for source, counts_ in counts.items(): 40 | target = max(counts_.keys(), key=lambda word: counts_[word]) 41 | source = src_vocab[source] 42 | target = trg_vocab[target] 43 | print(source, target) 44 | -------------------------------------------------------------------------------- /scripts/join.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import argparse 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument('source_file') 6 | parser.add_argument('target_file') 7 | parser.add_argument('-s', '--separator', default='|||') 8 | 9 | args = parser.parse_args() 10 | 11 | with open(args.source_file) as src_file, open(args.target_file) as trg_file: 12 | for 
src, trg in zip(src_file, trg_file): 13 | line = ' '.join([src.rstrip(), args.separator, trg.rstrip()]) 14 | print(line) 15 | 16 | -------------------------------------------------------------------------------- /scripts/moses/clean-corpus-n.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 5 | 6 | # $Id: clean-corpus-n.perl 3633 2010-10-21 09:49:27Z phkoehn $ 7 | use warnings; 8 | use strict; 9 | use Getopt::Long; 10 | my $help; 11 | my $lc = 0; # lowercase the corpus? 12 | my $ignore_ratio = 0; 13 | my $ignore_xml = 0; 14 | my $enc = "utf8"; # encoding of the input and output files 15 | # set to anything else you wish, but I have not tested it yet 16 | my $max_word_length = 1000; # any segment with a word (or factor) exceeding this length in chars 17 | # is discarded; motivated by symal.cpp, which has its own such parameter (hardcoded to 1000) 18 | # and crashes if it encounters a word that exceeds it 19 | my $ratio = 9; 20 | 21 | GetOptions( 22 | "help" => \$help, 23 | "lowercase|lc" => \$lc, 24 | "encoding=s" => \$enc, 25 | "ratio=f" => \$ratio, 26 | "ignore-ratio" => \$ignore_ratio, 27 | "ignore-xml" => \$ignore_xml, 28 | "max-word-length|mwl=s" => \$max_word_length 29 | ) or exit(1); 30 | 31 | if (scalar(@ARGV) < 6 || $help) { 32 | print "syntax: clean-corpus-n.perl [-ratio n] corpus l1 l2 clean-corpus min max [lines retained file]\n"; 33 | exit; 34 | } 35 | 36 | my $corpus = $ARGV[0]; 37 | my $l1 = $ARGV[1]; 38 | my $l2 = $ARGV[2]; 39 | my $out = $ARGV[3]; 40 | my $min = $ARGV[4]; 41 | my $max = $ARGV[5]; 42 | 43 | my $linesRetainedFile = ""; 44 | if (scalar(@ARGV) > 6) { 45 | $linesRetainedFile = $ARGV[6]; 46 | open(LINES_RETAINED,">$linesRetainedFile") or die "Can't write $linesRetainedFile"; 47 | } 48 | 49 | print STDERR "clean-corpus.perl: processing $corpus.$l1 & .$l2 to $out, cutoff $min-$max, ratio $ratio\n"; 50 | 51 | my $opn = undef; 52 | my $l1input = "$corpus.$l1"; 53 | if (-e $l1input) { 54 | $opn = $l1input; 55 | } elsif (-e $l1input.".gz") { 56 | $opn = "gunzip -c $l1input.gz |"; 57 | } else { 58 | die "Error: $l1input does not exist"; 59 | } 60 | open(F,$opn) or die "Can't open '$opn'"; 61 | $opn = undef; 62 | my $l2input = "$corpus.$l2"; 63 | if (-e $l2input) { 64 | $opn = $l2input; 65 | } elsif (-e $l2input.".gz") { 66 | $opn = "gunzip -c $l2input.gz |"; 67 | } else { 68 | die "Error: $l2input does not exist"; 69 | } 70 | 71 | open(E,$opn) or die "Can't open '$opn'"; 72 | 73 | open(FO,">$out.$l1") or die "Can't write $out.$l1"; 74 | open(EO,">$out.$l2") or die "Can't write $out.$l2"; 75 | 76 | # necessary for proper lowercasing 77 | my $binmode; 78 | if ($enc eq "utf8") { 79 | $binmode = ":utf8"; 80 | } else { 81 | $binmode = ":encoding($enc)"; 82 | } 83 | binmode(F, $binmode); 84 | binmode(E, $binmode); 85 | binmode(FO, $binmode); 86 | binmode(EO, $binmode); 87 | 88 | my $innr = 0; 89 | my $outnr = 0; 90 | my $factored_flag; 91 | while(my $f = <F>) { 92 | $innr++; 93 | print STDERR "." if $innr % 10000 == 0; 94 | print STDERR "($innr)" if $innr % 100000 == 0; 95 | my $e = <E>; 96 | die "$corpus.$l2 is too short!" 
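# Summary of the filtering implemented below: a sentence pair is kept only if neither
# side is empty, both sides have between min and max tokens, the length ratio between
# the two sides does not exceed --ratio (default 9, unless --ignore-ratio is given),
# and no token or factor is longer than --max-word-length characters (default 1000).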
if !defined $e; 97 | chomp($e); 98 | chomp($f); 99 | if ($innr == 1) { 100 | $factored_flag = ($e =~ /\|/ || $f =~ /\|/); 101 | } 102 | 103 | #if lowercasing, lowercase 104 | if ($lc) { 105 | $e = lc($e); 106 | $f = lc($f); 107 | } 108 | 109 | $e =~ s/\|//g unless $factored_flag; 110 | $e =~ s/\s+/ /g; 111 | $e =~ s/^ //; 112 | $e =~ s/ $//; 113 | $f =~ s/\|//g unless $factored_flag; 114 | $f =~ s/\s+/ /g; 115 | $f =~ s/^ //; 116 | $f =~ s/ $//; 117 | next if $f eq ''; 118 | next if $e eq ''; 119 | 120 | my $ec = &word_count($e); 121 | my $fc = &word_count($f); 122 | next if $ec > $max; 123 | next if $fc > $max; 124 | next if $ec < $min; 125 | next if $fc < $min; 126 | next if !$ignore_ratio && $ec/$fc > $ratio; 127 | next if !$ignore_ratio && $fc/$ec > $ratio; 128 | # Skip this segment if any factor is longer than $max_word_length 129 | my $max_word_length_plus_one = $max_word_length + 1; 130 | next if $e =~ /[^\s\|]{$max_word_length_plus_one}/; 131 | next if $f =~ /[^\s\|]{$max_word_length_plus_one}/; 132 | 133 | # An extra check: none of the factors can be blank! 134 | die "There is a blank factor in $corpus.$l1 on line $innr: $f" 135 | if $f =~ /[ \|]\|/; 136 | die "There is a blank factor in $corpus.$l2 on line $innr: $e" 137 | if $e =~ /[ \|]\|/; 138 | 139 | $outnr++; 140 | print FO $f."\n"; 141 | print EO $e."\n"; 142 | 143 | if ($linesRetainedFile ne "") { 144 | print LINES_RETAINED $innr."\n"; 145 | } 146 | } 147 | 148 | if ($linesRetainedFile ne "") { 149 | close LINES_RETAINED; 150 | } 151 | 152 | print STDERR "\n"; 153 | my $e = <E>; 154 | die "$corpus.$l2 is too long!" if defined $e; 155 | 156 | print STDERR "Input sentences: $innr Output sentences: $outnr\n"; 157 | 158 | sub word_count { 159 | my ($line) = @_; 160 | if ($ignore_xml) { 161 | $line =~ s/<\S[^>]*\S>/ /g; 162 | $line =~ s/\s+/ /g; 163 | $line =~ s/^ //g; 164 | $line =~ s/ $//g; 165 | } 166 | my @w = split(/ /,$line); 167 | return scalar @w; 168 | } 169 | -------------------------------------------------------------------------------- /scripts/moses/deescape-special-chars.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 
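# Purpose: undo the escaping applied by escape-special-chars.perl. In the Moses
# distribution, the left-hand sides of the substitutions below are HTML-style entities
# (&amp;, &#124; and the legacy &bar;, &lt;, &gt;, &#91;/&#93; and the legacy
# &bra;/&ket;, &quot;, &apos;), which are mapped back to the raw characters
# & | < > [ ] " '.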
5 | 6 | use warnings; 7 | use strict; 8 | 9 | while(<STDIN>) { 10 | s/\&bar;/\|/g; # factor separator (legacy) 11 | s/\|/\|/g; # factor separator 12 | s/\</\</g; # xml 13 | s/\>/\>/g; # xml 14 | s/\&bra;/\[/g; # syntax non-terminal (legacy) 15 | s/\&ket;/\]/g; # syntax non-terminal (legacy) 16 | s/\"/\"/g; # xml 17 | s/\'/\'/g; # xml 18 | s/\[/\[/g; # syntax non-terminal 19 | s/\]/\]/g; # syntax non-terminal 20 | s/\&/\&/g; # escape escape 21 | print $_; 22 | } 23 | -------------------------------------------------------------------------------- /scripts/moses/detruecase.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | use strict; 4 | use Getopt::Long "GetOptions"; 5 | 6 | binmode(STDIN, ":utf8"); 7 | binmode(STDOUT, ":utf8"); 8 | 9 | my ($SRC,$INFILE,$UNBUFFERED); 10 | die("detruecase.perl < in > out") 11 | unless &GetOptions('headline=s' => \$SRC, 12 | 'in=s' => \$INFILE, 13 | 'b|unbuffered' => \$UNBUFFERED); 14 | if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; } 15 | 16 | my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1); 17 | my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"""=>1,"'"=>1,"["=>1,"]"=>1); 18 | 19 | # lowercase even in headline 20 | my %ALWAYS_LOWER; 21 | foreach ("a","after","against","al-.+","and","any","as","at","be","because","between","by","during","el-.+","for","from","his","in","is","its","last","not","of","off","on","than","the","their","this","to","was","were","which","will","with") { $ALWAYS_LOWER{$_} = 1; } 22 | 23 | # find out about the headlines 24 | my @HEADLINE; 25 | if (defined($SRC)) { 26 | open(SRC,$SRC); 27 | my $headline_flag = 0; 28 | while(<SRC>) { 29 | $headline_flag = 1 if /<hl>/; 30 | $headline_flag = 0 if /<.hl>/; 31 | next unless /^<seg/; 32 | push @HEADLINE, $headline_flag; 33 | } 34 | close(SRC); 35 | } 36 | 37 | my $sentence = 0; 38 | if ($INFILE) { 39 | open(IN,$INFILE) || die("ERROR: could not open file '$INFILE'"); 40 | binmode(IN, ":utf8"); 41 | while(<IN>) { 42 | &process($_,$sentence++); 43 | } 44 | close(IN); 45 | } 46 | else { 47 | while(<STDIN>) { 48 | &process($_,$sentence++); 49 | } 50 | } 51 | 52 | sub process { 53 | my $line = $_[0]; 54 | chomp($line); 55 | $line =~ s/^\s+//; 56 | $line =~ s/\s+$//; 57 | my @WORD = split(/\s+/,$line); 58 | 59 | # uppercase at sentence start 60 | my $sentence_start = 1; 61 | for(my $i=0;$i<scalar(@WORD);$i++) { 62 | &uppercase(\$WORD[$i]) if $sentence_start; 63 | if (defined($SENTENCE_END{ $WORD[$i] })) { $sentence_start = 1; } 64 | elsif (!defined($DELAYED_SENTENCE_START{$WORD[$i] })) { $sentence_start = 0; } 65 | } 66 | 67 | # uppercase headlines { 68 | if (defined($SRC) && $HEADLINE[$sentence]) { 69 | foreach (@WORD) { 70 | &uppercase(\$_) unless $ALWAYS_LOWER{$_}; 71 | } 72 | } 73 | 74 | # output 75 | my $first = 1; 76 | foreach (@WORD) { 77 | print " " unless $first; 78 | $first = 0; 79 | print $_; 80 | } 81 | print "\n"; 82 | $sentence++; 83 | } 84 | 85 | sub uppercase { 86 | my ($W) = @_; 87 | $$W = uc(substr($$W,0,1)).substr($$W,1); 88 | } 89 | -------------------------------------------------------------------------------- /scripts/moses/escape-special-chars.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 
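# Counterpart of deescape-special-chars.perl: characters that have a special meaning in
# Moses input (& | < > ' " [ ]) are replaced by HTML-style entities before decoding, and
# the final substitution restores <tag translation="..."> ... </tag> markup so that XML
# translation instructions still reach the decoder intact.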
5 | 6 | use warnings; 7 | use strict; 8 | 9 | while(<STDIN>) { 10 | chop; 11 | 12 | # avoid general madness 13 | s/[\000-\037]//g; 14 | s/\s+/ /g; 15 | s/^ //g; 16 | s/ $//g; 17 | 18 | # special characters in moses 19 | s/\&/\&/g; # escape escape 20 | s/\|/\|/g; # factor separator 21 | s/\</\</g; # xml 22 | s/\>/\>/g; # xml 23 | s/\'/\'/g; # xml 24 | s/\"/\"/g; # xml 25 | s/\[/\[/g; # syntax non-terminal 26 | s/\]/\]/g; # syntax non-terminal 27 | 28 | # restore xml instructions 29 | s/\<(\S+) translation="(.+?)"> (.+?) <\/(\S+)>/\<$1 translation=\"$2\"> $3 <\/$4>/g; 30 | print $_."\n"; 31 | } 32 | 33 | -------------------------------------------------------------------------------- /scripts/moses/lowercase.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | use strict; 4 | 5 | binmode(STDIN, ":utf8"); 6 | binmode(STDOUT, ":utf8"); 7 | 8 | while(<STDIN>) { 9 | print lc($_); 10 | } 11 | -------------------------------------------------------------------------------- /scripts/moses/multi-bleu.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | # $Id$ 4 | use strict; 5 | 6 | my $lowercase = 0; 7 | if ($ARGV[0] eq "-lc") { 8 | $lowercase = 1; 9 | shift; 10 | } 11 | 12 | my $stem = $ARGV[0]; 13 | if (!defined $stem) { 14 | print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n"; 15 | print STDERR "Reads the references from reference or reference0, reference1, ...\n"; 16 | exit(1); 17 | } 18 | 19 | $stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0"; 20 | 21 | my @REF; 22 | my $ref=0; 23 | while(-e "$stem$ref") { 24 | &add_to_ref("$stem$ref",\@REF); 25 | $ref++; 26 | } 27 | &add_to_ref($stem,\@REF) if -e $stem; 28 | die("ERROR: could not find reference file $stem") unless scalar @REF; 29 | 30 | sub add_to_ref { 31 | my ($file,$REF) = @_; 32 | my $s=0; 33 | open(REF,$file) or die "Can't read $file"; 34 | while(<REF>) { 35 | chop; 36 | push @{$$REF[$s++]}, $_; 37 | } 38 | close(REF); 39 | } 40 | 41 | my(@CORRECT,@TOTAL,$length_translation,$length_reference); 42 | my $s=0; 43 | while(<STDIN>) { 44 | chop; 45 | $_ = lc if $lowercase; 46 | my @WORD = split; 47 | my %REF_NGRAM = (); 48 | my $length_translation_this_sentence = scalar(@WORD); 49 | my ($closest_diff,$closest_length) = (9999,9999); 50 | foreach my $reference (@{$REF[$s]}) { 51 | # print "$s $_ <=> $reference\n"; 52 | $reference = lc($reference) if $lowercase; 53 | my @WORD = split(' ',$reference); 54 | my $length = scalar(@WORD); 55 | my $diff = abs($length_translation_this_sentence-$length); 56 | if ($diff < $closest_diff) { 57 | $closest_diff = $diff; 58 | $closest_length = $length; 59 | # print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n"; 60 | } elsif ($diff == $closest_diff) { 61 | $closest_length = $length if $length < $closest_length; 62 | # from two references with the same closeness to me 63 | # take the *shorter* into account, not the "first" one. 
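# For reference, the score printed at the end of this script is standard corpus-level BLEU:
#   BLEU = BP * exp( (log p1 + log p2 + log p3 + log p4) / 4 )
# where p_n are the modified n-gram precisions accumulated in @CORRECT/@TOTAL, and the
# brevity penalty BP is 1 when the hypothesis is at least as long as the reference and
# exp(1 - ref_len/hyp_len) otherwise (ref_len uses the closest reference length selected above).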
64 | } 65 | for(my $n=1;$n<=4;$n++) { 66 | my %REF_NGRAM_N = (); 67 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 68 | my $ngram = "$n"; 69 | for(my $w=0;$w<$n;$w++) { 70 | $ngram .= " ".$WORD[$start+$w]; 71 | } 72 | $REF_NGRAM_N{$ngram}++; 73 | } 74 | foreach my $ngram (keys %REF_NGRAM_N) { 75 | if (!defined($REF_NGRAM{$ngram}) || 76 | $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) { 77 | $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram}; 78 | # print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}<BR>\n"; 79 | } 80 | } 81 | } 82 | } 83 | $length_translation += $length_translation_this_sentence; 84 | $length_reference += $closest_length; 85 | for(my $n=1;$n<=4;$n++) { 86 | my %T_NGRAM = (); 87 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 88 | my $ngram = "$n"; 89 | for(my $w=0;$w<$n;$w++) { 90 | $ngram .= " ".$WORD[$start+$w]; 91 | } 92 | $T_NGRAM{$ngram}++; 93 | } 94 | foreach my $ngram (keys %T_NGRAM) { 95 | $ngram =~ /^(\d+) /; 96 | my $n = $1; 97 | # my $corr = 0; 98 | # print "$i e $ngram $T_NGRAM{$ngram}<BR>\n"; 99 | $TOTAL[$n] += $T_NGRAM{$ngram}; 100 | if (defined($REF_NGRAM{$ngram})) { 101 | if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) { 102 | $CORRECT[$n] += $T_NGRAM{$ngram}; 103 | # $corr = $T_NGRAM{$ngram}; 104 | # print "$i e correct1 $T_NGRAM{$ngram}<BR>\n"; 105 | } 106 | else { 107 | $CORRECT[$n] += $REF_NGRAM{$ngram}; 108 | # $corr = $REF_NGRAM{$ngram}; 109 | # print "$i e correct2 $REF_NGRAM{$ngram}<BR>\n"; 110 | } 111 | } 112 | # $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram}; 113 | # print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n" 114 | } 115 | } 116 | $s++; 117 | } 118 | my $brevity_penalty = 1; 119 | my $bleu = 0; 120 | 121 | my @bleu=(); 122 | 123 | for(my $n=1;$n<=4;$n++) { 124 | if (defined ($TOTAL[$n])){ 125 | $bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0; 126 | # print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n"; 127 | }else{ 128 | $bleu[$n]=0; 129 | } 130 | } 131 | 132 | if ($length_reference==0){ 133 | printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n"; 134 | exit(1); 135 | } 136 | 137 | if ($length_translation<$length_reference) { 138 | $brevity_penalty = exp(1-$length_reference/$length_translation); 139 | } 140 | $bleu = $brevity_penalty * exp((my_log( $bleu[1] ) + 141 | my_log( $bleu[2] ) + 142 | my_log( $bleu[3] ) + 143 | my_log( $bleu[4] ) ) / 4) ; 144 | printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n", 145 | 100*$bleu, 146 | 100*$bleu[1], 147 | 100*$bleu[2], 148 | 100*$bleu[3], 149 | 100*$bleu[4], 150 | $brevity_penalty, 151 | $length_translation / $length_reference, 152 | $length_translation, 153 | $length_reference; 154 | 155 | sub my_log { 156 | return -9999999999 unless $_[0]; 157 | return log($_[0]); 158 | } 159 | -------------------------------------------------------------------------------- /scripts/moses/nonbreaking_prefixes/nonbreaking_prefix.de: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | #no german words end in single lower-case letters, so we throw those in too. 
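#File format (as read by load_prefixes in tokenizer.perl): one prefix per line, lines
#starting with '#' are ignored, and a prefix may be followed by #NUMERIC_ONLY# to mark
#it as non-breaking only when the next token starts with a digit (see e.g. the entry
#"No #NUMERIC_ONLY#" in the English prefix list).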
7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | 61 | #Roman Numerals. A dot after one of these is not a sentence break in German. 62 | I 63 | II 64 | III 65 | IV 66 | V 67 | VI 68 | VII 69 | VIII 70 | IX 71 | X 72 | XI 73 | XII 74 | XIII 75 | XIV 76 | XV 77 | XVI 78 | XVII 79 | XVIII 80 | XIX 81 | XX 82 | i 83 | ii 84 | iii 85 | iv 86 | v 87 | vi 88 | vii 89 | viii 90 | ix 91 | x 92 | xi 93 | xii 94 | xiii 95 | xiv 96 | xv 97 | xvi 98 | xvii 99 | xviii 100 | xix 101 | xx 102 | 103 | #Titles and Honorifics 104 | Adj 105 | Adm 106 | Adv 107 | Asst 108 | Bart 109 | Bldg 110 | Brig 111 | Bros 112 | Capt 113 | Cmdr 114 | Col 115 | Comdr 116 | Con 117 | Corp 118 | Cpl 119 | DR 120 | Dr 121 | Ens 122 | Gen 123 | Gov 124 | Hon 125 | Hosp 126 | Insp 127 | Lt 128 | MM 129 | MR 130 | MRS 131 | MS 132 | Maj 133 | Messrs 134 | Mlle 135 | Mme 136 | Mr 137 | Mrs 138 | Ms 139 | Msgr 140 | Op 141 | Ord 142 | Pfc 143 | Ph 144 | Prof 145 | Pvt 146 | Rep 147 | Reps 148 | Res 149 | Rev 150 | Rt 151 | Sen 152 | Sens 153 | Sfc 154 | Sgt 155 | Sr 156 | St 157 | Supt 158 | Surg 159 | 160 | #Misc symbols 161 | Mio 162 | Mrd 163 | bzw 164 | v 165 | vs 166 | usw 167 | d.h 168 | z.B 169 | u.a 170 | etc 171 | Mrd 172 | MwSt 173 | ggf 174 | d.J 175 | D.h 176 | m.E 177 | vgl 178 | I.F 179 | z.T 180 | sogen 181 | ff 182 | u.E 183 | g.U 184 | g.g.A 185 | c.-à-d 186 | Buchst 187 | u.s.w 188 | sog 189 | u.ä 190 | Std 191 | evtl 192 | Zt 193 | Chr 194 | u.U 195 | o.ä 196 | Ltd 197 | b.A 198 | z.Zt 199 | spp 200 | sen 201 | SA 202 | k.o 203 | jun 204 | i.H.v 205 | dgl 206 | dergl 207 | Co 208 | zzt 209 | usf 210 | s.p.a 211 | Dkr 212 | Corp 213 | bzgl 214 | BSE 215 | 216 | #Number indicators 217 | # add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it 218 | No 219 | Nos 220 | Art 221 | Nr 222 | pp 223 | ca 224 | Ca 225 | 226 | #Ordinals are done with . in German - "1." = "1st" in English 227 | 1 228 | 2 229 | 3 230 | 4 231 | 5 232 | 6 233 | 7 234 | 8 235 | 9 236 | 10 237 | 11 238 | 12 239 | 13 240 | 14 241 | 15 242 | 16 243 | 17 244 | 18 245 | 19 246 | 20 247 | 21 248 | 22 249 | 23 250 | 24 251 | 25 252 | 26 253 | 27 254 | 28 255 | 29 256 | 30 257 | 31 258 | 32 259 | 33 260 | 34 261 | 35 262 | 36 263 | 37 264 | 38 265 | 39 266 | 40 267 | 41 268 | 42 269 | 43 270 | 44 271 | 45 272 | 46 273 | 47 274 | 48 275 | 49 276 | 50 277 | 51 278 | 52 279 | 53 280 | 54 281 | 55 282 | 56 283 | 57 284 | 58 285 | 59 286 | 60 287 | 61 288 | 62 289 | 63 290 | 64 291 | 65 292 | 66 293 | 67 294 | 68 295 | 69 296 | 70 297 | 71 298 | 72 299 | 73 300 | 74 301 | 75 302 | 76 303 | 77 304 | 78 305 | 79 306 | 80 307 | 81 308 | 82 309 | 83 310 | 84 311 | 85 312 | 86 313 | 87 314 | 88 315 | 89 316 | 90 317 | 91 318 | 92 319 | 93 320 | 94 321 | 95 322 | 96 323 | 97 324 | 98 325 | 99 326 | -------------------------------------------------------------------------------- /scripts/moses/nonbreaking_prefixes/nonbreaking_prefix.el: -------------------------------------------------------------------------------- 1 | # for now, just include the Greek equivalent of "Mr." 
2 | κ 3 | -------------------------------------------------------------------------------- /scripts/moses/nonbreaking_prefixes/nonbreaking_prefix.en: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 34 | Adj 35 | Adm 36 | Adv 37 | Asst 38 | Bart 39 | Bldg 40 | Brig 41 | Bros 42 | Capt 43 | Cmdr 44 | Col 45 | Comdr 46 | Con 47 | Corp 48 | Cpl 49 | DR 50 | Dr 51 | Drs 52 | Ens 53 | Gen 54 | Gov 55 | Hon 56 | Hr 57 | Hosp 58 | Insp 59 | Lt 60 | MM 61 | MR 62 | MRS 63 | MS 64 | Maj 65 | Messrs 66 | Mlle 67 | Mme 68 | Mr 69 | Mrs 70 | Ms 71 | Msgr 72 | Op 73 | Ord 74 | Pfc 75 | Ph 76 | Prof 77 | Pvt 78 | Rep 79 | Reps 80 | Res 81 | Rev 82 | Rt 83 | Sen 84 | Sens 85 | Sfc 86 | Sgt 87 | Sr 88 | St 89 | Supt 90 | Surg 91 | 92 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 93 | v 94 | vs 95 | i.e 96 | rev 97 | e.g 98 | 99 | #Numbers only. These should only induce breaks when followed by a numeric sequence 100 | # add NUMERIC_ONLY after the word for this function 101 | #This case is mostly for the english "No." which can either be a sentence of its own, or 102 | #if followed by a number, a non-breaking prefix 103 | No #NUMERIC_ONLY# 104 | Nos 105 | Art #NUMERIC_ONLY# 106 | Nr 107 | pp #NUMERIC_ONLY# 108 | -------------------------------------------------------------------------------- /scripts/moses/nonbreaking_prefixes/nonbreaking_prefix.es: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 
3 | 4 | #any single upper case letter followed by a period is not a sentence ender 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | # Period-final abbreviation list from http://www.ctspanish.com/words/abbreviations.htm 34 | 35 | A.C 36 | Apdo 37 | Av 38 | Bco 39 | CC.AA 40 | Da 41 | Dep 42 | Dn 43 | Dr 44 | Dra 45 | EE.UU 46 | Excmo 47 | FF.CC 48 | Fil 49 | Gral 50 | J.C 51 | Let 52 | Lic 53 | N.B 54 | P.D 55 | P.V.P 56 | Prof 57 | Pts 58 | Rte 59 | S.A 60 | S.A.R 61 | S.E 62 | S.L 63 | S.R.C 64 | Sr 65 | Sra 66 | Srta 67 | Sta 68 | Sto 69 | T.V.E 70 | Tel 71 | Ud 72 | Uds 73 | V.B 74 | V.E 75 | Vd 76 | Vds 77 | a/c 78 | adj 79 | admón 80 | afmo 81 | apdo 82 | av 83 | c 84 | c.f 85 | c.g 86 | cap 87 | cm 88 | cta 89 | dcha 90 | doc 91 | ej 92 | entlo 93 | esq 94 | etc 95 | f.c 96 | gr 97 | grs 98 | izq 99 | kg 100 | km 101 | mg 102 | mm 103 | núm 104 | núm 105 | p 106 | p.a 107 | p.ej 108 | ptas 109 | pág 110 | págs 111 | pág 112 | págs 113 | q.e.g.e 114 | q.e.s.m 115 | s 116 | s.s.s 117 | vid 118 | vol 119 | -------------------------------------------------------------------------------- /scripts/moses/nonbreaking_prefixes/nonbreaking_prefix.fr: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | # 4 | #any single upper case letter followed by a period is not a sentence ender 5 | #usually upper case letters are initials in a name 6 | #no French words end in single lower-case letters, so we throw those in too? 
7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | # Period-final abbreviation list for French 61 | A.C.N 62 | A.M 63 | art 64 | ann 65 | apr 66 | av 67 | auj 68 | lib 69 | B.P 70 | boul 71 | ca 72 | c.-à-d 73 | cf 74 | ch.-l 75 | chap 76 | contr 77 | C.P.I 78 | C.Q.F.D 79 | C.N 80 | C.N.S 81 | C.S 82 | dir 83 | éd 84 | e.g 85 | env 86 | al 87 | etc 88 | E.V 89 | ex 90 | fasc 91 | fém 92 | fig 93 | fr 94 | hab 95 | ibid 96 | id 97 | i.e 98 | inf 99 | LL.AA 100 | LL.AA.II 101 | LL.AA.RR 102 | LL.AA.SS 103 | L.D 104 | LL.EE 105 | LL.MM 106 | LL.MM.II.RR 107 | loc.cit 108 | masc 109 | MM 110 | ms 111 | N.B 112 | N.D.A 113 | N.D.L.R 114 | N.D.T 115 | n/réf 116 | NN.SS 117 | N.S 118 | N.D 119 | N.P.A.I 120 | p.c.c 121 | pl 122 | pp 123 | p.ex 124 | p.j 125 | P.S 126 | R.A.S 127 | R.-V 128 | R.P 129 | R.I.P 130 | SS 131 | S.S 132 | S.A 133 | S.A.I 134 | S.A.R 135 | S.A.S 136 | S.E 137 | sec 138 | sect 139 | sing 140 | S.M 141 | S.M.I.R 142 | sq 143 | sqq 144 | suiv 145 | sup 146 | suppl 147 | tél 148 | T.S.V.P 149 | vb 150 | vol 151 | vs 152 | X.O 153 | Z.I 154 | -------------------------------------------------------------------------------- /scripts/moses/normalize-punctuation.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | use strict; 4 | 5 | my ($language) = @ARGV; 6 | 7 | while(<STDIN>) { 8 | s/\r//g; 9 | # remove extra spaces 10 | s/\(/ \(/g; 11 | s/\)/\) /g; s/ +/ /g; 12 | s/\) ([\.\!\:\?\;\,])/\)$1/g; 13 | s/\( /\(/g; 14 | s/ \)/\)/g; 15 | s/(\d) \%/$1\%/g; 16 | s/ :/:/g; 17 | s/ ;/;/g; 18 | # normalize unicode punctuation 19 | s/„/\"/g; 20 | s/“/\"/g; 21 | s/”/\"/g; 22 | s/–/-/g; 23 | s/—/ - /g; s/ +/ /g; 24 | s/´/\'/g; 25 | s/(\pL)‘(\pL)/$1\'$2/gi; 26 | s/(\pL)’(\pL)/$1\'$2/gi; 27 | #s/([A-Za-zé])‘([A-Za-zé])/$1\'$2/gi; 28 | #s/([A-Za-zé])([A-Za-zé])/$1\'$2/gi; 29 | s/‘/\"/g; 30 | s/‚/\"/g; 31 | s/’/\"/g; 32 | s/''/\"/g; 33 | s/´´/\"/g; 34 | s/…/.../g; 35 | # French quotes 36 | s/ « / \"/g; 37 | s/« /\"/g; 38 | s/«/\"/g; 39 | s/ » /\" /g; 40 | s/ »/\"/g; 41 | s/»/\"/g; 42 | # handle pseudo-spaces 43 | s/ \%/\%/g; 44 | s/nº /nº /g; 45 | s/ :/:/g; 46 | s/ ºC/ ºC/g; 47 | s/ cm/ cm/g; 48 | s/ \?/\?/g; 49 | s/ \!/\!/g; 50 | s/ ;/;/g; 51 | s/, /, /g; s/ +/ /g; 52 | 53 | # English "quotation," followed by comma, style 54 | if ($language eq "en") { 55 | s/\"([,\.]+)/$1\"/g; 56 | } 57 | # Czech is confused 58 | elsif ($language eq "cs" || $language eq "cz") { 59 | } 60 | # German/Spanish/French "quotation", followed by comma, style 61 | else { 62 | s/,\"/\",/g; 63 | s/(\.+)\"(\s*[^<])/\"$1$2/g; # don't fix period at end of sentence 64 | } 65 | 66 | print STDERR $_ if //; 67 | 68 | if ($language eq "de" || $language eq "es" || $language eq "cz" || $language eq "cs" || $language eq "fr") { 69 | s/(\d) (\d)/$1,$2/g; 70 | } 71 | else { 72 | s/(\d) (\d)/$1.$2/g; 73 | } 74 | print $_; 75 | } 76 | -------------------------------------------------------------------------------- /scripts/moses/strip-xml.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # strip text file of any XML markup 4 | 5 | binmode(STDIN, ":utf8"); 6 | binmode(STDOUT, ":utf8"); 7 | 
8 | use strict; 9 | 10 | while(<STDIN>) { 11 | s/<\S[^>]*>/ /g; 12 | chomp; 13 | s/ +/ /g; 14 | s/^ //; 15 | print $_; 16 | print "\n"; 17 | } 18 | -------------------------------------------------------------------------------- /scripts/moses/tokenizer.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | # Sample Tokenizer 4 | # written by Josh Schroeder, based on code by Philipp Koehn 5 | 6 | binmode(STDIN, ":utf8"); 7 | binmode(STDOUT, ":utf8"); 8 | 9 | use FindBin qw($Bin); 10 | use strict; 11 | #use Time::HiRes; 12 | 13 | my $mydir = "$Bin/nonbreaking_prefixes"; 14 | 15 | my %NONBREAKING_PREFIX = (); 16 | my $language = "en"; 17 | my $QUIET = 0; 18 | my $HELP = 0; 19 | 20 | #my $start = [ Time::HiRes::gettimeofday( ) ]; 21 | 22 | while (@ARGV) { 23 | $_ = shift; 24 | /^-l$/ && ($language = shift, next); 25 | /^-q$/ && ($QUIET = 1, next); 26 | /^-h$/ && ($HELP = 1, next); 27 | } 28 | 29 | if ($HELP) { 30 | print "Usage ./tokenizer.perl (-l [en|de|...]) < textfile > tokenizedfile\n"; 31 | exit; 32 | } 33 | if (!$QUIET) { 34 | print STDERR "Tokenizer v3\n"; 35 | print STDERR "Language: $language\n"; 36 | } 37 | 38 | load_prefixes($language,\%NONBREAKING_PREFIX); 39 | 40 | if (scalar(%NONBREAKING_PREFIX) eq 0){ 41 | print STDERR "Warning: No known abbreviations for language '$language'\n"; 42 | } 43 | 44 | while(<STDIN>) { 45 | if (/^<.+>$/ || /^\s*$/) { 46 | #don't try to tokenize XML/HTML tag lines 47 | print $_; 48 | } 49 | else { 50 | print &tokenize($_); 51 | } 52 | } 53 | 54 | #my $duration = Time::HiRes::tv_interval( $start ); 55 | #print STDERR ("EXECUTION TIME: ".$duration."\n"); 56 | 57 | 58 | sub tokenize { 59 | my($text) = @_; 60 | chomp($text); 61 | $text = " $text "; 62 | 63 | # seperate out all "other" special characters 64 | $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g; 65 | 66 | #multi-dots stay together 67 | $text =~ s/\.([\.]+)/ DOTMULTI$1/g; 68 | while($text =~ /DOTMULTI\./) { 69 | $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g; 70 | $text =~ s/DOTMULTI\./DOTDOTMULTI/g; 71 | } 72 | 73 | # seperate out "," except if within numbers (5,300) 74 | $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; 75 | # separate , pre and post number 76 | $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; 77 | $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g; 78 | 79 | # turn `into ' 80 | $text =~ s/\`/\'/g; 81 | 82 | #turn '' into " 83 | $text =~ s/\'\'/ \" /g; 84 | 85 | if ($language eq "en") { 86 | #split contractions right 87 | $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; 88 | $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g; 89 | $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; 90 | $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g; 91 | #special case for "1990's" 92 | $text =~ s/([\p{IsN}])[']([s])/$1 '$2/g; 93 | } elsif (($language eq "fr") or ($language eq "it")) { 94 | #split contractions left 95 | $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; 96 | $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g; 97 | $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; 98 | $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g; 99 | } else { 100 | $text =~ s/\'/ \' /g; 101 | } 102 | 103 | #word token method 104 | my @words = split(/\s/,$text); 105 | $text = ""; 106 | for (my $i=0;$i<(scalar(@words));$i++) { 107 | my $word = $words[$i]; 108 | if ( $word =~ /^(\S+)\.$/) { 109 | my $pre = $1; 110 | if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && 
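# The final period stays attached when the prefix contains an internal dot and a letter
# (e.g. "e.g."), is listed as a non-breaking prefix, is a #NUMERIC_ONLY# prefix followed
# by a number, or when the next word starts with a lowercase letter; otherwise the
# period is split off as a separate token.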
$NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/))) { 111 | #no change 112 | } elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/))) { 113 | #no change 114 | } else { 115 | $word = $pre." ."; 116 | } 117 | } 118 | $text .= $word." "; 119 | } 120 | 121 | # clean up extraneous spaces 122 | $text =~ s/ +/ /g; 123 | $text =~ s/^ //g; 124 | $text =~ s/ $//g; 125 | 126 | #restore multi-dots 127 | while($text =~ /DOTDOTMULTI/) { 128 | $text =~ s/DOTDOTMULTI/DOTMULTI./g; 129 | } 130 | $text =~ s/DOTMULTI/./g; 131 | 132 | #ensure final line break 133 | $text .= "\n" unless $text =~ /\n$/; 134 | 135 | return $text; 136 | } 137 | 138 | sub load_prefixes { 139 | my ($language, $PREFIX_REF) = @_; 140 | 141 | my $prefixfile = "$mydir/nonbreaking_prefix.$language"; 142 | 143 | #default back to English if we don't have a language-specific prefix file 144 | if (!(-e $prefixfile)) { 145 | $prefixfile = "$mydir/nonbreaking_prefix.en"; 146 | print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n"; 147 | die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile); 148 | } 149 | 150 | if (-e "$prefixfile") { 151 | open(PREFIX, "<:utf8", "$prefixfile"); 152 | while (<PREFIX>) { 153 | my $item = $_; 154 | chomp($item); 155 | if (($item) && (substr($item,0,1) ne "#")) { 156 | if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/) { 157 | $PREFIX_REF->{$1} = 2; 158 | } else { 159 | $PREFIX_REF->{$item} = 1; 160 | } 161 | } 162 | } 163 | close(PREFIX); 164 | } 165 | 166 | } 167 | 168 | -------------------------------------------------------------------------------- /scripts/moses/train-truecaser.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | # $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $ 4 | 5 | # 6 | # Options: 7 | # 8 | # --possiblyUseFirstToken : boolean option; the default behaviour (when this option is not provided) is that the first token of a sentence is ignored, on the basis that the first word of a sentence is always capitalized; if this option is provided then: a) if a sentence-initial token is *not* capitalized, then it is counted, and b) if a capitalized sentence-initial token is the only token of the segment, then it is counted, but with only 10% of the weight of a normal token. 9 | # 10 | 11 | use strict; 12 | use Getopt::Long "GetOptions"; 13 | 14 | # apply switches 15 | my ($MODEL,$CORPUS); 16 | die("train-truecaser.perl --model truecaser --corpus cased [--possiblyUseFirstToken]") 17 | unless &GetOptions('corpus=s' => \$CORPUS, 18 | 'model=s' => \$MODEL, 19 | 'possiblyUseFirstToken' => \(my $possiblyUseFirstToken = 0)) 20 | && defined($CORPUS) && defined($MODEL); 21 | my %CASING; 22 | my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1); 23 | my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"'"=>1,"""=>1,"["=>1,"]"=>1); 24 | open(CORPUS,$CORPUS) || die("ERROR: could not open '$CORPUS'"); 25 | binmode(CORPUS, ":utf8"); 26 | while(<CORPUS>) { 27 | chop; 28 | my @WORD = split; 29 | my $start = 0; 30 | while($start<=$#WORD && defined($DELAYED_SENTENCE_START{$WORD[$start]})) { $start++; } 31 | my $firstWordOfSentence = 1; 32 | for(my $i=$start;$i<=$#WORD;$i++) { 33 | my $currentWord = $WORD[$i]; 34 | if (! 
$firstWordOfSentence && defined($SENTENCE_END{$WORD[$i-1]})) { 35 | $firstWordOfSentence = 1; 36 | } 37 | 38 | my $currentWordWeight = 0; 39 | if (! $firstWordOfSentence) { 40 | $currentWordWeight = 1; 41 | } elsif ($possiblyUseFirstToken) { 42 | # gated special handling of first word of sentence 43 | my $firstChar = substr($currentWord, 0, 1); 44 | if (lc($firstChar) eq $firstChar) { 45 | # if the first character is not upper case, count the token as full evidence (because if it's not capitalized, then there's no reason to be wary that the given casing is only due to being sentence-initial) 46 | $currentWordWeight = 1; 47 | } elsif (scalar(@WORD) == 1) { 48 | # if the first character is upper case, but the current token is the only token of the segment, then count the token as partial evidence (because the segment is presumably not a sentence and the token is therefore not the first word of a sentence and is possibly in its natural case) 49 | $currentWordWeight = 0.1; 50 | } 51 | } 52 | if ($currentWordWeight > 0) { 53 | $CASING{ lc($currentWord) }{ $currentWord } += $currentWordWeight; 54 | } 55 | 56 | $firstWordOfSentence = 0; 57 | } 58 | } 59 | close(CORPUS); 60 | 61 | open(MODEL,">$MODEL") || die("ERROR: could not create '$MODEL'"); 62 | binmode(MODEL, ":utf8"); 63 | foreach my $type (keys %CASING) { 64 | my ($score,$total,$best) = (-1,0,""); 65 | foreach my $word (keys %{$CASING{$type}}) { 66 | my $count = $CASING{$type}{$word}; 67 | $total += $count; 68 | if ($count > $score) { 69 | $best = $word; 70 | $score = $count; 71 | } 72 | } 73 | print MODEL "$best ($score/$total)"; 74 | foreach my $word (keys %{$CASING{$type}}) { 75 | print MODEL " $word ($CASING{$type}{$word})" unless $word eq $best; 76 | } 77 | print MODEL "\n"; 78 | } 79 | close(MODEL); 80 | -------------------------------------------------------------------------------- /scripts/moses/truecase.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | # $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $ 4 | use strict; 5 | use Getopt::Long "GetOptions"; 6 | 7 | binmode(STDIN, ":utf8"); 8 | binmode(STDOUT, ":utf8"); 9 | 10 | # apply switches 11 | my ($MODEL, $UNBUFFERED); 12 | die("truecase.perl --model MODEL [-b] < in > out") 13 | unless &GetOptions('model=s' => \$MODEL,'b|unbuffered' => \$UNBUFFERED) 14 | && defined($MODEL); 15 | if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; } 16 | 17 | my (%BEST,%KNOWN); 18 | open(MODEL,$MODEL) || die("ERROR: could not open '$MODEL'"); 19 | binmode(MODEL, ":utf8"); 20 | while(<MODEL>) { 21 | my ($word,@OPTIONS) = split; 22 | $BEST{ lc($word) } = $word; 23 | $KNOWN{ $word } = 1; 24 | for(my $i=1;$i<$#OPTIONS;$i+=2) { 25 | $KNOWN{ $OPTIONS[$i] } = 1; 26 | } 27 | } 28 | close(MODEL); 29 | 30 | my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1); 31 | my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"'"=>1,"""=>1,"["=>1,"]"=>1); 32 | 33 | while(<STDIN>) { 34 | chop; 35 | my ($WORD,$MARKUP) = split_xml($_); 36 | my $sentence_start = 1; 37 | for(my $i=0;$i<=$#$WORD;$i++) { 38 | print " " if $i; 39 | print $$MARKUP[$i]; 40 | 41 | $$WORD[$i] =~ /^([^\|]+)(.*)/; 42 | my $word = $1; 43 | my $otherfactors = $2; 44 | 45 | if ($sentence_start && defined($BEST{lc($word)})) { 46 | print $BEST{lc($word)}; # truecase sentence start 47 | } 48 | elsif (defined($KNOWN{$word})) { 49 | print $word; # don't change known words 50 | } 51 | elsif (defined($BEST{lc($word)})) { 52 | print $BEST{lc($word)}; # truecase otherwise unknown 
words 53 | } 54 | else { 55 | print $word; # unknown, nothing to do 56 | } 57 | print $otherfactors; 58 | 59 | if ( defined($SENTENCE_END{ $word })) { $sentence_start = 1; } 60 | elsif (!defined($DELAYED_SENTENCE_START{ $word })) { $sentence_start = 0; } 61 | } 62 | print " ".$$MARKUP[$#$MARKUP]; 63 | print "\n"; 64 | } 65 | 66 | # store away xml markup 67 | sub split_xml { 68 | my ($line) = @_; 69 | my (@WORD,@MARKUP); 70 | my $i = 0; 71 | $MARKUP[0] = ""; 72 | while($line =~ /\S/) { 73 | if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) { 74 | $MARKUP[$i] .= $1." "; 75 | $line = $2; 76 | } 77 | elsif ($line =~ /^\s*(\S+)(.*)$/) { 78 | $WORD[$i++] = $1; 79 | $MARKUP[$i] = ""; 80 | $line = $2; 81 | } 82 | else { 83 | die("ERROR: huh? $line\n"); 84 | } 85 | } 86 | chop($MARKUP[$#MARKUP]); 87 | return (\@WORD,\@MARKUP); 88 | } 89 | -------------------------------------------------------------------------------- /scripts/moses/wrap-xml.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 5 | 6 | use warnings; 7 | use strict; 8 | 9 | my ($language,$src,$system) = @ARGV; 10 | die("wrapping frame not found ($src)") unless -e $src; 11 | $system = "Edinburgh" unless $system; 12 | 13 | open(SRC,$src) or die "Cannot open: $!"; 14 | my @OUT = <STDIN>; 15 | chomp(@OUT); 16 | #my @OUT = `cat $decoder_output`; 17 | my $missing_end_seg = 0; 18 | while(<SRC>) { 19 | chomp; 20 | if (/^<srcset/) { 21 | s/<srcset/<tstset trglang="$language"/i; 22 | } 23 | elsif (/^<\/srcset/) { 24 | s/<\/srcset/<\/tstset/i; 25 | } 26 | elsif (/^<doc/i) { 27 | s/ *sysid="[^\"]+"//; 28 | s/<doc/<doc sysid="$system"/i; 29 | } 30 | elsif (/<seg/) { 31 | my $line = shift(@OUT); 32 | $line = "" if $line =~ /NO BEST TRANSLATION/; 33 | if (/<\/seg>/) { 34 | s/(<seg[^>]+> *).*(<\/seg>)/$1$line$2/i; 35 | $missing_end_seg = 0; 36 | } 37 | else { 38 | s/(<seg[^>]+> *)[^<]*/$1$line<\/seg>/i; 39 | $missing_end_seg = 1; 40 | } 41 | } 42 | elsif ($missing_end_seg) { 43 | if (/<\/doc>/) { 44 | $missing_end_seg = 0; 45 | } 46 | else { 47 | next; 48 | } 49 | } 50 | print $_."\n"; 51 | } 52 | -------------------------------------------------------------------------------- /scripts/multi-print.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import argparse 4 | import sys 5 | import subprocess 6 | import re 7 | import sys 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('files', nargs='+') 11 | parser.add_argument('--head', action='store_true') 12 | parser.add_argument('--shuf', action='store_true') 13 | parser.add_argument('-n', type=int) 14 | parser.add_argument('-d', '--delimiter', default='^', choices=['&', '^', '@', '~', '|', '/', '#', '$']) 15 | parser.add_argument('--space', action='store_true') 16 | 17 | args = parser.parse_args() 18 | 19 | commands = [] 20 | paste = ['paste', '-d', args.delimiter] + list(args.files) 21 | commands.append(paste) 22 | 23 | if args.shuf: 24 | shuf = ['shuf'] 25 | if args.n: 26 | shuf += ['-n', str(args.n)] 27 | commands.append(shuf) 28 | if args.head: 29 | head = ['head', '-n', str(args.n or 10)] 30 | commands.append(head) 31 | 32 | if args.space: 33 | space = ['sed', 'G'] 34 | commands.append(space) 35 | 36 | delimiter = re.escape(args.delimiter) if args.delimiter in ('/', '^', '$') else args.delimiter 37 | sed = 
['sed', 's/{}/\\n/g'.format(delimiter)] 38 | commands.append(sed) 39 | 40 | ps = None 41 | 42 | for i, cmd in enumerate(commands): 43 | stdout = sys.stdout if i == len(commands) - 1 else subprocess.PIPE 44 | stdin = None if i == 0 else ps.stdout 45 | ps = subprocess.Popen(cmd, stdin=stdin, stdout=stdout, stderr=open('/dev/null', 'w')) 46 | 47 | ps.wait() 48 | 49 | -------------------------------------------------------------------------------- /scripts/paired-eval.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import sys 5 | import numpy as np 6 | from translate.evaluation import corpus_bleu, corpus_ter 7 | 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('source1') 10 | parser.add_argument('source2') 11 | parser.add_argument('target') 12 | 13 | parser.add_argument('--bleu', action='store_true') 14 | parser.add_argument('--max-size', type=int) 15 | parser.add_argument('--case-insensitive', '-i', action='store_true') 16 | 17 | parser.add_argument('--samples', type=int, default=1000) 18 | parser.add_argument('--sample-size', type=int, default=0) 19 | parser.add_argument('-p', type=float, default=0.05) 20 | 21 | 22 | if __name__ == '__main__': 23 | args = parser.parse_args() 24 | 25 | with open(args.source1) as src_file_1, open(args.source2) as src_file_2, open(args.target) as trg_file: 26 | if args.case_insensitive: 27 | fun = lambda x: x.strip().lower() 28 | else: 29 | fun = lambda x: x.strip() 30 | 31 | hypotheses_1 = list(map(fun, src_file_1)) 32 | hypotheses_2 = list(map(fun, src_file_2)) 33 | references = list(map(fun, trg_file)) 34 | 35 | if args.max_size is not None: 36 | hypotheses_1 = hypotheses_1[:args.max_size] 37 | hypotheses_2 = hypotheses_2[:args.max_size] 38 | references = references[:args.max_size] 39 | 40 | if len(hypotheses_1) != len(references) or len(hypotheses_2) != len(references): 41 | sys.stderr.write('warning: source and target don\'t have the same length\n') 42 | size = min(len(hypotheses_1), len(hypotheses_2), len(references)) 43 | hypotheses_1 = hypotheses_1[:size] 44 | hypotheses_2 = hypotheses_2[:size] 45 | references = references[:size] 46 | 47 | indices = np.arange(len(references)) 48 | if args.sample_size == 0: 49 | args.sample_size = len(references) 50 | 51 | diffs = [] 52 | 53 | hypotheses_1 = np.array(hypotheses_1) 54 | hypotheses_2 = np.array(hypotheses_2) 55 | references = np.array(references) 56 | 57 | score_fun = corpus_bleu if args.bleu else corpus_ter 58 | 59 | #diff = abs(score_fun(hypotheses_1, references)[0] - score_fun(hypotheses_2, references)[0]) 60 | 61 | for _ in range(args.samples): 62 | indices = np.random.randint(len(references), size=args.sample_size) 63 | hypotheses_1_ = hypotheses_1[indices] 64 | hypotheses_2_ = hypotheses_2[indices] 65 | references_ = references[indices] 66 | 67 | score_1, _ = score_fun(hypotheses_1_, references_) 68 | score_2, _ = score_fun(hypotheses_2_, references_) 69 | 70 | diffs.append(int(score_1 > score_2)) 71 | #diffs.append(abs(score_1 - score_2)) 72 | 73 | # avg_diff = sum(diffs) / len(diffs) 74 | # c = sum( 75 | # int(diff_ - avg_diff >= diff) for diff_ in diffs 76 | # ) 77 | # 78 | # p = (c + 1) / (len(diffs) + 1) 79 | # print(p) 80 | 81 | p = sum(diffs) / len(diffs) 82 | if not args.bleu: 83 | p = 1 - p 84 | 85 | print('x is better than y {:.1f}% of the time'.format(p * 100)) 86 | -------------------------------------------------------------------------------- /scripts/plot-score-per-length.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import sys 5 | import os 6 | import re 7 | import numpy as np 8 | from collections import OrderedDict 9 | from matplotlib import pyplot as plt 10 | 11 | script_dir = os.path.dirname(os.path.abspath(__file__)) 12 | root_dir = os.path.dirname(script_dir) 13 | sys.path.append(root_dir) 14 | tercom_path = os.path.join(script_dir, 'tercom.jar') 15 | 16 | from translate.evaluation import corpus_bleu, corpus_ter, corpus_wer, corpus_cer, corpus_bleu1 17 | 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('mt', nargs='+') 20 | parser.add_argument('ref') 21 | 22 | parser.add_argument('--src') 23 | parser.add_argument('--min', type=int, default=0) 24 | parser.add_argument('--max', type=int, default=70) 25 | parser.add_argument('--step', type=int, default=5) 26 | parser.add_argument('--labels', nargs='*') 27 | parser.add_argument('--output') 28 | 29 | parser.add_argument('--bar', action='store_true') 30 | 31 | args = parser.parse_args() 32 | 33 | if args.src is None: 34 | args.src = args.ref 35 | 36 | assert args.labels is None or len(args.labels) == len(args.mt) 37 | 38 | for k, mt in enumerate(args.mt): 39 | with open(args.src) as src_file, open(mt) as mt_file, open(args.ref) as ref_file: 40 | lines = list(zip(src_file, mt_file, ref_file)) 41 | 42 | bins = OrderedDict() 43 | 44 | for i in range(args.min, args.max, args.step): 45 | lines_ = [(mt.strip(), ref.strip()) for src, mt, ref in lines if i < len(src.split()) <= i + args.step] 46 | if len(lines_) > 0: 47 | score, summary = corpus_bleu(*zip(*lines_)) 48 | bins[i + args.step] = score 49 | # print(i + args.step, '{:.1f}'.format(score), len(lines_), summary) 50 | 51 | values = np.array(list(bins.values())) 52 | keys = np.array(list(bins.keys())) 53 | 54 | label = args.labels[k] if args.labels else None 55 | 56 | if args.bar: 57 | width = 1 if len(args.mt) > 1 else args.step - 1 58 | keys += k 59 | plt.bar(keys + k, values, width=width, label=label) 60 | else: 61 | plt.plot(keys, values, label=label) 62 | 63 | xlabel = 'Reference words' if args.src == args.ref else 'Source words' 64 | plt.xlabel(xlabel) 65 | plt.ylabel('BLEU') 66 | plt.legend() 67 | 68 | if args.output: 69 | plt.savefig(args.output) 70 | else: 71 | plt.show() 72 | -------------------------------------------------------------------------------- /scripts/post_editing/apply-edits.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | from translate.utils import reverse_edits 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('source') 8 | parser.add_argument('edits') 9 | parser.add_argument('--not-strict', action='store_false', dest='strict') 10 | parser.add_argument('--no-fix', action='store_false', dest='fix') 11 | 12 | if __name__ == '__main__': 13 | args = parser.parse_args() 14 | with open(args.source) as src_file, open(args.edits) as edit_file: 15 | for source, edits in zip(src_file, edit_file): 16 | target = reverse_edits(source.strip('\n'), edits.strip('\n'), strict=args.strict, fix=args.fix) 17 | print(target) 18 | -------------------------------------------------------------------------------- /scripts/post_editing/extract-ter-vectors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import numpy as np 5 | from translate.evaluation import 
tercom_statistics 6 | from itertools import islice 7 | 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('source') 10 | parser.add_argument('target') 11 | parser.add_argument('--output') 12 | parser.add_argument('--precision', type=int, default=4) 13 | 14 | parser.add_argument('--case-insensitive', '-i', action='store_true') 15 | 16 | if __name__ == '__main__': 17 | args = parser.parse_args() 18 | 19 | vectors = [] 20 | fields = ['DEL', 'INS', 'SUB', 'WORD_SHIFT', 'REF_WORDS', 'TER'] 21 | 22 | with open(args.source) as src_file, open(args.target) as trg_file: 23 | 24 | i = 0 25 | n = 1000 26 | 27 | avg_length = 0 28 | 29 | while True: 30 | i += 1 31 | hypotheses = list(islice(src_file, n)) 32 | references = list(islice(trg_file, n)) 33 | 34 | if not hypotheses or not references: 35 | break 36 | 37 | hypotheses = [line.strip() for line in hypotheses] 38 | references = [line.strip() for line in references] 39 | 40 | _, stats = tercom_statistics(hypotheses, references, not args.case_insensitive) 41 | 42 | if avg_length == 0: 43 | avg_length = sum(stats_['REF_WORDS'] for stats_ in stats) / len(stats) 44 | 45 | for stats_ in stats: 46 | for field in ('DEL', 'INS', 'SUB', 'WORD_SHIFT'): 47 | stats_[field] /= stats_['REF_WORDS'] 48 | 49 | stats_['REF_WORDS'] = (stats_['REF_WORDS'] - avg_length) / avg_length 50 | stats_['TER'] /= 100 51 | 52 | if not args.output: 53 | print('\n'.join(','.join(str(round(stats_[k], args.precision)) for k in fields) 54 | for stats_ in stats)) 55 | else: 56 | vectors += [np.array([stats_[k] for k in fields]) for stats_ in stats] 57 | print('{}'.format(i * n), end='\r') 58 | 59 | if args.output: 60 | import h5py 61 | h5f = h5py.File(args.output, 'w') 62 | h5f.create_dataset('dataset_1', data=vectors) 63 | h5f.close() 64 | -------------------------------------------------------------------------------- /scripts/post_editing/noisify.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import sys 5 | import sklearn.mixture 6 | import numpy as np 7 | import random 8 | from scipy.stats import truncnorm 9 | from collections import Counter 10 | from translate.evaluation import tercom_statistics 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('source') 14 | parser.add_argument('target') 15 | 16 | parser.add_argument('--mono') 17 | parser.add_argument('--min-count', type=int, default=2) 18 | parser.add_argument('--case-insensitive', '-i', action='store_true') 19 | 20 | if __name__ == '__main__': 21 | args = parser.parse_args() 22 | 23 | fields = ['DEL', 'INS', 'SUB', 'WORD_SHIFT', 'REF_WORDS'] 24 | op_fields = ['DEL', 'INS', 'SUB', 'WORD_SHIFT'] 25 | 26 | with open(args.source) as src_file, open(args.target) as trg_file: 27 | hypotheses = [line.strip() for line in src_file] 28 | references = [line.strip() for line in trg_file] 29 | 30 | _, stats = tercom_statistics(hypotheses, references, not args.case_insensitive) 31 | 32 | for stats_ in stats: 33 | for field in op_fields: 34 | stats_[field] /= stats_['REF_WORDS'] 35 | 36 | ops = np.array([[stats_[k] for k in op_fields] for stats_ in stats]) 37 | 38 | model = sklearn.mixture.GMM(n_components=1) 39 | model.fit(ops) 40 | 41 | sigma = model.covars_ 42 | mu = model.means_ 43 | distribution = truncnorm(-mu / sigma, np.inf, loc=mu, scale=sigma) 44 | 45 | unigram_filename = args.mono or args.source 46 | with open(unigram_filename) as unigram_file: 47 | unigrams = Counter(w for line in unigram_file for w in 
line.split()) 48 | unigrams = Counter({w: c for w, c in unigrams.items() if c >= args.min_count}) 49 | 50 | total = sum(unigrams.values()) 51 | for k in unigrams.keys(): 52 | unigrams[k] /= total 53 | 54 | vocab = list(unigrams.keys()) 55 | p = np.array(list(unigrams.values())) 56 | 57 | def unigram_sampler(): 58 | while True: 59 | x = np.random.choice(vocab, size=1000, p=p) 60 | for w in x: 61 | yield w 62 | 63 | sampler = unigram_sampler() 64 | 65 | for line in sys.stdin: 66 | words = line.split() 67 | 68 | sample = distribution.rvs(len(op_fields)) * len(words) 69 | 70 | x = sample.astype(np.int32) 71 | i = np.random.random(sample.shape) < sample - sample.astype(np.int32) 72 | x += i.astype(np.int32) 73 | 74 | dels, ins, subs, shifts = x 75 | 76 | for _ in range(dels): 77 | k = random.randrange(len(words)) 78 | del words[k] 79 | 80 | for _ in range(shifts): 81 | j, k = random.sample(range(len(words)), 2) 82 | w = words.pop(j) 83 | words.insert(k, w) 84 | 85 | for _ in range(subs): 86 | w = next(sampler) 87 | k = random.randrange(len(words)) 88 | words[k] = w 89 | 90 | for _ in range(ins): 91 | w = next(sampler) 92 | k = random.randrange(len(words) + 1) 93 | words.insert(k, w) 94 | 95 | print(' '.join(words)) 96 | -------------------------------------------------------------------------------- /scripts/post_editing/plot-ops.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | from matplotlib import pyplot as plt 5 | import os 6 | 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('eval_dir') 9 | parser.add_argument('reference') 10 | parser.add_argument('--max-step', type=int) 11 | args = parser.parse_args() 12 | 13 | filenames = sorted(os.listdir(args.eval_dir), key=lambda filename: int(filename.split('.')[-2])) 14 | steps = [int(filename.split('.')[-2]) for filename in filenames] 15 | 16 | filenames = [os.path.join(args.eval_dir, filename) for filename in filenames] 17 | 18 | with open(args.reference) as ref_file: 19 | lines = [line.split() for line in ref_file] 20 | ref_keeps = [line.count('<KEEP>') for line in lines] 21 | ref_dels = [line.count('<DEL>') for line in lines] 22 | ref_ins = [len(line) - line.count('<KEEP>') - line.count('<DEL>') for line in lines] 23 | 24 | 25 | keeps = [] 26 | dels = [] 27 | ins = [] 28 | 29 | fun = lambda x, y, z: abs(x - y) / z 30 | #fun = lambda x, y, z: x/z 31 | 32 | for filename in filenames: 33 | with open(filename) as f: 34 | keep_ = 0 35 | del_ = 0 36 | ins_ = 0 37 | lines = 0 38 | 39 | for i, line in enumerate(f): 40 | words = line.split() 41 | lines += 1 42 | keep_ += fun(words.count('<KEEP>'), ref_keeps[i], len(words)) 43 | del_ += fun(words.count('<DEL>'), ref_dels[i], len(words)) 44 | ins_ += fun(len(words) - words.count('<KEEP>') - words.count('<DEL>'), ref_ins[i], len(words)) 45 | 46 | keeps.append(keep_ / lines) 47 | dels.append(del_ / lines) 48 | ins.append(ins_ / lines) 49 | 50 | 51 | if args.max_step: 52 | steps, keeps, dels, ins = zip(*[ 53 | (step, keep_, del_, ins_) for step, keep_, del_, ins_ 54 | in zip(steps, keeps, dels, ins) if step <= args.max_step 55 | ]) 56 | 57 | plt.plot(steps, keeps, label='KEEP') 58 | plt.plot(steps, dels, label='DEL') 59 | plt.plot(steps, ins, label='INS(x)') 60 | 61 | legend = plt.legend(loc='best', shadow=True) 62 | 63 | plt.show() -------------------------------------------------------------------------------- /scripts/post_editing/plot-ter.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | from translate.evaluation import tercom_statistics 4 | from matplotlib import pyplot as plt 5 | import numpy as np 6 | 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('hyp_files', nargs='+') 9 | parser.add_argument('ref_file') 10 | parser.add_argument('--reverse', action='store_true') 11 | parser.add_argument('--labels', nargs='+') 12 | parser.add_argument('--legend-loc', default='upper right') 13 | parser.add_argument('--bar-width', type=float, default=0.2) 14 | parser.add_argument('--ymin', type=float, default=0.0) 15 | parser.add_argument('--ymax', type=float, default=0.3) 16 | parser.add_argument('--ops', nargs='+', default=['ins', 'del', 'sub', 'shift']) 17 | parser.add_argument('--fig-size', nargs=2, type=float) 18 | parser.add_argument('--save') 19 | 20 | parser.add_argument('--average', nargs='+', type=int) 21 | 22 | if __name__ == '__main__': 23 | args = parser.parse_args() 24 | 25 | with open(args.ref_file) as f: 26 | references = [line.strip() for line in f] 27 | 28 | hypotheses = [] 29 | for hyp_file in args.hyp_files: 30 | with open(hyp_file) as f: 31 | hypotheses.append([line.strip() for line in f]) 32 | 33 | if args.reverse: 34 | scores = [tercom_statistics(references, hyp)[0] for hyp in hypotheses] 35 | else: 36 | scores = [tercom_statistics(hyp, references)[0] for hyp in hypotheses] 37 | 38 | N = len(args.average) if args.average else len(args.hyp_files) 39 | ind = np.arange(N) 40 | op_name_mapping = {'ins': 'Insertions', 'del': 'Deletions', 'sub': 'Substitutions', 'shift': 'Shifts'} 41 | 42 | ref_words = np.array([score["REF_WORDS"] for score in scores]) 43 | bars = [] 44 | legend = [] 45 | 46 | bottom = np.zeros(N) 47 | 48 | colors = ['#e66101', '#fdb863', '#b2abd2', '#5e3c99'] 49 | 50 | if args.fig_size: 51 | plt.figure(figsize=tuple(args.fig_size)) 52 | 53 | for op, color in zip(args.ops, colors): 54 | scores_ = np.array([score[op.upper()] for score in scores]) / ref_words 55 | if args.average: 56 | new_scores_ = [] 57 | j = 0 58 | for n in args.average: 59 | new_scores_.append(np.average(scores_[j:j+n])) 60 | j += n 61 | scores_ = np.array(new_scores_) 62 | 63 | bar = plt.bar(ind, scores_, args.bar_width, bottom=bottom, color=color, align='center') 64 | 65 | bars.append(bar) 66 | legend.append(op_name_mapping[op]) 67 | bottom += scores_ 68 | 69 | #plt.legend((p_ins[0], p_del[0], p_sub[0], p_shift[0])[::-1], ('Insertions', 'Deletions', 'Substitutions', 'Shifts')[::-1], 70 | # loc='upper right') 71 | 72 | try: 73 | loc = float(args.legend_loc) 74 | plt.legend(bars[::-1], legend[::-1], bbox_to_anchor=[loc, 1], loc="upper center") 75 | except: 76 | plt.legend(bars[::-1], legend[::-1], loc=args.legend_loc) 77 | 78 | plt.ylabel('TER') 79 | 80 | if args.labels: 81 | plt.xticks(ind, args.labels) 82 | else: 83 | plt.xticks([]) 84 | 85 | axes = plt.gca() 86 | axes.set_ylim([args.ymin, args.ymax]) 87 | 88 | if args.save: 89 | plt.savefig(args.save) 90 | else: 91 | plt.show() 92 | 93 | """ 94 | N = 5 95 | menMeans = (20, 35, 30, 35, 27) 96 | womenMeans = (25, 32, 34, 20, 25) 97 | menStd = (2, 3, 4, 1, 2) 98 | womenStd = (3, 5, 2, 3, 3) 99 | ind = np.arange(N) # the x locations for the groups 100 | width = 0.35 # the width of the bars: can also be len(x) sequence 101 | 102 | p1 = plt.bar(ind, menMeans, width, yerr=menStd) 103 | p2 = plt.bar(ind, womenMeans, width, 104 | bottom=menMeans, yerr=womenStd) 105 | 106 | plt.ylabel('Scores') 
107 | plt.title('Scores by group and gender') 108 | plt.xticks(ind, ('G1', 'G2', 'G3', 'G4', 'G5')) 109 | plt.yticks(np.arange(0, 81, 10)) 110 | plt.legend((p1[0], p2[0]), ('Men', 'Women')) 111 | 112 | plt.show() 113 | 114 | import ipdb; ipdb.set_trace() 115 | """ 116 | -------------------------------------------------------------------------------- /scripts/post_editing/reverse-edits.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | from translate import utils 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('source') 8 | parser.add_argument('edits') 9 | 10 | 11 | if __name__ == '__main__': 12 | args = parser.parse_args() 13 | with open(args.source) as src_file, open(args.edits) as edit_file: 14 | for src_line, edits in zip(src_file, edit_file): 15 | trg_line = utils.reverse_edits(src_line.split(), [edits.split()]) 16 | print(' '.join(trg_line)) 17 | -------------------------------------------------------------------------------- /scripts/post_editing/select-by-index.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import numpy as np 5 | import random 6 | import sys 7 | 8 | parser = argparse.ArgumentParser() 9 | 10 | parser.add_argument('indices') 11 | 12 | if __name__ == '__main__': 13 | args = parser.parse_args() 14 | 15 | with open(args.indices) as f: 16 | indices = sorted(list(set([int(line) for line in f])), reverse=True) 17 | 18 | for i, line in enumerate(sys.stdin): 19 | if len(indices) == 0: 20 | break 21 | 22 | if i == indices[-1]: 23 | indices.pop() 24 | print(line.rstrip('\r\n')) 25 | -------------------------------------------------------------------------------- /scripts/post_editing/select-by-length.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import random 5 | import sys 6 | 7 | parser = argparse.ArgumentParser() 8 | 9 | parser.add_argument('ref_sentences') 10 | parser.add_argument('sentences') 11 | parser.add_argument('-n', type=int, default=500000) 12 | parser.add_argument('-k', type=int, default=1) 13 | parser.add_argument('-m', type=int, default=1000) 14 | 15 | if __name__ == '__main__': 16 | args = parser.parse_args() 17 | 18 | with open(args.ref_sentences) as f: 19 | ref_lengths = [len(line.split()) for line in f] 20 | with open(args.sentences) as f: 21 | lengths = [len(line.split()) for line in f] 22 | lengths = list(enumerate(lengths)) 23 | 24 | n = 0 25 | l = len(lengths) 26 | 27 | while n < args.n and l > 0: 28 | length = ref_lengths[n % len(ref_lengths)] 29 | 30 | def key(i): 31 | return abs(length - lengths[i][1]) 32 | 33 | indices = random.sample(range(l), k=args.m) 34 | 35 | if args.k > 1: 36 | indices = sorted(indices, key=key)[:args.k] 37 | else: 38 | indices = [min(indices, key=key)] 39 | 40 | for i in indices: 41 | sys.stdout.write(str(lengths[i][0]) + '\n') 42 | 43 | #sys.stdout.flush() 44 | 45 | for i in indices: 46 | lengths[i], lengths[l - 1] = lengths[l - 1], lengths[i] 47 | l -= 1 48 | n += 1 49 | -------------------------------------------------------------------------------- /scripts/post_editing/select-by-ter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import numpy as np 5 | import random 6 | import sys 7 | 8 | parser = argparse.ArgumentParser() 9 | 10 | 
parser.add_argument('ref_vectors') 11 | parser.add_argument('vectors') 12 | parser.add_argument('-n', type=int, default=500000) 13 | parser.add_argument('-k', type=int, default=1) 14 | parser.add_argument('-m', type=int, default=1000) 15 | 16 | if __name__ == '__main__': 17 | args = parser.parse_args() 18 | 19 | with open(args.ref_vectors) as f: 20 | ref_vectors = [np.array([float(x) for x in line.split(',')]) for line in f] 21 | with open(args.vectors) as f: 22 | vectors = [np.array([float(x) for x in line.split(',')]) for line in f] 23 | vectors = list(enumerate(vectors)) 24 | 25 | n = 0 26 | l = len(vectors) 27 | 28 | while n < args.n and l > 0: 29 | vector = ref_vectors[n % len(ref_vectors)] 30 | n += 1 31 | 32 | def key(i): 33 | return np.sum((vector - vectors[i][1]) ** 2) 34 | 35 | indices = random.sample(range(l), k=args.m) 36 | 37 | if args.k > 1: 38 | indices = sorted(indices, key=key)[:args.k] 39 | else: 40 | indices = [min(indices, key=key)] 41 | 42 | for i in indices: 43 | sys.stdout.write(str(vectors[i][0]) + '\n') 44 | 45 | #sys.stdout.flush() 46 | 47 | for i in indices: 48 | vectors[i], vectors[l - 1] = vectors[l - 1], vectors[i] 49 | l -= 1 50 | -------------------------------------------------------------------------------- /scripts/post_editing/stats-TER.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import sys 5 | import numpy as np 6 | import re 7 | from translate.evaluation import corpus_bleu, corpus_ter, corpus_wer, tercom_statistics 8 | from collections import OrderedDict 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('source') 12 | parser.add_argument('target') 13 | parser.add_argument('--bleu', action='store_true') 14 | #parser.add_argument('--ter', action='store_true') 15 | #parser.add_argument('--wer', action='store_true') 16 | #parser.add_argument('--all', '-a', action='store_true') 17 | parser.add_argument('--max-size', type=int) 18 | parser.add_argument('--case-insensitive', '-i', action='store_true') 19 | 20 | parser.add_argument('--draws', type=int, default=1000) 21 | parser.add_argument('--sample-size', type=int, default=0) 22 | parser.add_argument('-p', type=float, default=0.05) 23 | 24 | 25 | if __name__ == '__main__': 26 | args = parser.parse_args() 27 | 28 | with open(args.source) as src_file, open(args.target) as trg_file: 29 | if args.case_insensitive: 30 | hypotheses = [line.strip().lower() for line in src_file] 31 | references = [line.strip().lower() for line in trg_file] 32 | else: 33 | hypotheses = [line.strip() for line in src_file] 34 | references = [line.strip() for line in trg_file] 35 | 36 | if args.max_size is not None: 37 | hypotheses = hypotheses[:args.max_size] 38 | references = references[:args.max_size] 39 | 40 | if len(hypotheses) != len(references): 41 | sys.stderr.write('warning: source and target don\'t have the same length\n') 42 | size = min(len(hypotheses), len(references)) 43 | hypotheses = hypotheses[:size] 44 | references = references[:size] 45 | 46 | avg_stats, stats = tercom_statistics(hypotheses, references) 47 | 48 | ters = [stats_['TER'] for stats_ in stats] 49 | 50 | mean = sum(ters) / len(ters) 51 | variance = sum((ter - mean) ** 2 for ter in ters) / (len(ters) - 1) 52 | 53 | ts = {0.01: 2.5841, 0.05: 1.9639, 0.10: 1.6474} 54 | t = ts.get(args.p) 55 | if t is None: 56 | raise Exception 57 | 58 | d = t * np.sqrt(variance / len(ters)) 59 | 60 | print('{:.3f} +/- {:.3f}'.format(mean, d)) 
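Note on stats-TER.py above: it reports the mean sentence-level TER together with a two-sided Student-t confidence interval, but only for the three hard-coded significance levels (p = 0.01, 0.05, 0.10). Below is a minimal sketch (not part of the repository) of the same computation for an arbitrary p, using scipy's t distribution instead of the hard-coded critical values; the `ters` argument is a stand-in for the per-sentence TER values that `tercom_statistics` returns.

    import numpy as np
    from scipy import stats

    def ter_confidence_interval(ters, p=0.05):
        # mean sentence-level TER with a two-sided t confidence interval
        ters = np.asarray(ters, dtype=float)
        mean = ters.mean()
        variance = ters.var(ddof=1)                   # unbiased sample variance, as in the script
        t = stats.t.ppf(1 - p / 2, df=len(ters) - 1)  # critical value for the requested p
        half_width = t * np.sqrt(variance / len(ters))
        return mean, half_width

    # usage with dummy TER values:
    # mean, d = ter_confidence_interval([43.2, 51.0, 38.7, 60.1], p=0.05)
    # print('{:.3f} +/- {:.3f}'.format(mean, d))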
-------------------------------------------------------------------------------- /scripts/post_editing/ter-stats.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | from translate.evaluation import tercom_statistics 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('source') 8 | parser.add_argument('target') 9 | 10 | parser.add_argument('--case-insensitive', '-i', action='store_true') 11 | 12 | 13 | if __name__ == '__main__': 14 | args = parser.parse_args() 15 | 16 | with open(args.source) as src_file, open(args.target) as trg_file: 17 | hypotheses = [line.strip() for line in src_file] 18 | references = [line.strip() for line in trg_file] 19 | 20 | total, _ = tercom_statistics(hypotheses, references, not args.case_insensitive) 21 | 22 | total['TER'] = total['ERRORS'] / total['REF_WORDS'] 23 | print(' '.join('{}={:.2f}'.format(k, v) for k, v in sorted(total.items()))) 24 | -------------------------------------------------------------------------------- /scripts/post_editing/to-sgm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import sys 5 | import numpy as np 6 | #from translate.evaluation import corpus_bleu, corpus_ter 7 | 8 | parser = argparse.ArgumentParser() 9 | # parser.add_argument('source1') 10 | # parser.add_argument('source2') 11 | # parser.add_argument('target') 12 | # 13 | # parser.add_argument('--bleu', action='store_true') 14 | # parser.add_argument('--max-size', type=int) 15 | # parser.add_argument('--case-insensitive', '-i', action='store_true') 16 | # 17 | # parser.add_argument('--draws', type=int, default=1000) 18 | # parser.add_argument('--sample-size', type=int, default=0) 19 | # parser.add_argument('-p', type=float, default=0.05) 20 | parser.add_argument('--set-type') 21 | parser.add_argument('--set-id') 22 | 23 | args = parser.parse_args() 24 | 25 | if args.set_type is not None: 26 | if args.set_id is None: 27 | args.set_id = 'dummy' 28 | 29 | print('<{} setid="{}" srclang="any" trglang="any">'.format(args.set_type, args.set_id)) 30 | 31 | print('<doc docid="dummy" sysid="{}">'.format(args.set_type)) 32 | for i, line in enumerate(sys.stdin, 1): 33 | print('<seg id="{}">{}</seg>'.format(i, line.strip())) 34 | print('</doc>') 35 | 36 | if args.set_type is not None: 37 | print('</{}>'.format(args.set_type)) 38 | -------------------------------------------------------------------------------- /scripts/post_editing/well-formed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | import string 4 | from signal import signal, SIGPIPE, SIG_DFL 5 | 6 | signal(SIGPIPE, SIG_DFL) 7 | 8 | punk = '.!?:")' 9 | 10 | def is_well_formed(line): 11 | if len(line) < 21: 12 | return False 13 | 14 | x = line[0] 15 | if not x.isdigit() and not (x.isalpha() and x.isupper()): 16 | return False 17 | if not line[-2] in punk: # last character is '\n' 18 | return False 19 | 20 | i = 0 21 | k = 0 22 | 23 | for c in line: 24 | if c == ' ': 25 | continue 26 | 27 | k += 1 28 | if c.isalpha(): 29 | i += 1 30 | 31 | j = 0 32 | prev = None 33 | for word in line.split(): 34 | if prev is not None and word == prev: 35 | j += 1 36 | if j > 3: 37 | return False 38 | else: 39 | prev = word 40 | j = 1 41 | 42 | return i >= 20 and i >= k * 0.75 43 | 44 | 45 | if __name__ == '__main__': 46 | for line in sys.stdin: 47 | if is_well_formed(line): 
48 | sys.stdout.write(line) 49 | -------------------------------------------------------------------------------- /scripts/reverse.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | 4 | for line in sys.stdin: 5 | print(' '.join(reversed(line.split()))) 6 | -------------------------------------------------------------------------------- /scripts/score.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import sys 5 | import os 6 | import re 7 | from collections import OrderedDict 8 | 9 | script_dir = os.path.dirname(os.path.abspath(__file__)) 10 | root_dir = os.path.dirname(script_dir) 11 | sys.path.append(root_dir) 12 | tercom_path = os.path.join(script_dir, 'tercom.jar') 13 | 14 | from translate.evaluation import corpus_bleu, corpus_ter, corpus_wer, corpus_cer, corpus_bleu1 15 | 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('source') 18 | parser.add_argument('target') 19 | parser.add_argument('--bleu', action='store_true') 20 | parser.add_argument('--ter', action='store_true') 21 | parser.add_argument('--wer', action='store_true') 22 | parser.add_argument('--cer', action='store_true') 23 | parser.add_argument('--bleu1', action='store_true') 24 | parser.add_argument('--all', '-a', action='store_true') 25 | parser.add_argument('--max-size', type=int) 26 | parser.add_argument('--no-punk', action='store_true') 27 | 28 | parser.add_argument('--max-len', type=int, default=0) 29 | parser.add_argument('--min-len', type=int, default=0) 30 | 31 | parser.add_argument('--case-insensitive', '-i', action='store_true') 32 | 33 | 34 | if __name__ == '__main__': 35 | args = parser.parse_args() 36 | 37 | if not args.max_len: 38 | args.max_len = float('inf') 39 | 40 | if not any([args.all, args.wer, args.ter, args.bleu, args.cer, args.bleu1]): 41 | args.all = True 42 | 43 | if args.all: 44 | args.wer = args.ter = args.bleu = args.bleu1 = True 45 | 46 | with open(args.source) as src_file, open(args.target) as trg_file: 47 | 48 | lines = [(src, trg) for src, trg in zip(src_file, trg_file) 49 | if args.min_len <= len(trg.split()) <= args.max_len] 50 | src_lines, trg_lines = zip(*lines) 51 | 52 | def transform(sentence): 53 | sentence = sentence.strip() 54 | sentence = re.sub(r'\s+', ' ', sentence) 55 | if args.case_insensitive: 56 | sentence = sentence.lower() 57 | if args.no_punk: 58 | sentence = re.sub(r'[,!;:?"\'\.]', '', sentence) 59 | sentence = re.sub(r'@@ ', '', sentence) 60 | sentence = re.sub(r'@@', '', sentence) 61 | return sentence 62 | 63 | hypotheses = list(map(transform, src_lines)) 64 | references = list(map(transform, trg_lines)) 65 | 66 | if args.max_size is not None: 67 | hypotheses = hypotheses[:args.max_size] 68 | references = references[:args.max_size] 69 | 70 | if len(hypotheses) != len(references): 71 | sys.stderr.write('warning: source and target don\'t have the same length\n') 72 | size = min(len(hypotheses), len(references)) 73 | hypotheses = hypotheses[:size] 74 | references = references[:size] 75 | 76 | scores = OrderedDict() 77 | if args.bleu: 78 | scores['bleu'], summary = corpus_bleu(hypotheses, references) 79 | try: 80 | scores['penalty'], scores['ratio'] = map(float, re.findall('\w+=(\d+.\d+)', summary)) 81 | except ValueError: 82 | pass 83 | if args.wer: 84 | scores['wer'], _ = corpus_wer(hypotheses, references) 85 | if args.ter: 86 | try: # java missing 87 | scores['ter'], _ = 
corpus_ter(hypotheses, references, tercom_path=tercom_path) 88 | except: 89 | scores['ter'] = 0 90 | if args.cer: 91 | scores['cer'], _ = corpus_cer(hypotheses, references) 92 | if args.bleu1: 93 | scores['bleu1'], _ = corpus_bleu1(hypotheses, references) 94 | 95 | print(' '.join('{}={:.2f}'.format(k, v) for k, v in scores.items())) 96 | -------------------------------------------------------------------------------- /scripts/shuf-corpus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | from __future__ import division 4 | import sys 5 | import random 6 | import argparse 7 | import shutil 8 | 9 | help_msg = """\ 10 | Shuffles a corpus. 11 | 12 | Usage example: 13 | shuf-corpus.py data/my_corpus data/my_corpus.shuf fr en 14 | """ 15 | 16 | if __name__ == '__main__': 17 | parser = argparse.ArgumentParser(description=help_msg, formatter_class=argparse.RawDescriptionHelpFormatter) 18 | 19 | parser.add_argument('corpus', help='name of the input corpus (path without extension, e.g. data/my_corpus)') 20 | parser.add_argument('--output', help='name of the output corpus (if not specified, input corpus is overwritten)') 21 | parser.add_argument('--seed', type=int) 22 | parser.add_argument('extensions', nargs='+', help='extensions (e.g. fr, en)') 23 | 24 | args = parser.parse_args() 25 | 26 | corpus = args.corpus 27 | 28 | if args.output is not None: 29 | output = args.output 30 | else: 31 | output = corpus 32 | 33 | input_files = ['{0}.{1}'.format(args.corpus, ext) for ext in args.extensions] 34 | output_files = ['{0}.{1}'.format(output, ext) for ext in args.extensions] 35 | 36 | # reads the whole contents into memory (might cause problems if the files are too large) 37 | # TODO: process files one by one 38 | contents = [] 39 | for filename in input_files: 40 | with open(filename) as f: 41 | contents.append(f.readlines()) 42 | 43 | indices = list(range(len(contents[0]))) 44 | random.seed(args.seed) 45 | random.shuffle(indices) 46 | 47 | contents = [[content[i] for i in indices] for content in contents] 48 | 49 | for filename, content in zip(output_files, contents): 50 | with open(filename, 'w') as f: 51 | f.writelines(content) 52 | -------------------------------------------------------------------------------- /scripts/speech/cat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import numpy as np 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument('inputs', nargs='+') 7 | parser.add_argument('output') 8 | parser.add_argument('-v', '--verbose', action='store_true') 9 | 10 | args = parser.parse_args() 11 | 12 | dim = None 13 | n = 0 14 | for filename in args.inputs: 15 | with open(filename, 'rb') as f: 16 | n_, dim_ = np.load(f) 17 | n += n_ 18 | assert dim is None or dim_ == dim, 'incompatible dimensions {} != {}'.format(dim_, dim) 19 | dim = dim_ 20 | 21 | if args.verbose: 22 | print('count: {}, dim: {}'.format(n, dim)) 23 | 24 | with open(args.output, 'wb') as output_file: 25 | np.save(output_file, (n, dim)) 26 | for filename in args.inputs: 27 | with open(filename, 'rb') as f: 28 | n_, _ = np.load(f) 29 | for _ in range(n_): 30 | feats = np.load(f) 31 | np.save(output_file, feats) 32 | 33 | -------------------------------------------------------------------------------- /scripts/speech/convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 
| import argparse 3 | import numpy as np 4 | import struct 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('input') 8 | parser.add_argument('output') 9 | 10 | args = parser.parse_args() 11 | 12 | with open(args.input, 'rb') as infile, open(args.output, 'wb') as outfile: 13 | lines, dim = struct.unpack('ii', infile.read(8)) 14 | np.save(outfile, (lines, dim)) 15 | 16 | for _ in range(lines): 17 | x = infile.read(4) 18 | frames, = struct.unpack('i', x) 19 | n = frames * dim 20 | x = infile.read(4 * n) 21 | feats = struct.unpack('f' * n, x) 22 | feats = np.array(feats).reshape(frames, dim) 23 | np.save(outfile, feats.astype(np.float32)) 24 | 25 | -------------------------------------------------------------------------------- /scripts/speech/extract-new.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import numpy as np 4 | import scipy.io.wavfile as wav 5 | import tarfile 6 | import sys 7 | from python_speech_features import mfcc, delta, fbank 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('inputs', nargs='+') 11 | parser.add_argument('output') 12 | 13 | parser.add_argument('--mfcc', action='store_true') 14 | parser.add_argument('--filters', type=int, default=40) 15 | parser.add_argument('--energy', action='store_true') 16 | parser.add_argument('--step-size', type=float, default=0.010) 17 | parser.add_argument('--win-size', type=float, default=0.025) 18 | parser.add_argument('--delta', action='store_true') 19 | parser.add_argument('--delta-delta', action='store_true') 20 | parser.add_argument('--window', default='hamming') 21 | parser.add_argument('--nfft', type=int, default=512) 22 | parser.add_argument('--low-freq', type=float, default=0) 23 | parser.add_argument('--high-freq', type=float) 24 | parser.add_argument('-v', '--verbose', action='store_true') 25 | 26 | args = parser.parse_args() 27 | 28 | if args.delta_delta: 29 | args.delta = True 30 | 31 | if args.window.lower().startswith('ham'): 32 | winfunc = np.hamming 33 | elif args.window.lower().startswith('han'): 34 | winfunc = np.hanning 35 | else: 36 | winfunc = lambda x: np.ones((x,)) 37 | 38 | params = dict( 39 | winlen=args.win_size, 40 | winstep=args.step_size, 41 | nfilt=args.filters, 42 | preemph=0, 43 | winfunc=winfunc, 44 | lowfreq=args.low_freq, 45 | highfreq=args.high_freq, 46 | nfft=args.nfft) 47 | 48 | outfile = open(args.output, 'wb') 49 | 50 | total = 0 51 | for filename in args.inputs: 52 | tar = tarfile.open(filename) 53 | files = [f for f in tar.getmembers() if f.isfile()] 54 | total += len(files) 55 | 56 | dim = min(12, args.filters - 1) if args.mfcc else args.filters 57 | if args.delta_delta: 58 | dim *= 3 59 | elif args.delta: 60 | dim *= 2 61 | if args.energy: 62 | dim += 1 63 | if args.verbose: 64 | print('count: {}, dim: {}'.format(total, dim)) 65 | 66 | np.save(outfile, (total, dim)) 67 | 68 | i = 1 69 | for filename in args.inputs: 70 | tar = tarfile.open(filename) 71 | files = [f for f in tar.getmembers() if f.isfile()] 72 | files = sorted(files, key=lambda f: f.name) 73 | 74 | for fileinfo in files: 75 | with tar.extractfile(fileinfo) as f: 76 | rate, data = wav.read(f) 77 | 78 | if args.mfcc: 79 | feats = mfcc(data, rate, ceplifter=0, **params) 80 | energy = feats[:,:1] 81 | feats = feats[:,1:] 82 | else: 83 | feats, energy = fbank(data, rate, **params) 84 | feats = np.log(feats) 85 | energy = np.expand_dims(np.log(energy), axis=1) 86 | 87 | if args.delta: 88 | d1 = delta(feats, 2) 
89 | feats = np.concatenate([feats, d1], axis=1) 90 | if args.delta_delta: 91 | d2 = delta(d1, 2) 92 | feats = np.concatenate([feats, d2], axis=1) 93 | 94 | if args.energy: 95 | feats = np.concatenate([energy, feats], axis=1) 96 | 97 | np.save(outfile, feats) 98 | if args.verbose and i % 10 == 0: 99 | sys.stdout.write('\rfiles processed: {}'.format(i)) 100 | i += 1 101 | 102 | if args.verbose: 103 | print('\rfiles processed: {}'.format(i)) 104 | 105 | outfile.close() 106 | 107 | -------------------------------------------------------------------------------- /scripts/speech/extract.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | from __future__ import division 4 | import argparse 5 | import numpy as np 6 | import yaafelib 7 | import tarfile 8 | import tempfile 9 | import os 10 | from collections import Counter 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('inputs', nargs='+', help='tar archive which contains all the wav files') 14 | parser.add_argument('output', help='output file') 15 | parser.add_argument('--derivatives', action='store_true') 16 | 17 | args = parser.parse_args() 18 | 19 | parameters = dict( 20 | step_size=160, # corresponds to 10 ms (at 16 kHz) 21 | block_size=640, # corresponds to 40 ms 22 | mfcc_coeffs=40, 23 | mfcc_filters=41 # more filters? (needs to be at least mfcc_coeffs+1, because first coeff is ignored) 24 | ) 25 | 26 | # TODO: ensure that all input files use this rate 27 | fp = yaafelib.FeaturePlan(sample_rate=16000) 28 | 29 | mfcc_features = 'MFCC MelNbFilters={mfcc_filters} CepsNbCoeffs={mfcc_coeffs} ' \ 30 | 'blockSize={block_size} stepSize={step_size}'.format(**parameters) 31 | energy_features = 'Energy blockSize={block_size} stepSize={step_size}'.format(**parameters) 32 | 33 | fp.addFeature('mfcc: {}'.format(mfcc_features)) 34 | if args.derivatives: 35 | fp.addFeature('mfcc_d1: {} > Derivate DOrder=1'.format(mfcc_features)) 36 | fp.addFeature('mfcc_d2: {} > Derivate DOrder=2'.format(mfcc_features)) 37 | 38 | fp.addFeature('energy: {}'.format(energy_features)) 39 | if args.derivatives: 40 | fp.addFeature('energy_d1: {} > Derivate DOrder=1'.format(energy_features)) 41 | fp.addFeature('energy_d2: {} > Derivate DOrder=2'.format(energy_features)) 42 | 43 | if args.derivatives: 44 | keys = ['mfcc', 'mfcc_d1', 'mfcc_d2', 'energy', 'energy_d1', 'energy_d2'] 45 | else: 46 | keys = ['mfcc', 'energy'] 47 | 48 | df = fp.getDataFlow() 49 | engine = yaafelib.Engine() 50 | engine.load(df) 51 | afp = yaafelib.AudioFileProcessor() 52 | 53 | frame_counter = Counter() 54 | 55 | outfile = open(args.output, 'wb') 56 | 57 | total = 0 58 | for filename in args.inputs: 59 | tar = tarfile.open(filename) 60 | total += len([f for f in tar if f.isfile()]) 61 | 62 | _, tmp_file = tempfile.mkstemp() 63 | 64 | for j, filename in enumerate(args.inputs): 65 | tar = tarfile.open(filename) 66 | files = sorted([f for f in tar if f.isfile()], key=lambda f: f.name) 67 | 68 | for i, fileinfo in enumerate(files): 69 | file_ = tar.extractfile(fileinfo) 70 | with open(tmp_file, 'wb') as f: 71 | f.write(file_.read()) 72 | 73 | afp.processFile(engine, tmp_file) 74 | feats = engine.readAllOutputs() 75 | feats = np.concatenate([feats[k] for k in keys], axis=1) 76 | frames, dim = feats.shape 77 | 78 | feats = feats.astype(np.float32) 79 | 80 | if frames == 0: 81 | print(frames, dim, fileinfo.name) 82 | raise Exception 83 | 84 | if i == 0 and j == 0: 85 | np.save(outfile, (total, dim)) 
86 | 87 | np.save(outfile, feats) 88 | 89 | outfile.close() 90 | os.remove(tmp_file) 91 | -------------------------------------------------------------------------------- /scripts/speech/head.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import numpy as np 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument('input') 7 | parser.add_argument('output') 8 | parser.add_argument('-n', type=int, default=10) 9 | 10 | args = parser.parse_args() 11 | 12 | with open(args.input, 'rb') as input_file, open(args.output, 'wb') as output_file: 13 | n, dim = np.load(input_file) 14 | n = min(args.n, n) 15 | np.save(output_file, (n, dim)) 16 | for _ in range(n): 17 | feats = np.load(input_file) 18 | np.save(output_file, feats) 19 | 20 | -------------------------------------------------------------------------------- /scripts/speech/python_speech_features/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import * 2 | -------------------------------------------------------------------------------- /scripts/speech/shuf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import random 4 | import numpy as np 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('input') 8 | parser.add_argument('--output') 9 | parser.add_argument('-n', type=int, default=0) 10 | 11 | parser.add_argument('--input-txt', nargs='*') 12 | parser.add_argument('--output-txt', nargs='*') 13 | 14 | args = parser.parse_args() 15 | 16 | if not args.output: 17 | args.output = args.input 18 | if args.input_txt and not args.output_txt: 19 | args.output_txt = args.input_txt 20 | 21 | with open(args.input, 'rb') as input_file: 22 | n, dim = np.load(input_file) 23 | 24 | indices = list(range(n)) 25 | random.shuffle(indices) 26 | 27 | if args.n > 0: 28 | indices = indices[:args.n] 29 | 30 | frames = [] 31 | 32 | for _ in range(n): 33 | feats = np.load(input_file) 34 | frames.append(feats) 35 | 36 | with open(args.output, 'wb') as output_file: 37 | np.save(output_file, (len(indices), dim)) 38 | for index in indices: 39 | feats = frames[index] 40 | np.save(output_file, feats) 41 | 42 | if args.input_txt and args.output_txt: 43 | lines = [] 44 | for input_filename in args.input_txt: 45 | with open(input_filename) as input_file: 46 | lines.append(input_file.readlines()) 47 | 48 | for lines_, output_filename in zip(lines, args.output_txt): 49 | with open(output_filename, 'w') as output_file: 50 | for index in indices: 51 | line = lines_[index] 52 | output_file.write(line) 53 | 54 | -------------------------------------------------------------------------------- /scripts/split-corpus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import os 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('filename') 8 | parser.add_argument('dest') 9 | parser.add_argument('--splits', type=int, required=True) 10 | parser.add_argument('--tokens', action='store_true') 11 | 12 | args = parser.parse_args() 13 | 14 | os.makedirs(args.dest, exist_ok=True) 15 | 16 | with open(args.filename) as input_file: 17 | if args.tokens: 18 | total_size = sum(len(line.split()) for line in input_file) 19 | else: 20 | total_size = sum(1 for line in input_file) 21 | 22 | input_file.seek(0) 23 | 24 | shard_size = total_size // 
args.splits 25 | 26 | for i in range(args.splits): 27 | filename = os.path.join(args.dest, str(i + 1).zfill(len(str(args.splits)))) 28 | 29 | with open(filename, 'w') as output_file: 30 | 31 | this_size = 0 32 | for line in input_file: 33 | line_size = len(line.split()) if args.tokens else 1 34 | this_size += line_size 35 | 36 | output_file.write(line) 37 | 38 | if this_size >= shard_size and i < args.splits - 1: 39 | break 40 | -------------------------------------------------------------------------------- /scripts/stats-bleu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import sys 5 | import numpy as np 6 | import re 7 | from translate.evaluation import corpus_bleu, corpus_ter, corpus_wer 8 | from collections import OrderedDict 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('source') 12 | parser.add_argument('target') 13 | parser.add_argument('--bleu', action='store_true') 14 | #parser.add_argument('--ter', action='store_true') 15 | #parser.add_argument('--wer', action='store_true') 16 | #parser.add_argument('--all', '-a', action='store_true') 17 | parser.add_argument('--max-size', type=int) 18 | parser.add_argument('--case-insensitive', '-i', action='store_true') 19 | 20 | parser.add_argument('--draws', type=int, default=1000) 21 | parser.add_argument('--sample-size', type=int, default=0) 22 | parser.add_argument('-p', type=float, default=0.05) 23 | 24 | 25 | if __name__ == '__main__': 26 | args = parser.parse_args() 27 | 28 | with open(args.source) as src_file, open(args.target) as trg_file: 29 | if args.case_insensitive: 30 | hypotheses = [line.strip().lower() for line in src_file] 31 | references = [line.strip().lower() for line in trg_file] 32 | else: 33 | hypotheses = [line.strip() for line in src_file] 34 | references = [line.strip() for line in trg_file] 35 | 36 | if args.max_size is not None: 37 | hypotheses = hypotheses[:args.max_size] 38 | references = references[:args.max_size] 39 | 40 | if len(hypotheses) != len(references): 41 | sys.stderr.write('warning: source and target don\'t have the same length\n') 42 | size = min(len(hypotheses), len(references)) 43 | hypotheses = hypotheses[:size] 44 | references = references[:size] 45 | 46 | indices = np.arange(len(hypotheses)) 47 | if args.sample_size == 0: 48 | args.sample_size = len(hypotheses) 49 | 50 | bleu_scores = [] 51 | hypotheses = np.array(hypotheses) 52 | references = np.array(references) 53 | 54 | for _ in range(args.draws): 55 | indices = np.random.randint(len(hypotheses), size=args.sample_size) 56 | hypotheses_ = hypotheses[indices] 57 | references_ = references[indices] 58 | 59 | bleu, _ = corpus_bleu(hypotheses_, references_) 60 | bleu_scores.append(bleu) 61 | 62 | bleu_scores = sorted(bleu_scores) 63 | k = int(len(bleu_scores) * args.p) // 2 # FIXME 64 | 65 | bleu_scores = bleu_scores[k:len(bleu_scores) - k] 66 | 67 | print('[{:.3f}, {:.3f}]'.format(bleu_scores[0], bleu_scores[-1])) 68 | -------------------------------------------------------------------------------- /scripts/stats.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | from collections import Counter, namedtuple, OrderedDict 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument('filename') 7 | parser.add_argument('--lower', action='store_true') 8 | parser.add_argument('--count-whitespaces', action='store_true') 9 | parser.add_argument('-c', 
'-b', '--chars', action='store_true', help='display char info') 10 | parser.add_argument('-l', '--lines', action='store_true', help='display line count') 11 | parser.add_argument('-w', '--words', action='store_true', help='display word info') 12 | parser.add_argument('-a', '--all', action='store_true', 13 | help='display all info and more (large memory usage)') 14 | 15 | args = parser.parse_args() 16 | 17 | if not args.chars and not args.lines and not args.words or args.all: 18 | args.chars = args.words = args.lines = True 19 | 20 | word_counts = Counter() 21 | char_counts = Counter() 22 | 23 | word_dict = Counter() 24 | char_dict = Counter() 25 | 26 | line_dict = Counter() 27 | lines = 0 28 | 29 | with open(args.filename) as f: 30 | for line in f: 31 | if args.lower: 32 | line = line.lower() 33 | 34 | if args.words: 35 | words = line.split() 36 | word_counts[len(words)] += 1 37 | for word in words: 38 | word_dict[word] += 1 39 | 40 | if args.chars: 41 | chars = line 42 | if not args.count_whitespaces: 43 | chars = line.strip().replace(' ', '') 44 | 45 | char_counts[len(chars)] += 1 46 | for char in chars: 47 | char_dict[char] += 1 48 | 49 | lines += 1 50 | if args.all: 51 | line_dict[line] += 1 52 | 53 | 54 | def info_dict(title, counter): 55 | total = sum(counter.values()) 56 | unique = len(counter) 57 | avg = total / unique 58 | min_ = min(counter.values()) 59 | max_ = max(counter.values()) 60 | 61 | cumulative_count = 0 62 | coverage = OrderedDict([(90, 0), (95, 0), (99, 0)]) 63 | 64 | for i, pair in enumerate(counter.most_common(), 1): 65 | _, count = pair 66 | cumulative_count += count 67 | 68 | for percent, count in coverage.items(): 69 | if count == 0 and cumulative_count * 100 >= percent * total: 70 | coverage[percent] = i 71 | 72 | summary = [ 73 | '{}\n{}'.format(title, '-' * len(title)), 74 | 'Total: {}'.format(total), 75 | 'Unique: {}'.format(unique), 76 | 'Minimum: {}'.format(min_), 77 | 'Maximum: {}'.format(max_), 78 | 'Average: {:.1f}'.format(avg) 79 | ] 80 | 81 | for percent, count in coverage.items(): 82 | summary.append('{}% cov: {}'.format(percent, count)) 83 | 84 | return '\n '.join(summary) + '\n' 85 | 86 | 87 | def info_lengths(title, counter): 88 | total = sum(counter.values()) 89 | avg = sum(k * v for k, v in counter.items()) / total 90 | 91 | coverage = OrderedDict([(1, 0), (5, 0), (10, 0), 92 | (50, 0), (90, 0), (95, 0), (99, 0)]) 93 | 94 | cumulative_count = 0 95 | prev_k = 0 96 | 97 | for k, v in sorted(counter.items()): 98 | cumulative_count += v 99 | 100 | for percent, count in coverage.items(): 101 | if count == 0 and cumulative_count * 100 >= percent * total: 102 | coverage[percent] = prev_k if percent < 50 else k 103 | 104 | prev_k = k 105 | 106 | summary = [ 107 | '{}\n{}'.format(title, '-' * len(title)), 108 | 'Minimum: {}'.format(min(counter)), 109 | 'Maximum: {}'.format(max(counter)), 110 | 'Average: {:.1f}'.format(avg), 111 | ] 112 | 113 | for percent, count in coverage.items(): 114 | summary.append('{}{:2d}%: {}'.format('<=' if percent < 50 else '>=', percent, count)) 115 | 116 | return '\n '.join(summary) + '\n' 117 | 118 | 119 | if args.lines: 120 | print("Lines\n-----\n Total: {}".format(lines)) 121 | 122 | if args.all: 123 | summary = [ 124 | 'Unique: {}'.format(len(line_dict)), 125 | 'Average: {:.2f}'.format(lines / len(line_dict)) 126 | ] 127 | print(' ' + '\n '.join(summary)) 128 | 129 | print() 130 | 131 | if args.words: 132 | print(info_lengths('Words per line', word_counts)) 133 | print(info_dict('Words', word_dict)) 134 | 135 | if 
args.chars: 136 | print(info_lengths('Chars per line', char_counts)) 137 | print(info_dict('Chars', char_dict)) 138 | -------------------------------------------------------------------------------- /scripts/tercom.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alex-berard/seq2seq/1b5c6bf19a39ef27c059811b85a061f20d68ac32/scripts/tercom.jar -------------------------------------------------------------------------------- /scripts/train-moses.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | if [[ $# -lt 8 ]] 6 | then 7 | echo "wrong number of arguments supplied: $#" 8 | exit 0 9 | fi 10 | 11 | if [ -z ${MOSES} ] || [ -z ${GIZA} ] 12 | then 13 | echo "variables MOSES and/or GIZA undefined" 14 | exit 0 15 | fi 16 | 17 | model_dir=`readlink -f $1` 18 | data_dir=`readlink -f $2` 19 | corpus=${data_dir}/$3 20 | dev_corpus=${data_dir}/$4 21 | src_ext=$5 22 | trg_ext=$6 23 | lm_path=${data_dir}/$7 24 | lm_corpus=`basename ${lm_path}` 25 | lm_order=$8 26 | cores=`lscpu | grep -Po "^(CPU\(s\)|Processeur\(s\)).?:\s+\K\d+$"` 27 | 28 | echo "training on ${cores} CPUs" 29 | 30 | rm -rf ${model_dir} 31 | mkdir -p ${model_dir} 32 | 33 | echo "training language model, corpus=${lm_corpus}, order=${lm_order}" | ts 34 | ${MOSES}/bin/lmplz -o ${lm_order} --discount_fallback < ${lm_path}.${trg_ext} > ${model_dir}/${lm_corpus}.${trg_ext}.arpa 2>${model_dir}/train.log 35 | 36 | echo "training moses, corpus=${corpus}" | ts 37 | ${MOSES}/scripts/training/train-model.perl -root-dir ${model_dir} \ 38 | -corpus ${corpus} -f ${src_ext} -e ${trg_ext} -alignment grow-diag-final-and \ 39 | -reordering msd-bidirectional-fe -lm 0:${lm_order}:${model_dir}/${lm_corpus}.${trg_ext}.arpa \ 40 | -mgiza -external-bin-dir ${GIZA} \ 41 | -mgiza-cpus ${cores} -cores ${cores} --parallel 2>&1 | ts >> ${model_dir}/train.log 42 | 43 | echo "tuning moses, corpus=${dev_corpus}" | ts 44 | ${MOSES}/scripts/training/mert-moses.pl ${dev_corpus}.${src_ext} ${dev_corpus}.${trg_ext} \ 45 | ${MOSES}/bin/moses ${model_dir}/model/moses.ini --mertdir ${MOSES}/bin/ \ 46 | --decoder-flags="-threads ${cores}" --working-dir ${model_dir}/mert-work 2>&1 | ts > ${model_dir}/tuning.log 47 | 48 | echo "finished" | ts 49 | mv ${model_dir}/mert-work/moses.ini ${model_dir}/moses.tuned.ini 50 | rm -rf ${model_dir}/mert-work 51 | 52 | -------------------------------------------------------------------------------- /scripts/vocab-stats.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import argparse 5 | from collections import Counter 6 | 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('hyp') 9 | parser.add_argument('--reference') 10 | parser.add_argument('--source') 11 | parser.add_argument('--max', type=int) 12 | 13 | args = parser.parse_args() 14 | 15 | if args.reference is not None: 16 | with open(args.reference) as ref_file: 17 | ref_lines = [line.split() for line in ref_file] 18 | ref_words = list(map(Counter, ref_lines)) 19 | else: 20 | ref_words = None 21 | ref_lines = None 22 | 23 | if args.source is not None: 24 | with open(args.source) as src_file: 25 | src_lines = [line.split() for line in src_file] 26 | else: 27 | src_lines = None 28 | 29 | total = Counter() 30 | ok = Counter() 31 | del_counts = Counter() 32 | ok_del_counts = Counter() 33 | 34 | def extract_deletes(ops, src_words): 35 | i = 0 36 | 
deletes = [] 37 | 38 | for op in ops: 39 | if op == '<KEEP>': 40 | i += 1 41 | elif op == '<DEL>': 42 | deletes.append(src_words[i]) 43 | 44 | return deletes 45 | 46 | with open(args.hyp) as hyp_file: 47 | for i, line in enumerate(hyp_file): 48 | if ref_words and i >= len(ref_words): 49 | break 50 | 51 | if src_lines and i < len(src_lines): 52 | hyp_del = Counter(extract_deletes(line.split(), src_lines[i])) 53 | del_counts += hyp_del 54 | 55 | if ref_lines: 56 | ref_del = Counter(extract_deletes(ref_lines[i], src_lines[i])) 57 | ok_del_counts += Counter( 58 | dict((w, min(c, ref_del[w])) 59 | for w, c in hyp_del.items()) 60 | ) 61 | 62 | words = Counter(line.split()) 63 | total += words 64 | 65 | if ref_words: 66 | ref = ref_words[i] 67 | ok += Counter(dict((w, min(c, ref[w])) for w, c in words.items())) 68 | 69 | total_count = sum(total.values()) 70 | 71 | precision_header = ' {:8}'.format('precision') if args.reference else '' 72 | header = '{:15} {:8} {:8}'.format('word', 'count', 'percentage') + precision_header 73 | print(header) 74 | 75 | for w, c in total.most_common(args.max): 76 | precision = ' {:8.2f}%'.format(100 * ok[w] / c) if args.reference else '' 77 | 78 | print('{:15} {:8} {:8.2f}%'.format(w, c, 100 * c / total_count) + precision) 79 | 80 | if del_counts: 81 | print('\nMost deleted words') 82 | for w, c in del_counts.most_common(args.max): 83 | precision = ' {:8.2f}%'.format(100 * ok_del_counts[w] / c) if args.source else '' 84 | 85 | print('{:15} {:8} {:8.2f}%'.format(w, c, 100 * c / sum(del_counts.values())) + precision) -------------------------------------------------------------------------------- /seq2seq.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #CUDA_VISIBLE_DEVICES="" 3 | 4 | /usr/bin/env python3 -m translate "$@" 5 | -------------------------------------------------------------------------------- /translate/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alex-berard/seq2seq/1b5c6bf19a39ef27c059811b85a061f20d68ac32/translate/__init__.py -------------------------------------------------------------------------------- /translate/multitask_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from translate import utils 3 | from translate.translation_model import TranslationModel 4 | 5 | 6 | class MultiTaskModel: 7 | def __init__(self, tasks, **kwargs): 8 | self.models = [] 9 | self.ratios = [] 10 | 11 | for i, task in enumerate(tasks, 1): 12 | if task.name is None: 13 | task.name = 'task_{}'.format(i) 14 | 15 | # merging both dictionaries (task parameters have a higher precedence) 16 | kwargs_ = dict(**kwargs) 17 | kwargs_.update(task) 18 | model = TranslationModel(**kwargs_) 19 | 20 | self.models.append(model) 21 | self.ratios.append(task.ratio if task.ratio is not None else 1) 22 | 23 | self.main_model = self.models[0] 24 | self.ratios = [ratio / sum(self.ratios) for ratio in self.ratios] # unit normalization 25 | 26 | def train(self, **kwargs): 27 | for model in self.models: 28 | utils.log('initializing {}'.format(model.name)) 29 | model.init_training(**kwargs) 30 | 31 | utils.log('starting training') 32 | while True: 33 | i = np.random.choice(len(self.models), 1, p=self.ratios)[0] 34 | model = self.models[i] 35 | try: 36 | model.train_step(**kwargs) 37 | except (utils.FinishedTrainingException, KeyboardInterrupt): 38 | utils.log('exiting...') 39 | 
self.main_model.save() 40 | return 41 | except utils.EvalException: 42 | if i == 0: 43 | model.save() 44 | step, score = model.training.scores[-1] 45 | model.manage_best_checkpoints(step, score) 46 | except utils.CheckpointException: 47 | if i == 0: # only save main model (includes all variables) 48 | model.save() 49 | step, score = model.training.scores[-1] 50 | model.manage_best_checkpoints(step, score) 51 | 52 | def decode(self, *args, **kwargs): 53 | self.main_model.decode(*args, **kwargs) 54 | 55 | def evaluate(self, *args, **kwargs): 56 | return self.main_model.evaluate(*args, **kwargs) 57 | 58 | def align(self, *args, **kwargs): 59 | self.main_model.align(*args, **kwargs) 60 | 61 | def initialize(self, *args, **kwargs): 62 | self.main_model.initialize(*args, **kwargs) 63 | 64 | def save(self, *args, **kwargs): 65 | self.main_model.save(*args, **kwargs) 66 | --------------------------------------------------------------------------------
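Closing note on translate/multitask_model.py: training interleaves the tasks by drawing one model per step with probability proportional to its configured ratio (the ratios are normalized to sum to 1 in the constructor). The snippet below is a minimal, self-contained sketch of that sampling scheme only; the task names and ratios are hypothetical, and the print statement stands in for calling train_step on the selected TranslationModel.

    import numpy as np

    tasks = {'ASR': 1.0, 'MT': 1.0, 'AST': 2.0}       # hypothetical task ratios
    names = list(tasks)
    ratios = np.array([tasks[name] for name in names], dtype=float)
    ratios /= ratios.sum()                            # unit normalization, as in MultiTaskModel.__init__

    rng = np.random.default_rng(seed=0)
    for step in range(10):
        i = rng.choice(len(names), p=ratios)          # one task sampled per training step
        print(step, names[i])                         # stand-in for models[i].train_step(...)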