├── .gitignore ├── LICENSE ├── PhD-thesis.pdf ├── README.md ├── config ├── AMU │ ├── XL.yaml │ ├── XXL.yaml │ ├── avg.py │ ├── eval.sh │ ├── large.yaml │ ├── medium.yaml │ ├── post-process.sh │ ├── prepare.sh │ └── small.yaml ├── APE │ ├── clean-raw-data.sh │ ├── eval.sh │ ├── large │ │ ├── chained.yaml │ │ ├── forced.yaml │ │ ├── global.yaml │ │ ├── multi-global.yaml │ │ └── multi.yaml │ ├── medium │ │ ├── chained.yaml │ │ ├── forced.yaml │ │ ├── global.yaml │ │ ├── multi-global.yaml │ │ └── multi.yaml │ ├── prepare.sh │ └── small │ │ ├── chained.yaml │ │ ├── forced.yaml │ │ ├── global.yaml │ │ ├── multi-global.yaml │ │ └── multi.yaml ├── BTEC │ ├── ASR.yaml │ ├── AST.yaml │ ├── MT.yaml │ ├── Multi-Task-joint.yaml │ ├── Multi-Task.yaml │ ├── README.md │ ├── prepare.sh │ └── voxygen │ │ ├── convert-to-audio.sh │ │ └── wsclient.py ├── IWSLT14 │ ├── BPE-TED.yaml │ ├── BPE.yaml │ ├── BPE2char-TED.yaml │ ├── BPE2char.yaml │ ├── Back-Translation │ │ ├── baseline-TED.yaml │ │ ├── char-level-TED.yaml │ │ ├── decode.sh │ │ ├── eval.sh │ │ ├── prepare.sh │ │ ├── split.sh │ │ ├── subwords-TED.yaml │ │ └── train.sh │ ├── prepare-TED.sh │ ├── prepare-lexicon.sh │ ├── prepare-mixer.sh │ ├── prepare.sh │ └── train-SMT.sh ├── LibriSpeech │ ├── ASR.yaml │ ├── AST.yaml │ ├── MT.yaml │ ├── Multi-Task.yaml │ ├── README.md │ ├── model-outputs.tar.xz │ ├── prepare-raw.sh │ └── prepare.sh ├── WMT14 │ ├── RNNsearch-Adam.yaml │ ├── RNNsearch-BPE.yaml │ ├── RNNsearch.yaml │ ├── download.sh │ ├── prepare-lexicon.sh │ └── prepare.sh └── default.yaml ├── install.sh ├── run-tests.py ├── scripts ├── bpe │ ├── apply_bpe.py │ ├── bpe_toy.py │ ├── chrF.py │ ├── concat-bpe.py │ ├── get_vocab.py │ ├── learn_bpe.py │ ├── learn_joint_bpe_and_vocab.py │ └── segment-char-ngrams.py ├── config-diff.sh ├── copy-model.py ├── coverage.py ├── decode-moses.sh ├── extract-lexicon.py ├── get-best-score.py ├── join.py ├── moses │ ├── clean-corpus-n.perl │ ├── deescape-special-chars.perl │ ├── detokenizer.perl │ ├── detruecase.perl │ ├── escape-special-chars.perl │ ├── lowercase.perl │ ├── multi-bleu.perl │ ├── nonbreaking_prefixes │ │ ├── nonbreaking_prefix.de │ │ ├── nonbreaking_prefix.el │ │ ├── nonbreaking_prefix.en │ │ ├── nonbreaking_prefix.es │ │ └── nonbreaking_prefix.fr │ ├── normalize-punctuation.perl │ ├── strip-xml.perl │ ├── tokenizer.perl │ ├── train-truecaser.perl │ ├── truecase.perl │ └── wrap-xml.perl ├── multi-print.py ├── paired-eval.py ├── plot-loss.py ├── plot-score-per-length.py ├── post_editing │ ├── apply-edits.py │ ├── extract-edits.py │ ├── extract-ter-vectors.py │ ├── noisify.py │ ├── plot-ops.py │ ├── plot-ter.py │ ├── reverse-edits.py │ ├── select-by-index.py │ ├── select-by-length.py │ ├── select-by-ter.py │ ├── stats-TER.py │ ├── ter-stats.py │ ├── to-sgm.py │ └── well-formed.py ├── prepare-data.py ├── reverse.py ├── score.py ├── shuf-corpus.py ├── speech │ ├── cat.py │ ├── convert.py │ ├── extract-new.py │ ├── extract.py │ ├── head.py │ ├── python_speech_features │ │ ├── __init__.py │ │ ├── base.py │ │ └── sigproc.py │ └── shuf.py ├── split-corpus.py ├── stats-bleu.py ├── stats.py ├── tercom.jar ├── train-moses.sh └── vocab-stats.py ├── seq2seq.sh └── translate ├── __init__.py ├── __main__.py ├── beam_search.py ├── conv_lstm.py ├── evaluation.py ├── models.py ├── multitask_model.py ├── rnn.py ├── seq2seq_model.py ├── translation_model.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | 
*.py[cod] 4 | *$py.class 5 | *.so 6 | 7 | # Distribution / packaging 8 | .Python 9 | env/ 10 | build/ 11 | develop-eggs/ 12 | dist/ 13 | downloads/ 14 | eggs/ 15 | .eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | *.log 26 | .ipynb_checkpoints 27 | 28 | data_* 29 | data 30 | tests 31 | raw_data 32 | tmp/ 33 | .idea/ 34 | models 35 | model 36 | .spyderproject 37 | wsclient.cred 38 | *.svg 39 | *.png 40 | -------------------------------------------------------------------------------- /PhD-thesis.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alex-berard/seq2seq/1b5c6bf19a39ef27c059811b85a061f20d68ac32/PhD-thesis.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # seq2seq 2 | Attention-based sequence to sequence learning 3 | 4 | ## Dependencies 5 | 6 | * [TensorFlow 1.2+ for Python 3](https://www.tensorflow.org/get_started/os_setup.html) 7 | * YAML and Matplotlib modules for Python 3: `sudo apt-get install python3-yaml python3-matplotlib` 8 | * A recent NVIDIA GPU 9 | 10 | ## How to use 11 | 12 | 13 | Train a model (CONFIG is a YAML configuration file, such as `config/default.yaml`): 14 | 15 | ./seq2seq.sh CONFIG --train -v 16 | 17 | 18 | Translate text using an existing model: 19 | 20 | ./seq2seq.sh CONFIG --decode FILE_TO_TRANSLATE --output OUTPUT_FILE 21 | or for interactive decoding: 22 | 23 | ./seq2seq.sh CONFIG --decode 24 | 25 | #### Example English→French model 26 | This is the same model and dataset as [Bahdanau et al. 2015](https://arxiv.org/abs/1409.0473). 27 | 28 | config/WMT14/download.sh # download WMT14 data into raw_data/WMT14 29 | config/WMT14/prepare.sh # preprocess the data, and copy the files to data/WMT14 30 | ./seq2seq.sh config/WMT14/baseline.yaml --train -v # train a baseline model on this data 31 | 32 | You should get similar BLEU scores as these (our model was trained on a single Titan X I for about 4 days). 33 | 34 | | Dev | Test | +beam | Steps | Time | 35 | |:-----:|:-----:|:-----:|:-----:|:----:| 36 | | 25.04 | 28.64 | 29.22 | 240k | 60h | 37 | | 25.25 | 28.67 | 29.28 | 330k | 80h | 38 | 39 | Download this model [here](https://drive.google.com/file/d/1Qe4yZTYSTF-mlRlP_NTFGwXgacZnBwdp/view?usp=sharing). To use this model, just extract the archive into the `seq2seq/models` folder, and run: 40 | 41 | ./seq2seq.sh models/WMT14/config.yaml --decode -v 42 | 43 | #### Example German→English model 44 | This is the same dataset as [Ranzato et al. 2015](https://arxiv.org/abs/1511.06732). 45 | 46 | config/IWSLT14/prepare.sh 47 | ./seq2seq.sh config/IWSLT14/baseline.yaml --train -v 48 | 49 | | Dev | Test | +beam | Steps | 50 | |:-----:|:-----:|:-----:|:-----:| 51 | | 28.32 | 25.33 | 26.74 | 44k | 52 | 53 | The model is available for download [here](https://drive.google.com/file/d/1qCL3ZRxZ13fC45f74Nt6qiQ8tVAYFF9H/view?usp=sharing). 54 | 55 | ## Audio pre-processing 56 | If you want to use the toolkit for Automatic Speech Recognition (ASR) or Automatic Speech Translation (AST), then you'll need to pre-process your audio files accordingly. 57 | This [README](https://github.com/eske/seq2seq/tree/master/config/BTEC) details how it can be done. You'll need to install the **Yaafe** library, and use `scripts/speech/extract-audio-features.py` to extract MFCCs from a set of wav files. 
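As a rough illustration of what that feature-extraction step produces, here is a minimal sketch that computes MFCC features with the `python_speech_features` package bundled under `scripts/speech/`, and stores one array per utterance in a single `.npz` archive. This is not the Yaafe-based `extract-audio-features.py` pipeline: the 41-dimensional features (chosen only to mirror `embedding_size: 41` in the BTEC configs) and the `.npz` layout are assumptions, and the authoritative format is whatever `scripts/speech/extract.py` and its companions produce.

~~~
#!/usr/bin/env python3
"""Minimal MFCC-extraction sketch -- NOT the project's extract-audio-features.py.

Assumptions: 16 kHz mono wav files, 40 cepstral coefficients plus log-energy
(41 dimensions, mirroring `embedding_size: 41` in config/BTEC/ASR.yaml), and
one float32 array per utterance saved into a single .npz archive.
"""
import glob
import os
import sys

import numpy as np
from scipy.io import wavfile
from python_speech_features import mfcc  # bundled under scripts/speech/


def extract(wav_path):
    rate, signal = wavfile.read(wav_path)
    # 25 ms analysis windows with a 10 ms shift: one feature vector every 10 ms
    return mfcc(signal, samplerate=rate, winlen=0.025, winstep=0.01,
                numcep=41, nfilt=41, appendEnergy=True).astype(np.float32)


if __name__ == '__main__':
    wav_dir, output_npz = sys.argv[1], sys.argv[2]
    features = {os.path.splitext(os.path.basename(path))[0]: extract(path)
                for path in sorted(glob.glob(os.path.join(wav_dir, '*.wav')))}
    np.savez(output_npz, **features)
~~~

An invocation like `python3 extract-mfcc-sketch.py raw_data/BTEC/wav data/BTEC/train.concat.npz` is purely hypothetical; check the scripts under `scripts/speech/` for the format the trainer actually expects.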
58 | 59 | ## Features 60 | * **YAML configuration files** 61 | * **Beam-search decoder** 62 | * **Ensemble decoding** 63 | * **Multiple encoders** 64 | * **Hierarchical encoder** 65 | * **Bidirectional encoder** 66 | * **Local attention model** 67 | * **Convolutional attention model** 68 | * **Detailed logging** 69 | * **Periodic BLEU evaluation** 70 | * **Periodic checkpoints** 71 | * **Multi-task training:** train on several tasks at once (e.g. French->English and German->English MT) 72 | * **Subword training and decoding** 73 | * **Input binary features instead of text** 74 | * **Pre-processing script:** we provide a fully-featured Python script for data pre-processing (vocabulary creation, lowercasing, tokenizing, splitting, etc.) 75 | * **Dynamic RNNs:** we use symbolic loops instead of statically unrolled RNNs. This means that we don't need to manually configure bucket sizes, and that model creation is much faster. 76 | 77 | ## Credits 78 | 79 | * This project is based on [TensorFlow's reference implementation](https://www.tensorflow.org/tutorials/seq2seq) 80 | * We include some of the pre-processing scripts from [Moses](http://www.statmt.org/moses/) 81 | * The scripts for subword units come from [github.com/rsennrich/subword-nmt](https://github.com/rsennrich/subword-nmt) 82 | -------------------------------------------------------------------------------- /config/AMU/XL.yaml: -------------------------------------------------------------------------------- 1 | label: "ENCDEC-MCGRU" 2 | description: "AMU 2017 Multi-Encoder Cond-GRU Model - 4M + 500k + 12k train set" 3 | 4 | cell_size: 1024 5 | attn_size: 2048 6 | embedding_size: 512 7 | cell_type: GRU 8 | 9 | data_dir: data/AMU 10 | max_len: 50 11 | model_dir: models/AMU/XL 12 | train_prefix: train.XL 13 | vocab_prefix: vocab.XL 14 | dev_prefix: dev.XL 15 | 16 | steps_per_checkpoint: 10000 17 | steps_per_eval: 10000 18 | score_function: corpus_scores_ter 19 | keep_best: 8 20 | 21 | optimizer: adam 22 | learning_rate: 0.0001 23 | batch_size: 64 24 | batch_mode: standard 25 | read_ahead: 100 26 | max_gradient_norm: 1.0 27 | max_epochs: 12 28 | 29 | attention_type: global 30 | final_state: average 31 | 32 | weight_scale: 0.01 33 | 34 | use_dropout: True 35 | pervasive_dropout: True 36 | rnn_input_dropout: 0.2 37 | rnn_output_dropout: 0.2 38 | attn_dropout: 0.2 39 | word_dropout: 0.2 40 | initial_state_dropout: 0.2 41 | 42 | train_initial_states: False 43 | 44 | encoders: 45 | - name: mt 46 | - name: src 47 | 48 | decoders: 49 | - name: pe 50 | conditional_rnn: True 51 | pred_deep_layer: True 52 | 53 | ref_ext: pe.ref 54 | 55 | post_process_script: config/AMU/post-process.sh 56 | -------------------------------------------------------------------------------- /config/AMU/XXL.yaml: -------------------------------------------------------------------------------- 1 | label: "ENCDEC-MCGRU" 2 | description: "AMU 2017 Multi-Encoder Cond-GRU Model - 4M + 500k + 23k train set" 3 | 4 | cell_size: 1024 5 | attn_size: 2048 6 | embedding_size: 512 7 | cell_type: GRU 8 | 9 | data_dir: data/AMU 10 | max_len: 50 11 | model_dir: models/AMU/XXL 12 | train_prefix: train.XXL 13 | vocab_prefix: vocab.XXL 14 | dev_prefix: dev.XL 15 | 16 | steps_per_checkpoint: 10000 17 | steps_per_eval: 10000 18 | score_function: corpus_scores_ter 19 | keep_best: 8 20 | 21 | optimizer: adam 22 | learning_rate: 0.0001 23 | batch_size: 64 24 | batch_mode: standard 25 | read_ahead: 100 26 | max_gradient_norm: 1.0 27 | max_epochs: 12 28 | 29 | attention_type: global 30 | 
final_state: average 31 | 32 | weight_scale: 0.01 33 | 34 | use_dropout: True 35 | pervasive_dropout: True 36 | rnn_input_dropout: 0.2 37 | rnn_output_dropout: 0.2 38 | attn_dropout: 0.2 39 | word_dropout: 0.2 40 | initial_state_dropout: 0.2 41 | 42 | train_initial_states: False 43 | 44 | encoders: 45 | - name: mt 46 | - name: src 47 | 48 | decoders: 49 | - name: pe 50 | conditional_rnn: True 51 | pred_deep_layer: True 52 | 53 | ref_ext: pe.ref 54 | 55 | post_process_script: config/AMU/post-process.sh 56 | -------------------------------------------------------------------------------- /config/AMU/avg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import re 5 | import math 6 | from collections import defaultdict 7 | 8 | stats = defaultdict(list) 9 | 10 | for line in sys.stdin: 11 | for s in re.findall(r'[^\s]*=\d+\.?\d*', line): 12 | key, value = s.split('=') 13 | stats[key].append(float(value)) 14 | 15 | keys = ['ter', 'bleu', 'bleu1', 'wer'] 16 | def sort_key(item): 17 | key, _ = item 18 | if key in keys: 19 | return keys.index(key) 20 | else: 21 | return len(keys) 22 | 23 | new_stats = [] 24 | for key, values in sorted(stats.items(), key=sort_key): 25 | mean = sum(values) / len(values) 26 | stdev = math.sqrt(sum((x - mean) ** 2 for x in values) / (len(values) - 1)) 27 | new_stats.append((key, mean, stdev)) 28 | 29 | print('\n'.join('{:<7} {:6.2f} ({:.2f})'.format(*data) for data in new_stats)) 30 | -------------------------------------------------------------------------------- /config/AMU/eval.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | size=$1 4 | gpu_id=$2 5 | 6 | root_dir=models/AMU 7 | eval_dir=${root_dir}/eval_${size} 8 | log_file=${eval_dir}/log.txt 9 | 10 | rm -f ${log_file} 11 | rm -rf ${eval_dir} 12 | mkdir -p ${eval_dir} 13 | 14 | for index in 1 2 3 4 15 | do 16 | model=${size}.${index} 17 | model_dir=${root_dir}/${model} 18 | rm -rf ${model_dir}.avg 19 | checkpoints=`find ${model_dir}/checkpoints/best-* -printf "%f\n" | cut -d'.' 
-f1,1 | sort | uniq | cut -d'-' -f2,2 | xargs printf " %s|" | sed s/\|$//` 20 | checkpoints=`cat ${model_dir}/checkpoints/scores.txt | grep -P "${checkpoints}" | sed s/-// | sort -g | head -n4 | cut -d' ' -f2,2 | xargs printf "${model_dir}/checkpoints/best-%s "` 21 | echo ${checkpoints} 22 | ./seq2seq.sh ${model_dir}/config.yaml --average --checkpoints ${checkpoints} --save --model-dir ${model_dir}.avg --no-gpu >/dev/null 2>&1 23 | rename "s/translate-[0-9]*/average/" ${model_dir}.avg/checkpoints/translate-* 24 | mv ${model_dir}.avg/checkpoints/average.* ${model_dir}/checkpoints/ 25 | rm -rf ${model_dir}.avg 26 | done 27 | 28 | function header { 29 | printf "%s %-40s" `date +"%H:%M:%S"` $1 >> ${log_file} 30 | } 31 | 32 | function filter { 33 | tail -n1 | grep -Po "(ter|bleu1|bleu|wer|penalty|ratio)=[0-9]*.?[0-9]*" | xargs printf "%s " | sed "s/ $/\n/" >> ${log_file} 34 | } 35 | 36 | for beam_size in 12 1 37 | do 38 | for corpus in dev test test.2017 39 | do 40 | if [ ${size} = medium ] 41 | then 42 | eval_corpus=${corpus} 43 | else 44 | eval_corpus=${corpus}.${size} 45 | fi 46 | 47 | for index in 1 2 3 4 48 | do 49 | model=${size}.${index} 50 | model_dir=${root_dir}/${model} 51 | 52 | output=${corpus}.${model}.beam${beam_size} 53 | header ${output} 54 | ./seq2seq.sh ${model_dir}/config.yaml --eval ${eval_corpus} --beam-size ${beam_size} --gpu-id ${gpu_id} --raw-output --output ${eval_dir}/${output}.raw 2>&1 | filter 55 | config/AMU/post-process.sh < ${eval_dir}/${output}.raw > ${eval_dir}/${output}.out 56 | 57 | output=${corpus}.${model}.avg.beam${beam_size} 58 | header ${output} 59 | ./seq2seq.sh ${model_dir}/config.yaml --eval ${eval_corpus} --beam-size ${beam_size} --checkpoints ${model_dir}/checkpoints/average --gpu-id ${gpu_id} --raw-output --output ${eval_dir}/${output}.raw 2>&1 | filter 60 | config/AMU/post-process.sh < ${eval_dir}/${output}.raw > ${eval_dir}/${output}.out 61 | done 62 | 63 | output=${corpus}.${size}.ensemble.beam${beam_size} 64 | header ${output} 65 | ./seq2seq.sh ${root_dir}/${size}.1/config.yaml --eval ${eval_corpus} --beam-size ${beam_size} --ensemble --checkpoints ${root_dir}/${size}.{1,2,3,4}/checkpoints/best --gpu-id ${gpu_id} --raw-output --output ${eval_dir}/${output}.raw 2>&1 | filter 66 | config/AMU/post-process.sh < ${eval_dir}/${output}.raw > ${eval_dir}/${output}.out 67 | 68 | output=${corpus}.${size}.ensemble.avg.beam${beam_size} 69 | header ${output} 70 | ./seq2seq.sh ${root_dir}/${size}.1/config.yaml --eval ${eval_corpus} --beam-size ${beam_size} --ensemble --checkpoints ${root_dir}/${size}.{1,2,3,4}/checkpoints/average --gpu-id ${gpu_id} --raw-output --output ${eval_dir}/${output}.raw 2>&1 | filter 71 | config/AMU/post-process.sh < ${eval_dir}/${output}.raw > ${eval_dir}/${output}.out 72 | done 73 | done 74 | -------------------------------------------------------------------------------- /config/AMU/large.yaml: -------------------------------------------------------------------------------- 1 | label: "ENCDEC-MCGRU LARGE" 2 | description: "AMU 2017 Multi-Encoder Cond-GRU Model - 500k + 23k train set" 3 | 4 | cell_size: 512 5 | attn_size: 1024 6 | embedding_size: 256 7 | cell_type: GRU 8 | 9 | data_dir: data/AMU 10 | max_len: 50 11 | model_dir: models/AMU/large 12 | train_prefix: train.large 13 | vocab_prefix: vocab.large 14 | dev_prefix: dev.large 15 | 16 | steps_per_checkpoint: 1000 17 | steps_per_eval: 1000 18 | keep_best: 4 19 | score_function: corpus_scores_ter 20 | 21 | batch_size: 32 22 | max_gradient_norm: 1.0 23 | max_steps: 150000 24 | 25 
| attention_type: global 26 | final_state: average 27 | 28 | weight_scale: 0.01 29 | 30 | use_dropout: True 31 | pervasive_dropout: True 32 | rnn_input_dropout: 0.4 33 | rnn_output_dropout: 0.4 34 | word_dropout: 0.2 35 | 36 | train_initial_states: False 37 | 38 | encoders: 39 | - name: de 40 | ext: mt 41 | - name: src 42 | 43 | decoders: 44 | - name: de 45 | ext: pe 46 | conditional_rnn: True 47 | pred_deep_layer: False 48 | pred_embed_proj: False 49 | tie_embeddings: False 50 | 51 | ref_ext: pe.ref 52 | 53 | post_process_script: config/AMU/post-process.sh 54 | -------------------------------------------------------------------------------- /config/AMU/medium.yaml: -------------------------------------------------------------------------------- 1 | label: "ENCDEC-MCGRU MEDIUM" 2 | description: "AMU 2017 Multi-Encoder Cond-GRU Model - 23k train set" 3 | 4 | cell_size: 256 5 | attn_size: 512 6 | embedding_size: 128 7 | cell_type: GRU 8 | 9 | data_dir: data/AMU 10 | max_len: 50 11 | model_dir: models/AMU/medium 12 | 13 | steps_per_checkpoint: 1000 14 | steps_per_eval: 1000 15 | keep_best: 4 16 | score_function: corpus_scores_ter 17 | 18 | batch_size: 32 19 | max_gradient_norm: 1.0 20 | max_steps: 150000 21 | 22 | attention_type: global 23 | final_state: average 24 | 25 | weight_scale: 0.01 26 | 27 | use_dropout: True 28 | pervasive_dropout: True 29 | rnn_input_dropout: 0.4 30 | rnn_output_dropout: 0.4 31 | word_dropout: 0.2 32 | 33 | train_initial_states: False 34 | 35 | encoders: 36 | - name: de 37 | ext: mt 38 | - name: src 39 | 40 | decoders: 41 | - name: de 42 | ext: pe 43 | conditional_rnn: True 44 | pred_deep_layer: False 45 | pred_embed_proj: True 46 | tie_embeddings: True 47 | 48 | ref_ext: pe.ref 49 | 50 | post_process_script: config/AMU/post-process.sh 51 | -------------------------------------------------------------------------------- /config/AMU/post-process.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cat "${1:-/dev/stdin}" | sed "s/@@ //g" | scripts/moses/detruecase.perl | scripts/moses/deescape-special-chars.perl 4 | -------------------------------------------------------------------------------- /config/AMU/prepare.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | raw_data=raw_data/APE 4 | data_dir=data/AMU 5 | 6 | mkdir -p ${data_dir} 7 | 8 | for ext in src mt pe 9 | do 10 | if [ ${ext} = "src" ] 11 | then 12 | lang=en 13 | else 14 | lang=de 15 | fi 16 | 17 | for corpus in train train.2017 500K 4M dev test test.2017 18 | do 19 | cat ${raw_data}/${corpus}.${ext} | scripts/moses/escape-special-chars.perl | scripts/moses/truecase.perl --model ${raw_data}/true.${lang} > ${data_dir}/${corpus}.true.${ext} 20 | cat ${data_dir}/${corpus}.true.${ext} | scripts/bpe/apply_bpe.py -c ${raw_data}/${lang}.bpe > ${data_dir}/${corpus}.tmp.${ext} 21 | done 22 | 23 | mv ${data_dir}/dev.tmp.${ext} ${data_dir}/dev.XL.${ext} 24 | mv ${data_dir}/test.tmp.${ext} ${data_dir}/test.XL.${ext} 25 | mv ${data_dir}/test.2017.tmp.${ext} ${data_dir}/test.2017.XL.${ext} 26 | 27 | cat ${data_dir}/train.tmp.${ext} | scripts/bpe/get_vocab.py > ${data_dir}/bpe-vocab.small.${ext} 28 | cat ${data_dir}/{train,train.2017}.tmp.${ext} | scripts/bpe/get_vocab.py > ${data_dir}/bpe-vocab.medium.${ext} 29 | cat ${data_dir}/{train,train.2017,500K}.tmp.${ext} | scripts/bpe/get_vocab.py > ${data_dir}/bpe-vocab.large.${ext} 30 | 31 | cat ${data_dir}/{4M,500K}.tmp.${ext} > 
${data_dir}/train.XL.${ext} 32 | rm ${data_dir}/{4M,500K}.tmp.${ext} 33 | cp ${data_dir}/train.XL.${ext} ${data_dir}/train.XXL.${ext} 34 | 35 | for i in {1..20}; do 36 | cat ${data_dir}/train.tmp.${ext} >> ${data_dir}/train.XL.${ext} 37 | cat ${data_dir}/{train,train.2017}.tmp.${ext} >> ${data_dir}/train.XXL.${ext} 38 | done 39 | rm ${data_dir}/{train,train.2017}.tmp.${ext} 40 | 41 | for size in small medium large 42 | do 43 | for corpus in train train.2017 500K dev test test.2017 44 | do 45 | cat ${data_dir}/${corpus}.true.${ext} | scripts/bpe/apply_bpe.py -c ${raw_data}/${lang}.bpe --vocabulary-threshold 5 --vocabulary ${data_dir}/bpe-vocab.${size}.${ext} > ${data_dir}/${corpus}.${size}.${ext} 46 | done 47 | done 48 | rm -f ${data_dir}/*.tmp.* ${data_dir}/*.true.* 49 | cat ${data_dir}/train.2017.medium.${ext} >> ${data_dir}/train.medium.${ext} 50 | for i in {1..20}; do 51 | cat ${data_dir}/{train,train.2017}.large.${ext} >> ${data_dir}/500K.large.${ext} 52 | done 53 | mv ${data_dir}/500K.large.${ext} ${data_dir}/train.large.${ext} 54 | rm -f ${data_dir}/{train.2017,500K}.{small,medium,large}.${ext} 55 | done 56 | 57 | for size in small medium large XL 58 | do 59 | cp ${raw_data}/dev.pe ${data_dir}/dev.${size}.pe.ref 60 | cp ${raw_data}/test.pe ${data_dir}/test.${size}.pe.ref 61 | cp ${raw_data}/test.2017.pe ${data_dir}/test.2017.${size}.pe.ref 62 | done 63 | 64 | for size in small medium 65 | do 66 | cat ${data_dir}/train.${size}.{src,mt,pe} > ${data_dir}/train.${size}.all 67 | scripts/prepare-data.py ${data_dir}/train.${size} all ${data_dir} --mode vocab --vocab-size 0 --vocab-prefix vocab.${size} 68 | cp ${data_dir}/vocab.${size}.all ${data_dir}/vocab.${size}.mt 69 | cp ${data_dir}/vocab.${size}.all ${data_dir}/vocab.${size}.pe 70 | mv ${data_dir}/vocab.${size}.all ${data_dir}/vocab.${size}.src 71 | done 72 | for size in large XL 73 | do 74 | cat ${data_dir}/train.${size}.{mt,pe} > ${data_dir}/train.${size}.de 75 | scripts/prepare-data.py ${data_dir}/train.${size} src de ${data_dir} --mode vocab --vocab-size 0 --vocab-prefix vocab.${size} 76 | cp ${data_dir}/vocab.${size}.de ${data_dir}/vocab.${size}.mt 77 | cp ${data_dir}/vocab.${size}.de ${data_dir}/vocab.${size}.pe 78 | rm ${data_dir}/train.${size}.de ${data_dir}/vocab.${size}.de 79 | done 80 | 81 | scripts/prepare-data.py ${data_dir}/train.XXL src mt pe ${data_dir} --mode vocab --vocab-size 0 --vocab-prefix vocab.XXL 82 | rename s/\.medium// ${data_dir}/* 83 | 84 | -------------------------------------------------------------------------------- /config/AMU/small.yaml: -------------------------------------------------------------------------------- 1 | label: "ENCDEC-MCGRU SMALL" 2 | description: "AMU 2017 Multi-Encoder Cond-GRU Model - 12k train set" 3 | 4 | cell_size: 256 5 | attn_size: 512 6 | embedding_size: 128 7 | cell_type: GRU 8 | 9 | data_dir: data/AMU 10 | max_len: 60 11 | model_dir: models/AMU/small 12 | train_prefix: train.small 13 | vocab_prefix: vocab.small 14 | dev_prefix: dev.small 15 | 16 | steps_per_checkpoint: 1000 17 | steps_per_eval: 1000 18 | keep_best: 4 19 | score_function: corpus_scores_ter 20 | 21 | batch_size: 32 22 | max_gradient_norm: 1.0 23 | max_steps: 75000 24 | 25 | attention_type: global 26 | final_state: average 27 | 28 | weight_scale: 0.01 29 | 30 | use_dropout: True 31 | pervasive_dropout: True 32 | rnn_input_dropout: 0.4 33 | rnn_output_dropout: 0.4 34 | word_dropout: 0.2 35 | 36 | train_initial_states: False 37 | 38 | encoders: 39 | - name: de 40 | ext: mt 41 | - name: src 42 | 43 | decoders: 
44 | - name: de 45 | ext: pe 46 | conditional_rnn: True 47 | pred_deep_layer: False 48 | pred_embed_proj: True 49 | tie_embeddings: True 50 | 51 | ref_ext: pe.ref 52 | 53 | post_process_script: config/AMU/post-process.sh 54 | -------------------------------------------------------------------------------- /config/APE/clean-raw-data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # start by downloading all the data files from "http://www.statmt.org/wmt17/ape-task.html", "EN-DE" language pair 4 | # extract all the text files into the same "raw_data/APE" directory (no sub-directories) 5 | # also copy the "true.{en,de}" and "{en,de}.bpe" files 6 | # then run the following commands 7 | raw_data=raw_data/APE 8 | cur_dir=`pwd` 9 | cd ${raw_data} 10 | 11 | for ext in src mt pe 12 | do 13 | mv en-de.train.${ext} train.2017.${ext} 14 | mv en-de.${ext}.test.2017 test.2017.${ext} 15 | done 16 | cd ${cur_dir} 17 | # then run the pre-processing scripts "config/{APE,AMU}/prepare.sh" 18 | -------------------------------------------------------------------------------- /config/APE/eval.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | root_dir=models/APE/$1 4 | size=$2 5 | gpu_id=$3 6 | eval_dir=${root_dir}/eval 7 | log_file=${eval_dir}/${size}.log 8 | 9 | rm -f ${log_file} 10 | mkdir -p ${eval_dir} 11 | 12 | for index in 1 2 3 4 13 | do 14 | model=${size}.${index} 15 | model_dir=${root_dir}/${model} 16 | rm -rf ${model_dir}.avg 17 | checkpoints=`find ${model_dir}/checkpoints/best-* -printf "%f\n" | cut -d'.' -f1,1 | sort | uniq | cut -d'-' -f2,2 | xargs printf " %s|" | sed s/\|$//` 18 | checkpoints=`cat ${model_dir}/checkpoints/scores.txt | grep -P "${checkpoints}" | sed s/-// | sort -g | head -n4 | cut -d' ' -f2,2 | xargs printf "${model_dir}/checkpoints/best-%s "` 19 | echo ${checkpoints} 20 | ./seq2seq.sh ${model_dir}/config.yaml --average --checkpoints ${checkpoints} --save --model-dir ${model_dir}.avg --no-gpu >/dev/null 2>&1 21 | rename "s/translate-[0-9]*/average/" ${model_dir}.avg/checkpoints/translate-* 22 | mv ${model_dir}.avg/checkpoints/average.* ${model_dir}/checkpoints/ 23 | rm -rf ${model_dir}.avg 24 | done 25 | 26 | function header { 27 | printf "%s %-40s" `date +"%H:%M:%S"` $1 >> ${log_file} 28 | } 29 | 30 | function filter { 31 | tail -n1 | grep -Po "(ter|bleu1|bleu|wer|penalty|ratio)=[0-9]*.?[0-9]*" | xargs printf "%s " | sed "s/ $/\n/" >> ${log_file} 32 | } 33 | 34 | for beam_size in 1 6 35 | do 36 | for corpus in dev test test.2017 37 | do 38 | for index in 1 2 3 4 39 | do 40 | model=${size}.${index} 41 | model_dir=${root_dir}/${model} 42 | 43 | output=${corpus}.${model}.beam${beam_size} 44 | header ${output} 45 | ./seq2seq.sh ${model_dir}/config.yaml --eval ${corpus} --beam-size ${beam_size} --gpu-id ${gpu_id} --raw-output --output ${eval_dir}/${output}.raw 2>&1 | filter 46 | scripts/post_editing/reverse-edits.py data/APE/${corpus}.mt ${eval_dir}/${output}.raw > ${eval_dir}/${output}.out 47 | 48 | output=${corpus}.${model}.avg.beam${beam_size} 49 | header ${output} 50 | ./seq2seq.sh ${model_dir}/config.yaml --eval ${corpus} --beam-size ${beam_size} --checkpoints ${model_dir}/checkpoints/average --gpu-id ${gpu_id} --raw-output --output ${eval_dir}/${output}.raw 2>&1 | filter 51 | scripts/post_editing/reverse-edits.py data/APE/${corpus}.mt ${eval_dir}/${output}.raw > ${eval_dir}/${output}.out 52 | done 53 | 54 | 
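# Ensemble decoding: combine the four independently trained ${size} models at
# decoding time (--ensemble), first with their single best checkpoints and then
# with the averaged checkpoints created by the loop above; like the single-model
# runs, the raw edit sequences are converted back to post-edited text with
# scripts/post_editing/reverse-edits.py.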
output=${corpus}.${size}.ensemble.beam${beam_size} 55 | header ${output} 56 | ./seq2seq.sh ${root_dir}/${size}.1/config.yaml --eval ${corpus} --beam-size ${beam_size} --ensemble --checkpoints ${root_dir}/${size}.{1,2,3,4}/checkpoints/best --gpu-id ${gpu_id} --raw-output --output ${eval_dir}/${output}.raw 2>&1 | filter 57 | scripts/post_editing/reverse-edits.py data/APE/${corpus}.mt ${eval_dir}/${output}.raw > ${eval_dir}/${output}.out 58 | 59 | output=${corpus}.${size}.ensemble.avg.beam${beam_size} 60 | header ${output} 61 | ./seq2seq.sh ${root_dir}/${size}.1/config.yaml --eval ${corpus} --beam-size ${beam_size} --ensemble --checkpoints ${root_dir}/${size}.{1,2,3,4}/checkpoints/average --gpu-id ${gpu_id} --raw-output --output ${eval_dir}/${output}.raw 2>&1 | filter 62 | scripts/post_editing/reverse-edits.py data/APE/${corpus}.mt ${eval_dir}/${output}.raw > ${eval_dir}/${output}.out 63 | done 64 | done 65 | -------------------------------------------------------------------------------- /config/APE/large/chained.yaml: -------------------------------------------------------------------------------- 1 | 2 | cell_size: 128 3 | attn_size: 128 4 | embedding_size: 128 5 | cell_type: LSTM 6 | 7 | weight_scale: 0.1 8 | 9 | data_dir: data/APE 10 | model_dir: models/APE/large/chained 11 | train_prefix: train.large 12 | vocab_prefix: vocab.large 13 | 14 | batch_size: 32 15 | 16 | optimizer: sgd 17 | learning_rate: 1.0 18 | learning_rate_decay_factor: 0.8 19 | decay_every_n_epoch: 0.5 20 | decay_after_n_epoch: 1 21 | 22 | steps_per_checkpoint: 1000 23 | steps_per_eval: 1000 24 | score_function: corpus_scores_ter 25 | 26 | max_gradient_norm: 1.0 27 | max_steps: 200000 28 | 29 | final_state: average 30 | pred_embed_proj: False 31 | 32 | encoders: 33 | - name: mt 34 | attention_type: local 35 | attn_window_size: 0 36 | max_len: 40 37 | 38 | - name: src 39 | attention_type: global 40 | max_len: 40 41 | 42 | decoders: 43 | - name: edits 44 | max_len: 50 45 | 46 | pred_edits: True 47 | ref_ext: pe 48 | 49 | use_dropout: True 50 | pervasive_dropout: True 51 | rnn_input_dropout: 0.5 52 | initial_state_dropout: 0.5 53 | 54 | chained_encoders: True 55 | chaining_strategy: map_attns 56 | chaining_non_linearity: True 57 | chaining_loss_ratio: 0.5 58 | chaining_stop_gradient: False 59 | -------------------------------------------------------------------------------- /config/APE/large/forced.yaml: -------------------------------------------------------------------------------- 1 | 2 | cell_size: 128 3 | attn_size: 128 4 | embedding_size: 128 5 | cell_type: LSTM 6 | 7 | weight_scale: 0.1 8 | 9 | data_dir: data/APE 10 | model_dir: models/APE/large/forced 11 | train_prefix: train.large 12 | vocab_prefix: vocab.large 13 | 14 | batch_size: 32 15 | 16 | optimizer: sgd 17 | learning_rate: 1.0 18 | learning_rate_decay_factor: 0.8 19 | decay_every_n_epoch: 0.5 20 | decay_after_n_epoch: 1 21 | 22 | steps_per_checkpoint: 1000 23 | steps_per_eval: 1000 24 | score_function: corpus_scores_ter 25 | 26 | max_gradient_norm: 1.0 27 | max_steps: 200000 28 | 29 | final_state: average 30 | pred_embed_proj: False 31 | 32 | encoders: 33 | - name: mt 34 | attention_type: local 35 | attn_window_size: 0 36 | max_len: 40 37 | 38 | decoders: 39 | - name: edits 40 | max_len: 50 41 | 42 | pred_edits: True 43 | ref_ext: pe 44 | 45 | use_dropout: True 46 | pervasive_dropout: True 47 | rnn_input_dropout: 0.5 48 | initial_state_dropout: 0.5 49 | -------------------------------------------------------------------------------- 
/config/APE/large/global.yaml: -------------------------------------------------------------------------------- 1 | 2 | cell_size: 128 3 | attn_size: 128 4 | embedding_size: 128 5 | cell_type: LSTM 6 | 7 | weight_scale: 0.1 8 | 9 | data_dir: data/APE 10 | model_dir: models/APE/large/global 11 | train_prefix: train.large 12 | vocab_prefix: vocab.large 13 | 14 | batch_size: 32 15 | 16 | optimizer: sgd 17 | learning_rate: 1.0 18 | learning_rate_decay_factor: 0.8 19 | decay_every_n_epoch: 0.5 20 | decay_after_n_epoch: 1 21 | 22 | steps_per_checkpoint: 1000 23 | steps_per_eval: 1000 24 | score_function: corpus_scores_ter 25 | 26 | max_gradient_norm: 1.0 27 | max_steps: 200000 28 | 29 | final_state: average 30 | pred_embed_proj: False 31 | 32 | encoders: 33 | - name: mt 34 | attention_type: global 35 | max_len: 40 36 | 37 | decoders: 38 | - name: edits 39 | max_len: 50 40 | 41 | pred_edits: True 42 | ref_ext: pe 43 | 44 | use_dropout: True 45 | pervasive_dropout: True 46 | rnn_input_dropout: 0.5 47 | initial_state_dropout: 0.5 48 | -------------------------------------------------------------------------------- /config/APE/large/multi-global.yaml: -------------------------------------------------------------------------------- 1 | 2 | cell_size: 128 3 | attn_size: 128 4 | embedding_size: 128 5 | cell_type: LSTM 6 | 7 | weight_scale: 0.1 8 | 9 | data_dir: data/APE 10 | model_dir: models/APE/large/multi_global 11 | train_prefix: train.large 12 | vocab_prefix: vocab.large 13 | 14 | batch_size: 32 15 | 16 | optimizer: sgd 17 | learning_rate: 1.0 18 | learning_rate_decay_factor: 0.8 19 | decay_every_n_epoch: 0.5 20 | decay_after_n_epoch: 1 21 | 22 | steps_per_checkpoint: 1000 23 | steps_per_eval: 1000 24 | score_function: corpus_scores_ter 25 | 26 | max_gradient_norm: 1.0 27 | max_steps: 200000 28 | 29 | final_state: average 30 | pred_embed_proj: False 31 | 32 | encoders: 33 | - name: mt 34 | attention_type: global 35 | max_len: 40 36 | - name: src 37 | attention_type: global 38 | max_len: 40 39 | 40 | decoders: 41 | - name: edits 42 | max_len: 50 43 | 44 | pred_edits: True 45 | ref_ext: pe 46 | 47 | use_dropout: True 48 | pervasive_dropout: True 49 | rnn_input_dropout: 0.5 50 | initial_state_dropout: 0.5 51 | -------------------------------------------------------------------------------- /config/APE/large/multi.yaml: -------------------------------------------------------------------------------- 1 | 2 | cell_size: 128 3 | attn_size: 128 4 | embedding_size: 128 5 | cell_type: LSTM 6 | 7 | weight_scale: 0.1 8 | 9 | data_dir: data/APE 10 | model_dir: models/APE/large/multi 11 | train_prefix: train.large 12 | vocab_prefix: vocab.large 13 | 14 | batch_size: 32 15 | 16 | optimizer: sgd 17 | learning_rate: 1.0 18 | learning_rate_decay_factor: 0.8 19 | decay_every_n_epoch: 0.5 20 | decay_after_n_epoch: 1 21 | 22 | steps_per_checkpoint: 1000 23 | steps_per_eval: 1000 24 | score_function: corpus_scores_ter 25 | 26 | max_gradient_norm: 1.0 27 | max_steps: 200000 28 | 29 | final_state: average 30 | pred_embed_proj: False 31 | 32 | encoders: 33 | - name: mt 34 | attention_type: local 35 | attn_window_size: 0 36 | max_len: 40 37 | - name: src 38 | attention_type: global 39 | max_len: 40 40 | 41 | decoders: 42 | - name: edits 43 | max_len: 50 44 | 45 | pred_edits: True 46 | ref_ext: pe 47 | 48 | use_dropout: True 49 | pervasive_dropout: True 50 | rnn_input_dropout: 0.5 51 | initial_state_dropout: 0.5 52 | -------------------------------------------------------------------------------- 
/config/APE/medium/chained.yaml: -------------------------------------------------------------------------------- 1 | 2 | cell_size: 128 3 | attn_size: 128 4 | embedding_size: 128 5 | cell_type: LSTM 6 | 7 | weight_scale: 0.1 8 | 9 | data_dir: data/APE 10 | model_dir: models/APE/medium/chained 11 | 12 | batch_size: 32 13 | 14 | optimizer: sgd 15 | learning_rate: 1.0 16 | learning_rate_decay_factor: 0.95 17 | decay_every_n_epoch: 1 18 | decay_after_n_epoch: 3 19 | 20 | steps_per_checkpoint: 1000 21 | steps_per_eval: 1000 22 | score_function: corpus_scores_ter 23 | 24 | max_gradient_norm: 1.0 25 | max_steps: 60000 26 | 27 | final_state: average 28 | pred_embed_proj: False 29 | 30 | encoders: 31 | - name: mt 32 | attention_type: local 33 | attn_window_size: 0 34 | max_len: 37 35 | 36 | - name: src 37 | attention_type: global 38 | max_len: 33 39 | 40 | decoders: 41 | - name: edits 42 | max_len: 45 43 | 44 | pred_edits: True 45 | ref_ext: pe 46 | 47 | use_dropout: True 48 | pervasive_dropout: True 49 | rnn_input_dropout: 0.5 50 | initial_state_dropout: 0.5 51 | 52 | chained_encoders: True 53 | chaining_strategy: map_attns 54 | chaining_non_linearity: True 55 | chaining_loss_ratio: 0.5 56 | chaining_stop_gradient: False 57 | -------------------------------------------------------------------------------- /config/APE/medium/forced.yaml: -------------------------------------------------------------------------------- 1 | 2 | cell_size: 128 3 | attn_size: 128 4 | embedding_size: 128 5 | cell_type: LSTM 6 | 7 | weight_scale: 0.1 8 | 9 | data_dir: data/APE 10 | model_dir: models/APE/medium/forced 11 | 12 | batch_size: 32 13 | 14 | optimizer: sgd 15 | learning_rate: 1.0 16 | learning_rate_decay_factor: 0.95 17 | decay_every_n_epoch: 1 18 | decay_after_n_epoch: 3 19 | 20 | steps_per_checkpoint: 1000 21 | steps_per_eval: 1000 22 | score_function: corpus_scores_ter 23 | 24 | max_gradient_norm: 1.0 25 | max_steps: 60000 26 | 27 | final_state: average 28 | pred_embed_proj: False 29 | 30 | encoders: 31 | - name: mt 32 | attention_type: local 33 | attn_window_size: 0 34 | max_len: 37 35 | 36 | decoders: 37 | - name: edits 38 | max_len: 45 39 | 40 | pred_edits: True 41 | ref_ext: pe 42 | 43 | use_dropout: True 44 | pervasive_dropout: True 45 | rnn_input_dropout: 0.5 46 | initial_state_dropout: 0.5 47 | -------------------------------------------------------------------------------- /config/APE/medium/global.yaml: -------------------------------------------------------------------------------- 1 | 2 | cell_size: 128 3 | attn_size: 128 4 | embedding_size: 128 5 | cell_type: LSTM 6 | 7 | weight_scale: 0.1 8 | 9 | data_dir: data/APE 10 | model_dir: models/APE/medium/global 11 | 12 | batch_size: 32 13 | 14 | optimizer: sgd 15 | learning_rate: 1.0 16 | learning_rate_decay_factor: 0.95 17 | decay_every_n_epoch: 1 18 | decay_after_n_epoch: 3 19 | 20 | steps_per_checkpoint: 1000 21 | steps_per_eval: 1000 22 | score_function: corpus_scores_ter 23 | 24 | max_gradient_norm: 1.0 25 | max_steps: 60000 26 | 27 | final_state: average 28 | pred_embed_proj: False 29 | 30 | encoders: 31 | - name: mt 32 | attention_type: global 33 | max_len: 37 34 | 35 | decoders: 36 | - name: edits 37 | max_len: 45 38 | 39 | pred_edits: True 40 | ref_ext: pe 41 | 42 | use_dropout: True 43 | pervasive_dropout: True 44 | rnn_input_dropout: 0.5 45 | initial_state_dropout: 0.5 46 | -------------------------------------------------------------------------------- /config/APE/medium/multi-global.yaml: 
-------------------------------------------------------------------------------- 1 | 2 | cell_size: 128 3 | attn_size: 128 4 | embedding_size: 128 5 | cell_type: LSTM 6 | 7 | weight_scale: 0.1 8 | 9 | data_dir: data/APE 10 | model_dir: models/APE/medium/multi_global 11 | 12 | batch_size: 32 13 | 14 | optimizer: sgd 15 | learning_rate: 1.0 16 | learning_rate_decay_factor: 0.95 17 | decay_every_n_epoch: 1 18 | decay_after_n_epoch: 3 19 | 20 | steps_per_checkpoint: 1000 21 | steps_per_eval: 1000 22 | score_function: corpus_scores_ter 23 | 24 | max_gradient_norm: 1.0 25 | max_steps: 60000 26 | 27 | final_state: average 28 | pred_embed_proj: False 29 | 30 | encoders: 31 | - name: mt 32 | attention_type: global 33 | max_len: 37 34 | - name: src 35 | attention_type: global 36 | max_len: 33 37 | 38 | decoders: 39 | - name: edits 40 | max_len: 45 41 | 42 | pred_edits: True 43 | ref_ext: pe 44 | 45 | use_dropout: True 46 | pervasive_dropout: True 47 | rnn_input_dropout: 0.5 48 | initial_state_dropout: 0.5 49 | -------------------------------------------------------------------------------- /config/APE/medium/multi.yaml: -------------------------------------------------------------------------------- 1 | 2 | cell_size: 128 3 | attn_size: 128 4 | embedding_size: 128 5 | cell_type: LSTM 6 | 7 | weight_scale: 0.1 8 | 9 | data_dir: data/APE 10 | model_dir: models/APE/medium/multi 11 | 12 | batch_size: 32 13 | 14 | optimizer: sgd 15 | learning_rate: 1.0 16 | learning_rate_decay_factor: 0.95 17 | decay_every_n_epoch: 1 18 | decay_after_n_epoch: 3 19 | 20 | steps_per_checkpoint: 1000 21 | steps_per_eval: 1000 22 | score_function: corpus_scores_ter 23 | 24 | max_gradient_norm: 1.0 25 | max_steps: 60000 26 | 27 | final_state: average 28 | pred_embed_proj: False 29 | 30 | encoders: 31 | - name: mt 32 | attention_type: local 33 | attn_window_size: 0 34 | max_len: 37 35 | - name: src 36 | attention_type: global 37 | max_len: 33 38 | 39 | decoders: 40 | - name: edits 41 | max_len: 45 42 | 43 | pred_edits: True 44 | ref_ext: pe 45 | 46 | use_dropout: True 47 | pervasive_dropout: True 48 | rnn_input_dropout: 0.5 49 | initial_state_dropout: 0.5 50 | -------------------------------------------------------------------------------- /config/APE/prepare.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | raw_data=raw_data/APE 4 | data_dir=data/APE 5 | 6 | max_vocab_size=30000 7 | 8 | rm -rf ${data_dir} 9 | mkdir -p ${data_dir} 10 | 11 | for ext in mt pe src 12 | do 13 | cat ${raw_data}/train.${ext} > ${data_dir}/train.small.${ext} 14 | cat ${raw_data}/{train,train.2017}.${ext} > ${data_dir}/train.${ext} 15 | cat ${raw_data}/500K.${ext} > ${data_dir}/train.large.${ext} 16 | for i in {1..10} # oversample PE data 17 | do 18 | cat ${raw_data}/{train,train.2017}.${ext} >> ${data_dir}/train.large.${ext} 19 | done 20 | 21 | cp ${raw_data}/dev.${ext} ${data_dir}/dev.${ext} 22 | cp ${raw_data}/test.${ext} ${data_dir}/test.${ext} 23 | cp ${raw_data}/test.2017.${ext} ${data_dir}/test.2017.${ext} 24 | done 25 | 26 | for corpus in train.small train train.large dev test test.2017 27 | do 28 | scripts/post_editing/extract-edits.py ${data_dir}/${corpus}.{mt,pe} > ${data_dir}/${corpus}.edits 29 | done 30 | 31 | cat ${data_dir}/train.small.{mt,pe} > ${data_dir}/train.small.de 32 | cat ${data_dir}/train.{mt,pe} > ${data_dir}/train.de 33 | cat ${data_dir}/train.large.{mt,pe} > ${data_dir}/train.large.de 34 | 35 | scripts/prepare-data.py ${data_dir}/train.small src de edits 
${data_dir} --mode vocab --vocab-size 0 --vocab-prefix vocab.small 36 | scripts/prepare-data.py ${data_dir}/train src de edits ${data_dir} --mode vocab --vocab-size 0 37 | scripts/prepare-data.py ${data_dir}/train.large src de edits ${data_dir} --mode vocab --vocab-size 0 --vocab-prefix vocab.large --vocab-size ${max_vocab_size} 38 | 39 | for vocab in vocab vocab.small vocab.large # joint vocabularies 40 | do 41 | cp ${data_dir}/${vocab}.de ${data_dir}/${vocab}.mt 42 | cp ${data_dir}/${vocab}.de ${data_dir}/${vocab}.pe 43 | done 44 | rm ${data_dir}/*.de 45 | -------------------------------------------------------------------------------- /config/APE/small/chained.yaml: -------------------------------------------------------------------------------- 1 | 2 | cell_size: 128 3 | attn_size: 128 4 | embedding_size: 128 5 | cell_type: LSTM 6 | 7 | weight_scale: 0.1 8 | 9 | data_dir: data/APE 10 | model_dir: models/APE/small/chained 11 | train_prefix: train.small 12 | vocab_prefix: vocab.small 13 | 14 | batch_size: 32 15 | 16 | optimizer: sgd 17 | learning_rate: 1.0 18 | learning_rate_decay_factor: 0.95 19 | decay_every_n_epoch: 1 20 | decay_after_n_epoch: 3 21 | 22 | steps_per_checkpoint: 1000 23 | steps_per_eval: 1000 24 | score_function: corpus_scores_ter 25 | 26 | max_gradient_norm: 1.0 27 | max_steps: 40000 28 | 29 | final_state: average 30 | pred_embed_proj: False 31 | 32 | encoders: 33 | - name: mt 34 | attention_type: local 35 | attn_window_size: 0 36 | max_len: 37 37 | 38 | - name: src 39 | attention_type: global 40 | max_len: 33 41 | 42 | decoders: 43 | - name: edits 44 | max_len: 45 45 | 46 | pred_edits: True 47 | ref_ext: pe 48 | 49 | use_dropout: True 50 | pervasive_dropout: True 51 | rnn_input_dropout: 0.5 52 | initial_state_dropout: 0.5 53 | 54 | chained_encoders: True 55 | chaining_strategy: map_attns 56 | chaining_non_linearity: True 57 | chaining_loss_ratio: 0.5 58 | chaining_stop_gradient: False 59 | -------------------------------------------------------------------------------- /config/APE/small/forced.yaml: -------------------------------------------------------------------------------- 1 | 2 | cell_size: 128 3 | attn_size: 128 4 | embedding_size: 128 5 | cell_type: LSTM 6 | 7 | weight_scale: 0.1 8 | 9 | data_dir: data/APE 10 | model_dir: models/APE/small/forced 11 | train_prefix: train.small 12 | vocab_prefix: vocab.small 13 | 14 | batch_size: 32 15 | 16 | optimizer: sgd 17 | learning_rate: 1.0 18 | learning_rate_decay_factor: 0.95 19 | decay_every_n_epoch: 1 20 | decay_after_n_epoch: 3 21 | 22 | steps_per_checkpoint: 1000 23 | steps_per_eval: 1000 24 | score_function: corpus_scores_ter 25 | 26 | max_gradient_norm: 1.0 27 | max_steps: 40000 28 | 29 | final_state: average 30 | pred_embed_proj: False 31 | 32 | encoders: 33 | - name: mt 34 | attention_type: local 35 | attn_window_size: 0 36 | max_len: 37 37 | 38 | decoders: 39 | - name: edits 40 | max_len: 45 41 | 42 | pred_edits: True 43 | ref_ext: pe 44 | 45 | use_dropout: True 46 | pervasive_dropout: True 47 | rnn_input_dropout: 0.5 48 | initial_state_dropout: 0.5 49 | -------------------------------------------------------------------------------- /config/APE/small/global.yaml: -------------------------------------------------------------------------------- 1 | 2 | cell_size: 128 3 | attn_size: 128 4 | embedding_size: 128 5 | cell_type: LSTM 6 | 7 | weight_scale: 0.1 8 | 9 | data_dir: data/APE 10 | model_dir: models/APE/small/global 11 | train_prefix: train.small 12 | vocab_prefix: vocab.small 13 | 14 | batch_size: 32 15 | 
16 | optimizer: sgd 17 | learning_rate: 1.0 18 | learning_rate_decay_factor: 0.95 19 | decay_every_n_epoch: 1 20 | decay_after_n_epoch: 3 21 | 22 | steps_per_checkpoint: 1000 23 | steps_per_eval: 1000 24 | score_function: corpus_scores_ter 25 | 26 | max_gradient_norm: 1.0 27 | max_steps: 40000 28 | 29 | final_state: average 30 | pred_embed_proj: False 31 | 32 | encoders: 33 | - name: mt 34 | attention_type: global 35 | max_len: 37 36 | 37 | decoders: 38 | - name: edits 39 | max_len: 45 40 | 41 | pred_edits: True 42 | ref_ext: pe 43 | 44 | use_dropout: True 45 | pervasive_dropout: True 46 | rnn_input_dropout: 0.5 47 | initial_state_dropout: 0.5 48 | -------------------------------------------------------------------------------- /config/APE/small/multi-global.yaml: -------------------------------------------------------------------------------- 1 | 2 | cell_size: 128 3 | attn_size: 128 4 | embedding_size: 128 5 | cell_type: LSTM 6 | 7 | weight_scale: 0.1 8 | 9 | data_dir: data/APE 10 | model_dir: models/APE/small/multi_global 11 | train_prefix: train.small 12 | vocab_prefix: vocab.small 13 | 14 | batch_size: 32 15 | 16 | optimizer: sgd 17 | learning_rate: 1.0 18 | learning_rate_decay_factor: 0.95 19 | decay_every_n_epoch: 1 20 | decay_after_n_epoch: 3 21 | 22 | steps_per_checkpoint: 1000 23 | steps_per_eval: 1000 24 | score_function: corpus_scores_ter 25 | 26 | max_gradient_norm: 1.0 27 | max_steps: 40000 28 | 29 | final_state: average 30 | pred_embed_proj: False 31 | 32 | encoders: 33 | - name: mt 34 | attention_type: global 35 | max_len: 37 36 | - name: src 37 | attention_type: global 38 | max_len: 33 39 | 40 | decoders: 41 | - name: edits 42 | max_len: 45 43 | 44 | pred_edits: True 45 | ref_ext: pe 46 | 47 | use_dropout: True 48 | pervasive_dropout: True 49 | rnn_input_dropout: 0.5 50 | initial_state_dropout: 0.5 51 | -------------------------------------------------------------------------------- /config/APE/small/multi.yaml: -------------------------------------------------------------------------------- 1 | 2 | cell_size: 128 3 | attn_size: 128 4 | embedding_size: 128 5 | cell_type: LSTM 6 | 7 | weight_scale: 0.1 8 | 9 | data_dir: data/APE 10 | model_dir: models/APE/small/multi 11 | train_prefix: train.small 12 | vocab_prefix: vocab.small 13 | 14 | batch_size: 32 15 | 16 | optimizer: sgd 17 | learning_rate: 1.0 18 | learning_rate_decay_factor: 0.95 19 | decay_every_n_epoch: 1 20 | decay_after_n_epoch: 3 21 | 22 | steps_per_checkpoint: 1000 23 | steps_per_eval: 1000 24 | score_function: corpus_scores_ter 25 | 26 | max_gradient_norm: 1.0 27 | max_steps: 40000 28 | 29 | final_state: average 30 | pred_embed_proj: False 31 | 32 | encoders: 33 | - name: mt 34 | attention_type: local 35 | attn_window_size: 0 36 | max_len: 37 37 | - name: src 38 | attention_type: global 39 | max_len: 33 40 | 41 | decoders: 42 | - name: edits 43 | max_len: 45 44 | 45 | pred_edits: True 46 | ref_ext: pe 47 | 48 | use_dropout: True 49 | pervasive_dropout: True 50 | rnn_input_dropout: 0.5 51 | initial_state_dropout: 0.5 52 | -------------------------------------------------------------------------------- /config/BTEC/ASR.yaml: -------------------------------------------------------------------------------- 1 | label: 'BTEC ASR' 2 | 3 | data_dir: data/BTEC 4 | model_dir: models/BTEC/ASR 5 | train_prefix: train.concat 6 | max_train_size: 40000 7 | 8 | batch_size: 64 9 | weight_scale: null 10 | 11 | steps_per_checkpoint: 1000 12 | steps_per_eval: 1000 13 | max_steps: 60000 14 | score_function: corpus_scores_wer 15 | 
16 | cell_size: 256 17 | attn_size: 256 18 | cell_type: LSTM 19 | 20 | encoders: 21 | - name: speech.fr 22 | ext: npz 23 | embedding_size: 41 24 | layers: 3 25 | conv_filters: [16, 16] 26 | conv_size: [3, 3] 27 | conv_strides: [2, 2] 28 | conv_activation: null 29 | binary: True 30 | max_len: 600 31 | input_layers: [256, 128] 32 | bidir_projection: True 33 | final_state: concat_last 34 | train_initial_states: False 35 | input_layer_dropout: 0.4 36 | 37 | decoders: 38 | - name: char.fr 39 | layers: 2 40 | embedding_size: 64 41 | max_len: 140 42 | pred_maxout_layer: False 43 | use_previous_word: False 44 | pred_embed_proj: False 45 | character_level: True 46 | 47 | use_dropout: True 48 | pervasive_dropout: True 49 | attn_dropout: 0.4 50 | rnn_input_dropout: 0.4 51 | initial_state_dropout: 0.4 52 | -------------------------------------------------------------------------------- /config/BTEC/AST.yaml: -------------------------------------------------------------------------------- 1 | label: 'BTEC AST' 2 | 3 | data_dir: data/BTEC 4 | model_dir: models/BTEC/AST 5 | train_prefix: train.concat 6 | max_train_size: 40000 7 | 8 | batch_size: 64 9 | weight_scale: null 10 | 11 | steps_per_checkpoint: 1000 12 | steps_per_eval: 1000 13 | max_steps: 100000 14 | score_function: corpus_scores 15 | 16 | cell_size: 256 17 | attn_size: 256 18 | cell_type: LSTM 19 | 20 | encoders: 21 | - name: speech.fr 22 | ext: npz 23 | embedding_size: 41 24 | layers: 3 25 | conv_filters: [16, 16] 26 | conv_size: [3, 3] 27 | conv_strides: [2, 2] 28 | conv_activation: null 29 | binary: True 30 | max_len: 600 31 | input_layers: [256, 128] 32 | bidir_projection: True 33 | final_state: concat_last 34 | train_initial_states: False 35 | input_layer_dropout: 0.4 36 | 37 | decoders: 38 | - name: char.en 39 | embedding_size: 64 40 | max_len: 120 41 | conditional_rnn: True 42 | pred_maxout_layer: False 43 | use_previous_word: False 44 | pred_embed_proj: False 45 | character_level: True 46 | 47 | use_dropout: True 48 | pervasive_dropout: True 49 | attn_dropout: 0.4 50 | rnn_input_dropout: 0.4 51 | initial_state_dropout: 0.4 52 | -------------------------------------------------------------------------------- /config/BTEC/MT.yaml: -------------------------------------------------------------------------------- 1 | label: 'BTEC MT' 2 | 3 | data_dir: data/BTEC 4 | model_dir: models/BTEC/MT 5 | 6 | batch_size: 64 7 | weight_scale: null 8 | embedding_weight_scale: 0.1 9 | embedding_initializer: uniform 10 | 11 | steps_per_checkpoint: 1000 12 | steps_per_eval: 1000 13 | max_steps: 100000 14 | score_function: corpus_scores 15 | 16 | cell_size: 256 17 | attn_size: 256 18 | cell_type: LSTM 19 | 20 | encoders: 21 | - name: fr 22 | embedding_size: 128 23 | max_len: 25 24 | bidir_projection: True 25 | final_state: average 26 | train_initial_states: False 27 | embedding_dropout: 0.2 28 | 29 | decoders: 30 | - name: char.en 31 | embedding_size: 64 32 | max_len: 120 33 | conditional_rnn: True 34 | pred_maxout_layer: False 35 | use_previous_word: False 36 | pred_embed_proj: False 37 | character_level: True 38 | word_dropout: 0.2 39 | 40 | use_dropout: True 41 | attn_dropout: 0.2 42 | rnn_input_dropout: 0.2 43 | initial_state_dropout: 0.2 44 | rnn_output_dropout: 0.2 45 | -------------------------------------------------------------------------------- /config/BTEC/Multi-Task-joint.yaml: -------------------------------------------------------------------------------- 1 | label: 'BTEC Multi-Task' 2 | description: "Multi-Task training of AST, MT and ASR 
models on BTEC, with a joint training loss" 3 | 4 | data_dir: data/BTEC 5 | model_dir: models/BTEC/AST_multitask_joint 6 | train_prefix: train.concat 7 | max_train_size: 40000 8 | 9 | batch_size: 64 10 | weight_scale: null 11 | 12 | steps_per_checkpoint: 1000 13 | steps_per_eval: 1000 14 | max_steps: 100000 15 | score_function: corpus_scores 16 | 17 | train_initial_states: False 18 | bidir_projection: True 19 | pred_embed_proj: False 20 | use_previous_word: False 21 | pred_deep_layer: False 22 | pred_maxout_layer: False 23 | 24 | cell_size: 256 25 | attn_size: 256 26 | cell_type: LSTM 27 | embedding_size: 64 28 | 29 | multi_task: True 30 | task_ratios: [0.6, 0.2, 0.2, 0] # (0,0) (0,1) (1,0) (1,1) 31 | 32 | encoders: 33 | - name: speech.fr 34 | ext: npz 35 | embedding_size: 41 36 | layers: 3 37 | binary: True 38 | final_state: concat_last 39 | conv_filters: [16, 16] 40 | conv_size: [3, 3] 41 | conv_strides: [2, 2] 42 | conv_activation: null 43 | input_layers: [256, 128] 44 | input_layer_activation: tanh 45 | max_len: 600 46 | - name: fr 47 | embedding_size: 128 48 | conv_filters: null 49 | input_layers: null 50 | max_len: 25 51 | final_state: average 52 | decoders: 53 | - name: char.en 54 | character_level: True 55 | conditional_rnn: True 56 | max_len: 120 57 | - name: char.fr 58 | layers: 2 59 | character_level: True 60 | max_len: 140 61 | 62 | use_dropout: True 63 | pervasive_dropout: True 64 | attn_dropout: 0.4 65 | rnn_input_dropout: 0.4 66 | initial_state_dropout: 0.4 67 | input_layer_dropout: 0.4 68 | -------------------------------------------------------------------------------- /config/BTEC/Multi-Task.yaml: -------------------------------------------------------------------------------- 1 | label: 'BTEC Multi-Task' 2 | description: "Multi-Task training of AST, MT and ASR models on BTEC" 3 | 4 | data_dir: data/BTEC 5 | model_dir: models/BTEC/AST_multitask 6 | train_prefix: train.concat 7 | max_train_size: 40000 8 | 9 | batch_size: 64 10 | weight_scale: null 11 | 12 | steps_per_checkpoint: 500 13 | steps_per_eval: 500 14 | max_steps: 100000 15 | score_function: corpus_scores 16 | 17 | train_initial_states: False 18 | bidir_projection: True 19 | pred_embed_proj: False 20 | use_previous_word: False 21 | pred_deep_layer: False 22 | pred_maxout_layer: False 23 | conditional_rnn: True 24 | conv_filters: [16, 16] 25 | conv_size: [3, 3] 26 | conv_strides: [2, 2] 27 | conv_activation: null 28 | input_layers: [256, 128] 29 | input_layer_activation: tanh 30 | final_state: concat_last 31 | max_len: 600 32 | 33 | cell_size: 256 34 | attn_size: 256 35 | cell_type: LSTM 36 | embedding_size: 64 37 | 38 | tasks: 39 | - name: AST 40 | ratio: 0.6 41 | encoders: 42 | - name: speech.fr 43 | ext: npz 44 | embedding_size: 41 45 | layers: 3 46 | binary: True 47 | 48 | decoders: 49 | - name: char.en 50 | layers: 1 51 | character_level: True 52 | max_len: 120 53 | 54 | - name: ASR 55 | ratio: 0.2 56 | encoders: 57 | - name: speech.fr 58 | ext: npz 59 | embedding_size: 41 60 | layers: 3 61 | binary: True 62 | 63 | decoders: 64 | - name: char.fr 65 | layers: 2 66 | character_level: True 67 | conditional_rnn: False 68 | max_len: 140 69 | 70 | - name: MT 71 | ratio: 0.2 72 | train_prefix: train 73 | encoders: 74 | - name: fr 75 | embedding_size: 128 76 | conv_filters: null 77 | input_layers: null 78 | max_len: 25 79 | final_state: average 80 | 81 | decoders: 82 | - name: char.en 83 | layers: 1 84 | character_level: True 85 | max_len: 120 86 | 87 | use_dropout: True 88 | pervasive_dropout: True 89 | attn_dropout: 
0.4 90 | rnn_input_dropout: 0.4 91 | initial_state_dropout: 0.4 92 | input_layer_dropout: 0.4 93 | -------------------------------------------------------------------------------- /config/BTEC/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Speech processing 4 | 5 | ## Install Yaafe 6 | 7 | ~~~ 8 | sudo apt-get install cmake cmake-curses-gui libargtable2-0 libargtable2-dev \ 9 | libsndfile1 libsndfile1-dev libmpg123-0 libmpg123-dev libfftw3-3 libfftw3-dev \ 10 | liblapack-dev libhdf5-serial-dev gcc-4.8 g++-4.8 11 | 12 | wget https://sourceforge.net/projects/yaafe/files/yaafe-v0.64.tgz/download -O yaafe-v0.64.tgz 13 | 14 | tar xzf yaafe-v0.64.tgz 15 | cd yaafe-v0.64 16 | 17 | # fix bug in the official release 18 | cat src_cpp/yaafe-core/Ports.h | sed "s/\tpush_back/\tthis->push_back/g" > src_cpp/yaafe-core/Ports.h.fixed 19 | mv src_cpp/yaafe-core/Ports.h.fixed src_cpp/yaafe-core/Ports.h 20 | 21 | mkdir build 22 | cd build 23 | export CC=/usr/bin/gcc-4.8 24 | export CXX=/usr/bin/g++-4.8 25 | cmake .. 26 | make 27 | sudo make install 28 | 29 | echo "export PYTHONPATH=/usr/local/python_packages/:\$PYTHONPATH" >> ~/.bashrc 30 | echo "export LD_LIBRARY_PATH=/usr/local/lib/:\$LD_LIBRARY_PATH" >> ~/.bashrc 31 | echo "export YAAFE_PATH=/usr/local/yaafe_extensions" >> ~/.bashrc 32 | ~~~ 33 | 34 | ## Configuration files 35 | 36 | Examples of configuration files for ASR and AST are: `config/BTEC/ASR.yaml` and `config/BTEC/AST.yaml`. 37 | You'll need to modify the `data_dir`, `model_dir`, `train_prefix` and `vocab_prefix` parameters. Also, you should set the right `name` for the `encoders` and `decoders` parameters (it should be the same as the source and target extensions). 38 | 39 | A very important parameter for ASR and AST is the `max_len` parameter (specific to each encoder and decoder). It defines the maximum length of the input and output sequences. Training time and memory usage depend on this limit. Because audio sequences are very long (1 frame every 10 ms), training can take a lot of memory. 
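To make the frame-rate arithmetic behind `max_len` concrete, here is a tiny sketch (an illustration only, assuming the 10 ms frame shift mentioned above; the helper names are made up):

~~~
# Illustration only: convert between audio duration and encoder time steps,
# assuming one feature frame every 10 ms.
FRAME_SHIFT_S = 0.01

def frames_for_duration(seconds):
    """Encoder time steps needed for an utterance of the given duration."""
    return int(round(seconds / FRAME_SHIFT_S))

def max_duration(max_len):
    """Longest utterance (in seconds) that fits under a given encoder max_len."""
    return max_len * FRAME_SHIFT_S

print(frames_for_duration(6.0))  # 600, the speech encoder max_len in config/BTEC/ASR.yaml
print(max_duration(600))         # 6.0 seconds
~~~

For a character-level decoder (`character_level: True`), `max_len` counts characters rather than words, which is why `config/BTEC/ASR.yaml` uses `max_len: 140` for the `char.fr` decoder.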
40 | 41 | -------------------------------------------------------------------------------- /config/BTEC/prepare.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # speech data preparation script 4 | # this script assumes that the BTEC raw files are in the ${raw_data} dir 5 | raw_data=raw_data/BTEC 6 | data_dir=data/BTEC # output directory for the processed files (text and audio features) 7 | 8 | rm -rf ${data_dir} 9 | mkdir -p ${data_dir} 10 | 11 | scripts/speech/extract.py ${raw_data}/train-{Fabienne,Helene,Loic,Marion,Michel,Philippe}.tar ${data_dir}/train.concat.npz 12 | scripts/speech/extract.py ${raw_data}/dev-Agnes.tar ${data_dir}/dev.npz 13 | scripts/speech/extract.py ${raw_data}/test-Agnes.tar ${data_dir}/test.npz 14 | 15 | rm -f ${data_dir}/train.raw.{fr,en} 16 | for i in {1..6} 17 | do 18 | cat ${raw_data}/train.fr >> ${data_dir}/train.raw.fr 19 | cat ${raw_data}/train.en >> ${data_dir}/train.raw.en 20 | done 21 | 22 | scripts/prepare-data.py ${data_dir}/train.raw fr en ${data_dir} --lowercase --output train.concat --mode prepare 23 | scripts/prepare-data.py ${raw_data}/dev fr en ${data_dir} --lowercase --output dev --mode prepare 24 | scripts/prepare-data.py ${raw_data}/test fr en ${data_dir} --lowercase --output test --mode prepare 25 | scripts/prepare-data.py ${raw_data}/train fr en ${data_dir} --lowercase 26 | 27 | scripts/prepare-data.py ${raw_data}/dev mref.en ${data_dir} --lowercase --output dev --mode prepare --lang en 28 | scripts/prepare-data.py ${raw_data}/test mref.en ${data_dir} --lowercase --output test --mode prepare --lang en 29 | 30 | scripts/speech/shuf.py ${data_dir}/train.concat.npz --input-txt ${data_dir}/train.concat.{fr,en} 31 | 32 | scripts/prepare-data.py ${data_dir}/train fr en ${data_dir} --mode vocab --character-level --no-tokenize --vocab-prefix vocab.char 33 | 34 | for corpus in train.concat train dev test 35 | do 36 | cp ${data_dir}/${corpus}.fr ${data_dir}/${corpus}.char.fr 37 | cp ${data_dir}/${corpus}.en ${data_dir}/${corpus}.char.en 38 | done 39 | 40 | -------------------------------------------------------------------------------- /config/BTEC/voxygen/convert-to-audio.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | filename=$1 5 | dir=$2 6 | voice=$3 # Agnes, Fabienne, Helene, Loic, Marion, Michel, Philippe (default = Agnes) 7 | 8 | mkdir -p ${dir} 9 | lines=`wc -l ${filename} | cut -d' ' -f1` 10 | digits=$((`echo ${lines} | wc -c` - 1)) 11 | 12 | for i in `seq 1 ${lines}`; 13 | do 14 | num=`printf "%0${digits}d" ${i}` 15 | cat ${filename} | sed -n "${i},${i}p" > ${dir}/${num}.txt 16 | config/BTEC/voxygen/wsclient.py -i ${dir}/${num}.txt -o ${dir}/${num}.wav header=wav-header frequency=16000 coding=lin voice=${voice} 17 | rm ${dir}/${num}.txt 18 | done 19 | 20 | -------------------------------------------------------------------------------- /config/IWSLT14/BPE-TED.yaml: -------------------------------------------------------------------------------- 1 | label: 'IWSLT14 BPE Model + Monolingual data' 2 | 3 | cell_size: 512 4 | attn_size: 512 5 | embedding_size: 128 6 | 7 | cell_type: LSTM 8 | weight_scale: 0.1 9 | 10 | data_dir: data/IWSLT14 11 | model_dir: models/IWSLT14/BPE_TED 12 | train_prefix: train.TED 13 | vocab_prefix: vocab.joint 14 | 15 | batch_size: 32 16 | 17 | steps_per_checkpoint: 4000 18 | steps_per_eval: 4000 19 | score_function: corpus_scores 20 | 21 | max_gradient_norm: 1.0 22 | batch_mode: 
standard 23 | read_ahead: 20 24 | max_steps: 800000 25 | keep_best: 4 26 | 27 | encoders: 28 | - name: joint 29 | ext: jsub.de 30 | max_len: 52 31 | train_initial_states: False 32 | 33 | decoders: 34 | - name: joint 35 | ext: jsub.en 36 | max_len: 50 37 | conditional_rnn: True 38 | pred_deep_layer: True 39 | use_previous_word: False 40 | initial_state: zero 41 | 42 | use_dropout: True 43 | rnn_input_dropout: 0.4 44 | word_dropout: 0.2 45 | -------------------------------------------------------------------------------- /config/IWSLT14/BPE.yaml: -------------------------------------------------------------------------------- 1 | label: 'IWSLT14 BPE Model' 2 | 3 | cell_size: 512 4 | attn_size: 512 5 | embedding_size: 128 6 | 7 | cell_type: LSTM 8 | weight_scale: 0.1 9 | 10 | data_dir: data/IWSLT14 11 | model_dir: models/IWSLT14/BPE 12 | vocab_prefix: vocab.joint 13 | 14 | batch_size: 32 15 | 16 | steps_per_checkpoint: 4000 17 | steps_per_eval: 4000 18 | score_function: corpus_scores 19 | 20 | max_gradient_norm: 1.0 21 | batch_mode: standard 22 | read_ahead: 20 23 | max_steps: 400000 24 | keep_best: 4 25 | 26 | encoders: 27 | - name: joint 28 | ext: jsub.de 29 | max_len: 52 30 | train_initial_states: False 31 | 32 | decoders: 33 | - name: joint 34 | ext: jsub.en 35 | max_len: 50 36 | conditional_rnn: True 37 | pred_deep_layer: True 38 | use_previous_word: False 39 | initial_state: zero 40 | 41 | use_dropout: True 42 | rnn_input_dropout: 0.4 43 | word_dropout: 0.2 44 | -------------------------------------------------------------------------------- /config/IWSLT14/BPE2char-TED.yaml: -------------------------------------------------------------------------------- 1 | label: 'IWSLT14 BPE to character Model + Monolingual data' 2 | 3 | cell_size: 512 4 | attn_size: 512 5 | embedding_size: 128 6 | 7 | cell_type: LSTM 8 | weight_scale: 0.1 9 | 10 | data_dir: data/IWSLT14 11 | model_dir: models/IWSLT14/BPE2char_TED 12 | train_prefix: train.TED 13 | 14 | batch_size: 32 15 | 16 | steps_per_checkpoint: 4000 17 | steps_per_eval: 4000 18 | score_function: corpus_scores 19 | 20 | max_gradient_norm: 1.0 21 | batch_mode: standard 22 | read_ahead: 20 23 | max_steps: 800000 24 | keep_best: 4 25 | 26 | encoders: 27 | - name: jsub.de 28 | max_len: 52 29 | train_initial_states: False 30 | 31 | decoders: 32 | - name: char.en 33 | max_len: 239 34 | character_level: True 35 | conditional_rnn: True 36 | pred_deep_layer: True 37 | use_previous_word: False 38 | initial_state: zero 39 | 40 | use_dropout: True 41 | rnn_input_dropout: 0.4 42 | word_dropout: 0.2 43 | -------------------------------------------------------------------------------- /config/IWSLT14/BPE2char.yaml: -------------------------------------------------------------------------------- 1 | label: 'IWSLT14 BPE to character Model' 2 | 3 | cell_size: 512 4 | attn_size: 512 5 | embedding_size: 128 6 | 7 | cell_type: LSTM 8 | weight_scale: 0.1 9 | 10 | data_dir: data/IWSLT14 11 | model_dir: models/IWSLT14/BPE2char 12 | 13 | batch_size: 32 14 | 15 | steps_per_checkpoint: 4000 16 | steps_per_eval: 4000 17 | score_function: corpus_scores 18 | 19 | max_gradient_norm: 1.0 20 | batch_mode: standard 21 | read_ahead: 20 22 | max_steps: 800000 23 | keep_best: 4 24 | 25 | encoders: 26 | - name: jsub.de 27 | max_len: 52 28 | train_initial_states: False 29 | 30 | decoders: 31 | - name: char.en 32 | max_len: 239 33 | character_level: True 34 | conditional_rnn: True 35 | pred_deep_layer: True 36 | use_previous_word: False 37 | initial_state: zero 38 | 39 | use_dropout: 
True 40 | rnn_input_dropout: 0.4 41 | word_dropout: 0.2 42 | -------------------------------------------------------------------------------- /config/IWSLT14/Back-Translation/baseline-TED.yaml: -------------------------------------------------------------------------------- 1 | label: 'IWSLT14 Baseline' 2 | description: "IWSLT14 new baseline" 3 | 4 | cell_size: 256 5 | attn_size: 256 6 | embedding_size: 128 7 | 8 | bidir: True 9 | cell_type: LSTM 10 | weight_scale: 0.1 11 | 12 | data_dir: data/IWSLT14 13 | model_dir: models/IWSLT14/baseline_TED 14 | train_prefix: train.TED 15 | vocab_prefix: vocab.TED 16 | batch_size: 32 17 | 18 | optimizer: adam 19 | learning_rate: 0.001 20 | learning_rate_decay_factor: 0.5 21 | decay_every_n_epoch: 1 22 | 23 | steps_per_checkpoint: 2000 24 | steps_per_eval: 2000 25 | 26 | max_gradient_norm: 1.0 27 | batch_mode: standard 28 | read_ahead: 20 29 | max_epochs: 4 30 | 31 | encoders: 32 | - name: de 33 | max_len: 45 34 | final_state: last_both 35 | 36 | decoders: 37 | - name: en 38 | max_len: 47 39 | conditional_rnn: True 40 | pred_deep_layer: True 41 | 42 | use_dropout: True 43 | pervasive_dropout: True 44 | rnn_input_dropout: 0.2 45 | attn_dropout: 0.2 46 | word_dropout: 0.2 47 | initial_state_dropout: 0.2 48 | -------------------------------------------------------------------------------- /config/IWSLT14/Back-Translation/char-level-TED.yaml: -------------------------------------------------------------------------------- 1 | label: 'IWSLT14 Char-level' 2 | description: "IWSLT14 subwords to characters" 3 | 4 | cell_size: 256 5 | attn_size: 256 6 | embedding_size: 128 7 | 8 | bidir: True 9 | cell_type: LSTM 10 | weight_scale: 0.1 11 | 12 | data_dir: data/IWSLT14 13 | model_dir: models/IWSLT14/char_level_TED 14 | train_prefix: train.TED 15 | vocab_prefix: vocab.TED 16 | batch_size: 32 17 | 18 | optimizer: adam 19 | learning_rate: 0.001 20 | learning_rate_decay_factor: 0.5 21 | decay_every_n_epoch: 1 22 | 23 | steps_per_checkpoint: 2000 24 | steps_per_eval: 2000 25 | 26 | max_gradient_norm: 1.0 27 | batch_mode: standard 28 | read_ahead: 20 29 | max_epochs: 4 30 | 31 | encoders: 32 | - name: jsub.de 33 | max_len: 51 34 | final_state: last_both 35 | 36 | decoders: 37 | - name: char.en 38 | max_len: 200 39 | conditional_rnn: True 40 | pred_deep_layer: True 41 | character_level: True 42 | 43 | use_dropout: True 44 | pervasive_dropout: True 45 | rnn_input_dropout: 0.2 46 | attn_dropout: 0.2 47 | word_dropout: 0.2 48 | initial_state_dropout: 0.2 49 | -------------------------------------------------------------------------------- /config/IWSLT14/Back-Translation/decode.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | data_dir=data/IWSLT14 6 | model_dir=models/IWSLT14/Back_Translation_LM 7 | 8 | file_id=$1 9 | 10 | input_filename=${model_dir}/data/${file_id} 11 | output_filename=${model_dir}/output/${file_id} 12 | 13 | new_dir=`mktemp -d` 14 | tmp_dir=${new_dir}/moses 15 | scripts/decode-moses.sh ${model_dir}/moses.tuned.ini ${tmp_dir} ${input_filename} ${output_filename} 1>/dev/null 2>/dev/null 16 | rm -rf ${new_dir} 17 | -------------------------------------------------------------------------------- /config/IWSLT14/Back-Translation/eval.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | data_dir=data/IWSLT14 6 | model_dir=models/IWSLT14 7 | 8 | if [ -z ${MOSES} ] 9 | then 10 | echo "variable MOSES 
undefined" 11 | exit 0 12 | fi 13 | 14 | new_dir=`mktemp -d` 15 | tmp_dir=${new_dir}/moses 16 | 17 | scripts/decode-moses.sh ${model_dir}/Back_Translation/moses.tuned.ini ${tmp_dir} ${data_dir}/test.en ${model_dir}/Back_Translation/test.mt 1>/dev/null 2>/dev/null 18 | scripts/score.py ${model_dir}/Back_Translation/test.mt ${data_dir}/test.de --bleu 19 | 20 | scripts/decode-moses.sh ${model_dir}/Back_Translation_LM/moses.tuned.ini ${tmp_dir} ${data_dir}/test.en ${model_dir}/Back_Translation_LM/test.mt 1>/dev/null 2>/dev/null 21 | scripts/score.py ${model_dir}/Back_Translation_LM/test.mt ${data_dir}/test.de --bleu 22 | 23 | rm -rf ${new_dir} 24 | -------------------------------------------------------------------------------- /config/IWSLT14/Back-Translation/prepare.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | data_dir=data/IWSLT14 4 | 5 | cat ${data_dir}/TED.de > ${data_dir}/train.TED.de 6 | cat ${data_dir}/TED.en > ${data_dir}/train.TED.en 7 | 8 | for i in {1..10} 9 | do 10 | cat ${data_dir}/train.de >> ${data_dir}/train.TED.de 11 | cat ${data_dir}/train.en >> ${data_dir}/train.TED.en 12 | done 13 | 14 | scripts/prepare-data.py ${data_dir}/train.TED de en ${data_dir} --mode vocab --vocab-size 30000 --vocab-prefix vocab.TED 15 | 16 | scripts/prepare-data.py ${data_dir}/train.TED de en ${data_dir} --subwords --bpe-path ${data_dir}/bpe.joint \ 17 | --output train.TED.jsub --vocab-size 0 --vocab-prefix vocab.TED.jsub --no-tokenize 18 | 19 | cp ${data_dir}/train.TED.de ${data_dir}/train.TED.char.de 20 | cp ${data_dir}/train.TED.en ${data_dir}/train.TED.char.en 21 | 22 | cp ${data_dir}/vocab.char.de ${data_dir}/vocab.TED.char.de 23 | cp ${data_dir}/vocab.char.en ${data_dir}/vocab.TED.char.en 24 | -------------------------------------------------------------------------------- /config/IWSLT14/Back-Translation/split.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | data_dir=data/IWSLT14 4 | model_dir=models/IWSLT14/Back_Translation_LM 5 | scripts/split-corpus.py ${data_dir}/TED.en ${model_dir}/data --splits 12 --tokens 6 | mkdir -p ${model_dir}/output 7 | -------------------------------------------------------------------------------- /config/IWSLT14/Back-Translation/subwords-TED.yaml: -------------------------------------------------------------------------------- 1 | label: 'IWSLT14 Subwords' 2 | description: "IWSLT14 Joint subwords" 3 | 4 | cell_size: 256 5 | attn_size: 256 6 | embedding_size: 128 7 | 8 | bidir: True 9 | cell_type: LSTM 10 | weight_scale: 0.1 11 | 12 | data_dir: data/IWSLT14 13 | model_dir: models/IWSLT14/subwords_TED 14 | train_prefix: train.TED 15 | vocab_prefix: vocab.TED 16 | batch_size: 32 17 | 18 | optimizer: adam 19 | learning_rate: 0.001 20 | learning_rate_decay_factor: 0.5 21 | decay_every_n_epoch: 1 22 | 23 | steps_per_checkpoint: 2000 24 | steps_per_eval: 2000 25 | 26 | max_gradient_norm: 1.0 27 | batch_mode: standard 28 | read_ahead: 20 29 | max_epochs: 4 30 | 31 | encoders: 32 | - name: jsub.de 33 | max_len: 51 34 | final_state: last_both 35 | 36 | decoders: 37 | - name: jsub.en 38 | max_len: 50 39 | conditional_rnn: True 40 | pred_deep_layer: True 41 | 42 | use_dropout: True 43 | pervasive_dropout: True 44 | rnn_input_dropout: 0.2 45 | attn_dropout: 0.2 46 | word_dropout: 0.2 47 | initial_state_dropout: 0.2 48 | -------------------------------------------------------------------------------- 
/config/IWSLT14/Back-Translation/train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | data_dir=data/IWSLT14 4 | model_dir=models/IWSLT14 5 | train_script=scripts/train-moses.sh 6 | 7 | # model_dir data_dir corpus dev_corpus src_ext trg_ext lm_corpus lm_order 8 | ${train_script} ${model_dir}/Back_Translation ${data_dir} train dev en de train 3 9 | cat ${data_dir}/{train,OpenSubtitles}.de > ${data_dir}/train+OpenSubtitles.de 10 | ${train_script} ${model_dir}/Back_Translation_LM ${data_dir} train dev en de train+OpenSubtitles 3 11 | -------------------------------------------------------------------------------- /config/IWSLT14/prepare-TED.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | data_dir=data/IWSLT14 4 | 5 | cat ${data_dir}/TED.de > ${data_dir}/train.TED.de 6 | cat ${data_dir}/TED.en > ${data_dir}/train.TED.en 7 | 8 | for i in {1..10} 9 | do 10 | cat ${data_dir}/train.de >> ${data_dir}/train.TED.de 11 | cat ${data_dir}/train.en >> ${data_dir}/train.TED.en 12 | done 13 | 14 | scripts/prepare-data.py ${data_dir}/train.TED de en ${data_dir} --mode vocab --vocab-size 30000 --vocab-prefix vocab.TED 15 | 16 | for ext in de en 17 | do 18 | scripts/bpe/apply_bpe.py -c ${data_dir}/bpe.joint.${ext} --vocabulary ${data_dir}/bpe-vocab.${ext} --vocabulary-threshold 10 < ${data_dir}/train.TED.${ext} > ${data_dir}/train.TED.jsub.${ext} 19 | done 20 | 21 | cp ${data_dir}/train.TED.de ${data_dir}/train.TED.char.de 22 | cp ${data_dir}/train.TED.en ${data_dir}/train.TED.char.en 23 | cp ${data_dir}/vocab.char.de ${data_dir}/vocab.TED.char.de 24 | cp ${data_dir}/vocab.char.en ${data_dir}/vocab.TED.char.en 25 | 26 | -------------------------------------------------------------------------------- /config/IWSLT14/prepare-lexicon.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | data_dir=data/IWSLT14 6 | 7 | rm -rf fast_align-master 8 | 9 | wget https://github.com/clab/fast_align/archive/master.zip 10 | unzip master.zip 11 | rm master.zip 12 | cd fast_align-master 13 | mkdir build 14 | cd build 15 | cmake .. 16 | make 17 | cd ../.. 
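# below: align the training corpus with fast_align in both directions, symmetrize the two
# word alignments with atools (grow-diag-final-and), then extract a bilingual lexicon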
18 | 19 | corpus=train 20 | fast_align=fast_align-master/build 21 | 22 | scripts/join.py ${data_dir}/${corpus}.{de,en} > ${data_dir}/${corpus}.de-en 23 | ${fast_align}/fast_align -i ${data_dir}/${corpus}.de-en -d -o -v > ${data_dir}/${corpus}.forward.align 24 | ${fast_align}/fast_align -i ${data_dir}/${corpus}.de-en -d -o -v -r > ${data_dir}/${corpus}.reverse.align 25 | ${fast_align}/atools -i ${data_dir}/${corpus}.forward.align -j ${data_dir}/${corpus}.reverse.align -c grow-diag-final-and > ${data_dir}/${corpus}.align 26 | 27 | scripts/extract-lexicon.py ${data_dir}/${corpus}.{de,en,align} > ${data_dir}/${corpus}.lexicon 28 | python3 -c "print('\n'.join(line.rstrip() for line in open('${data_dir}/${corpus}.lexicon') if not line[0].isupper() and not line.split()[0] == line.split()[1]))" > ${data_dir}/${corpus}.lexicon.purged 29 | 30 | rm -rf fast_align-master 31 | rm ${data_dir}/${corpus}.de-en 32 | rm ${data_dir}/*.align 33 | -------------------------------------------------------------------------------- /config/IWSLT14/prepare-mixer.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Script downloaded from: https://github.com/facebookresearch/MIXER 4 | 5 | TOKENIZER=scripts/moses/tokenizer.perl 6 | UNESCAPE=scripts/moses/unescape-special-chars.perl 7 | LC=scripts/moses/lowercase.perl 8 | CLEAN=scripts/moses/clean-corpus-n.perl 9 | 10 | URL="http://wit3.fbk.eu/archive/2014-01/texts/de/en/de-en.tgz" 11 | GZ=de-en.tgz 12 | 13 | src=de 14 | tgt=en 15 | lang=de-en 16 | prep=prep 17 | tmp=prep/tmp 18 | orig=orig 19 | 20 | mkdir -p $orig $tmp $prep 21 | 22 | echo "Downloading data from ${URL}..." 23 | cd $orig 24 | wget "$URL" 25 | 26 | if [ -f $GZ ]; then 27 | echo "Data successfully downloaded." 28 | else 29 | echo "Data not successfully downloaded." 30 | exit 31 | fi 32 | 33 | tar zxvf $GZ 34 | cd .. 35 | 36 | echo "pre-processing train data..." 37 | for l in $src $tgt; do 38 | f=train.tags.$lang.$l 39 | tok=train.tags.$lang.tok.$l 40 | 41 | cat $orig/$lang/$f | \ 42 | grep -v '<url>' | \ 43 | grep -v '<talkid>' | \ 44 | grep -v '<keywords>' | \ 45 | sed -e 's/<title>//g' | \ 46 | sed -e 's/<\/title>//g' | \ 47 | sed -e 's/<description>//g' | \ 48 | sed -e 's/<\/description>//g' | \ 49 | # perl $UNESCAPE | \ 50 | perl $TOKENIZER -threads 8 -l $l > $tmp/$tok 51 | echo "" 52 | done 53 | perl $CLEAN -ratio 1.5 $tmp/train.tags.$lang.tok $src $tgt $tmp/train.tags.$lang.clean 1 50 54 | for l in $src $tgt; do 55 | perl $LC < $tmp/train.tags.$lang.clean.$l > $tmp/train.tags.$lang.$l 56 | done 57 | 58 | echo "pre-processing valid/test data..." 59 | for l in $src $tgt; do 60 | for o in `ls $orig/$lang/IWSLT14.TED*.$l.xml`; do 61 | fname=${o##*/} 62 | f=$tmp/${fname%.*} 63 | echo $o $f 64 | grep '<seg id' $o | \ 65 | sed -e 's/<seg id="[0-9]*">\s*//g' | \ 66 | sed -e 's/\s*<\/seg>\s*//g' | \ 67 | sed -e "s/\’/\'/g" | \ 68 | # perl $UNESCAPE | \ 69 | perl $TOKENIZER -threads 8 -l $l | \ 70 | perl $LC > $f 71 | echo "" 72 | done 73 | done 74 | 75 | echo "creating train, valid, test..."
76 | for l in $src $tgt; do 77 | awk '{if (NR%23 == 0) print $0; }' $tmp/train.tags.de-en.$l > $prep/valid.de-en.$l 78 | awk '{if (NR%23 != 0) print $0; }' $tmp/train.tags.de-en.$l > $prep/train.de-en.$l 79 | 80 | cat $tmp/IWSLT14.TED.dev2010.de-en.$l \ 81 | $tmp/IWSLT14.TEDX.dev2012.de-en.$l \ 82 | $tmp/IWSLT14.TED.tst2010.de-en.$l \ 83 | $tmp/IWSLT14.TED.tst2011.de-en.$l \ 84 | $tmp/IWSLT14.TED.tst2012.de-en.$l \ 85 | > $prep/test.de-en.$l 86 | done 87 | 88 | -------------------------------------------------------------------------------- /config/IWSLT14/prepare.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | data_dir=data/IWSLT14 4 | mkdir -p ${data_dir} 5 | 6 | config/IWSLT14/prepare-mixer.sh 7 | mv prep/*.{en,de} ${data_dir} 8 | rename s/.de-en// ${data_dir}/* 9 | rename s/valid/dev/ ${data_dir}/* 10 | rm -rf prep orig 11 | 12 | scripts/prepare-data.py ${data_dir}/train de en ${data_dir} --mode vocab --vocab-size 30000 13 | 14 | scripts/bpe/learn_joint_bpe_and_vocab.py --input ${data_dir}/train.{de,en} -s 30000 -o ${data_dir}/bpe.joint.en --write-vocabulary ${data_dir}/bpe-vocab.de ${data_dir}/bpe-vocab.en 15 | cp ${data_dir}/bpe.joint.en ${data_dir}/bpe.joint.de 16 | 17 | cat ${data_dir}/train.{de,en} > ${data_dir}/train.concat 18 | scripts/prepare-data.py ${data_dir}/train concat ${data_dir} --mode vocab --vocab-size 0 --character-level 19 | mv ${data_dir}/vocab.concat ${data_dir}/vocab.char.en 20 | cp ${data_dir}/vocab.char.en ${data_dir}/vocab.char.de 21 | rm ${data_dir}/train.concat 22 | 23 | for ext in de en 24 | do 25 | for corpus in train dev test 26 | do 27 | scripts/bpe/apply_bpe.py -c ${data_dir}/bpe.joint.${ext} --vocabulary ${data_dir}/bpe-vocab.${ext} --vocabulary-threshold 10 < ${data_dir}/${corpus}.${ext} > ${data_dir}/${corpus}.jsub.${ext} 28 | done 29 | done 30 | 31 | cat ${data_dir}/train.jsub.{en,de} > ${data_dir}/train.jsub.concat 32 | scripts/prepare-data.py ${data_dir}/train jsub.en jsub.de ${data_dir} --mode vocab --vocab-size 0 33 | scripts/prepare-data.py ${data_dir}/train.jsub concat ${data_dir} --mode vocab --vocab-size 0 34 | mv ${data_dir}/vocab.concat ${data_dir}/vocab.joint.jsub.en 35 | cp ${data_dir}/vocab.joint.jsub.{en,de} 36 | rm ${data_dir}/train.jsub.concat 37 | 38 | cp ${data_dir}/train.en ${data_dir}/train.char.en 39 | cp ${data_dir}/train.de ${data_dir}/train.char.de 40 | cp ${data_dir}/dev.en ${data_dir}/dev.char.en 41 | cp ${data_dir}/dev.de ${data_dir}/dev.char.de 42 | 43 | wget http://opus.nlpl.eu/download/TED2013/mono/TED2013.en.gz -O ${data_dir}/TED2013.en.gz 44 | #wget http://opus.nlpl.eu/download/OpenSubtitles2018/mono/OpenSubtitles2018.de.gz -O ${data_dir}/OpenSubtitles2018.de.gz 45 | #wget http://opus.nlpl.eu/download/OpenSubtitles2018/mono/OpenSubtitles2018.en.gz -O ${data_dir}/OpenSubtitles2018.en.gz 46 | 47 | function filter { 48 | filename=`mktemp` 49 | cat > ${filename} << EOF 50 | import sys 51 | lines = set(list(open('${data_dir}/dev.$1')) + list(open('${data_dir}/test.$1'))) 52 | for line in sys.stdin: 53 | if line not in lines: 54 | sys.stdout.write(line) 55 | EOF 56 | python3 ${filename} 57 | rm ${filename} 58 | } 59 | 60 | gunzip ${data_dir}/TED2013.en.gz --stdout | scripts/moses/lowercase.perl | filter en > ${data_dir}/TED.en 61 | rm ${data_dir}/TED2013.en.gz 62 | #gunzip ${data_dir}/OpenSubtitles2018.de.gz --stdout | scripts/moses/lowercase.perl | filter de > ${data_dir}/OpenSubtitles.de 63 | #gunzip ${data_dir}/OpenSubtitles2018.en.gz --stdout | 
scripts/moses/lowercase.perl | filter en > ${data_dir}/OpenSubtitles.en 64 | -------------------------------------------------------------------------------- /config/IWSLT14/train-SMT.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | data_dir=data/IWSLT14 6 | model_dir=models/IWSLT14 7 | train_script=scripts/train-moses.sh 8 | 9 | # model_dir data_dir corpus dev_corpus src_ext trg_ext lm_corpus lm_order 10 | ${train_script} ${model_dir}/SMT ${data_dir} train dev de en train 3 11 | ${train_script} ${model_dir}/SMT_subwords ${data_dir} train.jsub dev.jsub de en train.jsub 3 12 | 13 | cat ${data_dir}/{train,TED}.en > ${data_dir}/train+TED.en 14 | ${train_script} ${model_dir}/SMT_LM ${data_dir} train dev de en train+TED 3 15 | cat ${data_dir}/{train,TED}.jsub.en > ${data_dir}/train+TED.jsub.en 16 | ${train_script} ${model_dir}/SMT_LM_subwords ${data_dir} train.jsub dev.jsub de en train+TED.jsub 3 17 | 18 | ${train_script} ${model_dir}/SMT_huge_LM ${data_dir} train dev de en OpenSubtitles 5 19 | 20 | -------------------------------------------------------------------------------- /config/LibriSpeech/ASR.yaml: -------------------------------------------------------------------------------- 1 | label: 'LibriSpeech ASR' 2 | description: "Character-Level Automatic Speech Recognition on LibriSpeech" 3 | 4 | data_dir: data/LibriSpeech 5 | model_dir: models/LibriSpeech/ASR 6 | max_train_size: 20000 7 | 8 | batch_size: 32 9 | weight_scale: null 10 | 11 | steps_per_checkpoint: 1000 12 | steps_per_eval: 1000 13 | max_steps: 500000 14 | score_function: corpus_scores_wer 15 | 16 | cell_size: 256 17 | attn_size: 256 18 | cell_type: LSTM 19 | 20 | encoders: 21 | - name: speech.en 22 | ext: npz 23 | embedding_size: 41 24 | layers: 3 25 | conv_filters: [16, 16] 26 | conv_size: [3, 3] 27 | conv_strides: [2, 2] 28 | conv_activation: null 29 | binary: True 30 | max_len: 1400 31 | input_layers: [256, 128] 32 | bidir_projection: True 33 | final_state: average 34 | train_initial_states: False 35 | input_layer_dropout: 0.2 36 | 37 | decoders: 38 | - name: char.en 39 | conditional_rnn: True 40 | pred_deep_layer: True 41 | character_level: True 42 | use_previous_word: True 43 | embedding_size: 128 44 | max_len: 300 45 | 46 | use_dropout: True 47 | pervasive_dropout: True 48 | attn_dropout: 0.2 49 | rnn_input_dropout: 0.2 50 | initial_state_dropout: 0.2 51 | -------------------------------------------------------------------------------- /config/LibriSpeech/AST.yaml: -------------------------------------------------------------------------------- 1 | label: 'LibriSpeech AST' 2 | description: "Character-Level Automatic Speech Translation on LibriSpeech" 3 | 4 | data_dir: data/LibriSpeech 5 | model_dir: models/LibriSpeech/AST 6 | max_train_size: 20000 7 | train_prefix: train+google 8 | 9 | batch_size: 32 10 | weight_scale: null 11 | 12 | steps_per_checkpoint: 1000 13 | steps_per_eval: 1000 14 | max_steps: 500000 15 | score_function: corpus_scores_bleu 16 | 17 | cell_size: 512 18 | attn_size: 512 19 | cell_type: LSTM 20 | 21 | encoders: 22 | - name: speech.en 23 | ext: npz 24 | embedding_size: 41 25 | layers: 3 26 | cell_size: 256 27 | conv_filters: [16, 16] 28 | conv_size: [3, 3] 29 | conv_strides: [2, 2] 30 | conv_activation: null 31 | binary: True 32 | max_len: 1400 33 | input_layers: [256, 128] 34 | bidir_projection: False 35 | final_state: average 36 | train_initial_states: False 37 | input_layer_dropout: 0.2 38 | 39 | decoders: 40 | - 
name: char.fr 41 | conditional_rnn: True 42 | pred_deep_layer: True 43 | character_level: True 44 | use_previous_word: True 45 | embedding_size: 128 46 | max_len: 300 47 | 48 | use_dropout: True 49 | pervasive_dropout: True 50 | attn_dropout: 0.2 51 | rnn_input_dropout: 0.2 52 | initial_state_dropout: 0.2 53 | -------------------------------------------------------------------------------- /config/LibriSpeech/MT.yaml: -------------------------------------------------------------------------------- 1 | label: 'LibriSpeech MT' 2 | description: "Character-Level Machine Translation on LibriSpeech" 3 | 4 | data_dir: data/LibriSpeech 5 | model_dir: models/LibriSpeech/MT 6 | max_train_size: 20000 7 | train_prefix: train+google 8 | 9 | batch_size: 64 10 | weight_scale: null 11 | 12 | steps_per_checkpoint: 1000 13 | steps_per_eval: 1000 14 | max_steps: 100000 15 | score_function: corpus_scores_bleu 16 | 17 | cell_size: 512 18 | attn_size: 512 19 | cell_type: LSTM 20 | 21 | encoders: 22 | - name: sub.en 23 | embedding_size: 256 24 | max_len: 60 25 | bidir_projection: True 26 | final_state: average 27 | train_initial_states: True 28 | embedding_dropout: 0.2 29 | 30 | decoders: 31 | - name: char.fr 32 | conditional_rnn: True 33 | pred_deep_layer: True 34 | character_level: True 35 | use_previous_word: True 36 | embedding_size: 128 37 | max_len: 400 38 | word_dropout: 0.2 39 | 40 | use_dropout: True 41 | pervasive_dropout: True 42 | attn_dropout: 0.2 43 | rnn_input_dropout: 0.2 44 | rnn_output_dropout: 0.2 45 | initial_state_dropout: 0.2 46 | -------------------------------------------------------------------------------- /config/LibriSpeech/Multi-Task.yaml: -------------------------------------------------------------------------------- 1 | label: 'LibriSpeech Multi-Task' 2 | description: "Multi-Task training of AST, MT and ASR models on LibriSpeech" 3 | 4 | data_dir: data/LibriSpeech 5 | model_dir: models/LibriSpeech/AST_multitask 6 | max_train_size: 10000 7 | 8 | batch_size: 32 9 | weight_scale: null 10 | 11 | steps_per_checkpoint: 500 12 | steps_per_eval: 500 13 | max_steps: 500000 14 | 15 | cell_size: 512 16 | attn_size: 512 17 | cell_type: LSTM 18 | 19 | conv_filters: [16, 16] 20 | conv_size: [3, 3] 21 | conv_strides: [2, 2] 22 | conv_activation: null 23 | final_state: average 24 | train_initial_states: False 25 | bidir_projection: False 26 | input_layers: [256, 128] 27 | 28 | conditional_rnn: True 29 | pred_deep_layer: True 30 | use_previous_word: True 31 | embedding_size: 128 32 | 33 | tasks: 34 | - name: AST 35 | score_function: corpus_scores_bleu 36 | train_prefix: train+google 37 | ratio: 0.6 38 | 39 | encoders: 40 | - name: speech.en 41 | ext: npz 42 | embedding_size: 41 43 | layers: 3 44 | cell_size: 256 45 | binary: True 46 | max_len: 1400 47 | decoders: 48 | - name: char.fr 49 | character_level: True 50 | max_len: 300 51 | 52 | - name: ASR 53 | score_function: corpus_scores_wer 54 | train_prefix: train 55 | ratio: 0.2 56 | 57 | encoders: 58 | - name: speech.en 59 | ext: npz 60 | embedding_size: 41 61 | layers: 3 62 | cell_size: 256 63 | attn_size: 256 64 | binary: True 65 | bidir_projection: True 66 | max_len: 1400 67 | decoders: 68 | - name: char.en 69 | cell_size: 256 70 | character_level: True 71 | max_len: 300 72 | 73 | - name: MT 74 | score_function: corpus_scores_bleu 75 | train_prefix: train+google 76 | ratio: 0.2 77 | 78 | encoders: 79 | - name: sub.en 80 | embedding_size: 256 81 | bidir_projection: True 82 | train_initial_states: True 83 | input_layers: null 84 | conv_filters: 
null 85 | max_len: 60 86 | decoders: 87 | - name: char.fr 88 | character_level: True 89 | max_len: 300 90 | 91 | 92 | use_dropout: True 93 | pervasive_dropout: True 94 | attn_dropout: 0.2 95 | rnn_input_dropout: 0.2 96 | initial_state_dropout: 0.2 97 | input_layer_dropout: 0.2 98 | -------------------------------------------------------------------------------- /config/LibriSpeech/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Augmented LibriSpeech 3 | 4 | The raw corpus can be downloaded [here](https://persyval-platform.univ-grenoble-alpes.fr/DS91/detaildataset). It consists of an automatic alignment of the [LibriSpeech ASR corpus](http://www.openslr.org/12/) (English audio with transcriptions) with [Project Gutenberg](https://www.gutenberg.org/), which distributes public domain e-books in many languages. 5 | The scripts that were used for the alignment are freely available [here](https://github.com/alicank/Translation-Augmented-LibriSpeech-Corpus). 6 | 7 | The pre-processed corpus (with MFCCs) is available [here](https://drive.google.com/open?id=15ZwzXe_FEx-K7yn6ZVksrUc0QWV072Xt). If you want to use it to train new models, you should extract it as `data/LibriSpeech`. Then, you can train a new model using the configuration files inside `config/LibriSpeech`. For example: 8 | 9 | ./seq2seq.sh config/LibriSpeech/AST.yaml --train -v --purge 10 | 11 | If you want to do your own pre-processing, you can use [this corpus](https://drive.google.com/open?id=1n6r-gkTPooK8oEWjllv1i5vO3ZWHkRNe). The audio files are grouped into tar archives for convenience. The scripts `scripts/speech/extract.py` and `scripts/speech/extract-new.py` directly take such a tar archive as input, and output a numpy binary file containing the extracted features. The text files are not pre-processed and should be tokenized and optionally lowercased before training (a short sketch is given at the end of this README). 12 | 13 | ## Trained models 14 | 15 | You can download some pre-trained models on Augmented LibriSpeech [here](https://drive.google.com/open?id=1QUS7VjaaFouBX7HNAl05vzKLzlzkZvcY). 16 | This archive should be extracted inside `models/`. Then, to decode the test set using a model, e.g., `AST.1`, do: 17 | 18 | ./seq2seq.sh models/LibriSpeech/AST.1/config.yaml --decode models/LibriSpeech/data/test.npz 19 | 20 | The directory `models/LibriSpeech/eval-outputs` contains all the outputs of our pre-trained models on the test and dev sets. The `models/LibriSpeech/eval.log` file contains the commands that were used for the evaluation, along with the obtained scores. Each model has a `config.yaml` file that can be used to run it or re-train it. The config files of the most important models are also available inside `config/LibriSpeech/`.
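As mentioned above, here is a rough sketch of this pre-processing step (the paths below are placeholders; the full preparation pipeline is in `config/LibriSpeech/prepare.sh`):

    # extract speech features (MFCCs) from a tar archive of wav files into a numpy archive
    scripts/speech/extract.py raw_data/LibriSpeech/train.tar data/LibriSpeech/train.npz
    # tokenize and lowercase the corresponding text files
    scripts/prepare-data.py raw_data/LibriSpeech/train fr en data/LibriSpeech --lowercase --mode prepare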
21 | -------------------------------------------------------------------------------- /config/LibriSpeech/model-outputs.tar.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alex-berard/seq2seq/1b5c6bf19a39ef27c059811b85a061f20d68ac32/config/LibriSpeech/model-outputs.tar.xz -------------------------------------------------------------------------------- /config/LibriSpeech/prepare-raw.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # first download the Augmented LibriSpeech zip files inside `archive_dir` 4 | archive_dir=raw_data/LibriSpeech/archives 5 | raw_data=raw_data/LibriSpeech 6 | mkdir -p ${raw_data} 7 | 8 | #unzip -q ${archive_dir}/dev.zip -d ${raw_data} 9 | #unzip -q ${archive_dir}/test.zip -d ${raw_data} 10 | #unzip -q ${archive_dir}/train_100h.zip -d ${raw_data} 11 | #unzip -q ${archive_dir}/train_130h_additional.zip -d ${raw_data} 12 | #unzip -q ${archive_dir}/database.zip -d ${raw_data} 13 | 14 | function clean-dash { 15 | perl -pe 's/^(-+)([^\s-])/$1 $2/g' 16 | } 17 | function clean-quotes { 18 | perl -pe "s/\"\s*\"/\"/g" 19 | } 20 | 21 | for corpus in dev test train other 22 | do 23 | cp ${raw_data}/${corpus}/${corpus}.en ${raw_data} 24 | if [ ${corpus} = train ] || [ ${corpus} = other ] 25 | then 26 | cat ${raw_data}/${corpus}/${corpus}.fr | clean-quotes | clean-dash > ${raw_data}/${corpus}.fr 27 | else 28 | cat ${raw_data}/${corpus}/${corpus}.fr | clean-dash > ${raw_data}/${corpus}.fr 29 | fi 30 | cat ${raw_data}/${corpus}/${corpus}_gtranslate.fr | clean-quotes > ${raw_data}/${corpus}.google.fr 31 | 32 | rm -f ${raw_data}/${corpus}.orig.en 33 | 34 | alignments=${raw_data}/${corpus}/alignments.meta 35 | database=${raw_data}/TA-LibriSpeechCorpus.db 36 | var=1 37 | lines=`tail -n+2 ${alignments} | wc -l` 38 | len=`python -c "import math; print(1 + int(math.log10(${lines})))"` 39 | python3 -c "import sqlite3; conn = sqlite3.connect('${database}'); c = conn.cursor(); c.execute('SELECT audio_filename, source_segment FROM alignments'); d = dict(c.fetchall()); print('\n'.join(d[x.split()[-2]].strip() for x in open('${alignments}').readlines()[1:]))" | clean-quotes > ${raw_data}/${corpus}.orig.en 40 | 41 | for filename in `tail -n+2 ${alignments} | cut -f5,5` 42 | do 43 | name=`printf "%0${len}d" ${var}` 44 | #cp ${raw_data}/${corpus}/audiofiles/${filename}.wav ${raw_data}/${corpus}/${name}.wav 45 | ((var++)); 46 | done 47 | 48 | #rm -r ${raw_data}/${corpus}/audiofiles 49 | find raw_data/LibriSpeech/${corpus}/ -maxdepth 1 -name "*.wav" > /tmp/files.txt 50 | tar -cf ${raw_data}/${corpus}.tar -T /tmp/files.txt 51 | #rm -r ${raw_data}/${corpus} 52 | done 53 | 54 | sed -i '1743,1744d' ${raw_data}/test.fr 55 | sed -i '1743,1744d' ${raw_data}/test.en 56 | -------------------------------------------------------------------------------- /config/LibriSpeech/prepare.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | data_dir=data/LibriSpeech 4 | raw_data=raw_data/LibriSpeech 5 | mkdir -p ${data_dir} 6 | 7 | scripts/prepare-data.py ${raw_data}/train fr en google.fr ${data_dir} --lowercase --no-tokenize en \ 8 | --dev-corpus ${raw_data}/dev --test-corpus ${raw_data}/test --normalize-punk fr google.fr --lang fr en fr \ 9 | --mode prepare 10 | 11 | scripts/speech/extract.py ${raw_data}/train.tar ${data_dir}/train.npz 12 | scripts/speech/extract.py ${raw_data}/dev.tar ${data_dir}/dev.npz 13 | 
scripts/speech/extract.py ${raw_data}/test.tar ${data_dir}/test.npz 14 | scripts/speech/extract.py ${raw_data}/other.tar ${data_dir}/other.npz 15 | 16 | cat ${data_dir}/{train,train.google}.fr > ${data_dir}/train+google.fr 17 | cat ${data_dir}/{train,train}.en > ${data_dir}/train+google.en 18 | scripts/speech/cat.py ${data_dir}/{train,train}.npz ${data_dir}/train+google.npz 19 | scripts/speech/shuf.py ${data_dir}/train+google.npz --input-txt ${data_dir}/train+google.{fr,en} 20 | scripts/speech/shuf.py ${data_dir}/train.npz --input-txt ${data_dir}/train.{fr,en,google.fr} 21 | 22 | # prepare BPE 23 | scripts/bpe/learn_bpe.py -i ${data_dir}/train.en -s 30000 -o ${data_dir}/bpe.en 24 | 25 | # apply BPE 26 | scripts/prepare-data.py ${data_dir}/train en ${data_dir} --no-tokenize --subwords --bpe-path ${data_dir}/bpe \ 27 | --output train.sub --dev-prefix dev.sub --test-prefix test.sub --vocab-prefix vocab.sub \ 28 | --dev-corpus ${data_dir}/dev --test-corpus ${data_dir}/test 29 | 30 | scripts/prepare-data.py ${data_dir}/train+google en ${data_dir} --no-tokenize --subwords \ 31 | --bpe-path ${data_dir}/bpe --output train+google.sub --mode prepare 32 | 33 | # prepare word-level vocabs 34 | scripts/prepare-data.py ${data_dir}/train+google fr en ${data_dir} --mode vocab --vocab-size 30000 35 | 36 | # prepare character-level vocabs 37 | scripts/prepare-data.py ${data_dir}/train+google fr en ${data_dir} --mode vocab --character-level \ 38 | --vocab-size 200 --vocab-prefix vocab.char 39 | 40 | for corpus in train+google train dev test 41 | do 42 | cp ${data_dir}/${corpus}.fr ${data_dir}/${corpus}.char.fr 43 | cp ${data_dir}/${corpus}.en ${data_dir}/${corpus}.char.en 44 | done 45 | 46 | -------------------------------------------------------------------------------- /config/WMT14/RNNsearch-Adam.yaml: -------------------------------------------------------------------------------- 1 | label: "RNNsearch + Adam" 2 | description: "Same config as RNNsearch (Bahdanau 2014), with Adam instead of AdaDelta" 3 | 4 | cell_size: 1000 5 | attn_size: 1000 6 | embedding_size: 620 7 | cell_type: GRU 8 | 9 | data_dir: data/WMT14 10 | max_len: 50 11 | model_dir: models/WMT14/RNNsearch_Adam 12 | max_train_size: 1000000 13 | 14 | steps_per_checkpoint: 10000 15 | steps_per_eval: 10000 16 | keep_best: 1 17 | max_to_keep: 1 18 | score_function: corpus_bleu 19 | 20 | optimizer: adam 21 | learning_rate: 0.0002 22 | batch_size: 80 23 | batch_mode: standard 24 | shuffle: False 25 | read_ahead: 20 26 | max_gradient_norm: 1.0 27 | max_epochs: 10 28 | learning_rate_decay_factor: 0.5 29 | decay_every_n_epoch: 0.5 30 | 31 | attention_type: global 32 | final_state: last 33 | 34 | weight_scale: 0.01 35 | 36 | encoders: 37 | - name: en 38 | train_initial_states: False 39 | 40 | decoders: 41 | - name: fr 42 | 43 | generate_first: False 44 | orthogonal_init: True 45 | -------------------------------------------------------------------------------- /config/WMT14/RNNsearch-BPE.yaml: -------------------------------------------------------------------------------- 1 | label: "RNNsearch + BPE" 2 | description: "Same config as RNNsearch (Bahdanau 2014), with Adam, a Cond-GRU decoder and BPE units" 3 | 4 | cell_size: 1000 5 | attn_size: 1000 6 | embedding_size: 620 7 | cell_type: GRU 8 | 9 | data_dir: data/WMT14 10 | max_len: 60 11 | model_dir: models/WMT14/RNNsearch_BPE 12 | max_train_size: 1000000 13 | 14 | steps_per_checkpoint: 10000 15 | steps_per_eval: 10000 16 | keep_best: 1 17 | max_to_keep: 1 18 | score_function: corpus_bleu 19 | 20 | 
optimizer: adam 21 | learning_rate: 0.0002 22 | batch_size: 80 23 | batch_mode: standard 24 | read_ahead: 20 25 | max_gradient_norm: 1.0 26 | max_epochs: 5 27 | learning_rate_decay_factor: 0.5 28 | decay_every_n_epoch: 0.5 29 | 30 | attention_type: global 31 | final_state: last_both 32 | 33 | weight_scale: 0.01 34 | 35 | encoders: 36 | - name: joint 37 | ext: jsub.en 38 | 39 | decoders: 40 | - name: joint 41 | ext: jsub.fr 42 | conditional_rnn: True 43 | pred_deep_layer: True 44 | 45 | orthogonal_init: True 46 | -------------------------------------------------------------------------------- /config/WMT14/RNNsearch.yaml: -------------------------------------------------------------------------------- 1 | label: "RNNsearch" 2 | description: "Baseline WMT14 model, exact same config as Bahdanau et al. 2014" 3 | 4 | cell_size: 1000 5 | attn_size: 1000 6 | embedding_size: 620 7 | cell_type: GRU 8 | 9 | data_dir: data/WMT14 10 | max_len: 50 11 | model_dir: models/WMT14/RNNsearch 12 | max_train_size: 1000000 13 | 14 | steps_per_checkpoint: 10000 15 | steps_per_eval: 10000 16 | keep_best: 1 17 | max_to_keep: 1 18 | score_function: corpus_bleu 19 | 20 | optimizer: adadelta 21 | learning_rate: 1.0 22 | batch_size: 80 23 | batch_mode: standard 24 | shuffle: False 25 | read_ahead: 20 26 | max_gradient_norm: 1.0 27 | max_epochs: 10 28 | 29 | attention_type: global 30 | final_state: last 31 | 32 | weight_scale: 0.01 33 | 34 | encoders: 35 | - name: en 36 | train_initial_states: False 37 | 38 | decoders: 39 | - name: fr 40 | 41 | generate_first: False 42 | orthogonal_init: True 43 | -------------------------------------------------------------------------------- /config/WMT14/download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | raw_data=raw_data/WMT14 4 | 5 | mkdir -p ${raw_data} 6 | cur_dir=`pwd` 7 | cd ${raw_data} 8 | 9 | wget "http://www-lium.univ-lemans.fr/~schwenk/nnmt-shared-task/data/bitexts.tgz" 10 | tar xzf bitexts.tgz 11 | gunzip bitexts.selected/* 12 | cat bitexts.selected/{ep7_pc45,nc9,dev08_11,crawl,ccb2_pc30,un2000_pc34}.en > WMT14.fr-en.en 13 | cat bitexts.selected/{ep7_pc45,nc9,dev08_11,crawl,ccb2_pc30,un2000_pc34}.fr > WMT14.fr-en.fr 14 | rm -rf bitexts.selected 15 | 16 | wget "http://www-lium.univ-lemans.fr/~schwenk/nnmt-shared-task/data/dev+test.tgz" 17 | tar xzf dev+test.tgz 18 | rename s@dev/ntst1213@ntst1213.fr-en@ dev/* 19 | rename s@dev/ntst14@ntst14.fr-en@ dev/* 20 | rmdir dev 21 | 22 | rm bitexts.tgz dev+test.tgz 23 | 24 | cd ${cur_dir} 25 | -------------------------------------------------------------------------------- /config/WMT14/prepare-lexicon.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | data_dir=data/WMT14 6 | 7 | rm -rf fast_align-master 8 | 9 | wget https://github.com/clab/fast_align/archive/master.zip 10 | unzip master.zip 11 | rm master.zip 12 | cd fast_align-master 13 | mkdir build 14 | cd build 15 | cmake .. 16 | make 17 | cd ../.. 
18 | 19 | corpus=train 20 | fast_align=fast_align-master/build 21 | 22 | scripts/join.py ${data_dir}/${corpus}.{en,fr} > ${data_dir}/${corpus}.en-fr 23 | ${fast_align}/fast_align -i ${data_dir}/${corpus}.en-fr -d -o -v > ${data_dir}/${corpus}.forward.align 24 | ${fast_align}/fast_align -i ${data_dir}/${corpus}.en-fr -d -o -v -r > ${data_dir}/${corpus}.reverse.align 25 | ${fast_align}/atools -i ${data_dir}/${corpus}.forward.align -j ${data_dir}/${corpus}.reverse.align -c grow-diag-final-and > ${data_dir}/${corpus}.align 26 | 27 | scripts/extract-lexicon.py ${data_dir}/${corpus}.{en,fr,align} > ${data_dir}/${corpus}.lexicon 28 | python3 -c "print('\n'.join(line.rstrip() for line in open('${data_dir}/${corpus}.lexicon') if not line[0].isupper() and not line.split()[0] == line.split()[1]))" > ${data_dir}/${corpus}.lexicon.purged 29 | 30 | rm -rf fast_align-master 31 | rm ${data_dir}/${corpus}.en-fr 32 | rm ${data_dir}/*.align 33 | -------------------------------------------------------------------------------- /config/WMT14/prepare.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Filtered WMT14 data, available on http://www-lium.univ-lemans.fr/~schwenk/nnmt-shared-task/ 4 | 5 | raw_data=raw_data/WMT14 6 | data_dir=data/WMT14 7 | 8 | rm -rf ${data_dir} 9 | mkdir -p ${data_dir} 10 | 11 | scripts/prepare-data.py ${raw_data}/WMT14.fr-en fr en ${data_dir} --no-tokenize \ 12 | --dev-corpus ${raw_data}/ntst1213.fr-en \ 13 | --test-corpus ${raw_data}/ntst14.fr-en \ 14 | --vocab-size 30000 --shuffle --seed 1234 15 | 16 | cat ${raw_data}/WMT14.fr-en.{fr,en} > ${data_dir}/train.concat 17 | scripts/bpe/learn_bpe.py -i ${data_dir}/train.concat -o ${data_dir}/bpe.joint -s 30000 18 | cp ${data_dir}/bpe.joint ${data_dir}/bpe.joint.fr 19 | cp ${data_dir}/bpe.joint ${data_dir}/bpe.joint.en 20 | rm ${data_dir}/train.concat 21 | 22 | scripts/prepare-data.py ${raw_data}/WMT14.fr-en fr en ${data_dir} --no-tokenize \ 23 | --subwords --bpe-path ${data_dir}/bpe.joint \ 24 | --dev-corpus ${raw_data}/ntst1213.fr-en --dev-prefix dev.jsub \ 25 | --test-corpus ${raw_data}/ntst14.fr-en --test-prefix test.jsub \ 26 | --shuffle --seed 1234 --output train.jsub --mode prepare 27 | 28 | cat ${data_dir}/train.jsub.{fr,en} > ${data_dir}/train.concat.jsub 29 | scripts/prepare-data.py ${data_dir}/train concat.jsub ${data_dir} --vocab-size 0 --mode vocab 30 | cp ${data_dir}/vocab.concat.jsub ${data_dir}/vocab.jsub.fr 31 | cp ${data_dir}/vocab.concat.jsub ${data_dir}/vocab.jsub.en 32 | rm ${data_dir}/*.concat.* 33 | 34 | -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | root_dir=`pwd` 4 | 5 | /usr/bin/env pip3 install tensorflow-gpu python-dateutil pyyaml matplotlib --user --upgrade 6 | 7 | cat >>~/.bashrc << EOL 8 | export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/cuda/lib64 9 | alias get-best-score=${root_dir}/scripts/get-best-score.py 10 | alias plot-loss=${root_dir}/scripts/plot-loss.py 11 | alias multi-print=${root_dir}/scripts/multi-print.py 12 | alias copy-model=${root_dir}/scripts/copy-model.py 13 | EOL 14 | -------------------------------------------------------------------------------- /run-tests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import subprocess 3 | import shlex 4 | import re 5 | import os 6 | import argparse 7 | 8 | 
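# ANSI escape codes used to colorize test results in the terminal (green for success, red for failure)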
OKGREEN = '\033[92m' 9 | FAIL = '\033[91m' 10 | ENDC = '\033[0m' 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--no-gpu', action='store_true') 14 | parser.add_argument('--gpu-id', type=int) 15 | parser.add_argument('dirs', nargs='*') 16 | args = parser.parse_args() 17 | 18 | extra_params = [] 19 | if args.no_gpu: 20 | extra_params.append('--no-gpu') 21 | if args.gpu_id is not None: 22 | extra_params += ['--gpu-id', str(args.gpu_id)] 23 | 24 | def failure(message): 25 | print('{}failure: {}{}'.format(FAIL, message, ENDC)) 26 | def success(message): 27 | print('{}success: {}{}'.format(OKGREEN, message, ENDC)) 28 | 29 | log_file = os.path.join('tests', 'log.txt') 30 | 31 | try: 32 | os.remove(log_file) 33 | except FileNotFoundError: 34 | pass 35 | 36 | 37 | def get_best_score(log_file): 38 | scores = [] 39 | with open(log_file) as f: 40 | for line in f: 41 | score_ = re.search(r' (score|bleu|ter|loss|cer|wer|bleu1)=(.*?) ', line + ' ') 42 | 43 | if score_: 44 | scores.append(float(score_.group(2))) 45 | 46 | if len(scores) == 0: 47 | return None 48 | elif len(scores) == 1: 49 | return scores[0] 50 | elif scores[0] <= scores[-1]: 51 | return max(scores) 52 | else: 53 | return min(scores) 54 | 55 | 56 | def run(dir_, score=None): 57 | config_file = os.path.join(dir_, 'config.yaml') 58 | log_file_ = os.path.join(dir_, 'log.txt') 59 | name = os.path.basename(dir_) 60 | 61 | if score is None: 62 | try: 63 | score = get_best_score(log_file_) 64 | except: 65 | pass 66 | 67 | print('Running {}'.format(name)) 68 | 69 | try: 70 | output = subprocess.check_output(['./seq2seq.sh', config_file, '--eval'] + extra_params, 71 | stderr=subprocess.STDOUT).decode() 72 | except subprocess.CalledProcessError as e: 73 | output = e.output.decode() 74 | 75 | scores = output.strip().split('\n')[-1] + ' ' 76 | score_ = re.search(r' (score|bleu|ter|loss|cer|wer|bleu1)=(.*?) ', scores) 77 | 78 | with open(log_file, 'a') as f: 79 | f.write(output) 80 | 81 | if not score_: 82 | failure('unable to run test (see log file)') 83 | else: 84 | score_ = float(score_.group(2)) 85 | if score is None: 86 | success('obtained {}'.format(score_)) 87 | elif score_ == score: 88 | success('scores matching ({})'.format(score_)) 89 | else: 90 | failure('obtained {}, expected {}'.format(score_, score)) 91 | 92 | 93 | if not args.dirs: 94 | dirs = [os.path.join('tests', name) for name in os.listdir('tests')] 95 | else: 96 | dirs = args.dirs 97 | 98 | for path in dirs: 99 | if os.path.isdir(path): 100 | run(path) 101 | 102 | -------------------------------------------------------------------------------- /scripts/bpe/bpe_toy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | """Use byte pair encoding (BPE) to learn a variable-length encoding of the vocabulary in a text. 6 | Unlike the original BPE, it does not compress the plain text, but can be used to reduce the vocabulary 7 | of a text to a configurable number of symbols, with only a small increase in the number of tokens. 8 | This is an (inefficient) toy implementation that shows the algorithm. For processing large datasets, 9 | indexing and incremental updates can be used to speed up the implementation (see learn_bpe.py). 10 | 11 | Reference: 12 | Rico Sennrich, Barry Haddow and Alexandra Birch (2016). Neural Machine Translation of Rare Words with Subword Units. 
13 | Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany. 14 | """ 15 | 16 | 17 | import re 18 | import sys 19 | import collections 20 | 21 | def get_stats(vocab): 22 | pairs = collections.defaultdict(int) 23 | for word, freq in vocab.items(): 24 | symbols = word.split() 25 | for i in range(len(symbols)-1): 26 | pairs[symbols[i],symbols[i+1]] += freq 27 | return pairs 28 | 29 | def merge_vocab(pair, v_in): 30 | v_out = {} 31 | bigram_pattern = re.escape(' '.join(pair)) 32 | p = re.compile(r'(?<!\S)' + bigram_pattern + r'(?!\S)') 33 | for word in v_in: 34 | w_out = p.sub(''.join(pair), word) 35 | v_out[w_out] = v_in[word] 36 | return v_out 37 | 38 | vocab = {'l o w</w>' : 5, 'l o w e r</w>' : 2, 39 | 'n e w e s t</w>' : 6, 'w i d e s t</w>' : 3} 40 | num_merges = 15 41 | for i in range(num_merges): 42 | pairs = get_stats(vocab) 43 | try: 44 | best = max(pairs, key=pairs.get) 45 | except ValueError: 46 | break 47 | if pairs[best] < 2: 48 | sys.stderr.write('no pair has frequency > 1. Stopping\n') 49 | break 50 | vocab = merge_vocab(best, vocab) 51 | print(best) 52 | -------------------------------------------------------------------------------- /scripts/bpe/chrF.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | """Compute chrF3 for machine translation evaluation 6 | 7 | Reference: 8 | Maja Popović (2015). chrF: character n-gram F-score for automatic MT evaluation. In Proceedings of the Tenth Workshop on Statistical Machine Translationn, pages 392–395, Lisbon, Portugal. 9 | """ 10 | 11 | from __future__ import print_function, unicode_literals, division 12 | import sys 13 | import codecs 14 | import io 15 | import argparse 16 | from collections import defaultdict 17 | 18 | # hack for python2/3 compatibility 19 | from io import open 20 | argparse.open = open 21 | 22 | # python 2/3 compatibility 23 | if sys.version_info < (3, 0): 24 | sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) 25 | sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) 26 | sys.stdin = codecs.getreader('UTF-8')(sys.stdin) 27 | 28 | 29 | def create_parser(): 30 | parser = argparse.ArgumentParser( 31 | formatter_class=argparse.RawDescriptionHelpFormatter, 32 | description="learn BPE-based word segmentation") 33 | 34 | parser.add_argument( 35 | '--ref', '-r', type=argparse.FileType('r'), required=True, 36 | metavar='PATH', 37 | help="Reference file") 38 | parser.add_argument( 39 | '--hyp', type=argparse.FileType('r'), metavar='PATH', 40 | default=sys.stdin, 41 | help="Hypothesis file (default: stdin).") 42 | parser.add_argument( 43 | '--beta', '-b', type=float, default=3, 44 | metavar='FLOAT', 45 | help="beta parameter (default: '%(default)s')") 46 | parser.add_argument( 47 | '--ngram', '-n', type=int, default=6, 48 | metavar='INT', 49 | help="ngram order (default: '%(default)s')") 50 | parser.add_argument( 51 | '--space', '-s', action='store_true', 52 | help="take spaces into account (default: '%(default)s')") 53 | parser.add_argument( 54 | '--precision', action='store_true', 55 | help="report precision (default: '%(default)s')") 56 | parser.add_argument( 57 | '--recall', action='store_true', 58 | help="report recall (default: '%(default)s')") 59 | 60 | return parser 61 | 62 | def extract_ngrams(words, max_length=4, spaces=False): 63 | 64 | if not spaces: 65 | words = ''.join(words.split()) 66 | else: 67 | words = words.strip() 68 | 69 | 
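    # results[n] maps each character (n+1)-gram (a tuple of characters) to its count in this sentence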
results = defaultdict(lambda: defaultdict(int)) 70 | for length in range(max_length): 71 | for start_pos in range(len(words)): 72 | end_pos = start_pos + length + 1 73 | if end_pos <= len(words): 74 | results[length][tuple(words[start_pos: end_pos])] += 1 75 | return results 76 | 77 | 78 | def get_correct(ngrams_ref, ngrams_test, correct, total): 79 | 80 | for rank in ngrams_test: 81 | for chain in ngrams_test[rank]: 82 | total[rank] += ngrams_test[rank][chain] 83 | if chain in ngrams_ref[rank]: 84 | correct[rank] += min(ngrams_test[rank][chain], ngrams_ref[rank][chain]) 85 | 86 | return correct, total 87 | 88 | 89 | def f1(correct, total_hyp, total_ref, max_length, beta=3, smooth=0): 90 | 91 | precision = 0 92 | recall = 0 93 | 94 | for i in range(max_length): 95 | if total_hyp[i] + smooth and total_ref[i] + smooth: 96 | precision += (correct[i] + smooth) / (total_hyp[i] + smooth) 97 | recall += (correct[i] + smooth) / (total_ref[i] + smooth) 98 | 99 | precision /= max_length 100 | recall /= max_length 101 | 102 | return (1 + beta**2) * (precision*recall) / ((beta**2 * precision) + recall), precision, recall 103 | 104 | def main(args): 105 | 106 | correct = [0]*args.ngram 107 | total = [0]*args.ngram 108 | total_ref = [0]*args.ngram 109 | for line in args.ref: 110 | line2 = args.hyp.readline() 111 | 112 | ngrams_ref = extract_ngrams(line, max_length=args.ngram, spaces=args.space) 113 | ngrams_test = extract_ngrams(line2, max_length=args.ngram, spaces=args.space) 114 | 115 | get_correct(ngrams_ref, ngrams_test, correct, total) 116 | 117 | for rank in ngrams_ref: 118 | for chain in ngrams_ref[rank]: 119 | total_ref[rank] += ngrams_ref[rank][chain] 120 | 121 | chrf, precision, recall = f1(correct, total, total_ref, args.ngram, args.beta) 122 | 123 | print('chrF3: {0:.4f}'.format(chrf)) 124 | if args.precision: 125 | print('chrPrec: {0:.4f}'.format(precision)) 126 | if args.recall: 127 | print('chrRec: {0:.4f}'.format(recall)) 128 | 129 | if __name__ == '__main__': 130 | 131 | parser = create_parser() 132 | args = parser.parse_args() 133 | 134 | main(args) 135 | -------------------------------------------------------------------------------- /scripts/bpe/concat-bpe.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument('vocab') 6 | parser.add_argument('bpe') 7 | 8 | 9 | def build_vocab(bpe_pairs): 10 | vocab = set() 11 | for a, b in bpe_pairs: 12 | words = [a, b, a + b] 13 | for word in words: 14 | if word.endswith('</w>'): 15 | vocab.add(word[:-4]) 16 | else: 17 | vocab.add(word + '@@') 18 | vocab.add(word) 19 | return vocab 20 | 21 | 22 | if __name__ == '__main__': 23 | args = parser.parse_args() 24 | 25 | with open(args.bpe) as bpe_file, open(args.vocab) as vocab_file: 26 | bpe_pairs = [line.split() for line in bpe_file] 27 | vocab = [line.strip() for line in vocab_file] 28 | 29 | bpe_vocab = build_vocab(bpe_pairs) 30 | 31 | for w in vocab: 32 | print(w) 33 | 34 | vocab = set(vocab) 35 | for w in bpe_vocab: 36 | if w not in vocab: 37 | print(w) 38 | -------------------------------------------------------------------------------- /scripts/bpe/get_vocab.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | from __future__ import print_function 3 | import sys 4 | from collections import Counter 5 | 6 | c = Counter() 7 | 8 | for line in sys.stdin: 9 | for word in line.split(): 10 | c[word] += 1 11 | 12 | for key,f in sorted(c.items(), key=lambda x: x[1], reverse=True): 13 | print(key+" "+ str(f)) 14 | -------------------------------------------------------------------------------- /scripts/bpe/learn_joint_bpe_and_vocab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | """Use byte pair encoding (BPE) to learn a variable-length encoding of the vocabulary in a text. 6 | This script learns BPE jointly on a concatenation of a list of texts (typically the source and target side of a parallel corpus, 7 | applies the learned operation to each and (optionally) returns the resulting vocabulary of each text. 8 | The vocabulary can be used in apply_bpe.py to avoid producing symbols that are rare or OOV in a training text. 9 | 10 | Reference: 11 | Rico Sennrich, Barry Haddow and Alexandra Birch (2016). Neural Machine Translation of Rare Words with Subword Units. 12 | Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany. 13 | """ 14 | 15 | from __future__ import unicode_literals 16 | 17 | import sys 18 | import os 19 | import codecs 20 | import argparse 21 | import tempfile 22 | from collections import Counter 23 | 24 | import learn_bpe 25 | import apply_bpe 26 | 27 | # hack for python2/3 compatibility 28 | from io import open 29 | argparse.open = open 30 | 31 | def create_parser(): 32 | parser = argparse.ArgumentParser( 33 | formatter_class=argparse.RawDescriptionHelpFormatter, 34 | description="learn BPE-based word segmentation") 35 | 36 | parser.add_argument( 37 | '--input', '-i', type=argparse.FileType('r'), required=True, nargs = '+', 38 | metavar='PATH', 39 | help="Input texts (multiple allowed).") 40 | parser.add_argument( 41 | '--output', '-o', type=argparse.FileType('w'), required=True, 42 | metavar='PATH', 43 | help="Output file for BPE codes.") 44 | parser.add_argument( 45 | '--symbols', '-s', type=int, default=10000, 46 | help="Create this many new symbols (each representing a character n-gram) (default: %(default)s))") 47 | parser.add_argument( 48 | '--separator', type=str, default='@@', metavar='STR', 49 | help="Separator between non-final subword units (default: '%(default)s'))") 50 | parser.add_argument( 51 | '--write-vocabulary', type=argparse.FileType('w'), nargs = '+', default=None, 52 | metavar='PATH', dest='vocab', 53 | help='Write to these vocabulary files after applying BPE. One per input text. 
Used for filtering in apply_bpe.py') 54 | parser.add_argument( 55 | '--min-frequency', type=int, default=2, metavar='FREQ', 56 | help='Stop if no symbol pair has frequency >= FREQ (default: %(default)s))') 57 | parser.add_argument( 58 | '--verbose', '-v', action="store_true", 59 | help="verbose mode.") 60 | 61 | return parser 62 | 63 | 64 | 65 | if __name__ == '__main__': 66 | 67 | # python 2/3 compatibility 68 | if sys.version_info < (3, 0): 69 | sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) 70 | sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) 71 | sys.stdin = codecs.getreader('UTF-8')(sys.stdin) 72 | else: 73 | sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer) 74 | sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer) 75 | sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer) 76 | 77 | parser = create_parser() 78 | args = parser.parse_args() 79 | 80 | if args.vocab and len(args.input) != len(args.vocab): 81 | sys.stderr.write('Error: number of input files and vocabulary files must match\n') 82 | sys.exit(1) 83 | 84 | # read/write files as UTF-8 85 | args.input = [codecs.open(f.name, encoding='UTF-8') for f in args.input] 86 | args.vocab = [codecs.open(f.name, 'w', encoding='UTF-8') for f in args.vocab] 87 | 88 | # get combined vocabulary of all input texts 89 | full_vocab = Counter() 90 | for f in args.input: 91 | full_vocab += learn_bpe.get_vocabulary(f) 92 | f.seek(0) 93 | 94 | vocab_list = ['{0} {1}'.format(key, freq) for (key, freq) in full_vocab.items()] 95 | 96 | # learn BPE on combined vocabulary 97 | with codecs.open(args.output.name, 'w', encoding='UTF-8') as output: 98 | learn_bpe.main(vocab_list, output, args.symbols, args.min_frequency, args.verbose, is_dict=True) 99 | 100 | with codecs.open(args.output.name, encoding='UTF-8') as codes: 101 | bpe = apply_bpe.BPE(codes, separator=args.separator) 102 | 103 | # apply BPE to each training corpus and get vocabulary 104 | for train_file, vocab_file in zip(args.input, args.vocab): 105 | 106 | tmp = tempfile.NamedTemporaryFile(delete=False) 107 | tmp.close() 108 | 109 | tmpout = codecs.open(tmp.name, 'w', encoding='UTF-8') 110 | 111 | train_file.seek(0) 112 | for line in train_file: 113 | tmpout.write(bpe.segment(line).strip()) 114 | tmpout.write('\n') 115 | 116 | tmpout.close() 117 | tmpin = codecs.open(tmp.name, encoding='UTF-8') 118 | 119 | vocab = learn_bpe.get_vocabulary(tmpin) 120 | tmpin.close() 121 | os.remove(tmp.name) 122 | 123 | for key, freq in sorted(vocab.items(), key=lambda x: x[1], reverse=True): 124 | vocab_file.write("{0} {1}\n".format(key, freq)) 125 | vocab_file.close() 126 | -------------------------------------------------------------------------------- /scripts/bpe/segment-char-ngrams.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | from __future__ import unicode_literals, division 6 | 7 | import sys 8 | import codecs 9 | import argparse 10 | 11 | # hack for python2/3 compatibility 12 | from io import open 13 | argparse.open = open 14 | 15 | def create_parser(): 16 | parser = argparse.ArgumentParser( 17 | formatter_class=argparse.RawDescriptionHelpFormatter, 18 | description="segment rare words into character n-grams") 19 | 20 | parser.add_argument( 21 | '--input', '-i', type=argparse.FileType('r'), default=sys.stdin, 22 | metavar='PATH', 23 | help="Input file (default: standard input).") 24 | parser.add_argument( 25 | '--vocab', type=argparse.FileType('r'), 
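# Rough usage note for this script: words that are not among the --shortlist most
# frequent entries of --vocab are cut into fixed-size character n-grams joined by the
# separator, so with -n 2 and the default separator a rare word such as 'lower' should
# come out roughly as 'lo@@ we@@ r'.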
metavar='PATH', 26 | required=True, 27 | help="Vocabulary file.") 28 | parser.add_argument( 29 | '--shortlist', type=int, metavar='INT', default=0, 30 | help="do not segment INT most frequent words in vocabulary (default: '%(default)s')).") 31 | parser.add_argument( 32 | '-n', type=int, metavar='INT', default=2, 33 | help="segment rare words into character n-grams of size INT (default: '%(default)s')).") 34 | parser.add_argument( 35 | '--output', '-o', type=argparse.FileType('w'), default=sys.stdout, 36 | metavar='PATH', 37 | help="Output file (default: standard output)") 38 | parser.add_argument( 39 | '--separator', '-s', type=str, default='@@', metavar='STR', 40 | help="Separator between non-final subword units (default: '%(default)s'))") 41 | 42 | return parser 43 | 44 | 45 | if __name__ == '__main__': 46 | 47 | # python 2/3 compatibility 48 | if sys.version_info < (3, 0): 49 | sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) 50 | sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) 51 | sys.stdin = codecs.getreader('UTF-8')(sys.stdin) 52 | else: 53 | sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer) 54 | sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer) 55 | sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer) 56 | 57 | parser = create_parser() 58 | args = parser.parse_args() 59 | 60 | # read/write files as UTF-8 61 | args.vocab = codecs.open(args.vocab.name, encoding='utf-8') 62 | if args.input.name != '<stdin>': 63 | args.input = codecs.open(args.input.name, encoding='utf-8') 64 | if args.output.name != '<stdout>': 65 | args.output = codecs.open(args.output.name, 'w', encoding='utf-8') 66 | 67 | vocab = [line.split()[0] for line in args.vocab if len(line.split()) == 2] 68 | vocab = dict((y,x) for (x,y) in enumerate(vocab)) 69 | 70 | for line in args.input: 71 | for word in line.split(): 72 | if word not in vocab or vocab[word] > args.shortlist: 73 | i = 0 74 | while i*args.n < len(word): 75 | args.output.write(word[i*args.n:i*args.n+args.n]) 76 | i += 1 77 | if i*args.n < len(word): 78 | args.output.write(args.separator) 79 | args.output.write(' ') 80 | else: 81 | args.output.write(word + ' ') 82 | args.output.write('\n') 83 | -------------------------------------------------------------------------------- /scripts/config-diff.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | function sort_config { 6 | tmp=`mktemp` 7 | grep -Pv "^$|model_dir|data_dir|label|description" $1 | sed "s/^\\s\+-\?\\s*/ /" | grep -Pv "^\\s*#" > ${tmp} 8 | output=`mktemp` 9 | grep -Pv "encoders|decoders|reverse_mapping|^[\s]" ${tmp} | sort > ${output} 10 | echo "decoders:" >> ${output} 11 | sed -n -e "/encoders/,/^[^ ]/p" ${tmp} | grep "^\s\+" | sort >> ${output} 12 | echo "encoders:" >> ${output} 13 | sed -n -e "/decoders/,/^[^ ]/p" ${tmp} | grep "^\s\+" | sort >> ${output} 14 | rm -f ${tmp} 15 | echo ${output} 16 | } 17 | 18 | filename1=`sort_config $1` 19 | filename2=`sort_config $2` 20 | 21 | sdiff -dbBWZs ${filename1} ${filename2} 22 | rm -f ${filename1} ${filename2} 23 | 24 | -------------------------------------------------------------------------------- /scripts/copy-model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import subprocess 4 | import os 5 | import shutil 6 | import re 7 | import sys 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('model_dir') 11 | parser.add_argument('dest_dir') 12 
| parser.add_argument('--move', action='store_true') 13 | parser.add_argument('--copy-data', action='store_true') 14 | parser.add_argument('--compact', action='store_true') 15 | parser.add_argument('--force', action='store_true') 16 | 17 | args = parser.parse_args() 18 | 19 | 20 | if os.path.exists(args.dest_dir): 21 | if args.force and os.path.isdir(args.dest_dir): 22 | shutil.rmtree(args.dest_dir) 23 | else: 24 | raise Exception 25 | if not os.path.isdir(args.model_dir): 26 | raise Exception 27 | 28 | config_dir = os.path.realpath(args.dest_dir) 29 | root_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 30 | if config_dir.startswith(root_dir): 31 | config_dir = config_dir[len(root_dir):] 32 | else: 33 | config_dir = args.dest_dir 34 | 35 | if args.compact: 36 | os.makedirs(os.path.join(args.dest_dir, 'checkpoints')) 37 | 38 | files = ['config.yaml', 'default.yaml', 'log.txt', 'code.tar.gz'] 39 | dirs = ['data'] 40 | 41 | for filename in files: 42 | shutil.copy(os.path.join(args.model_dir, filename), args.dest_dir) 43 | for dirname in dirs: 44 | try: 45 | shutil.copytree(os.path.join(args.model_dir, dirname), os.path.join(args.dest_dir, dirname)) 46 | except: 47 | pass 48 | 49 | checkpoint_dir = os.path.join(args.model_dir, 'checkpoints') 50 | for filename in os.listdir(checkpoint_dir): 51 | if filename.startswith('best.') or filename.startswith('average.') or filename in ('vars.pkl', 'scores.txt'): 52 | shutil.copy(os.path.join(checkpoint_dir, filename), 53 | os.path.join(args.dest_dir, 'checkpoints')) 54 | 55 | if args.move: # delete 56 | shutil.rmtree(args.model_dir) 57 | elif args.move: 58 | shutil.move(args.model_dir, args.dest_dir) 59 | else: 60 | shutil.copytree(args.model_dir, args.dest_dir) 61 | 62 | 63 | config_filename = os.path.join(args.dest_dir, 'config.yaml') 64 | with open(config_filename) as f: 65 | content = f.read() 66 | 67 | content = re.sub(r'model_dir:.*?\n', 'model_dir: {}\n'.format(args.dest_dir), content, flags=re.MULTILINE) 68 | with open(config_filename, 'w') as f: 69 | f.write(content) 70 | 71 | if args.copy_data: 72 | data_dir = re.search(r'data_dir:\s*(.*)\s*\n', content, flags=re.MULTILINE).group(1) 73 | 74 | content = re.sub(r'data_dir:.*?\n', 'data_dir: {}/data\n'.format(args.dest_dir), content, flags=re.MULTILINE) 75 | with open(config_filename, 'w') as f: 76 | f.write(content) 77 | 78 | for filename in os.listdir(data_dir): 79 | if filename.startswith('dev') or filename.startswith('test'): 80 | shutil.copy(os.path.join(data_dir, filename), os.path.join(args.dest_dir, 'data', filename)) 81 | -------------------------------------------------------------------------------- /scripts/coverage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | from collections import Counter 5 | from itertools import starmap 6 | 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('filename') 9 | parser.add_argument('vocab') 10 | 11 | 12 | if __name__ == '__main__': 13 | args = parser.parse_args() 14 | with open(args.filename) as f, open(args.vocab) as vocab_file: 15 | vocab = set(line.strip() for line in vocab_file) 16 | 17 | true_vocab = Counter(w for line in f for w in line.split()) 18 | 19 | unk_words = Counter({w: c for w, c in true_vocab.items() if w not in vocab}) 20 | 21 | print('Unknown words:') 22 | print('\n'.join(starmap(' {:20} {}'.format, unk_words.most_common()[::-1]))) 23 | 24 | print('{:22} {} ({:.2f}%)'.format('Unknown words:', len(unk_words), 
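# Worked example of the statistics printed here: for a corpus whose token counts are
# {'a': 3, 'b': 1} and a vocabulary file containing only 'a', this line reports 1
# unknown word type out of 2 (50.00%), and the 'Total count' line below reports 1
# unknown token out of 4 (25.00%).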
100 * len(unk_words) / len(true_vocab))) 25 | 26 | total_unk_words = sum(unk_words.values()) 27 | total_words = sum(true_vocab.values()) 28 | print('{:22} {} ({:.2f}%)'.format('Total count:', total_unk_words, 100 * total_unk_words / total_words)) 29 | -------------------------------------------------------------------------------- /scripts/decode-moses.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | if [[ $# -lt 4 ]] 6 | then 7 | echo "wrong number of arguments supplied: $#" 8 | exit 0 9 | fi 10 | 11 | if [ -z ${MOSES} ] 12 | then 13 | echo "variable MOSES undefined" 14 | exit 0 15 | fi 16 | 17 | config_file=`readlink -f $1` 18 | temp_dir=`readlink -f $2` 19 | filename=`readlink -f $3` 20 | output_filename=$4 21 | cores=`lscpu | grep -Po "^(CPU\(s\)|Processeur\(s\)).?:\s+\K\d+$"` 22 | 23 | if [ -d "${temp_dir}" ] 24 | then 25 | echo "directory ${temp_dir} already exists" 26 | exit 0 27 | fi 28 | 29 | mkdir -p ${temp_dir} 30 | printf "started: "; date 31 | ${MOSES}/scripts/training/filter-model-given-input.pl ${temp_dir}/model ${config_file} ${filename} >/dev/null 2>/dev/null 32 | cat ${filename} | sed "s/|//g" | ${MOSES}/bin/moses -f ${temp_dir}/model/moses.ini -threads ${cores} > ${output_filename} 2>/dev/null 33 | rm -rf ${temp_dir} 34 | printf "finished: "; date 35 | 36 | -------------------------------------------------------------------------------- /scripts/extract-lexicon.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import argparse 4 | from collections import defaultdict, OrderedDict 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('source_file') 8 | parser.add_argument('target_file') 9 | parser.add_argument('align_file') 10 | 11 | args = parser.parse_args() 12 | 13 | src_vocab = OrderedDict() 14 | trg_vocab = OrderedDict() 15 | 16 | counts = defaultdict(dict) 17 | 18 | with open(args.source_file) as src_file, open(args.target_file) as trg_file, open(args.align_file) as align_file: 19 | for src, trg, align in zip(src_file, trg_file, align_file): 20 | src = src.split() 21 | trg = trg.split() 22 | align = align.split() 23 | for i, j in map(lambda p: map(int, p.split('-')), align): 24 | src_ = src[i] 25 | trg_ = trg[j] 26 | 27 | src_id = src_vocab.setdefault(src_, len(src_vocab)) 28 | trg_id = trg_vocab.setdefault(trg_, len(trg_vocab)) 29 | 30 | #src_counts[src_id] = src_counts.get(src_id, 0) + 1 31 | #trg_counts[trg_id] = trg_counts.get(trg_id, 0) + 1 32 | #pair_counts((src_id, trg_id)) = pair_counts.get((src_id, trg_id), 0) + 1 33 | 34 | counts[src_id][trg_id] = counts[src_id].get(trg_id, 0) + 1 35 | 36 | src_vocab = list(src_vocab.keys()) 37 | trg_vocab = list(trg_vocab.keys()) 38 | 39 | for source, counts_ in counts.items(): 40 | target = max(counts_.keys(), key=lambda word: counts_[word]) 41 | source = src_vocab[source] 42 | target = trg_vocab[target] 43 | print(source, target) 44 | -------------------------------------------------------------------------------- /scripts/join.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import argparse 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument('source_file') 6 | parser.add_argument('target_file') 7 | parser.add_argument('-s', '--separator', default='|||') 8 | 9 | args = parser.parse_args() 10 | 11 | with open(args.source_file) as src_file, open(args.target_file) as trg_file: 12 | for 
src, trg in zip(src_file, trg_file): 13 | line = ' '.join([src.rstrip(), args.separator, trg.rstrip()]) 14 | print(line) 15 | 16 | -------------------------------------------------------------------------------- /scripts/moses/clean-corpus-n.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 5 | 6 | # $Id: clean-corpus-n.perl 3633 2010-10-21 09:49:27Z phkoehn $ 7 | use warnings; 8 | use strict; 9 | use Getopt::Long; 10 | my $help; 11 | my $lc = 0; # lowercase the corpus? 12 | my $ignore_ratio = 0; 13 | my $ignore_xml = 0; 14 | my $enc = "utf8"; # encoding of the input and output files 15 | # set to anything else you wish, but I have not tested it yet 16 | my $max_word_length = 1000; # any segment with a word (or factor) exceeding this length in chars 17 | # is discarded; motivated by symal.cpp, which has its own such parameter (hardcoded to 1000) 18 | # and crashes if it encounters a word that exceeds it 19 | my $ratio = 9; 20 | 21 | GetOptions( 22 | "help" => \$help, 23 | "lowercase|lc" => \$lc, 24 | "encoding=s" => \$enc, 25 | "ratio=f" => \$ratio, 26 | "ignore-ratio" => \$ignore_ratio, 27 | "ignore-xml" => \$ignore_xml, 28 | "max-word-length|mwl=s" => \$max_word_length 29 | ) or exit(1); 30 | 31 | if (scalar(@ARGV) < 6 || $help) { 32 | print "syntax: clean-corpus-n.perl [-ratio n] corpus l1 l2 clean-corpus min max [lines retained file]\n"; 33 | exit; 34 | } 35 | 36 | my $corpus = $ARGV[0]; 37 | my $l1 = $ARGV[1]; 38 | my $l2 = $ARGV[2]; 39 | my $out = $ARGV[3]; 40 | my $min = $ARGV[4]; 41 | my $max = $ARGV[5]; 42 | 43 | my $linesRetainedFile = ""; 44 | if (scalar(@ARGV) > 6) { 45 | $linesRetainedFile = $ARGV[6]; 46 | open(LINES_RETAINED,">$linesRetainedFile") or die "Can't write $linesRetainedFile"; 47 | } 48 | 49 | print STDERR "clean-corpus.perl: processing $corpus.$l1 & .$l2 to $out, cutoff $min-$max, ratio $ratio\n"; 50 | 51 | my $opn = undef; 52 | my $l1input = "$corpus.$l1"; 53 | if (-e $l1input) { 54 | $opn = $l1input; 55 | } elsif (-e $l1input.".gz") { 56 | $opn = "gunzip -c $l1input.gz |"; 57 | } else { 58 | die "Error: $l1input does not exist"; 59 | } 60 | open(F,$opn) or die "Can't open '$opn'"; 61 | $opn = undef; 62 | my $l2input = "$corpus.$l2"; 63 | if (-e $l2input) { 64 | $opn = $l2input; 65 | } elsif (-e $l2input.".gz") { 66 | $opn = "gunzip -c $l2input.gz |"; 67 | } else { 68 | die "Error: $l2input does not exist"; 69 | } 70 | 71 | open(E,$opn) or die "Can't open '$opn'"; 72 | 73 | open(FO,">$out.$l1") or die "Can't write $out.$l1"; 74 | open(EO,">$out.$l2") or die "Can't write $out.$l2"; 75 | 76 | # necessary for proper lowercasing 77 | my $binmode; 78 | if ($enc eq "utf8") { 79 | $binmode = ":utf8"; 80 | } else { 81 | $binmode = ":encoding($enc)"; 82 | } 83 | binmode(F, $binmode); 84 | binmode(E, $binmode); 85 | binmode(FO, $binmode); 86 | binmode(EO, $binmode); 87 | 88 | my $innr = 0; 89 | my $outnr = 0; 90 | my $factored_flag; 91 | while(my $f = <F>) { 92 | $innr++; 93 | print STDERR "." if $innr % 10000 == 0; 94 | print STDERR "($innr)" if $innr % 100000 == 0; 95 | my $e = <E>; 96 | die "$corpus.$l2 is too short!" 
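# Summary of the filtering implemented below: a sentence pair is kept only if neither
# side is empty, both sides have between min and max tokens, the length ratio between
# the two sides does not exceed --ratio (default 9, unless --ignore-ratio is given),
# and no token or factor is longer than --max-word-length characters (default 1000).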
if !defined $e; 97 | chomp($e); 98 | chomp($f); 99 | if ($innr == 1) { 100 | $factored_flag = ($e =~ /\|/ || $f =~ /\|/); 101 | } 102 | 103 | #if lowercasing, lowercase 104 | if ($lc) { 105 | $e = lc($e); 106 | $f = lc($f); 107 | } 108 | 109 | $e =~ s/\|//g unless $factored_flag; 110 | $e =~ s/\s+/ /g; 111 | $e =~ s/^ //; 112 | $e =~ s/ $//; 113 | $f =~ s/\|//g unless $factored_flag; 114 | $f =~ s/\s+/ /g; 115 | $f =~ s/^ //; 116 | $f =~ s/ $//; 117 | next if $f eq ''; 118 | next if $e eq ''; 119 | 120 | my $ec = &word_count($e); 121 | my $fc = &word_count($f); 122 | next if $ec > $max; 123 | next if $fc > $max; 124 | next if $ec < $min; 125 | next if $fc < $min; 126 | next if !$ignore_ratio && $ec/$fc > $ratio; 127 | next if !$ignore_ratio && $fc/$ec > $ratio; 128 | # Skip this segment if any factor is longer than $max_word_length 129 | my $max_word_length_plus_one = $max_word_length + 1; 130 | next if $e =~ /[^\s\|]{$max_word_length_plus_one}/; 131 | next if $f =~ /[^\s\|]{$max_word_length_plus_one}/; 132 | 133 | # An extra check: none of the factors can be blank! 134 | die "There is a blank factor in $corpus.$l1 on line $innr: $f" 135 | if $f =~ /[ \|]\|/; 136 | die "There is a blank factor in $corpus.$l2 on line $innr: $e" 137 | if $e =~ /[ \|]\|/; 138 | 139 | $outnr++; 140 | print FO $f."\n"; 141 | print EO $e."\n"; 142 | 143 | if ($linesRetainedFile ne "") { 144 | print LINES_RETAINED $innr."\n"; 145 | } 146 | } 147 | 148 | if ($linesRetainedFile ne "") { 149 | close LINES_RETAINED; 150 | } 151 | 152 | print STDERR "\n"; 153 | my $e = <E>; 154 | die "$corpus.$l2 is too long!" if defined $e; 155 | 156 | print STDERR "Input sentences: $innr Output sentences: $outnr\n"; 157 | 158 | sub word_count { 159 | my ($line) = @_; 160 | if ($ignore_xml) { 161 | $line =~ s/<\S[^>]*\S>/ /g; 162 | $line =~ s/\s+/ /g; 163 | $line =~ s/^ //g; 164 | $line =~ s/ $//g; 165 | } 166 | my @w = split(/ /,$line); 167 | return scalar @w; 168 | } 169 | -------------------------------------------------------------------------------- /scripts/moses/deescape-special-chars.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 
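# Purpose: undo the escaping applied by escape-special-chars.perl. In the Moses
# distribution, the left-hand sides of the substitutions below are HTML-style entities
# (&amp;, &#124; and the legacy &bar;, &lt;, &gt;, &#91;/&#93; and the legacy
# &bra;/&ket;, &quot;, &apos;), which are mapped back to the raw characters
# & | < > [ ] " '.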
5 | 6 | use warnings; 7 | use strict; 8 | 9 | while(<STDIN>) { 10 | s/\&bar;/\|/g; # factor separator (legacy) 11 | s/\|/\|/g; # factor separator 12 | s/\</\</g; # xml 13 | s/\>/\>/g; # xml 14 | s/\&bra;/\[/g; # syntax non-terminal (legacy) 15 | s/\&ket;/\]/g; # syntax non-terminal (legacy) 16 | s/\"/\"/g; # xml 17 | s/\'/\'/g; # xml 18 | s/\[/\[/g; # syntax non-terminal 19 | s/\]/\]/g; # syntax non-terminal 20 | s/\&/\&/g; # escape escape 21 | print $_; 22 | } 23 | -------------------------------------------------------------------------------- /scripts/moses/detruecase.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | use strict; 4 | use Getopt::Long "GetOptions"; 5 | 6 | binmode(STDIN, ":utf8"); 7 | binmode(STDOUT, ":utf8"); 8 | 9 | my ($SRC,$INFILE,$UNBUFFERED); 10 | die("detruecase.perl < in > out") 11 | unless &GetOptions('headline=s' => \$SRC, 12 | 'in=s' => \$INFILE, 13 | 'b|unbuffered' => \$UNBUFFERED); 14 | if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; } 15 | 16 | my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1); 17 | my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"""=>1,"'"=>1,"["=>1,"]"=>1); 18 | 19 | # lowercase even in headline 20 | my %ALWAYS_LOWER; 21 | foreach ("a","after","against","al-.+","and","any","as","at","be","because","between","by","during","el-.+","for","from","his","in","is","its","last","not","of","off","on","than","the","their","this","to","was","were","which","will","with") { $ALWAYS_LOWER{$_} = 1; } 22 | 23 | # find out about the headlines 24 | my @HEADLINE; 25 | if (defined($SRC)) { 26 | open(SRC,$SRC); 27 | my $headline_flag = 0; 28 | while(<SRC>) { 29 | $headline_flag = 1 if /<hl>/; 30 | $headline_flag = 0 if /<.hl>/; 31 | next unless /^<seg/; 32 | push @HEADLINE, $headline_flag; 33 | } 34 | close(SRC); 35 | } 36 | 37 | my $sentence = 0; 38 | if ($INFILE) { 39 | open(IN,$INFILE) || die("ERROR: could not open file '$INFILE'"); 40 | binmode(IN, ":utf8"); 41 | while(<IN>) { 42 | &process($_,$sentence++); 43 | } 44 | close(IN); 45 | } 46 | else { 47 | while(<STDIN>) { 48 | &process($_,$sentence++); 49 | } 50 | } 51 | 52 | sub process { 53 | my $line = $_[0]; 54 | chomp($line); 55 | $line =~ s/^\s+//; 56 | $line =~ s/\s+$//; 57 | my @WORD = split(/\s+/,$line); 58 | 59 | # uppercase at sentence start 60 | my $sentence_start = 1; 61 | for(my $i=0;$i<scalar(@WORD);$i++) { 62 | &uppercase(\$WORD[$i]) if $sentence_start; 63 | if (defined($SENTENCE_END{ $WORD[$i] })) { $sentence_start = 1; } 64 | elsif (!defined($DELAYED_SENTENCE_START{$WORD[$i] })) { $sentence_start = 0; } 65 | } 66 | 67 | # uppercase headlines { 68 | if (defined($SRC) && $HEADLINE[$sentence]) { 69 | foreach (@WORD) { 70 | &uppercase(\$_) unless $ALWAYS_LOWER{$_}; 71 | } 72 | } 73 | 74 | # output 75 | my $first = 1; 76 | foreach (@WORD) { 77 | print " " unless $first; 78 | $first = 0; 79 | print $_; 80 | } 81 | print "\n"; 82 | $sentence++; 83 | } 84 | 85 | sub uppercase { 86 | my ($W) = @_; 87 | $$W = uc(substr($$W,0,1)).substr($$W,1); 88 | } 89 | -------------------------------------------------------------------------------- /scripts/moses/escape-special-chars.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 
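# Counterpart of deescape-special-chars.perl: characters that have a special meaning in
# Moses input (& | < > ' " [ ]) are replaced by HTML-style entities before decoding, and
# the final substitution restores <tag translation="..."> ... </tag> markup so that XML
# translation instructions still reach the decoder intact.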
5 | 6 | use warnings; 7 | use strict; 8 | 9 | while(<STDIN>) { 10 | chop; 11 | 12 | # avoid general madness 13 | s/[\000-\037]//g; 14 | s/\s+/ /g; 15 | s/^ //g; 16 | s/ $//g; 17 | 18 | # special characters in moses 19 | s/\&/\&/g; # escape escape 20 | s/\|/\|/g; # factor separator 21 | s/\</\</g; # xml 22 | s/\>/\>/g; # xml 23 | s/\'/\'/g; # xml 24 | s/\"/\"/g; # xml 25 | s/\[/\[/g; # syntax non-terminal 26 | s/\]/\]/g; # syntax non-terminal 27 | 28 | # restore xml instructions 29 | s/\<(\S+) translation="(.+?)"> (.+?) <\/(\S+)>/\<$1 translation=\"$2\"> $3 <\/$4>/g; 30 | print $_."\n"; 31 | } 32 | 33 | -------------------------------------------------------------------------------- /scripts/moses/lowercase.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | use strict; 4 | 5 | binmode(STDIN, ":utf8"); 6 | binmode(STDOUT, ":utf8"); 7 | 8 | while(<STDIN>) { 9 | print lc($_); 10 | } 11 | -------------------------------------------------------------------------------- /scripts/moses/multi-bleu.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | # $Id$ 4 | use strict; 5 | 6 | my $lowercase = 0; 7 | if ($ARGV[0] eq "-lc") { 8 | $lowercase = 1; 9 | shift; 10 | } 11 | 12 | my $stem = $ARGV[0]; 13 | if (!defined $stem) { 14 | print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n"; 15 | print STDERR "Reads the references from reference or reference0, reference1, ...\n"; 16 | exit(1); 17 | } 18 | 19 | $stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0"; 20 | 21 | my @REF; 22 | my $ref=0; 23 | while(-e "$stem$ref") { 24 | &add_to_ref("$stem$ref",\@REF); 25 | $ref++; 26 | } 27 | &add_to_ref($stem,\@REF) if -e $stem; 28 | die("ERROR: could not find reference file $stem") unless scalar @REF; 29 | 30 | sub add_to_ref { 31 | my ($file,$REF) = @_; 32 | my $s=0; 33 | open(REF,$file) or die "Can't read $file"; 34 | while(<REF>) { 35 | chop; 36 | push @{$$REF[$s++]}, $_; 37 | } 38 | close(REF); 39 | } 40 | 41 | my(@CORRECT,@TOTAL,$length_translation,$length_reference); 42 | my $s=0; 43 | while(<STDIN>) { 44 | chop; 45 | $_ = lc if $lowercase; 46 | my @WORD = split; 47 | my %REF_NGRAM = (); 48 | my $length_translation_this_sentence = scalar(@WORD); 49 | my ($closest_diff,$closest_length) = (9999,9999); 50 | foreach my $reference (@{$REF[$s]}) { 51 | # print "$s $_ <=> $reference\n"; 52 | $reference = lc($reference) if $lowercase; 53 | my @WORD = split(' ',$reference); 54 | my $length = scalar(@WORD); 55 | my $diff = abs($length_translation_this_sentence-$length); 56 | if ($diff < $closest_diff) { 57 | $closest_diff = $diff; 58 | $closest_length = $length; 59 | # print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n"; 60 | } elsif ($diff == $closest_diff) { 61 | $closest_length = $length if $length < $closest_length; 62 | # from two references with the same closeness to me 63 | # take the *shorter* into account, not the "first" one. 
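# For reference, the score printed at the end of this script is standard corpus-level BLEU:
#   BLEU = BP * exp( (log p1 + log p2 + log p3 + log p4) / 4 )
# where p_n are the modified n-gram precisions accumulated in @CORRECT/@TOTAL, and the
# brevity penalty BP is 1 when the hypothesis is at least as long as the reference and
# exp(1 - ref_len/hyp_len) otherwise (ref_len uses the closest reference length selected above).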
64 | } 65 | for(my $n=1;$n<=4;$n++) { 66 | my %REF_NGRAM_N = (); 67 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 68 | my $ngram = "$n"; 69 | for(my $w=0;$w<$n;$w++) { 70 | $ngram .= " ".$WORD[$start+$w]; 71 | } 72 | $REF_NGRAM_N{$ngram}++; 73 | } 74 | foreach my $ngram (keys %REF_NGRAM_N) { 75 | if (!defined($REF_NGRAM{$ngram}) || 76 | $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) { 77 | $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram}; 78 | # print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}<BR>\n"; 79 | } 80 | } 81 | } 82 | } 83 | $length_translation += $length_translation_this_sentence; 84 | $length_reference += $closest_length; 85 | for(my $n=1;$n<=4;$n++) { 86 | my %T_NGRAM = (); 87 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 88 | my $ngram = "$n"; 89 | for(my $w=0;$w<$n;$w++) { 90 | $ngram .= " ".$WORD[$start+$w]; 91 | } 92 | $T_NGRAM{$ngram}++; 93 | } 94 | foreach my $ngram (keys %T_NGRAM) { 95 | $ngram =~ /^(\d+) /; 96 | my $n = $1; 97 | # my $corr = 0; 98 | # print "$i e $ngram $T_NGRAM{$ngram}<BR>\n"; 99 | $TOTAL[$n] += $T_NGRAM{$ngram}; 100 | if (defined($REF_NGRAM{$ngram})) { 101 | if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) { 102 | $CORRECT[$n] += $T_NGRAM{$ngram}; 103 | # $corr = $T_NGRAM{$ngram}; 104 | # print "$i e correct1 $T_NGRAM{$ngram}<BR>\n"; 105 | } 106 | else { 107 | $CORRECT[$n] += $REF_NGRAM{$ngram}; 108 | # $corr = $REF_NGRAM{$ngram}; 109 | # print "$i e correct2 $REF_NGRAM{$ngram}<BR>\n"; 110 | } 111 | } 112 | # $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram}; 113 | # print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n" 114 | } 115 | } 116 | $s++; 117 | } 118 | my $brevity_penalty = 1; 119 | my $bleu = 0; 120 | 121 | my @bleu=(); 122 | 123 | for(my $n=1;$n<=4;$n++) { 124 | if (defined ($TOTAL[$n])){ 125 | $bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0; 126 | # print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n"; 127 | }else{ 128 | $bleu[$n]=0; 129 | } 130 | } 131 | 132 | if ($length_reference==0){ 133 | printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n"; 134 | exit(1); 135 | } 136 | 137 | if ($length_translation<$length_reference) { 138 | $brevity_penalty = exp(1-$length_reference/$length_translation); 139 | } 140 | $bleu = $brevity_penalty * exp((my_log( $bleu[1] ) + 141 | my_log( $bleu[2] ) + 142 | my_log( $bleu[3] ) + 143 | my_log( $bleu[4] ) ) / 4) ; 144 | printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n", 145 | 100*$bleu, 146 | 100*$bleu[1], 147 | 100*$bleu[2], 148 | 100*$bleu[3], 149 | 100*$bleu[4], 150 | $brevity_penalty, 151 | $length_translation / $length_reference, 152 | $length_translation, 153 | $length_reference; 154 | 155 | sub my_log { 156 | return -9999999999 unless $_[0]; 157 | return log($_[0]); 158 | } 159 | -------------------------------------------------------------------------------- /scripts/moses/nonbreaking_prefixes/nonbreaking_prefix.de: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | #no german words end in single lower-case letters, so we throw those in too. 
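#File format (as read by load_prefixes in tokenizer.perl): one prefix per line, lines
#starting with '#' are ignored, and a prefix may be followed by #NUMERIC_ONLY# to mark
#it as non-breaking only when the next token starts with a digit (see e.g. the entry
#"No #NUMERIC_ONLY#" in the English prefix list).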
7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | 61 | #Roman Numerals. A dot after one of these is not a sentence break in German. 62 | I 63 | II 64 | III 65 | IV 66 | V 67 | VI 68 | VII 69 | VIII 70 | IX 71 | X 72 | XI 73 | XII 74 | XIII 75 | XIV 76 | XV 77 | XVI 78 | XVII 79 | XVIII 80 | XIX 81 | XX 82 | i 83 | ii 84 | iii 85 | iv 86 | v 87 | vi 88 | vii 89 | viii 90 | ix 91 | x 92 | xi 93 | xii 94 | xiii 95 | xiv 96 | xv 97 | xvi 98 | xvii 99 | xviii 100 | xix 101 | xx 102 | 103 | #Titles and Honorifics 104 | Adj 105 | Adm 106 | Adv 107 | Asst 108 | Bart 109 | Bldg 110 | Brig 111 | Bros 112 | Capt 113 | Cmdr 114 | Col 115 | Comdr 116 | Con 117 | Corp 118 | Cpl 119 | DR 120 | Dr 121 | Ens 122 | Gen 123 | Gov 124 | Hon 125 | Hosp 126 | Insp 127 | Lt 128 | MM 129 | MR 130 | MRS 131 | MS 132 | Maj 133 | Messrs 134 | Mlle 135 | Mme 136 | Mr 137 | Mrs 138 | Ms 139 | Msgr 140 | Op 141 | Ord 142 | Pfc 143 | Ph 144 | Prof 145 | Pvt 146 | Rep 147 | Reps 148 | Res 149 | Rev 150 | Rt 151 | Sen 152 | Sens 153 | Sfc 154 | Sgt 155 | Sr 156 | St 157 | Supt 158 | Surg 159 | 160 | #Misc symbols 161 | Mio 162 | Mrd 163 | bzw 164 | v 165 | vs 166 | usw 167 | d.h 168 | z.B 169 | u.a 170 | etc 171 | Mrd 172 | MwSt 173 | ggf 174 | d.J 175 | D.h 176 | m.E 177 | vgl 178 | I.F 179 | z.T 180 | sogen 181 | ff 182 | u.E 183 | g.U 184 | g.g.A 185 | c.-à-d 186 | Buchst 187 | u.s.w 188 | sog 189 | u.ä 190 | Std 191 | evtl 192 | Zt 193 | Chr 194 | u.U 195 | o.ä 196 | Ltd 197 | b.A 198 | z.Zt 199 | spp 200 | sen 201 | SA 202 | k.o 203 | jun 204 | i.H.v 205 | dgl 206 | dergl 207 | Co 208 | zzt 209 | usf 210 | s.p.a 211 | Dkr 212 | Corp 213 | bzgl 214 | BSE 215 | 216 | #Number indicators 217 | # add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it 218 | No 219 | Nos 220 | Art 221 | Nr 222 | pp 223 | ca 224 | Ca 225 | 226 | #Ordinals are done with . in German - "1." = "1st" in English 227 | 1 228 | 2 229 | 3 230 | 4 231 | 5 232 | 6 233 | 7 234 | 8 235 | 9 236 | 10 237 | 11 238 | 12 239 | 13 240 | 14 241 | 15 242 | 16 243 | 17 244 | 18 245 | 19 246 | 20 247 | 21 248 | 22 249 | 23 250 | 24 251 | 25 252 | 26 253 | 27 254 | 28 255 | 29 256 | 30 257 | 31 258 | 32 259 | 33 260 | 34 261 | 35 262 | 36 263 | 37 264 | 38 265 | 39 266 | 40 267 | 41 268 | 42 269 | 43 270 | 44 271 | 45 272 | 46 273 | 47 274 | 48 275 | 49 276 | 50 277 | 51 278 | 52 279 | 53 280 | 54 281 | 55 282 | 56 283 | 57 284 | 58 285 | 59 286 | 60 287 | 61 288 | 62 289 | 63 290 | 64 291 | 65 292 | 66 293 | 67 294 | 68 295 | 69 296 | 70 297 | 71 298 | 72 299 | 73 300 | 74 301 | 75 302 | 76 303 | 77 304 | 78 305 | 79 306 | 80 307 | 81 308 | 82 309 | 83 310 | 84 311 | 85 312 | 86 313 | 87 314 | 88 315 | 89 316 | 90 317 | 91 318 | 92 319 | 93 320 | 94 321 | 95 322 | 96 323 | 97 324 | 98 325 | 99 326 | -------------------------------------------------------------------------------- /scripts/moses/nonbreaking_prefixes/nonbreaking_prefix.el: -------------------------------------------------------------------------------- 1 | # for now, just include the Greek equivalent of "Mr." 
2 | κ 3 | -------------------------------------------------------------------------------- /scripts/moses/nonbreaking_prefixes/nonbreaking_prefix.en: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 34 | Adj 35 | Adm 36 | Adv 37 | Asst 38 | Bart 39 | Bldg 40 | Brig 41 | Bros 42 | Capt 43 | Cmdr 44 | Col 45 | Comdr 46 | Con 47 | Corp 48 | Cpl 49 | DR 50 | Dr 51 | Drs 52 | Ens 53 | Gen 54 | Gov 55 | Hon 56 | Hr 57 | Hosp 58 | Insp 59 | Lt 60 | MM 61 | MR 62 | MRS 63 | MS 64 | Maj 65 | Messrs 66 | Mlle 67 | Mme 68 | Mr 69 | Mrs 70 | Ms 71 | Msgr 72 | Op 73 | Ord 74 | Pfc 75 | Ph 76 | Prof 77 | Pvt 78 | Rep 79 | Reps 80 | Res 81 | Rev 82 | Rt 83 | Sen 84 | Sens 85 | Sfc 86 | Sgt 87 | Sr 88 | St 89 | Supt 90 | Surg 91 | 92 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 93 | v 94 | vs 95 | i.e 96 | rev 97 | e.g 98 | 99 | #Numbers only. These should only induce breaks when followed by a numeric sequence 100 | # add NUMERIC_ONLY after the word for this function 101 | #This case is mostly for the english "No." which can either be a sentence of its own, or 102 | #if followed by a number, a non-breaking prefix 103 | No #NUMERIC_ONLY# 104 | Nos 105 | Art #NUMERIC_ONLY# 106 | Nr 107 | pp #NUMERIC_ONLY# 108 | -------------------------------------------------------------------------------- /scripts/moses/nonbreaking_prefixes/nonbreaking_prefix.es: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 
3 | 4 | #any single upper case letter followed by a period is not a sentence ender 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | # Period-final abbreviation list from http://www.ctspanish.com/words/abbreviations.htm 34 | 35 | A.C 36 | Apdo 37 | Av 38 | Bco 39 | CC.AA 40 | Da 41 | Dep 42 | Dn 43 | Dr 44 | Dra 45 | EE.UU 46 | Excmo 47 | FF.CC 48 | Fil 49 | Gral 50 | J.C 51 | Let 52 | Lic 53 | N.B 54 | P.D 55 | P.V.P 56 | Prof 57 | Pts 58 | Rte 59 | S.A 60 | S.A.R 61 | S.E 62 | S.L 63 | S.R.C 64 | Sr 65 | Sra 66 | Srta 67 | Sta 68 | Sto 69 | T.V.E 70 | Tel 71 | Ud 72 | Uds 73 | V.B 74 | V.E 75 | Vd 76 | Vds 77 | a/c 78 | adj 79 | admón 80 | afmo 81 | apdo 82 | av 83 | c 84 | c.f 85 | c.g 86 | cap 87 | cm 88 | cta 89 | dcha 90 | doc 91 | ej 92 | entlo 93 | esq 94 | etc 95 | f.c 96 | gr 97 | grs 98 | izq 99 | kg 100 | km 101 | mg 102 | mm 103 | núm 104 | núm 105 | p 106 | p.a 107 | p.ej 108 | ptas 109 | pág 110 | págs 111 | pág 112 | págs 113 | q.e.g.e 114 | q.e.s.m 115 | s 116 | s.s.s 117 | vid 118 | vol 119 | -------------------------------------------------------------------------------- /scripts/moses/nonbreaking_prefixes/nonbreaking_prefix.fr: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | # 4 | #any single upper case letter followed by a period is not a sentence ender 5 | #usually upper case letters are initials in a name 6 | #no French words end in single lower-case letters, so we throw those in too? 
7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | # Period-final abbreviation list for French 61 | A.C.N 62 | A.M 63 | art 64 | ann 65 | apr 66 | av 67 | auj 68 | lib 69 | B.P 70 | boul 71 | ca 72 | c.-à-d 73 | cf 74 | ch.-l 75 | chap 76 | contr 77 | C.P.I 78 | C.Q.F.D 79 | C.N 80 | C.N.S 81 | C.S 82 | dir 83 | éd 84 | e.g 85 | env 86 | al 87 | etc 88 | E.V 89 | ex 90 | fasc 91 | fém 92 | fig 93 | fr 94 | hab 95 | ibid 96 | id 97 | i.e 98 | inf 99 | LL.AA 100 | LL.AA.II 101 | LL.AA.RR 102 | LL.AA.SS 103 | L.D 104 | LL.EE 105 | LL.MM 106 | LL.MM.II.RR 107 | loc.cit 108 | masc 109 | MM 110 | ms 111 | N.B 112 | N.D.A 113 | N.D.L.R 114 | N.D.T 115 | n/réf 116 | NN.SS 117 | N.S 118 | N.D 119 | N.P.A.I 120 | p.c.c 121 | pl 122 | pp 123 | p.ex 124 | p.j 125 | P.S 126 | R.A.S 127 | R.-V 128 | R.P 129 | R.I.P 130 | SS 131 | S.S 132 | S.A 133 | S.A.I 134 | S.A.R 135 | S.A.S 136 | S.E 137 | sec 138 | sect 139 | sing 140 | S.M 141 | S.M.I.R 142 | sq 143 | sqq 144 | suiv 145 | sup 146 | suppl 147 | tél 148 | T.S.V.P 149 | vb 150 | vol 151 | vs 152 | X.O 153 | Z.I 154 | -------------------------------------------------------------------------------- /scripts/moses/normalize-punctuation.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | use strict; 4 | 5 | my ($language) = @ARGV; 6 | 7 | while(<STDIN>) { 8 | s/\r//g; 9 | # remove extra spaces 10 | s/\(/ \(/g; 11 | s/\)/\) /g; s/ +/ /g; 12 | s/\) ([\.\!\:\?\;\,])/\)$1/g; 13 | s/\( /\(/g; 14 | s/ \)/\)/g; 15 | s/(\d) \%/$1\%/g; 16 | s/ :/:/g; 17 | s/ ;/;/g; 18 | # normalize unicode punctuation 19 | s/„/\"/g; 20 | s/“/\"/g; 21 | s/”/\"/g; 22 | s/–/-/g; 23 | s/—/ - /g; s/ +/ /g; 24 | s/´/\'/g; 25 | s/(\pL)‘(\pL)/$1\'$2/gi; 26 | s/(\pL)’(\pL)/$1\'$2/gi; 27 | #s/([A-Za-zé])‘([A-Za-zé])/$1\'$2/gi; 28 | #s/([A-Za-zé])([A-Za-zé])/$1\'$2/gi; 29 | s/‘/\"/g; 30 | s/‚/\"/g; 31 | s/’/\"/g; 32 | s/''/\"/g; 33 | s/´´/\"/g; 34 | s/…/.../g; 35 | # French quotes 36 | s/ « / \"/g; 37 | s/« /\"/g; 38 | s/«/\"/g; 39 | s/ » /\" /g; 40 | s/ »/\"/g; 41 | s/»/\"/g; 42 | # handle pseudo-spaces 43 | s/ \%/\%/g; 44 | s/nº /nº /g; 45 | s/ :/:/g; 46 | s/ ºC/ ºC/g; 47 | s/ cm/ cm/g; 48 | s/ \?/\?/g; 49 | s/ \!/\!/g; 50 | s/ ;/;/g; 51 | s/, /, /g; s/ +/ /g; 52 | 53 | # English "quotation," followed by comma, style 54 | if ($language eq "en") { 55 | s/\"([,\.]+)/$1\"/g; 56 | } 57 | # Czech is confused 58 | elsif ($language eq "cs" || $language eq "cz") { 59 | } 60 | # German/Spanish/French "quotation", followed by comma, style 61 | else { 62 | s/,\"/\",/g; 63 | s/(\.+)\"(\s*[^<])/\"$1$2/g; # don't fix period at end of sentence 64 | } 65 | 66 | print STDERR $_ if //; 67 | 68 | if ($language eq "de" || $language eq "es" || $language eq "cz" || $language eq "cs" || $language eq "fr") { 69 | s/(\d) (\d)/$1,$2/g; 70 | } 71 | else { 72 | s/(\d) (\d)/$1.$2/g; 73 | } 74 | print $_; 75 | } 76 | -------------------------------------------------------------------------------- /scripts/moses/strip-xml.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # strip text file of any XML markup 4 | 5 | binmode(STDIN, ":utf8"); 6 | binmode(STDOUT, ":utf8"); 7 | 
8 | use strict; 9 | 10 | while(<STDIN>) { 11 | s/<\S[^>]*>/ /g; 12 | chomp; 13 | s/ +/ /g; 14 | s/^ //; 15 | print $_; 16 | print "\n"; 17 | } 18 | -------------------------------------------------------------------------------- /scripts/moses/tokenizer.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | # Sample Tokenizer 4 | # written by Josh Schroeder, based on code by Philipp Koehn 5 | 6 | binmode(STDIN, ":utf8"); 7 | binmode(STDOUT, ":utf8"); 8 | 9 | use FindBin qw($Bin); 10 | use strict; 11 | #use Time::HiRes; 12 | 13 | my $mydir = "$Bin/nonbreaking_prefixes"; 14 | 15 | my %NONBREAKING_PREFIX = (); 16 | my $language = "en"; 17 | my $QUIET = 0; 18 | my $HELP = 0; 19 | 20 | #my $start = [ Time::HiRes::gettimeofday( ) ]; 21 | 22 | while (@ARGV) { 23 | $_ = shift; 24 | /^-l$/ && ($language = shift, next); 25 | /^-q$/ && ($QUIET = 1, next); 26 | /^-h$/ && ($HELP = 1, next); 27 | } 28 | 29 | if ($HELP) { 30 | print "Usage ./tokenizer.perl (-l [en|de|...]) < textfile > tokenizedfile\n"; 31 | exit; 32 | } 33 | if (!$QUIET) { 34 | print STDERR "Tokenizer v3\n"; 35 | print STDERR "Language: $language\n"; 36 | } 37 | 38 | load_prefixes($language,\%NONBREAKING_PREFIX); 39 | 40 | if (scalar(%NONBREAKING_PREFIX) eq 0){ 41 | print STDERR "Warning: No known abbreviations for language '$language'\n"; 42 | } 43 | 44 | while(<STDIN>) { 45 | if (/^<.+>$/ || /^\s*$/) { 46 | #don't try to tokenize XML/HTML tag lines 47 | print $_; 48 | } 49 | else { 50 | print &tokenize($_); 51 | } 52 | } 53 | 54 | #my $duration = Time::HiRes::tv_interval( $start ); 55 | #print STDERR ("EXECUTION TIME: ".$duration."\n"); 56 | 57 | 58 | sub tokenize { 59 | my($text) = @_; 60 | chomp($text); 61 | $text = " $text "; 62 | 63 | # seperate out all "other" special characters 64 | $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g; 65 | 66 | #multi-dots stay together 67 | $text =~ s/\.([\.]+)/ DOTMULTI$1/g; 68 | while($text =~ /DOTMULTI\./) { 69 | $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g; 70 | $text =~ s/DOTMULTI\./DOTDOTMULTI/g; 71 | } 72 | 73 | # seperate out "," except if within numbers (5,300) 74 | $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; 75 | # separate , pre and post number 76 | $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; 77 | $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g; 78 | 79 | # turn `into ' 80 | $text =~ s/\`/\'/g; 81 | 82 | #turn '' into " 83 | $text =~ s/\'\'/ \" /g; 84 | 85 | if ($language eq "en") { 86 | #split contractions right 87 | $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; 88 | $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g; 89 | $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; 90 | $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g; 91 | #special case for "1990's" 92 | $text =~ s/([\p{IsN}])[']([s])/$1 '$2/g; 93 | } elsif (($language eq "fr") or ($language eq "it")) { 94 | #split contractions left 95 | $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; 96 | $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g; 97 | $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; 98 | $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g; 99 | } else { 100 | $text =~ s/\'/ \' /g; 101 | } 102 | 103 | #word token method 104 | my @words = split(/\s/,$text); 105 | $text = ""; 106 | for (my $i=0;$i<(scalar(@words));$i++) { 107 | my $word = $words[$i]; 108 | if ( $word =~ /^(\S+)\.$/) { 109 | my $pre = $1; 110 | if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && 
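# The final period stays attached when the prefix contains an internal dot and a letter
# (e.g. "e.g."), is listed as a non-breaking prefix, is a #NUMERIC_ONLY# prefix followed
# by a number, or when the next word starts with a lowercase letter; otherwise the
# period is split off as a separate token.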
$NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/))) { 111 | #no change 112 | } elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/))) { 113 | #no change 114 | } else { 115 | $word = $pre." ."; 116 | } 117 | } 118 | $text .= $word." "; 119 | } 120 | 121 | # clean up extraneous spaces 122 | $text =~ s/ +/ /g; 123 | $text =~ s/^ //g; 124 | $text =~ s/ $//g; 125 | 126 | #restore multi-dots 127 | while($text =~ /DOTDOTMULTI/) { 128 | $text =~ s/DOTDOTMULTI/DOTMULTI./g; 129 | } 130 | $text =~ s/DOTMULTI/./g; 131 | 132 | #ensure final line break 133 | $text .= "\n" unless $text =~ /\n$/; 134 | 135 | return $text; 136 | } 137 | 138 | sub load_prefixes { 139 | my ($language, $PREFIX_REF) = @_; 140 | 141 | my $prefixfile = "$mydir/nonbreaking_prefix.$language"; 142 | 143 | #default back to English if we don't have a language-specific prefix file 144 | if (!(-e $prefixfile)) { 145 | $prefixfile = "$mydir/nonbreaking_prefix.en"; 146 | print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n"; 147 | die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile); 148 | } 149 | 150 | if (-e "$prefixfile") { 151 | open(PREFIX, "<:utf8", "$prefixfile"); 152 | while (<PREFIX>) { 153 | my $item = $_; 154 | chomp($item); 155 | if (($item) && (substr($item,0,1) ne "#")) { 156 | if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/) { 157 | $PREFIX_REF->{$1} = 2; 158 | } else { 159 | $PREFIX_REF->{$item} = 1; 160 | } 161 | } 162 | } 163 | close(PREFIX); 164 | } 165 | 166 | } 167 | 168 | -------------------------------------------------------------------------------- /scripts/moses/train-truecaser.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | # $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $ 4 | 5 | # 6 | # Options: 7 | # 8 | # --possiblyUseFirstToken : boolean option; the default behaviour (when this option is not provided) is that the first token of a sentence is ignored, on the basis that the first word of a sentence is always capitalized; if this option is provided then: a) if a sentence-initial token is *not* capitalized, then it is counted, and b) if a capitalized sentence-initial token is the only token of the segment, then it is counted, but with only 10% of the weight of a normal token. 9 | # 10 | 11 | use strict; 12 | use Getopt::Long "GetOptions"; 13 | 14 | # apply switches 15 | my ($MODEL,$CORPUS); 16 | die("train-truecaser.perl --model truecaser --corpus cased [--possiblyUseFirstToken]") 17 | unless &GetOptions('corpus=s' => \$CORPUS, 18 | 'model=s' => \$MODEL, 19 | 'possiblyUseFirstToken' => \(my $possiblyUseFirstToken = 0)) 20 | && defined($CORPUS) && defined($MODEL); 21 | my %CASING; 22 | my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1); 23 | my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"'"=>1,"""=>1,"["=>1,"]"=>1); 24 | open(CORPUS,$CORPUS) || die("ERROR: could not open '$CORPUS'"); 25 | binmode(CORPUS, ":utf8"); 26 | while(<CORPUS>) { 27 | chop; 28 | my @WORD = split; 29 | my $start = 0; 30 | while($start<=$#WORD && defined($DELAYED_SENTENCE_START{$WORD[$start]})) { $start++; } 31 | my $firstWordOfSentence = 1; 32 | for(my $i=$start;$i<=$#WORD;$i++) { 33 | my $currentWord = $WORD[$i]; 34 | if (! 
$firstWordOfSentence && defined($SENTENCE_END{$WORD[$i-1]})) { 35 | $firstWordOfSentence = 1; 36 | } 37 | 38 | my $currentWordWeight = 0; 39 | if (! $firstWordOfSentence) { 40 | $currentWordWeight = 1; 41 | } elsif ($possiblyUseFirstToken) { 42 | # gated special handling of first word of sentence 43 | my $firstChar = substr($currentWord, 0, 1); 44 | if (lc($firstChar) eq $firstChar) { 45 | # if the first character is not upper case, count the token as full evidence (because if it's not capitalized, then there's no reason to be wary that the given casing is only due to being sentence-initial) 46 | $currentWordWeight = 1; 47 | } elsif (scalar(@WORD) == 1) { 48 | # if the first character is upper case, but the current token is the only token of the segment, then count the token as partial evidence (because the segment is presumably not a sentence and the token is therefore not the first word of a sentence and is possibly in its natural case) 49 | $currentWordWeight = 0.1; 50 | } 51 | } 52 | if ($currentWordWeight > 0) { 53 | $CASING{ lc($currentWord) }{ $currentWord } += $currentWordWeight; 54 | } 55 | 56 | $firstWordOfSentence = 0; 57 | } 58 | } 59 | close(CORPUS); 60 | 61 | open(MODEL,">$MODEL") || die("ERROR: could not create '$MODEL'"); 62 | binmode(MODEL, ":utf8"); 63 | foreach my $type (keys %CASING) { 64 | my ($score,$total,$best) = (-1,0,""); 65 | foreach my $word (keys %{$CASING{$type}}) { 66 | my $count = $CASING{$type}{$word}; 67 | $total += $count; 68 | if ($count > $score) { 69 | $best = $word; 70 | $score = $count; 71 | } 72 | } 73 | print MODEL "$best ($score/$total)"; 74 | foreach my $word (keys %{$CASING{$type}}) { 75 | print MODEL " $word ($CASING{$type}{$word})" unless $word eq $best; 76 | } 77 | print MODEL "\n"; 78 | } 79 | close(MODEL); 80 | -------------------------------------------------------------------------------- /scripts/moses/truecase.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | # $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $ 4 | use strict; 5 | use Getopt::Long "GetOptions"; 6 | 7 | binmode(STDIN, ":utf8"); 8 | binmode(STDOUT, ":utf8"); 9 | 10 | # apply switches 11 | my ($MODEL, $UNBUFFERED); 12 | die("truecase.perl --model MODEL [-b] < in > out") 13 | unless &GetOptions('model=s' => \$MODEL,'b|unbuffered' => \$UNBUFFERED) 14 | && defined($MODEL); 15 | if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; } 16 | 17 | my (%BEST,%KNOWN); 18 | open(MODEL,$MODEL) || die("ERROR: could not open '$MODEL'"); 19 | binmode(MODEL, ":utf8"); 20 | while(<MODEL>) { 21 | my ($word,@OPTIONS) = split; 22 | $BEST{ lc($word) } = $word; 23 | $KNOWN{ $word } = 1; 24 | for(my $i=1;$i<$#OPTIONS;$i+=2) { 25 | $KNOWN{ $OPTIONS[$i] } = 1; 26 | } 27 | } 28 | close(MODEL); 29 | 30 | my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1); 31 | my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"'"=>1,"""=>1,"["=>1,"]"=>1); 32 | 33 | while(<STDIN>) { 34 | chop; 35 | my ($WORD,$MARKUP) = split_xml($_); 36 | my $sentence_start = 1; 37 | for(my $i=0;$i<=$#$WORD;$i++) { 38 | print " " if $i; 39 | print $$MARKUP[$i]; 40 | 41 | $$WORD[$i] =~ /^([^\|]+)(.*)/; 42 | my $word = $1; 43 | my $otherfactors = $2; 44 | 45 | if ($sentence_start && defined($BEST{lc($word)})) { 46 | print $BEST{lc($word)}; # truecase sentence start 47 | } 48 | elsif (defined($KNOWN{$word})) { 49 | print $word; # don't change known words 50 | } 51 | elsif (defined($BEST{lc($word)})) { 52 | print $BEST{lc($word)}; # truecase otherwise unknown 
words 53 | } 54 | else { 55 | print $word; # unknown, nothing to do 56 | } 57 | print $otherfactors; 58 | 59 | if ( defined($SENTENCE_END{ $word })) { $sentence_start = 1; } 60 | elsif (!defined($DELAYED_SENTENCE_START{ $word })) { $sentence_start = 0; } 61 | } 62 | print " ".$$MARKUP[$#$MARKUP]; 63 | print "\n"; 64 | } 65 | 66 | # store away xml markup 67 | sub split_xml { 68 | my ($line) = @_; 69 | my (@WORD,@MARKUP); 70 | my $i = 0; 71 | $MARKUP[0] = ""; 72 | while($line =~ /\S/) { 73 | if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) { 74 | $MARKUP[$i] .= $1." "; 75 | $line = $2; 76 | } 77 | elsif ($line =~ /^\s*(\S+)(.*)$/) { 78 | $WORD[$i++] = $1; 79 | $MARKUP[$i] = ""; 80 | $line = $2; 81 | } 82 | else { 83 | die("ERROR: huh? $line\n"); 84 | } 85 | } 86 | chop($MARKUP[$#MARKUP]); 87 | return (\@WORD,\@MARKUP); 88 | } 89 | -------------------------------------------------------------------------------- /scripts/moses/wrap-xml.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 5 | 6 | use warnings; 7 | use strict; 8 | 9 | my ($language,$src,$system) = @ARGV; 10 | die("wrapping frame not found ($src)") unless -e $src; 11 | $system = "Edinburgh" unless $system; 12 | 13 | open(SRC,$src) or die "Cannot open: $!"; 14 | my @OUT = <STDIN>; 15 | chomp(@OUT); 16 | #my @OUT = `cat $decoder_output`; 17 | my $missing_end_seg = 0; 18 | while(<SRC>) { 19 | chomp; 20 | if (/^<srcset/) { 21 | s/<srcset/<tstset trglang="$language"/i; 22 | } 23 | elsif (/^<\/srcset/) { 24 | s/<\/srcset/<\/tstset/i; 25 | } 26 | elsif (/^<doc/i) { 27 | s/ *sysid="[^\"]+"//; 28 | s/<doc/<doc sysid="$system"/i; 29 | } 30 | elsif (/<seg/) { 31 | my $line = shift(@OUT); 32 | $line = "" if $line =~ /NO BEST TRANSLATION/; 33 | if (/<\/seg>/) { 34 | s/(<seg[^>]+> *).*(<\/seg>)/$1$line$2/i; 35 | $missing_end_seg = 0; 36 | } 37 | else { 38 | s/(<seg[^>]+> *)[^<]*/$1$line<\/seg>/i; 39 | $missing_end_seg = 1; 40 | } 41 | } 42 | elsif ($missing_end_seg) { 43 | if (/<\/doc>/) { 44 | $missing_end_seg = 0; 45 | } 46 | else { 47 | next; 48 | } 49 | } 50 | print $_."\n"; 51 | } 52 | -------------------------------------------------------------------------------- /scripts/multi-print.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import argparse 4 | import sys 5 | import subprocess 6 | import re 7 | import sys 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('files', nargs='+') 11 | parser.add_argument('--head', action='store_true') 12 | parser.add_argument('--shuf', action='store_true') 13 | parser.add_argument('-n', type=int) 14 | parser.add_argument('-d', '--delimiter', default='^', choices=['&', '^', '@', '~', '|', '/', '#', '$']) 15 | parser.add_argument('--space', action='store_true') 16 | 17 | args = parser.parse_args() 18 | 19 | commands = [] 20 | paste = ['paste', '-d', args.delimiter] + list(args.files) 21 | commands.append(paste) 22 | 23 | if args.shuf: 24 | shuf = ['shuf'] 25 | if args.n: 26 | shuf += ['-n', str(args.n)] 27 | commands.append(shuf) 28 | if args.head: 29 | head = ['head', '-n', str(args.n or 10)] 30 | commands.append(head) 31 | 32 | if args.space: 33 | space = ['sed', 'G'] 34 | commands.append(space) 35 | 36 | delimiter = re.escape(args.delimiter) if args.delimiter in ('/', '^', '$') else args.delimiter 37 | sed = 
['sed', 's/{}/\\n/g'.format(delimiter)] 38 | commands.append(sed) 39 | 40 | ps = None 41 | 42 | for i, cmd in enumerate(commands): 43 | stdout = sys.stdout if i == len(commands) - 1 else subprocess.PIPE 44 | stdin = None if i == 0 else ps.stdout 45 | ps = subprocess.Popen(cmd, stdin=stdin, stdout=stdout, stderr=open('/dev/null', 'w')) 46 | 47 | ps.wait() 48 | 49 | -------------------------------------------------------------------------------- /scripts/paired-eval.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import sys 5 | import numpy as np 6 | from translate.evaluation import corpus_bleu, corpus_ter 7 | 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('source1') 10 | parser.add_argument('source2') 11 | parser.add_argument('target') 12 | 13 | parser.add_argument('--bleu', action='store_true') 14 | parser.add_argument('--max-size', type=int) 15 | parser.add_argument('--case-insensitive', '-i', action='store_true') 16 | 17 | parser.add_argument('--samples', type=int, default=1000) 18 | parser.add_argument('--sample-size', type=int, default=0) 19 | parser.add_argument('-p', type=float, default=0.05) 20 | 21 | 22 | if __name__ == '__main__': 23 | args = parser.parse_args() 24 | 25 | with open(args.source1) as src_file_1, open(args.source2) as src_file_2, open(args.target) as trg_file: 26 | if args.case_insensitive: 27 | fun = lambda x: x.strip().lower() 28 | else: 29 | fun = lambda x: x.strip() 30 | 31 | hypotheses_1 = list(map(fun, src_file_1)) 32 | hypotheses_2 = list(map(fun, src_file_2)) 33 | references = list(map(fun, trg_file)) 34 | 35 | if args.max_size is not None: 36 | hypotheses_1 = hypotheses_1[:args.max_size] 37 | hypotheses_2 = hypotheses_2[:args.max_size] 38 | references = references[:args.max_size] 39 | 40 | if len(hypotheses_1) != len(references) or len(hypotheses_2) != len(references): 41 | sys.stderr.write('warning: source and target don\'t have the same length\n') 42 | size = min(len(hypotheses_1), len(hypotheses_2), len(references)) 43 | hypotheses_1 = hypotheses_1[:size] 44 | hypotheses_2 = hypotheses_2[:size] 45 | references = references[:size] 46 | 47 | indices = np.arange(len(references)) 48 | if args.sample_size == 0: 49 | args.sample_size = len(references) 50 | 51 | diffs = [] 52 | 53 | hypotheses_1 = np.array(hypotheses_1) 54 | hypotheses_2 = np.array(hypotheses_2) 55 | references = np.array(references) 56 | 57 | score_fun = corpus_bleu if args.bleu else corpus_ter 58 | 59 | #diff = abs(score_fun(hypotheses_1, references)[0] - score_fun(hypotheses_2, references)[0]) 60 | 61 | for _ in range(args.samples): 62 | indices = np.random.randint(len(references), size=args.sample_size) 63 | hypotheses_1_ = hypotheses_1[indices] 64 | hypotheses_2_ = hypotheses_2[indices] 65 | references_ = references[indices] 66 | 67 | score_1, _ = score_fun(hypotheses_1_, references_) 68 | score_2, _ = score_fun(hypotheses_2_, references_) 69 | 70 | diffs.append(int(score_1 > score_2)) 71 | #diffs.append(abs(score_1 - score_2)) 72 | 73 | # avg_diff = sum(diffs) / len(diffs) 74 | # c = sum( 75 | # int(diff_ - avg_diff >= diff) for diff_ in diffs 76 | # ) 77 | # 78 | # p = (c + 1) / (len(diffs) + 1) 79 | # print(p) 80 | 81 | p = sum(diffs) / len(diffs) 82 | if not args.bleu: 83 | p = 1 - p 84 | 85 | print('x is better than y {:.1f}% of the time'.format(p * 100)) 86 | -------------------------------------------------------------------------------- /scripts/plot-score-per-length.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import sys 5 | import os 6 | import re 7 | import numpy as np 8 | from collections import OrderedDict 9 | from matplotlib import pyplot as plt 10 | 11 | script_dir = os.path.dirname(os.path.abspath(__file__)) 12 | root_dir = os.path.dirname(script_dir) 13 | sys.path.append(root_dir) 14 | tercom_path = os.path.join(script_dir, 'tercom.jar') 15 | 16 | from translate.evaluation import corpus_bleu, corpus_ter, corpus_wer, corpus_cer, corpus_bleu1 17 | 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('mt', nargs='+') 20 | parser.add_argument('ref') 21 | 22 | parser.add_argument('--src') 23 | parser.add_argument('--min', type=int, default=0) 24 | parser.add_argument('--max', type=int, default=70) 25 | parser.add_argument('--step', type=int, default=5) 26 | parser.add_argument('--labels', nargs='*') 27 | parser.add_argument('--output') 28 | 29 | parser.add_argument('--bar', action='store_true') 30 | 31 | args = parser.parse_args() 32 | 33 | if args.src is None: 34 | args.src = args.ref 35 | 36 | assert args.labels is None or len(args.labels) == len(args.mt) 37 | 38 | for k, mt in enumerate(args.mt): 39 | with open(args.src) as src_file, open(mt) as mt_file, open(args.ref) as ref_file: 40 | lines = list(zip(src_file, mt_file, ref_file)) 41 | 42 | bins = OrderedDict() 43 | 44 | for i in range(args.min, args.max, args.step): 45 | lines_ = [(mt.strip(), ref.strip()) for src, mt, ref in lines if i < len(src.split()) <= i + args.step] 46 | if len(lines_) > 0: 47 | score, summary = corpus_bleu(*zip(*lines_)) 48 | bins[i + args.step] = score 49 | # print(i + args.step, '{:.1f}'.format(score), len(lines_), summary) 50 | 51 | values = np.array(list(bins.values())) 52 | keys = np.array(list(bins.keys())) 53 | 54 | label = args.labels[k] if args.labels else None 55 | 56 | if args.bar: 57 | width = 1 if len(args.mt) > 1 else args.step - 1 58 | keys += k 59 | plt.bar(keys + k, values, width=width, label=label) 60 | else: 61 | plt.plot(keys, values, label=label) 62 | 63 | xlabel = 'Reference words' if args.src == args.ref else 'Source words' 64 | plt.xlabel(xlabel) 65 | plt.ylabel('BLEU') 66 | plt.legend() 67 | 68 | if args.output: 69 | plt.savefig(args.output) 70 | else: 71 | plt.show() 72 | -------------------------------------------------------------------------------- /scripts/post_editing/apply-edits.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | from translate.utils import reverse_edits 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('source') 8 | parser.add_argument('edits') 9 | parser.add_argument('--not-strict', action='store_false', dest='strict') 10 | parser.add_argument('--no-fix', action='store_false', dest='fix') 11 | 12 | if __name__ == '__main__': 13 | args = parser.parse_args() 14 | with open(args.source) as src_file, open(args.edits) as edit_file: 15 | for source, edits in zip(src_file, edit_file): 16 | target = reverse_edits(source.strip('\n'), edits.strip('\n'), strict=args.strict, fix=args.fix) 17 | print(target) 18 | -------------------------------------------------------------------------------- /scripts/post_editing/extract-ter-vectors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import numpy as np 5 | from translate.evaluation import 
tercom_statistics 6 | from itertools import islice 7 | 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('source') 10 | parser.add_argument('target') 11 | parser.add_argument('--output') 12 | parser.add_argument('--precision', type=int, default=4) 13 | 14 | parser.add_argument('--case-insensitive', '-i', action='store_true') 15 | 16 | if __name__ == '__main__': 17 | args = parser.parse_args() 18 | 19 | vectors = [] 20 | fields = ['DEL', 'INS', 'SUB', 'WORD_SHIFT', 'REF_WORDS', 'TER'] 21 | 22 | with open(args.source) as src_file, open(args.target) as trg_file: 23 | 24 | i = 0 25 | n = 1000 26 | 27 | avg_length = 0 28 | 29 | while True: 30 | i += 1 31 | hypotheses = list(islice(src_file, n)) 32 | references = list(islice(trg_file, n)) 33 | 34 | if not hypotheses or not references: 35 | break 36 | 37 | hypotheses = [line.strip() for line in hypotheses] 38 | references = [line.strip() for line in references] 39 | 40 | _, stats = tercom_statistics(hypotheses, references, not args.case_insensitive) 41 | 42 | if avg_length == 0: 43 | avg_length = sum(stats_['REF_WORDS'] for stats_ in stats) / len(stats) 44 | 45 | for stats_ in stats: 46 | for field in ('DEL', 'INS', 'SUB', 'WORD_SHIFT'): 47 | stats_[field] /= stats_['REF_WORDS'] 48 | 49 | stats_['REF_WORDS'] = (stats_['REF_WORDS'] - avg_length) / avg_length 50 | stats_['TER'] /= 100 51 | 52 | if not args.output: 53 | print('\n'.join(','.join(str(round(stats_[k], args.precision)) for k in fields) 54 | for stats_ in stats)) 55 | else: 56 | vectors += [np.array([stats_[k] for k in fields]) for stats_ in stats] 57 | print('{}'.format(i * n), end='\r') 58 | 59 | if args.output: 60 | import h5py 61 | h5f = h5py.File(args.output, 'w') 62 | h5f.create_dataset('dataset_1', data=vectors) 63 | h5f.close() 64 | -------------------------------------------------------------------------------- /scripts/post_editing/noisify.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import sys 5 | import sklearn.mixture 6 | import numpy as np 7 | import random 8 | from scipy.stats import truncnorm 9 | from collections import Counter 10 | from translate.evaluation import tercom_statistics 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('source') 14 | parser.add_argument('target') 15 | 16 | parser.add_argument('--mono') 17 | parser.add_argument('--min-count', type=int, default=2) 18 | parser.add_argument('--case-insensitive', '-i', action='store_true') 19 | 20 | if __name__ == '__main__': 21 | args = parser.parse_args() 22 | 23 | fields = ['DEL', 'INS', 'SUB', 'WORD_SHIFT', 'REF_WORDS'] 24 | op_fields = ['DEL', 'INS', 'SUB', 'WORD_SHIFT'] 25 | 26 | with open(args.source) as src_file, open(args.target) as trg_file: 27 | hypotheses = [line.strip() for line in src_file] 28 | references = [line.strip() for line in trg_file] 29 | 30 | _, stats = tercom_statistics(hypotheses, references, not args.case_insensitive) 31 | 32 | for stats_ in stats: 33 | for field in op_fields: 34 | stats_[field] /= stats_['REF_WORDS'] 35 | 36 | ops = np.array([[stats_[k] for k in op_fields] for stats_ in stats]) 37 | 38 | model = sklearn.mixture.GMM(n_components=1) 39 | model.fit(ops) 40 | 41 | sigma = model.covars_ 42 | mu = model.means_ 43 | distribution = truncnorm(-mu / sigma, np.inf, loc=mu, scale=sigma) 44 | 45 | unigram_filename = args.mono or args.source 46 | with open(unigram_filename) as unigram_file: 47 | unigrams = Counter(w for line in unigram_file for w in 
line.split()) 48 | unigrams = Counter({w: c for w, c in unigrams.items() if c >= args.min_count}) 49 | 50 | total = sum(unigrams.values()) 51 | for k in unigrams.keys(): 52 | unigrams[k] /= total 53 | 54 | vocab = list(unigrams.keys()) 55 | p = np.array(list(unigrams.values())) 56 | 57 | def unigram_sampler(): 58 | while True: 59 | x = np.random.choice(vocab, size=1000, p=p) 60 | for w in x: 61 | yield w 62 | 63 | sampler = unigram_sampler() 64 | 65 | for line in sys.stdin: 66 | words = line.split() 67 | 68 | sample = distribution.rvs(len(op_fields)) * len(words) 69 | 70 | x = sample.astype(np.int32) 71 | i = np.random.random(sample.shape) < sample - sample.astype(np.int32) 72 | x += i.astype(np.int32) 73 | 74 | dels, ins, subs, shifts = x 75 | 76 | for _ in range(dels): 77 | k = random.randrange(len(words)) 78 | del words[k] 79 | 80 | for _ in range(shifts): 81 | j, k = random.sample(range(len(words)), 2) 82 | w = words.pop(j) 83 | words.insert(k, w) 84 | 85 | for _ in range(subs): 86 | w = next(sampler) 87 | k = random.randrange(len(words)) 88 | words[k] = w 89 | 90 | for _ in range(ins): 91 | w = next(sampler) 92 | k = random.randrange(len(words) + 1) 93 | words.insert(k, w) 94 | 95 | print(' '.join(words)) 96 | -------------------------------------------------------------------------------- /scripts/post_editing/plot-ops.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | from matplotlib import pyplot as plt 5 | import os 6 | 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('eval_dir') 9 | parser.add_argument('reference') 10 | parser.add_argument('--max-step', type=int) 11 | args = parser.parse_args() 12 | 13 | filenames = sorted(os.listdir(args.eval_dir), key=lambda filename: int(filename.split('.')[-2])) 14 | steps = [int(filename.split('.')[-2]) for filename in filenames] 15 | 16 | filenames = [os.path.join(args.eval_dir, filename) for filename in filenames] 17 | 18 | with open(args.reference) as ref_file: 19 | lines = [line.split() for line in ref_file] 20 | ref_keeps = [line.count('<KEEP>') for line in lines] 21 | ref_dels = [line.count('<DEL>') for line in lines] 22 | ref_ins = [len(line) - line.count('<KEEP>') - line.count('<DEL>') for line in lines] 23 | 24 | 25 | keeps = [] 26 | dels = [] 27 | ins = [] 28 | 29 | fun = lambda x, y, z: abs(x - y) / z 30 | #fun = lambda x, y, z: x/z 31 | 32 | for filename in filenames: 33 | with open(filename) as f: 34 | keep_ = 0 35 | del_ = 0 36 | ins_ = 0 37 | lines = 0 38 | 39 | for i, line in enumerate(f): 40 | words = line.split() 41 | lines += 1 42 | keep_ += fun(words.count('<KEEP>'), ref_keeps[i], len(words)) 43 | del_ += fun(words.count('<DEL>'), ref_dels[i], len(words)) 44 | ins_ += fun(len(words) - words.count('<KEEP>') - words.count('<DEL>'), ref_ins[i], len(words)) 45 | 46 | keeps.append(keep_ / lines) 47 | dels.append(del_ / lines) 48 | ins.append(ins_ / lines) 49 | 50 | 51 | if args.max_step: 52 | steps, keeps, dels, ins = zip(*[ 53 | (step, keep_, del_, ins_) for step, keep_, del_, ins_ 54 | in zip(steps, keeps, dels, ins) if step <= args.max_step 55 | ]) 56 | 57 | plt.plot(steps, keeps, label='KEEP') 58 | plt.plot(steps, dels, label='DEL') 59 | plt.plot(steps, ins, label='INS(x)') 60 | 61 | legend = plt.legend(loc='best', shadow=True) 62 | 63 | plt.show() -------------------------------------------------------------------------------- /scripts/post_editing/plot-ter.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | from translate.evaluation import tercom_statistics 4 | from matplotlib import pyplot as plt 5 | import numpy as np 6 | 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('hyp_files', nargs='+') 9 | parser.add_argument('ref_file') 10 | parser.add_argument('--reverse', action='store_true') 11 | parser.add_argument('--labels', nargs='+') 12 | parser.add_argument('--legend-loc', default='upper right') 13 | parser.add_argument('--bar-width', type=float, default=0.2) 14 | parser.add_argument('--ymin', type=float, default=0.0) 15 | parser.add_argument('--ymax', type=float, default=0.3) 16 | parser.add_argument('--ops', nargs='+', default=['ins', 'del', 'sub', 'shift']) 17 | parser.add_argument('--fig-size', nargs=2, type=float) 18 | parser.add_argument('--save') 19 | 20 | parser.add_argument('--average', nargs='+', type=int) 21 | 22 | if __name__ == '__main__': 23 | args = parser.parse_args() 24 | 25 | with open(args.ref_file) as f: 26 | references = [line.strip() for line in f] 27 | 28 | hypotheses = [] 29 | for hyp_file in args.hyp_files: 30 | with open(hyp_file) as f: 31 | hypotheses.append([line.strip() for line in f]) 32 | 33 | if args.reverse: 34 | scores = [tercom_statistics(references, hyp)[0] for hyp in hypotheses] 35 | else: 36 | scores = [tercom_statistics(hyp, references)[0] for hyp in hypotheses] 37 | 38 | N = len(args.average) if args.average else len(args.hyp_files) 39 | ind = np.arange(N) 40 | op_name_mapping = {'ins': 'Insertions', 'del': 'Deletions', 'sub': 'Substitutions', 'shift': 'Shifts'} 41 | 42 | ref_words = np.array([score["REF_WORDS"] for score in scores]) 43 | bars = [] 44 | legend = [] 45 | 46 | bottom = np.zeros(N) 47 | 48 | colors = ['#e66101', '#fdb863', '#b2abd2', '#5e3c99'] 49 | 50 | if args.fig_size: 51 | plt.figure(figsize=tuple(args.fig_size)) 52 | 53 | for op, color in zip(args.ops, colors): 54 | scores_ = np.array([score[op.upper()] for score in scores]) / ref_words 55 | if args.average: 56 | new_scores_ = [] 57 | j = 0 58 | for n in args.average: 59 | new_scores_.append(np.average(scores_[j:j+n])) 60 | j += n 61 | scores_ = np.array(new_scores_) 62 | 63 | bar = plt.bar(ind, scores_, args.bar_width, bottom=bottom, color=color, align='center') 64 | 65 | bars.append(bar) 66 | legend.append(op_name_mapping[op]) 67 | bottom += scores_ 68 | 69 | #plt.legend((p_ins[0], p_del[0], p_sub[0], p_shift[0])[::-1], ('Insertions', 'Deletions', 'Substitutions', 'Shifts')[::-1], 70 | # loc='upper right') 71 | 72 | try: 73 | loc = float(args.legend_loc) 74 | plt.legend(bars[::-1], legend[::-1], bbox_to_anchor=[loc, 1], loc="upper center") 75 | except: 76 | plt.legend(bars[::-1], legend[::-1], loc=args.legend_loc) 77 | 78 | plt.ylabel('TER') 79 | 80 | if args.labels: 81 | plt.xticks(ind, args.labels) 82 | else: 83 | plt.xticks([]) 84 | 85 | axes = plt.gca() 86 | axes.set_ylim([args.ymin, args.ymax]) 87 | 88 | if args.save: 89 | plt.savefig(args.save) 90 | else: 91 | plt.show() 92 | 93 | """ 94 | N = 5 95 | menMeans = (20, 35, 30, 35, 27) 96 | womenMeans = (25, 32, 34, 20, 25) 97 | menStd = (2, 3, 4, 1, 2) 98 | womenStd = (3, 5, 2, 3, 3) 99 | ind = np.arange(N) # the x locations for the groups 100 | width = 0.35 # the width of the bars: can also be len(x) sequence 101 | 102 | p1 = plt.bar(ind, menMeans, width, yerr=menStd) 103 | p2 = plt.bar(ind, womenMeans, width, 104 | bottom=menMeans, yerr=womenStd) 105 | 106 | plt.ylabel('Scores') 
107 | plt.title('Scores by group and gender') 108 | plt.xticks(ind, ('G1', 'G2', 'G3', 'G4', 'G5')) 109 | plt.yticks(np.arange(0, 81, 10)) 110 | plt.legend((p1[0], p2[0]), ('Men', 'Women')) 111 | 112 | plt.show() 113 | 114 | import ipdb; ipdb.set_trace() 115 | """ 116 | -------------------------------------------------------------------------------- /scripts/post_editing/reverse-edits.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | from translate import utils 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('source') 8 | parser.add_argument('edits') 9 | 10 | 11 | if __name__ == '__main__': 12 | args = parser.parse_args() 13 | with open(args.source) as src_file, open(args.edits) as edit_file: 14 | for src_line, edits in zip(src_file, edit_file): 15 | trg_line = utils.reverse_edits(src_line.split(), [edits.split()]) 16 | print(' '.join(trg_line)) 17 | -------------------------------------------------------------------------------- /scripts/post_editing/select-by-index.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import numpy as np 5 | import random 6 | import sys 7 | 8 | parser = argparse.ArgumentParser() 9 | 10 | parser.add_argument('indices') 11 | 12 | if __name__ == '__main__': 13 | args = parser.parse_args() 14 | 15 | with open(args.indices) as f: 16 | indices = sorted(list(set([int(line) for line in f])), reverse=True) 17 | 18 | for i, line in enumerate(sys.stdin): 19 | if len(indices) == 0: 20 | break 21 | 22 | if i == indices[-1]: 23 | indices.pop() 24 | print(line.rstrip('\r\n')) 25 | -------------------------------------------------------------------------------- /scripts/post_editing/select-by-length.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import random 5 | import sys 6 | 7 | parser = argparse.ArgumentParser() 8 | 9 | parser.add_argument('ref_sentences') 10 | parser.add_argument('sentences') 11 | parser.add_argument('-n', type=int, default=500000) 12 | parser.add_argument('-k', type=int, default=1) 13 | parser.add_argument('-m', type=int, default=1000) 14 | 15 | if __name__ == '__main__': 16 | args = parser.parse_args() 17 | 18 | with open(args.ref_sentences) as f: 19 | ref_lengths = [len(line.split()) for line in f] 20 | with open(args.sentences) as f: 21 | lengths = [len(line.split()) for line in f] 22 | lengths = list(enumerate(lengths)) 23 | 24 | n = 0 25 | l = len(lengths) 26 | 27 | while n < args.n and l > 0: 28 | length = ref_lengths[n % len(ref_lengths)] 29 | 30 | def key(i): 31 | return abs(length - lengths[i][1]) 32 | 33 | indices = random.sample(range(l), k=args.m) 34 | 35 | if args.k > 1: 36 | indices = sorted(indices, key=key)[:args.k] 37 | else: 38 | indices = [min(indices, key=key)] 39 | 40 | for i in indices: 41 | sys.stdout.write(str(lengths[i][0]) + '\n') 42 | 43 | #sys.stdout.flush() 44 | 45 | for i in indices: 46 | lengths[i], lengths[l - 1] = lengths[l - 1], lengths[i] 47 | l -= 1 48 | n += 1 49 | -------------------------------------------------------------------------------- /scripts/post_editing/select-by-ter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import numpy as np 5 | import random 6 | import sys 7 | 8 | parser = argparse.ArgumentParser() 9 | 10 | 
parser.add_argument('ref_vectors') 11 | parser.add_argument('vectors') 12 | parser.add_argument('-n', type=int, default=500000) 13 | parser.add_argument('-k', type=int, default=1) 14 | parser.add_argument('-m', type=int, default=1000) 15 | 16 | if __name__ == '__main__': 17 | args = parser.parse_args() 18 | 19 | with open(args.ref_vectors) as f: 20 | ref_vectors = [np.array([float(x) for x in line.split(',')]) for line in f] 21 | with open(args.vectors) as f: 22 | vectors = [np.array([float(x) for x in line.split(',')]) for line in f] 23 | vectors = list(enumerate(vectors)) 24 | 25 | n = 0 26 | l = len(vectors) 27 | 28 | while n < args.n and l > 0: 29 | vector = ref_vectors[n % len(ref_vectors)] 30 | n += 1 31 | 32 | def key(i): 33 | return np.sum((vector - vectors[i][1]) ** 2) 34 | 35 | indices = random.sample(range(l), k=args.m) 36 | 37 | if args.k > 1: 38 | indices = sorted(indices, key=key)[:args.k] 39 | else: 40 | indices = [min(indices, key=key)] 41 | 42 | for i in indices: 43 | sys.stdout.write(str(vectors[i][0]) + '\n') 44 | 45 | #sys.stdout.flush() 46 | 47 | for i in indices: 48 | vectors[i], vectors[l - 1] = vectors[l - 1], vectors[i] 49 | l -= 1 50 | -------------------------------------------------------------------------------- /scripts/post_editing/stats-TER.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import sys 5 | import numpy as np 6 | import re 7 | from translate.evaluation import corpus_bleu, corpus_ter, corpus_wer, tercom_statistics 8 | from collections import OrderedDict 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('source') 12 | parser.add_argument('target') 13 | parser.add_argument('--bleu', action='store_true') 14 | #parser.add_argument('--ter', action='store_true') 15 | #parser.add_argument('--wer', action='store_true') 16 | #parser.add_argument('--all', '-a', action='store_true') 17 | parser.add_argument('--max-size', type=int) 18 | parser.add_argument('--case-insensitive', '-i', action='store_true') 19 | 20 | parser.add_argument('--draws', type=int, default=1000) 21 | parser.add_argument('--sample-size', type=int, default=0) 22 | parser.add_argument('-p', type=float, default=0.05) 23 | 24 | 25 | if __name__ == '__main__': 26 | args = parser.parse_args() 27 | 28 | with open(args.source) as src_file, open(args.target) as trg_file: 29 | if args.case_insensitive: 30 | hypotheses = [line.strip().lower() for line in src_file] 31 | references = [line.strip().lower() for line in trg_file] 32 | else: 33 | hypotheses = [line.strip() for line in src_file] 34 | references = [line.strip() for line in trg_file] 35 | 36 | if args.max_size is not None: 37 | hypotheses = hypotheses[:args.max_size] 38 | references = references[:args.max_size] 39 | 40 | if len(hypotheses) != len(references): 41 | sys.stderr.write('warning: source and target don\'t have the same length\n') 42 | size = min(len(hypotheses), len(references)) 43 | hypotheses = hypotheses[:size] 44 | references = references[:size] 45 | 46 | avg_stats, stats = tercom_statistics(hypotheses, references) 47 | 48 | ters = [stats_['TER'] for stats_ in stats] 49 | 50 | mean = sum(ters) / len(ters) 51 | variance = sum((ter - mean) ** 2 for ter in ters) / (len(ters) - 1) 52 | 53 | ts = {0.01: 2.5841, 0.05: 1.9639, 0.10: 1.6474} 54 | t = ts.get(args.p) 55 | if t is None: 56 | raise Exception 57 | 58 | d = t * np.sqrt(variance / len(ters)) 59 | 60 | print('{:.3f} +/- {:.3f}'.format(mean, d)) 
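Note on stats-TER.py above: it reports the mean sentence-level TER together with a two-sided Student-t confidence interval, but only for the three hard-coded significance levels (p = 0.01, 0.05, 0.10). Below is a minimal sketch (not part of the repository) of the same computation for an arbitrary p, using scipy's t distribution instead of the hard-coded critical values; the `ters` argument is a stand-in for the per-sentence TER values that `tercom_statistics` returns.

    import numpy as np
    from scipy import stats

    def ter_confidence_interval(ters, p=0.05):
        # mean sentence-level TER with a two-sided t confidence interval
        ters = np.asarray(ters, dtype=float)
        mean = ters.mean()
        variance = ters.var(ddof=1)                   # unbiased sample variance, as in the script
        t = stats.t.ppf(1 - p / 2, df=len(ters) - 1)  # critical value for the requested p
        half_width = t * np.sqrt(variance / len(ters))
        return mean, half_width

    # usage with dummy TER values:
    # mean, d = ter_confidence_interval([43.2, 51.0, 38.7, 60.1], p=0.05)
    # print('{:.3f} +/- {:.3f}'.format(mean, d))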
-------------------------------------------------------------------------------- /scripts/post_editing/ter-stats.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | from translate.evaluation import tercom_statistics 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('source') 8 | parser.add_argument('target') 9 | 10 | parser.add_argument('--case-insensitive', '-i', action='store_true') 11 | 12 | 13 | if __name__ == '__main__': 14 | args = parser.parse_args() 15 | 16 | with open(args.source) as src_file, open(args.target) as trg_file: 17 | hypotheses = [line.strip() for line in src_file] 18 | references = [line.strip() for line in trg_file] 19 | 20 | total, _ = tercom_statistics(hypotheses, references, not args.case_insensitive) 21 | 22 | total['TER'] = total['ERRORS'] / total['REF_WORDS'] 23 | print(' '.join('{}={:.2f}'.format(k, v) for k, v in sorted(total.items()))) 24 | -------------------------------------------------------------------------------- /scripts/post_editing/to-sgm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import sys 5 | import numpy as np 6 | #from translate.evaluation import corpus_bleu, corpus_ter 7 | 8 | parser = argparse.ArgumentParser() 9 | # parser.add_argument('source1') 10 | # parser.add_argument('source2') 11 | # parser.add_argument('target') 12 | # 13 | # parser.add_argument('--bleu', action='store_true') 14 | # parser.add_argument('--max-size', type=int) 15 | # parser.add_argument('--case-insensitive', '-i', action='store_true') 16 | # 17 | # parser.add_argument('--draws', type=int, default=1000) 18 | # parser.add_argument('--sample-size', type=int, default=0) 19 | # parser.add_argument('-p', type=float, default=0.05) 20 | parser.add_argument('--set-type') 21 | parser.add_argument('--set-id') 22 | 23 | args = parser.parse_args() 24 | 25 | if args.set_type is not None: 26 | if args.set_id is None: 27 | args.set_id = 'dummy' 28 | 29 | print('<{} setid="{}" srclang="any" trglang="any">'.format(args.set_type, args.set_id)) 30 | 31 | print('<doc docid="dummy" sysid="{}">'.format(args.set_type)) 32 | for i, line in enumerate(sys.stdin, 1): 33 | print('<seg id="{}">{}</seg>'.format(i, line.strip())) 34 | print('</doc>') 35 | 36 | if args.set_type is not None: 37 | print('</{}>'.format(args.set_type)) 38 | -------------------------------------------------------------------------------- /scripts/post_editing/well-formed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | import string 4 | from signal import signal, SIGPIPE, SIG_DFL 5 | 6 | signal(SIGPIPE, SIG_DFL) 7 | 8 | punk = '.!?:")' 9 | 10 | def is_well_formed(line): 11 | if len(line) < 21: 12 | return False 13 | 14 | x = line[0] 15 | if not x.isdigit() and not (x.isalpha() and x.isupper()): 16 | return False 17 | if not line[-2] in punk: # last character is '\n' 18 | return False 19 | 20 | i = 0 21 | k = 0 22 | 23 | for c in line: 24 | if c == ' ': 25 | continue 26 | 27 | k += 1 28 | if c.isalpha(): 29 | i += 1 30 | 31 | j = 0 32 | prev = None 33 | for word in line.split(): 34 | if prev is not None and word == prev: 35 | j += 1 36 | if j > 3: 37 | return False 38 | else: 39 | prev = word 40 | j = 1 41 | 42 | return i >= 20 and i >= k * 0.75 43 | 44 | 45 | if __name__ == '__main__': 46 | for line in sys.stdin: 47 | if is_well_formed(line): 
48 | sys.stdout.write(line) 49 | -------------------------------------------------------------------------------- /scripts/reverse.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | 4 | for line in sys.stdin: 5 | print(' '.join(reversed(line.split()))) 6 | -------------------------------------------------------------------------------- /scripts/score.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import sys 5 | import os 6 | import re 7 | from collections import OrderedDict 8 | 9 | script_dir = os.path.dirname(os.path.abspath(__file__)) 10 | root_dir = os.path.dirname(script_dir) 11 | sys.path.append(root_dir) 12 | tercom_path = os.path.join(script_dir, 'tercom.jar') 13 | 14 | from translate.evaluation import corpus_bleu, corpus_ter, corpus_wer, corpus_cer, corpus_bleu1 15 | 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('source') 18 | parser.add_argument('target') 19 | parser.add_argument('--bleu', action='store_true') 20 | parser.add_argument('--ter', action='store_true') 21 | parser.add_argument('--wer', action='store_true') 22 | parser.add_argument('--cer', action='store_true') 23 | parser.add_argument('--bleu1', action='store_true') 24 | parser.add_argument('--all', '-a', action='store_true') 25 | parser.add_argument('--max-size', type=int) 26 | parser.add_argument('--no-punk', action='store_true') 27 | 28 | parser.add_argument('--max-len', type=int, default=0) 29 | parser.add_argument('--min-len', type=int, default=0) 30 | 31 | parser.add_argument('--case-insensitive', '-i', action='store_true') 32 | 33 | 34 | if __name__ == '__main__': 35 | args = parser.parse_args() 36 | 37 | if not args.max_len: 38 | args.max_len = float('inf') 39 | 40 | if not any([args.all, args.wer, args.ter, args.bleu, args.cer, args.bleu1]): 41 | args.all = True 42 | 43 | if args.all: 44 | args.wer = args.ter = args.bleu = args.bleu1 = True 45 | 46 | with open(args.source) as src_file, open(args.target) as trg_file: 47 | 48 | lines = [(src, trg) for src, trg in zip(src_file, trg_file) 49 | if args.min_len <= len(trg.split()) <= args.max_len] 50 | src_lines, trg_lines = zip(*lines) 51 | 52 | def transform(sentence): 53 | sentence = sentence.strip() 54 | sentence = re.sub(r'\s+', ' ', sentence) 55 | if args.case_insensitive: 56 | sentence = sentence.lower() 57 | if args.no_punk: 58 | sentence = re.sub(r'[,!;:?"\'\.]', '', sentence) 59 | sentence = re.sub(r'@@ ', '', sentence) 60 | sentence = re.sub(r'@@', '', sentence) 61 | return sentence 62 | 63 | hypotheses = list(map(transform, src_lines)) 64 | references = list(map(transform, trg_lines)) 65 | 66 | if args.max_size is not None: 67 | hypotheses = hypotheses[:args.max_size] 68 | references = references[:args.max_size] 69 | 70 | if len(hypotheses) != len(references): 71 | sys.stderr.write('warning: source and target don\'t have the same length\n') 72 | size = min(len(hypotheses), len(references)) 73 | hypotheses = hypotheses[:size] 74 | references = references[:size] 75 | 76 | scores = OrderedDict() 77 | if args.bleu: 78 | scores['bleu'], summary = corpus_bleu(hypotheses, references) 79 | try: 80 | scores['penalty'], scores['ratio'] = map(float, re.findall('\w+=(\d+.\d+)', summary)) 81 | except ValueError: 82 | pass 83 | if args.wer: 84 | scores['wer'], _ = corpus_wer(hypotheses, references) 85 | if args.ter: 86 | try: # java missing 87 | scores['ter'], _ = 
corpus_ter(hypotheses, references, tercom_path=tercom_path) 88 | except: 89 | scores['ter'] = 0 90 | if args.cer: 91 | scores['cer'], _ = corpus_cer(hypotheses, references) 92 | if args.bleu1: 93 | scores['bleu1'], _ = corpus_bleu1(hypotheses, references) 94 | 95 | print(' '.join('{}={:.2f}'.format(k, v) for k, v in scores.items())) 96 | -------------------------------------------------------------------------------- /scripts/shuf-corpus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | from __future__ import division 4 | import sys 5 | import random 6 | import argparse 7 | import shutil 8 | 9 | help_msg = """\ 10 | Shuffles a corpus. 11 | 12 | Usage example: 13 | shuf-corpus.py data/my_corpus data/my_corpus.shuf fr en 14 | """ 15 | 16 | if __name__ == '__main__': 17 | parser = argparse.ArgumentParser(description=help_msg, formatter_class=argparse.RawDescriptionHelpFormatter) 18 | 19 | parser.add_argument('corpus', help='name of the input corpus (path without extension, e.g. data/my_corpus)') 20 | parser.add_argument('--output', help='name of the output corpus (if not specified, input corpus is overwritten)') 21 | parser.add_argument('--seed', type=int) 22 | parser.add_argument('extensions', nargs='+', help='extensions (e.g. fr, en)') 23 | 24 | args = parser.parse_args() 25 | 26 | corpus = args.corpus 27 | 28 | if args.output is not None: 29 | output = args.output 30 | else: 31 | output = corpus 32 | 33 | input_files = ['{0}.{1}'.format(args.corpus, ext) for ext in args.extensions] 34 | output_files = ['{0}.{1}'.format(output, ext) for ext in args.extensions] 35 | 36 | # reads the whole contents into memory (might cause problems if the files are too large) 37 | # TODO: process files one by one 38 | contents = [] 39 | for filename in input_files: 40 | with open(filename) as f: 41 | contents.append(f.readlines()) 42 | 43 | indices = list(range(len(contents[0]))) 44 | random.seed(args.seed) 45 | random.shuffle(indices) 46 | 47 | contents = [[content[i] for i in indices] for content in contents] 48 | 49 | for filename, content in zip(output_files, contents): 50 | with open(filename, 'w') as f: 51 | f.writelines(content) 52 | -------------------------------------------------------------------------------- /scripts/speech/cat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import numpy as np 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument('inputs', nargs='+') 7 | parser.add_argument('output') 8 | parser.add_argument('-v', '--verbose', action='store_true') 9 | 10 | args = parser.parse_args() 11 | 12 | dim = None 13 | n = 0 14 | for filename in args.inputs: 15 | with open(filename, 'rb') as f: 16 | n_, dim_ = np.load(f) 17 | n += n_ 18 | assert dim is None or dim_ == dim, 'incompatible dimensions {} != {}'.format(dim_, dim) 19 | dim = dim_ 20 | 21 | if args.verbose: 22 | print('count: {}, dim: {}'.format(n, dim)) 23 | 24 | with open(args.output, 'wb') as output_file: 25 | np.save(output_file, (n, dim)) 26 | for filename in args.inputs: 27 | with open(filename, 'rb') as f: 28 | n_, _ = np.load(f) 29 | for _ in range(n_): 30 | feats = np.load(f) 31 | np.save(output_file, feats) 32 | 33 | -------------------------------------------------------------------------------- /scripts/speech/convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 
| import argparse 3 | import numpy as np 4 | import struct 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('input') 8 | parser.add_argument('output') 9 | 10 | args = parser.parse_args() 11 | 12 | with open(args.input, 'rb') as infile, open(args.output, 'wb') as outfile: 13 | lines, dim = struct.unpack('ii', infile.read(8)) 14 | np.save(outfile, (lines, dim)) 15 | 16 | for _ in range(lines): 17 | x = infile.read(4) 18 | frames, = struct.unpack('i', x) 19 | n = frames * dim 20 | x = infile.read(4 * n) 21 | feats = struct.unpack('f' * n, x) 22 | feats = np.array(feats).reshape(frames, dim) 23 | np.save(outfile, feats.astype(np.float32)) 24 | 25 | -------------------------------------------------------------------------------- /scripts/speech/extract-new.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import numpy as np 4 | import scipy.io.wavfile as wav 5 | import tarfile 6 | import sys 7 | from python_speech_features import mfcc, delta, fbank 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('inputs', nargs='+') 11 | parser.add_argument('output') 12 | 13 | parser.add_argument('--mfcc', action='store_true') 14 | parser.add_argument('--filters', type=int, default=40) 15 | parser.add_argument('--energy', action='store_true') 16 | parser.add_argument('--step-size', type=float, default=0.010) 17 | parser.add_argument('--win-size', type=float, default=0.025) 18 | parser.add_argument('--delta', action='store_true') 19 | parser.add_argument('--delta-delta', action='store_true') 20 | parser.add_argument('--window', default='hamming') 21 | parser.add_argument('--nfft', type=int, default=512) 22 | parser.add_argument('--low-freq', type=float, default=0) 23 | parser.add_argument('--high-freq', type=float) 24 | parser.add_argument('-v', '--verbose', action='store_true') 25 | 26 | args = parser.parse_args() 27 | 28 | if args.delta_delta: 29 | args.delta = True 30 | 31 | if args.window.lower().startswith('ham'): 32 | winfunc = np.hamming 33 | elif args.window.lower().startswith('han'): 34 | winfunc = np.hanning 35 | else: 36 | winfunc = lambda x: np.ones((x,)) 37 | 38 | params = dict( 39 | winlen=args.win_size, 40 | winstep=args.step_size, 41 | nfilt=args.filters, 42 | preemph=0, 43 | winfunc=winfunc, 44 | lowfreq=args.low_freq, 45 | highfreq=args.high_freq, 46 | nfft=args.nfft) 47 | 48 | outfile = open(args.output, 'wb') 49 | 50 | total = 0 51 | for filename in args.inputs: 52 | tar = tarfile.open(filename) 53 | files = [f for f in tar.getmembers() if f.isfile()] 54 | total += len(files) 55 | 56 | dim = min(12, args.filters - 1) if args.mfcc else args.filters 57 | if args.delta_delta: 58 | dim *= 3 59 | elif args.delta: 60 | dim *= 2 61 | if args.energy: 62 | dim += 1 63 | if args.verbose: 64 | print('count: {}, dim: {}'.format(total, dim)) 65 | 66 | np.save(outfile, (total, dim)) 67 | 68 | i = 1 69 | for filename in args.inputs: 70 | tar = tarfile.open(filename) 71 | files = [f for f in tar.getmembers() if f.isfile()] 72 | files = sorted(files, key=lambda f: f.name) 73 | 74 | for fileinfo in files: 75 | with tar.extractfile(fileinfo) as f: 76 | rate, data = wav.read(f) 77 | 78 | if args.mfcc: 79 | feats = mfcc(data, rate, ceplifter=0, **params) 80 | energy = feats[:,:1] 81 | feats = feats[:,1:] 82 | else: 83 | feats, energy = fbank(data, rate, **params) 84 | feats = np.log(feats) 85 | energy = np.expand_dims(np.log(energy), axis=1) 86 | 87 | if args.delta: 88 | d1 = delta(feats, 2) 
89 | feats = np.concatenate([feats, d1], axis=1) 90 | if args.delta_delta: 91 | d2 = delta(d1, 2) 92 | feats = np.concatenate([feats, d2], axis=1) 93 | 94 | if args.energy: 95 | feats = np.concatenate([energy, feats], axis=1) 96 | 97 | np.save(outfile, feats) 98 | if args.verbose and i % 10 == 0: 99 | sys.stdout.write('\rfiles processed: {}'.format(i)) 100 | i += 1 101 | 102 | if args.verbose: 103 | print('\rfiles processed: {}'.format(i)) 104 | 105 | outfile.close() 106 | 107 | -------------------------------------------------------------------------------- /scripts/speech/extract.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | from __future__ import division 4 | import argparse 5 | import numpy as np 6 | import yaafelib 7 | import tarfile 8 | import tempfile 9 | import os 10 | from collections import Counter 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('inputs', nargs='+', help='tar archive which contains all the wav files') 14 | parser.add_argument('output', help='output file') 15 | parser.add_argument('--derivatives', action='store_true') 16 | 17 | args = parser.parse_args() 18 | 19 | parameters = dict( 20 | step_size=160, # corresponds to 10 ms (at 16 kHz) 21 | block_size=640, # corresponds to 40 ms 22 | mfcc_coeffs=40, 23 | mfcc_filters=41 # more filters? (needs to be at least mfcc_coeffs+1, because first coeff is ignored) 24 | ) 25 | 26 | # TODO: ensure that all input files use this rate 27 | fp = yaafelib.FeaturePlan(sample_rate=16000) 28 | 29 | mfcc_features = 'MFCC MelNbFilters={mfcc_filters} CepsNbCoeffs={mfcc_coeffs} ' \ 30 | 'blockSize={block_size} stepSize={step_size}'.format(**parameters) 31 | energy_features = 'Energy blockSize={block_size} stepSize={step_size}'.format(**parameters) 32 | 33 | fp.addFeature('mfcc: {}'.format(mfcc_features)) 34 | if args.derivatives: 35 | fp.addFeature('mfcc_d1: {} > Derivate DOrder=1'.format(mfcc_features)) 36 | fp.addFeature('mfcc_d2: {} > Derivate DOrder=2'.format(mfcc_features)) 37 | 38 | fp.addFeature('energy: {}'.format(energy_features)) 39 | if args.derivatives: 40 | fp.addFeature('energy_d1: {} > Derivate DOrder=1'.format(energy_features)) 41 | fp.addFeature('energy_d2: {} > Derivate DOrder=2'.format(energy_features)) 42 | 43 | if args.derivatives: 44 | keys = ['mfcc', 'mfcc_d1', 'mfcc_d2', 'energy', 'energy_d1', 'energy_d2'] 45 | else: 46 | keys = ['mfcc', 'energy'] 47 | 48 | df = fp.getDataFlow() 49 | engine = yaafelib.Engine() 50 | engine.load(df) 51 | afp = yaafelib.AudioFileProcessor() 52 | 53 | frame_counter = Counter() 54 | 55 | outfile = open(args.output, 'wb') 56 | 57 | total = 0 58 | for filename in args.inputs: 59 | tar = tarfile.open(filename) 60 | total += len([f for f in tar if f.isfile()]) 61 | 62 | _, tmp_file = tempfile.mkstemp() 63 | 64 | for j, filename in enumerate(args.inputs): 65 | tar = tarfile.open(filename) 66 | files = sorted([f for f in tar if f.isfile()], key=lambda f: f.name) 67 | 68 | for i, fileinfo in enumerate(files): 69 | file_ = tar.extractfile(fileinfo) 70 | with open(tmp_file, 'wb') as f: 71 | f.write(file_.read()) 72 | 73 | afp.processFile(engine, tmp_file) 74 | feats = engine.readAllOutputs() 75 | feats = np.concatenate([feats[k] for k in keys], axis=1) 76 | frames, dim = feats.shape 77 | 78 | feats = feats.astype(np.float32) 79 | 80 | if frames == 0: 81 | print(frames, dim, fileinfo.name) 82 | raise Exception 83 | 84 | if i == 0 and j == 0: 85 | np.save(outfile, (total, dim)) 
86 | 87 | np.save(outfile, feats) 88 | 89 | outfile.close() 90 | os.remove(tmp_file) 91 | -------------------------------------------------------------------------------- /scripts/speech/head.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import numpy as np 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument('input') 7 | parser.add_argument('output') 8 | parser.add_argument('-n', type=int, default=10) 9 | 10 | args = parser.parse_args() 11 | 12 | with open(args.input, 'rb') as input_file, open(args.output, 'wb') as output_file: 13 | n, dim = np.load(input_file) 14 | n = min(args.n, n) 15 | np.save(output_file, (n, dim)) 16 | for _ in range(n): 17 | feats = np.load(input_file) 18 | np.save(output_file, feats) 19 | 20 | -------------------------------------------------------------------------------- /scripts/speech/python_speech_features/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import * 2 | -------------------------------------------------------------------------------- /scripts/speech/shuf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import random 4 | import numpy as np 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('input') 8 | parser.add_argument('--output') 9 | parser.add_argument('-n', type=int, default=0) 10 | 11 | parser.add_argument('--input-txt', nargs='*') 12 | parser.add_argument('--output-txt', nargs='*') 13 | 14 | args = parser.parse_args() 15 | 16 | if not args.output: 17 | args.output = args.input 18 | if args.input_txt and not args.output_txt: 19 | args.output_txt = args.input_txt 20 | 21 | with open(args.input, 'rb') as input_file: 22 | n, dim = np.load(input_file) 23 | 24 | indices = list(range(n)) 25 | random.shuffle(indices) 26 | 27 | if args.n > 0: 28 | indices = indices[:args.n] 29 | 30 | frames = [] 31 | 32 | for _ in range(n): 33 | feats = np.load(input_file) 34 | frames.append(feats) 35 | 36 | with open(args.output, 'wb') as output_file: 37 | np.save(output_file, (len(indices), dim)) 38 | for index in indices: 39 | feats = frames[index] 40 | np.save(output_file, feats) 41 | 42 | if args.input_txt and args.output_txt: 43 | lines = [] 44 | for input_filename in args.input_txt: 45 | with open(input_filename) as input_file: 46 | lines.append(input_file.readlines()) 47 | 48 | for lines_, output_filename in zip(lines, args.output_txt): 49 | with open(output_filename, 'w') as output_file: 50 | for index in indices: 51 | line = lines_[index] 52 | output_file.write(line) 53 | 54 | -------------------------------------------------------------------------------- /scripts/split-corpus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import os 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('filename') 8 | parser.add_argument('dest') 9 | parser.add_argument('--splits', type=int, required=True) 10 | parser.add_argument('--tokens', action='store_true') 11 | 12 | args = parser.parse_args() 13 | 14 | os.makedirs(args.dest, exist_ok=True) 15 | 16 | with open(args.filename) as input_file: 17 | if args.tokens: 18 | total_size = sum(len(line.split()) for line in input_file) 19 | else: 20 | total_size = sum(1 for line in input_file) 21 | 22 | input_file.seek(0) 23 | 24 | shard_size = total_size // 
args.splits 25 | 26 | for i in range(args.splits): 27 | filename = os.path.join(args.dest, str(i + 1).zfill(len(str(args.splits)))) 28 | 29 | with open(filename, 'w') as output_file: 30 | 31 | this_size = 0 32 | for line in input_file: 33 | line_size = len(line.split()) if args.tokens else 1 34 | this_size += line_size 35 | 36 | output_file.write(line) 37 | 38 | if this_size >= shard_size and i < args.splits - 1: 39 | break 40 | -------------------------------------------------------------------------------- /scripts/stats-bleu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import sys 5 | import numpy as np 6 | import re 7 | from translate.evaluation import corpus_bleu, corpus_ter, corpus_wer 8 | from collections import OrderedDict 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('source') 12 | parser.add_argument('target') 13 | parser.add_argument('--bleu', action='store_true') 14 | #parser.add_argument('--ter', action='store_true') 15 | #parser.add_argument('--wer', action='store_true') 16 | #parser.add_argument('--all', '-a', action='store_true') 17 | parser.add_argument('--max-size', type=int) 18 | parser.add_argument('--case-insensitive', '-i', action='store_true') 19 | 20 | parser.add_argument('--draws', type=int, default=1000) 21 | parser.add_argument('--sample-size', type=int, default=0) 22 | parser.add_argument('-p', type=float, default=0.05) 23 | 24 | 25 | if __name__ == '__main__': 26 | args = parser.parse_args() 27 | 28 | with open(args.source) as src_file, open(args.target) as trg_file: 29 | if args.case_insensitive: 30 | hypotheses = [line.strip().lower() for line in src_file] 31 | references = [line.strip().lower() for line in trg_file] 32 | else: 33 | hypotheses = [line.strip() for line in src_file] 34 | references = [line.strip() for line in trg_file] 35 | 36 | if args.max_size is not None: 37 | hypotheses = hypotheses[:args.max_size] 38 | references = references[:args.max_size] 39 | 40 | if len(hypotheses) != len(references): 41 | sys.stderr.write('warning: source and target don\'t have the same length\n') 42 | size = min(len(hypotheses), len(references)) 43 | hypotheses = hypotheses[:size] 44 | references = references[:size] 45 | 46 | indices = np.arange(len(hypotheses)) 47 | if args.sample_size == 0: 48 | args.sample_size = len(hypotheses) 49 | 50 | bleu_scores = [] 51 | hypotheses = np.array(hypotheses) 52 | references = np.array(references) 53 | 54 | for _ in range(args.draws): 55 | indices = np.random.randint(len(hypotheses), size=args.sample_size) 56 | hypotheses_ = hypotheses[indices] 57 | references_ = references[indices] 58 | 59 | bleu, _ = corpus_bleu(hypotheses_, references_) 60 | bleu_scores.append(bleu) 61 | 62 | bleu_scores = sorted(bleu_scores) 63 | k = int(len(bleu_scores) * args.p) // 2 # FIXME 64 | 65 | bleu_scores = bleu_scores[k:len(bleu_scores) - k] 66 | 67 | print('[{:.3f}, {:.3f}]'.format(bleu_scores[0], bleu_scores[-1])) 68 | -------------------------------------------------------------------------------- /scripts/stats.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | from collections import Counter, namedtuple, OrderedDict 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument('filename') 7 | parser.add_argument('--lower', action='store_true') 8 | parser.add_argument('--count-whitespaces', action='store_true') 9 | parser.add_argument('-c', 
'-b', '--chars', action='store_true', help='display char info') 10 | parser.add_argument('-l', '--lines', action='store_true', help='display line count') 11 | parser.add_argument('-w', '--words', action='store_true', help='display word info') 12 | parser.add_argument('-a', '--all', action='store_true', 13 | help='display all info and more (large memory usage)') 14 | 15 | args = parser.parse_args() 16 | 17 | if not args.chars and not args.lines and not args.words or args.all: 18 | args.chars = args.words = args.lines = True 19 | 20 | word_counts = Counter() 21 | char_counts = Counter() 22 | 23 | word_dict = Counter() 24 | char_dict = Counter() 25 | 26 | line_dict = Counter() 27 | lines = 0 28 | 29 | with open(args.filename) as f: 30 | for line in f: 31 | if args.lower: 32 | line = line.lower() 33 | 34 | if args.words: 35 | words = line.split() 36 | word_counts[len(words)] += 1 37 | for word in words: 38 | word_dict[word] += 1 39 | 40 | if args.chars: 41 | chars = line 42 | if not args.count_whitespaces: 43 | chars = line.strip().replace(' ', '') 44 | 45 | char_counts[len(chars)] += 1 46 | for char in chars: 47 | char_dict[char] += 1 48 | 49 | lines += 1 50 | if args.all: 51 | line_dict[line] += 1 52 | 53 | 54 | def info_dict(title, counter): 55 | total = sum(counter.values()) 56 | unique = len(counter) 57 | avg = total / unique 58 | min_ = min(counter.values()) 59 | max_ = max(counter.values()) 60 | 61 | cumulative_count = 0 62 | coverage = OrderedDict([(90, 0), (95, 0), (99, 0)]) 63 | 64 | for i, pair in enumerate(counter.most_common(), 1): 65 | _, count = pair 66 | cumulative_count += count 67 | 68 | for percent, count in coverage.items(): 69 | if count == 0 and cumulative_count * 100 >= percent * total: 70 | coverage[percent] = i 71 | 72 | summary = [ 73 | '{}\n{}'.format(title, '-' * len(title)), 74 | 'Total: {}'.format(total), 75 | 'Unique: {}'.format(unique), 76 | 'Minimum: {}'.format(min_), 77 | 'Maximum: {}'.format(max_), 78 | 'Average: {:.1f}'.format(avg) 79 | ] 80 | 81 | for percent, count in coverage.items(): 82 | summary.append('{}% cov: {}'.format(percent, count)) 83 | 84 | return '\n '.join(summary) + '\n' 85 | 86 | 87 | def info_lengths(title, counter): 88 | total = sum(counter.values()) 89 | avg = sum(k * v for k, v in counter.items()) / total 90 | 91 | coverage = OrderedDict([(1, 0), (5, 0), (10, 0), 92 | (50, 0), (90, 0), (95, 0), (99, 0)]) 93 | 94 | cumulative_count = 0 95 | prev_k = 0 96 | 97 | for k, v in sorted(counter.items()): 98 | cumulative_count += v 99 | 100 | for percent, count in coverage.items(): 101 | if count == 0 and cumulative_count * 100 >= percent * total: 102 | coverage[percent] = prev_k if percent < 50 else k 103 | 104 | prev_k = k 105 | 106 | summary = [ 107 | '{}\n{}'.format(title, '-' * len(title)), 108 | 'Minimum: {}'.format(min(counter)), 109 | 'Maximum: {}'.format(max(counter)), 110 | 'Average: {:.1f}'.format(avg), 111 | ] 112 | 113 | for percent, count in coverage.items(): 114 | summary.append('{}{:2d}%: {}'.format('<=' if percent < 50 else '>=', percent, count)) 115 | 116 | return '\n '.join(summary) + '\n' 117 | 118 | 119 | if args.lines: 120 | print("Lines\n-----\n Total: {}".format(lines)) 121 | 122 | if args.all: 123 | summary = [ 124 | 'Unique: {}'.format(len(line_dict)), 125 | 'Average: {:.2f}'.format(lines / len(line_dict)) 126 | ] 127 | print(' ' + '\n '.join(summary)) 128 | 129 | print() 130 | 131 | if args.words: 132 | print(info_lengths('Words per line', word_counts)) 133 | print(info_dict('Words', word_dict)) 134 | 135 | if 
args.chars: 136 | print(info_lengths('Chars per line', char_counts)) 137 | print(info_dict('Chars', char_dict)) 138 | -------------------------------------------------------------------------------- /scripts/tercom.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alex-berard/seq2seq/1b5c6bf19a39ef27c059811b85a061f20d68ac32/scripts/tercom.jar -------------------------------------------------------------------------------- /scripts/train-moses.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | if [[ $# -lt 8 ]] 6 | then 7 | echo "wrong number of arguments supplied: $#" 8 | exit 0 9 | fi 10 | 11 | if [ -z ${MOSES} ] || [ -z ${GIZA} ] 12 | then 13 | echo "variables MOSES and/or GIZA undefined" 14 | exit 0 15 | fi 16 | 17 | model_dir=`readlink -f $1` 18 | data_dir=`readlink -f $2` 19 | corpus=${data_dir}/$3 20 | dev_corpus=${data_dir}/$4 21 | src_ext=$5 22 | trg_ext=$6 23 | lm_path=${data_dir}/$7 24 | lm_corpus=`basename ${lm_path}` 25 | lm_order=$8 26 | cores=`lscpu | grep -Po "^(CPU\(s\)|Processeur\(s\)).?:\s+\K\d+$"` 27 | 28 | echo "training on ${cores} CPUs" 29 | 30 | rm -rf ${model_dir} 31 | mkdir -p ${model_dir} 32 | 33 | echo "training language model, corpus=${lm_corpus}, order=${lm_order}" | ts 34 | ${MOSES}/bin/lmplz -o ${lm_order} --discount_fallback < ${lm_path}.${trg_ext} > ${model_dir}/${lm_corpus}.${trg_ext}.arpa 2>${model_dir}/train.log 35 | 36 | echo "training moses, corpus=${corpus}" | ts 37 | ${MOSES}/scripts/training/train-model.perl -root-dir ${model_dir} \ 38 | -corpus ${corpus} -f ${src_ext} -e ${trg_ext} -alignment grow-diag-final-and \ 39 | -reordering msd-bidirectional-fe -lm 0:${lm_order}:${model_dir}/${lm_corpus}.${trg_ext}.arpa \ 40 | -mgiza -external-bin-dir ${GIZA} \ 41 | -mgiza-cpus ${cores} -cores ${cores} --parallel 2>&1 | ts >> ${model_dir}/train.log 42 | 43 | echo "tuning moses, corpus=${dev_corpus}" | ts 44 | ${MOSES}/scripts/training/mert-moses.pl ${dev_corpus}.${src_ext} ${dev_corpus}.${trg_ext} \ 45 | ${MOSES}/bin/moses ${model_dir}/model/moses.ini --mertdir ${MOSES}/bin/ \ 46 | --decoder-flags="-threads ${cores}" --working-dir ${model_dir}/mert-work 2>&1 | ts > ${model_dir}/tuning.log 47 | 48 | echo "finished" | ts 49 | mv ${model_dir}/mert-work/moses.ini ${model_dir}/moses.tuned.ini 50 | rm -rf ${model_dir}/mert-work 51 | 52 | -------------------------------------------------------------------------------- /scripts/vocab-stats.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import argparse 5 | from collections import Counter 6 | 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('hyp') 9 | parser.add_argument('--reference') 10 | parser.add_argument('--source') 11 | parser.add_argument('--max', type=int) 12 | 13 | args = parser.parse_args() 14 | 15 | if args.reference is not None: 16 | with open(args.reference) as ref_file: 17 | ref_lines = [line.split() for line in ref_file] 18 | ref_words = list(map(Counter, ref_lines)) 19 | else: 20 | ref_words = None 21 | ref_lines = None 22 | 23 | if args.source is not None: 24 | with open(args.source) as src_file: 25 | src_lines = [line.split() for line in src_file] 26 | else: 27 | src_lines = None 28 | 29 | total = Counter() 30 | ok = Counter() 31 | del_counts = Counter() 32 | ok_del_counts = Counter() 33 | 34 | def extract_deletes(ops, src_words): 35 | i = 0 36 | 
deletes = [] 37 | 38 | for op in ops: 39 | if op == '<KEEP>': 40 | i += 1 41 | elif op == '<DEL>': 42 | deletes.append(src_words[i]) 43 | 44 | return deletes 45 | 46 | with open(args.hyp) as hyp_file: 47 | for i, line in enumerate(hyp_file): 48 | if ref_words and i >= len(ref_words): 49 | break 50 | 51 | if src_lines and i < len(src_lines): 52 | hyp_del = Counter(extract_deletes(line.split(), src_lines[i])) 53 | del_counts += hyp_del 54 | 55 | if ref_lines: 56 | ref_del = Counter(extract_deletes(ref_lines[i], src_lines[i])) 57 | ok_del_counts += Counter( 58 | dict((w, min(c, ref_del[w])) 59 | for w, c in hyp_del.items()) 60 | ) 61 | 62 | words = Counter(line.split()) 63 | total += words 64 | 65 | if ref_words: 66 | ref = ref_words[i] 67 | ok += Counter(dict((w, min(c, ref[w])) for w, c in words.items())) 68 | 69 | total_count = sum(total.values()) 70 | 71 | precision_header = ' {:8}'.format('precision') if args.reference else '' 72 | header = '{:15} {:8} {:8}'.format('word', 'count', 'percentage') + precision_header 73 | print(header) 74 | 75 | for w, c in total.most_common(args.max): 76 | precision = ' {:8.2f}%'.format(100 * ok[w] / c) if args.reference else '' 77 | 78 | print('{:15} {:8} {:8.2f}%'.format(w, c, 100 * c / total_count) + precision) 79 | 80 | if del_counts: 81 | print('\nMost deleted words') 82 | for w, c in del_counts.most_common(args.max): 83 | precision = ' {:8.2f}%'.format(100 * ok_del_counts[w] / c) if args.source else '' 84 | 85 | print('{:15} {:8} {:8.2f}%'.format(w, c, 100 * c / sum(del_counts.values())) + precision) -------------------------------------------------------------------------------- /seq2seq.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #CUDA_VISIBLE_DEVICES="" 3 | 4 | /usr/bin/env python3 -m translate "$@" 5 | -------------------------------------------------------------------------------- /translate/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alex-berard/seq2seq/1b5c6bf19a39ef27c059811b85a061f20d68ac32/translate/__init__.py -------------------------------------------------------------------------------- /translate/multitask_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from translate import utils 3 | from translate.translation_model import TranslationModel 4 | 5 | 6 | class MultiTaskModel: 7 | def __init__(self, tasks, **kwargs): 8 | self.models = [] 9 | self.ratios = [] 10 | 11 | for i, task in enumerate(tasks, 1): 12 | if task.name is None: 13 | task.name = 'task_{}'.format(i) 14 | 15 | # merging both dictionaries (task parameters have a higher precedence) 16 | kwargs_ = dict(**kwargs) 17 | kwargs_.update(task) 18 | model = TranslationModel(**kwargs_) 19 | 20 | self.models.append(model) 21 | self.ratios.append(task.ratio if task.ratio is not None else 1) 22 | 23 | self.main_model = self.models[0] 24 | self.ratios = [ratio / sum(self.ratios) for ratio in self.ratios] # unit normalization 25 | 26 | def train(self, **kwargs): 27 | for model in self.models: 28 | utils.log('initializing {}'.format(model.name)) 29 | model.init_training(**kwargs) 30 | 31 | utils.log('starting training') 32 | while True: 33 | i = np.random.choice(len(self.models), 1, p=self.ratios)[0] 34 | model = self.models[i] 35 | try: 36 | model.train_step(**kwargs) 37 | except (utils.FinishedTrainingException, KeyboardInterrupt): 38 | utils.log('exiting...') 39 | 
self.main_model.save() 40 | return 41 | except utils.EvalException: 42 | if i == 0: 43 | model.save() 44 | step, score = model.training.scores[-1] 45 | model.manage_best_checkpoints(step, score) 46 | except utils.CheckpointException: 47 | if i == 0: # only save main model (includes all variables) 48 | model.save() 49 | step, score = model.training.scores[-1] 50 | model.manage_best_checkpoints(step, score) 51 | 52 | def decode(self, *args, **kwargs): 53 | self.main_model.decode(*args, **kwargs) 54 | 55 | def evaluate(self, *args, **kwargs): 56 | return self.main_model.evaluate(*args, **kwargs) 57 | 58 | def align(self, *args, **kwargs): 59 | self.main_model.align(*args, **kwargs) 60 | 61 | def initialize(self, *args, **kwargs): 62 | self.main_model.initialize(*args, **kwargs) 63 | 64 | def save(self, *args, **kwargs): 65 | self.main_model.save(*args, **kwargs) 66 | --------------------------------------------------------------------------------
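Closing note on translate/multitask_model.py: training interleaves the tasks by drawing one model per step with probability proportional to its configured ratio (the ratios are normalized to sum to 1 in the constructor). The snippet below is a minimal, self-contained sketch of that sampling scheme only; the task names and ratios are hypothetical, and the print statement stands in for calling train_step on the selected TranslationModel.

    import numpy as np

    tasks = {'ASR': 1.0, 'MT': 1.0, 'AST': 2.0}       # hypothetical task ratios
    names = list(tasks)
    ratios = np.array([tasks[name] for name in names], dtype=float)
    ratios /= ratios.sum()                            # unit normalization, as in MultiTaskModel.__init__

    rng = np.random.default_rng(seed=0)
    for step in range(10):
        i = rng.choice(len(names), p=ratios)          # one task sampled per training step
        print(step, names[i])                         # stand-in for models[i].train_step(...)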