├── eval
│   ├── hyp
│   │   └── readme.md
│   └── eval.sh
├── models
│   └── readme.md
├── data
│   ├── corpus
│   │   └── readme.md
│   ├── postprocess.sh
│   ├── nonbreaking_prefixes
│   │   ├── README.txt
│   │   ├── nonbreaking_prefix.ro
│   │   ├── nonbreaking_prefix.sv
│   │   ├── nonbreaking_prefix.ca
│   │   ├── nonbreaking_prefix.sl
│   │   ├── nonbreaking_prefix.es
│   │   ├── nonbreaking_prefix.lv
│   │   ├── nonbreaking_prefix.fr
│   │   ├── nonbreaking_prefix.en
│   │   ├── nonbreaking_prefix.fi
│   │   ├── nonbreaking_prefix.hu
│   │   ├── nonbreaking_prefix.nl
│   │   ├── nonbreaking_prefix.is
│   │   ├── nonbreaking_prefix.it
│   │   ├── nonbreaking_prefix.ru
│   │   ├── nonbreaking_prefix.pl
│   │   ├── nonbreaking_prefix.pt
│   │   ├── nonbreaking_prefix.ta
│   │   ├── nonbreaking_prefix.de
│   │   ├── nonbreaking_prefix.cs
│   │   └── nonbreaking_prefix.sk
│   ├── length.py
│   ├── strip_sgml.py
│   ├── merge.sh
│   ├── preprocess.sh
│   ├── build_dictionary.py
│   ├── shuffle.py
│   └── multi-bleu.perl
├── nematus
│   ├── metrics
│   │   ├── __init__.py
│   │   ├── reference.py
│   │   ├── test_sentence_bleu.py
│   │   ├── test_scorer_provider.py
│   │   ├── scorer.py
│   │   ├── scorer_provider.py
│   │   ├── scorer_interpolator.py
│   │   ├── test_chrf.py
│   │   ├── beer.py
│   │   ├── sentence_bleu.py
│   │   ├── meteor.py
│   │   └── chrf.py
│   ├── shuffle.py
│   ├── __init__.py
│   ├── training_progress.py
│   ├── initializers.py
│   ├── util.py
│   ├── compat.py
│   ├── theano_util.py
│   ├── hypgraph.py
│   ├── data_iterator.py
│   ├── score.py
│   ├── rescore.py
│   ├── optimizers.py
│   ├── domain_interpolation_data_iterator.py
│   └── alignment_util.py
├── test
│   ├── .gitignore
│   ├── en-de
│   │   ├── in
│   │   ├── references
│   │   ├── ref_score
│   │   └── ref
│   ├── en-ro
│   │   ├── in
│   │   ├── references
│   │   ├── ref_score
│   │   └── ref
│   ├── README.md
│   ├── test_train.sh
│   ├── test_train_bigmem.sh
│   ├── test_train_verybigmem.sh
│   ├── test_train_domaininterpolation.sh
│   ├── test_score.py
│   ├── test_translate.py
│   └── data
│       ├── indomain-dev.en
│       └── indomain-dev.de
├── .gitignore
├── train_reverse_model.sh
├── runnmt.sh
├── runnmt_l8-fce.sh
├── runnmt_l8-fce-giga.sh
├── Dockerfile.cpu
├── setup.py
├── Dockerfile.gpu
├── LICENSE
├── doc
│   └── factored_neural_machine_translation.md
└── utils
    ├── copy_unknown_words.py
    ├── visualize_probs.py
    ├── plot_heatmap.py
    └── attention_web.php

/eval/hyp/readme.md:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/models/readme.md:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/data/corpus/readme.md:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/nematus/metrics/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/nematus/shuffle.py:
--------------------------------------------------------------------------------
../data/shuffle.py
--------------------------------------------------------------------------------
/test/.gitignore:
--------------------------------------------------------------------------------
*/out*
models
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
build
dist
nmt.egg-info
--------------------------------------------------------------------------------
/nematus/__init__.py:
--------------------------------------------------------------------------------
from nematus import *
import rescore
import translate
--------------------------------------------------------------------------------
/data/postprocess.sh:
--------------------------------------------------------------------------------
#!/bin/sh

# merges subword units that were split by BPE

sed -r 's/\@\@ //g'
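
# the sed above reads stdin and writes stdout; a typical invocation
# (file names are only illustrative) would be:
#   ./postprocess.sh < hyp.bpe > hyp.txt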
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/README.txt:
--------------------------------------------------------------------------------
The language suffix can be found here:

http://www.loc.gov/standards/iso639-2/php/code_list.php

This code includes data from Daniel Naber's Language Tools (Czech abbreviations).
This code includes data from the Czech Wiktionary (also Czech abbreviations).


--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.ro:
--------------------------------------------------------------------------------
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
dpdv
etc
șamd
M.Ap.N
dl
Dl
d-na
D-na
dvs
Dvs
pt
Pt
--------------------------------------------------------------------------------
/data/length.py:
--------------------------------------------------------------------------------
import numpy
import sys

# print the max/min/average sentence length (in tokens) for each input file
for name in sys.argv[1:]:
    lens = []
    with open(name, 'r') as f:
        for ll in f:
            lens.append(len(ll.strip().split(' ')))
    print name, ' max ', numpy.max(lens), ' min ', numpy.min(lens), ' average ', numpy.mean(lens)
--------------------------------------------------------------------------------
/data/strip_sgml.py:
--------------------------------------------------------------------------------
import sys
import re


def main():
    fin = sys.stdin
    fout = sys.stdout
    for l in fin:
        line = l.strip()
        text = re.sub('<[^<]+>', "", line).strip()
        if len(text) == 0:
            continue
        print >>fout, text


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.sv:
--------------------------------------------------------------------------------
#single upper-case letters are usually initials
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
#misc abbreviations
AB
G
VG
dvs
etc
from
iaf
jfr
kl
kr
mao
mfl
mm
osv
pga
tex
tom
vs
--------------------------------------------------------------------------------
/eval/eval.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# translate
THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=gpu1 python nematus/translate.py --models models/mle-l8-fce-giga.npz --input data/corpus/dev.src > hyp/hyp.tmp

# recaser

# capitalize the first char (if necessary)

# copy src sentences when longer than 50 words (do nothing)

# run GLEU.
python2 ../jfleg/eval/gleu.py -r ../jfleg/dev/dev.ref[0-3] -s ../jfleg/dev/dev.src --hyp ./hyp/hyp.tmp
--------------------------------------------------------------------------------
/data/merge.sh:
--------------------------------------------------------------------------------
#!/bin/bash


SRC=$1
TRG=$2

FSRC=all_${1}-${2}.${1}
FTRG=all_${1}-${2}.${2}

# truncate the output file rather than seeding it with an empty line,
# which would otherwise end up in the merged corpus
> $FSRC
for F in *${1}-${2}.${1}
do
    if [ "$F" = "$FSRC" ]; then
        echo "pass"
    else
        cat $F >> $FSRC
    fi
done


> $FTRG
for F in *${1}-${2}.${2}
do
    if [ "$F" = "$FTRG" ]; then
        echo "pass"
    else
        cat $F >> $FTRG
    fi
done
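
# a hypothetical invocation, run in a directory containing files such as
# europarl.fr-en.fr / europarl.fr-en.en:
#   ../merge.sh fr en
# concatenates every *fr-en.fr into all_fr-en.fr and every *fr-en.en into
# all_fr-en.en, keeping the two sides parallel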
--------------------------------------------------------------------------------
/test/en-de/in:
--------------------------------------------------------------------------------
a Republican strategy to counter the re-election of Obama
Republican leaders justified their policy by the need to combat electoral fraud .
however , the Brenn@@ an Centre considers this a myth , stating that electoral fraud is rar@@ er in the United States than the number of people killed by lightning .
indeed , Republican lawyers identified only 300 cases of electoral fraud in the United States in a decade .
one thing is certain : these new provisions will have a negative impact on voter turn@@ -out .
--------------------------------------------------------------------------------
/train_reverse_model.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# train from trg to src!
THEANO_FLAGS=mode=FAST_RUN,floatX=float32,optimizer_including=cudnn,device=gpu0 python nematus/nmt.py \
    --datasets ./data/corpus/train-esl.trg ./data/corpus/train-esl.src \
    --dictionaries ./data/corpus/train-esl.trg.json ./data/corpus/train-esl.src.json \
    --valid_datasets ./data/corpus/dev.trg ./data/corpus/dev.src \
    --model models/reverse.npz \
    --use_dropout \
    --maxlen 50 \
    --optimizer adam \
    --lrate 0.0001 \
    --batch_size 40 \
    --n_words_src 35000 \
    --n_words 35000
--------------------------------------------------------------------------------
/runnmt.sh:
--------------------------------------------------------------------------------
#!/bin/bash

#THEANO_FLAGS=mode=FAST_RUN,floatX=float32,optimizer_including=cudnn,device=gpu0 python nematus/nmt.py \
THEANO_FLAGS=mode=FAST_RUN,floatX=float32,optimizer_including=cudnn,device=gpu python nematus/nmt.py \
    --datasets data/corpus/train.lang8.src.shuf data/corpus/train.lang8.trg.shuf \
    --dictionaries data/corpus/train.lang8.src.shuf.json data/corpus/train.lang8.trg.shuf.json \
    --model models/mle.npz \
    --use_dropout \
    --maxlen 50 \
    --optimizer adam \
    --lrate 0.0001 \
    --batch_size 40 \
    --n_words_src 35000 \
    --n_words 35000
--------------------------------------------------------------------------------
/nematus/training_progress.py:
--------------------------------------------------------------------------------
'''
Training progress
'''

import sys
import json

import util

class TrainingProgress(object):
    '''
    Object used to store, serialize and deserialize pure python variables that change during training
    and should be preserved in order to properly restart the training process
    '''

    def load_from_json(self, file_name):
        self.__dict__.update(util.unicode_to_utf8(json.load(open(file_name, 'rb'))))

    def save_to_json(self, file_name):
        json.dump(self.__dict__, open(file_name, 'wb'), indent=2)
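
# a minimal usage sketch; the attribute names below are illustrative, since
# the training loop is free to store whatever fields it needs in __dict__:
#   progress = TrainingProgress()
#   progress.uidx = 0
#   progress.save_to_json('model.npz.progress.json')
#   progress.load_from_json('model.npz.progress.json')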
--------------------------------------------------------------------------------
/runnmt_l8-fce.sh:
--------------------------------------------------------------------------------
#!/bin/bash

#THEANO_FLAGS=mode=FAST_RUN,floatX=float32,optimizer_including=cudnn,device=gpu0 python nematus/nmt.py \
THEANO_FLAGS=mode=FAST_RUN,floatX=float32,optimizer_including=cudnn,device=gpu python nematus/nmt.py \
    --datasets data/corpus/train.l8-fce.src data/corpus/train.l8-fce.trg \
    --dictionaries data/corpus/train.l8-fce.src.json data/corpus/train.l8-fce.trg.json \
    --model models/mle-l8-fce.npz \
    --reload \
    --use_dropout \
    --maxlen 50 \
    --optimizer adam \
    --lrate 0.0001 \
    --batch_size 40 \
    --n_words_src 35000 \
    --n_words 35000
--------------------------------------------------------------------------------
/test/en-de/references:
--------------------------------------------------------------------------------
eine republi@@ kanische Strategie , um der Wiederwahl von Obama entgegenzutreten
die Führungskräfte der Republikaner rechtfertigen ihre Politik mit der Notwendigkeit , den Wahl@@ betrug zu bekämpfen .
allerdings hält das Brenn@@ an Center letzteres für einen Mythos , indem es bekräftigt , dass der Wahl@@ betrug in den USA seltener ist als die Anzahl der vom Blitz@@ schlag getö@@ teten Menschen .
die Rechtsanwälte der Republikaner haben in 10 Jahren in den USA übrigens nur 300 Fälle von Wahl@@ betrug verzeichnet .
eins ist sicher : diese neuen Bestimmungen werden sich negativ auf die Wahlbeteiligung auswirken .
--------------------------------------------------------------------------------
/nematus/initializers.py:
--------------------------------------------------------------------------------
'''
Parameter initializers
'''

import numpy

import theano
import theano.tensor as tensor
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

def ortho_weight(ndim):
    # the SVD of a random Gaussian matrix yields an orthogonal matrix
    W = numpy.random.randn(ndim, ndim)
    u, s, v = numpy.linalg.svd(W)
    return u.astype('float32')

def norm_weight(nin, nout=None, scale=0.01, ortho=True):
    # scaled Gaussian initialization; orthogonal for square matrices if requested
    if nout is None:
        nout = nin
    if nout == nin and ortho:
        W = ortho_weight(nin)
    else:
        W = scale * numpy.random.randn(nin, nout)
    return W.astype('float32')
--------------------------------------------------------------------------------
/runnmt_l8-fce-giga.sh:
--------------------------------------------------------------------------------
#!/bin/bash

#THEANO_FLAGS=mode=FAST_RUN,floatX=float32,optimizer_including=cudnn,device=gpu0 python nematus/nmt.py \
THEANO_FLAGS=mode=FAST_RUN,floatX=float32,optimizer_including=cudnn,device=gpu0 python nematus/nmt.py \
    --datasets data/corpus/train.l8-fce-giga.src data/corpus/train.l8-fce-giga.trg \
    --dictionaries data/corpus/train.l8-fce-giga.src.json data/corpus/train.l8-fce-giga.trg.json \
    --model models/mle-l8-fce-giga.npz \
    --reload \
    --use_dropout \
    --maxlen 50 \
    --optimizer adam \
    --lrate 0.0001 \
    --batch_size 40 \
    --n_words_src 35000 \
    --n_words 35000
--------------------------------------------------------------------------------
/test/en-de/ref_score:
--------------------------------------------------------------------------------
eine republi@@ kanische Strategie , um der Wiederwahl von Obama entgegenzutreten 0.688558
die Führungskräfte der Republikaner rechtfertigen ihre Politik mit der Notwendigkeit , den Wahl@@ betrug zu bekämpfen . 1.18311
allerdings hält das Brenn@@ an Center letzteres für einen Mythos , indem es bekräftigt , dass der Wahl@@ betrug in den USA seltener ist als die Anzahl der vom Blitz@@ schlag getö@@ teten Menschen . 1.44055
die Rechtsanwälte der Republikaner haben in 10 Jahren in den USA übrigens nur 300 Fälle von Wahl@@ betrug verzeichnet . 2.32595
eins ist sicher : diese neuen Bestimmungen werden sich negativ auf die Wahlbeteiligung auswirken . 0.40967
--------------------------------------------------------------------------------
/test/en-ro/in:
--------------------------------------------------------------------------------
the European Commission decided on Tuesday to resume payments for Romania under the " Economic competitiveness " and " Environment " programs , both interrupted in early April 2015 .
the judge did not rule on whether L@@ M@@ FAO 's song itself was an un@@ authorized copy of " H@@ ust@@ lin ' . "
the Romanian national team is part of Group D in the World Cup in England , along with France , Ireland , Canada and Italy .
it sends a message : your country does not value you becoming a parent .
the round@@ about will be made at the appropriate time , we must consider the trams traffic in the area , and we also need an approval from the National Roads .
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.ca:
--------------------------------------------------------------------------------
Dr
Dra
pàg
p
c
av
Sr
Sra
adm
esq
Prof
S.A
S.L
p.e
ptes
Sta
St
pl
màx
cast
dir
nre
fra
admdora
Emm
Excma
espf
dc
admdor
tel
angl
aprox
ca
dept
dj
dl
dt
ds
dg
dv
ed
entl
al
i.e
maj
smin
n
núm
pta
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
--------------------------------------------------------------------------------
/test/en-ro/references:
--------------------------------------------------------------------------------
Comisia Europeana a luat marti decizia de a relua pl@@ atile pentru Romania în cadrul programelor " Competitivitate Econom@@ ica " și " Mediu " , ambele intre@@ rupte la inceputul lunii aprilie 2015 .
judecătoarea nu a hotărât dacă melodia trupei L@@ M@@ FAO este o copie neautorizată a lui " H@@ ust@@ lin ' " .
nationala " tricol@@ ora " face parte din Grupa D la Mondi@@ alul din Anglia , alaturi de Franta , Irlanda , Canada și Italia .
trimite un mesaj : țara ta nu pune vreo valoare pe faptul că vei deveni părinte .
Gir@@ ația va fi făcută la momentul potrivit , trebuie să ținem cont de circulația tramv@@ aielor în zonă , trebuie un aviz și de la Drumuri Naționale .
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.sl:
--------------------------------------------------------------------------------
dr
Dr
itd
itn
št #NUMERIC_ONLY#
Št #NUMERIC_ONLY#
d
jan
Jan
feb
Feb
mar
Mar
apr
Apr
jun
Jun
jul
Jul
avg
Avg
sept
Sept
sep
Sep
okt
Okt
nov
Nov
dec
Dec
tj
Tj
npr
Npr
sl
Sl
op
Op
gl
Gl
oz
Oz
prev
dipl
ing
prim
Prim
cf
Cf
gl
Gl
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
--------------------------------------------------------------------------------
/test/en-ro/ref_score:
--------------------------------------------------------------------------------
Comisia Europeana a luat marti decizia de a relua pl@@ atile pentru Romania în cadrul programelor " Competitivitate Econom@@ ica " și " Mediu " , ambele intre@@ rupte la inceputul lunii aprilie 2015 . 1.10127
judecătoarea nu a hotărât dacă melodia trupei L@@ M@@ FAO este o copie neautorizată a lui " H@@ ust@@ lin ' " . 1.43826
nationala " tricol@@ ora " face parte din Grupa D la Mondi@@ alul din Anglia , alaturi de Franta , Irlanda , Canada și Italia . 1.16586
trimite un mesaj : țara ta nu pune vreo valoare pe faptul că vei deveni părinte . 2.04865
Gir@@ ația va fi făcută la momentul potrivit , trebuie să ținem cont de circulația tramv@@ aielor în zonă , trebuie un aviz și de la Drumuri Naționale . 2.03933
--------------------------------------------------------------------------------
/test/README.md:
--------------------------------------------------------------------------------
Testing Nematus
---------------

To test translation, execute

    THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=cpu python test_translate.py

To test scoring, execute

    THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=cpu python test_score.py

More sample models (including scripts for pre- and postprocessing)
are provided at: http://statmt.org/rsennrich/wmt16_systems/

To test training, execute

    THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=cpu ./test_train.sh

Note that the training script is just a toy setup to make sure the scripts run,
and to allow for speed comparisons. For instructions on training a
real-scale system, check https://github.com/rsennrich/wmt16-scripts
--------------------------------------------------------------------------------
/test/test_train.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# warning: this test is useful to check if training fails, and what speed you can achieve
# the toy datasets are too small to obtain useful translation results,
# and hyperparameters are chosen for speed, not for quality.
# For a setup that preprocesses and trains a larger data set,
# check https://github.com/rsennrich/wmt16-scripts/tree/master/sample

mkdir -p models

../nematus/nmt.py \
    --model models/model.npz \
    --datasets data/corpus.en data/corpus.de \
    --dictionaries data/vocab.en.json data/vocab.de.json \
    --dim_word 256 \
    --dim 512 \
    --n_words_src 30000 \
    --n_words 30000 \
    --maxlen 50 \
    --optimizer adam \
    --lrate 0.0001 \
    --batch_size 40 \
    --no_shuffle \
    --dispFreq 500 \
    --finish_after 500
--------------------------------------------------------------------------------
/test/test_train_bigmem.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# warning: this test is useful to check if training fails, and what speed you can achieve
# the toy datasets are too small to obtain useful translation results,
# and hyperparameters are chosen for speed, not for quality.
# For a setup that preprocesses and trains a larger data set,
# check https://github.com/rsennrich/wmt16-scripts/tree/master/sample

mkdir -p models

../nematus/nmt.py \
    --model models/model.npz \
    --datasets data/corpus.en data/corpus.de \
    --dictionaries data/vocab.en.json data/vocab.de.json \
    --dim_word 500 \
    --dim 1024 \
    --n_words_src 30000 \
    --n_words 30000 \
    --maxlen 50 \
    --optimizer adam \
    --lrate 0.0001 \
    --batch_size 80 \
    --no_shuffle \
    --dispFreq 500 \
    --finish_after 500
--------------------------------------------------------------------------------
/test/test_train_verybigmem.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# warning: this test is useful to check if training fails, and what speed you can achieve
# the toy datasets are too small to obtain useful translation results,
# and hyperparameters are chosen for speed, not for quality.
# For a setup that preprocesses and trains a larger data set,
# check https://github.com/rsennrich/wmt16-scripts/tree/master/sample

mkdir -p models

../nematus/nmt.py \
    --model models/model.npz \
    --datasets data/corpus.en data/corpus.de \
    --dictionaries data/vocab.en.json data/vocab.de.json \
    --dim_word 500 \
    --dim 2048 \
    --n_words_src 30000 \
    --n_words 30000 \
    --maxlen 50 \
    --optimizer adam \
    --lrate 0.0001 \
    --batch_size 80 \
    --no_shuffle \
    --dispFreq 500 \
    --finish_after 500
--------------------------------------------------------------------------------
/data/preprocess.sh:
--------------------------------------------------------------------------------
#!/bin/bash

P=$1

# source language (example: fr)
S=$2
# target language (example: en)
T=$3

# path to nematus/data
P1=$4

# path to subword NMT scripts (can be downloaded from https://github.com/rsennrich/subword-nmt)
P2=$5

# tokenize
perl $P1/tokenizer.perl -threads 5 -l $S < ${P}.${S} > ${P}.${S}.tok
perl $P1/tokenizer.perl -threads 5 -l $T < ${P}.${T} > ${P}.${T}.tok

# learn BPE on joint vocabulary:
cat ${P}.${S}.tok ${P}.${T}.tok | python $P2/learn_bpe.py -s 20000 > ${S}${T}.bpe

python $P2/apply_bpe.py -c ${S}${T}.bpe < ${P}.${S}.tok > ${P}.${S}.tok.bpe
python $P2/apply_bpe.py -c ${S}${T}.bpe < ${P}.${T}.tok > ${P}.${T}.tok.bpe

# build dictionary
python $P1/build_dictionary.py ${P}.${S}.tok.bpe
python $P1/build_dictionary.py ${P}.${T}.tok.bpe
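
# a hypothetical end-to-end run, assuming corpus.fr/corpus.en in the current
# directory, tokenizer.perl under nematus/data (as the script assumes), and a
# checkout of subword-nmt:
#   ./preprocess.sh corpus fr en /path/to/nematus/data /path/to/subword-nmt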
--------------------------------------------------------------------------------
/Dockerfile.cpu:
--------------------------------------------------------------------------------
FROM ubuntu:16.04
MAINTAINER Tom Kocmi

RUN apt-get update && apt-get install -y \
    cmake \
    git \
    python \
    python3 \
    vim \
    nano \
    python-dev \
    python-pip \
    python-pygraphviz \
    xml-twig-tools

RUN pip install --upgrade pip

RUN pip install numpy numexpr cython theano ipdb

RUN mkdir -p /path/to
WORKDIR /path/to/

# Install mosesdecoder
RUN git clone https://github.com/moses-smt/mosesdecoder

# Install subwords
RUN git clone https://github.com/rsennrich/subword-nmt

# Install nematus
COPY . /path/to/nematus
WORKDIR /path/to/nematus
RUN python setup.py install

WORKDIR /

# playground will contain user defined scripts, it should be run as:
# docker run -v `pwd`:/playground -it nematus-docker
RUN mkdir playground
WORKDIR /playground
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

import os
import setuptools

setuptools.setup(
    name='nematus',
    version='0.1dev',
    description='Neural machine translation tools on top of Theano',
    long_description=open(os.path.join(os.path.dirname(
        os.path.abspath(__file__)), 'README.md')).read(),
    license='BSD 3-clause',
    url='http://github.com/rsennrich/nematus',
    install_requires=['numpy',
                      'Theano',
                      'ipdb'],
    dependency_links=['git+http://github.com/Theano/Theano.git#egg=Theano',],
    classifiers=['Development Status :: 3 - Alpha',
                 'Intended Audience :: Science/Research',
                 'License :: OSI Approved :: BSD License',
                 'Operating System :: OS Independent',
                 'Topic :: Scientific/Engineering'],
    packages = ['nematus', 'nematus.metrics'],
)
--------------------------------------------------------------------------------
/data/build_dictionary.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

import numpy
import json

import sys
import fileinput

from collections import OrderedDict

def main():
    for filename in sys.argv[1:]:
        print 'Processing', filename
        word_freqs = OrderedDict()
        with open(filename, 'r') as f:
            for line in f:
                words_in = line.strip().split(' ')
                for w in words_in:
                    if w not in word_freqs:
                        word_freqs[w] = 0
                    word_freqs[w] += 1
        words = word_freqs.keys()
        freqs = word_freqs.values()

        sorted_idx = numpy.argsort(freqs)
        sorted_words = [words[ii] for ii in sorted_idx[::-1]]

        worddict = OrderedDict()
        worddict['eos'] = 0
        worddict['UNK'] = 1
        for ii, ww in enumerate(sorted_words):
            worddict[ww] = ii+2

        with open('%s.json'%filename, 'wb') as f:
            json.dump(worddict, f, indent=2, ensure_ascii=False)

    print 'Done'

if __name__ == '__main__':
    main()
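
# example (hypothetical file name): running
#   python build_dictionary.py train.en.tok.bpe
# writes train.en.tok.bpe.json, mapping 'eos' to 0, 'UNK' to 1, and the
# remaining vocabulary to ids in order of descending frequency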
27 | """ 28 | pass #to be implemented in sublcass 29 | 30 | def score_matrix(self, hypothesis_matrix): 31 | """ 32 | Scores every hypothesis in @param hypotheses against this reference. 33 | @param hypothesis_matrix an iterable of iterables of tokens. 34 | """ 35 | return [self.score(hypothesis_tokens) for hypothesis_tokens in hypothesis_matrix] 36 | -------------------------------------------------------------------------------- /nematus/util.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Utility functions 3 | ''' 4 | 5 | import sys 6 | import json 7 | import cPickle as pkl 8 | 9 | #json loads strings as unicode; we currently still work with Python 2 strings, and need conversion 10 | def unicode_to_utf8(d): 11 | return dict((key.encode("UTF-8"), value) for (key,value) in d.items()) 12 | 13 | def load_dict(filename): 14 | try: 15 | with open(filename, 'rb') as f: 16 | return unicode_to_utf8(json.load(f)) 17 | except: 18 | with open(filename, 'rb') as f: 19 | return pkl.load(f) 20 | 21 | 22 | def load_config(basename): 23 | try: 24 | with open('%s.json' % basename, 'rb') as f: 25 | return json.load(f) 26 | except: 27 | try: 28 | with open('%s.pkl' % basename, 'rb') as f: 29 | return pkl.load(f) 30 | except: 31 | sys.stderr.write('Error: config file {0}.json is missing\n'.format(basename)) 32 | sys.exit(1) 33 | 34 | 35 | def seqs2words(seq, inverse_target_dictionary): 36 | words = [] 37 | for w in seq: 38 | if w == 0: 39 | break 40 | if w in inverse_target_dictionary: 41 | words.append(inverse_target_dictionary[w]) 42 | else: 43 | words.append('UNK') 44 | return ' '.join(words) -------------------------------------------------------------------------------- /data/shuffle.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import random 4 | 5 | import tempfile 6 | from subprocess import call 7 | 8 | 9 | 10 | def main(files, temporary=False): 11 | 12 | tf_os, tpath = tempfile.mkstemp() 13 | tf = open(tpath, 'w') 14 | 15 | fds = [open(ff) for ff in files] 16 | 17 | for l in fds[0]: 18 | lines = [l.strip()] + [ff.readline().strip() for ff in fds[1:]] 19 | print >>tf, "|||".join(lines) 20 | 21 | [ff.close() for ff in fds] 22 | tf.close() 23 | 24 | lines = open(tpath, 'r').readlines() 25 | random.shuffle(lines) 26 | 27 | if temporary: 28 | fds = [] 29 | for ff in files: 30 | path, filename = os.path.split(os.path.realpath(ff)) 31 | fds.append(tempfile.TemporaryFile(prefix=filename+'.shuf', dir=path)) 32 | else: 33 | fds = [open(ff+'.shuf','w') for ff in files] 34 | 35 | for l in lines: 36 | s = l.strip().split('|||') 37 | for ii, fd in enumerate(fds): 38 | print >>fd, s[ii] 39 | 40 | if temporary: 41 | [ff.seek(0) for ff in fds] 42 | else: 43 | [ff.close() for ff in fds] 44 | 45 | os.close(tf_os) 46 | os.remove(tpath) 47 | 48 | return fds 49 | 50 | if __name__ == '__main__': 51 | main(sys.argv[1:]) 52 | 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /test/test_train_domaininterpolation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # warning: this test is useful to check if training fails, and what speed you can achieve 4 | # the toy datasets are too small to obtain useful translation results, 5 | # and hyperparameters are chosen for speed, not for quality. 
--------------------------------------------------------------------------------
/test/test_train_domaininterpolation.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# warning: this test is useful to check if training fails, and what speed you can achieve
# the toy datasets are too small to obtain useful translation results,
# and hyperparameters are chosen for speed, not for quality.
# For a setup that preprocesses and trains a larger data set,
# check https://github.com/rsennrich/wmt16-scripts/tree/master/sample

mkdir -p models

../nematus/nmt.py \
    --model models/model_domainadapt.npz \
    --datasets data/corpus.en data/corpus.de \
    --dictionaries data/vocab.en.json data/vocab.de.json \
    --dim_word 256 \
    --dim 512 \
    --n_words_src 30000 \
    --n_words 30000 \
    --maxlen 50 \
    --optimizer adam \
    --lrate 0.0001 \
    --batch_size 40 \
    --no_shuffle \
    --dispFreq 100 \
    --finish_after 50000 \
    --domain_interpolation_indomain_datasets data/indomain-corpus.en data/indomain-corpus.de \
    --domain_interpolation_min 0.5 \
    --domain_interpolation_max 1.0 \
    --domain_interpolation_inc 0.2 \
    --saveFreq 100 \
    --valid_datasets data/indomain-dev.en data/indomain-dev.de \
    --valid_batch_size 20 \
    --validFreq 100 \
    --patience 3 \
    --use_domain_interpolation \
#    --reload
--------------------------------------------------------------------------------
/nematus/metrics/test_sentence_bleu.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import unittest

from sentence_bleu import SentenceBleuScorer

class TestSentenceBleuReference(unittest.TestCase):
    """
    Regression tests for SmoothedBleuReference
    """
    @staticmethod
    def tokenize(sentence):
        return sentence.split(" ")
    def test_identical_segments(self):
        segment = self.tokenize("Consistency is the last refuge of the unimaginative")
        scorer = SentenceBleuScorer('n=4')
        scorer.set_reference(segment)
        self.assertEqual(scorer.score(segment), 1.0)
    def test_completely_different_segments(self):
        segment_a = self.tokenize("A A A")
        segment_b = self.tokenize("B B B")
        scorer = SentenceBleuScorer('n=4')
        scorer.set_reference(segment_a)
        self.assertEqual(scorer.score(segment_b), 0.0)
    def test_clipping(self):
        segment_a = self.tokenize("The very nice man")
        segment_b = self.tokenize("man man man man")
        scorer = SentenceBleuScorer('n=1')
        scorer.set_reference(segment_a)
        self.assertNotEqual(scorer.score(segment_b), 1.0)

if __name__ == '__main__':
    unittest.main()
--------------------------------------------------------------------------------
/Dockerfile.gpu:
--------------------------------------------------------------------------------
FROM nvidia/cuda:8.0-cudnn5-devel
MAINTAINER Tom Kocmi

# Install git, wget, python-dev, pip and other dependencies
RUN apt-get update && apt-get install -y \
    git \
    wget \
    cmake \
    vim \
    nano \
    python3 \
    libopenblas-dev \
    python-dev \
    python-pip \
    python-nose \
    python-numpy \
    python-scipy \
    python-pygraphviz \
    xml-twig-tools

RUN pip install --upgrade pip
RUN pip install -U setuptools
RUN pip install numexpr cython ipdb

# Set CUDA_ROOT
ENV CUDA_ROOT /usr/local/cuda/bin
# Install bleeding-edge Theano
RUN pip install --upgrade --no-deps theano
# Set up .theanorc for CUDA
RUN echo "[global]\ndevice=gpu\nfloatX=float32\noptimizer_including=cudnn\n[lib]\ncnmem=0.1\n[nvcc]\nfastmath=True" > /root/.theanorc


RUN mkdir -p /path/to
WORKDIR /path/to/

# Install mosesdecoder
RUN git clone https://github.com/moses-smt/mosesdecoder

# Install subwords
RUN git clone https://github.com/rsennrich/subword-nmt

# Install nematus
COPY . /path/to/nematus
WORKDIR /path/to/nematus
RUN python setup.py install

WORKDIR /

# playground will contain user defined scripts, it should be run as:
# nvidia-docker run -v `pwd`:/playground -it nematus-docker
RUN mkdir playground
WORKDIR /playground
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.es:
--------------------------------------------------------------------------------
#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
#Special cases are included for prefixes that ONLY appear before 0-9 numbers.

#any single upper case letter followed by a period is not a sentence ender
#usually upper case letters are initials in a name
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z

# Period-final abbreviation list from http://www.ctspanish.com/words/abbreviations.htm

A.C
Apdo
Av
Bco
CC.AA
Da
Dep
Dn
Dr
Dra
EE.UU
Excmo
FF.CC
Fil
Gral
J.C
Let
Lic
N.B
P.D
P.V.P
Prof
Pts
Rte
S.A
S.A.R
S.E
S.L
S.R.C
Sr
Sra
Srta
Sta
Sto
T.V.E
Tel
Ud
Uds
V.B
V.E
Vd
Vds
a/c
adj
admón
afmo
apdo
av
c
c.f
c.g
cap
cm
cta
dcha
doc
ej
entlo
esq
etc
f.c
gr
grs
izq
kg
km
mg
mm
núm
núm
p
p.a
p.ej
ptas
pág
págs
pág
págs
q.e.g.e
q.e.s.m
s
s.s.s
vid
vol
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright (c) 2015--2017 Nematus Development Team
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of Nematus nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

--------------------------------------------------------------------------------
/nematus/metrics/test_scorer_provider.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import unittest

from scorer_provider import ScorerProvider
from sentence_bleu import SentenceBleuScorer

class TestScorerProvider(unittest.TestCase):
    """
    Regression tests for ScorerProvider
    """
    @staticmethod
    def tokenize(sentence):
        return sentence.split(" ")

    def test_single_metric(self):
        config_string = "SENTENCEBLEU n=4"
        segment = self.tokenize("Consistency is the last refuge of the unimaginative")
        reference_scorer = SentenceBleuScorer('n=4')
        provided_scorer = ScorerProvider().get(config_string)
        reference_scorer.set_reference(segment)
        provided_scorer.set_reference(segment)
        self.assertEqual(
            reference_scorer.score(segment),
            provided_scorer.score(segment)
        )

    def test_interpolated_metrics(self):
        config_string = "INTERPOLATE w=0.3,0.7; SENTENCEBLEU n=4; SENTENCEBLEU n=4"
        segment = self.tokenize("Consistency is the last refuge of the unimaginative")
        reference_scorer = SentenceBleuScorer('n=4')
        # interpolating BLEU with BLEU should obviously result in the same as just using a single BLEU scorer
        provided_scorer = ScorerProvider().get(config_string)
        reference_scorer.set_reference(segment)
        provided_scorer.set_reference(segment)
        self.assertEqual(
            reference_scorer.score(segment),
            provided_scorer.score(segment)
        )


if __name__ == '__main__':
    unittest.main()
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.lv:
--------------------------------------------------------------------------------
#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
#Special cases are included for prefixes that ONLY appear before 0-9 numbers.

#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
#usually upper case letters are initials in a name
A
Ā
B
C
Č
D
E
Ē
F
G
Ģ
H
I
Ī
J
K
Ķ
L
Ļ
M
N
Ņ
O
P
Q
R
S
Š
T
U
Ū
V
W
X
Y
Z
Ž

#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
dr
Dr
med
prof
Prof
inž
Inž
ist.loc
Ist.loc
kor.loc
Kor.loc
v.i
vietn
Vietn

#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
a.l
t.p
pārb
Pārb
vec
Vec
inv
Inv
sk
Sk
spec
Spec
vienk
Vienk
virz
Virz
māksl
Māksl
mūz
Mūz
akad
Akad
soc
Soc
galv
Galv
vad
Vad
sertif
Sertif
folkl
Folkl
hum
Hum

#Numbers only. These should only induce breaks when followed by a numeric sequence
# add NUMERIC_ONLY after the word for this function
#This case is mostly for the english "No." which can either be a sentence of its own, or
#if followed by a number, a non-breaking prefix
Nr #NUMERIC_ONLY#
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.fr:
--------------------------------------------------------------------------------
#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
#
#any single upper case letter followed by a period is not a sentence ender
#usually upper case letters are initials in a name
#no French words end in single lower-case letters, so we throw those in too?
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z

# Period-final abbreviation list for French
A.C.N
A.M
art
ann
apr
av
auj
lib
B.P
boul
ca
c.-à-d
cf
ch.-l
chap
contr
C.P.I
C.Q.F.D
C.N
C.N.S
C.S
dir
éd
e.g
env
al
etc
E.V
ex
fasc
fém
fig
fr
hab
ibid
id
i.e
inf
LL.AA
LL.AA.II
LL.AA.RR
LL.AA.SS
L.D
LL.EE
LL.MM
LL.MM.II.RR
loc.cit
masc
MM
ms
N.B
N.D.A
N.D.L.R
N.D.T
n/réf
NN.SS
N.S
N.D
N.P.A.I
p.c.c
pl
pp
p.ex
p.j
P.S
R.A.S
R.-V
R.P
R.I.P
SS
S.S
S.A
S.A.I
S.A.R
S.A.S
S.E
sec
sect
sing
S.M
S.M.I.R
sq
sqq
suiv
sup
suppl
tél
T.S.V.P
vb
vol
vs
X.O
Z.I
--------------------------------------------------------------------------------
/test/en-de/ref:
--------------------------------------------------------------------------------
eine republi@@ kanische Strategie gegen die Wiederwahl Obamas
0.977844655514 0.90209954977 0.927412986755 0.984532177448 0.183520868421 0.907861471176 0.994144678116 0.917708992958 0.990146577358
die republi@@ kanische Führung begründet ihre Politik mit der Notwendigkeit , Wahl@@ betrug zu bekämpfen .
0.624975204468 0.467659324408 0.895200014114 0.922666728497 0.332508355379 0.962346553802 0.985188066959 0.511733949184 0.702501058578 0.733234107494 0.834280848503 0.298875242472 0.978177785873 0.962297916412 0.991670489311 0.998888731003 0.999692261219
das Brenn@@ an Zentrum hält dies aber für einen Mythos , der besagt , dass Wahl@@ betrug in den USA seltener ist als die Zahl der getö@@ teten Menschen .
0.153531059623 0.871728599072 0.346277505159 0.747219443321 0.871806800365 0.120552673936 0.37667247653 0.782940626144 0.822250068188 0.98460739851 0.73440104723 0.481711357832 0.311930894852 0.961221635342 0.896834015846 0.427923560143 0.903929233551 0.673036038876 0.992655754089 0.739101171494 0.754340946674 0.522766292095 0.916598856449 0.96203070879 0.791576385498 0.890906095505 0.162579834461 0.99129909277 0.765361487865 0.619172334671 0.999593555927
tatsächlich wurden in den USA in einem Jahrzehnt nur 300 Fälle von Wahl@@ betrug in den USA festgestellt .
0.874663293362 0.193072125316 0.830588340759 0.950349152088 0.536000072956 0.732309579849 0.601523339748 0.985651493073 0.771518468857 0.963857293129 0.582112908363 0.782780885696 0.960188984871 0.962329685688 0.735553085804 0.973220407963 0.69519174099 0.764474630356 0.998193442822 0.999425113201
eines ist sicher : diese neuen Bestimmungen werden negative Auswirkungen auf die Wahlbeteiligung haben .
0.634134709835 0.78360158205 0.81129103899 0.985949218273 0.919415593147 0.925939559937 0.844495713711 0.82704269886 0.344317674637 0.952615022659 0.954769909382 0.629434704781 0.463058054447 0.923200011253 0.998686730862 0.999255955219
--------------------------------------------------------------------------------
/nematus/compat.py:
--------------------------------------------------------------------------------
'''
Default options for backward compatibility
'''

#hacks for using old models with missing options (dict is modified in-place)
def fill_options(options):
    if not 'dropout_embedding' in options:
        options['dropout_embedding'] = 0
    if not 'dropout_hidden' in options:
        options['dropout_hidden'] = 0
    if not 'dropout_source' in options:
        options['dropout_source'] = 0
    if not 'dropout_target' in options:
        options['dropout_target'] = 0
    if not 'factors' in options:
        options['factors'] = 1
    if not 'dim_per_factor' in options:
        options['dim_per_factor'] = [options['dim_word']]
    if not 'model_version' in options:
        options['model_version'] = 0
    if not 'tie_encoder_decoder_embeddings' in options:
        options['tie_encoder_decoder_embeddings'] = False
    if not 'tie_decoder_embeddings' in options:
        options['tie_decoder_embeddings'] = False
    if not 'encoder_truncate_gradient' in options:
        options['encoder_truncate_gradient'] = -1
    if not 'decoder_truncate_gradient' in options:
        options['decoder_truncate_gradient'] = -1
    if not 'reload_training_progress' in options:
        options['reload_training_progress'] = True
    if not 'use_domain_interpolation' in options:
        options['use_domain_interpolation'] = False
    if not 'domain_interpolation_min' in options:
        options['domain_interpolation_min'] = 0.1
    if not 'domain_interpolation_max' in options:
        options['domain_interpolation_max'] = 1.0
    if not 'domain_interpolation_inc' in options:
        options['domain_interpolation_inc'] = 0.1
    if not 'domain_interpolation_indomain_datasets' in options:
        options['domain_interpolation_indomain_datasets'] = ['indomain.en', 'indomain.fr']
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.en:
--------------------------------------------------------------------------------
#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
#Special cases are included for prefixes that ONLY appear before 0-9 numbers.

#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
#usually upper case letters are initials in a name
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z

#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
Adj
Adm
Adv
Asst
Bart
Bldg
Brig
Bros
Capt
Cmdr
Col
Comdr
Con
Corp
Cpl
DR
Dr
Drs
Ens
Gen
Gov
Hon
Hr
Hosp
Insp
Lt
MM
MR
MRS
MS
Maj
Messrs
Mlle
Mme
Mr
Mrs
Ms
Msgr
Op
Ord
Pfc
Ph
Prof
Pvt
Rep
Reps
Res
Rev
Rt
Sen
Sens
Sfc
Sgt
Sr
St
Supt
Surg

#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
v
vs
i.e
rev
e.g

#Numbers only. These should only induce breaks when followed by a numeric sequence
# add NUMERIC_ONLY after the word for this function
#This case is mostly for the english "No." which can either be a sentence of its own, or
#if followed by a number, a non-breaking prefix
No #NUMERIC_ONLY#
Nos
Art #NUMERIC_ONLY#
Nr
pp #NUMERIC_ONLY#

#month abbreviations
Jan
Feb
Mar
Apr
#May is a full word
Jun
Jul
Aug
Sep
Oct
Nov
Dec
18 | """ 19 | # parse arguments 20 | self._reference = None # to be set via `self.set_reference()` 21 | self._arguments = {} 22 | if argument_string: 23 | argument_strings = argument_string.split(",") 24 | for a in argument_strings: 25 | argument, value = a.split("=") 26 | argument = argument.strip() 27 | value = value.strip() 28 | try: 29 | value = int(value) # change type to int if applicable 30 | except ValueError: 31 | value = value 32 | self._arguments[argument] = value 33 | 34 | @abstractmethod 35 | def set_reference(self, reference_tokens): 36 | """ 37 | Sets the reference against which one or many hypotheses can be scored 38 | via `self.score()` and `self.score_matrix()`. 39 | """ 40 | pass # instantiate a Reference object and store it at self._reference 41 | 42 | def score(self, hypothesis_tokens): 43 | """ 44 | Scores @param hypothesis against this reference. 45 | """ 46 | return self._reference.score(hypothesis_tokens) 47 | 48 | def score_matrix(self, hypothesis_matrix): 49 | """ 50 | Scores every hypothesis in @param hypotheses against this reference. 51 | @param hypothesis_matrix an iterable of iterables of tokens. 52 | """ 53 | return self._reference.score_matrix(hypothesis_matrix) 54 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.fi: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT 2 | #indicate an end-of-sentence marker. Special cases are included for prefixes 3 | #that ONLY appear before 0-9 numbers. 4 | 5 | #This list is compiled from omorfi database 6 | #by Tommi A Pirinen. 7 | 8 | 9 | #any single upper case letter followed by a period is not a sentence ender 10 | A 11 | B 12 | C 13 | D 14 | E 15 | F 16 | G 17 | H 18 | I 19 | J 20 | K 21 | L 22 | M 23 | N 24 | O 25 | P 26 | Q 27 | R 28 | S 29 | T 30 | U 31 | V 32 | W 33 | X 34 | Y 35 | Z 36 | Å 37 | Ä 38 | Ö 39 | 40 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 41 | alik 42 | alil 43 | amir 44 | apul 45 | apul.prof 46 | arkkit 47 | ass 48 | assist 49 | dipl 50 | dipl.arkkit 51 | dipl.ekon 52 | dipl.ins 53 | dipl.kielenk 54 | dipl.kirjeenv 55 | dipl.kosm 56 | dipl.urk 57 | dos 58 | erikoiseläinl 59 | erikoishammasl 60 | erikoisl 61 | erikoist 62 | ev.luutn 63 | evp 64 | fil 65 | ft 66 | hallinton 67 | hallintot 68 | hammaslääket 69 | jatk 70 | jääk 71 | kansaned 72 | kapt 73 | kapt.luutn 74 | kenr 75 | kenr.luutn 76 | kenr.maj 77 | kers 78 | kirjeenv 79 | kom 80 | kom.kapt 81 | komm 82 | konst 83 | korpr 84 | luutn 85 | maist 86 | maj 87 | Mr 88 | Mrs 89 | Ms 90 | M.Sc 91 | neuv 92 | nimim 93 | Ph.D 94 | prof 95 | puh.joht 96 | pääll 97 | res 98 | san 99 | siht 100 | suom 101 | sähköp 102 | säv 103 | toht 104 | toim 105 | toim.apul 106 | toim.joht 107 | toim.siht 108 | tuom 109 | ups 110 | vänr 111 | vääp 112 | ye.ups 113 | ylik 114 | ylil 115 | ylim 116 | ylimatr 117 | yliop 118 | yliopp 119 | ylip 120 | yliv 121 | 122 | #misc - odd period-ending items that NEVER indicate breaks (p.m. 
--------------------------------------------------------------------------------
/data/nonbreaking_prefixes/nonbreaking_prefix.fi:
--------------------------------------------------------------------------------
#Anything in this file, followed by a period (and an upper-case word), does NOT
#indicate an end-of-sentence marker. Special cases are included for prefixes
#that ONLY appear before 0-9 numbers.

#This list is compiled from omorfi database
#by Tommi A Pirinen.


#any single upper case letter followed by a period is not a sentence ender
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
Å
Ä
Ö

#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
alik
alil
amir
apul
apul.prof
arkkit
ass
assist
dipl
dipl.arkkit
dipl.ekon
dipl.ins
dipl.kielenk
dipl.kirjeenv
dipl.kosm
dipl.urk
dos
erikoiseläinl
erikoishammasl
erikoisl
erikoist
ev.luutn
evp
fil
ft
hallinton
hallintot
hammaslääket
jatk
jääk
kansaned
kapt
kapt.luutn
kenr
kenr.luutn
kenr.maj
kers
kirjeenv
kom
kom.kapt
komm
konst
korpr
luutn
maist
maj
Mr
Mrs
Ms
M.Sc
neuv
nimim
Ph.D
prof
puh.joht
pääll
res
san
siht
suom
sähköp
säv
toht
toim
toim.apul
toim.joht
toim.siht
tuom
ups
vänr
vääp
ye.ups
ylik
ylil
ylim
ylimatr
yliop
yliopp
ylip
yliv

#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall
#into this category - it sometimes ends a sentence)
e.g
ent
esim
huom
i.e
ilm
l
mm
myöh
nk
nyk
par
po
t
v
--------------------------------------------------------------------------------
/doc/factored_neural_machine_translation.md:
--------------------------------------------------------------------------------
FACTORED NEURAL MACHINE TRANSLATION
-----------------------------------

Nematus supports arbitrary input features through factored representations, similar to the factored models popularized with Moses.
This can be used to add linguistic features such as lemmas, POS tags, or dependency labels, or potentially other types of information.
The pipe symbol "|" serves as a factor separator and should not otherwise appear in the text.

To use factored models, follow these steps:

- preprocess the source side of the training, development and test data to include factors. Consider this example sentence, in an unfactored (or 1-factored) representation, and with 4 factors per word:

    Leonidas begged in the gladiatorial arena .

    Leonidas|Leonidas|NNP|nsubj begged|beg|VBD|root in|in|IN|prep the|the|DT|det gladiatorial|gladiatorial|JJ|amod arena|arena|NN|pobj .|.|.|punct

  https://github.com/rsennrich/wmt16-scripts/tree/master/factored_sample provides sample scripts to produce a factored representation from a CoNLL file, and BPE-segmented text.

- in the arguments to nematus.nmt.train, adjust the following options (see the sketch after this list):
  - factors: the number of factors per word
  - dim_per_factor: the size of the embedding layer for each factor (a list of integers)
  - dim_word: the total size of the input embedding (must match the sum of dim_per_factor)
  - dictionaries: add a vocabulary file for each factor (in the order they appear), plus a vocabulary file for the target side

  an example config is shown at https://github.com/rsennrich/wmt16-scripts/blob/master/factored_sample/config.py

- commands for training and running Nematus are otherwise identical to the non-factored version
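
as an illustration, the 4-factor representation above could be trained with settings along these lines (the sizes and vocabulary file names are made up; the one hard constraint is that dim_word equals the sum of dim_per_factor):

    # hypothetical values: 500 = 400 + 50 + 25 + 25
    factored_options = dict(
        factors=4,
        dim_per_factor=[400, 50, 25, 25],
        dim_word=500,
        dictionaries=['vocab.words.json', 'vocab.lemmas.json',
                      'vocab.pos.json', 'vocab.deps.json',
                      'vocab.target.json'])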
These are often followed by upper-case names, but do not indicate sentence breaks 43 | Dr 44 | dr 45 | kb 46 | Kb 47 | vö 48 | Vö 49 | pl 50 | Pl 51 | ca 52 | Ca 53 | min 54 | Min 55 | max 56 | Max 57 | ún 58 | Ún 59 | prof 60 | Prof 61 | de 62 | De 63 | du 64 | Du 65 | Szt 66 | St 67 | 68 | #Numbers only. These should only induce breaks when followed by a numeric sequence 69 | # add NUMERIC_ONLY after the word for this function 70 | #This case is mostly for the english "No." which can either be a sentence of its own, or 71 | #if followed by a number, a non-breaking prefix 72 | 73 | # Month name abbreviations 74 | jan #NUMERIC_ONLY# 75 | Jan #NUMERIC_ONLY# 76 | Feb #NUMERIC_ONLY# 77 | feb #NUMERIC_ONLY# 78 | márc #NUMERIC_ONLY# 79 | Márc #NUMERIC_ONLY# 80 | ápr #NUMERIC_ONLY# 81 | Ápr #NUMERIC_ONLY# 82 | máj #NUMERIC_ONLY# 83 | Máj #NUMERIC_ONLY# 84 | jún #NUMERIC_ONLY# 85 | Jún #NUMERIC_ONLY# 86 | Júl #NUMERIC_ONLY# 87 | júl #NUMERIC_ONLY# 88 | aug #NUMERIC_ONLY# 89 | Aug #NUMERIC_ONLY# 90 | Szept #NUMERIC_ONLY# 91 | szept #NUMERIC_ONLY# 92 | okt #NUMERIC_ONLY# 93 | Okt #NUMERIC_ONLY# 94 | nov #NUMERIC_ONLY# 95 | Nov #NUMERIC_ONLY# 96 | dec #NUMERIC_ONLY# 97 | Dec #NUMERIC_ONLY# 98 | 99 | # Other abbreviations 100 | tel #NUMERIC_ONLY# 101 | Tel #NUMERIC_ONLY# 102 | Fax #NUMERIC_ONLY# 103 | fax #NUMERIC_ONLY# 104 | -------------------------------------------------------------------------------- /nematus/metrics/scorer_provider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import scorer_interpolator as si 5 | 6 | from sentence_bleu import SentenceBleuScorer 7 | from meteor import MeteorScorer 8 | from beer import BeerScorer 9 | from chrf import CharacterFScorer 10 | 11 | class ScorerProvider: 12 | """ 13 | Parses a config string and returns a matching scorer object with the given 14 | parameters. 15 | """ 16 | #from bleu import SentenceBleuScorer 17 | 18 | def __init__(self): 19 | pass 20 | 21 | def get(self, config_string): 22 | """ 23 | Returns a scorer matching the metric and parameters defined in @param 24 | config_string. 25 | 26 | Example: ScorerProvider.get("SENTENCEBLEU n=4") returns a SentenceBleuScorer 27 | object that considers n-gram precision up to n=4. 28 | 29 | If more than one metric is provided (separated by `;`), 30 | an interpolated scorer will be returned. 31 | 32 | Example: ScorerProvider.get("INTERPOLATE w=0.5,0.5; SENTENCEBLEU n=4; METEOR meteor_language=fr, meteor_path=/foo/bar/meteor") 33 | returns a ScorerInterpolator object that scores hypotheses 34 | using 0.5 * bleu_score + 0.5 * meteor_score.
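
        A minimal usage sketch (the token sequences are illustrative only;
        SENTENCEBLEU requires no external tools, so it is the simplest
        metric to try):

            scorer = ScorerProvider().get("SENTENCEBLEU n=4")
            scorer.set_reference("the cat sat on the mat".split())
            bleu = scorer.score("a cat sat on the mat".split())  # float between 0.0 and 1.0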
35 | """ 36 | # interpolation 37 | if config_string.startswith("INTERPOLATE"): 38 | return si.ScorerInterpolator(config_string) 39 | try: 40 | scorer, arguments = config_string.split(" ", 1) 41 | except ValueError: 42 | scorer = config_string 43 | arguments = '' 44 | if scorer == 'SENTENCEBLEU': 45 | return SentenceBleuScorer(arguments) 46 | elif scorer == 'METEOR': 47 | return MeteorScorer(arguments) 48 | elif scorer == 'BEER': 49 | return BeerScorer(arguments) 50 | elif scorer == 'CHRF': 51 | return CharacterFScorer(arguments) 52 | # add other scorers here 53 | else: 54 | raise NotImplementedError("No such scorer: %s" % scorer) 55 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.nl: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | #Sources: http://nl.wikipedia.org/wiki/Lijst_van_afkortingen 4 | # http://nl.wikipedia.org/wiki/Aanspreekvorm 5 | # http://nl.wikipedia.org/wiki/Titulatuur_in_het_Nederlands_hoger_onderwijs 6 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 7 | #usually upper case letters are initials in a name 8 | A 9 | B 10 | C 11 | D 12 | E 13 | F 14 | G 15 | H 16 | I 17 | J 18 | K 19 | L 20 | M 21 | N 22 | O 23 | P 24 | Q 25 | R 26 | S 27 | T 28 | U 29 | V 30 | W 31 | X 32 | Y 33 | Z 34 | 35 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 36 | bacc 37 | bc 38 | bgen 39 | c.i 40 | dhr 41 | dr 42 | dr.h.c 43 | drs 44 | drs 45 | ds 46 | eint 47 | fa 48 | Fa 49 | fam 50 | gen 51 | genm 52 | ing 53 | ir 54 | jhr 55 | jkvr 56 | jr 57 | kand 58 | kol 59 | lgen 60 | lkol 61 | Lt 62 | maj 63 | Mej 64 | mevr 65 | Mme 66 | mr 67 | mr 68 | Mw 69 | o.b.s 70 | plv 71 | prof 72 | ritm 73 | tint 74 | Vz 75 | Z.D 76 | Z.D.H 77 | Z.E 78 | Z.Em 79 | Z.H 80 | Z.K.H 81 | Z.K.M 82 | Z.M 83 | z.v 84 | 85 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 86 | #we seem to have a lot of these in dutch i.e.: i.p.v - in plaats van (in stead of) never ends a sentence 87 | a.g.v 88 | bijv 89 | bijz 90 | bv 91 | d.w.z 92 | e.c 93 | e.g 94 | e.k 95 | ev 96 | i.p.v 97 | i.s.m 98 | i.t.t 99 | i.v.m 100 | m.a.w 101 | m.b.t 102 | m.b.v 103 | m.h.o 104 | m.i 105 | m.i.v 106 | v.w.t 107 | 108 | #Numbers only. These should only induce breaks when followed by a numeric sequence 109 | # add NUMERIC_ONLY after the word for this function 110 | #This case is mostly for the english "No." 
which can either be a sentence of its own, or 111 | #if followed by a number, a non-breaking prefix 112 | Nr #NUMERIC_ONLY# 113 | Nrs 114 | nrs 115 | nr #NUMERIC_ONLY# 116 | -------------------------------------------------------------------------------- /test/test_score.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import os 6 | import unittest 7 | import requests 8 | 9 | sys.path.append(os.path.abspath('../nematus')) 10 | from score import main as score 11 | 12 | 13 | def load_wmt16_model(src, target): 14 | path = os.path.join('models', '{0}-{1}'.format(src,target)) 15 | try: 16 | os.makedirs(path) 17 | except OSError: 18 | pass 19 | for filename in ['model.npz', 'model.npz.json', 'vocab.{0}.json'.format(src), 'vocab.{0}.json'.format(target)]: 20 | if not os.path.exists(os.path.join(path, filename)): 21 | r = requests.get('http://data.statmt.org/rsennrich/wmt16_systems/{0}-{1}/'.format(src,target) + filename, stream=True) 22 | with open(os.path.join(path, filename), 'wb') as f: 23 | for chunk in r.iter_content(1024**2): 24 | f.write(chunk) 25 | 26 | class TestTranslate(unittest.TestCase): 27 | """ 28 | Regression tests for translation with WMT16 models 29 | """ 30 | 31 | def setUp(self): 32 | """ 33 | Download pre-trained models 34 | """ 35 | load_wmt16_model('en','de') 36 | load_wmt16_model('en','ro') 37 | 38 | def scoreEqual(self, output1, output2): 39 | """given two files with translation scores, check that probabilities are equal within rounding error. 40 | """ 41 | for i, (line, line2) in enumerate(zip(open(output1).readlines(), open(output2).readlines())): 42 | self.assertAlmostEqual(float(line.split()[-1]), float(line2.split()[-1]), 5) 43 | 44 | # English-German WMT16 system, no dropout 45 | def test_ende(self): 46 | os.chdir('models/en-de/') 47 | score(['model.npz'], open('../../en-de/in'), open('../../en-de/references'), open('../../en-de/out_score','w'), normalize=True) 48 | os.chdir('../..') 49 | self.scoreEqual('en-de/ref_score', 'en-de/out_score') 50 | 51 | # English-Romanian WMT16 system, dropout 52 | def test_enro(self): 53 | os.chdir('models/en-ro/') 54 | score(['model.npz'], open('../../en-ro/in'), open('../../en-ro/references'), open('../../en-ro/out_score','w'), normalize=True) 55 | os.chdir('../..') 56 | self.scoreEqual('en-ro/ref_score', 'en-ro/out_score') 57 | 58 | 59 | 60 | if __name__ == '__main__': 61 | unittest.main() 62 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.is: -------------------------------------------------------------------------------- 1 | no #NUMERIC_ONLY# 2 | No #NUMERIC_ONLY# 3 | nr #NUMERIC_ONLY# 4 | Nr #NUMERIC_ONLY# 5 | nR #NUMERIC_ONLY# 6 | NR #NUMERIC_ONLY# 7 | a 8 | b 9 | c 10 | d 11 | e 12 | f 13 | g 14 | h 15 | i 16 | j 17 | k 18 | l 19 | m 20 | n 21 | o 22 | p 23 | q 24 | r 25 | s 26 | t 27 | u 28 | v 29 | w 30 | x 31 | y 32 | z 33 | ^ 34 | í 35 | á 36 | ó 37 | æ 38 | A 39 | B 40 | C 41 | D 42 | E 43 | F 44 | G 45 | H 46 | I 47 | J 48 | K 49 | L 50 | M 51 | N 52 | O 53 | P 54 | Q 55 | R 56 | S 57 | T 58 | U 59 | V 60 | W 61 | X 62 | Y 63 | Z 64 | ab.fn 65 | a.fn 66 | afs 67 | al 68 | alm 69 | alg 70 | andh 71 | ath 72 | aths 73 | atr 74 | ao 75 | au 76 | aukaf 77 | áfn 78 | áhrl.s 79 | áhrs 80 | ákv.gr 81 | ákv 82 | bh 83 | bls 84 | dr 85 | e.Kr 86 | et 87 | ef 88 | efn 89 | ennfr 90 | eink 91 | end 92 | e.st 93 | erl 94 | fél 95 | fskj 96 | 
fh 97 | f.hl 98 | físl 99 | fl 100 | fn 101 | fo 102 | forl 103 | frb 104 | frl 105 | frh 106 | frt 107 | fsl 108 | fsh 109 | fs 110 | fsk 111 | fst 112 | f.Kr 113 | ft 114 | fv 115 | fyrrn 116 | fyrrv 117 | germ 118 | gm 119 | gr 120 | hdl 121 | hdr 122 | hf 123 | hl 124 | hlsk 125 | hljsk 126 | hljv 127 | hljóðv 128 | hr 129 | hv 130 | hvk 131 | holl 132 | Hos 133 | höf 134 | hk 135 | hrl 136 | ísl 137 | kaf 138 | kap 139 | Khöfn 140 | kk 141 | kg 142 | kk 143 | km 144 | kl 145 | klst 146 | kr 147 | kt 148 | kgúrsk 149 | kvk 150 | leturbr 151 | lh 152 | lh.nt 153 | lh.þt 154 | lo 155 | ltr 156 | mlja 157 | mljó 158 | millj 159 | mm 160 | mms 161 | m.fl 162 | miðm 163 | mgr 164 | mst 165 | mín 166 | nf 167 | nh 168 | nhm 169 | nl 170 | nk 171 | nmgr 172 | no 173 | núv 174 | nt 175 | o.áfr 176 | o.m.fl 177 | ohf 178 | o.fl 179 | o.s.frv 180 | ófn 181 | ób 182 | óákv.gr 183 | óákv 184 | pfn 185 | PR 186 | pr 187 | Ritstj 188 | Rvík 189 | Rvk 190 | samb 191 | samhlj 192 | samn 193 | samn 194 | sbr 195 | sek 196 | sérn 197 | sf 198 | sfn 199 | sh 200 | sfn 201 | sh 202 | s.hl 203 | sk 204 | skv 205 | sl 206 | sn 207 | so 208 | ss.us 209 | s.st 210 | samþ 211 | sbr 212 | shlj 213 | sign 214 | skál 215 | st 216 | st.s 217 | stk 218 | sþ 219 | teg 220 | tbl 221 | tfn 222 | tl 223 | tvíhlj 224 | tvt 225 | till 226 | to 227 | umr 228 | uh 229 | us 230 | uppl 231 | útg 232 | vb 233 | Vf 234 | vh 235 | vkf 236 | Vl 237 | vl 238 | vlf 239 | vmf 240 | 8vo 241 | vsk 242 | vth 243 | þt 244 | þf 245 | þjs 246 | þgf 247 | þlt 248 | þolm 249 | þm 250 | þml 251 | þýð 252 | -------------------------------------------------------------------------------- /nematus/metrics/scorer_interpolator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from scorer import Scorer 5 | import scorer_provider as sp 6 | 7 | class ScorerInterpolator(Scorer): 8 | """ 9 | Creates a scorer that interpolates scores from 1..n sub-scorers, e.g., 10 | 0.5 * SENTENCEBLEU + 0.5 * METEOR. 11 | """ 12 | 13 | def __init__(self, config_string): 14 | """ 15 | @param config_string example: 16 | `INTERPOLATE w=0.5,0.5; SENTENCEBLEU n=4; METEOR meteor_language=fr, meteor_path=/foo/bar/meteor` 17 | """ 18 | self._scorers = [] 19 | self._weights = [] 20 | # parse arguments 21 | scorers = config_string.split(";") 22 | scorers = [scorer.strip() for scorer in scorers] 23 | try: 24 | instruction, weights = scorers[0].split("w=") 25 | assert instruction.strip() == "INTERPOLATE" 26 | weights = [float(w) for w in weights.split(',')] 27 | scorers = [sp.ScorerProvider().get(s) for s in scorers[1:]] 28 | except: 29 | raise SyntaxError("Ill-formated interpolation of metrics. Example of valid definition: `INTERPOLATE w=0.5,0.5`.") 30 | # assertions 31 | assert len(weights) == len(scorers) 32 | assert sum(weights) == 1.0 33 | # init scorers 34 | for i, scorer in enumerate(scorers): 35 | self._scorers.append(scorer) 36 | self._weights.append(weights[i]) 37 | 38 | def set_reference(self, reference_tokens): 39 | """ 40 | Sets the reference against which one or many hypotheses can be scored 41 | via `self.score()` and `self.score_matrix()`. 42 | """ 43 | for scorer in self._scorers: 44 | scorer.set_reference(reference_tokens) 45 | 46 | def score(self, hypothesis_tokens): 47 | """ 48 | Scores @param hypothesis with all scorers added via `self.add_scorer` 49 | and interpolates the scores with the respective weights. 
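
        As a sketch, the sum below computes

            score(h) = w_1 * scorer_1.score(h) + ... + w_n * scorer_n.score(h)

        which is a convex combination of the sub-scorer outputs, since
        `__init__` asserts that the weights sum to 1.0.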
50 | """ 51 | return sum([s.score(hypothesis_tokens) * w for w, s in zip(self._weights, self._scorers)]) 52 | 53 | def score_matrix(self, hypothesis_matrix): 54 | """ 55 | Scores every hypothesis in @param hypotheses with all scorers added via 56 | `self.add_scorer` and interpolates the scores with the respective 57 | weights. 58 | """ 59 | return sum([s.score_matrix(hypothesis_matrix) * w for w, s in zip(self._weights, self._scorers)]) 60 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.it: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 34 | Adj 35 | Adm 36 | Adv 37 | Amn 38 | Arch 39 | Asst 40 | Avv 41 | Bart 42 | Bcc 43 | Bldg 44 | Brig 45 | Bros 46 | C.A.P 47 | C.P 48 | Capt 49 | Cc 50 | Cmdr 51 | Co 52 | Col 53 | Comdr 54 | Con 55 | Corp 56 | Cpl 57 | DR 58 | Dott 59 | Dr 60 | Drs 61 | Egr 62 | Ens 63 | Gen 64 | Geom 65 | Gov 66 | Hon 67 | Hosp 68 | Hr 69 | Id 70 | Ing 71 | Insp 72 | Lt 73 | MM 74 | MR 75 | MRS 76 | MS 77 | Maj 78 | Messrs 79 | Mlle 80 | Mme 81 | Mo 82 | Mons 83 | Mr 84 | Mrs 85 | Ms 86 | Msgr 87 | N.B 88 | Op 89 | Ord 90 | P.S 91 | P.T 92 | Pfc 93 | Ph 94 | Prof 95 | Pvt 96 | RP 97 | RSVP 98 | Rag 99 | Rep 100 | Reps 101 | Res 102 | Rev 103 | Rif 104 | Rt 105 | S.A 106 | S.B.F 107 | S.P.M 108 | S.p.A 109 | S.r.l 110 | Sen 111 | Sens 112 | Sfc 113 | Sgt 114 | Sig 115 | Sigg 116 | Soc 117 | Spett 118 | Sr 119 | St 120 | Supt 121 | Surg 122 | V.P 123 | 124 | # other 125 | a.c 126 | acc 127 | all 128 | banc 129 | c.a 130 | c.c.p 131 | c.m 132 | c.p 133 | c.s 134 | c.v 135 | corr 136 | dott 137 | e.p.c 138 | ecc 139 | es 140 | fatt 141 | gg 142 | int 143 | lett 144 | ogg 145 | on 146 | p.c 147 | p.c.c 148 | p.es 149 | p.f 150 | p.r 151 | p.v 152 | post 153 | pp 154 | racc 155 | ric 156 | s.n.c 157 | seg 158 | sgg 159 | ss 160 | tel 161 | u.s 162 | v.r 163 | v.s 164 | 165 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 166 | v 167 | vs 168 | i.e 169 | rev 170 | e.g 171 | 172 | #Numbers only. These should only induce breaks when followed by a numeric sequence 173 | # add NUMERIC_ONLY after the word for this function 174 | #This case is mostly for the english "No." 
which can either be a sentence of its own, or 175 | #if followed by a number, a non-breaking prefix 176 | No #NUMERIC_ONLY# 177 | Nos 178 | Art #NUMERIC_ONLY# 179 | Nr 180 | pp #NUMERIC_ONLY# 181 | -------------------------------------------------------------------------------- /test/en-ro/ref: -------------------------------------------------------------------------------- 1 | Comisia Europeană a decis , marți , să reia plățile pentru România în cadrul programelor " competitivitate economică " și " Mediu " , ambele întrerupte la începutul lunii aprilie 2015 . 2 | 0.995251238346 0.554548621178 0.986067473888 0.977536916733 0.471415698528 0.965951085091 0.991383254528 0.735538363457 0.99354493618 0.959721267223 0.960633397102 0.987248241901 0.73650187254 0.958207905293 0.329731225967 0.941679000854 0.48397654295 0.872097313404 0.995552778244 0.99405169487 0.820243418217 0.72900468111 0.978062391281 0.980996310711 0.959786713123 0.870699226856 0.956985473633 0.989414513111 0.948426306248 0.996526777744 0.996653676033 0.995466053486 0.999979257584 3 | judecătorul nu a exclus dacă melodia L@@ M@@ FAO în sine a fost o copie ne@@ autorizată a " H@@ ust@@ lin " " . 4 | 0.748930931091 0.976350605488 0.90377175808 0.238382071257 0.800515711308 0.51756888628 0.782619535923 0.955519676208 0.894009530544 0.183243229985 0.996174514294 0.782620131969 0.927685260773 0.802042484283 0.788843691349 0.390572547913 0.356075167656 0.823610961437 0.785067260265 0.941457808018 0.976138412952 0.979526996613 0.859899282455 0.516458272934 0.989753842354 0.999218225479 5 | naționala României face parte din Grupa D în Cupa Mondială din Anglia , alături de Franța , Irlanda , Canada și Italia . 6 | 0.336522132158 0.97390460968 0.485618531704 0.998266816139 0.977845489979 0.972954690456 0.995464265347 0.582527756691 0.900587379932 0.904148697853 0.926693975925 0.990065574646 0.982615590096 0.970086634159 0.995798170567 0.985046744347 0.999237596989 0.992471039295 0.998591423035 0.994875609875 0.995780050755 0.996373534203 0.996117174625 0.99995225668 7 | transmite un mesaj : țara dumneavoastră nu apreciază că devine părinte . 8 | 0.343225359917 0.930076539516 0.998842597008 0.99683535099 0.859772562981 0.548672556877 0.990485429764 0.126094281673 0.79455691576 0.418934345245 0.782570242882 0.974693894386 0.999907135963 9 | discu@@ tia despre care se va face la momentul oportun , trebuie sa avem in vedere traficul de tramvaie din zona si avem nevoie si de o aprobare de la Compania Nationala de auto@@ str@@ azi . 10 | 0.0200441926718 0.983767747879 0.254417777061 0.626059830189 0.914376199245 0.6536039114 0.598313570023 0.473640501499 0.660739302635 0.727347791195 0.59266859293 0.936970472336 0.982369661331 0.226246803999 0.963698983192 0.996792733669 0.877014875412 0.466113090515 0.902705550194 0.558049559593 0.952391505241 0.783247053623 0.856589257717 0.994170725346 0.818028509617 0.973481237888 0.627151310444 0.944025158882 0.787506222725 0.954702436924 0.380503386259 0.954006671906 0.737255275249 0.340464830399 0.983342587948 0.980352401733 0.994605183601 0.99982637167 11 | -------------------------------------------------------------------------------- /utils/copy_unknown_words.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | ''' 3 | This script is to replace the unknown words in target sentences with their aligned words in source sentences. 
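For each target word, the aligned source word is the one with the highest attention weight, i.e. the argmax over that target word's row of the soft-alignment matrix (see `copy_unknown_words()` below).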
4 | Args: 5 | - input: an alignment file produced by translating with the option '--output_alignment' 6 | - output: output text file 7 | - unknown word token (optional): a string, default="UNK" 8 | To use: 9 | python copy_unknown_words.py -i translation.txt -o updated_translation.txt -u 'UNK' 10 | ''' 11 | 12 | import json 13 | import numpy 14 | import argparse 15 | import sys 16 | 17 | ''' 18 | Example input file: 19 | 0 ||| das ist ein Test . ||| 0 ||| this is a UNK . ||| 6 6 20 | 0.723781 0.0561881 0.0652739 0.0888658 0.0159646 0.0499262 21 | 0.0250772 0.728351 0.105699 0.0764411 0.0245384 0.0398933 22 | 0.0257915 0.0667947 0.543118 0.177978 0.020311 0.166007 23 | 0.000306134 0.0161435 0.025201 0.937249 0.00364889 0.0174515 24 | 0.0116866 0.195885 0.0383414 0.0331976 0.437992 0.282897 25 | 0.0121966 0.00570636 0.00524746 0.014052 0.0325562 0.930241 26 | ''' 27 | 28 | def copy_unknown_words(filename, out_filename, unk_token): 29 | for line in filename: 30 | items = line.split(' ||| ') 31 | if len(items) > 1: 32 | src = items[1].split() 33 | target = items[3].split() 34 | alignments = [] 35 | elif line.strip(): 36 | alignment = map(float,line.split()) 37 | hard_alignment = numpy.argmax(alignment, axis=0) 38 | alignments.append(hard_alignment) 39 | elif line == '\n': 40 | print alignments 41 | for i, word in enumerate(target): 42 | if word == unk_token: 43 | target[i] = src[alignments[i]] 44 | out_filename.write(' '.join(target) + '\n') 45 | 46 | 47 | if __name__ == "__main__": 48 | parser = argparse.ArgumentParser() 49 | parser.add_argument('--input', '-i', type=argparse.FileType('r'), 50 | metavar='PATH', default=sys.stdin, 51 | help='''Input text file (produced by decoding with \'--output_alignment\')''') 52 | parser.add_argument('--output', '-o', type=argparse.FileType('w'), 53 | default=sys.stdout, metavar='PATH', 54 | help="Output file (default: standard output)") 55 | parser.add_argument('--unknown', '-u', type=str, nargs = '?', default="UNK", 56 | help='Unknown token to be replaced (default: "UNK")') 57 | 58 | args = parser.parse_args() 59 | 60 | copy_unknown_words(args.input, args.output, args.unknown) -------------------------------------------------------------------------------- /test/test_translate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import os 6 | import unittest 7 | import requests 8 | 9 | sys.path.append(os.path.abspath('../nematus')) 10 | from translate import main as translate 11 | 12 | 13 | def load_wmt16_model(src, target): 14 | path = os.path.join('models', '{0}-{1}'.format(src,target)) 15 | try: 16 | os.makedirs(path) 17 | except OSError: 18 | pass 19 | for filename in ['model.npz', 'model.npz.json', 'vocab.{0}.json'.format(src), 'vocab.{0}.json'.format(target)]: 20 | if not os.path.exists(os.path.join(path, filename)): 21 | r = requests.get('http://data.statmt.org/rsennrich/wmt16_systems/{0}-{1}/'.format(src,target) + filename, stream=True) 22 | with open(os.path.join(path, filename), 'wb') as f: 23 | for chunk in r.iter_content(1024**2): 24 | f.write(chunk) 25 | 26 | class TestTranslate(unittest.TestCase): 27 | """ 28 | Regression tests for translation with WMT16 models 29 | """ 30 | 31 | def setUp(self): 32 | """ 33 | Download pre-trained models 34 | """ 35 | load_wmt16_model('en','de') 36 | load_wmt16_model('en','ro') 37 | 38 | def outputEqual(self, output1, output2): 39 | """given two translation outputs, check that output string is 
identical, 40 | and probabilities are equal within rounding error. 41 | """ 42 | for i, (line, line2) in enumerate(zip(open(output1).readlines(), open(output2).readlines())): 43 | if not i % 2: 44 | self.assertEqual(line, line2) 45 | else: 46 | probs = map(float, line.split()) 47 | probs2 = map(float, line2.split()) 48 | for p, p2 in zip(probs, probs2): 49 | self.assertAlmostEqual(p, p2, 5) 50 | 51 | # English-German WMT16 system, no dropout 52 | def test_ende(self): 53 | os.chdir('models/en-de/') 54 | translate(['model.npz'], open('../../en-de/in'), open('../../en-de/out','w'), k=12, normalize=True, n_process=1, suppress_unk=True, print_word_probabilities=True) 55 | os.chdir('../..') 56 | self.outputEqual('en-de/ref','en-de/out') 57 | 58 | # English-Romanian WMT16 system, dropout 59 | def test_enro(self): 60 | os.chdir('models/en-ro/') 61 | translate(['model.npz'], open('../../en-ro/in'), open('../../en-ro/out','w'), k=12, normalize=True, n_process=1, suppress_unk=True, print_word_probabilities=True) 62 | os.chdir('../..') 63 | self.outputEqual('en-ro/ref','en-ro/out') 64 | 65 | 66 | 67 | if __name__ == '__main__': 68 | unittest.main() 69 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.ru: -------------------------------------------------------------------------------- 1 | # added Cyrillic uppercase letters [А-Я] 2 | # removed 000D carriage return (this is not removed by chomp in tokenizer.perl, and prevents recognition of the prefixes) 3 | # edited by Kate Young (nspaceanalysis@earthlink.net) 21 May 2013 4 | А 5 | Б 6 | В 7 | Г 8 | Д 9 | Е 10 | Ж 11 | З 12 | И 13 | Й 14 | К 15 | Л 16 | М 17 | Н 18 | О 19 | П 20 | Р 21 | С 22 | Т 23 | У 24 | Ф 25 | Х 26 | Ц 27 | Ч 28 | Ш 29 | Щ 30 | Ъ 31 | Ы 32 | Ь 33 | Э 34 | Ю 35 | Я 36 | A 37 | B 38 | C 39 | D 40 | E 41 | F 42 | G 43 | H 44 | I 45 | J 46 | K 47 | L 48 | M 49 | N 50 | O 51 | P 52 | Q 53 | R 54 | S 55 | T 56 | U 57 | V 58 | W 59 | X 60 | Y 61 | Z 62 | 0гг 63 | 1гг 64 | 2гг 65 | 3гг 66 | 4гг 67 | 5гг 68 | 6гг 69 | 7гг 70 | 8гг 71 | 9гг 72 | 0г 73 | 1г 74 | 2г 75 | 3г 76 | 4г 77 | 5г 78 | 6г 79 | 7г 80 | 8г 81 | 9г 82 | Xвв 83 | Vвв 84 | Iвв 85 | Lвв 86 | Mвв 87 | Cвв 88 | Xв 89 | Vв 90 | Iв 91 | Lв 92 | Mв 93 | Cв 94 | 0м 95 | 1м 96 | 2м 97 | 3м 98 | 4м 99 | 5м 100 | 6м 101 | 7м 102 | 8м 103 | 9м 104 | 0мм 105 | 1мм 106 | 2мм 107 | 3мм 108 | 4мм 109 | 5мм 110 | 6мм 111 | 7мм 112 | 8мм 113 | 9мм 114 | 0см 115 | 1см 116 | 2см 117 | 3см 118 | 4см 119 | 5см 120 | 6см 121 | 7см 122 | 8см 123 | 9см 124 | 0дм 125 | 1дм 126 | 2дм 127 | 3дм 128 | 4дм 129 | 5дм 130 | 6дм 131 | 7дм 132 | 8дм 133 | 9дм 134 | 0л 135 | 1л 136 | 2л 137 | 3л 138 | 4л 139 | 5л 140 | 6л 141 | 7л 142 | 8л 143 | 9л 144 | 0км 145 | 1км 146 | 2км 147 | 3км 148 | 4км 149 | 5км 150 | 6км 151 | 7км 152 | 8км 153 | 9км 154 | 0га 155 | 1га 156 | 2га 157 | 3га 158 | 4га 159 | 5га 160 | 6га 161 | 7га 162 | 8га 163 | 9га 164 | 0кг 165 | 1кг 166 | 2кг 167 | 3кг 168 | 4кг 169 | 5кг 170 | 6кг 171 | 7кг 172 | 8кг 173 | 9кг 174 | 0т 175 | 1т 176 | 2т 177 | 3т 178 | 4т 179 | 5т 180 | 6т 181 | 7т 182 | 8т 183 | 9т 184 | 0г 185 | 1г 186 | 2г 187 | 3г 188 | 4г 189 | 5г 190 | 6г 191 | 7г 192 | 8г 193 | 9г 194 | 0мг 195 | 1мг 196 | 2мг 197 | 3мг 198 | 4мг 199 | 5мг 200 | 6мг 201 | 7мг 202 | 8мг 203 | 9мг 204 | бульв 205 | в 206 | вв 207 | г 208 | га 209 | гг 210 | гл 211 | гос 212 | д 213 | дм 214 | доп 215 | др 216 | е 217 | ед 218 | ед 219 | зам 220 | и 221 | инд 222 | исп 223 | Исп 224 | к 225 | кап 226 | кг 227 |
кв 228 | кл 229 | км 230 | кол 231 | комн 232 | коп 233 | куб 234 | л 235 | лиц 236 | лл 237 | м 238 | макс 239 | мг 240 | мин 241 | мл 242 | млн 243 | млрд 244 | мм 245 | н 246 | наб 247 | нач 248 | неуд 249 | ном 250 | о 251 | обл 252 | обр 253 | общ 254 | ок 255 | ост 256 | отл 257 | п 258 | пер 259 | перераб 260 | пл 261 | пос 262 | пр 263 | просп 264 | проф 265 | р 266 | ред 267 | руб 268 | с 269 | сб 270 | св 271 | см 272 | соч 273 | ср 274 | ст 275 | стр 276 | т 277 | тел 278 | Тел 279 | тех 280 | тт 281 | туп 282 | тыс 283 | уд 284 | ул 285 | уч 286 | физ 287 | х 288 | хор 289 | ч 290 | чел 291 | шт 292 | экз 293 | э 294 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.pl: -------------------------------------------------------------------------------- 1 | adw 2 | afr 3 | akad 4 | al 5 | Al 6 | am 7 | amer 8 | arch 9 | art 10 | Art 11 | artyst 12 | astr 13 | austr 14 | bałt 15 | bdb 16 | bł 17 | bm 18 | br 19 | bryg 20 | bryt 21 | centr 22 | ces 23 | chem 24 | chiń 25 | chir 26 | c.k 27 | c.o 28 | cyg 29 | cyw 30 | cyt 31 | czes 32 | czw 33 | cd 34 | Cd 35 | czyt 36 | ćw 37 | ćwicz 38 | daw 39 | dcn 40 | dekl 41 | demokr 42 | det 43 | diec 44 | dł 45 | dn 46 | dot 47 | dol 48 | dop 49 | dost 50 | dosł 51 | h.c 52 | ds 53 | dst 54 | duszp 55 | dypl 56 | egz 57 | ekol 58 | ekon 59 | elektr 60 | em 61 | ew 62 | fab 63 | farm 64 | fot 65 | fr 66 | gat 67 | gastr 68 | geogr 69 | geol 70 | gimn 71 | głęb 72 | gm 73 | godz 74 | górn 75 | gosp 76 | gr 77 | gram 78 | hist 79 | hiszp 80 | hr 81 | Hr 82 | hot 83 | id 84 | in 85 | im 86 | iron 87 | jn 88 | kard 89 | kat 90 | katol 91 | k.k 92 | kk 93 | kol 94 | kl 95 | k.p.a 96 | kpc 97 | k.p.c 98 | kpt 99 | kr 100 | k.r 101 | krak 102 | k.r.o 103 | kryt 104 | kult 105 | laic 106 | łac 107 | niem 108 | woj 109 | nb 110 | np 111 | Nb 112 | Np 113 | pol 114 | pow 115 | m.in 116 | pt 117 | ps 118 | Pt 119 | Ps 120 | cdn 121 | jw 122 | ryc 123 | rys 124 | Ryc 125 | Rys 126 | tj 127 | tzw 128 | Tzw 129 | tzn 130 | zob 131 | ang 132 | ub 133 | ul 134 | pw 135 | pn 136 | pl 137 | al 138 | k 139 | n 140 | nr #NUMERIC_ONLY# 141 | Nr #NUMERIC_ONLY# 142 | ww 143 | wł 144 | ur 145 | zm 146 | żyd 147 | żarg 148 | żyw 149 | wył 150 | bp 151 | bp 152 | wyst 153 | tow 154 | Tow 155 | o 156 | sp 157 | Sp 158 | st 159 | spółdz 160 | Spółdz 161 | społ 162 | spółgł 163 | stoł 164 | stow 165 | Stoł 166 | Stow 167 | zn 168 | zew 169 | zewn 170 | zdr 171 | zazw 172 | zast 173 | zaw 174 | zał 175 | zal 176 | zam 177 | zak 178 | zakł 179 | zagr 180 | zach 181 | adw 182 | Adw 183 | lek 184 | Lek 185 | med 186 | mec 187 | Mec 188 | doc 189 | Doc 190 | dyw 191 | dyr 192 | Dyw 193 | Dyr 194 | inż 195 | Inż 196 | mgr 197 | Mgr 198 | dh 199 | dr 200 | Dh 201 | Dr 202 | p 203 | P 204 | red 205 | Red 206 | prof 207 | prok 208 | Prof 209 | Prok 210 | hab 211 | płk 212 | Płk 213 | nadkom 214 | Nadkom 215 | podkom 216 | Podkom 217 | ks 218 | Ks 219 | gen 220 | Gen 221 | por 222 | Por 223 | reż 224 | Reż 225 | przyp 226 | Przyp 227 | śp 228 | św 229 | śW 230 | Śp 231 | Św 232 | ŚW 233 | szer 234 | Szer 235 | pkt #NUMERIC_ONLY# 236 | str #NUMERIC_ONLY# 237 | tab #NUMERIC_ONLY# 238 | Tab #NUMERIC_ONLY# 239 | tel 240 | ust #NUMERIC_ONLY# 241 | par #NUMERIC_ONLY# 242 | poz 243 | pok 244 | oo 245 | oO 246 | Oo 247 | OO 248 | r #NUMERIC_ONLY# 249 | l #NUMERIC_ONLY# 250 | s #NUMERIC_ONLY# 251 | najśw 252 | Najśw 253 | A 254 | B 255 | C 256 | D 257 | E 258 | F 259 | G 260 | H 261 | I 262 | J 263 | K 264 | L 
265 | M 266 | N 267 | O 268 | P 269 | Q 270 | R 271 | S 272 | T 273 | U 274 | V 275 | W 276 | X 277 | Y 278 | Z 279 | Ś 280 | Ć 281 | Ż 282 | Ź 283 | Dz 284 | -------------------------------------------------------------------------------- /nematus/metrics/test_chrf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import unittest 5 | 6 | from chrf import CharacterFScorer 7 | 8 | class TestCharacterFScoreReference(unittest.TestCase): 9 | """ 10 | Regression tests for CharacterFScorer 11 | """ 12 | @staticmethod 13 | def tokenize(sentence): 14 | return sentence.split(" ") 15 | def test_identical_segments(self): 16 | segment = self.tokenize("Consistency is the last refuge of the unimaginative") 17 | scorer = CharacterFScorer('n=6,beta=3') 18 | scorer.set_reference(segment) 19 | self.assertEqual(scorer.score(segment), 1.0) 20 | def test_completely_different_segments(self): 21 | segment_a = self.tokenize("AAAAAA") 22 | segment_b = self.tokenize("BBBB") 23 | scorer = CharacterFScorer('n=3,beta=3') 24 | scorer.set_reference(segment_a) 25 | self.assertEqual(scorer.score(segment_b), 0.0) 26 | def test_empty_string(self): 27 | segment_a = self.tokenize("") 28 | segment_b = self.tokenize("") 29 | scorer = CharacterFScorer('n=6,beta=3') 30 | scorer.set_reference(segment_a) 31 | self.assertEqual(scorer.score(segment_b), 1.0) 32 | def test_one_character_empty_string(self): 33 | segment_a = self.tokenize("A") 34 | segment_b = self.tokenize("") 35 | scorer = CharacterFScorer('n=6,beta=3') 36 | scorer.set_reference(segment_a) 37 | self.assertEqual(scorer.score(segment_b), 0.0) 38 | def test_empty_string_one_character(self): 39 | segment_a = self.tokenize("") 40 | segment_b = self.tokenize("A") 41 | scorer = CharacterFScorer('n=6,beta=3') 42 | scorer.set_reference(segment_a) 43 | self.assertEqual(scorer.score(segment_b), 0.0) 44 | def test_half_right(self): 45 | segment_a = self.tokenize("AB") 46 | segment_b = self.tokenize("AA") 47 | scorer = CharacterFScorer('n=6,beta=3') 48 | scorer.set_reference(segment_a) 49 | self.assertEqual(scorer.score(segment_b), 0.25) 50 | def test_one_character(self): 51 | segment_a = self.tokenize("A") 52 | segment_b = self.tokenize("A") 53 | scorer = CharacterFScorer('n=6,beta=3') 54 | scorer.set_reference(segment_a) 55 | self.assertEqual(scorer.score(segment_b), 1.0) 56 | def test_almost_correct(self): 57 | segment_a = self.tokenize("risk assessment has to be undertaken by those who are qualified and expert in that area - that is the scientists .") 58 | segment_b = self.tokenize(" risk assessment must be made of those who are qualified and expertise in the sector - these are the scientists .") 59 | scorer = CharacterFScorer('n=6,beta=3') 60 | scorer.set_reference(segment_a) 61 | self.assertEqual('{0:.12f}'.format(scorer.score(segment_b)), "0.652414427449") 62 | 63 | if __name__ == '__main__': 64 | unittest.main() 65 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.pt: -------------------------------------------------------------------------------- 1 | #File adapted for PT by H. Leal Fontes from the EN & DE versions published with moses-2009-04-13. Last update: 10.11.2009. 2 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 3 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
4 | 5 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 6 | #usually upper case letters are initials in a name 7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | 61 | #Roman Numerals. A dot after one of these is not a sentence break in Portuguese. 62 | I 63 | II 64 | III 65 | IV 66 | V 67 | VI 68 | VII 69 | VIII 70 | IX 71 | X 72 | XI 73 | XII 74 | XIII 75 | XIV 76 | XV 77 | XVI 78 | XVII 79 | XVIII 80 | XIX 81 | XX 82 | i 83 | ii 84 | iii 85 | iv 86 | v 87 | vi 88 | vii 89 | viii 90 | ix 91 | x 92 | xi 93 | xii 94 | xiii 95 | xiv 96 | xv 97 | xvi 98 | xvii 99 | xviii 100 | xix 101 | xx 102 | 103 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 104 | Adj 105 | Adm 106 | Adv 107 | Art 108 | Ca 109 | Capt 110 | Cmdr 111 | Col 112 | Comdr 113 | Con 114 | Corp 115 | Cpl 116 | DR 117 | DRA 118 | Dr 119 | Dra 120 | Dras 121 | Drs 122 | Eng 123 | Enga 124 | Engas 125 | Engos 126 | Ex 127 | Exo 128 | Exmo 129 | Fig 130 | Gen 131 | Hosp 132 | Insp 133 | Lda 134 | MM 135 | MR 136 | MRS 137 | MS 138 | Maj 139 | Mrs 140 | Ms 141 | Msgr 142 | Op 143 | Ord 144 | Pfc 145 | Ph 146 | Prof 147 | Pvt 148 | Rep 149 | Reps 150 | Res 151 | Rev 152 | Rt 153 | Sen 154 | Sens 155 | Sfc 156 | Sgt 157 | Sr 158 | Sra 159 | Sras 160 | Srs 161 | Sto 162 | Supt 163 | Surg 164 | adj 165 | adm 166 | adv 167 | art 168 | cit 169 | col 170 | con 171 | corp 172 | cpl 173 | dr 174 | dra 175 | dras 176 | drs 177 | eng 178 | enga 179 | engas 180 | engos 181 | ex 182 | exo 183 | exmo 184 | fig 185 | op 186 | prof 187 | sr 188 | sra 189 | sras 190 | srs 191 | sto 192 | 193 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 194 | v 195 | vs 196 | i.e 197 | rev 198 | e.g 199 | 200 | #Numbers only. These should only induce breaks when followed by a numeric sequence 201 | # add NUMERIC_ONLY after the word for this function 202 | #This case is mostly for the english "No." which can either be a sentence of its own, or 203 | #if followed by a number, a non-breaking prefix 204 | No #NUMERIC_ONLY# 205 | Nos 206 | Art #NUMERIC_ONLY# 207 | Nr 208 | p #NUMERIC_ONLY# 209 | pp #NUMERIC_ONLY# 210 | 211 | -------------------------------------------------------------------------------- /utils/visualize_probs.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | 4 | # given a source sentence, a target sentence, and a sequence of probabilities (one per target word, plus an end-of-sentence probability), 5 | # visualize the probability of each target word via HTML output. 6 | # black fields indicate high confidence, light fields low confidence. 7 | # example input: 8 | """ 9 | Unsere digitalen Leben haben die Notwendigkeit, stark, lebenslustig und erfolgreich zu erscheinen, verdoppelt. 10 | Our digital lives have doubled the need to appear strong, lifel... ike and successful . 
11 | 0.882218956947 0.989946246147 0.793388187885 0.790167689323 0.768674969673 0.941913545132 0.955783545971 0.777168631554 0.266917765141 0.909709095955 0.990240097046 0.341023534536 0.828059256077 0.854399263859 0.906807541847 0.960786998272 0.997184157372""" 12 | 13 | html_text = """ 15 | 16 | 17 | 18 | Results page 19 | 20 | 35 | 36 | \n 37 | \n 38 | 39 | 40 | {0} 41 |
42 | 43 | 44 | 45 | """ 46 | 47 | 48 | def print_probdist(infile, outfile): 49 | 50 | entries = [] 51 | 52 | for i, line in enumerate(infile): 53 | if i % 3 == 0: 54 | #words = line.split() 55 | entry = "" 56 | #for w in words: 57 | #entry += "" + w + "\n" 58 | entry = "" + line + "\n" 59 | entries.append(entry) 60 | 61 | if i % 3 == 1: 62 | words = line.split() 63 | words.append('</s>') 64 | elif i % 3 == 2: 65 | probs = map(float, line.split()) 66 | entry = "" 67 | for w,p in zip(words, probs): 68 | color = '#%02x%02x%02x' % (int((1-p)*255), int((1-p)*255), int((1-p)*255)) 69 | entry += "{1}".format(color, w) 70 | entry = "" + entry + "\n" 71 | entries.append(entry) 72 | 73 | 74 | outfile.write(html_text.format('\n'.join(entries))) 75 | 76 | 77 | parser = argparse.ArgumentParser() 78 | parser.add_argument('--input', '-i', type=argparse.FileType('r'), 79 | default=sys.stdin, metavar='PATH', 80 | help="Input file (default: standard input)") 81 | parser.add_argument('--output', '-o', type=argparse.FileType('w'), 82 | default=sys.stdout, metavar='PATH', 83 | help="Output file (default: standard output)") 84 | 85 | args = parser.parse_args() 86 | 87 | print_probdist(args.input, args.output) -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.ta: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | அ 7 | ஆ 8 | இ 9 | ஈ 10 | உ 11 | ஊ 12 | எ 13 | ஏ 14 | ஐ 15 | ஒ 16 | ஓ 17 | ஔ 18 | ஃ 19 | க 20 | கா 21 | கி 22 | கீ 23 | கு 24 | கூ 25 | கெ 26 | கே 27 | கை 28 | கொ 29 | கோ 30 | கௌ 31 | க் 32 | ச 33 | சா 34 | சி 35 | சீ 36 | சு 37 | சூ 38 | செ 39 | சே 40 | சை 41 | சொ 42 | சோ 43 | சௌ 44 | ச் 45 | ட 46 | டா 47 | டி 48 | டீ 49 | டு 50 | டூ 51 | டெ 52 | டே 53 | டை 54 | டொ 55 | டோ 56 | டௌ 57 | ட் 58 | த 59 | தா 60 | தி 61 | தீ 62 | து 63 | தூ 64 | தெ 65 | தே 66 | தை 67 | தொ 68 | தோ 69 | தௌ 70 | த் 71 | ப 72 | பா 73 | பி 74 | பீ 75 | பு 76 | பூ 77 | பெ 78 | பே 79 | பை 80 | பொ 81 | போ 82 | பௌ 83 | ப் 84 | ற 85 | றா 86 | றி 87 | றீ 88 | று 89 | றூ 90 | றெ 91 | றே 92 | றை 93 | றொ 94 | றோ 95 | றௌ 96 | ற் 97 | ய 98 | யா 99 | யி 100 | யீ 101 | யு 102 | யூ 103 | யெ 104 | யே 105 | யை 106 | யொ 107 | யோ 108 | யௌ 109 | ய் 110 | ர 111 | ரா 112 | ரி 113 | ரீ 114 | ரு 115 | ரூ 116 | ரெ 117 | ரே 118 | ரை 119 | ரொ 120 | ரோ 121 | ரௌ 122 | ர் 123 | ல 124 | லா 125 | லி 126 | லீ 127 | லு 128 | லூ 129 | லெ 130 | லே 131 | லை 132 | லொ 133 | லோ 134 | லௌ 135 | ல் 136 | வ 137 | வா 138 | வி 139 | வீ 140 | வு 141 | வூ 142 | வெ 143 | வே 144 | வை 145 | வொ 146 | வோ 147 | வௌ 148 | வ் 149 | ள 150 | ளா 151 | ளி 152 | ளீ 153 | ளு 154 | ளூ 155 | ளெ 156 | ளே 157 | ளை 158 | ளொ 159 | ளோ 160 | ளௌ 161 | ள் 162 | ழ 163 | ழா 164 | ழி 165 | ழீ 166 | ழு 167 | ழூ 168 | ழெ 169 | ழே 170 | ழை 171 | ழொ 172 | ழோ 173 | ழௌ 174 | ழ் 175 | ங 176 | ஙா 177 | ஙி 178 | ஙீ 179 | ஙு 180 | ஙூ 181 | ஙெ 182 | ஙே 183 | ஙை 184 | ஙொ 185 | ஙோ 186 | ஙௌ 187 | ங் 188 | ஞ 189 | ஞா 190 | ஞி 191 | ஞீ 192 | ஞு 193 | ஞூ 194 | ஞெ 195 | ஞே 196 | ஞை 197 | ஞொ 198 | ஞோ 199 | ஞௌ 200 | ஞ் 201 | ண 202 | ணா 203 | ணி 204 | ணீ 205 | ணு 206 | ணூ 207 | ணெ 208 | ணே 209 | ணை 210 | ணொ 211 | ணோ 212 | ணௌ 213 | ண் 
214 | ந 215 | நா 216 | நி 217 | நீ 218 | நு 219 | நூ 220 | நெ 221 | நே 222 | நை 223 | நொ 224 | நோ 225 | நௌ 226 | ந் 227 | ம 228 | மா 229 | மி 230 | மீ 231 | மு 232 | மூ 233 | மெ 234 | மே 235 | மை 236 | மொ 237 | மோ 238 | மௌ 239 | ம் 240 | ன 241 | னா 242 | னி 243 | னீ 244 | னு 245 | னூ 246 | னெ 247 | னே 248 | னை 249 | னொ 250 | னோ 251 | னௌ 252 | ன் 253 | 254 | 255 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 256 | திரு 257 | திருமதி 258 | வண 259 | கௌரவ 260 | 261 | 262 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 263 | உ.ம் 264 | #கா.ம் 265 | #எ.ம் 266 | 267 | 268 | #Numbers only. These should only induce breaks when followed by a numeric sequence 269 | # add NUMERIC_ONLY after the word for this function 270 | #This case is mostly for the english "No." which can either be a sentence of its own, or 271 | #if followed by a number, a non-breaking prefix 272 | No #NUMERIC_ONLY# 273 | Nos 274 | Art #NUMERIC_ONLY# 275 | Nr 276 | pp #NUMERIC_ONLY# 277 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.de: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | #no german words end in single lower-case letters, so we throw those in too. 7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | 61 | #Roman Numerals. A dot after one of these is not a sentence break in German. 
62 | I 63 | II 64 | III 65 | IV 66 | V 67 | VI 68 | VII 69 | VIII 70 | IX 71 | X 72 | XI 73 | XII 74 | XIII 75 | XIV 76 | XV 77 | XVI 78 | XVII 79 | XVIII 80 | XIX 81 | XX 82 | i 83 | ii 84 | iii 85 | iv 86 | v 87 | vi 88 | vii 89 | viii 90 | ix 91 | x 92 | xi 93 | xii 94 | xiii 95 | xiv 96 | xv 97 | xvi 98 | xvii 99 | xviii 100 | xix 101 | xx 102 | 103 | #Titles and Honorifics 104 | Adj 105 | Adm 106 | Adv 107 | Asst 108 | Bart 109 | Bldg 110 | Brig 111 | Bros 112 | Capt 113 | Cmdr 114 | Col 115 | Comdr 116 | Con 117 | Corp 118 | Cpl 119 | DR 120 | Dr 121 | Ens 122 | Gen 123 | Gov 124 | Hon 125 | Hosp 126 | Insp 127 | Lt 128 | MM 129 | MR 130 | MRS 131 | MS 132 | Maj 133 | Messrs 134 | Mlle 135 | Mme 136 | Mr 137 | Mrs 138 | Ms 139 | Msgr 140 | Op 141 | Ord 142 | Pfc 143 | Ph 144 | Prof 145 | Pvt 146 | Rep 147 | Reps 148 | Res 149 | Rev 150 | Rt 151 | Sen 152 | Sens 153 | Sfc 154 | Sgt 155 | Sr 156 | St 157 | Supt 158 | Surg 159 | 160 | #Misc symbols 161 | Mio 162 | Mrd 163 | bzw 164 | v 165 | vs 166 | usw 167 | d.h 168 | z.B 169 | u.a 170 | etc 171 | Mrd 172 | MwSt 173 | ggf 174 | d.J 175 | D.h 176 | m.E 177 | vgl 178 | I.F 179 | z.T 180 | sogen 181 | ff 182 | u.E 183 | g.U 184 | g.g.A 185 | c.-à-d 186 | Buchst 187 | u.s.w 188 | sog 189 | u.ä 190 | Std 191 | evtl 192 | Zt 193 | Chr 194 | u.U 195 | o.ä 196 | Ltd 197 | b.A 198 | z.Zt 199 | spp 200 | sen 201 | SA 202 | k.o 203 | jun 204 | i.H.v 205 | dgl 206 | dergl 207 | Co 208 | zzt 209 | usf 210 | s.p.a 211 | Dkr 212 | Corp 213 | bzgl 214 | BSE 215 | 216 | #Number indicators 217 | # add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it 218 | No 219 | Nos 220 | Art 221 | Nr 222 | pp 223 | ca 224 | Ca 225 | 226 | #Ordinals are done with . in German - "1." = "1st" in English 227 | 1 228 | 2 229 | 3 230 | 4 231 | 5 232 | 6 233 | 7 234 | 8 235 | 9 236 | 10 237 | 11 238 | 12 239 | 13 240 | 14 241 | 15 242 | 16 243 | 17 244 | 18 245 | 19 246 | 20 247 | 21 248 | 22 249 | 23 250 | 24 251 | 25 252 | 26 253 | 27 254 | 28 255 | 29 256 | 30 257 | 31 258 | 32 259 | 33 260 | 34 261 | 35 262 | 36 263 | 37 264 | 38 265 | 39 266 | 40 267 | 41 268 | 42 269 | 43 270 | 44 271 | 45 272 | 46 273 | 47 274 | 48 275 | 49 276 | 50 277 | 51 278 | 52 279 | 53 280 | 54 281 | 55 282 | 56 283 | 57 284 | 58 285 | 59 286 | 60 287 | 61 288 | 62 289 | 63 290 | 64 291 | 65 292 | 66 293 | 67 294 | 68 295 | 69 296 | 70 297 | 71 298 | 72 299 | 73 300 | 74 301 | 75 302 | 76 303 | 77 304 | 78 305 | 79 306 | 80 307 | 81 308 | 82 309 | 83 310 | 84 311 | 85 312 | 86 313 | 87 314 | 88 315 | 89 316 | 90 317 | 91 318 | 92 319 | 93 320 | 94 321 | 95 322 | 96 323 | 97 324 | 98 325 | 99 326 | -------------------------------------------------------------------------------- /nematus/metrics/beer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import subprocess, threading 5 | from scorer import Scorer 6 | from reference import Reference 7 | 8 | class BeerError(Exception): 9 | def __init__(self, value): 10 | self.value = value 11 | def __str__(self): 12 | return repr(self.value) 13 | 14 | class BeerScorer(Scorer): 15 | """ 16 | Python wrapper for the BEER metric. Starts a BEER process and keeps it alive, so that the model 17 | can be kept in memeory. Arguments are the BEER language abbreviation and the path to the BEER 18 | installation. They need to be specified as follows:"beer_language=lg,beer_path=path" (any order). 
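    Example (the installation path is hypothetical; `reference_tokens` and
    `hypothesis_tokens` stand for lists of token strings):

        scorer = ScorerProvider().get("BEER beer_language=en,beer_path=/path/to/beer")
        scorer.set_reference(reference_tokens)
        score = scorer.score(hypothesis_tokens)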
19 | """ 20 | def __init__(self, argument_string): 21 | Scorer.__init__(self, argument_string) 22 | 23 | #Lock for the BEER process, which can only handle one request at a time: 24 | self.lock = threading.Lock() 25 | 26 | #Get necessary arguments for starting BEER from argument string parsed in Scorer.__init__() 27 | self._beer_language = self._arguments["beer_language"] 28 | self._beer_path = self._arguments["beer_path"] + "/" 29 | 30 | #Start a BEER process: 31 | command = self._beer_path+"beer -l "+self._beer_language+" --workingMode interactive " 32 | self.beer_process = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) 33 | 34 | def set_reference(self, reference_tokens): 35 | """ 36 | Construct a BeerReference from a sequence of tokens and make it the reference against which the scorer evaluates hypotheses. 37 | This can be done any time. 38 | """ 39 | self.lock.acquire() 40 | self._reference = BeerReference(reference_tokens, self) 41 | self.lock.release() 42 | 43 | def terminate_process(self): 44 | """ 45 | Waits for the current request to be processed and terminates the BEER process. 46 | """ 47 | self.lock.acquire() 48 | self.beer_process.terminate() 49 | self.lock.release() 50 | 51 | def kill_process(self): 52 | """ 53 | Kills the BEER process right away. 54 | """ 55 | self.beer_process.kill() 56 | 57 | class BeerReference(Reference): 58 | """ 59 | BEER reference object, against which hypotheses can be scored. 60 | """ 61 | def __init__(self, reference_tokens, beer_scorer): 62 | Reference.__init__(self, reference_tokens) 63 | 64 | #Construct reference string from tokens 65 | self._reference_string = " ".join(reference_tokens) 66 | self._beer_scorer = beer_scorer 67 | 68 | def score(self, hypothesis_tokens): 69 | 70 | #Construct hypothesis string from hypothesis tokens: 71 | hypothesis_string = " ".join(hypothesis_tokens) 72 | 73 | #Acquire lock to make sure BEER process is not in use: 74 | self._beer_scorer.lock.acquire() 75 | 76 | #Score hypothesis string against reference string 77 | try: 78 | self._beer_scorer.beer_process.stdin.write("EVAL ||| "+hypothesis_string+" ||| "+self._reference_string+"\n") 79 | except: 80 | raise BeerError("Beer returned the following error: "+ self._beer_scorer.beer_process.stderr.readline().strip()) 81 | 82 | #Read feature values from process output 83 | std_out = self._beer_scorer.beer_process.stdout.readline() 84 | #Release the process lock 85 | self._beer_scorer.lock.release() 86 | 87 | #Check if BEER returned a score: 88 | try: 89 | n = float(std_out) 90 | except: 91 | raise BeerError("Beer returned the following error: "+ self._beer_scorer.beer_process.stderr.readline().strip()) 92 | #Return final score 93 | return n -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.cs: -------------------------------------------------------------------------------- 1 | Bc 2 | BcA 3 | Ing 4 | Ing.arch 5 | MUDr 6 | MVDr 7 | MgA 8 | Mgr 9 | JUDr 10 | PhDr 11 | RNDr 12 | PharmDr 13 | ThLic 14 | ThDr 15 | Ph.D 16 | Th.D 17 | prof 18 | doc 19 | CSc 20 | DrSc 21 | dr. h. 
c 22 | PaedDr 23 | Dr 24 | PhMr 25 | DiS 26 | abt 27 | ad 28 | a.i 29 | aj 30 | angl 31 | anon 32 | apod 33 | atd 34 | atp 35 | aut 36 | bd 37 | biogr 38 | b.m 39 | b.p 40 | b.r 41 | cca 42 | cit 43 | cizojaz 44 | c.k 45 | col 46 | čes 47 | čín 48 | čj 49 | ed 50 | facs 51 | fasc 52 | fol 53 | fot 54 | franc 55 | h.c 56 | hist 57 | hl 58 | hrsg 59 | ibid 60 | il 61 | ind 62 | inv.č 63 | jap 64 | jhdt 65 | jv 66 | koed 67 | kol 68 | korej 69 | kl 70 | krit 71 | lat 72 | lit 73 | m.a 74 | maď 75 | mj 76 | mp 77 | násl 78 | např 79 | nepubl 80 | něm 81 | no 82 | nr 83 | n.s 84 | okr 85 | odd 86 | odp 87 | obr 88 | opr 89 | orig 90 | phil 91 | pl 92 | pokrač 93 | pol 94 | port 95 | pozn 96 | př.kr 97 | př.n.l 98 | přel 99 | přeprac 100 | příl 101 | pseud 102 | pt 103 | red 104 | repr 105 | resp 106 | revid 107 | rkp 108 | roč 109 | roz 110 | rozš 111 | samost 112 | sect 113 | sest 114 | seš 115 | sign 116 | sl 117 | srv 118 | stol 119 | sv 120 | šk 121 | šk.ro 122 | špan 123 | tab 124 | t.č 125 | tis 126 | tj 127 | tř 128 | tzv 129 | univ 130 | uspoř 131 | vol 132 | vl.jm 133 | vs 134 | vyd 135 | vyobr 136 | zal 137 | zejm 138 | zkr 139 | zprac 140 | zvl 141 | n.p 142 | např 143 | než 144 | MUDr 145 | abl 146 | absol 147 | adj 148 | adv 149 | ak 150 | ak. sl 151 | akt 152 | alch 153 | amer 154 | anat 155 | angl 156 | anglosas 157 | arab 158 | arch 159 | archit 160 | arg 161 | astr 162 | astrol 163 | att 164 | bás 165 | belg 166 | bibl 167 | biol 168 | boh 169 | bot 170 | bulh 171 | círk 172 | csl 173 | č 174 | čas 175 | čes 176 | dat 177 | děj 178 | dep 179 | dět 180 | dial 181 | dór 182 | dopr 183 | dosl 184 | ekon 185 | epic 186 | etnonym 187 | eufem 188 | f 189 | fam 190 | fem 191 | fil 192 | film 193 | form 194 | fot 195 | fr 196 | fut 197 | fyz 198 | gen 199 | geogr 200 | geol 201 | geom 202 | germ 203 | gram 204 | hebr 205 | herald 206 | hist 207 | hl 208 | hovor 209 | hud 210 | hut 211 | chcsl 212 | chem 213 | ie 214 | imp 215 | impf 216 | ind 217 | indoevr 218 | inf 219 | instr 220 | interj 221 | ión 222 | iron 223 | it 224 | kanad 225 | katalán 226 | klas 227 | kniž 228 | komp 229 | konj 230 | 231 | konkr 232 | kř 233 | kuch 234 | lat 235 | lék 236 | les 237 | lid 238 | lit 239 | liturg 240 | lok 241 | log 242 | m 243 | mat 244 | meteor 245 | metr 246 | mod 247 | ms 248 | mysl 249 | n 250 | náb 251 | námoř 252 | neklas 253 | něm 254 | nesklon 255 | nom 256 | ob 257 | obch 258 | obyč 259 | ojed 260 | opt 261 | part 262 | pas 263 | pejor 264 | pers 265 | pf 266 | pl 267 | plpf 268 | 269 | práv 270 | prep 271 | předl 272 | přivl 273 | r 274 | rcsl 275 | refl 276 | reg 277 | rkp 278 | ř 279 | řec 280 | s 281 | samohl 282 | sg 283 | sl 284 | souhl 285 | spec 286 | srov 287 | stfr 288 | střv 289 | stsl 290 | subj 291 | subst 292 | superl 293 | sv 294 | sz 295 | táz 296 | tech 297 | telev 298 | teol 299 | trans 300 | typogr 301 | var 302 | vedl 303 | verb 304 | vl. jm 305 | voj 306 | vok 307 | vůb 308 | vulg 309 | výtv 310 | vztaž 311 | zahr 312 | zájm 313 | zast 314 | zejm 315 | 316 | zeměd 317 | zkr 318 | zř 319 | mj 320 | dl 321 | atp 322 | sport 323 | Mgr 324 | horn 325 | MVDr 326 | JUDr 327 | RSDr 328 | Bc 329 | PhDr 330 | ThDr 331 | Ing 332 | aj 333 | apod 334 | PharmDr 335 | pomn 336 | ev 337 | slang 338 | nprap 339 | odp 340 | dop 341 | pol 342 | st 343 | stol 344 | p. n. l 345 | před n. l 346 | n. l 347 | př. Kr 348 | po Kr 349 | př. n. l 350 | odd 351 | RNDr 352 | tzv 353 | atd 354 | tzn 355 | resp 356 | tj 357 | p 358 | br 359 | č. j 360 | čj 361 | č. p 362 | čp 363 | a. 
s 364 | s. r. o 365 | spol. s r. o 366 | p. o 367 | s. p 368 | v. o. s 369 | k. s 370 | o. p. s 371 | o. s 372 | v. r 373 | v z 374 | ml 375 | vč 376 | kr 377 | mld 378 | hod 379 | popř 380 | ap 381 | event 382 | rus 383 | slov 384 | rum 385 | švýc 386 | P. T 387 | zvl 388 | hor 389 | dol 390 | S.O.S -------------------------------------------------------------------------------- /nematus/metrics/sentence_bleu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from __future__ import division 5 | 6 | from math import exp 7 | from operator import mul 8 | from collections import defaultdict 9 | 10 | from scorer import Scorer 11 | from reference import Reference 12 | 13 | class SentenceBleuScorer(Scorer): 14 | """ 15 | Scores SentenceBleuReference objects. 16 | """ 17 | 18 | def __init__(self, argument_string): 19 | """ 20 | Initialises metric-specific parameters. 21 | """ 22 | Scorer.__init__(self, argument_string) 23 | # use n-gram order of 4 by default 24 | if 'n' not in self._arguments: 25 | self._arguments['n'] = 4 26 | 27 | def set_reference(self, reference_tokens): 28 | """ 29 | Sets the reference against which hypotheses are scored. 30 | """ 31 | self._reference = SentenceBleuReference( 32 | reference_tokens, 33 | self._arguments['n'] 34 | ) 35 | 36 | class SentenceBleuReference(Reference): 37 | """ 38 | Smoothed sentence-level BLEU as proposed by Lin and Och (2004). 39 | Implemented as described in (Chen and Cherry, 2014). 40 | """ 41 | 42 | def __init__(self, reference_tokens, n=4): 43 | """ 44 | @param reference the reference translation that hypotheses shall be 45 | scored against. Must be an iterable of tokens (any 46 | type). 47 | @param n maximum n-gram order to consider. 48 | """ 49 | Reference.__init__(self, reference_tokens) 50 | self.n = n 51 | # preprocess reference 52 | self._reference_length = len(self._reference_tokens) 53 | self._reference_ngrams = self._get_ngrams(self._reference_tokens, self.n) 54 | 55 | def _get_ngrams(self, tokens, max_n): 56 | """ 57 | Extracts all n-grams of order 1 up to (and including) @param max_n from 58 | a list of @param tokens. 59 | """ 60 | n_grams = [] 61 | for n in range(1, max_n+1): 62 | n_grams.append(defaultdict(int)) 63 | for n_gram in zip(*[tokens[i:] for i in range(n)]): 64 | n_grams[n-1][n_gram] += 1 65 | return n_grams 66 | 67 | def score(self, hypothesis_tokens): 68 | """ 69 | Scores @param hypothesis against this reference. 70 | 71 | @return the smoothed sentence-level BLEU score: 1.0 is best, 0.0 worst.
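
        A sketch of what the code below computes, for maximum order N = self.n:

            BLEU(hyp) = BP * (p_1 * p_2 * ... * p_N) ** (1/N)
            BP        = min(1.0, exp(1 - len(ref)/len(hyp)))

        where p_n is the n-gram precision, add-one smoothed for n >= 2.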
72 | """ 73 | def product(iterable): 74 | return reduce(mul, iterable, 1) 75 | def ngram_precisions(ref_ngrams, hyp_ngrams): 76 | precisions = [] 77 | for n in range(1, self.n+1): 78 | overlap = 0 79 | for ref_ngram, ref_ngram_count in ref_ngrams[n-1].iteritems(): 80 | if ref_ngram in hyp_ngrams[n-1]: 81 | overlap += min(ref_ngram_count, hyp_ngrams[n-1][ref_ngram]) 82 | hyp_length = max(0, len(hypothesis_tokens)-n+1) 83 | if n >= 2: 84 | # smoothing as proposed by Lin and Och (2004), 85 | # implemented as described in (Chen and Cherry, 2014) 86 | overlap += 1 87 | hyp_length += 1 88 | precisions.append(overlap/hyp_length if hyp_length > 0 else 0.0) 89 | return precisions 90 | def brevity_penalty(ref_length, hyp_length): 91 | return min(1.0, exp(1-(ref_length/hyp_length if hyp_length > 0 else 0.0))) 92 | # preprocess hypothesis 93 | hypothesis_length = len(hypothesis_tokens) 94 | hypothesis_ngrams = self._get_ngrams(hypothesis_tokens, self.n) 95 | # calculate n-gram precision for all orders 96 | np = ngram_precisions(self._reference_ngrams, hypothesis_ngrams) 97 | # calculate brevity penalty 98 | bp = brevity_penalty(self._reference_length, hypothesis_length) 99 | # compose final BLEU score 100 | return product(np)**(1/self.n) * bp 101 | -------------------------------------------------------------------------------- /nematus/theano_util.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Theano utility functions 3 | ''' 4 | 5 | import json 6 | import cPickle as pkl 7 | import numpy 8 | from collections import OrderedDict 9 | import warnings 10 | import theano 11 | import theano.tensor as tensor 12 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 13 | 14 | # push parameters to Theano shared variables 15 | def zip_to_theano(params, tparams): 16 | for kk, vv in params.iteritems(): 17 | tparams[kk].set_value(vv) 18 | 19 | 20 | # pull parameters from Theano shared variables 21 | def unzip_from_theano(zipped, excluding_prefix=None): 22 | new_params = OrderedDict() 23 | for kk, vv in zipped.iteritems(): 24 | if excluding_prefix and (kk.startswith(excluding_prefix)): 25 | continue 26 | new_params[kk] = vv.get_value() 27 | return new_params 28 | 29 | 30 | # get the list of parameters: Note that tparams must be OrderedDict 31 | def itemlist(tparams): 32 | return [vv for kk, vv in tparams.iteritems()] 33 | 34 | # make prefix-appended name 35 | def pp(pp, name): 36 | return '%s_%s' % (pp, name) 37 | 38 | # initialize Theano shared variables according to the initial parameters 39 | def init_theano_params(params): 40 | tparams = OrderedDict() 41 | for kk, pp in params.iteritems(): 42 | tparams[kk] = theano.shared(params[kk], name=kk) 43 | return tparams 44 | 45 | 46 | # load parameters 47 | def load_params(path, params, with_prefix=''): 48 | pp = numpy.load(path) 49 | new_params = OrderedDict() 50 | for kk, vv in params.iteritems(): 51 | if kk not in pp: 52 | warnings.warn('%s is not in the archive' % kk) 53 | continue 54 | new_params[with_prefix+kk] = pp[kk] 55 | 56 | params.update(new_params) 57 | return params 58 | 59 | # load parameters of the optimizer 60 | def load_optimizer_params(path, optimizer_name): 61 | params = {} 62 | pp = numpy.load(path) 63 | for kk in pp: 64 | if kk.startswith(optimizer_name): 65 | params[kk] = pp[kk] 66 | return params 67 | 68 | def tanh(x): 69 | return tensor.tanh(x) 70 | 71 | 72 | def linear(x): 73 | return x 74 | 75 | 76 | def concatenate(tensor_list, axis=0): 77 | """ 78 | Alternative implementation of
/nematus/theano_util.py:
--------------------------------------------------------------------------------
1 | '''
2 | Theano utility functions
3 | '''
4 | import warnings
5 | import json
6 | import cPickle as pkl
7 | import numpy
8 | from collections import OrderedDict
9 | 
10 | import theano
11 | import theano.tensor as tensor
12 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
13 | 
14 | # push parameters to Theano shared variables
15 | def zip_to_theano(params, tparams):
16 |     for kk, vv in params.iteritems():
17 |         tparams[kk].set_value(vv)
18 | 
19 | 
20 | # pull parameters from Theano shared variables
21 | def unzip_from_theano(zipped, excluding_prefix=None):
22 |     new_params = OrderedDict()
23 |     for kk, vv in zipped.iteritems():
24 |         if excluding_prefix and (kk.startswith(excluding_prefix)):
25 |             continue
26 |         new_params[kk] = vv.get_value()
27 |     return new_params
28 | 
29 | 
30 | # get the list of parameters: Note that tparams must be OrderedDict
31 | def itemlist(tparams):
32 |     return [vv for kk, vv in tparams.iteritems()]
33 | 
34 | # make prefix-appended name
35 | def pp(pp, name):
36 |     return '%s_%s' % (pp, name)
37 | 
38 | # initialize Theano shared variables according to the initial parameters
39 | def init_theano_params(params):
40 |     tparams = OrderedDict()
41 |     for kk, pp in params.iteritems():
42 |         tparams[kk] = theano.shared(params[kk], name=kk)
43 |     return tparams
44 | 
45 | 
46 | # load parameters
47 | def load_params(path, params, with_prefix=''):
48 |     pp = numpy.load(path)
49 |     new_params = OrderedDict()
50 |     for kk, vv in params.iteritems():
51 |         if kk not in pp:
52 |             warnings.warn('%s is not in the archive' % kk)
53 |             continue
54 |         new_params[with_prefix+kk] = pp[kk]
55 | 
56 |     params.update(new_params)
57 |     return params
58 | 
59 | # load parameters of the optimizer
60 | def load_optimizer_params(path, optimizer_name):
61 |     params = {}
62 |     pp = numpy.load(path)
63 |     for kk in pp:
64 |         if kk.startswith(optimizer_name):
65 |             params[kk] = pp[kk]
66 |     return params
67 | 
68 | def tanh(x):
69 |     return tensor.tanh(x)
70 | 
71 | 
72 | def linear(x):
73 |     return x
74 | 
75 | 
76 | def concatenate(tensor_list, axis=0):
77 |     """
78 |     Alternative implementation of `theano.tensor.concatenate`.
79 |     This function does exactly the same thing, but contrary to Theano's own
80 |     implementation, the gradient is implemented on the GPU.
81 |     Backpropagating through `theano.tensor.concatenate` yields slowdowns
82 |     because the inverse operation (splitting) needs to be done on the CPU.
83 |     This implementation does not have that problem.
84 |     :usage:
85 |         >>> x, y = theano.tensor.matrices('x', 'y')
86 |         >>> c = concatenate([x, y], axis=1)
87 |     :parameters:
88 |         - tensor_list : list
89 |             list of Theano tensor expressions that should be concatenated.
90 |         - axis : int
91 |             the tensors will be joined along this axis.
92 |     :returns:
93 |         - out : tensor
94 |             the concatenated tensor expression.
95 |     """
96 |     concat_size = sum(tt.shape[axis] for tt in tensor_list)
97 | 
98 |     output_shape = ()
99 |     for k in range(axis):
100 |         output_shape += (tensor_list[0].shape[k],)
101 |     output_shape += (concat_size,)
102 |     for k in range(axis + 1, tensor_list[0].ndim):
103 |         output_shape += (tensor_list[0].shape[k],)
104 | 
105 |     out = tensor.zeros(output_shape)
106 |     offset = 0
107 |     for tt in tensor_list:
108 |         indices = ()
109 |         for k in range(axis):
110 |             indices += (slice(None),)
111 |         indices += (slice(offset, offset + tt.shape[axis]),)
112 |         for k in range(axis + 1, tensor_list[0].ndim):
113 |             indices += (slice(None),)
114 | 
115 |         out = tensor.set_subtensor(out[indices], tt)
116 |         offset += tt.shape[axis]
117 | 
118 |     return out
119 | 
120 | # return name of word embedding for factor i
121 | # special handling of factor 0 for backward compatibility
122 | def embedding_name(i):
123 |     if i == 0:
124 |         return 'Wemb'
125 |     else:
126 |         return 'Wemb'+str(i)
127 | 
128 | # Zero out all parameters
129 | def zero_all(params):
130 |     for kk, vv in params.iteritems():
131 |         vv[:] = numpy.zeros_like(vv)
132 | 
--------------------------------------------------------------------------------
/nematus/metrics/meteor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | import subprocess, threading
5 | from scorer import Scorer
6 | from reference import Reference
7 | 
8 | class MeteorError(Exception):
9 |     def __init__(self, value):
10 |         self.value = value
11 |     def __str__(self):
12 |         return repr(self.value)
13 | 
14 | class MeteorScorer(Scorer):
15 |     """
16 |     Python wrapper for the METEOR metric. Starts a METEOR process and keeps it alive, so that the model
17 |     can be kept in memory. Arguments are the METEOR language abbreviation and the path to the METEOR
18 |     installation. They need to be specified as follows: "meteor_language=lg,meteor_path=path" (any order).
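A usage sketch for this wrapper (requires Java and a local METEOR installation; the language code and path below are placeholders). Scoring here goes through the reference object directly, since the public interface of the Scorer base class is not part of this listing:

# Usage sketch for MeteorScorer; /path/to/meteor is a placeholder.
from meteor import MeteorScorer

scorer = MeteorScorer("meteor_language=en,meteor_path=/path/to/meteor")
scorer.set_reference("the cat sat on the mat".split())
print(scorer._reference.score("the cat sat on a mat".split()))
scorer.terminate_process()  # shut down the long-running Java process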
19 | """ 20 | def __init__(self, argument_string): 21 | Scorer.__init__(self, argument_string) 22 | 23 | #Lock for the METEOR process, which can only handle one request at a time: 24 | self.lock = threading.Lock() 25 | 26 | #Get necessary arguments for starting METEOR from argument string parsed in Scorer.__init__() 27 | self._meteor_language = self._arguments["meteor_language"] 28 | self._meteor_path = self._arguments["meteor_path"] + "/" 29 | 30 | #Start a METEOR process: 31 | command = "java -Xmx2G -jar "+self._meteor_path+"meteor-*.jar - - -l "+self._meteor_language+" -stdio" 32 | self.meteor_process = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) 33 | 34 | def set_reference(self, reference_tokens): 35 | """ 36 | Construct a MeteorReference from a sequence of tokens and make it the reference against which the scorer evaluates hypotheses. 37 | This can be done any time. 38 | """ 39 | self.lock.acquire() 40 | self._reference = MeteorReference(reference_tokens, self) 41 | self.lock.release() 42 | 43 | def terminate_process(self): 44 | """ 45 | Waits for the current request to be processed and terminates the METEOR process. 46 | """ 47 | self.lock.acquire() 48 | self.meteor_process.terminate() 49 | self.lock.release() 50 | 51 | def kill_process(self): 52 | """ 53 | Kills the METEOR process right away. 54 | """ 55 | self.meteor_process.kill() 56 | 57 | class MeteorReference(Reference): 58 | """ 59 | METEOR reference object, against which hypotheses can be scored. 60 | """ 61 | def __init__(self, reference_tokens, meteor_scorer): 62 | Reference.__init__(self, reference_tokens) 63 | 64 | #Construct reference string from tokens 65 | self._reference_string = " ".join(reference_tokens) 66 | self._meteor_scorer = meteor_scorer 67 | 68 | def score(self, hypothesis_tokens): 69 | 70 | #Construct hypothesis string from hypothesis tokens: 71 | hypothesis_string = " ".join(hypothesis_tokens) 72 | 73 | #Acquire lock to make sure METEOR process is not in use: 74 | self._meteor_scorer.lock.acquire() 75 | 76 | #Score hypothesis string against reference string 77 | try: 78 | self._meteor_scorer.meteor_process.stdin.write("SCORE ||| "+self._reference_string+" ||| "+hypothesis_string+"\n") 79 | except: 80 | raise MeteorError("Meteor returned the following error: "+ self._meteor_scorer.meteor_process.stderr.readline().strip()) 81 | 82 | #Read feature values from process output 83 | std_out = self._meteor_scorer.meteor_process.stdout.readline() 84 | 85 | #Pass feature values to METEOR process for computation of the final score 86 | try: 87 | self._meteor_scorer.meteor_process.stdin.write("EVAL ||| "+std_out) 88 | except: 89 | raise MeteorError("Meteor returned the following error: "+ self._meteor_scorer.meteor_process.stderr.readline().strip()) 90 | std_out = self._meteor_scorer.meteor_process.stdout.readline() 91 | 92 | #Release the process lock 93 | self._meteor_scorer.lock.release() 94 | 95 | #Check if Meteor returned a score: 96 | try: 97 | n = float(std_out) 98 | except: 99 | raise MeteorError("Meteor returned the following error: "+ self._meteor_scorer.meteor_process.stderr.readline().strip()) 100 | 101 | #Return final score 102 | return n -------------------------------------------------------------------------------- /nematus/hypgraph.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from collections import defaultdict 5 | 6 | class 
HypGraph(object): 7 | 8 | def __init__(self): 9 | self.nodes = defaultdict(str) # {id = label} 10 | self.edges = [] # (parent_node_id, child_node_id) 11 | self.costs = defaultdict(float) # {node_id = cost} 12 | self.word_probs = defaultdict(float) # {node_id = word_prob} 13 | 14 | def get_id(self, word, history): 15 | if history == []: 16 | return str(word) 17 | history = '-'.join([str(h) for h in reversed(history)]) 18 | return '%s-%s' % (word, history) 19 | 20 | def get_ids(self, words): 21 | ids = [] 22 | for i, w in enumerate(words): 23 | history = words[:i] 24 | ids.append(self.get_id(w, history)) 25 | return ids 26 | 27 | def add(self, word, history, word_prob=None, cost=None): 28 | history_labels = [0] + history 29 | history_ids = self.get_ids(history_labels) 30 | word_label = word 31 | word_id = self.get_id(word_label, history_labels) 32 | # store 33 | self.nodes[word_id] = word_label 34 | self.edges.append((history_ids[-1], word_id)) 35 | if word_prob != None: 36 | self.word_probs[word_id] = word_prob 37 | if cost != None: 38 | self.costs[word_id] = cost 39 | 40 | class HypGraphRenderer(object): 41 | 42 | def __init__(self, hyp_graph): 43 | self.nodes = hyp_graph.nodes 44 | self.edges = hyp_graph.edges 45 | self.costs = hyp_graph.costs 46 | self.word_probs = hyp_graph.word_probs 47 | # constants 48 | self.BOS_SYMBOLS = ['0'] 49 | self.EOS_SYMBOLS = [''] 50 | 51 | def _escape_label(self, label): 52 | replacements = { 53 | '<': '\<', 54 | '>': '\>', 55 | } 56 | for original, replacement in replacements.iteritems(): 57 | label = label.replace(original, replacement) 58 | return label 59 | 60 | def _render(self, costs=False, word_probs=False, highlight_best=False): 61 | from pygraphviz import AGraph 62 | graph = AGraph(directed=True) 63 | for node_id, node_label in self.nodes.iteritems(): 64 | attributes = self._node_attr(node_id, costs=costs, word_probs=word_probs) 65 | graph.add_node(node_id, **attributes) 66 | for (parent_node_id, child_node_id) in self.edges: 67 | graph.add_edge(parent_node_id, child_node_id) 68 | self.graph = graph 69 | if highlight_best: 70 | self._highlight_best() 71 | 72 | def _node_attr(self, node_id, costs=False, word_probs=False): 73 | word = self.nodes[node_id].decode('utf-8') 74 | cost = self.costs[node_id] 75 | prob = self.word_probs[node_id] 76 | attr = {} 77 | if costs and word_probs: 78 | attr['shape'] = "record" 79 | attr['label'] = "{{%s|%.3f}|%.3f}" % (word, prob, cost) 80 | elif costs: 81 | attr['shape'] = "record" 82 | attr['label'] = "{{%s}|%.3f}" % (word, cost) 83 | elif word_probs: 84 | attr['shape'] = "record" 85 | attr['label'] = "{{%s|%.3f}}" % (word, prob) 86 | else: 87 | attr['label'] = word 88 | attr['label'] = self._escape_label(attr['label']) 89 | return attr 90 | 91 | def _highlight_best(self): 92 | best_hyp_bg_color = '#CDE9EC' 93 | best_hyp_cost = None 94 | best_hyp_leaf_node_id = None 95 | for node_id, label in self.nodes.iteritems(): 96 | if label in self.EOS_SYMBOLS: 97 | if best_hyp_cost == None or self.costs[node_id] < best_hyp_cost: 98 | best_hyp_leaf_node_id = node_id 99 | best_hyp_cost = self.costs[node_id] 100 | if best_hyp_leaf_node_id: 101 | best_hyp_leaf_node = self.graph.get_node(best_hyp_leaf_node_id) 102 | current_node = best_hyp_leaf_node 103 | while current_node != []: 104 | current_node.attr['style'] = 'filled' 105 | current_node.attr['fillcolor'] = best_hyp_bg_color 106 | try: 107 | current_node = self.graph.predecessors(current_node)[0] 108 | except IndexError: 109 | break 110 | 111 | def wordify(self, word_dict): 
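Taken together, the two classes can be exercised with a few invented beam-search expansions (Python 2, like the module; pygraphviz is required for rendering, and all words, probabilities, and costs below are made up):

# Sketch: build a tiny hypothesis graph and render it as a PNG.
from hypgraph import HypGraph, HypGraphRenderer

g = HypGraph()
g.add('the', [], word_prob=0.6, cost=0.5)
g.add('cat', ['the'], word_prob=0.3, cost=1.7)
g.add('dog', ['the'], word_prob=0.2, cost=2.1)
g.add('sat', ['the', 'cat'], word_prob=0.9, cost=1.8)

renderer = HypGraphRenderer(g)
renderer.save_png('beam.png', detailed=True, highlight_best=True)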
112 | """ 113 | Replace node labels (usually integers) with words, subwords, or 114 | characters. 115 | """ 116 | for node_id, label in self.nodes.iteritems(): 117 | self.nodes[node_id] = word_dict[label] 118 | 119 | def save_png(self, filepath, detailed=False, highlight_best=False): 120 | """ 121 | Renders the graph as PNG image. 122 | 123 | @param filepath the taget file 124 | @param detailed whether to include word probabilities and 125 | hypothesis costs. 126 | @param highlight_best whether to highlight the best hypothesis. 127 | """ 128 | costs = True if detailed else False 129 | word_probs = True if detailed else False 130 | self._render(costs=costs, word_probs=word_probs, highlight_best=highlight_best) 131 | self.graph.draw(filepath, prog="dot") 132 | -------------------------------------------------------------------------------- /data/nonbreaking_prefixes/nonbreaking_prefix.sk: -------------------------------------------------------------------------------- 1 | Bc 2 | Mgr 3 | RNDr 4 | PharmDr 5 | PhDr 6 | JUDr 7 | PaedDr 8 | ThDr 9 | Ing 10 | MUDr 11 | MDDr 12 | MVDr 13 | Dr 14 | ThLic 15 | PhD 16 | ArtD 17 | ThDr 18 | Dr 19 | DrSc 20 | CSs 21 | prof 22 | obr 23 | Obr 24 | Č 25 | č 26 | absol 27 | adj 28 | admin 29 | adr 30 | Adr 31 | adv 32 | advok 33 | afr 34 | ak 35 | akad 36 | akc 37 | akuz 38 | et 39 | al 40 | alch 41 | amer 42 | anat 43 | angl 44 | Angl 45 | anglosas 46 | anorg 47 | ap 48 | apod 49 | arch 50 | archeol 51 | archit 52 | arg 53 | art 54 | astr 55 | astrol 56 | astron 57 | atp 58 | atď 59 | austr 60 | Austr 61 | aut 62 | belg 63 | Belg 64 | bibl 65 | Bibl 66 | biol 67 | bot 68 | bud 69 | bás 70 | býv 71 | cest 72 | chem 73 | cirk 74 | csl 75 | čs 76 | Čs 77 | dat 78 | dep 79 | det 80 | dial 81 | diaľ 82 | dipl 83 | distrib 84 | dokl 85 | dosl 86 | dopr 87 | dram 88 | duš 89 | dv 90 | dvojčl 91 | dór 92 | ekol 93 | ekon 94 | el 95 | elektr 96 | elektrotech 97 | energet 98 | epic 99 | est 100 | etc 101 | etonym 102 | eufem 103 | európ 104 | Európ 105 | ev 106 | evid 107 | expr 108 | fa 109 | fam 110 | farm 111 | fem 112 | feud 113 | fil 114 | filat 115 | filoz 116 | fi 117 | fon 118 | form 119 | fot 120 | fr 121 | Fr 122 | franc 123 | Franc 124 | fraz 125 | fut 126 | fyz 127 | fyziol 128 | garb 129 | gen 130 | genet 131 | genpor 132 | geod 133 | geogr 134 | geol 135 | geom 136 | germ 137 | gr 138 | Gr 139 | gréc 140 | Gréc 141 | gréckokat 142 | hebr 143 | herald 144 | hist 145 | hlav 146 | hosp 147 | hromad 148 | hud 149 | hypok 150 | ident 151 | i.e 152 | ident 153 | imp 154 | impf 155 | indoeur 156 | inf 157 | inform 158 | instr 159 | int 160 | interj 161 | inšt 162 | inštr 163 | iron 164 | jap 165 | Jap 166 | jaz 167 | jedn 168 | juhoamer 169 | juhových 170 | juhozáp 171 | juž 172 | kanad 173 | Kanad 174 | kanc 175 | kapit 176 | kpt 177 | kart 178 | katastr 179 | knih 180 | kniž 181 | komp 182 | konj 183 | konkr 184 | kozmet 185 | krajč 186 | kresť 187 | kt 188 | kuch 189 | lat 190 | latinskoamer 191 | lek 192 | lex 193 | lingv 194 | lit 195 | litur 196 | log 197 | lok 198 | max 199 | Max 200 | maď 201 | Maď 202 | medzinár 203 | mest 204 | metr 205 | mil 206 | Mil 207 | min 208 | Min 209 | miner 210 | ml 211 | mld 212 | mn 213 | mod 214 | mytol 215 | napr 216 | nar 217 | Nar 218 | nasl 219 | nedok 220 | neg 221 | negat 222 | neklas 223 | nem 224 | Nem 225 | neodb 226 | neos 227 | neskl 228 | nesklon 229 | nespis 230 | nespráv 231 | neved 232 | než 233 | niekt 234 | niž 235 | nom 236 | náb 237 | nákl 238 | námor 239 | nár 240 | obch 241 | obj 242 | obv 243 | 
obyč 244 | obč 245 | občian 246 | odb 247 | odd 248 | ods 249 | ojed 250 | okr 251 | Okr 252 | opt 253 | opyt 254 | org 255 | os 256 | osob 257 | ot 258 | ovoc 259 | par 260 | part 261 | pejor 262 | pers 263 | pf 264 | Pf 265 | P.f 266 | p.f 267 | pl 268 | Plk 269 | pod 270 | podst 271 | pokl 272 | polit 273 | politol 274 | polygr 275 | pomn 276 | popl 277 | por 278 | porad 279 | porov 280 | posch 281 | potrav 282 | použ 283 | poz 284 | pozit 285 | poľ 286 | poľno 287 | poľnohosp 288 | poľov 289 | pošt 290 | pož 291 | prac 292 | predl 293 | pren 294 | prep 295 | preuk 296 | priezv 297 | Priezv 298 | privl 299 | prof 300 | práv 301 | príd 302 | príj 303 | prík 304 | príp 305 | prír 306 | prísl 307 | príslov 308 | príč 309 | psych 310 | publ 311 | pís 312 | písm 313 | pôv 314 | refl 315 | reg 316 | rep 317 | resp 318 | rozk 319 | rozlič 320 | rozpráv 321 | roč 322 | Roč 323 | ryb 324 | rádiotech 325 | rím 326 | samohl 327 | semest 328 | sev 329 | severoamer 330 | severových 331 | severozáp 332 | sg 333 | skr 334 | skup 335 | sl 336 | Sloven 337 | soc 338 | soch 339 | sociol 340 | sp 341 | spol 342 | Spol 343 | spoloč 344 | spoluhl 345 | správ 346 | spôs 347 | st 348 | star 349 | starogréc 350 | starorím 351 | s.r.o 352 | stol 353 | stor 354 | str 355 | stredoamer 356 | stredoškol 357 | subj 358 | subst 359 | superl 360 | sv 361 | sz 362 | súkr 363 | súp 364 | súvzť 365 | tal 366 | Tal 367 | tech 368 | tel 369 | Tel 370 | telef 371 | teles 372 | telev 373 | teol 374 | trans 375 | turist 376 | tuzem 377 | typogr 378 | tzn 379 | tzv 380 | ukaz 381 | ul 382 | Ul 383 | umel 384 | univ 385 | ust 386 | ved 387 | vedľ 388 | verb 389 | veter 390 | vin 391 | viď 392 | vl 393 | vod 394 | vodohosp 395 | pnl 396 | vulg 397 | vyj 398 | vys 399 | vysokoškol 400 | vzťaž 401 | vôb 402 | vých 403 | výd 404 | výrob 405 | výsk 406 | výsl 407 | výtv 408 | výtvar 409 | význ 410 | včel 411 | vš 412 | všeob 413 | zahr 414 | zar 415 | zariad 416 | zast 417 | zastar 418 | zastaráv 419 | zb 420 | zdravot 421 | združ 422 | zjemn 423 | zlat 424 | zn 425 | Zn 426 | zool 427 | zr 428 | zried 429 | zv 430 | záhr 431 | zák 432 | zákl 433 | zám 434 | záp 435 | západoeur 436 | zázn 437 | územ 438 | účt 439 | čast 440 | čes 441 | Čes 442 | čl 443 | čísl 444 | živ 445 | pr 446 | fak 447 | Kr 448 | p.n.l 449 | A 450 | B 451 | C 452 | D 453 | E 454 | F 455 | G 456 | H 457 | I 458 | J 459 | K 460 | L 461 | M 462 | N 463 | O 464 | P 465 | Q 466 | R 467 | S 468 | T 469 | U 470 | V 471 | W 472 | X 473 | Y 474 | Z 475 | -------------------------------------------------------------------------------- /utils/plot_heatmap.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import matplotlib.pyplot as plt 3 | import sys 4 | import json 5 | import argparse 6 | 7 | # input: 8 | # alignment matrix - numpy array 9 | # shape (target tokens + eos, number of hidden source states = source tokens +eos) 10 | # one line correpsonds to one decoding step producing one target token 11 | # each line has the attention model weights corresponding to that decoding step 12 | # each float on a line is the attention model weight for a corresponding source state. 
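The text format that read_alignment_matrix below expects is, as implied by the parsing code: a header of five '|||'-separated fields (sentence id, target sentence, score, source sentence, and "src_count trg_count" including the end-of-sentence position), followed by trg_count rows of attention weights and a blank separator line. The score field is ignored by the parser. A sketch with an invented block (Python 2, like the rest of this file; matplotlib must be installed for the import to succeed):

# Parse one invented alignment block; StringIO stands in for the input file.
from StringIO import StringIO
from plot_heatmap import read_alignment_matrix

block = ("0 ||| a house ||| 0.0 ||| ein Haus ||| 3 3\n"
         "0.9 0.05 0.05\n"
         "0.1 0.8 0.1\n"
         "0.05 0.05 0.9\n"
         "\n")
sid, mma, target_labels, source_labels = read_alignment_matrix(StringIO(block))
print(sid, mma.shape)  # 0, (3, 3)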
13 | # plot: a heat map of the alignment matrix 14 | # x axis are the source tokens (alignment is to source hidden state that roughly corresponds to a source token) 15 | # y axis are the target tokens 16 | 17 | # http://stackoverflow.com/questions/14391959/heatmap-in-matplotlib-with-pcolor 18 | def plot_head_map(mma, target_labels, source_labels): 19 | fig, ax = plt.subplots() 20 | heatmap = ax.pcolor(mma, cmap=plt.cm.Blues) 21 | 22 | # put the major ticks at the middle of each cell 23 | ax.set_xticks(numpy.arange(mma.shape[1])+0.5, minor=False) 24 | ax.set_yticks(numpy.arange(mma.shape[0])+0.5, minor=False) 25 | 26 | # without this I get some extra columns rows 27 | # http://stackoverflow.com/questions/31601351/why-does-this-matplotlib-heatmap-have-an-extra-blank-column 28 | ax.set_xlim(0, int(mma.shape[1])) 29 | ax.set_ylim(0, int(mma.shape[0])) 30 | 31 | # want a more natural, table-like display 32 | ax.invert_yaxis() 33 | ax.xaxis.tick_top() 34 | 35 | # source words -> column labels 36 | ax.set_xticklabels(source_labels, minor=False) 37 | # target words -> row labels 38 | ax.set_yticklabels(target_labels, minor=False) 39 | 40 | plt.xticks(rotation=45) 41 | 42 | #plt.tight_layout() 43 | plt.show() 44 | 45 | # column labels -> target words 46 | # row labels -> source words 47 | 48 | def read_alignment_matrix(f): 49 | header = f.readline().strip().split('|||') 50 | if header[0] == '': 51 | return None, None, None, None 52 | sid = int(header[0].strip()) 53 | # number of tokens in source and translation +1 for eos 54 | src_count, trg_count = map(int,header[-1].split()) 55 | # source words 56 | source_labels = header[3].decode('UTF-8').split() 57 | source_labels.append('') 58 | # target words 59 | target_labels = header[1].decode('UTF-8').split() 60 | target_labels.append('') 61 | 62 | mm = [] 63 | for r in range(trg_count): 64 | alignment = map(float,f.readline().strip().split()) 65 | mm.append(alignment) 66 | mma = numpy.array(mm) 67 | return sid,mma, target_labels, source_labels 68 | 69 | 70 | def read_plot_alignment_matrices(f, n): 71 | while(f): 72 | sid, mma, target_labels, source_labels = read_alignment_matrix(f) 73 | if mma is None: 74 | return 75 | if sid >n: 76 | return 77 | plot_head_map(mma, target_labels, source_labels) 78 | # empty line separating the matrices 79 | f.readline() 80 | 81 | 82 | """ 83 | Adding functions to read the json format. 84 | """ 85 | 86 | def read_plot_alignment_json(file, n): 87 | while (file): 88 | sid, mma, target_labels, source_labels = read_alignment_json(file) 89 | if mma is None: 90 | return 91 | if sid > n: 92 | return 93 | plot_head_map(mma, target_labels, source_labels) 94 | 95 | def read_alignment_json(file): 96 | data = file.readline() ##one line containing the json object. 97 | if len(data.strip()) == 0: 98 | return None, None, None, None 99 | jdata = json.loads(data) 100 | ## messy json encodings... 
TODO: make this better
101 |     jdata = json.loads(json.dumps(jdata).decode('unicode-escape').encode('utf8'))
102 |     #print jdata
103 |     sid = int(jdata["id"])
104 |     mma = numpy.array(jdata["matrix"])
105 |     ##target words
106 |     target_labels = jdata["target_sent"].split()
107 |     target_labels.append('')
108 |     ##source words
109 |     source_labels = jdata["source_sent"].split()
110 |     source_labels.append('')
111 |     return sid,mma, target_labels, source_labels
112 | 
113 | if __name__ == "__main__":
114 | 
115 |     parser = argparse.ArgumentParser()
116 |     # '/Users/mnadejde/Documents/workspace/MTMA2016/models/wmt16_systems/en-de/test.alignment'
117 |     parser.add_argument('--input', '-i', type=argparse.FileType('r'),
118 |                         default='/Users/mnadejde/Documents/workspace/MTMA2016/models/wmt16_systems/ro-en/newstest2016-roen-src.ro.alignment', metavar='PATH',
119 |                         help="Input file (default: standard input)")
120 | 
121 |     parser.add_argument('--json', '-j', required = False,action="store_true",
122 |                         help="If this option is used, then read alignment matrix from a Json formatted file.")
123 |     args = parser.parse_args()
124 | 
125 |     if args.json:
126 |         read_plot_alignment_json(args.input, 10) ##n is the maximum number of sentences to process.
127 |     else:
128 |         read_plot_alignment_matrices(args.input,10)
129 | 
--------------------------------------------------------------------------------
/data/multi-bleu.perl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | #
3 | # This file is part of moses.  Its use is licensed under the GNU Lesser General
4 | # Public License version 2.1 or, at your option, any later version.
5 | 
6 | # $Id$
7 | use warnings;
8 | use strict;
9 | 
10 | my $lowercase = 0;
11 | if ($ARGV[0] eq "-lc") {
12 |   $lowercase = 1;
13 |   shift;
14 | }
15 | 
16 | my $stem = $ARGV[0];
17 | if (!defined $stem) {
18 |   print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n";
19 |   print STDERR "Reads the references from reference or reference0, reference1, ...\n";
20 |   exit(1);
21 | }
22 | 
23 | $stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0";
24 | 
25 | my @REF;
26 | my $ref=0;
27 | while(-e "$stem$ref") {
28 |   &add_to_ref("$stem$ref",\@REF);
29 |   $ref++;
30 | }
31 | &add_to_ref($stem,\@REF) if -e $stem;
32 | die("ERROR: could not find reference file $stem") unless scalar @REF;
33 | 
34 | sub add_to_ref {
35 |   my ($file,$REF) = @_;
36 |   my $s=0;
37 |   open(REF,$file) or die "Can't read $file";
38 |   while(<REF>) {
39 |     chop;
40 |     push @{$$REF[$s++]}, $_;
41 |   }
42 |   close(REF);
43 | }
44 | 
45 | my(@CORRECT,@TOTAL,$length_translation,$length_reference);
46 | my $s=0;
47 | while(<STDIN>) {
48 |   chop;
49 |   $_ = lc if $lowercase;
50 |   my @WORD = split;
51 |   my %REF_NGRAM = ();
52 |   my $length_translation_this_sentence = scalar(@WORD);
53 |   my ($closest_diff,$closest_length) = (9999,9999);
54 |   foreach my $reference (@{$REF[$s]}) {
55 |     # print "$s $_ <=> $reference\n";
56 |     $reference = lc($reference) if $lowercase;
57 |     my @WORD = split(' ',$reference);
58 |     my $length = scalar(@WORD);
59 |     my $diff = abs($length_translation_this_sentence-$length);
60 |     if ($diff < $closest_diff) {
61 |       $closest_diff = $diff;
62 |       $closest_length = $length;
63 |       # print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)."
= abs($length_translation_this_sentence-$length), setting len: $closest_length\n"; 64 | } elsif ($diff == $closest_diff) { 65 | $closest_length = $length if $length < $closest_length; 66 | # from two references with the same closeness to me 67 | # take the *shorter* into account, not the "first" one. 68 | } 69 | for(my $n=1;$n<=4;$n++) { 70 | my %REF_NGRAM_N = (); 71 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 72 | my $ngram = "$n"; 73 | for(my $w=0;$w<$n;$w++) { 74 | $ngram .= " ".$WORD[$start+$w]; 75 | } 76 | $REF_NGRAM_N{$ngram}++; 77 | } 78 | foreach my $ngram (keys %REF_NGRAM_N) { 79 | if (!defined($REF_NGRAM{$ngram}) || 80 | $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) { 81 | $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram}; 82 | # print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}
\n"; 83 | } 84 | } 85 | } 86 | } 87 | $length_translation += $length_translation_this_sentence; 88 | $length_reference += $closest_length; 89 | for(my $n=1;$n<=4;$n++) { 90 | my %T_NGRAM = (); 91 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 92 | my $ngram = "$n"; 93 | for(my $w=0;$w<$n;$w++) { 94 | $ngram .= " ".$WORD[$start+$w]; 95 | } 96 | $T_NGRAM{$ngram}++; 97 | } 98 | foreach my $ngram (keys %T_NGRAM) { 99 | $ngram =~ /^(\d+) /; 100 | my $n = $1; 101 | # my $corr = 0; 102 | # print "$i e $ngram $T_NGRAM{$ngram}
\n"; 103 | $TOTAL[$n] += $T_NGRAM{$ngram}; 104 | if (defined($REF_NGRAM{$ngram})) { 105 | if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) { 106 | $CORRECT[$n] += $T_NGRAM{$ngram}; 107 | # $corr = $T_NGRAM{$ngram}; 108 | # print "$i e correct1 $T_NGRAM{$ngram}
\n"; 109 | } 110 | else { 111 | $CORRECT[$n] += $REF_NGRAM{$ngram}; 112 | # $corr = $REF_NGRAM{$ngram}; 113 | # print "$i e correct2 $REF_NGRAM{$ngram}
\n"; 114 | } 115 | } 116 | # $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram}; 117 | # print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n" 118 | } 119 | } 120 | $s++; 121 | } 122 | my $brevity_penalty = 1; 123 | my $bleu = 0; 124 | 125 | my @bleu=(); 126 | 127 | for(my $n=1;$n<=4;$n++) { 128 | if (defined ($TOTAL[$n])){ 129 | $bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0; 130 | # print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n"; 131 | }else{ 132 | $bleu[$n]=0; 133 | } 134 | } 135 | 136 | if ($length_reference==0){ 137 | printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n"; 138 | exit(1); 139 | } 140 | 141 | if ($length_translation<$length_reference) { 142 | $brevity_penalty = exp(1-$length_reference/$length_translation); 143 | } 144 | $bleu = $brevity_penalty * exp((my_log( $bleu[1] ) + 145 | my_log( $bleu[2] ) + 146 | my_log( $bleu[3] ) + 147 | my_log( $bleu[4] ) ) / 4) ; 148 | printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n", 149 | 100*$bleu, 150 | 100*$bleu[1], 151 | 100*$bleu[2], 152 | 100*$bleu[3], 153 | 100*$bleu[4], 154 | $brevity_penalty, 155 | $length_translation / $length_reference, 156 | $length_translation, 157 | $length_reference; 158 | 159 | sub my_log { 160 | return -9999999999 unless $_[0]; 161 | return log($_[0]); 162 | } 163 | -------------------------------------------------------------------------------- /nematus/data_iterator.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | import gzip 4 | 5 | import shuffle 6 | from util import load_dict 7 | 8 | def fopen(filename, mode='r'): 9 | if filename.endswith('.gz'): 10 | return gzip.open(filename, mode) 11 | return open(filename, mode) 12 | 13 | class TextIterator: 14 | """Simple Bitext iterator.""" 15 | def __init__(self, source, target, 16 | source_dicts, target_dict, 17 | batch_size=128, 18 | maxlen=100, 19 | n_words_source=-1, 20 | n_words_target=-1, 21 | skip_empty=False, 22 | shuffle_each_epoch=False, 23 | sort_by_length=True, 24 | maxibatch_size=20): 25 | if shuffle_each_epoch: 26 | self.source_orig = source 27 | self.target_orig = target 28 | self.source, self.target = shuffle.main([self.source_orig, self.target_orig], temporary=True) 29 | else: 30 | self.source = fopen(source, 'r') 31 | self.target = fopen(target, 'r') 32 | self.source_dicts = [] 33 | for source_dict in source_dicts: 34 | self.source_dicts.append(load_dict(source_dict)) 35 | self.target_dict = load_dict(target_dict) 36 | 37 | self.batch_size = batch_size 38 | self.maxlen = maxlen 39 | self.skip_empty = skip_empty 40 | 41 | self.n_words_source = n_words_source 42 | self.n_words_target = n_words_target 43 | 44 | if self.n_words_source > 0: 45 | for d in self.source_dicts: 46 | for key, idx in d.items(): 47 | if idx >= self.n_words_source: 48 | del d[key] 49 | 50 | if self.n_words_target > 0: 51 | for key, idx in self.target_dict.items(): 52 | if idx >= self.n_words_target: 53 | del self.target_dict[key] 54 | 55 | self.shuffle = shuffle_each_epoch 56 | self.sort_by_length = sort_by_length 57 | 58 | self.source_buffer = [] 59 | self.target_buffer = [] 60 | self.k = batch_size * maxibatch_size 61 | 62 | 63 | self.end_of_data = False 64 | 65 | def __iter__(self): 66 | return self 67 | 68 | def reset(self): 69 | if self.shuffle: 70 | self.source, self.target = shuffle.main([self.source_orig, self.target_orig], temporary=True) 71 | else: 72 | self.source.seek(0) 73 | self.target.seek(0) 74 | 75 | def 
next(self):
76 |         if self.end_of_data:
77 |             self.end_of_data = False
78 |             self.reset()
79 |             raise StopIteration
80 | 
81 |         source = []
82 |         target = []
83 | 
84 |         # fill buffer, if it's empty
85 |         assert len(self.source_buffer) == len(self.target_buffer), 'Buffer size mismatch!'
86 | 
87 |         if len(self.source_buffer) == 0:
88 |             for k_ in xrange(self.k):
89 |                 ss = self.source.readline()
90 |                 if ss == "":
91 |                     break
92 |                 tt = self.target.readline()
93 |                 if tt == "":
94 |                     break
95 | 
96 |                 self.source_buffer.append(ss.strip().split())
97 |                 self.target_buffer.append(tt.strip().split())
98 | 
99 |             # sort by target buffer
100 |             if self.sort_by_length:
101 |                 tlen = numpy.array([len(t) for t in self.target_buffer])
102 |                 tidx = tlen.argsort()
103 | 
104 |                 _sbuf = [self.source_buffer[i] for i in tidx]
105 |                 _tbuf = [self.target_buffer[i] for i in tidx]
106 | 
107 |                 self.source_buffer = _sbuf
108 |                 self.target_buffer = _tbuf
109 | 
110 |             else:
111 |                 self.source_buffer.reverse()
112 |                 self.target_buffer.reverse()
113 | 
114 |             if len(self.source_buffer) == 0 or len(self.target_buffer) == 0:
115 |                 self.end_of_data = False
116 |                 self.reset()
117 |                 raise StopIteration
118 | 
119 |         try:
120 | 
121 |             # actual work here
122 |             while True:
123 | 
124 |                 # read from source file and map to word index
125 |                 try:
126 |                     ss = self.source_buffer.pop()
127 |                 except IndexError:
128 |                     break
129 |                 tmp = []
130 |                 for w in ss:
131 |                     w = [self.source_dicts[i][f] if f in self.source_dicts[i] else 1 for (i,f) in enumerate(w.split('|'))]
132 |                     tmp.append(w)
133 |                 ss = tmp
134 | 
135 |                 # read from target file and map to word index
136 |                 tt = self.target_buffer.pop()
137 |                 tt = [self.target_dict[w] if w in self.target_dict else 1
138 |                       for w in tt]
139 |                 if self.n_words_target > 0:
140 |                     tt = [w if w < self.n_words_target else 1 for w in tt]
141 | 
142 |                 if len(ss) > self.maxlen and len(tt) > self.maxlen:
143 |                     continue
144 |                 if self.skip_empty and (not ss or not tt):
145 |                     continue
146 | 
147 |                 source.append(ss)
148 |                 target.append(tt)
149 | 
150 |                 if len(source) >= self.batch_size or \
151 |                     len(target) >= self.batch_size:
152 |                     break
153 |         except IOError:
154 |             self.end_of_data = True
155 | 
156 |         # all sentence pairs in maxibatch filtered out because of length
157 |         if len(source) == 0 or len(target) == 0:
158 |             source, target = self.next()
159 | 
160 |         return source, target
161 | 
--------------------------------------------------------------------------------
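The buffering scheme above is the part worth internalizing: the iterator reads batch_size * maxibatch_size sentence pairs at once, sorts that maxibatch by target length so minibatches are nearly length-uniform (less padding waste), then pops minibatches off the end. A self-contained sketch of the same idea on toy data (no files or dictionaries involved):

# Standalone sketch of the maxibatch sort-then-pop strategy used by TextIterator.
pairs = [("a b c", "x y"), ("a", "x"), ("a b", "x y z w"), ("a b c d", "x y z")]
batch_size, maxibatch_size = 2, 2
k = batch_size * maxibatch_size                            # maxibatch size

buf = sorted(pairs[:k], key=lambda p: len(p[1].split()))   # sort by target length
while buf:
    batch = buf[-batch_size:]                              # pop a minibatch off the end
    buf = buf[:-batch_size]
    print([t for (_, t) in batch])                         # targets in a batch have similar length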
4 | """ 5 | 6 | import sys 7 | import argparse 8 | import tempfile 9 | 10 | import numpy 11 | import json 12 | 13 | from data_iterator import TextIterator 14 | from util import load_dict, load_config 15 | from alignment_util import * 16 | from compat import fill_options 17 | 18 | from theano_util import (load_params, init_theano_params) 19 | from nmt import (pred_probs, build_model, prepare_data, init_params) 20 | 21 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 22 | import theano 23 | 24 | def rescore_model(source_file, target_file, saveto, models, options, b, normalize, verbose, alignweights): 25 | 26 | trng = RandomStreams(1234) 27 | 28 | fs_log_probs = [] 29 | 30 | for model, option in zip(models, options): 31 | 32 | # load model parameters and set theano shared variables 33 | param_list = numpy.load(model).files 34 | param_list = dict.fromkeys([key for key in param_list if not key.startswith('adam_')], 0) 35 | params = load_params(model, param_list) 36 | tparams = init_theano_params(params) 37 | 38 | trng, use_noise, \ 39 | x, x_mask, y, y_mask, \ 40 | opt_ret, \ 41 | cost = \ 42 | build_model(tparams, option) 43 | inps = [x, x_mask, y, y_mask] 44 | use_noise.set_value(0.) 45 | 46 | if alignweights: 47 | sys.stderr.write("\t*** Save weight mode ON, alignment matrix will be saved.\n") 48 | outputs = [cost, opt_ret['dec_alphas']] 49 | f_log_probs = theano.function(inps, outputs) 50 | else: 51 | f_log_probs = theano.function(inps, cost) 52 | 53 | fs_log_probs.append(f_log_probs) 54 | 55 | def _score(pairs, alignweights=False): 56 | # sample given an input sequence and obtain scores 57 | scores = [] 58 | alignments = [] 59 | for i, f_log_probs in enumerate(fs_log_probs): 60 | score, alignment = pred_probs(f_log_probs, prepare_data, options[i], pairs, normalize=normalize, alignweights = alignweights) 61 | scores.append(score) 62 | alignments.append(alignment) 63 | 64 | return scores, alignments 65 | 66 | pairs = TextIterator(source_file.name, target_file.name, 67 | options[0]['dictionaries'][:-1], options[0]['dictionaries'][1], 68 | n_words_source=options[0]['n_words_src'], n_words_target=options[0]['n_words'], 69 | batch_size=b, 70 | maxlen=float('inf'), 71 | sort_by_length=False) #TODO: sorting by length could be more efficient, but we'd want to resort after 72 | 73 | scores, alignments = _score(pairs, alignweights) 74 | 75 | source_file.seek(0) 76 | target_file.seek(0) 77 | source_lines = source_file.readlines() 78 | target_lines = target_file.readlines() 79 | 80 | for i, line in enumerate(target_lines): 81 | score_str = ' '.join(map(str,[s[i] for s in scores])) 82 | if verbose: 83 | saveto.write('{0} '.format(line.strip())) 84 | saveto.write('{0}\n'.format(score_str)) 85 | 86 | ### optional save weights mode. 87 | if alignweights: 88 | ### writing out the alignments. 89 | temp_name = saveto.name + ".json" 90 | with tempfile.NamedTemporaryFile(prefix=temp_name) as align_OUT: 91 | for line in all_alignments: 92 | align_OUT.write(line + "\n") 93 | ### combining the actual source and target words. 
94 |             combine_source_target_text_1to1(source_file, target_file, saveto.name, align_OUT)
95 | 
96 | def main(models, source_file, target_file, saveto, b=80,
97 |          normalize=False, verbose=False, alignweights=False):
98 | 
99 |     # load model options
100 |     options = []
101 |     for model in models:
102 |         options.append(load_config(model))
103 | 
104 |         fill_options(options[-1])
105 | 
106 |     rescore_model(source_file, target_file, saveto, models, options, b, normalize, verbose, alignweights)
107 | 
108 | if __name__ == "__main__":
109 |     parser = argparse.ArgumentParser()
110 |     parser.add_argument('-b', type=int, default=80,
111 |                         help="Minibatch size (default: %(default)s)")
112 |     parser.add_argument('-n', action="store_true",
113 |                         help="Normalize scores by sentence length")
114 |     parser.add_argument('-v', action="store_true", help="verbose mode.")
115 |     parser.add_argument('--models', '-m', type=str, nargs = '+', required=True,
116 |                         help="model to use. Provide multiple models (with same vocabulary) for ensemble decoding")
117 |     parser.add_argument('--source', '-s', type=argparse.FileType('r'),
118 |                         required=True, metavar='PATH',
119 |                         help="Source text file")
120 |     parser.add_argument('--target', '-t', type=argparse.FileType('r'),
121 |                         required=True, metavar='PATH',
122 |                         help="Target text file")
123 |     parser.add_argument('--output', '-o', type=argparse.FileType('w'),
124 |                         default=sys.stdout, metavar='PATH',
125 |                         help="Output file (default: standard output)")
126 |     parser.add_argument('--walign', '-w',required = False,action="store_true",
127 |                         help="Whether to store the alignment weights or not. If specified, weights will be saved in <target>.alignment")
128 | 
129 |     args = parser.parse_args()
130 | 
131 |     main(args.models, args.source, args.target,
132 |          args.output, b=args.b, normalize=args.n, verbose=args.v, alignweights=args.walign)
--------------------------------------------------------------------------------
/nematus/metrics/chrf.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | from scorer import Scorer
5 | from reference import Reference
6 | 
7 | class CharacterFScorer(Scorer):
8 |     """
9 |     Scores CharacterFScoreReference objects.
10 |     """
11 | 
12 |     def __init__(self, argument_string):
13 |         """
14 |         Initialises metric-specific parameters.
15 |         """
16 |         Scorer.__init__(self, argument_string)
17 |         # use character n-gram order of 6 by default
18 |         if not 'n' in self._arguments.keys():
19 |             self._arguments['n'] = 6
20 |         # use beta = 1 by default (recommendation by Maja Popovic for generative modelling)
21 |         if not 'beta' in self._arguments.keys():
22 |             self._arguments['beta'] = 1
23 | 
24 |     def set_reference(self, reference_tokens):
25 |         """
26 |         Sets the reference against which hypotheses are scored.
27 |         """
28 |         self._reference = CharacterFScoreReference(
29 |             reference_tokens,
30 |             self._arguments['n'],
31 |             self._arguments['beta']
32 |         )
33 | 
34 | class CharacterFScoreReference(Reference):
35 |     """
36 |     References for Character F-Score, as proposed by Popovic (2015): http://www.statmt.org/wmt15/pdf/WMT49.pdf
37 |     """
38 | 
39 |     def __init__(self, reference_tokens, n=6, beta=1):
40 |         """
41 |         @param reference the reference translation that hypotheses shall be
42 |                          scored against.
43 |         @param n         maximum character n-gram order to consider.
44 |         @param beta      algorithm parameter beta (interpolation weight, needs to be > 0).
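Given the argument parser in score.py above, a typical invocation looks like the following (model and corpus paths are placeholders; -n normalizes the log-probabilities by sentence length):

python nematus/score.py -m models/model.npz -s data/test.src -t data/test.trg -o test.scores -b 80 -n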
45 | """ 46 | if beta <= 0: 47 | raise ValueError("Value of beta needs to be larger than zero!") 48 | 49 | Reference.__init__(self, reference_tokens) 50 | self.n = n 51 | self.max_order = n 52 | self.beta_squared = beta ** 2 53 | 54 | # The paper specifies that whitespace is ignored, but for a training objective, 55 | #it's perhaps better to leave it in. According to the paper, it makes no 56 | #difference in practise for scoring. 57 | self._reference_string = " ".join(reference_tokens).strip() 58 | 59 | # Get n-grams from reference: 60 | self._reference_ngrams = self._get_ngrams(self._reference_string, self.n) 61 | 62 | def _get_ngrams(self, tokens, n): 63 | """ 64 | Extracts all n-grams up to order @param n from a list of @param tokens. 65 | """ 66 | n_grams_dict = {} 67 | length = len(tokens) 68 | #If the reference is shorter than n characters, insist on an exact match: 69 | if len(tokens) < n: 70 | self.max_order = len(tokens) 71 | m = 1 72 | while m <= n: #n-gram order 73 | i = m 74 | n_grams_list = [] 75 | order_dict = {} 76 | while (i <= length): 77 | n_grams_list.append(tokens[i-m:i]) 78 | i += 1 79 | for ngr in n_grams_list: 80 | order_dict[ngr] = order_dict.setdefault(ngr,0) + 1 81 | n_grams_dict[m] = order_dict 82 | m += 1 83 | return n_grams_dict 84 | 85 | def score(self, hypothesis_tokens): 86 | """ 87 | Scores @param hypothesis against this reference. 88 | 89 | @return the sentence-level ChrF score: 1.0 is best, 0.0 worst. 90 | """ 91 | #See comment above on treating whitespace. 92 | hypothesis_string = " ".join(hypothesis_tokens).strip() 93 | 94 | #If the hypothesis or the reference is empty, insist on an exact match: 95 | if len(self._reference_string) < 1 or len(hypothesis_string) < 1: 96 | if hypothesis_string == self._reference_string: 97 | return 1.0 98 | else: 99 | return 0.0 100 | 101 | hypothesis_ngrams = self._get_ngrams(hypothesis_string, self.n) 102 | 103 | #Calculate character precision: 104 | chrP = 0.0 105 | chrR = 0.0 106 | for m in range(1,self.n+1): 107 | hyp_count = 0.0 108 | count_total = 0.0 109 | count_in = 0.0 110 | for ngr in hypothesis_ngrams[m]: 111 | hyp_count = hypothesis_ngrams[m][ngr] 112 | count_total += hyp_count 113 | if ngr in self._reference_ngrams[m]: 114 | count_in += min(hyp_count, self._reference_ngrams[m][ngr]) 115 | #Catch division by zero: 116 | if count_total == 0.0: 117 | chrP += 0.0 118 | else: 119 | chrP += count_in / count_total 120 | #average chrP over n-gram orders: 121 | chrP = chrP / float(self.max_order) 122 | 123 | #Calculate character recall: 124 | for m in range(1,self.n+1): 125 | ref_count = 0.0 126 | count_total = 0.0 127 | count_in = 0.0 128 | for ngr in self._reference_ngrams[m]: 129 | ref_count = self._reference_ngrams[m][ngr] 130 | count_total += ref_count 131 | if ngr in hypothesis_ngrams[m]: 132 | count_in += min(ref_count, hypothesis_ngrams[m][ngr]) 133 | #Catch division by zero: 134 | if count_total == 0.0: 135 | chrR += 0.0 136 | else: 137 | chrR += count_in/count_total 138 | #average chrR over n-gram orders: 139 | chrR = chrR / float(self.max_order) 140 | 141 | #Catch division by zero: 142 | if chrP == 0.0 and chrR == 0.0: 143 | return 0.0 144 | return (1 + self.beta_squared) * (chrP*chrR) / ((self.beta_squared * chrP) + chrR) -------------------------------------------------------------------------------- /nematus/rescore.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Rescoring an n-best list of translations using a translation model. 
3 | ''' 4 | import sys 5 | import argparse 6 | import tempfile 7 | 8 | import numpy 9 | import json 10 | 11 | from data_iterator import TextIterator 12 | from util import load_dict, load_config 13 | from alignment_util import * 14 | from compat import fill_options 15 | 16 | from theano_util import (load_params, init_theano_params) 17 | from nmt import (pred_probs, build_model, prepare_data, init_params) 18 | 19 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 20 | import theano 21 | 22 | def rescore_model(source_file, nbest_file, saveto, models, options, b, normalize, verbose, alignweights): 23 | 24 | trng = RandomStreams(1234) 25 | 26 | fs_log_probs = [] 27 | 28 | for model, option in zip(models, options): 29 | 30 | # load model parameters and set theano shared variables 31 | param_list = numpy.load(model).files 32 | param_list = dict.fromkeys([key for key in param_list if not key.startswith('adam_')], 0) 33 | params = load_params(model, param_list) 34 | tparams = init_theano_params(params) 35 | 36 | trng, use_noise, \ 37 | x, x_mask, y, y_mask, \ 38 | opt_ret, \ 39 | cost = \ 40 | build_model(tparams, option) 41 | inps = [x, x_mask, y, y_mask] 42 | use_noise.set_value(0.) 43 | 44 | if alignweights: 45 | sys.stderr.write("\t*** Save weight mode ON, alignment matrix will be saved.\n") 46 | outputs = [cost, opt_ret['dec_alphas']] 47 | f_log_probs = theano.function(inps, outputs) 48 | else: 49 | f_log_probs = theano.function(inps, cost) 50 | 51 | fs_log_probs.append(f_log_probs) 52 | 53 | def _score(pairs, alignweights=False): 54 | # sample given an input sequence and obtain scores 55 | scores = [] 56 | alignments = [] 57 | for i, f_log_probs in enumerate(fs_log_probs): 58 | score, alignment = pred_probs(f_log_probs, prepare_data, options[i], pairs, normalize=normalize, alignweights = alignweights) 59 | scores.append(score) 60 | alignments.append(alignment) 61 | 62 | return scores, alignments 63 | 64 | lines = source_file.readlines() 65 | nbest_lines = nbest_file.readlines() 66 | 67 | if alignweights: ### opening the temporary file. 68 | temp_name = saveto.name + ".json" 69 | align_OUT = tempfile.NamedTemporaryFile(prefix=temp_name) 70 | 71 | with tempfile.NamedTemporaryFile(prefix='rescore-tmpin') as tmp_in, tempfile.NamedTemporaryFile(prefix='rescore-tmpout') as tmp_out: 72 | for line in nbest_lines: 73 | linesplit = line.split(' ||| ') 74 | idx = int(linesplit[0]) ##index from the source file. Starting from 0. 75 | tmp_in.write(lines[idx]) 76 | tmp_out.write(linesplit[1] + '\n') 77 | 78 | tmp_in.seek(0) 79 | tmp_out.seek(0) 80 | pairs = TextIterator(tmp_in.name, tmp_out.name, 81 | options[0]['dictionaries'][:-1], options[0]['dictionaries'][1], 82 | n_words_source=options[0]['n_words_src'], n_words_target=options[0]['n_words'], 83 | batch_size=b, 84 | maxlen=float('inf'), 85 | sort_by_length=False) #TODO: sorting by length could be more efficient, but we'd have to synchronize scores with n-best list after 86 | 87 | 88 | scores, alignments = _score(pairs, alignweights) 89 | 90 | for i, line in enumerate(nbest_lines): 91 | score_str = ' '.join(map(str,[s[i] for s in scores])) 92 | saveto.write('{0} {1}\n'.format(line.strip(), score_str)) 93 | 94 | ### optional save weights mode. 
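The n-best list consumed by rescore.py follows the usual Moses convention, "sentence-id ||| hypothesis ||| ...", where the zero-based id indexes into the source file; only the first two fields are used in the loop above. A minimal sketch of that field handling (the example line is invented):

# Sketch of the n-best fields rescore_model relies on.
line = "0 ||| this is a translation ||| F0= -4.2 ||| -1.37"
fields = line.split(' ||| ')
idx, hyp = int(fields[0]), fields[1]   # source-sentence index and hypothesis
print(idx, hyp)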
95 | if alignweights: 96 | for line in alignments: 97 | align_OUT.write(line + "\n") 98 | if alignweights: 99 | combine_source_target_text(source_file, nbest_file, saveto.name, align_OUT) 100 | align_OUT.close() 101 | 102 | def main(models, source_file, nbest_file, saveto, b=80, 103 | normalize=False, verbose=False, alignweights=False): 104 | 105 | # load model model_options 106 | options = [] 107 | for model in models: 108 | options.append(load_config(model)) 109 | 110 | fill_options(options[-1]) 111 | 112 | rescore_model(source_file, nbest_file, saveto, models, options, b, normalize, verbose, alignweights) 113 | 114 | if __name__ == "__main__": 115 | parser = argparse.ArgumentParser() 116 | parser.add_argument('-b', type=int, default=80, 117 | help="Minibatch size (default: %(default)s))") 118 | parser.add_argument('-n', action="store_true", 119 | help="Normalize scores by sentence length") 120 | parser.add_argument('-v', action="store_true", help="verbose mode.") 121 | parser.add_argument('--models', '-m', type=str, nargs = '+', required=True, 122 | help="model to use. Provide multiple models (with same vocabulary) for ensemble decoding") 123 | parser.add_argument('--source', '-s', type=argparse.FileType('r'), 124 | required=True, metavar='PATH', 125 | help="Source text file") 126 | parser.add_argument('--input', '-i', type=argparse.FileType('r'), 127 | default=sys.stdin, metavar='PATH', 128 | help="Input n-best list file (default: standard input)") 129 | parser.add_argument('--output', '-o', type=argparse.FileType('w'), 130 | default=sys.stdout, metavar='PATH', 131 | help="Output file (default: standard output)") 132 | parser.add_argument('--walign', '-w',required = False,action="store_true", 133 | help="Whether to store the alignment weights or not. If specified, weights will be saved in .alignment") 134 | 135 | args = parser.parse_args() 136 | 137 | main(args.models, args.source, args.input, 138 | args.output, b=args.b, normalize=args.n, verbose=args.v, alignweights=args.walign) 139 | -------------------------------------------------------------------------------- /nematus/optimizers.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Optimizers 3 | ''' 4 | 5 | import numpy 6 | from collections import OrderedDict 7 | 8 | import theano 9 | import theano.tensor as tensor 10 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 11 | 12 | from util import * 13 | from theano_util import * 14 | 15 | # Calling convention: 16 | # f_grad_shared, f_update = name(hyperp, tparams, grads, inputs (list), cost) 17 | # with profile as an optional argument 18 | 19 | def adam(lr, tparams, grads, inp, cost, beta1=0.9, beta2=0.999, e=1e-8, optimizer_params={}, profile=False): 20 | PREFIX='adam_' 21 | 22 | gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k) 23 | for k, p in tparams.iteritems()] 24 | gsup = [(gs, g) for gs, g in zip(gshared, grads)] 25 | 26 | f_grad_shared = theano.function(inp, cost, updates=gsup, profile=profile) 27 | 28 | updates = [] 29 | optimizer_tparams = {} 30 | 31 | t_prev_name = PREFIX + 't_prev' 32 | if t_prev_name in optimizer_params: 33 | t_prev_init = optimizer_params[t_prev_name] 34 | else: 35 | t_prev_init = 0. 36 | t_prev = theano.shared(numpy.float32(t_prev_init), t_prev_name) 37 | optimizer_tparams[t_prev_name] = t_prev 38 | 39 | t = t_prev + 1. 40 | lr_t = lr * tensor.sqrt(1. - beta2**t) / (1. 
- beta1**t) 41 | 42 | for p, g in zip(tparams.values(), gshared): 43 | # Create/Load variable for first moment 44 | m_name = PREFIX + p.name + '_mean' 45 | if m_name in optimizer_params: 46 | m_init = optimizer_params[m_name] 47 | else: 48 | m_init = p.get_value() * 0. 49 | m = theano.shared(m_init, m_name) 50 | optimizer_tparams[m_name] = m 51 | 52 | # Create/Load variable for second moment 53 | v_name = PREFIX + p.name + '_variance' 54 | if v_name in optimizer_params: 55 | v_init = optimizer_params[v_name] 56 | else: 57 | v_init = p.get_value() * 0. 58 | v = theano.shared(v_init, v_name) 59 | optimizer_tparams[v_name] = v 60 | 61 | # Define updates on shared vars 62 | m_t = beta1 * m + (1. - beta1) * g 63 | v_t = beta2 * v + (1. - beta2) * g**2 64 | step = lr_t * m_t / (tensor.sqrt(v_t) + e) 65 | p_t = p - step 66 | updates.append((m, m_t)) 67 | updates.append((v, v_t)) 68 | updates.append((p, p_t)) 69 | updates.append((t_prev, t)) 70 | 71 | f_update = theano.function([lr], [], updates=updates, 72 | on_unused_input='ignore', profile=profile) 73 | 74 | return f_grad_shared, f_update, optimizer_tparams 75 | 76 | def adadelta(lr, tparams, grads, inp, cost, optimizer_params={}, profile=False): 77 | zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), 78 | name='%s_grad' % k) 79 | for k, p in tparams.iteritems()] 80 | running_up2 = [theano.shared(p.get_value() * numpy.float32(0.), 81 | name='%s_rup2' % k) 82 | for k, p in tparams.iteritems()] 83 | running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.), 84 | name='%s_rgrad2' % k) 85 | for k, p in tparams.iteritems()] 86 | 87 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 88 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 89 | for rg2, g in zip(running_grads2, grads)] 90 | 91 | f_grad_shared = theano.function(inp, cost, updates=zgup+rg2up, 92 | profile=profile) 93 | 94 | updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg 95 | for zg, ru2, rg2 in zip(zipped_grads, running_up2, 96 | running_grads2)] 97 | ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2)) 98 | for ru2, ud in zip(running_up2, updir)] 99 | param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)] 100 | 101 | f_update = theano.function([lr], [], updates=ru2up+param_up, 102 | on_unused_input='ignore', profile=profile) 103 | 104 | # TODO: third return value should be a dict of name->shared var used by optimizer 105 | return f_grad_shared, f_update, {} 106 | 107 | 108 | def rmsprop(lr, tparams, grads, inp, cost, optimizer_params={}, profile=False): 109 | zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), 110 | name='%s_grad' % k) 111 | for k, p in tparams.iteritems()] 112 | running_grads = [theano.shared(p.get_value() * numpy.float32(0.), 113 | name='%s_rgrad' % k) 114 | for k, p in tparams.iteritems()] 115 | running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.), 116 | name='%s_rgrad2' % k) 117 | for k, p in tparams.iteritems()] 118 | 119 | zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)] 120 | rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)] 121 | rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) 122 | for rg2, g in zip(running_grads2, grads)] 123 | 124 | f_grad_shared = theano.function(inp, cost, updates=zgup+rgup+rg2up, 125 | profile=profile) 126 | 127 | updir = [theano.shared(p.get_value() * numpy.float32(0.), 128 | name='%s_updir' % k) 129 | for k, p in tparams.iteritems()] 130 | updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4)) 131 | for ud, zg, rg, rg2 
in zip(updir, zipped_grads, running_grads, 132 | running_grads2)] 133 | param_up = [(p, p + udn[1]) 134 | for p, udn in zip(itemlist(tparams), updir_new)] 135 | f_update = theano.function([lr], [], updates=updir_new+param_up, 136 | on_unused_input='ignore', profile=profile) 137 | 138 | # TODO: third return value should be a dict of name->shared var used by optimizer 139 | return f_grad_shared, f_update, {} 140 | 141 | 142 | def sgd(lr, tparams, grads, inp, cost, optimizer_params=None, profile=False): 143 | gshared = [theano.shared(p.get_value() * 0., 144 | name='%s_grad' % k) 145 | for k, p in tparams.iteritems()] 146 | gsup = [(gs, g) for gs, g in zip(gshared, grads)] 147 | 148 | f_grad_shared = theano.function(inp, cost, updates=gsup, 149 | profile=profile) 150 | 151 | pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)] 152 | f_update = theano.function([lr], [], updates=pup, profile=profile) 153 | 154 | return f_grad_shared, f_update, {} 155 | 156 | -------------------------------------------------------------------------------- /nematus/domain_interpolation_data_iterator.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | import gzip 4 | 5 | import shuffle 6 | from util import load_dict 7 | 8 | import math 9 | 10 | def fopen(filename, mode='r'): 11 | if filename.endswith('.gz'): 12 | return gzip.open(filename, mode) 13 | return open(filename, mode) 14 | 15 | 16 | class DomainInterpolatorTextIterator: 17 | """Bitext iterator with domain interpolation.""" 18 | def __init__(self, source, target, 19 | source_dicts, target_dict, 20 | batch_size=128, 21 | maxlen=100, 22 | n_words_source=-1, 23 | n_words_target=-1, 24 | skip_empty=False, 25 | shuffle_each_epoch=False, 26 | sort_by_length=True, 27 | indomain_source='', indomain_target='', 28 | interpolation_rate=0.1, 29 | maxibatch_size=20): 30 | if shuffle_each_epoch: 31 | self.source_orig = source 32 | self.target_orig = target 33 | self.source, self.target = shuffle.main([self.source_orig, self.target_orig], temporary=True) 34 | self.indomain_source_orig = indomain_source 35 | self.indomain_target_orig = indomain_target 36 | self.indomain_source, self.indomain_target = shuffle.main([self.indomain_source_orig, self.indomain_target_orig], temporary=True) 37 | else: 38 | self.source = fopen(source, 'r') 39 | self.target = fopen(target, 'r') 40 | self.indomain_source = fopen(indomain_source, 'r') 41 | self.indomain_target = fopen(indomain_target, 'r') 42 | self.source_dicts = [] 43 | for source_dict in source_dicts: 44 | self.source_dicts.append(load_dict(source_dict)) 45 | self.target_dict = load_dict(target_dict) 46 | 47 | self.batch_size = batch_size 48 | self.maxlen = maxlen 49 | self.skip_empty = skip_empty 50 | 51 | self.n_words_source = n_words_source 52 | self.n_words_target = n_words_target 53 | 54 | if self.n_words_source > 0: 55 | for d in self.source_dicts: 56 | for key, idx in d.items(): 57 | if idx >= self.n_words_source: 58 | del d[key] 59 | 60 | if self.n_words_target > 0: 61 | for key, idx in self.target_dict.items(): 62 | if idx >= self.n_words_target: 63 | del self.target_dict[key] 64 | 65 | self.shuffle = shuffle_each_epoch 66 | self.sort_by_length = sort_by_length 67 | 68 | self.source_buffer = [] 69 | self.target_buffer = [] 70 | self.k = batch_size * maxibatch_size 71 | 72 | self.end_of_data = False 73 | 74 | self.interpolation_rate = interpolation_rate 75 | self.cur_interpolation_rate = self.interpolation_rate 76 | self.indomain_k = 
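Written out, the Adam update implemented in the adam() function above is: m <- beta1*m + (1-beta1)*g, v <- beta2*v + (1-beta2)*g^2, lr_t = lr*sqrt(1-beta2^t)/(1-beta1^t), p <- p - lr_t*m/(sqrt(v)+e). A plain numpy transcription of the same update on a made-up quadratic objective (no Theano needed):

# Numpy sketch of the Adam update above, minimizing sum(p**2).
import numpy

p = numpy.array([5.0, -3.0])
m, v = numpy.zeros_like(p), numpy.zeros_like(p)
lr, beta1, beta2, e = 0.1, 0.9, 0.999, 1e-8

for t in range(1, 201):
    g = 2.0 * p                                   # gradient of the toy objective
    m = beta1 * m + (1. - beta1) * g
    v = beta2 * v + (1. - beta2) * g**2
    lr_t = lr * numpy.sqrt(1. - beta2**t) / (1. - beta1**t)
    p = p - lr_t * m / (numpy.sqrt(v) + e)

print(p)   # approaches [0. 0.]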
int(math.ceil(self.cur_interpolation_rate * self.k)) 77 | self.outdomain_k = self.k - self.indomain_k 78 | 79 | def __iter__(self): 80 | return self 81 | 82 | def reset(self): 83 | if self.shuffle: 84 | self.source, self.target = shuffle.main([self.source_orig, self.target_orig], temporary=True) 85 | else: 86 | self.source.seek(0) 87 | self.target.seek(0) 88 | 89 | def indomain_reset(self): 90 | if self.shuffle: 91 | self.indomain_source, self.indomain_target = shuffle.main([self.indomain_source_orig, self.indomain_target_orig], temporary=True) 92 | else: 93 | self.indomain_source.seek(0) 94 | self.indomain_target.seek(0) 95 | 96 | def adjust_domain_interpolation_rate(self, interpolation_rate): 97 | # discard sentences in buffers 98 | self.source_buffer = [] 99 | self.target_buffer = [] 100 | # adjust rate 101 | self.cur_interpolation_rate = interpolation_rate 102 | self.indomain_k = int(math.ceil(self.cur_interpolation_rate * self.k)) 103 | self.outdomain_k = self.k - self.indomain_k 104 | 105 | def next(self): 106 | if self.end_of_data: 107 | self.end_of_data = False 108 | self.reset() 109 | #raise StopIteration 110 | 111 | source = [] 112 | target = [] 113 | 114 | # fill buffer, if it's empty 115 | assert len(self.source_buffer) == len(self.target_buffer), 'Buffer size mismatch!' 116 | 117 | if len(self.source_buffer) == 0: 118 | for k_ in xrange(self.outdomain_k): 119 | ss = self.source.readline() 120 | if ss == "": 121 | break 122 | tt = self.target.readline() 123 | if tt == "": 124 | break 125 | self.source_buffer.append(ss.strip().split()) 126 | self.target_buffer.append(tt.strip().split()) 127 | for k_ in xrange(self.indomain_k): 128 | indomain_error = False 129 | try: 130 | ss = self.indomain_source.readline() 131 | tt = self.indomain_target.readline() 132 | except IOError: 133 | indomain_error = True 134 | if (ss == "") or (tt == "") or indomain_error: 135 | self.indomain_reset() 136 | raise StopIteration 137 | self.source_buffer.append(ss.strip().split()) 138 | self.target_buffer.append(tt.strip().split()) 139 | 140 | # sort by target buffer 141 | if self.sort_by_length: 142 | tlen = numpy.array([len(t) for t in self.target_buffer]) 143 | tidx = tlen.argsort() 144 | 145 | _sbuf = [self.source_buffer[i] for i in tidx] 146 | _tbuf = [self.target_buffer[i] for i in tidx] 147 | 148 | self.source_buffer = _sbuf 149 | self.target_buffer = _tbuf 150 | 151 | else: 152 | self.source_buffer.reverse() 153 | self.target_buffer.reverse() 154 | 155 | if len(self.source_buffer) == 0 or len(self.target_buffer) == 0: 156 | self.end_of_data = False 157 | self.reset() 158 | #raise StopIteration 159 | 160 | try: 161 | 162 | # actual work here 163 | while True: 164 | 165 | # read from source file and map to word index 166 | try: 167 | ss = self.source_buffer.pop() 168 | except IndexError: 169 | break 170 | tmp = [] 171 | for w in ss: 172 | w = [self.source_dicts[i][f] if f in self.source_dicts[i] else 1 for (i,f) in enumerate(w.split('|'))] 173 | tmp.append(w) 174 | ss = tmp 175 | 176 | # read from source file and map to word index 177 | tt = self.target_buffer.pop() 178 | tt = [self.target_dict[w] if w in self.target_dict else 1 179 | for w in tt] 180 | if self.n_words_target > 0: 181 | tt = [w if w < self.n_words_target else 1 for w in tt] 182 | 183 | if len(ss) > self.maxlen and len(tt) > self.maxlen: 184 | continue 185 | if self.skip_empty and (not ss or not tt): 186 | continue 187 | 188 | source.append(ss) 189 | target.append(tt) 190 | 191 | if len(source) >= self.batch_size or \ 192 | 
105 |     def next(self):
106 |         if self.end_of_data:
107 |             self.end_of_data = False
108 |             self.reset()
109 |             #raise StopIteration
110 | 
111 |         source = []
112 |         target = []
113 | 
114 |         # fill buffer, if it's empty
115 |         assert len(self.source_buffer) == len(self.target_buffer), 'Buffer size mismatch!'
116 | 
117 |         if len(self.source_buffer) == 0:
118 |             for k_ in xrange(self.outdomain_k):
119 |                 ss = self.source.readline()
120 |                 if ss == "":
121 |                     break
122 |                 tt = self.target.readline()
123 |                 if tt == "":
124 |                     break
125 |                 self.source_buffer.append(ss.strip().split())
126 |                 self.target_buffer.append(tt.strip().split())
127 |             for k_ in xrange(self.indomain_k):
128 |                 indomain_error = False
129 |                 try:
130 |                     ss = self.indomain_source.readline()
131 |                     tt = self.indomain_target.readline()
132 |                 except IOError:
133 |                     indomain_error = True
134 |                 if (ss == "") or (tt == "") or indomain_error:
135 |                     self.indomain_reset()
136 |                     raise StopIteration
137 |                 self.source_buffer.append(ss.strip().split())
138 |                 self.target_buffer.append(tt.strip().split())
139 | 
140 |             # sort by target buffer
141 |             if self.sort_by_length:
142 |                 tlen = numpy.array([len(t) for t in self.target_buffer])
143 |                 tidx = tlen.argsort()
144 | 
145 |                 _sbuf = [self.source_buffer[i] for i in tidx]
146 |                 _tbuf = [self.target_buffer[i] for i in tidx]
147 | 
148 |                 self.source_buffer = _sbuf
149 |                 self.target_buffer = _tbuf
150 | 
151 |             else:
152 |                 self.source_buffer.reverse()
153 |                 self.target_buffer.reverse()
154 | 
155 |         if len(self.source_buffer) == 0 or len(self.target_buffer) == 0:
156 |             self.end_of_data = False
157 |             self.reset()
158 |             #raise StopIteration
159 | 
160 |         try:
161 | 
162 |             # actual work here
163 |             while True:
164 | 
165 |                 # read from source file and map to word index
166 |                 try:
167 |                     ss = self.source_buffer.pop()
168 |                 except IndexError:
169 |                     break
170 |                 tmp = []
171 |                 for w in ss:
172 |                     w = [self.source_dicts[i][f] if f in self.source_dicts[i] else 1 for (i,f) in enumerate(w.split('|'))]
173 |                     tmp.append(w)
174 |                 ss = tmp
175 | 
176 |                 # read from target file and map to word index
177 |                 tt = self.target_buffer.pop()
178 |                 tt = [self.target_dict[w] if w in self.target_dict else 1
179 |                       for w in tt]
180 |                 if self.n_words_target > 0:
181 |                     tt = [w if w < self.n_words_target else 1 for w in tt]
182 | 
183 |                 if len(ss) > self.maxlen and len(tt) > self.maxlen:  # note: a pair is skipped only if *both* sides exceed maxlen
184 |                     continue
185 |                 if self.skip_empty and (not ss or not tt):
186 |                     continue
187 | 
188 |                 source.append(ss)
189 |                 target.append(tt)
190 | 
191 |                 if len(source) >= self.batch_size or \
192 |                         len(target) >= self.batch_size:
193 |                     break
194 |         except IOError:
195 |             self.end_of_data = True
196 | 
197 |         # all sentence pairs in maxibatch filtered out because of length
198 |         if len(source) == 0 or len(target) == 0:
199 |             source, target = self.next()
200 | 
201 |         return source, target
202 | 
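# A minimal usage sketch (an editor's illustration; the file names below are
# hypothetical). Each refill of the internal buffer draws
# k = batch_size * maxibatch_size sentence pairs: ceil(interpolation_rate * k)
# from the in-domain bitext and the remainder from the out-of-domain bitext.
if __name__ == '__main__':
    train = DomainInterpolatorTextIterator(
        'corpus.bpe.src', 'corpus.bpe.trg',        # out-of-domain bitext
        ['vocab.src.json'], 'vocab.trg.json',      # source dict(s) and target dict
        batch_size=80, maxibatch_size=20,
        indomain_source='indomain.bpe.src', indomain_target='indomain.bpe.trg',
        interpolation_rate=0.1)                    # ~10% in-domain per maxibatch
    for source_batch, target_batch in train:       # lists of word-index lists
        print len(source_batch), 'sentence pairs in batch'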
--------------------------------------------------------------------------------
/test/data/indomain-dev.en:
--------------------------------------------------------------------------------
1 | one day , Los Angeles Times colum@@ n@@ ist Steve Lopez was walking along the streets of downtown Los Angeles when he heard beautiful music .
2 | and the source was a man , an African-@@ American man , charming , rugged , homeless , playing a violin that only had two strings .
3 | and I 'm telling a story that many of you know , because Steve 's columns became the basis for a book , which was turned into a movie , with Robert Do@@ w@@ ney Jr. acting as Steve Lopez , and Jamie Fo@@ xx as Nath@@ an@@ iel Anthony A@@ yers , the Ju@@ illi@@ ard-@@ trained double b@@ assist whose promising career was cut short by a tragic afflic@@ tion with paranoid schizophren@@ ia .
4 | Nath@@ an@@ iel dropped out of Ju@@ illi@@ ard , he suffered a complete breakdown , and 30 years later he was living homeless on the streets of Sk@@ id Ro@@ w in downtown Los Angeles .
5 | I encourage all of you to read Steve 's book or to watch the movie to understand not only the beautiful bond that formed between these two men , but how music helped shape that bond , and ultimately was instrumental -- if you 'll pardon the p@@ un -- in helping Nath@@ an@@ iel get off the streets .
6 | I met Mr. A@@ yers in 2008 , two years ago , at Walt Disney Concert Hall .
7 | he had just heard a performance of Beethoven 's First and Fourth symph@@ onies , and came back@@ stage and introduced himself .
8 | he was speaking in a very jo@@ vial and greg@@ arious way about Y@@ o-@@ Y@@ o Ma and Hillary Clinton and how the Dod@@ gers were never going to make the World Series , all because of the tre@@ acher@@ ous first violin passage work in the last movement of Beethoven 's Fourth Symphony .
9 | and we got talking about music , and I got an email from Steve a few days later saying that Nath@@ an@@ iel was interested in a violin lesson with me .
10 | now , I should mention that Nath@@ an@@ iel refuses treatment because when he was treated it was with shock therapy and Thor@@ az@@ ine and hand@@ cu@@ ffs , and that scar has stayed with him for his entire life .
11 | but as a result now , he is prone to these schizophren@@ ic episodes , the worst of which can manifest themselves as and then disappearing for days , wandering the streets of Sk@@ id Ro@@ w , exposed to its horrors , with the tor@@ ment of his own mind unleashed upon him .
12 | and Nath@@ an@@ iel was in such a state of ag@@ itation when we started our first lesson at Walt Disney Concert Hall -- he had a kind of man@@ ic g@@ lin@@ t in his eyes , he was lost .
13 | and he was talking about invisible demons and smoke , and how someone was poisoning him in his sleep .
14 | and I was afraid , not for myself , but I was afraid that I was going to lose him , that he was going to sink into one of his states , and that I would ruin his relationship with the violin if I started talking about scales and ar@@ peg@@ gi@@ os and other exciting forms of didac@@ tic violin pedagog@@ y .
15 | so , I just started playing .
16 | and I played the first movement of the Beethoven Viol@@ in Concerto .
17 | and as I played , I understood that there was a profound change occurring in Nath@@ an@@ iel 's eyes .
18 | it was as if he was in the grip of some invisible pharmaceutical , a chemical reaction , for which my playing the music was its catalyst .
19 | and Nath@@ an@@ iel 's man@@ ic rage was transformed into understanding , a quiet curiosity and grace .
20 | and in a miracle , he lifted his own violin and he started playing , by ear , certain sni@@ ppets of violin concer@@ tos which he then asked me to complete -- Mendelssohn , T@@ ch@@ ai@@ kovsky , Si@@ bel@@ ius .
21 | and we started talking about music , from Bach to Beethoven and Brahms , Bruck@@ ner , all the B 's , from Bart@@ ó@@ k , all the way up to E@@ sa-@@ Pek@@ ka Sal@@ onen .
22 | and I understood that he not only had an en@@ cyclop@@ edic knowledge of music , but he related to this music at a personal level .
23 | he spoke about it with the kind of passion and understanding that I share with my colleagues in the Los Angeles Philharmonic .
24 | and through playing music and talking about music , this man had transformed from the paranoid , disturbed man that had just come from walking the streets of downtown Los Angeles to the charming , eru@@ dite , brilliant , Ju@@ illi@@ ard-@@ trained musician .
25 | music is medicine . music changes us .
26 | and for Nath@@ an@@ iel , music is san@@ ity .
27 | because music allows him to take his thoughts and delu@@ sions and shape them through his imagination and his creativity , into reality .
28 | and that is an escape from his tor@@ mented state .
29 | and I understood that this was the very essence of art .
30 | this was the very reason why we made music , that we take something that exists within all of us at our very fundamental core , our emotions , and through our artistic lens , through our creativity , we 're able to shape those emotions into reality .
31 | and the reality of that expression reaches all of us and moves us , inspires and unites us .
32 | and for Nath@@ an@@ iel , music brought him back into a fold of friends .
33 | the rede@@ mp@@ tive power of music brought him back into a family of musicians that understood him , that recognized his talents and respected him .
34 | and I will always make music with Nath@@ an@@ iel , whether we 're at Walt Disney Concert Hall or on Sk@@ id Ro@@ w , because he reminds me why I became a musician .
35 | thank you .
36 | Bruno Gi@@ uss@@ ani : thank you . thanks .
37 | Robert G@@ up@@ ta .
38 | Robert G@@ up@@ ta : I 'm going to play something that I sham@@ elessly st@@ ole from cell@@ ists .
39 | so , please forgive me .
40 | so , I 've known a lot of fish in my life .
41 | I 've loved only two .
42 | that first one , it was more like a passionate affair .
43 | it was a beautiful fish : fla@@ vor@@ ful , tex@@ tured , me@@ aty , a bes@@ tseller on the menu .
44 | what a fish .
45 | even better , it was far@@ m-@@ raised to the supposed highest standards of sustainability .
46 | so you could feel good about selling it .
47 | I was in a relationship with this beauty for several months .
48 | one day , the head of the company called and asked if I 'd speak at an event about the farm 's sustainability .
49 | `` Absol@@ utely , '' I said .
50 | here was a company trying to solve what 's become this unimaginable problem for us chefs : how do we keep fish on our menus ?
51 | for the past 50 years , we 've been fishing the seas like we clear-cut forests .
52 | it 's hard to over@@ state the destruction .
53 | nin@@ ety percent of large fish , the ones we love -- the tun@@ as , the hal@@ i@@ bu@@ ts , the sal@@ mons , s@@ word@@ fish -- they 've collapsed .
54 | there 's almost nothing left .
55 | so , for better or for worse , aquaculture , fish farming , is going to be a part of our future .
56 | a lot of arguments against it : fish farms pollute -- most of them do anyway -- and they 're inefficient . take tuna , a major drawback .
57 | it 's got a feed conversion ratio of 15 to one .
58 | that means it takes fifteen pounds of wild fish to get you one pound of farm tuna .
59 | not very sustainable .
60 | it does n't taste very good either .
61 | so here , finally , was a company trying to do it right .
62 | I wanted to support them .
63 | the day before the event , I called the head of P.@@ R. for the company .
64 | let 's call him Don .
65 | `` Don , '' I said , `` just to get the facts straight , you guys are famous for farming so far out to sea , you do n't pollute . ''
66 | `` That 's right , '' he said . `` We 're so far out , the waste from our fish gets distributed , not concentrated . ''
67 | and then he added , `` We 're basically a world unto ourselves .
68 | that feed conversion ratio ? 2.5 to one , '' he said .
69 | `` Best in the business . ''
70 | 2.5 to one , great .
71 | `` 2.5 what ? what are you feeding ? ''
72 | `` Sustainable proteins , '' he said .
73 | `` Great , '' I said . got off the phone .
74 | and that night , I was lying in bed , and I thought : what the hell is a sustainable protein ?
75 | so the next day , just before the event , I called Don .
76 | I said , `` Don , what are some examples of sustainable proteins ? ''
77 | he said he did n't know . he would ask around .
78 | well , I got on the phone with a few people in the company ; no one could give me a straight answer until finally , I got on the phone with the head bi@@ ologist .
79 | let 's call him Don too .
80 | `` Don , '' I said , `` what are some examples of sustainable proteins ? ''
81 | well , he mentioned some al@@ ga@@ es and some fish meals , and then he said chicken pellets .
82 | I said , `` Ch@@ icken pellets ? ''
83 | he said , `` Ye@@ ah , feathers , skin , bone meal , scra@@ ps , dried and processed into feed . ''
84 | I said , `` What percentage of your feed is chicken ? ''
85 | thinking , you know , two percent .
86 | `` Well , it 's about 30 percent , '' he said .
87 | I said , `` Don , what 's sustainable about feeding chicken to fish ? ''
88 | there was a long pause on the line , and he said , `` There 's just too much chicken in the world . ''
89 | I fell out of love with this fish .
90 | no , not because I 'm some self-@@ righteous , good@@ y-@@ two shoes Foo@@ die .
91 | I actually am .
92 | no , I actually fell out of love with this fish because , I swe@@ ar to God , after that conversation , the fish tasted like chicken .
93 | this second fish , it 's a different kind of love story .
94 | it 's the romantic kind , the kind where the more you get to know your fish , you love the fish .
95 | I first ate it at a restaurant in southern Spain .
96 | a journalist friend had been talking about this fish for a long time .
97 | she kind of set us up .
98 | it came to the table a bright , almost sh@@ immer@@ ing , white color .
99 | the chef had over@@ cooked it .
100 | like twice over .
101 | 
--------------------------------------------------------------------------------
/nematus/alignment_util.py:
--------------------------------------------------------------------------------
1 | __author__ = 'canliu'
2 | """
3 | Save the alignment matrix in XML format. Like the following:
4 | 
5 | <sentence id="1">
6 | <alignment>
7 | <matrix>
8 | x,x,x...
9 | x,x,x...
10 | </matrix>
11 | </alignment>
12 | </sentence>
13 | The number of rows is equal to the number of target_words + 1.
14 | The number of columns is equal to the number of source_words + 1.
15 | """
16 | import json
17 | import sys
18 | import codecs
19 | 
20 | def get_alignments(attention, x_mask, y_mask):
21 |     #print "\nPrinting Attention..."
22 |     #print attention
23 |     #print "\nPrinting x_mask, need to figure out how to use it"
24 |     #print x_mask
25 |     #print "\nPrinting y_mask, need to figure out how to use it"
26 |     #print y_mask
27 | 
28 |     n_rows, n_cols = y_mask.shape  ### n_cols corresponds to the number of sentences.
29 |     #print "Number of rows and number of columns: \n\n", n_rows, n_cols
30 | 
31 |     for target_sent_index in range(n_cols):
32 |         #print "\n\n","*" * 40
33 |         print "Going through sentence", target_sent_index
34 |         #source_sent_index = source_indexes[target_sent_index]
35 |         target_length = y_mask[:,target_sent_index].tolist().count(1)
36 |         source_length = x_mask[:,target_sent_index].tolist().count(1)
37 |         # #print "STEP1: The attention matrix that is relevant for this sentence",
38 |         temp_attention = attention[range(target_length),:,:]
39 |         #print "STEP2: The attention matrix that is particular to just this sentence\n",
40 |         this_attention = temp_attention[:,target_sent_index,range(source_length)]
41 | 
42 |         jdata = {}
43 |         jdata['matrix'] = this_attention.tolist()
44 |         jdata = json.dumps(jdata)
45 |         #print "\t\tJSON Data"
46 |         #print "\t\t",jdata
47 |         yield jdata
48 | 
49 | def combine_source_target_text(source_IN, nbest_IN, saveto, alignment_IN):
50 |     """
51 |     There can be multiple target sentences aligned to the same source sentence.
52 |     """
53 |     source_IN.seek(0)
54 |     nbest_IN.seek(0)
55 |     alignment_IN.seek(0)
56 | 
57 |     with open(saveto + "_withwords.json", "w") as alignment_OUT:
58 |         all_matrixes = alignment_IN.readlines()
59 |         nbest_lines = nbest_IN.readlines()
60 |         source_lines = source_IN.readlines()
61 |         assert len(all_matrixes) == len(nbest_lines), "The number of lines does not match!"
62 | 
63 |         for target_index in range(len(all_matrixes)):
64 |             jdata = json.loads(all_matrixes[target_index])
65 |             target_line = nbest_lines[target_index]
66 |             elements = target_line.strip().split("|||")
67 |             refer_index = int(elements[0].strip())
68 |             source_sent = source_lines[refer_index].strip()
69 |             target_sent = elements[1].strip()
70 | 
71 |             jdata["source_sent"] = source_sent
72 |             jdata["target_sent"] = target_sent
73 |             jdata["id"] = refer_index
74 |             jdata["prob"] = 0 #float(elements[2].strip().split()[1])
75 | 
76 |             #jdata = json.dumps(jdata)
77 |             jdata = json.dumps(jdata).decode('unicode-escape').encode('utf8')
78 |             alignment_OUT.write(jdata + "\n")
79 | 
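# The shape of each line that combine_source_target_text() above writes to
# "<saveto>_withwords.json", sketched with hypothetical values (an editor's
# illustration; the n-best input is expected in Moses-style format,
# "id ||| translation ||| scores"):
#
#     {"matrix": [[0.9, 0.1], [0.2, 0.8]],  # (target_words+1) x (source_words+1) attention weights
#      "source_sent": "ein Beispiel",
#      "target_sent": "an example",
#      "id": 0,
#      "prob": 0}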
83 | """ 84 | source_IN.seek(0) 85 | target_IN.seek(0) 86 | alignment_IN.seek(0) 87 | with open(saveto + "_withwords.json", "w") as alignment_OUT: 88 | 89 | all_matrixes = alignment_IN.readlines() 90 | target_lines = target_IN.readlines() 91 | source_lines = source_IN.readlines() 92 | assert len(all_matrixes) == len(target_lines), "The number of lines does not match with each other!" 93 | 94 | for target_index in range(len(all_matrixes)): 95 | jdata = json.loads(all_matrixes[target_index]) 96 | 97 | jdata["source_sent"] = source_lines[target_index].strip() 98 | jdata["target_sent"] = target_lines[target_index].strip() 99 | jdata["id"] = target_index 100 | jdata["prob"] = 0 #float(elements[2].strip().split()[1]) 101 | 102 | #jdata = json.dumps(jdata) 103 | jdata = json.dumps(jdata).decode('unicode-escape').encode('utf8') 104 | alignment_OUT.write(jdata + "\n") 105 | 106 | 107 | def convert_to_nodes_edges_v1(filename): 108 | """ 109 | Take as input the aligned file with file names ".withtext", and convert this into a file with nodes and edges. 110 | Which will later used for Visualization. 111 | """ 112 | with open(filename, "r") as IN: 113 | with open(filename + ".forweb" , "w") as OUT: 114 | in_lines = IN.readlines() 115 | for data in in_lines: 116 | data4web = convert_to_nodes_edges_each_v1(data) 117 | OUT.write(data4web + "\n") 118 | 119 | def convert_to_nodes_edges_each_v1(data): 120 | """ 121 | give a single data object string, convert it into a json data string that is compatible with the Web interface. 122 | """ 123 | jdata = json.loads(data) 124 | web_data = {} 125 | source_words = jdata["source_sent"].strip().split() 126 | target_words = jdata["target_sent"].strip().split() 127 | 128 | ###make the data for source and target words 129 | web_data["nodes"] = [] 130 | for word in source_words: 131 | web_data["nodes"].append({"name":word, "group": 1}) 132 | web_data["nodes"].append({"name":" 2 | 3 | 4 | 5 | 6 | Visualization Demo 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 25 | 26 | 34 | 37 | 38 | 52 | 53 |
--------------------------------------------------------------------------------
/utils/attention_web.php:
--------------------------------------------------------------------------------
[HTML/JavaScript visualization page (~215 lines); its markup was stripped when this dump was generated, and only the page title "Visualization Demo" survives]
--------------------------------------------------------------------------------