├── example
│   ├── working-dir
│   │   └── .ignore
│   ├── data
│   │   ├── parallelC.de-en.parsed_target.en
│   │   ├── parallelC.de-en.parsed_both.de
│   │   ├── parallelC.de-en.parsed_both.en
│   │   ├── parallelC.de-en.parsed_target.de
│   │   ├── parallelC.de-en.en
│   │   └── parallelC.de-en.de
│   ├── README.md
│   ├── toy_example.config
│   ├── toy_example_2015.config
│   ├── toy_example_2015_2.config
│   ├── toy_example_2015_3.config
│   ├── toy_example_2015_4.config
│   ├── toy_example_2015_5.config
│   └── toy_example_2015_6.config
├── emnlp2015
│   ├── split_and_restructure.sh
│   ├── detruecase_ptkvz.sh
│   ├── oov_filter.py
│   ├── unbinarize.py
│   ├── fst_wrapper.py
│   ├── binarize.py
│   ├── separable_prefix_postprocessing.py
│   ├── separable_prefix.py
│   └── hyphen-splitter.py
├── README.md
├── enrich_labelset.py
└── hybrid_compound_splitter.py

/example/working-dir/.ignore:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/example/data/parallelC.de-en.parsed_target.en:
--------------------------------------------------------------------------------
1 | parallelC.de-en.en
--------------------------------------------------------------------------------
/example/data/parallelC.de-en.parsed_both.de:
--------------------------------------------------------------------------------
1 | parallelC.de-en.parsed.de
--------------------------------------------------------------------------------
/example/data/parallelC.de-en.parsed_both.en:
--------------------------------------------------------------------------------
1 | parallelC.de-en.parsed.en
--------------------------------------------------------------------------------
/example/data/parallelC.de-en.parsed_target.de:
--------------------------------------------------------------------------------
1 | parallelC.de-en.parsed.de
--------------------------------------------------------------------------------
/emnlp2015/split_and_restructure.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | #perform compound splitting and particle verb restructuring
4 | 
5 | script_dir=$1
6 | shift
7 | smor=$1
8 | shift
9 | 
10 | $script_dir/hybrid_compound_splitter.py \
11 |     -smor $smor \
12 |     -write-filler -no-truecase -q -syntax -fewest -dependency $@ \
13 |     | $script_dir/emnlp2015/hyphen-splitter.py -syntax \
14 |     | $script_dir/emnlp2015/separable_prefix.py $smor
--------------------------------------------------------------------------------
/emnlp2015/detruecase_ptkvz.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # EMS hack: do post-processing of particle verbs in detruecase step;
4 | # instead of string translation output, we need tree output that we take from -Ttree file.
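# usage sketch, inferred from the code below (paths are illustrative):
#   ./detruecase_ptkvz.sh /path/to/wmt2014-scripts decoder-tree-output
# the second argument is the -Ttree file mentioned above; the lines marked "Full Tree" are extracted from it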
5 | 6 | script_dir=$1 7 | shift 8 | 9 | grep "Full Tree" $1 | cut -f 2- -d ":" | cut -f "2-" -d " " | \ 10 | python3 $script_dir/emnlp2015/unbinarize.py | \ 11 | python $script_dir/emnlp2015/separable_prefix_postprocessing.py 12 | -------------------------------------------------------------------------------- /emnlp2015/oov_filter.py: -------------------------------------------------------------------------------- 1 | # filter out all phrases in a phrase table that contain words that are not in 2 | # the provided vocabulary file 3 | 4 | # usage: python oov_filter.py vocabulary_file < phrase_table_in > phrase_table_out 5 | 6 | import sys 7 | 8 | vocab = open(sys.argv[1]).readlines() 9 | vocab = set([item.strip() for item in vocab]) 10 | 11 | discarded = open('discarded','w') 12 | 13 | count = 0 14 | dcount = 0 15 | for line in sys.stdin: 16 | count += 1 17 | linesplit = line.split('|||') 18 | for word in linesplit[1].split()[:-1]: 19 | if word.startswith('['): 20 | continue 21 | elif word not in vocab: 22 | discarded.write(line) 23 | dcount += 1 24 | break 25 | else: 26 | print line, 27 | 28 | sys.stderr.write('{0} out of {1} lines discarded\n'.format(dcount, count)) 29 | -------------------------------------------------------------------------------- /emnlp2015/unbinarize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | 6 | from __future__ import print_function, unicode_literals 7 | import sys 8 | import tree 9 | import re 10 | 11 | whitespace = re.compile('\s+') 12 | 13 | def get_unbinarized_children(t, children=None): 14 | 15 | if children is None: 16 | children = [] 17 | 18 | for child in t: 19 | if isinstance(child, tree.Tree) and child.node.startswith('^'): 20 | get_unbinarized_children(child, children) 21 | else: 22 | children.append(child) 23 | 24 | if not isinstance(t, tree.Tree) or t.node.startswith('^'): 25 | return 26 | else: 27 | t[:] = children 28 | for child in t: 29 | get_unbinarized_children(child) 30 | 31 | 32 | 33 | if __name__ == '__main__': 34 | for line in sys.stdin: 35 | t = tree.Tree(line) 36 | get_unbinarized_children(t) 37 | print(whitespace.sub(' ',t.__str__())) -------------------------------------------------------------------------------- /emnlp2015/fst_wrapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # Author: Beat Kunz 4 | 5 | from __future__ import unicode_literals, print_function 6 | import sys 7 | import re 8 | import pexpect 9 | 10 | 11 | class FstWrapper(): 12 | def __init__(self, smor_binary, smor_model): 13 | self.child = pexpect.spawnu(smor_binary + ' ' + smor_model) 14 | self.child.delaybeforesend = 0 15 | self.child.expect(["analyze> ", pexpect.EOF], timeout=600) 16 | self.morAnalyseMode = True 17 | before = self.child.before 18 | if self.child.terminated: 19 | raise RuntimeError(before) 20 | 21 | def analyse(self, word): 22 | word = word.strip() 23 | if word == "" or word == "q" or word == "\x7f": 24 | return [] 25 | # if not in analyse mode, go to it 26 | if self.morAnalyseMode == False: 27 | # print "Was not in analyse mode => toggle to it!" 
28 | self.toggleMorMode() 29 | self.child.sendline("") # "" is used in the fst-mor to toggle between analyse/generate 30 | self.child.expect(["analyze> ", pexpect.EOF]) 31 | self.child.before 32 | self.child.sendline(word) 33 | try: 34 | self.child.expect(["analyze> ", pexpect.EOF]) 35 | except pexpect.TIMEOUT: 36 | sys.stderr.write('Warning: timeout while waiting for fst-mor\n') 37 | sys.stderr.write('String: {0}'.format(word)) 38 | return [] 39 | result = self.child.before.split("\r\n")[1:-1] 40 | if len(result) == 1 and re.match("^no result for ", result[0]): 41 | result = [] 42 | return result 43 | 44 | def generate(self, word): 45 | word = word.strip() 46 | if word == "" or word == "q": 47 | return [] 48 | # if not in analyse mode, go to it 49 | if self.morAnalyseMode == True: 50 | # print "Was not in generate mode => toggle to it!" 51 | self.toggleMorMode() 52 | self.child.sendline("") # "" is used in the fst-mor to toggle between analyse/generate 53 | self.child.expect(["generate> ", pexpect.EOF]) 54 | self.child.before 55 | self.child.sendline(word) 56 | try: 57 | self.child.expect(["generate> ", pexpect.EOF]) 58 | except pexpect.TIMEOUT: 59 | sys.stderr.write('Warning: timeout while waiting for fst-mor\n') 60 | sys.stderr.write('String: {0}'.format(word)) 61 | return [] 62 | result = self.child.before.split("\r\n")[1:-1] 63 | if len(result) == 1 and re.match("^no result for ", result[0]): 64 | result = [] 65 | return result 66 | 67 | # if you just want to play around you can use this function 68 | def openShell(self): 69 | 70 | while True: 71 | input_string = raw_input("input<<<<") 72 | if input_string == "": 73 | self.toggleMorMode() 74 | self.child.sendline(input_string) 75 | if self.morAnalyseMode == True: 76 | self.child.expect(["analyze> ", pexpect.EOF]) 77 | else: 78 | self.child.expect(["generate> ", pexpect.EOF]) 79 | 80 | def toggleMorMode(self): 81 | self.morAnalyseMode = not self.morAnalyseMode -------------------------------------------------------------------------------- /example/README.md: -------------------------------------------------------------------------------- 1 | Toy EMS Config for String-to-Tree SMT System 2 | ============================================ 3 | 4 | The EMS configuration file `toy_example.config` documents the settings used to 5 | build a string-to-tree SMT system like our submission to WMT 2014. 6 | 7 | The configuration only uses some toy data that is provided with this repository, 8 | but a full-scale system can be implemented by replacing references to 9 | `parallelA`, `parallelB` and `monolingualA` with real data sets, and changing 10 | the tuning and evaluation sets. 11 | 12 | Main differences from the WMT 2014 submission: 13 | 14 | - this config does not include syntactic constraints 15 | - this config does not filter the tuning set to short sentences 16 | 17 | The file `toy_example_2015.config` shows the base configuration of the WMT 2015 submissions. 18 | It includes tuning on the head-word chain metric (HWCM), and some updated settings. 
19 | 20 | - `toy_example_2015_2.config` adds head binarization 21 | - `toy_example_2015_3.config` adds a relational dependency language model 22 | - `toy_example_2015_4.config` adds source-syntactic constraints 23 | - `toy_example_2015_5.config` adds a 5-gram neural language model 24 | - `toy_example_2015_6.config` slighly modifies compound splitting, and adds particle verb restructuring 25 | 26 | [on real-sized data, some steps (such as parsing and training neural networks on all monolingual data) 27 | may take a long time, and you may want to consider to manually distribute the workload over many machines, 28 | and/or to only parse the parallel data and train neural networks on a subset of data, and/or for fewer epochs.] 29 | 30 | `toy_example_2015_5.config` contains all models of our official WMT 2015 submission (uedin-syntax); our submission contains two manual "hacks" not automated by EMS: 31 | - we remove all virtual nodes from the tree binarization (those starting in "^") from `model/unknown-word-soft-matches.*` 32 | [this means that unknown words are not allowed to match those nodes; RDLM produces lots of warnings if these matches are allowed] 33 | - we remove all rule table entries from `model/phrase-table.*` whose target side contains words that are not in the vocabulary of RDLM and NPLM. 34 | [this avoids problems with poor probability estimates for those translations] 35 | (see `emnlp2015/oov_filter.py`) 36 | 37 | 38 | 39 | Instructions 40 | ------------ 41 | 42 | 1. download and install all required software 43 | 44 | - mosesdecoder (http://statmt.org/moses/) 45 | - ParZu (https://github.com/rsennrich/ParZu) 46 | - mgiza (https://github.com/moses-smt/mgiza) 47 | - SRILM (http://www.speech.sri.com/projects/srilm/) [LM training could also be done with other tools, but SRILM is still used for interpolation] 48 | 49 | for some configs, also install the following: 50 | - NPLM (https://github.com/rsennrich/nplm/) for RDLM and NPLM toy_example_2015_{3,5} 51 | if you use NPLM, (re-)compile Moses with the option "--with-nplm=" 52 | - Stanford CoreNLP (http://nlp.stanford.edu/software/corenlp.shtml) for English parsing for toy_example_2015_4 53 | - Maltparser (http://www.maltparser.org/) for projectivization of English parse trees for toy_example_2015_4 54 | 55 | 2. set the paths in the first 20 lines of `toy_example.config` 56 | 57 | 3. run EMS with the example configuration. Models etc. are written to `working-dir` 58 | 59 | /path/to/mosesdecoder/scripts/ems/experiment.perl --config toy_example.config --exec 60 | 61 | 62 | Common issues 63 | ------------- 64 | 65 | these configs were tested with moses commit 5d8af9c (29 May 2015), and 89d16a4 (31 July 2015). 
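If RDLM produces warnings about soft matches against virtual nodes, or rules with out-of-vocabulary target words receive poor scores, the two manual steps described further above can be approximated as follows. This is only a sketch: the run number, file names and the vocabulary file are illustrative, and the vocabulary file is assumed to contain one word per line (the format read by `emnlp2015/oov_filter.py`).

    # drop soft matches for virtual nodes (labels containing "^") introduced by binarization
    grep -vF '^' model/unknown-word-soft-matches.1 > model/unknown-word-soft-matches.1.filtered

    # keep only rule table entries whose target-side words are in the RDLM/NPLM vocabulary
    zcat model/phrase-table.1.gz | python /path/to/wmt2014-scripts/emnlp2015/oov_filter.py neural-vocab.txt | gzip > model/phrase-table.1.filtered.gz

(`oov_filter.py` also writes the discarded entries to a file named `discarded` in the current directory.)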
66 | -------------------------------------------------------------------------------- /emnlp2015/binarize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | # perform deterministic head binarization of trees that were converted from dependency format (with mosesdecoder/scripts/training/wrappers/conll2mosesxml.py): 6 | # right-binarization of the head and its pre-modifiers, followed by left-binarization of all post-modifiers 7 | 8 | from __future__ import print_function, unicode_literals 9 | import sys 10 | import codecs 11 | from collections import defaultdict 12 | 13 | try: 14 | from lxml import etree as ET 15 | except ImportError: 16 | from xml.etree import cElementTree as ET 17 | 18 | def escape_xml(element): 19 | 20 | if element.text: 21 | element.text = element.text.replace('\'',''') 22 | element.text = element.text.replace('"','"') 23 | 24 | for child in element: 25 | escape_xml(child) 26 | 27 | def escape_text(s): 28 | 29 | s = s.replace('|','|') # factor separator 30 | s = s.replace('[','[') # syntax non-terminal 31 | s = s.replace(']',']') # syntax non-terminal 32 | 33 | s = s.replace('&apos;',''') # lxml is buggy if input is escaped 34 | s = s.replace('&quot;','"') # lxml is buggy if input is escaped 35 | 36 | return s 37 | 38 | # assume dependency structure where each nonterminal has exactly one pre-terminal child, which is the head of the structure. 39 | def find_head(xml): 40 | for i, child in enumerate(xml): 41 | if len(child) == 0: 42 | return i 43 | # if no head found, we pick the last child (which results in right-binarization of tree) 44 | return len(xml)-1 45 | 46 | def binarize(xml, mode): 47 | 48 | for child in xml: 49 | binarize(child, mode) 50 | 51 | if len(xml) > 2 and mode == 'head': 52 | head_position = find_head(xml) 53 | # right-binarize head position and everything before it 54 | while head_position > 0 and len(xml) > 2: 55 | head_position -= 1 56 | virtual_node = ET.Element('tree') 57 | if head_position > 0: 58 | # prefix '^i' marks that we expect more siblings on the left (and possibly on the right) 59 | virtual_node.set('label', '^i' + xml.get('label')) 60 | else: 61 | # prefix '^l' marks that we reached beginning of structure and have more siblings on the right 62 | virtual_node.set('label', '^l' + xml.get('label')) 63 | virtual_node.append(xml[head_position]) 64 | virtual_node.append(xml[head_position]) 65 | xml.insert(head_position, virtual_node) 66 | # left-binarize the rest 67 | while len(xml) > 2: 68 | virtual_node = ET.Element('tree') 69 | virtual_node.set('label', '^l' + xml.get('label')) 70 | virtual_node.append(xml[0]) 71 | virtual_node.append(xml[0]) 72 | xml.insert(0, virtual_node) 73 | 74 | else: 75 | while len(xml) > 2: 76 | virtual_node = ET.Element('tree') 77 | virtual_node.set('label', '^' + xml.get('label')) 78 | if mode == 'left': 79 | virtual_node.append(xml[0]) 80 | virtual_node.append(xml[0]) 81 | xml.insert(0, virtual_node) 82 | elif mode == 'right': 83 | virtual_node.append(xml[-2]) 84 | virtual_node.append(xml[-1]) 85 | xml.append(virtual_node) 86 | 87 | if __name__ == '__main__': 88 | 89 | if sys.version_info < (3, 0): 90 | sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) 91 | sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) 92 | sys.stdin = codecs.getreader('UTF-8')(sys.stdin) 93 | 94 | mode = sys.argv[1] 95 | 96 | for line in sys.stdin: 97 | if line == '\n': 98 | sys.stdout.write(line) 99 | continue 100 | xml 
= ET.fromstring(line) 101 | binarize(xml, mode) 102 | escape_xml(xml) 103 | sys.stdout.write(escape_text(ET.tostring(xml, encoding="UTF-8").decode("UTF-8") + '\n')) -------------------------------------------------------------------------------- /emnlp2015/separable_prefix_postprocessing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | # restore original representation of particle verbs. 6 | # described in Rico Sennrich and Barry Haddow (2015). A Joint Dependency Model of Morphological and Syntactic Structure for Statistical Machine Translation. Proceedings of EMNLP. 7 | 8 | from __future__ import unicode_literals 9 | import sys 10 | import codecs 11 | import tree 12 | 13 | def first_leaf(node): 14 | if isinstance(node, tree.Tree) and len(node): 15 | return first_leaf(node[0]) 16 | else: 17 | return node 18 | 19 | def last_leaf(node): 20 | if isinstance(node, tree.Tree) and len(node): 21 | return last_leaf(node[-1]) 22 | else: 23 | return node 24 | 25 | def comma_enclosure(node): 26 | comma = False 27 | if len(node): 28 | if first_leaf(node).strip() == b',' and not node.node.startswith('kon'): 29 | comma = True 30 | if comma and len(node) > 1 and last_leaf(node).strip() != b',': 31 | node.append(tree.Tree(b'[comma [$, ,]]')) 32 | return 33 | elif isinstance(node, tree.Tree) and len(node): 34 | comma_enclosure(node[-1]) 35 | 36 | def convert_ptkvz(node): 37 | 38 | part = None 39 | avz = None 40 | v_pos = None 41 | 42 | for i,child in list(enumerate(node)): 43 | if isinstance(child, tree.Tree): 44 | convert_ptkvz(child) 45 | 46 | if child.node == b'avz': 47 | for grandchild in child: 48 | if grandchild.node == b'PTKVZ': 49 | avz = grandchild 50 | avz_pos = i 51 | 52 | elif child.node == b'part': 53 | for grandchild in child: 54 | if grandchild.node == b'PTKZU': 55 | part = grandchild 56 | part_pos = i 57 | 58 | elif child.node.startswith(b'V'): 59 | v_pos = i 60 | if avz is not None: 61 | # infinitive with zu-prefix and 62 | if child.node == b'VVINF' and part is not None and avz is not None and part_pos == i-2 and avz_pos == i-1: 63 | child[0] = avz[0] + part[0] + child[0] 64 | del node[part_pos] 65 | del node[part_pos] 66 | avz = None 67 | child.node = b'VVIZU' 68 | 69 | elif child.node in [b'VVINF', b'VVPP'] and avz is not None and avz_pos == i-1: 70 | child[0] = avz[0] + child[0] 71 | del node[avz_pos] 72 | avz = None 73 | 74 | # we are not in main clause, so we should concatenate prefix and verb 75 | elif avz is not None and (node.node in [b'objc', b'subjc', b'neb', b'rel', b'aux', b'root', b'vkon_sub'] or node.node.startswith(b'obji') or node.node.startswith(b'kon')) and avz_pos == i-1: 76 | child[0] = avz[0] + child[0] 77 | avz = None 78 | del node[avz_pos] 79 | 80 | # identify end field by fact that subordinated clause follows 81 | elif avz is not None and v_pos is not None and (child.node in [b'objc', b'obji', b'subjc', b'rel', b'neb', b'vroot', b'comma', b'aux'] or child.node.startswith(b'kon') or child.node.startswith(b'obji')): 82 | node.insert(i, avz) 83 | del node[avz_pos] 84 | avz = None 85 | comma_enclosure(node[i-1]) 86 | 87 | # we insert avz as last dependent if we haven't already 88 | if v_pos is not None and avz is not None: 89 | node.append(avz) 90 | del node[avz_pos] 91 | comma_enclosure(node[-2]) 92 | 93 | 94 | 95 | if __name__ == '__main__': 96 | 97 | for line in sys.stdin: 98 | my_tree = tree.Tree(line) 99 | convert_ptkvz(my_tree) 100 | if '--tree' 
in sys.argv: 101 | sys.stdout.write(my_tree._pprint_flat(nodesep=b'', parens=b'[]', quotes=False) + b'\n') 102 | else: 103 | sys.stdout.write(b' '.join([leaf for leaf in my_tree.leaves() if leaf not in [b'', b'']]) + b'\n') -------------------------------------------------------------------------------- /emnlp2015/separable_prefix.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | # normalize representation of German particle verbs to common representation 6 | # described in Rico Sennrich and Barry Haddow (2015). A Joint Dependency Model of Morphological and Syntactic Structure for Statistical Machine Translation. Proceedings of EMNLP. 7 | 8 | from __future__ import print_function, unicode_literals 9 | import sys 10 | import codecs 11 | from collections import defaultdict 12 | 13 | import fst_wrapper 14 | 15 | from lxml import etree as ET 16 | 17 | def get_text(element, text): 18 | if element.text: 19 | text.append(element.text) 20 | for child in element: 21 | get_text(child, text) 22 | if element.tail: 23 | text.append(element.tail) 24 | 25 | def strip_xml(xml): 26 | text_list = [] 27 | get_text(xml, text_list) 28 | text = ' '.join([t.strip() for t in text_list]) 29 | return text 30 | 31 | def escape_xml(element): 32 | 33 | if element.text: 34 | element.text = element.text.replace('\'',''') 35 | element.text = element.text.replace('"','"') 36 | 37 | for child in element: 38 | escape_xml(child) 39 | 40 | def escape_text(s): 41 | 42 | s = s.replace('|','|') # factor separator 43 | s = s.replace('[','[') # syntax non-terminal 44 | s = s.replace(']',']') # syntax non-terminal 45 | 46 | s = s.replace('&apos;',''') # lxml is buggy if input is escaped 47 | s = s.replace('&quot;','"') # lxml is buggy if input is escaped 48 | 49 | return s 50 | 51 | def convert_ptkvz(xml): 52 | 53 | vvfin = None 54 | avz = None 55 | 56 | offset = 0 57 | for i, child in list(enumerate(xml)): 58 | # separate prefix from verbs 59 | if child.get('label').startswith('VV') and child.text: 60 | split = has_vpart(child.text.strip()) 61 | if split: 62 | avz = ET.Element('tree') 63 | avz.set('label', 'avz') 64 | ptkvz = ET.Element('tree') 65 | ptkvz.set('label', 'PTKVZ') 66 | ptkvz.text = split[0] 67 | avz.append(ptkvz) 68 | xml.insert(i+offset,avz) 69 | child.text = split[1] 70 | if split[1].startswith('zu') and split[2]: 71 | part = ET.Element('tree') 72 | part.set('label', 'part') 73 | ptkzu = ET.Element('tree') 74 | ptkzu.set('label', 'PTKZU') 75 | ptkzu.text = 'zu' 76 | part.append(ptkzu) 77 | xml.insert(i+offset,part) 78 | offset += 1 79 | child.text = split[1][2:] 80 | child.set('label', 'VVINF') 81 | offset += 1 82 | 83 | if child.get('label') == 'VVFIN': 84 | vvfin = child 85 | vvfin_pos = i+offset 86 | elif child.get('label') == 'avz': 87 | avz = child 88 | break 89 | 90 | # verb has separated prefix: reorder 91 | if vvfin is not None and avz is not None: 92 | xml.insert(vvfin_pos, avz) 93 | 94 | # recursion 95 | for child in xml: 96 | convert_ptkvz(child) 97 | 98 | 99 | def has_vpart(word): 100 | if word in smor_cache: 101 | return smor_cache[word] 102 | else: 103 | analyses = sorted(smor.analyse(word), key=lambda x: x.count('<')) 104 | analyses = [x for x in analyses if '<+V>' in x] 105 | if analyses and all('<#>' in line for line in analyses): 106 | prefix_len = analyses[0].index('<#>') 107 | if analyses[0].startswith(''): 108 | prefix_len -= 5 109 | has_zu = "" in analyses[0] 110 | 
smor_cache[word] = word[:prefix_len], word[prefix_len:], has_zu 111 | return word[:prefix_len], word[prefix_len:], has_zu 112 | else: 113 | smor_cache[word] = False 114 | return False 115 | 116 | 117 | if __name__ == '__main__': 118 | 119 | if '-train' in sys.argv: 120 | sys.exit(0) 121 | 122 | smor = fst_wrapper.FstWrapper('fst-mor', sys.argv[1]) 123 | smor_cache = {} 124 | 125 | if sys.version_info < (3, 0): 126 | sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) 127 | sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) 128 | sys.stdin = codecs.getreader('UTF-8')(sys.stdin) 129 | 130 | for line in sys.stdin: 131 | if line == '\n': 132 | sys.stdout.write(line) 133 | continue 134 | xml = ET.fromstring(line) 135 | convert_ptkvz(xml) 136 | escape_xml(xml) 137 | sys.stdout.write(escape_text(ET.tostring(xml, encoding="UTF-8").decode("UTF-8") + '\n')) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Scripts for Edinburgh English-German syntax system for WMT 2014 and WMT 2015 2 | ============================================================================ 3 | 4 | This repository contains scripts and an example config used for the Edinburgh syntax submission (UEDIN-SYNTAX) for the English-German 5 | shared translation task at the 2014 and 2015 Workshops on Statistical Machine Translation (http://www.statmt.org/wmt14/ http://www.statmt.org/wmt15/). 6 | 7 | The scripts will facilitate the reproduction of our results, and may be useful for people who want to use ParZu (or a different parser with the dependency format by Kilian Foth) for SMT, 8 | or string-to-tree systems in general. The hybrid compound splitter can also be used for phrase-based systems, and with German as source language. 9 | 10 | CONTENTS 11 | -------- 12 | 13 | - hybrid_compound_splitter.py 14 | 15 | compound splitter for German (hybrid of finite-state and corpus-based methods as described in Fritzinger & Fraser (2010)), 16 | with a novel syntactic representation of split compounds for simple compound merging after string-to-tree translation. 17 | The syntactic representation of split compounds is treebank independent and described in Sennrich, Williams and Huck (2014). 18 | 19 | The system to WMT 2014 used the following commands for training/applying the compound splitter: 20 | 21 | `hybrid_compound_splitter.py -train -syntax -corpus INPUT_FILE -model MODEL_FILE` 22 | `hybrid_compound_splitter.py -write-filler -no-truecase -q -syntax -smor zmorge-{version}-smor_newlemma.a -model MODEL_FILE < INPUT_FILE > OUTPUT_FILE` 23 | 24 | In a string-to-tree system with a syntactic representation of compounds, 25 | just apply the following regex substitution to the output for compound merging: 26 | 27 | `s/ \@(.*?)\@ /\1/g;` 28 | 29 | - enrich_labelset.py 30 | 31 | modification of ParZu dependency label set for SMT, splitting up overgeneral labels into distinct subtypes. 32 | This script can be applied to ParZu output in CONLL format (before conversion into moses format 33 | with the script included in mosesdecoder under `scripts/training/wrappers/conll2mosesxml.py`). 34 | 35 | Use command line option `--wmt14` to activate the modifications used for the submission. 
36 | Assuming you have the (German-side) tokenized corpus as `INPUT_FILE`, the Moses parsed files are generated as follows: 37 | 38 | ``` 39 | /path/to/mosesdecoder/scripts/tokenizer/deescape-special-chars.perl < INPUT_FILE | \ 40 | /path/to/ParZu/parzu -i tokenized_lines --projective | \ 41 | enrich_labelset.py --wmt14 | \ 42 | /path/to/mosesdecoder/scripts/training/wrappers/conll2mosesxml.py 43 | ``` 44 | 45 | - emnlp2015/* 46 | 47 | scripts used for tree binarization, verb particle restructuring, and (a modified) compound splitting. 48 | The techniques are described in Sennrich and Haddow (2015). 49 | 50 | - example/toy_example*.config 51 | 52 | toy configs for the moses experimental management system (EMS) that document good settings for training 53 | string-to-tree system, and automates the integration of ParZu, compound splitting, tuning on a syntactic 54 | metric, a relational dependency language model, and other models into the training process. 55 | The different toy examples also document our submissions to the WMT 2014/5 shared translation tasks. 56 | 57 | To facilitate reproduction of our results, parses of the German WMT data sets have been released: 58 | http://statmt.org/rsennrich/parsed_wmt/ 59 | 60 | LICENSE 61 | ------- 62 | 63 | The scripts are available under the LGPL v2. 64 | 65 | PUBLICATIONS 66 | ------------ 67 | 68 | The Edinburgh syntax submission to WMT 2014 is described in: 69 | 70 | Philip Williams, Rico Sennrich, Maria Nadejde, Matthias Huck, Eva Hasler and Philipp Koehn (2014): 71 | Edinburgh's Syntax-Based Systems at WMT 2014. In: Proceedings of the Ninth Workshop on Statistical Machine Translation. 72 | 73 | More details are provided in: 74 | 75 | Rico Sennrich, Philip Williams, Matthias Huck (2015): 76 | A tree does not make a well-formed sentence: Improving syntactic string-to-tree statistical machine translation with more linguistic knowledge. 77 | In: Computer Speech & Language, 32(1):27-45. Hybrid Machine Translation: integration of linguistics and statistics. 78 | 79 | The Edinburgh syntax submission to WMT 2015 is described in: 80 | 81 | Philip Williams, Rico Sennrich, Maria Nadejde, Matthias Huck and Philipp Koehn (2015): 82 | Edinburgh's Syntax-Based Systems at WMT 2015. 83 | In: Proceedings of the Tenth Workshop on Statistical Machine Translation. Lisbon, Portugal, pp. 199-209. 84 | 85 | More details are provided in: 86 | 87 | Rico Sennrich (2015): 88 | Modelling and Optimizing on Syntactic N-Grams for Statistical Machine Translation. 89 | In: Transactions of the Association for Computational Linguistics 3, 169--182. 90 | 91 | Rico Sennrich and Barry Haddow (2015): 92 | A Joint Dependency Model of Morphological and Syntactic Structure for Statistical Machine Translation. 93 | In: Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing. Lisbon, Portugal, pp. 2081-2087. -------------------------------------------------------------------------------- /emnlp2015/hyphen-splitter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | # hyphen splitter: splits all hyphenated words, and with option -syntax, creates a hierarchical tree in moses XML format. 
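# illustrative behaviour of the plain (non-syntax) mode, following the regex substitution below:
#   "Test-Datei"  ->  "Test @-@ Datei"    (default)
#   "Test-Datei"  ->  "Test-@@ Datei"     (with -merge-filler)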
6 | 7 | from __future__ import division, unicode_literals 8 | import sys 9 | import re 10 | import codecs 11 | import argparse 12 | 13 | from lxml import etree as ET 14 | 15 | def create_compound_xml(element, wordlist, merge_junctures, dependency, initial=False): 16 | 17 | # separate last segment, then recursively label remainder as compound modifier 18 | if initial: 19 | juncture = '' 20 | dep = ET.Element('tree') 21 | dep.set('label', 'SEGMENT') 22 | dep.text = wordlist[-1] 23 | remainder = wordlist[:-1] 24 | if remainder: 25 | create_compound_xml(element, remainder, merge_junctures, dependency) 26 | element.append(dep) 27 | return 28 | 29 | juncture = wordlist[-1] 30 | word = wordlist[-2] 31 | remainder = wordlist[:-2] 32 | 33 | head = ET.Element('tree') 34 | head.set('label', 'comp_mod') 35 | element.append(head) 36 | 37 | dep1 = ET.Element('tree') 38 | dep1.text = word 39 | if merge_junctures: 40 | dep1.set('label', 'SEGMENT+JUNC') 41 | else: 42 | dep1.set('label', 'SEGMENT') 43 | 44 | if remainder: 45 | create_compound_xml(head, remainder, merge_junctures, dependency) 46 | 47 | head.append(dep1) 48 | 49 | dep2 = ET.Element('tree') 50 | dep2.set('label', 'JUNC') 51 | dep2.text = juncture 52 | dep3 = ET.Element('tree') 53 | dep3.set('label', 'junc') 54 | dep3.append(dep2) 55 | head.append(dep3) 56 | 57 | 58 | def main(file_obj, merge_junctures, syntax, dependency): 59 | 60 | re_syntax_splitter = re.compile(r'((?:\s*(?:<[^<>]*>)+\s*)|(?:(?)\s+(?!<)))') 61 | re_hyphen_splitter = re.compile(r'(\S+?)\-(?=\S)') 62 | 63 | for line in file_obj: 64 | 65 | # only do syntactic processing if option syntax is used and we see '<' in line 66 | write_syntax = syntax 67 | if write_syntax and not '<' in line: 68 | write_syntax = False 69 | 70 | if write_syntax: 71 | words_in = re_syntax_splitter.split(line) 72 | words_in_clean = [word for word in words_in if word and not word.startswith('<') and not word == ' '] 73 | else: 74 | words_in = line.split() 75 | words_in_clean = words_in 76 | 77 | words = [] 78 | for word in words_in: 79 | 80 | if not word: 81 | continue 82 | if word == ' ' or (write_syntax and word.startswith('<')) or word == '@-@': 83 | words.append(word) 84 | continue 85 | 86 | if merge_junctures: 87 | word = re_hyphen_splitter.sub(r'\1-@@ ', word) 88 | else: 89 | word = re_hyphen_splitter.sub(r'\1 @-@ ', word) 90 | 91 | if write_syntax and len(word.split()) > 1: 92 | head = ET.Element('x') 93 | create_compound_xml(head, word.split(), merge_junctures, dependency, initial=True) 94 | word = ET.tostring(head, encoding="UTF-8")[3:-4].decode("UTF-8") 95 | word = word.rsplit('<',1)[0] 96 | words[-1] = words[-1].rsplit('<',1)[0] 97 | 98 | words.append(word) 99 | 100 | if write_syntax: 101 | sys.stdout.write(''.join(words)) 102 | else: 103 | sys.stdout.write(' '.join(words) + '\n') 104 | 105 | 106 | def parse_arguments(): 107 | 108 | help_text = "hyphen splitter\n" 109 | 110 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=help_text) 111 | 112 | general = parser.add_argument_group('general options') 113 | 114 | general.add_argument('-model', metavar='MODEL', 115 | help='path to statistical model. Currently ignored.') 116 | general.add_argument('-corpus', type=argparse.FileType('r'), default=sys.stdin, metavar='PATH', 117 | help='input text (default: standard input).') 118 | general.add_argument('-train', action="store_true", 119 | help='train model on input text. 
Currently ignored.') 120 | general.add_argument('-syntax', action="store_true", 121 | help='input/output is syntactic tree') 122 | general.add_argument('-q', action="store_true", 123 | help='quiet mode.') 124 | general.add_argument('-dependency', action='store_true', 125 | help='dependency-like representation of compounds (ensure that every nonterminal in compound representation has exactly one preterminal)') 126 | 127 | general.add_argument('-merge-filler', action="store_true", dest='merge_junctures', 128 | help='concatenate hyphens with preceding word ("Test-@@ Datei" instead of "Test @-@ Datei")') 129 | 130 | args = parser.parse_args() 131 | 132 | return args 133 | 134 | if __name__ == '__main__': 135 | 136 | args = parse_arguments() 137 | 138 | VERBOSE = not args.q 139 | 140 | if sys.version_info < (3, 0): 141 | sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) 142 | args.corpus = codecs.getreader('UTF-8')(args.corpus) 143 | sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) 144 | 145 | if args.train: 146 | sys.exit(0) 147 | 148 | else: 149 | main(args.corpus, args.merge_junctures, args.syntax, args.dependency) 150 | -------------------------------------------------------------------------------- /example/data/parallelC.de-en.en: -------------------------------------------------------------------------------- 1 | The ECB wants to hold inflation to under two percent , or somewhere in that vicinity . 2 | They also predict that the ECB will cut interest rates twice during the course of 2008 . 3 | The infection was detected in a man who had been hospitalized after a major accident . 4 | About two-thirds of the infections -LRB- 683 cases -RRB- were caused by MRSAs , whose official name is a methicillin -LRB- oxacillin -RRB- resistant Staphylococcus aureus . 5 | One-third of all people carry the Staphylococcus aureus -LRB- SA -RRB- bacteria in their noses without it ever causing an infection . 6 | Analysts suggest that people who buy real estate in Bulgaria would also like to change their lifestyles . 7 | Hungary 's six percent 8 | The ministries of health , labour and local government organized more puritan dances . 9 | All of these factors influence the political stability of the region , '' it adds . 10 | The initiative triggered heated debate among the Bulgarian public , it added . 11 | March 15 is a national holiday commemorating the revolution of 1848 while October 23 commemorates the revolution of 1956 . 12 | B.Zs. : Of course it feels good to see more and more people at our concerts . 13 | That song is about much more . 14 | A Crisis Not Only in the U.S . 15 | `` All I ever wanted was to gain some money for my bank , '' he claims in his testimony . 16 | In the summer he managed to gain a half billion Euro from this American crisis . 17 | He entered Société Générale in the 2000 , immediately after he had finished his university studies in Lyon . 18 | Two years later , he was already an assistant to the broker , and in 2005 he became an independent broker with an annual income of $ 90,000 . 19 | `` As early as during my first interview in 2005 , I knew they despised me . 20 | But soon after Kerviel entered his career , the first big success followed . 21 | And that was the trigger mechanism - he wanted more . 22 | `` It 's like a snowball , '' he told the police . 23 | In a single day , he lost 1 billion Euro . 24 | When he came to the office on Monday , he was no longer employed by the bank . 25 | And a few days later , the bank announced it had lost 5 billion Euro because of him . 
26 | `` I took only four days out of the last year . 27 | A broker who does not take vacations is a broker who does not want anybody to look into his records , '' Kerviel concluded . 28 | Czech Republic has chance to get 365 billion . 29 | This is roughly half the total sum the Czech Republic can acquire from Brussels during the years 2007 to 2013 . 30 | The Transportation program is the biggest operational program , it represents almost 22 percent of the resources which the Czech Republic can obtain from the funds . 31 | Companies should obtain financial resources for the introduction of new technologies , for the industrial research , but also for the introduction of more effective methods of organization . 32 | These companies were imaginary and the accounts were actually held by his relatives and closest adherents . 33 | Each governance begins and ends with the individual chapters and items of the budget , like it or not . 34 | The budget is a substantial part of politics , but it is covered by the struggle for power between the government and the opposition . 35 | The politicians ' quarrels are much more interesting for the public than straight numbers . 36 | Our economy has already been passing through the phase of growth for a few years , as is the rest of the world , about which CSSD keeps boasting . 37 | This has been a unique opportunity for its governments to put public financing in order . 38 | Other countries , with a few exceptions such as Hungary , Albania , Pakistan or Egypt , have managed the money much more reasonably than our republic . 39 | We are among the countries which are , measured globally , endangered by poverty to the least extent . 40 | More than $ 100 billion will enter the monetary markets by means of public sales . 41 | Banks have already lost $ 60 billion due to the non-repaid loans , especially in the U.S . 42 | That will lower the consumption and send the American economics into a recession , with possible impact on the economic growth of the whole world . 43 | `` It will have the same effect as another decrease in the interest rates , '' Greg Salvaggio from Tempus Consulting told Reuters . 44 | The Czech president emphasized during the presentation of his book that its topic is very important for him , which is why he is so engaged in it . 45 | Today is the last day for people to apply for the renewal so that the authorities could manage to issue the new license by the end of the year . 46 | The validity of the licenses issued 2001-2004 will end in December 2013 . 47 | He can not get any more . 48 | In exchange , the gang surrendered 30 percent of the loot to Opava and Koňařík . 49 | The hospital 's attorney , Ondřej Dostál , therefore radically disapproves of Veleba 's proposition . 50 | `` We reject such a claim , it is too high , '' Dostál said after today 's meeting . 51 | We will find out if it has any informative value at all . 52 | Of raising money and devising the projects , '' he said . 53 | According to the court , Polívka used the appellation `` the Wallachian king '' for the first time in his TV show . 54 | Nevertheless , at the time it was not related to the intention to build a fictitious kingdom . 55 | The collaboration of currently quarrelling parties continued until the year 2000 , when Harabiš organized an official coronation in Vsetín for Polívka . 56 | In 2002 Polívka brought a lawsuit against Harabiš and his company , which the court overruled today . 57 | The king should be elected for a definite time span . 58 | Even Mr. 
Polívka can sign up , '' added Harabiš . 59 | The Test of Peugeot 207 SW 1.6 VTi : Melting Prejudices 60 | To tell the truth , I have never rooted for Peugeot 206 too much . 61 | But now there is its successor , the Peugeot 207 on the market , and it is far better . 62 | The tested wagon was equipped with a four-cylinder petrol engine with 1.6 l capacity and 88kW power . 63 | Nevertheless , we shall bear in mind that we have tested a fairly equipped car with a spacious interior and a very good engine . 64 | The complaints come from both sides . 65 | The above mentioned cases indicate that the mood between the locals and the immigrants at British schools is beginning to be strained . 66 | For its visitors it is really an island of freedom . 67 | Golden and lily-white beaches , high waves or a sea calm as a pond . 68 | Kite-surfing , golf and roulette . 69 | But there is just one island where you find much more , and that island is Cuba , the biggest and most beautiful of the West Indies . 70 | And then the American era came . 71 | The Hotel Nacional was built at the very end of the 1920s . 72 | Winston Churchill , Ava Gardner , Frank Sinatra . 73 | Ernest Hemingway bought a homestead in San Francisco de Paula , not far from Havana , and he had a yacht in Cojímar . 74 | The movie stars and politicians from all over America were leaving their signatures on the walls of Bodeguita del Medio . 75 | An ancient city which was established shortly after the Spaniards had colonized the island . 76 | Fame and fortune were brought there with the slave and sugar trades . 77 | The colonial Spanish architecture , deeply influenced by the later French colonists , has not changed much during four centuries . 78 | Nothing was being built there , therefore nothing was being torn down either , everybody was glad to have a roof over their head . 79 | In the evening , it is appropriate go to Casa de la Música , which becomes full of life at nightfall and the music abates there at dawn . 80 | According to the U.S. , the low number of soldiers and the lack of necessary equipment is influencing the course of the mission in Afghanistan . 81 | According to Secretary of Defense Robert Gates , NATO should design a concrete plan for the following years . 82 | `` The development in Afghanistan is tangible , but armed forces can only be part of the solution . 83 | The number of violent acts rose 27 percent compared with last year , as much as 60 percent in the southern Helmand province . 84 | The NATO countries have about 40 thousand soldiers in Afghanistan , about a quarter of whom are in the southern territory . 85 | `` We will have to think hard about the concrete goals for the coming era and the ways to reach them , '' said Gates . 86 | `` I am not happy about what we have on the ground and in the air in Afghanistan . 87 | There are some quarrels among the countries involved over whether all the allies are doing their best in Afghanistan . 88 | The German Bild fought back and printed a photo of a British unit during a rugby match in the southern part of Afghanistan . 89 | The total debt rose by the end of September to 842.7 billion crowns . 90 | During all of last year , the state incurred debts of more than 111 billion crowns . 91 | The average due date of the national debt rose to 6.5 years . 92 | Better than Expected 93 | Surely , Muhammad is the name of the Muslim prophet , but first of all it 's a very common name in Sudan . 
94 | Then , the children created a book for the class with the bear 's picture followed by the legend `` My name is Muhammad . '' 95 | Certainly , it 's a matter of interpretation . 96 | Troika mediators , which gathers the European Union , USA and Russia , will go for the last time to Serbia and Kosovo on Monday . 97 | Americans and Europeans consider that their mediation will end on the 10th December . 98 | The Russians , who have already blocked in the Security Council the granting of independence to Kosovo , are insisting on continuing negotiations beyond this deadline . 99 | `` Russia has been and remains committed to a negotiated solution , a compromise solution , '' said Botsane-Khartchenko . 100 | Pristina is ready to proclaim it unilaterally if the Security Council wo n't be in its favour . 101 | -------------------------------------------------------------------------------- /example/data/parallelC.de-en.de: -------------------------------------------------------------------------------- 1 | Die EZB ist bestrebt , die Inflationsrate unter zwei Prozent , oder zumindest knapp an der zwei-Prozent-Marke zu halten . 2 | Für 2008 rechnen Experten damit , dass die EZB die Zinsen zweimal senken wird . 3 | Das gegen Antibiotika resistente Bakterium wurde in einem männlichen Patienten gefunden , der nach einem schweren Unfall ins Krankenhaus eingeliefert wurde . 4 | Etwa zwei Drittel der Infektionen ( 683 Fälle ) wurden vom MRSA , dem gegen Methicillin-Oxacillin resistenten Staphylococcus aureus ausgelöst . 5 | Das Staphylococcus aureus ( SA ) Bakterium trägt etwa ein Drittel der Menschen in ihrer Nase , ohne dass es eine Krankheit verursachen würde . 6 | Analysten zufolge streben Menschen , die in Bulgarien Immobilien erwerben , auch eine Veränderung ihres Lebenswandels an . 7 | Ungarns sechs Prozent 8 | Die Ministerien für Gesundheit , Arbeit , und Selbstverwaltung bereiten sich jeweils auf puritanische Feier vor . 9 | Das alles hat Auswirkungen auf die politische Stabilität der Region " - hieß es . 10 | Und weiter : Die Initiative hätte eine heftige Debatte in der bulgarischen Öffentlichkeit losgetreten . 11 | Die Staatsfeiertage sind der 15. März , der an die Revolution von 1848 erinnert , und der 23. Oktober , der Gedenktag der Revolution von 1956 . 12 | B. Zs . : Natürlich ist es ein gutes Gefühl , zu sehen , dass immer mehr zu unseren Konzerten kommen . 13 | In diesem Lied geht es um mehr . 14 | Krise nicht nur in Amerika 15 | " Alles , was ich wollte , war für meine Bank Geld zu verdienen " , behauptet er in seiner Aussage . 16 | Im Sommer war es ihm gelungen , gerade an der amerikanischen Krise eine halbe Milliarde Euro zu verdienen . 17 | In die Société Générale trat er im Jahr 2000 , unmittelbar nachdem er sein Studium an der Universität Lyon beendet hatte , ein . 18 | Nach zwei Jahren war er bereits Assistent eines Maklers und im Jahr 2005 wurde er selbstständiger Makler mit einem Jahresgehalt von 90000 Dollar . 19 | " Schon bei meinem ersten Gespräch im Jahr 2005 wusste ich , dass man mich beobachtete . 20 | Aber schon bald nach seinem Eintritt kam der erste große Erfolg . 21 | Und das war der Auslöser - jetzt wollte er mehr . 22 | " Das ist wie ein Schneeball " , sagte er den Polizisten . 23 | An einem einzigen Tag verlor er eine Milliarde Euro . 24 | Als er am Montag zur Arbeit kam , war er schon nicht mehr Angestellter der Bank . 25 | Und ein paar Tage später verkündete die Bank , dass sie seinetwegen fünf Milliarden Euro verloren habe . 
26 | " Ich habe nur vier Tage vom Vorjahr genommen . 27 | Ein Makler , der keinen Urlaub nimmt , ist einer , der nicht will , dass man ihm in die Karten schaut " , sagte Kerviel abschließend . 28 | Tschechien hat die Chance , zu 365 Milliarden zu kommen 29 | Es geht hier ungefähr um die Hälfte der Summe , die Tschechien in den Jahren 2007 bis 2013 überhaupt aus Brüssel erhalten kann . 30 | Das Verkehrsprogramm ist das größte Entwicklungsprogramm und umfasst bis zu 22 Prozent der Mittel , die Tschechien aus dem Fonds erhalten kann . 31 | Die Firmen sollen dadurch Finanzen für die Einführung neuer Technologien , für Unternehmensforschung , aber auch für die Einführung effektiverer Organisationsmethoden gewinnen . 32 | Diese Firmen waren fiktiv und die Konten gehörten in Wirklichkeit seinen Verwandten und engsten Anhängern . 33 | Alles Regieren beginnt und endet mit den einzelnen Kapiteln und Posten des Haushalts , ob einem das gefällt oder nicht . 34 | Der Haushalt ist ein wesentlicher Bestandteil der Politik , wird aber verdeckt durch den Machtkampf der Regierung und der Opposition . 35 | Ein Streit unter Politikern ist für die Öffentlichkeit viel interessanter als nüchterne Zahlen . 36 | unsere Wirtschaft schon seit einigen Jahren , wie der Rest der Welt , eine Wachstumsphase durchläuft , deren sich die ČSSD ohne Unterlass rühmt . 37 | Das stellte eine einmalige Möglichkeit für ihre Regierungen dar , die öffentlichen Finanzen in Ordnung zu bringen . 38 | Andere Länder , abgesehen von ein paar Ausnahmen wie Ungarn , Albanien , Pakistan oder Ägypten , haben viel vernünftiger gewirtschaftet als unsere Republik . 39 | Wir gehören zu den Ländern , die , im globalen Maßstab gesehen , am wenigsten von Armut bedroht sind . 40 | So werden mehr als 100 Milliarden Dollar vermittels Auktionen auf die Währungsmärkte gelangen . 41 | Die Banken haben wegen nicht bedienter Kredite schon um 60 Milliarden Dollar verloren , vor allem in den Vereinigten Staaten . 42 | Das würde den Konsum verringern und die amerikanische Wirtschaft in eine Rezession stürzen , möglicherweise mit Auswirkungen auf das Wirtschaftswachstum auf der ganzen Welt . 43 | " Das wird den gleichen Effekt haben wie eine weitere Absenkung der Zinssätze " , sagte Greg Salvaggio von Tempus Consulting der Agentur Reuters . 44 | Der tschechische Präsident betonte bei der Vorstellung seines Buches , dass dessen Thema für ihn sehr wichtig sei , und dass er sich dafür deshalb so engagiere . 45 | Heute ist der letzte Tag , wo man den Umtausch beantragen kann , damit die Behörden es schaffen , bis zum Ende des Jahres einen neuen Führerschein auszustellen . 46 | Die Gültigkeit von Führerscheinen , die 2001 bis 2004 ausgestellt wurden , endet im Dezember 2013 . 47 | Zu mehr kann er nicht verurteilt werden . 48 | Die Bande überließ dafür Opava und Koňařík 30 Prozent der Beute . 49 | Der Rechtsvertreter des Krankenhauses , Ondřej Dostál , ist deshalb mit dem Vorschlag Velebas ganz und gar nicht einverstanden . 50 | " Einen solchen Vorschlag lehnen wir ab , er liegt um ein Vielfaches zu hoch " , sagte Dostál nach der heutigen Verhandlung . 51 | Wir werden feststellen , ob es überhaupt Aussagewert hat . 52 | Geldbeschaffung und Konzepte der Aktivitäten " , sagte er . 53 | Dem Gericht zufolge benutzte Polívka die Bezeichnung " Walachischer König " zum ersten Mal in einer seiner Fernsehsendungen . 54 | Das hing aber damals nicht mit der Absicht zusammen , ein fiktives Königreich zu begründen . 
55 | Die Zusammenarbeit der heute zerstrittenen Parteien dauerte bis zum Jahr 2000 , als Harabiš für Polívka in Vsetín die offizielle Krönung organisierte . 56 | 2002 reichte dann Polívka gegen Harabiš und seine Firma die Klage ein , die das Gericht heute abwies . 57 | Der König sollte für eine bestimmte Zeit gewählt werden . 58 | Auch Herr Polívka kann sich dabei ruhig anmelden " , fügte Harabiš hinzu . 59 | Test Peugeot 207 SW 1.6 VTi : Voruteile schwinden 60 | Um die Wahrheit zu sagen , ich war nie ein besonderer Fan des Peugeot 206 . 61 | Doch jetzt ist sein Nachfolger , der Peugeot 207 , auf dem Markt , und der ist um vieles besser . 62 | Unser Testwagen war mit einem Vierzylinder-Benzinmotor mit Hubraum 1,6 Liter und einer Leistung von 88 kWp ausgestattet . 63 | Wir müssen uns aber daran erinnern , dass wir ein gut ausgestattetes Auto mit einem geräumigen Inneren und einem sehr guten Motor getestet haben . 64 | Beschwerden kommen von beiden Seiten . 65 | Die angeführten Fälle bezeugen , dass die Stimmung zwischen Einheimischen und Zuwanderern an britischen Schulen immer angespannter wird . 66 | Für Besucher ist es wirklich eine Insel der Freiheit . 67 | Goldene und weiße Strände , hohe Wellen oder ein Meer , das ruhig ist wie ein Teich . 68 | Kitesurfing , Golf und Roulette . 69 | Nur auf einer Insel finden Sie aber noch viel mehr , und diese Insel ist Kuba , die größte und schönste der Großen Antillen . 70 | Und dann kam die amerikanische Epoche . 71 | Das Hotel Nacional entstand ganz am Ende des 20. Jahrhunderts . 72 | Winston Churchill , Ava Gardner , Frank Sinatra . 73 | Ernest Hemingway kaufte sich ein Anwesen in San Francisco de Paula , in der Nähe Havannas , und in Cojímar hatte er seine Jacht liegen . 74 | An den Wänden der Bodeguita del Medio hinterließen Stars des Showbusiness und Politiker aus ganz Amerika ihre Unterschriften . 75 | Eine uralte Stadt , die kurz , nachdem die Spanier die Insel kolonisiert hatten , gegründet wurde . 76 | Ruhm und Reichtum brachte ihr der Handel mit Sklaven und Zucker . 77 | Die spanische Kolonialarchitektur , stark beeinflusst von den späteren französischen Siedlern , hat sich seit vierhundert Jahren kaum verändert . 78 | Nichts wurde dort gebaut , aber deshalb auch nichts abgerissen , jeder war froh , dass er ein Dach über dem Kopf hatte . 79 | Am Abend sollte man dann in die Casa de la Música gehen , die sich nach der Dunkelheit belebt , und wo die Musik bis zum Morgen nicht schweigt . 80 | Zu wenig Soldaten und der Mangel an nötiger Ausrüstung beeinflusst den Vereinigten Staaten zufolge den Verlauf der Mission in Afghanistan . 81 | Die NATO sollte nach den Worten von Verteidigungsminister Robert Gates einen konkreten Plan für die nächsten Jahre entwerfen . 82 | " In Afghanistan ist eine Vorwärtsentwicklung spürbar , aber Militärmacht kann nur ein Teil der Lösung sein . 83 | Die Anzahl von Gewaltaktionen ist im Vergleich zum Vorjahr um 27 Prozent gestiegen , in der südlichen Provinz Helmand sogar um sechzig Prozent . 84 | Die NATO-Länder haben in Afghanistan etwa 40000 Soldaten stationiert , davon etwa ein Viertel im südlichen Territorium . 85 | " Wir werden ernsthaft über die konkreten Ziele für die nächste Zeit und über die Art und Weise nachdenken müssen , wie wir sie erreichen können " , sagte Gates . 86 | " Ich bin nicht erfreut darüber , was wir in Afghanistan zu Lande und in der Luft zur Verfügung haben . 
87 | Unter den beteiligten Ländern herrscht in den letzten Monaten auch Streit darüber , ob alle Verbündeten in Afghanistan ihr Bestes geben . 88 | Die deutsche Bildzeitung ging zum Gegenangriff über und druckte ein Bild einer britischen Einheit bei einem Rugby-Wettkampf im südlichen Teil Afghanistans ab . 89 | Die Gesamtschuld stieg bis Ende September auf 842,7 Milliarden Kronen . 90 | Über das gesamte vergangene Jahr verschuldete sich der Staat um mehr als 111 Milliarden Kronen . 91 | Die durchschnittliche Fälligkeit der Staatsschulden stieg auf 6,5 Jahre . 92 | Besser als erwartet 93 | Sicherlich , Mohammed ist der Name des islamischen Propheten , aber vor allem auch ein im Sudan sehr häufiger Vorname . 94 | Anschließend haben die Kinder ein Buch für die Klasse erstellt , mit dem Foto des Bären darauf und der Bildunterschrift “ Mein Name ist Mohammed “ . 95 | Dies ist selbstverständlich eine Frage der Interpretation . 96 | Die Vermittler der Troika , die die Europäische Union , die USA und Russland in sich vereint , werden sich am Montag ein letztes Mal in Serbien und im Kosovo treffen . 97 | Amerikaner und Europäer rechnen damit , dass ihr Vermittlungsauftrag am 10. Dezember enden wird . 98 | Die Russen , die bereits die Bewilligung der Unabhängigkeit des Kosovo beim Sicherheitsrat blockiert hatten , bestehen auf einer Fortsetzung der Verhandlungen über dieses Enddatum hinaus . 99 | „ Russland war und bleibt Fürsprecher einer ausgehandelten Lösung , einer einvernehmlichen Lösung “ , sagte Botsane-Khartchenko . 100 | Pristina zeigt sich bereit dazu , diese einseitig zu verkünden , wenn der Sicherheitsrat nicht zu deren Gunsten entscheiden sollte . 101 | -------------------------------------------------------------------------------- /example/toy_example.config: -------------------------------------------------------------------------------- 1 | ################################################ 2 | ### CONFIGURATION FILE FOR AN SMT EXPERIMENT ### 3 | ################################################ 4 | 5 | [GENERAL] 6 | 7 | ###### you need to set these paths to match your environemnt 8 | ###### 9 | 10 | moses-src-dir = /data/tools/mosesdecoder 11 | wmt2014-scripts = /data/smtworkspace/wmt2014-scripts 12 | parzu-path = /data/ParZu # https://github.com/rsennrich/ParZu 13 | zmorge-model = /data/zmorge/zmorge-20141224-smor_newlemma.a #get this (or a newer version) from http://kitt.ifi.uzh.ch/kitt/zmorge/ 14 | srilm-dir = /data/tools/srilm/bin/i686-m64/ 15 | external-bin-dir = ~/bin 16 | 17 | ###### no further changes should be required to run the toy example 18 | ###### (but feel free to experiment with different settings, or change the training/test data) 19 | 20 | moses-script-dir = $moses-src-dir/scripts 21 | moses-bin-dir = $moses-src-dir/bin 22 | toy-data = $wmt2014-scripts/example/data 23 | working-dir = $wmt2014-scripts/example/working-dir 24 | decoder = $moses-src-dir/bin/moses 25 | 26 | input-tokenizer = "$moses-script-dir/tokenizer/normalize-punctuation.perl $input-extension | $moses-script-dir/tokenizer/tokenizer.perl -l $input-extension -penn" 27 | output-tokenizer = "$moses-script-dir/tokenizer/normalize-punctuation.perl $output-extension | $moses-script-dir/tokenizer/tokenizer.perl -l $output-extension" 28 | input-truecaser = $moses-script-dir/recaser/truecase.perl 29 | output-truecaser = $moses-script-dir/recaser/truecase.perl 30 | detruecaser = $moses-script-dir/recaser/detruecase.perl 31 | 32 | # parsing pipeline used for WMT 2014 33 | output-parser = 
"$moses-script-dir/tokenizer/deescape-special-chars.perl | $parzu-path/parzu -i tokenized_lines --projective | $wmt2014-scripts/enrich_labelset.py --wmt14 | $moses-script-dir/training/wrappers/conll2mosesxml.py" 34 | 35 | # hybrid compound splitting (described in Sennrich, Williams and Huck, 2015) 36 | output-splitter = "$wmt2014-scripts/hybrid_compound_splitter.py -smor $zmorge-model -write-filler -no-truecase -q -syntax" 37 | 38 | # sed instructions unsplit the split compunds from output-splitter 39 | detokenizer = "$moses-script-dir/tokenizer/detokenizer.perl -l $output-extension | sed -r 's/ \@(\S*?)\@ /\1/g' | sed -r 's/\@\@ //g'" 40 | 41 | input-extension = en 42 | output-extension = de 43 | pair-extension = de-en 44 | 45 | generic-parallelizer = $moses-script-dir/ems/support/generic-multicore-parallelizer.perl 46 | 47 | jobs = 10 48 | 49 | ################################################################# 50 | # PARALLEL CORPUS PREPARATION: 51 | # create a tokenized, sentence-aligned corpus, ready for training 52 | 53 | [CORPUS] 54 | 55 | cores = 10 56 | 57 | ### tools to use to prepare the data 58 | # 59 | #tokenizer = 60 | #lowercaser = 61 | 62 | ### long sentences are filtered out, since they slow down GIZA++ 63 | # and are a less reliable source of data. set here the maximum 64 | # length of a sentence 65 | # 66 | max-sentence-length = 80 67 | 68 | ### GIZA++ does not allow sentence pairs of highly uneven length. 69 | # since uneven sentence length is an indicator of a misalignment, 70 | # we set a maximum ratio of 3 (this also gives us room for compoudn splitting) 71 | # 72 | cleaner = "$moses-script-dir/training/clean-corpus-n.perl -ratio 3" 73 | 74 | [CORPUS:parallelA] 75 | raw-stem = $toy-data/parallelA.$pair-extension 76 | 77 | [CORPUS:parallelB] 78 | raw-stem = $toy-data/parallelB.$pair-extension 79 | 80 | 81 | ################################################################# 82 | # LANGUAGE MODEL TRAINING 83 | 84 | [LM] 85 | 86 | cores = 10 87 | 88 | ### tool to be used for language model training 89 | # for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh) 90 | # 91 | lm-training = $srilm-dir/ngram-count 92 | settings = "-interpolate -kndiscount -unk" 93 | order = 5 94 | 95 | ### script to use for binary table format 96 | # (default: no binarization) 97 | # 98 | #lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm 99 | 100 | # kenlm, also set type to 8 101 | #lm-binarizer = $moses-src-dir/kenlm/build_binary 102 | #type = 8 103 | 104 | ### script to create quantized language model format 105 | # (default: no quantization) 106 | # 107 | #lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm 108 | 109 | ### tools to use to prepare the data 110 | # 111 | #tokenizer = 112 | #lowercaser = 113 | 114 | ### each language model to be used has its own section here 115 | 116 | ### if corpus preparation should be skipped, 117 | # point to the prepared language model 118 | # 119 | #lm = 120 | 121 | [LM:parallelA] 122 | raw-corpus = $toy-data/parallelA.$pair-extension.$output-extension 123 | 124 | [LM:parallelB] 125 | raw-corpus = $toy-data/parallelB.$pair-extension.$output-extension 126 | 127 | [LM:monolingualA] 128 | raw-corpus = $toy-data/monolingualA.$output-extension 129 | 130 | ################################################################# 131 | # INTERPOLATING LANGUAGE MODELS 132 | 133 | [INTERPOLATED-LM] 134 | 135 | # if multiple language models are used, these may be combined 136 | # by optimizing perplexity on a tuning set 137 | # see, for instance [Koehn and 
Schwenk, IJCNLP 2008] 138 | 139 | ### script to interpolate language models 140 | # if commented out, no interpolation is performed 141 | # 142 | script = $moses-script-dir/ems/support/interpolate-lm.perl 143 | 144 | ### tuning set 145 | # you may use the same set that is used for mert tuning (reference set) 146 | # 147 | raw-tuning = $toy-data/newstest2012.$output-extension 148 | 149 | ### script to use for binary table format for irstlm or kenlm 150 | # kenlm, also set type to 8 151 | lm-binarizer = $moses-src-dir/bin/build_binary 152 | type = 8 153 | 154 | ################################################################# 155 | # TRANSLATION MODEL TRAINING 156 | 157 | [TRAINING] 158 | 159 | ### training script to be used: either a legacy script or 160 | # current moses training script (default) 161 | # 162 | script = $moses-script-dir/training/train-model.perl 163 | 164 | ### general options 165 | # 166 | training-options = "-mgiza -mgiza-cpus 8 -sort-buffer-size 10G -sort-compress gzip -cores 16 -alt-direct-rule-score-2 --ghkm-tree-fragment" 167 | 168 | ### symmetrization method to obtain word alignments from giza output 169 | # (commonly used: grow-diag-final-and) 170 | # 171 | alignment-symmetrization-method = grow-diag-final-and 172 | 173 | run-giza-in-parts = 5 174 | 175 | ### if word alignment (giza symmetrization) should be skipped, 176 | # point to word alignment files 177 | # 178 | # word-alignment = 179 | 180 | ### hierarchical rule set 181 | # 182 | hierarchical-rule-set = true 183 | use-ghkm = true 184 | use-pcfg-feature = true 185 | use-unknown-word-soft-matches = true 186 | dont-tune-glue-grammar = true 187 | 188 | extract-settings = "--UnknownWordMinRelFreq 0.01 --MaxNodes 20 --MaxRuleDepth 5 --MaxRuleSize 5" 189 | score-settings = " --GoodTuring --LowCountFeature --MinCountHierarchical 2" 190 | 191 | 192 | ### if phrase extraction should be skipped, 193 | # point to stem for extract files 194 | # 195 | # extracted-phrases = 196 | 197 | ### if phrase table training should be skipped, 198 | # point to phrase translation table 199 | # 200 | # phrase-translation-table = 201 | 202 | ### if training should be skipped, 203 | # point to a configuration file that contains 204 | # pointers to all relevant model files 205 | # config = 206 | 207 | ####################################################### TUNING: finding good weights for model components 208 | 209 | [TUNING] 210 | 211 | ### instead of tuning with this setting, old weights may be recycled 212 | # specify here an old configuration file with matching weights 213 | # 214 | #weight-config = 215 | 216 | ### tuning script to be used 217 | # 218 | tuning-script = $moses-script-dir/training/mert-moses.pl 219 | tuning-settings = "-mertdir $moses-src-dir/bin --batch-mira --return-best-dev -maximum-iterations 25 --threads 16 -batch-mira-args='--sctype BLEU'" 220 | 221 | ### specify the corpus used for tuning 222 | # it should contain 100s if not 1000s of sentences 223 | # 224 | raw-input = $toy-data/newstest2012.$input-extension 225 | # tokenized-input = 226 | # factorized-input = 227 | # input = 228 | 229 | inputtype = 3 230 | 231 | raw-reference = $toy-data/newstest2012.$output-extension 232 | # tokenized-reference = 233 | # factorized-reference = 234 | # reference = 235 | 236 | ### size of n-best list used (typically 100) 237 | # 238 | nbest = 1000 239 | 240 | ### ranges for weights for random initialization 241 | # if not specified, the tuning script will use generic ranges 242 | # it is not clear, if this matters 243 | # 244 | 
# lambda = 245 | 246 | ### additional flags for the decoder 247 | # 248 | decoder-settings = "-feature-overwrite 'TranslationModel0 table-limit=100' -threads 8 -max-chart-span 50 -rule-limit 50" 249 | 250 | ### if tuning should be skipped, specify this here 251 | # and also point to a configuration file that contains 252 | # pointers to all relevant model files 253 | # 254 | 255 | 256 | ######################################################### 257 | ## RECASER: restore case, this part only trains the model 258 | 259 | [RECASING] 260 | 261 | #decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm 262 | 263 | ### training data 264 | # raw input needs to be still tokenized, 265 | # also also tokenized input may be specified 266 | # 267 | #tokenized = [LM:europarl:tokenized-corpus] 268 | 269 | # recase-config = 270 | 271 | #lm-training = $moses-src-dir/srilm/bin/i686/ngram-count 272 | 273 | ####################################################### 274 | ## TRUECASER: train model to truecase corpora and input 275 | 276 | [TRUECASER] 277 | 278 | ### script to train truecaser models 279 | # 280 | trainer = $moses-script-dir/recaser/train-truecaser.perl 281 | 282 | ### training data 283 | # raw input needs to be still tokenized, 284 | # also also tokenized input may be specified 285 | # 286 | # tokenized-stem = $working-dir/data/ep+nc 287 | 288 | ### trained model 289 | # 290 | #truecase-model = 291 | 292 | ############################################################ 293 | ## EVALUATION: translating a test set using the tuned system 294 | 295 | [EVALUATION] 296 | 297 | ### number of jobs (if parallel execution of testing) 298 | # 299 | jobs = 10 300 | 301 | filter-settings = " " 302 | 303 | 304 | ### prepare system output for scoring 305 | # this may include detokenization and wrapping output in sgm 306 | # (needed for nist-bleu, ter, meteor) 307 | # 308 | #recaser = $moses-script-dir/recaser/recase.perl 309 | wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension" 310 | # output-sgm = 311 | 312 | ### should output be scored case-sensitive (default: no)? 
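# (case-sensitive scoring corresponds to the -c flag of mteval-v13a.pl; cf. nist-bleu-c below)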
313 | # 314 | # case-sensitive = yes 315 | 316 | ### BLEU 317 | # 318 | nist-bleu = $moses-script-dir/generic/mteval-v13a.pl 319 | nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c" 320 | # multi-bleu = $edinburgh-script-dir/multi-bleu.perl 321 | # ibm-bleu = 322 | 323 | ### TER: translation error rate (BBN metric) based on edit distance 324 | # 325 | # ter = $edinburgh-script-dir/tercom_v6a.pl 326 | 327 | ### METEOR: gives credit to stem / worknet synonym matches 328 | # 329 | # meteor = 330 | 331 | ### Analysis: carry out various forms of analysis on the output 332 | # 333 | analysis = $moses-script-dir/ems/support/analysis.perl 334 | #analyze-coverage = yes 335 | report-segmentation = yes 336 | 337 | 338 | [EVALUATION:newstest2013] 339 | decoder-settings = "-feature-overwrite 'TranslationModel0 table-limit=100' -threads 8 -max-chart-span 50 -rule-limit 100" 340 | input-sgm = $toy-data/newstest2013-src.$input-extension.sgm 341 | wrapping-frame = $input-sgm 342 | reference-sgm = $toy-data/newstest2013-ref.$output-extension.sgm 343 | 344 | [REPORTING] 345 | 346 | ### what to do with result (default: store in file evaluation/report) 347 | # 348 | # email = pkoehn@inf.ed.ac.uk 349 | 350 | -------------------------------------------------------------------------------- /example/toy_example_2015.config: -------------------------------------------------------------------------------- 1 | ################################################ 2 | ### CONFIGURATION FILE FOR AN SMT EXPERIMENT ### 3 | ################################################ 4 | 5 | [GENERAL] 6 | 7 | ###### you need to set these paths to match your environemnt 8 | ###### 9 | 10 | moses-src-dir = /home/rsennrich/tools/mosesdecoder 11 | wmt2014-scripts = /home/rsennrich/smtworkspace/wmt2014-scripts 12 | parzu-path = /home/rsennrich/ParZu # https://github.com/rsennrich/ParZu 13 | zmorge-model = /home/rsennrich/zmorge/zmorge-20141224-smor_newlemma.a #get this (or a newer version) from http://kitt.ifi.uzh.ch/kitt/zmorge/ 14 | srilm-dir = /home/rsennrich/tools/srilm/bin/i686-m64/ 15 | external-bin-dir = ~/bin 16 | 17 | ###### no further changes should be required to run the toy example 18 | ###### (but feel free to experiment with different settings, or change the training/test data) 19 | 20 | moses-script-dir = $moses-src-dir/scripts 21 | moses-bin-dir = $moses-src-dir/bin 22 | toy-data = $wmt2014-scripts/example/data 23 | working-dir = $wmt2014-scripts/example/working-dir 24 | decoder = $moses-src-dir/bin/moses 25 | 26 | input-tokenizer = "$moses-script-dir/tokenizer/normalize-punctuation.perl $input-extension | $moses-script-dir/tokenizer/tokenizer.perl -l $input-extension -penn" 27 | output-tokenizer = "$moses-script-dir/tokenizer/normalize-punctuation.perl $output-extension | $moses-script-dir/tokenizer/tokenizer.perl -l $output-extension" 28 | input-truecaser = $moses-script-dir/recaser/truecase.perl 29 | output-truecaser = $moses-script-dir/recaser/truecase.perl 30 | detruecaser = $moses-script-dir/recaser/detruecase.perl 31 | 32 | # parsing pipeline used for WMT 2014 33 | output-parser = "$moses-script-dir/tokenizer/deescape-special-chars.perl | $parzu-path/parzu -i tokenized_lines --projective | $wmt2014-scripts/enrich_labelset.py --wmt15 | $moses-script-dir/training/wrappers/conll2mosesxml.py" 34 | 35 | # also parse tuning/evaluation reference files 36 | mock-output-parser-references = $output-parser 37 | 38 | # hybrid compound splitting (described in Sennrich, Williams and Huck, 2015) 39 | output-splitter = 
"$wmt2014-scripts/hybrid_compound_splitter.py -smor $zmorge-model -write-filler -no-truecase -q -syntax -dependency -fewest" 40 | 41 | # sed instructions unsplit the split compunds from output-splitter 42 | detokenizer = "$moses-script-dir/tokenizer/detokenizer.perl -l $output-extension | sed -r 's/ \@(\S*?)\@ /\1/g' | sed -r 's/\@\@ //g'" 43 | 44 | input-extension = en 45 | output-extension = de 46 | pair-extension = de-en 47 | 48 | generic-parallelizer = $moses-script-dir/ems/support/generic-multicore-parallelizer.perl 49 | 50 | jobs = 10 51 | 52 | ################################################################# 53 | # PARALLEL CORPUS PREPARATION: 54 | # create a tokenized, sentence-aligned corpus, ready for training 55 | 56 | [CORPUS] 57 | 58 | cores = 10 59 | 60 | ### tools to use to prepare the data 61 | # 62 | #tokenizer = 63 | #lowercaser = 64 | 65 | ### long sentences are filtered out, since they slow down GIZA++ 66 | # and are a less reliable source of data. set here the maximum 67 | # length of a sentence 68 | # 69 | max-sentence-length = 80 70 | 71 | ### GIZA++ does not allow sentence pairs of highly uneven length. 72 | # since uneven sentence length is an indicator of a misalignment, 73 | # we set a maximum ratio of 3 (this also gives us room for compoudn splitting) 74 | # 75 | cleaner = "$moses-script-dir/training/clean-corpus-n.perl -ratio 3" 76 | 77 | [CORPUS:parallelA] 78 | raw-stem = $toy-data/parallelA.$pair-extension 79 | 80 | [CORPUS:parallelB] 81 | raw-stem = $toy-data/parallelB.$pair-extension 82 | 83 | [CORPUS:parallelC] 84 | # if you do your own parsing (or wanna re-use other data, like http://statmt.org/rsennrich/parsed_wmt/ ), 85 | # you can add parsed corpora to your system like this 86 | clean-parsed-stem = $toy-data/parallelC.$pair-extension.parsed_target 87 | 88 | ################################################################# 89 | # LANGUAGE MODEL TRAINING 90 | 91 | [LM] 92 | 93 | cores = 10 94 | 95 | ### tool to be used for language model training 96 | # for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh) 97 | # 98 | lm-training = $srilm-dir/ngram-count 99 | settings = "-interpolate -kndiscount -unk" 100 | order = 5 101 | 102 | ### script to use for binary table format 103 | # (default: no binarization) 104 | # 105 | #lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm 106 | 107 | # kenlm, also set type to 8 108 | #lm-binarizer = $moses-src-dir/kenlm/build_binary 109 | #type = 8 110 | 111 | ### script to create quantized language model format 112 | # (default: no quantization) 113 | # 114 | #lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm 115 | 116 | ### tools to use to prepare the data 117 | # 118 | #tokenizer = 119 | #lowercaser = 120 | 121 | ### each language model to be used has its own section here 122 | 123 | ### if corpus preparation should be skipped, 124 | # point to the prepared language model 125 | # 126 | #lm = 127 | 128 | [LM:parallelA] 129 | raw-corpus = $toy-data/parallelA.$pair-extension.$output-extension 130 | 131 | [LM:parallelB] 132 | raw-corpus = $toy-data/parallelB.$pair-extension.$output-extension 133 | 134 | [LM:parallelC] 135 | raw-corpus = $toy-data/parallelC.$pair-extension.$output-extension 136 | 137 | [LM:monolingualA] 138 | raw-corpus = $toy-data/monolingualA.$output-extension 139 | 140 | ################################################################# 141 | # INTERPOLATING LANGUAGE MODELS 142 | 143 | [INTERPOLATED-LM] 144 | 145 | # if multiple language models are used, these may be combined 146 | # by 
optimizing perplexity on a tuning set 147 | # see, for instance [Koehn and Schwenk, IJCNLP 2008] 148 | 149 | ### script to interpolate language models 150 | # if commented out, no interpolation is performed 151 | # 152 | script = $moses-script-dir/ems/support/interpolate-lm.perl 153 | 154 | ### tuning set 155 | # you may use the same set that is used for mert tuning (reference set) 156 | # 157 | raw-tuning = $toy-data/newstest2012.$output-extension 158 | 159 | ### script to use for binary table format for irstlm or kenlm 160 | # kenlm, also set type to 8 161 | lm-binarizer = $moses-src-dir/bin/build_binary 162 | type = 8 163 | 164 | ################################################################# 165 | # TRANSLATION MODEL TRAINING 166 | 167 | [TRAINING] 168 | 169 | ### training script to be used: either a legacy script or 170 | # current moses training script (default) 171 | # 172 | script = $moses-script-dir/training/train-model.perl 173 | 174 | ### general options 175 | # 176 | training-options = "-mgiza -mgiza-cpus 8 -sort-buffer-size 10G -sort-compress gzip -cores 16 -alt-direct-rule-score-2 --ghkm-tree-fragment" 177 | 178 | ### symmetrization method to obtain word alignments from giza output 179 | # (commonly used: grow-diag-final-and) 180 | # 181 | alignment-symmetrization-method = grow-diag-final-and 182 | 183 | run-giza-in-parts = 5 184 | 185 | ### if word alignment (giza symmetrization) should be skipped, 186 | # point to word alignment files 187 | # 188 | # word-alignment = 189 | 190 | ### hierarchical rule set 191 | # 192 | hierarchical-rule-set = true 193 | use-ghkm = true 194 | use-pcfg-feature = true 195 | use-unknown-word-soft-matches = true 196 | dont-tune-glue-grammar = true 197 | 198 | extract-settings = "--UnknownWordMinRelFreq 0.01 --MaxNodes 20 --MaxRuleDepth 5 --MaxRuleSize 5 --AllowUnary" 199 | score-settings = " --GoodTuring --LowCountFeature --MinCountHierarchical 2 --MinScore 2:0.0001" 200 | 201 | 202 | ### if phrase extraction should be skipped, 203 | # point to stem for extract files 204 | # 205 | # extracted-phrases = 206 | 207 | ### if phrase table training should be skipped, 208 | # point to phrase translation table 209 | # 210 | # phrase-translation-table = 211 | 212 | ### if training should be skipped, 213 | # point to a configuration file that contains 214 | # pointers to all relevant model files 215 | # config = 216 | 217 | ####################################################### TUNING: finding good weights for model components 218 | 219 | [TUNING] 220 | 221 | ### instead of tuning with this setting, old weights may be recycled 222 | # specify here an old configuration file with matching weights 223 | # 224 | #weight-config = 225 | 226 | ### tuning script to be used 227 | # 228 | tuning-script = $moses-script-dir/training/mert-moses.pl 229 | tuning-settings = "-mertdir $moses-src-dir/bin --batch-mira --return-best-dev -maximum-iterations 25 --threads 16 -batch-mira-args='--sctype BLEU,HWCM'" 230 | 231 | ### specify the corpus used for tuning 232 | # it should contain 100s if not 1000s of sentences 233 | # 234 | raw-input = $toy-data/newstest2012.$input-extension 235 | # tokenized-input = 236 | # factorized-input = 237 | # input = 238 | 239 | inputtype = 3 240 | 241 | raw-reference = $toy-data/newstest2012.$output-extension 242 | # tokenized-reference = 243 | # factorized-reference = 244 | # reference = 245 | 246 | ### size of n-best list used (typically 100) 247 | # 248 | nbest = 1000 249 | 250 | ### ranges for weights for random initialization 251 | # 
if not specified, the tuning script will use generic ranges 252 | # it is not clear, if this matters 253 | # 254 | # lambda = 255 | 256 | ### additional flags for the decoder 257 | # 258 | decoder-settings = "-feature-overwrite 'TranslationModel0 table-limit=100' -threads 8 -max-chart-span 50 -rule-limit 50 -n-best-trees" 259 | 260 | ### if tuning should be skipped, specify this here 261 | # and also point to a configuration file that contains 262 | # pointers to all relevant model files 263 | # 264 | 265 | 266 | ######################################################### 267 | ## RECASER: restore case, this part only trains the model 268 | 269 | [RECASING] 270 | 271 | #decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm 272 | 273 | ### training data 274 | # raw input needs to be still tokenized, 275 | # also also tokenized input may be specified 276 | # 277 | #tokenized = [LM:europarl:tokenized-corpus] 278 | 279 | # recase-config = 280 | 281 | #lm-training = $moses-src-dir/srilm/bin/i686/ngram-count 282 | 283 | ####################################################### 284 | ## TRUECASER: train model to truecase corpora and input 285 | 286 | [TRUECASER] 287 | 288 | ### script to train truecaser models 289 | # 290 | trainer = $moses-script-dir/recaser/train-truecaser.perl 291 | 292 | ### training data 293 | # raw input needs to be still tokenized, 294 | # also also tokenized input may be specified 295 | # 296 | # tokenized-stem = $working-dir/data/ep+nc 297 | 298 | ### trained model 299 | # 300 | #truecase-model = 301 | 302 | ############################################################ 303 | ## EVALUATION: translating a test set using the tuned system 304 | 305 | [EVALUATION] 306 | 307 | ### number of jobs (if parallel execution of testing) 308 | # 309 | jobs = 10 310 | 311 | filter-settings = " " 312 | 313 | 314 | ### prepare system output for scoring 315 | # this may include detokenization and wrapping output in sgm 316 | # (needed for nist-bleu, ter, meteor) 317 | # 318 | #recaser = $moses-script-dir/recaser/recase.perl 319 | wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension" 320 | # output-sgm = 321 | 322 | ### should output be scored case-sensitive (default: no)? 
323 | # 324 | # case-sensitive = yes 325 | 326 | ### BLEU 327 | # 328 | nist-bleu = $moses-script-dir/generic/mteval-v13a.pl 329 | nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c" 330 | # multi-bleu = $edinburgh-script-dir/multi-bleu.perl 331 | # ibm-bleu = 332 | 333 | ### TER: translation error rate (BBN metric) based on edit distance 334 | # 335 | # ter = $edinburgh-script-dir/tercom_v6a.pl 336 | 337 | ### METEOR: gives credit to stem / worknet synonym matches 338 | # 339 | # meteor = 340 | 341 | ### Analysis: carry out various forms of analysis on the output 342 | # 343 | analysis = $moses-script-dir/ems/support/analysis.perl 344 | #analyze-coverage = yes 345 | report-segmentation = yes 346 | 347 | 348 | [EVALUATION:newstest2013] 349 | decoder-settings = "-feature-overwrite 'TranslationModel0 table-limit=100' -threads 8 -max-chart-span 50 -rule-limit 100" 350 | input-sgm = $toy-data/newstest2013-src.$input-extension.sgm 351 | wrapping-frame = $input-sgm 352 | reference-sgm = $toy-data/newstest2013-ref.$output-extension.sgm 353 | 354 | [REPORTING] 355 | 356 | ### what to do with result (default: store in file evaluation/report) 357 | # 358 | # email = pkoehn@inf.ed.ac.uk 359 | 360 | -------------------------------------------------------------------------------- /example/toy_example_2015_2.config: -------------------------------------------------------------------------------- 1 | ################################################ 2 | ### CONFIGURATION FILE FOR AN SMT EXPERIMENT ### 3 | ################################################ 4 | 5 | [GENERAL] 6 | 7 | ###### you need to set these paths to match your environemnt 8 | ###### 9 | 10 | moses-src-dir = /home/rsennrich/tools/mosesdecoder 11 | wmt2014-scripts = /home/rsennrich/smtworkspace/wmt2014-scripts 12 | parzu-path = /home/rsennrich/ParZu # https://github.com/rsennrich/ParZu 13 | zmorge-model = /home/rsennrich/zmorge/zmorge-20141224-smor_newlemma.a #get this (or a newer version) from http://kitt.ifi.uzh.ch/kitt/zmorge/ 14 | srilm-dir = /home/rsennrich/tools/srilm/bin/i686-m64/ 15 | external-bin-dir = ~/bin 16 | 17 | ###### no further changes should be required to run the toy example 18 | ###### (but feel free to experiment with different settings, or change the training/test data) 19 | 20 | moses-script-dir = $moses-src-dir/scripts 21 | moses-bin-dir = $moses-src-dir/bin 22 | toy-data = $wmt2014-scripts/example/data 23 | working-dir = $wmt2014-scripts/example/working-dir 24 | decoder = $moses-src-dir/bin/moses 25 | 26 | input-tokenizer = "$moses-script-dir/tokenizer/normalize-punctuation.perl $input-extension | $moses-script-dir/tokenizer/tokenizer.perl -l $input-extension -penn" 27 | output-tokenizer = "$moses-script-dir/tokenizer/normalize-punctuation.perl $output-extension | $moses-script-dir/tokenizer/tokenizer.perl -l $output-extension" 28 | input-truecaser = $moses-script-dir/recaser/truecase.perl 29 | output-truecaser = $moses-script-dir/recaser/truecase.perl 30 | detruecaser = $moses-script-dir/recaser/detruecase.perl 31 | 32 | # parsing pipeline used for WMT 2014 33 | output-parser = "$moses-script-dir/tokenizer/deescape-special-chars.perl | $parzu-path/parzu -i tokenized_lines --projective | $wmt2014-scripts/enrich_labelset.py --wmt15 | $moses-script-dir/training/wrappers/conll2mosesxml.py" 34 | 35 | # also parse tuning/evaluation reference files 36 | mock-output-parser-references = $output-parser 37 | 38 | # head binarization 39 | output-parse-relaxer = "$wmt2014-scripts/emnlp2015/binarize.py head" 40 | 41 | # 
hybrid compound splitting (described in Sennrich, Williams and Huck, 2015) 42 | output-splitter = "$wmt2014-scripts/hybrid_compound_splitter.py -smor $zmorge-model -write-filler -no-truecase -q -syntax -dependency -fewest" 43 | 44 | # sed instructions unsplit the split compunds from output-splitter 45 | detokenizer = "$moses-script-dir/tokenizer/detokenizer.perl -l $output-extension | sed -r 's/ \@(\S*?)\@ /\1/g' | sed -r 's/\@\@ //g'" 46 | 47 | input-extension = en 48 | output-extension = de 49 | pair-extension = de-en 50 | 51 | generic-parallelizer = $moses-script-dir/ems/support/generic-multicore-parallelizer.perl 52 | 53 | jobs = 10 54 | 55 | ################################################################# 56 | # PARALLEL CORPUS PREPARATION: 57 | # create a tokenized, sentence-aligned corpus, ready for training 58 | 59 | [CORPUS] 60 | 61 | cores = 10 62 | 63 | ### tools to use to prepare the data 64 | # 65 | #tokenizer = 66 | #lowercaser = 67 | 68 | ### long sentences are filtered out, since they slow down GIZA++ 69 | # and are a less reliable source of data. set here the maximum 70 | # length of a sentence 71 | # 72 | max-sentence-length = 80 73 | 74 | ### GIZA++ does not allow sentence pairs of highly uneven length. 75 | # since uneven sentence length is an indicator of a misalignment, 76 | # we set a maximum ratio of 3 (this also gives us room for compoudn splitting) 77 | # 78 | cleaner = "$moses-script-dir/training/clean-corpus-n.perl -ratio 3" 79 | 80 | [CORPUS:parallelA] 81 | raw-stem = $toy-data/parallelA.$pair-extension 82 | 83 | [CORPUS:parallelB] 84 | raw-stem = $toy-data/parallelB.$pair-extension 85 | 86 | [CORPUS:parallelC] 87 | # if you do your own parsing (or wanna re-use other data, like http://statmt.org/rsennrich/parsed_wmt/ ), 88 | # you can add parsed corpora to your system like this 89 | clean-parsed-stem = $toy-data/parallelC.$pair-extension.parsed_target 90 | 91 | ################################################################# 92 | # LANGUAGE MODEL TRAINING 93 | 94 | [LM] 95 | 96 | cores = 10 97 | 98 | ### tool to be used for language model training 99 | # for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh) 100 | # 101 | lm-training = $srilm-dir/ngram-count 102 | settings = "-interpolate -kndiscount -unk" 103 | order = 5 104 | 105 | ### script to use for binary table format 106 | # (default: no binarization) 107 | # 108 | #lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm 109 | 110 | # kenlm, also set type to 8 111 | #lm-binarizer = $moses-src-dir/kenlm/build_binary 112 | #type = 8 113 | 114 | ### script to create quantized language model format 115 | # (default: no quantization) 116 | # 117 | #lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm 118 | 119 | ### tools to use to prepare the data 120 | # 121 | #tokenizer = 122 | #lowercaser = 123 | 124 | ### each language model to be used has its own section here 125 | 126 | ### if corpus preparation should be skipped, 127 | # point to the prepared language model 128 | # 129 | #lm = 130 | 131 | [LM:parallelA] 132 | raw-corpus = $toy-data/parallelA.$pair-extension.$output-extension 133 | 134 | [LM:parallelB] 135 | raw-corpus = $toy-data/parallelB.$pair-extension.$output-extension 136 | 137 | [LM:parallelC] 138 | raw-corpus = $toy-data/parallelC.$pair-extension.$output-extension 139 | 140 | [LM:monolingualA] 141 | raw-corpus = $toy-data/monolingualA.$output-extension 142 | 143 | ################################################################# 144 | # INTERPOLATING LANGUAGE MODELS 145 | 146 | 
[INTERPOLATED-LM] 147 | 148 | # if multiple language models are used, these may be combined 149 | # by optimizing perplexity on a tuning set 150 | # see, for instance [Koehn and Schwenk, IJCNLP 2008] 151 | 152 | ### script to interpolate language models 153 | # if commented out, no interpolation is performed 154 | # 155 | script = $moses-script-dir/ems/support/interpolate-lm.perl 156 | 157 | ### tuning set 158 | # you may use the same set that is used for mert tuning (reference set) 159 | # 160 | raw-tuning = $toy-data/newstest2012.$output-extension 161 | 162 | ### script to use for binary table format for irstlm or kenlm 163 | # kenlm, also set type to 8 164 | lm-binarizer = $moses-src-dir/bin/build_binary 165 | type = 8 166 | 167 | ################################################################# 168 | # TRANSLATION MODEL TRAINING 169 | 170 | [TRAINING] 171 | 172 | ### training script to be used: either a legacy script or 173 | # current moses training script (default) 174 | # 175 | script = $moses-script-dir/training/train-model.perl 176 | 177 | ### general options 178 | # 179 | training-options = "-mgiza -mgiza-cpus 8 -sort-buffer-size 10G -sort-compress gzip -cores 16 -alt-direct-rule-score-2 --ghkm-tree-fragment" 180 | 181 | ### symmetrization method to obtain word alignments from giza output 182 | # (commonly used: grow-diag-final-and) 183 | # 184 | alignment-symmetrization-method = grow-diag-final-and 185 | 186 | run-giza-in-parts = 5 187 | 188 | ### if word alignment (giza symmetrization) should be skipped, 189 | # point to word alignment files 190 | # 191 | # word-alignment = 192 | 193 | ### hierarchical rule set 194 | # 195 | hierarchical-rule-set = true 196 | use-ghkm = true 197 | use-pcfg-feature = true 198 | use-unknown-word-soft-matches = true 199 | dont-tune-glue-grammar = true 200 | 201 | extract-settings = "--UnknownWordMinRelFreq 0.01 --MaxNodes 40 --MaxRuleDepth 7 --MaxRuleSize 7 --AllowUnary" 202 | score-settings = " --GoodTuring --LowCountFeature --MinCountHierarchical 2 --MinScore 2:0.0001" 203 | 204 | 205 | ### if phrase extraction should be skipped, 206 | # point to stem for extract files 207 | # 208 | # extracted-phrases = 209 | 210 | ### if phrase table training should be skipped, 211 | # point to phrase translation table 212 | # 213 | # phrase-translation-table = 214 | 215 | ### if training should be skipped, 216 | # point to a configuration file that contains 217 | # pointers to all relevant model files 218 | # config = 219 | 220 | ####################################################### TUNING: finding good weights for model components 221 | 222 | [TUNING] 223 | 224 | ### instead of tuning with this setting, old weights may be recycled 225 | # specify here an old configuration file with matching weights 226 | # 227 | #weight-config = 228 | 229 | ### tuning script to be used 230 | # 231 | tuning-script = $moses-script-dir/training/mert-moses.pl 232 | tuning-settings = "-mertdir $moses-src-dir/bin --batch-mira --return-best-dev -maximum-iterations 25 --threads 16 -batch-mira-args='--sctype BLEU,HWCM'" 233 | 234 | ### specify the corpus used for tuning 235 | # it should contain 100s if not 1000s of sentences 236 | # 237 | raw-input = $toy-data/newstest2012.$input-extension 238 | # tokenized-input = 239 | # factorized-input = 240 | # input = 241 | 242 | inputtype = 3 243 | 244 | raw-reference = $toy-data/newstest2012.$output-extension 245 | # tokenized-reference = 246 | # factorized-reference = 247 | # reference = 248 | 249 | ### size of n-best list used (typically 
100) 250 | # 251 | nbest = 1000 252 | 253 | ### ranges for weights for random initialization 254 | # if not specified, the tuning script will use generic ranges 255 | # it is not clear, if this matters 256 | # 257 | # lambda = 258 | 259 | ### additional flags for the decoder 260 | # 261 | decoder-settings = "-feature-overwrite 'TranslationModel0 table-limit=100' -threads 8 -max-chart-span 50 -rule-limit 50 -n-best-trees" 262 | 263 | ### if tuning should be skipped, specify this here 264 | # and also point to a configuration file that contains 265 | # pointers to all relevant model files 266 | # 267 | 268 | 269 | ######################################################### 270 | ## RECASER: restore case, this part only trains the model 271 | 272 | [RECASING] 273 | 274 | #decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm 275 | 276 | ### training data 277 | # raw input needs to be still tokenized, 278 | # also also tokenized input may be specified 279 | # 280 | #tokenized = [LM:europarl:tokenized-corpus] 281 | 282 | # recase-config = 283 | 284 | #lm-training = $moses-src-dir/srilm/bin/i686/ngram-count 285 | 286 | ####################################################### 287 | ## TRUECASER: train model to truecase corpora and input 288 | 289 | [TRUECASER] 290 | 291 | ### script to train truecaser models 292 | # 293 | trainer = $moses-script-dir/recaser/train-truecaser.perl 294 | 295 | ### training data 296 | # raw input needs to be still tokenized, 297 | # also also tokenized input may be specified 298 | # 299 | # tokenized-stem = $working-dir/data/ep+nc 300 | 301 | ### trained model 302 | # 303 | #truecase-model = 304 | 305 | ############################################################ 306 | ## EVALUATION: translating a test set using the tuned system 307 | 308 | [EVALUATION] 309 | 310 | ### number of jobs (if parallel execution of testing) 311 | # 312 | jobs = 10 313 | 314 | filter-settings = " " 315 | 316 | 317 | ### prepare system output for scoring 318 | # this may include detokenization and wrapping output in sgm 319 | # (needed for nist-bleu, ter, meteor) 320 | # 321 | #recaser = $moses-script-dir/recaser/recase.perl 322 | wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension" 323 | # output-sgm = 324 | 325 | ### should output be scored case-sensitive (default: no)? 
326 | # 327 | # case-sensitive = yes 328 | 329 | ### BLEU 330 | # 331 | nist-bleu = $moses-script-dir/generic/mteval-v13a.pl 332 | nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c" 333 | # multi-bleu = $edinburgh-script-dir/multi-bleu.perl 334 | # ibm-bleu = 335 | 336 | ### TER: translation error rate (BBN metric) based on edit distance 337 | # 338 | # ter = $edinburgh-script-dir/tercom_v6a.pl 339 | 340 | ### METEOR: gives credit to stem / worknet synonym matches 341 | # 342 | # meteor = 343 | 344 | ### Analysis: carry out various forms of analysis on the output 345 | # 346 | analysis = $moses-script-dir/ems/support/analysis.perl 347 | #analyze-coverage = yes 348 | report-segmentation = yes 349 | 350 | 351 | [EVALUATION:newstest2013] 352 | decoder-settings = "-feature-overwrite 'TranslationModel0 table-limit=100' -threads 8 -max-chart-span 50 -rule-limit 100" 353 | input-sgm = $toy-data/newstest2013-src.$input-extension.sgm 354 | wrapping-frame = $input-sgm 355 | reference-sgm = $toy-data/newstest2013-ref.$output-extension.sgm 356 | 357 | [REPORTING] 358 | 359 | ### what to do with result (default: store in file evaluation/report) 360 | # 361 | # email = pkoehn@inf.ed.ac.uk 362 | 363 | -------------------------------------------------------------------------------- /example/toy_example_2015_3.config: -------------------------------------------------------------------------------- 1 | ################################################ 2 | ### CONFIGURATION FILE FOR AN SMT EXPERIMENT ### 3 | ################################################ 4 | 5 | [GENERAL] 6 | 7 | ###### you need to set these paths to match your environemnt 8 | ###### 9 | 10 | moses-src-dir = /home/rsennrich/tools/mosesdecoder 11 | wmt2014-scripts = /home/rsennrich/smtworkspace/wmt2014-scripts 12 | parzu-path = /home/rsennrich/ParZu # https://github.com/rsennrich/ParZu 13 | zmorge-model = /home/rsennrich/zmorge/zmorge-20141224-smor_newlemma.a #get this (or a newer version) from http://kitt.ifi.uzh.ch/kitt/zmorge/ 14 | srilm-dir = /home/rsennrich/tools/srilm/bin/i686-m64/ 15 | external-bin-dir = ~/bin 16 | nplm-dir = /home/rsennrich/tools/nplm-github/ 17 | 18 | ###### no further changes should be required to run the toy example 19 | ###### (but feel free to experiment with different settings, or change the training/test data) 20 | 21 | moses-script-dir = $moses-src-dir/scripts 22 | moses-bin-dir = $moses-src-dir/bin 23 | toy-data = $wmt2014-scripts/example/data 24 | working-dir = $wmt2014-scripts/example/working-dir 25 | decoder = $moses-src-dir/bin/moses 26 | 27 | input-tokenizer = "$moses-script-dir/tokenizer/normalize-punctuation.perl $input-extension | $moses-script-dir/tokenizer/tokenizer.perl -l $input-extension -penn" 28 | output-tokenizer = "$moses-script-dir/tokenizer/normalize-punctuation.perl $output-extension | $moses-script-dir/tokenizer/tokenizer.perl -l $output-extension" 29 | input-truecaser = $moses-script-dir/recaser/truecase.perl 30 | output-truecaser = $moses-script-dir/recaser/truecase.perl 31 | detruecaser = $moses-script-dir/recaser/detruecase.perl 32 | 33 | # parsing pipeline used for WMT 2014 34 | output-parser = "$moses-script-dir/tokenizer/deescape-special-chars.perl | $parzu-path/parzu -i tokenized_lines --projective | $wmt2014-scripts/enrich_labelset.py --wmt15 | $moses-script-dir/training/wrappers/conll2mosesxml.py" 35 | 36 | # also parse tuning/evaluation reference files 37 | mock-output-parser-references = $output-parser 38 | mock-output-parser-lm = $output-parser 39 | 40 | # head 
binarization 41 | output-parse-relaxer = "$wmt2014-scripts/emnlp2015/binarize.py head" 42 | 43 | # hybrid compound splitting (described in Sennrich, Williams and Huck, 2015) 44 | output-splitter = "$wmt2014-scripts/hybrid_compound_splitter.py -smor $zmorge-model -write-filler -no-truecase -q -syntax -dependency -fewest" 45 | 46 | # sed instructions unsplit the split compunds from output-splitter 47 | detokenizer = "$moses-script-dir/tokenizer/detokenizer.perl -l $output-extension | sed -r 's/ \@(\S*?)\@ /\1/g' | sed -r 's/\@\@ //g'" 48 | 49 | input-extension = en 50 | output-extension = de 51 | pair-extension = de-en 52 | 53 | generic-parallelizer = $moses-script-dir/ems/support/generic-multicore-parallelizer.perl 54 | 55 | jobs = 10 56 | 57 | ################################################################# 58 | # PARALLEL CORPUS PREPARATION: 59 | # create a tokenized, sentence-aligned corpus, ready for training 60 | 61 | [CORPUS] 62 | 63 | cores = 10 64 | 65 | ### tools to use to prepare the data 66 | # 67 | #tokenizer = 68 | #lowercaser = 69 | 70 | ### long sentences are filtered out, since they slow down GIZA++ 71 | # and are a less reliable source of data. set here the maximum 72 | # length of a sentence 73 | # 74 | max-sentence-length = 80 75 | 76 | ### GIZA++ does not allow sentence pairs of highly uneven length. 77 | # since uneven sentence length is an indicator of a misalignment, 78 | # we set a maximum ratio of 3 (this also gives us room for compoudn splitting) 79 | # 80 | cleaner = "$moses-script-dir/training/clean-corpus-n.perl -ratio 3" 81 | 82 | [CORPUS:parallelA] 83 | raw-stem = $toy-data/parallelA.$pair-extension 84 | 85 | [CORPUS:parallelB] 86 | raw-stem = $toy-data/parallelB.$pair-extension 87 | 88 | [CORPUS:parallelC] 89 | # if you do your own parsing (or wanna re-use other data, like http://statmt.org/rsennrich/parsed_wmt/ ), 90 | # you can add parsed corpora to your system like this 91 | clean-parsed-stem = $toy-data/parallelC.$pair-extension.parsed_target 92 | 93 | ################################################################# 94 | # LANGUAGE MODEL TRAINING 95 | 96 | [LM] 97 | 98 | cores = 10 99 | 100 | ### tool to be used for language model training 101 | # for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh) 102 | # 103 | lm-training = $srilm-dir/ngram-count 104 | settings = "-interpolate -kndiscount -unk" 105 | order = 5 106 | 107 | ### script to use for binary table format 108 | # (default: no binarization) 109 | # 110 | #lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm 111 | 112 | # kenlm, also set type to 8 113 | #lm-binarizer = $moses-src-dir/kenlm/build_binary 114 | #type = 8 115 | 116 | ### script to create quantized language model format 117 | # (default: no quantization) 118 | # 119 | #lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm 120 | 121 | ### tools to use to prepare the data 122 | # 123 | #tokenizer = 124 | #lowercaser = 125 | 126 | ### each language model to be used has its own section here 127 | 128 | ### if corpus preparation should be skipped, 129 | # point to the prepared language model 130 | # 131 | #lm = 132 | 133 | # internal settings for RDLM 134 | # IMPORTANT: update rdlm-working-dir when training a new RDLM to avoid overwriting old files 135 | rdlm-left-context = 3 136 | rdlm-right-context = 3 137 | rdlm-up-context = 2 138 | rdlm-working-dir = 1 139 | 140 | [LM:parallelA] 141 | raw-corpus = $toy-data/parallelA.$pair-extension.$output-extension 142 | 143 | [LM:parallelB] 144 | raw-corpus = 
$toy-data/parallelB.$pair-extension.$output-extension 145 | 146 | [LM:parallelC] 147 | # if you do your own parsing (or wanna re-use other data, like http://statmt.org/rsennrich/parsed_wmt/ ), 148 | # you can add parsed corpora to your system like this 149 | mock-parsed-corpus = $toy-data/parallelC.$pair-extension.parsed_target.$output-extension 150 | 151 | [LM:monolingualA] 152 | raw-corpus = $toy-data/monolingualA.$output-extension 153 | 154 | ### Relational Dependency LM trained on concatenation of other training corpora [head model] 155 | [LM:RDLM] 156 | 157 | ### define which corpora to concatenate 158 | # we use -split here because we do not want to strip away syntactic markup 159 | # 160 | concatenate-files-split = [LM:{parallelA,parallelB,monolingualA}:split-corpus] 161 | 162 | ### tell INTERPOLATED-LM to ignore this model 163 | # 164 | exclude-from-interpolation = true 165 | 166 | ### syntactic = true indicate that custom-training should take file with syntactic markup as input (requires mock-output-parser-lm) 167 | # 168 | syntactic = true 169 | 170 | ### training command 171 | # 172 | custom-training = "mkdir -p $working-dir/lm/rdlm/rdlm_head$rdlm-working-dir && $moses-script-dir/training/rdlm/train_rdlm.py \ 173 | --nplm-home $nplm-dir --working-dir $working-dir/lm/rdlm/rdlm_head$rdlm-working-dir \ 174 | --output-dir $working-dir/lm/rdlm/rdlm_head$rdlm-working-dir --output-model rdlm_head \ 175 | --mode head --output-vocab-size 500000 --noise 100 --left-context-size $rdlm-left-context \ 176 | --right-context-size $rdlm-right-context --up-context-size $rdlm-up-context \ 177 | --epochs 10 --mmap" 178 | 179 | # we train two RDLMs, but only need one entry in the config, so we leave this empty 180 | config-feature-line = " " 181 | config-weight-line = " " 182 | 183 | 184 | ### Relational Dependency LM trained on concatenation of other training corpora [label model] 185 | [LM:RDLM2] 186 | 187 | ### define which corpora to concatenate 188 | # we use -split here because we do not want to strip away syntactic markup 189 | # 190 | split-corpus = [LM:RDLM:split-corpus] 191 | 192 | ### tell INTERPOLATED-LM to ignore this model 193 | # 194 | exclude-from-interpolation = true 195 | 196 | ### syntactic = true indicate that custom-training should take file with syntactic markup as input (requires mock-output-parser-lm) 197 | # 198 | syntactic = true 199 | 200 | ### training command 201 | # 202 | custom-training = "mkdir -p $working-dir/lm/rdlm/rdlm_label$rdlm-working-dir && $moses-script-dir/training/rdlm/train_rdlm.py \ 203 | --nplm-home $nplm-dir --working-dir $working-dir/lm/rdlm/rdlm_label$rdlm-working-dir \ 204 | --output-dir $working-dir/lm/rdlm/rdlm_label$rdlm-working-dir --output-model rdlm_label \ 205 | --mode label --output-vocab-size 75 --noise 50 --left-context-size $rdlm-left-context \ 206 | --right-context-size $rdlm-right-context --up-context-size $rdlm-up-context \ 207 | --epochs 10 --mmap" 208 | 209 | ### manually specify feature and weight lines for moses.ini (required for custom-training) 210 | # 211 | config-feature-line = "RDLM path_head_lm=$working-dir/lm/rdlm/rdlm_head$rdlm-working-dir/rdlm_head.model.nplm path_label_lm=$working-dir/lm/rdlm/rdlm_label$rdlm-working-dir/rdlm_label.model.nplm backoff=true premultiply=true context_left=$rdlm-left-context context_right=$rdlm-right-context context_up=$rdlm-up-context binarized=full" 212 | config-weight-line = "RDLM0= 0.1 0.1" 213 | 214 | 215 | ################################################################# 216 | # 
INTERPOLATING LANGUAGE MODELS 217 | 218 | [INTERPOLATED-LM] 219 | 220 | # if multiple language models are used, these may be combined 221 | # by optimizing perplexity on a tuning set 222 | # see, for instance [Koehn and Schwenk, IJCNLP 2008] 223 | 224 | ### script to interpolate language models 225 | # if commented out, no interpolation is performed 226 | # 227 | script = $moses-script-dir/ems/support/interpolate-lm.perl 228 | 229 | ### tuning set 230 | # you may use the same set that is used for mert tuning (reference set) 231 | # 232 | raw-tuning = $toy-data/newstest2012.$output-extension 233 | 234 | ### script to use for binary table format for irstlm or kenlm 235 | # kenlm, also set type to 8 236 | lm-binarizer = $moses-src-dir/bin/build_binary 237 | type = 8 238 | 239 | ################################################################# 240 | # TRANSLATION MODEL TRAINING 241 | 242 | [TRAINING] 243 | 244 | ### training script to be used: either a legacy script or 245 | # current moses training script (default) 246 | # 247 | script = $moses-script-dir/training/train-model.perl 248 | 249 | ### general options 250 | # 251 | training-options = "-mgiza -mgiza-cpus 8 -sort-buffer-size 10G -sort-compress gzip -cores 16 -alt-direct-rule-score-2 --ghkm-tree-fragment" 252 | 253 | ### symmetrization method to obtain word alignments from giza output 254 | # (commonly used: grow-diag-final-and) 255 | # 256 | alignment-symmetrization-method = grow-diag-final-and 257 | 258 | run-giza-in-parts = 5 259 | 260 | ### if word alignment (giza symmetrization) should be skipped, 261 | # point to word alignment files 262 | # 263 | # word-alignment = 264 | 265 | ### hierarchical rule set 266 | # 267 | hierarchical-rule-set = true 268 | use-ghkm = true 269 | use-pcfg-feature = true 270 | use-unknown-word-soft-matches = true 271 | dont-tune-glue-grammar = true 272 | 273 | extract-settings = "--UnknownWordMinRelFreq 0.01 --MaxNodes 40 --MaxRuleDepth 7 --MaxRuleSize 7 --AllowUnary" 274 | score-settings = " --GoodTuring --LowCountFeature --MinCountHierarchical 2 --MinScore 2:0.0001" 275 | 276 | 277 | ### if phrase extraction should be skipped, 278 | # point to stem for extract files 279 | # 280 | # extracted-phrases = 281 | 282 | ### if phrase table training should be skipped, 283 | # point to phrase translation table 284 | # 285 | # phrase-translation-table = 286 | 287 | ### if training should be skipped, 288 | # point to a configuration file that contains 289 | # pointers to all relevant model files 290 | # config = 291 | 292 | ####################################################### TUNING: finding good weights for model components 293 | 294 | [TUNING] 295 | 296 | ### instead of tuning with this setting, old weights may be recycled 297 | # specify here an old configuration file with matching weights 298 | # 299 | #weight-config = 300 | 301 | ### tuning script to be used 302 | # 303 | tuning-script = $moses-script-dir/training/mert-moses.pl 304 | tuning-settings = "-mertdir $moses-src-dir/bin --batch-mira --return-best-dev -maximum-iterations 25 --threads 16 -batch-mira-args='--sctype BLEU,HWCM'" 305 | 306 | ### specify the corpus used for tuning 307 | # it should contain 100s if not 1000s of sentences 308 | # 309 | raw-input = $toy-data/newstest2012.$input-extension 310 | # tokenized-input = 311 | # factorized-input = 312 | # input = 313 | 314 | inputtype = 3 315 | 316 | raw-reference = $toy-data/newstest2012.$output-extension 317 | # tokenized-reference = 318 | # factorized-reference = 319 | # reference = 320 | 321 
| ### size of n-best list used (typically 100) 322 | # 323 | nbest = 1000 324 | 325 | ### ranges for weights for random initialization 326 | # if not specified, the tuning script will use generic ranges 327 | # it is not clear, if this matters 328 | # 329 | # lambda = 330 | 331 | ### additional flags for the decoder 332 | # 333 | decoder-settings = "-feature-overwrite 'TranslationModel0 table-limit=100' -threads 8 -max-chart-span 50 -rule-limit 50 -n-best-trees" 334 | 335 | ### if tuning should be skipped, specify this here 336 | # and also point to a configuration file that contains 337 | # pointers to all relevant model files 338 | # 339 | 340 | 341 | ######################################################### 342 | ## RECASER: restore case, this part only trains the model 343 | 344 | [RECASING] 345 | 346 | #decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm 347 | 348 | ### training data 349 | # raw input needs to be still tokenized, 350 | # also also tokenized input may be specified 351 | # 352 | #tokenized = [LM:europarl:tokenized-corpus] 353 | 354 | # recase-config = 355 | 356 | #lm-training = $moses-src-dir/srilm/bin/i686/ngram-count 357 | 358 | ####################################################### 359 | ## TRUECASER: train model to truecase corpora and input 360 | 361 | [TRUECASER] 362 | 363 | ### script to train truecaser models 364 | # 365 | trainer = $moses-script-dir/recaser/train-truecaser.perl 366 | 367 | ### training data 368 | # raw input needs to be still tokenized, 369 | # also also tokenized input may be specified 370 | # 371 | # tokenized-stem = $working-dir/data/ep+nc 372 | 373 | ### trained model 374 | # 375 | #truecase-model = 376 | 377 | ############################################################ 378 | ## EVALUATION: translating a test set using the tuned system 379 | 380 | [EVALUATION] 381 | 382 | ### number of jobs (if parallel execution of testing) 383 | # 384 | jobs = 10 385 | 386 | filter-settings = " " 387 | 388 | 389 | ### prepare system output for scoring 390 | # this may include detokenization and wrapping output in sgm 391 | # (needed for nist-bleu, ter, meteor) 392 | # 393 | #recaser = $moses-script-dir/recaser/recase.perl 394 | wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension" 395 | # output-sgm = 396 | 397 | ### should output be scored case-sensitive (default: no)? 
398 | # 399 | # case-sensitive = yes 400 | 401 | ### BLEU 402 | # 403 | nist-bleu = $moses-script-dir/generic/mteval-v13a.pl 404 | nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c" 405 | # multi-bleu = $edinburgh-script-dir/multi-bleu.perl 406 | # ibm-bleu = 407 | 408 | ### TER: translation error rate (BBN metric) based on edit distance 409 | # 410 | # ter = $edinburgh-script-dir/tercom_v6a.pl 411 | 412 | ### METEOR: gives credit to stem / worknet synonym matches 413 | # 414 | # meteor = 415 | 416 | ### Analysis: carry out various forms of analysis on the output 417 | # 418 | analysis = $moses-script-dir/ems/support/analysis.perl 419 | #analyze-coverage = yes 420 | report-segmentation = yes 421 | 422 | 423 | [EVALUATION:newstest2013] 424 | decoder-settings = "-feature-overwrite 'TranslationModel0 table-limit=100' -threads 8 -max-chart-span 50 -rule-limit 100" 425 | input-sgm = $toy-data/newstest2013-src.$input-extension.sgm 426 | wrapping-frame = $input-sgm 427 | reference-sgm = $toy-data/newstest2013-ref.$output-extension.sgm 428 | 429 | [REPORTING] 430 | 431 | ### what to do with result (default: store in file evaluation/report) 432 | # 433 | # email = pkoehn@inf.ed.ac.uk 434 | 435 | -------------------------------------------------------------------------------- /enrich_labelset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | # this script modifies the ParZu grammar output to a representation that is more suitable for SMT: 6 | # ambiguous labels are split, and optionally enriched with morphological information. The script also restructures coordinations. 7 | # The modifications (the subset used for the WMT 2014 shared translation task EN-DE) are described in: 8 | # Rico Sennrich, Philip Williams, Matthias Huck (2015): 9 | # A tree does not make a well-formed sentence: Improving syntactic string-to-tree statistical machine translation with more linguistic knowledge. 10 | # In: Computer Speech & Language 32(1), 27-45. 
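#
# Expected input (one token per line, blank line between sentences) is ParZu's
# ten-column CoNLL-style output; the columns are read into the fields listed in
# KEYWORDS below (pos, word, lemma, tag, tag2, morph, head, func, proj_head,
# proj_func), and the converted annotation is written to stdout in the same order.
#
# In the example configs (example/toy_example*.config) the script sits in the
# output-parser pipeline between ParZu and conll2mosesxml.py, e.g. (paths abbreviated):
#
#   ... | parzu -i tokenized_lines --projective | enrich_labelset.py --wmt15 | conll2mosesxml.py
#
# (the WMT 2014 configuration passes --wmt14 instead of --wmt15)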
11 | 12 | from __future__ import print_function, unicode_literals 13 | import sys 14 | import codecs 15 | from collections import defaultdict 16 | 17 | #at which point in the morphological output is case information stored 18 | CASE_POSITION = {b'ADJA':2 19 | ,b'PPER':3 20 | ,b'ART':2 21 | ,b'APPRART':0 22 | ,b'APPR':0 23 | ,b'APPO':0 24 | ,b'PRF':2 25 | ,b'NN':1 26 | ,b'FM':1 27 | ,b'NE':1 28 | ,b'PIS':1 29 | ,b'PIAT':1 30 | ,b'PDS':1 31 | ,b'PIDAT':1 32 | ,b'PPOSS':1 33 | ,b'PPOSAT':1 34 | ,b'PRELS':1 35 | ,b'PRELAT':1 36 | ,b'PWS':1 37 | ,b'PWAT':1 38 | } 39 | 40 | 41 | 42 | GENDER_POSITION = {b'ADJA':1 43 | ,b'PPER':2 44 | ,b'ART':1 45 | ,b'NN':0 46 | ,b'FM':0 47 | ,b'NE':0 48 | ,b'PIS':0 49 | ,b'PIAT':0 50 | ,b'PDS':0 51 | ,b'PIDAT':0 52 | ,b'PPOSS':0 53 | ,b'PRELS':0 54 | ,b'PPOSAT':0 55 | ,b'PRELAT':0 56 | ,b'PWS':0 57 | ,b'PWAT':0 58 | } 59 | 60 | 61 | NUMBER_POSITION = {b'ADJA':3 62 | ,b'PPER':1 63 | ,b'ART':3 64 | ,b'PRF':1 65 | ,b'NN':2 66 | ,b'FM':2 67 | ,b'NE':2 68 | ,b'PIS':2 69 | ,b'PIAT':2 70 | ,b'PDS':2 71 | ,b'PIDAT':2 72 | ,b'PPOSS':2 73 | ,b'PPOSAT':2 74 | ,b'PRELS':2 75 | ,b'PRELAT':2 76 | ,b'PWS':2 77 | ,b'PWAT':2 78 | ,b'VVFIN':1 79 | ,b'VAFIN':1 80 | ,b'VMFIN':1 81 | } 82 | 83 | 84 | PERSON_POSITION = {b'PPER':0 85 | ,b'VVFIN':0 86 | ,b'VAFIN':0 87 | ,b'VMFIN':0 88 | } 89 | 90 | 91 | KEYWORDS = ['pos','word','lemma','tag','tag2','morph','head','func', 'proj_head', 'proj_func'] 92 | def create_named_dict(values): 93 | return dict(zip(KEYWORDS,values)) 94 | 95 | def sorted_values(named_dict): 96 | return [named_dict[keyword] for keyword in KEYWORDS] 97 | 98 | def write(sentence): 99 | for word in sentence: 100 | sys.stdout.write(b'\t'.join(sorted_values(word)) + b'\n') 101 | sys.stdout.write(b'\n') 102 | 103 | def main(fobj_in): 104 | sentence = [] 105 | for line in fobj_in: 106 | 107 | if line == b"\n": 108 | convert(sentence) 109 | write(sentence) 110 | sentence = [] 111 | continue 112 | 113 | word = create_named_dict(line.split()) 114 | sentence.append(word) 115 | 116 | 117 | def convert(sentence): 118 | 119 | spans = get_spans(sentence) 120 | for word in sentence: 121 | 122 | if word['func'] != word['proj_func']: 123 | sys.stderr.write('Whoops, better check why label and projective label are different\n') 124 | sys.stderr.write(b'\t'.join(sorted_values(word)) + b'\n') 125 | sys.exit(1) 126 | 127 | if word['func'] in CONVERSIONS: 128 | CONVERSIONS[word['func']](word, sentence, spans) 129 | 130 | def get_head(word, sentence): 131 | head_position = int(word['proj_head']) 132 | if head_position: 133 | return sentence[head_position-1] 134 | 135 | def comma_is_kon(word, sentence, spans): 136 | '''if comma joins two coordinated elements, mark this with a new function, 137 | then make it the head of the element to the right, and the dependent of the element to the left. 138 | this allows for recursive addition of new coordinated elements. 
139 | 140 | ''' 141 | 142 | if not 'kon' in CONVERSIONS: 143 | return 144 | 145 | head = get_head(word, sentence) 146 | if head and head['func'] == b'kon' and int(word['proj_head']) > int(word['pos']) and head['tag'] != b'KON': 147 | # make sure projectivity isn't violated 148 | if not any(int(w['proj_head']) > int(word['pos']) or int(w['proj_head']) < int(head['proj_head']) for w in sentence[int(head['proj_head']):int(word['pos'])-1]): 149 | word['proj_head'] = head['proj_head'] 150 | head['proj_head'] = word['pos'] 151 | word['proj_func'] = b'kon' 152 | word['func'] = b'kon' 153 | kon_conversion(word, sentence, spans) 154 | return word['func'] 155 | 156 | def aux_conversion(word, sentence, spans): 157 | '''distinguish between past participle and infinitive auxiliary verbs to avoid overgeneralization.''' 158 | morph_info = b'' 159 | if word['tag2'].endswith(b'PP'): 160 | morph_info = b'_pp' 161 | elif word['tag2'].endswith(b'INF'): 162 | if any(w['tag'] == b'PTKZU' and w['head'] == word['pos'] for w in sentence): 163 | morph_info = b'_izu' 164 | else: 165 | morph_info = b'_inf' 166 | elif word['tag2'].endswith(b'IZU'): 167 | morph_info = b'_izu' 168 | 169 | word['func'] += morph_info 170 | word['proj_func'] += morph_info 171 | 172 | 173 | def root_conversion(word, sentence, spans): 174 | '''distinguish between five types of structures that receive label 'root': 175 | punct: full stops, question marks etc. 176 | comma: commas 177 | bracket: quotation marks, hyphens, and brackets 178 | vroot: full verb; root of a successful parse 179 | root: everything else; typically root of partial trees. 180 | 181 | ''' 182 | morph_info = word['func'] 183 | if word['tag2'] == b'$.': 184 | morph_info = b'punct' 185 | elif word['tag2'] == b'$(': 186 | morph_info = b'bracket' 187 | elif word['tag2'] in [b'VVFIN',b'VMFIN',b'VAFIN']: 188 | # try to only give label 'vroot' to main clause roots, not to verb-last structures that remain unattached in parse 189 | midfield_labels = set(['subj','obja','subjc','adv','pred','pp','objp']) 190 | aux_labels = set(['aux','aux_pp','aux_inf','aux_vvizu']) 191 | direct_dependents_left = [w for w in sentence[:int(word['pos'])] if w['proj_head'] == word['pos'] and w['tag2'] not in ['$,','$(']] 192 | direct_dependents_right = [w for w in sentence[int(word['pos']):] if w['proj_head'] == word['pos']] 193 | if (len(direct_dependents_left) < 2 and not any(w['proj_func'] in aux_labels for w in direct_dependents_left)) or any(w['proj_func'] in midfield_labels for w in direct_dependents_right): 194 | morph_info = b'vroot' 195 | elif word['tag2'] == b'$,': 196 | morph_info = comma_is_kon(word, sentence, spans) 197 | if not morph_info: 198 | morph_info = b'comma' 199 | 200 | # mark remaining roots that cover the full sentence (or anything between two punctuation marks) with 'sroot' 201 | if morph_info == b'root': 202 | dependents = sorted(get_dependents_for_word(word, spans)) 203 | if dependents[0] == 0 or sentence[dependents[0]-1]['tag2'] == b'$.' or (sentence[dependents[0]-1]['tag2'] == b'$(' and (dependents[0]-1 == 0 or sentence[dependents[0]-2]['tag2'] == b'$.')): 204 | if dependents[-1]+1 == len(sentence) or sentence[dependents[-1]+1]['tag2'] == b'$.' 
or (sentence[dependents[-1]+1]['tag2'] == b'$(' and (dependents[-1]+2 == len(sentence) or sentence[dependents[-1]+2]['tag2'] == b'$.')): 205 | morph_info = b'sroot' 206 | 207 | word['func'] = morph_info 208 | word['proj_func'] = morph_info 209 | 210 | 211 | def obji_conversion(word, sentence, spans): 212 | '''distinguish between infinitive with 'zu' and bare infinitive 213 | examples: 214 | ich lasse ihn schlafen/obji_bare 215 | ich bitte ihn, zu schlafen/obji_zu 216 | ''' 217 | morph_info = b'' 218 | if word['tag2'] == b'VVIZU': 219 | morph_info = b'_zu' 220 | elif any(w['tag'] == b'PTKZU' and w['head'] == word['pos'] for w in sentence): 221 | morph_info = b'_zu' 222 | else: 223 | morph_info = b'_bare' 224 | 225 | word['func'] += morph_info 226 | word['proj_func'] += morph_info 227 | 228 | dependents = sorted(get_dependents_for_word(word, spans)) 229 | if sentence[dependents[0]]['proj_func'] == b'comma': 230 | word['func'] += b'_comma' 231 | word['proj_func'] += b'_comma' 232 | 233 | def pn_conversion(word, sentence, spans): 234 | '''add grammatical case to prepositional noun''' 235 | head = get_head(word, sentence) 236 | case = get_morphology(head)['case'] 237 | 238 | if case != b'_': 239 | word['func'] += b'_'+ case 240 | word['proj_func'] += b'_'+ case 241 | 242 | 243 | def np_conversion(word, sentence, spans): 244 | '''enforce agreement within NP (case, number, gender)''' 245 | morph_dict = get_morphology(word) 246 | 247 | # gender doesn't matter for plural agreement 248 | if morph_dict['number'] == 'pl': 249 | morph_dict['gender'] = b'_' 250 | 251 | morph_info = morph_dict['gender'] + b'-' + morph_dict['case'] + b'-' + morph_dict['number'] 252 | 253 | if morph_info != b'_-_-_': 254 | word['func'] += b'_'+ morph_info 255 | word['proj_func'] += b'_'+ morph_info 256 | 257 | 258 | def subj_coord_conversion(word, sentence, spans): 259 | '''mark coordinated subjects (which do not need to agree with verb in number)''' 260 | if any(w['proj_func'] == 'kon' and w['proj_head'] == word['pos'] for w in sentence): 261 | word['func'] = b'csubj' 262 | word['proj_func'] = b'csubj' 263 | 264 | def subj_conversion(word, sentence, spans): 265 | '''enforce agreement between subject and verb (person/number)''' 266 | 267 | head = get_head(word, sentence) 268 | morph_dict = get_morphology(head) 269 | 270 | morph_info = morph_dict['person'] + b'-' + morph_dict['number'] 271 | 272 | if morph_info != b'_-_': 273 | word['func'] += b'_'+ morph_info 274 | word['proj_func'] += b'_'+ morph_info 275 | 276 | 277 | def kon_conversion(word, sentence, spans): 278 | ''' 279 | let elements in coordination copy the label of the first element, 280 | and mark commas and conjunctions with label that specifies what type of structure is coordinated. 
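    illustrative sketch (hypothetical parse): in "Hunde und Katzen schlafen",
    if "Hunde" carries the label 'subj', the conjunction "und" (label 'kon',
    tag KON) becomes 'kon_subj' and the conjunct "Katzen" (label 'cj') copies
    the label 'subj'. coordinated subordinate clauses (rel/objc/subjc/neb heads)
    are collapsed to 'vkon_sub', and comparative clauses ('kom') are left unchanged.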
281 | 282 | ''' 283 | head = get_head(word, sentence) 284 | while head and (head['func'].startswith(b'kon') or head['func'].startswith(b'app') or head['func'].startswith(b'cj')): 285 | head = get_head(head, sentence) 286 | 287 | if head: 288 | headfunc = head['func'] 289 | else: 290 | headfunc = b'root' 291 | 292 | # ignore comparative clause 293 | if headfunc.startswith(b'kom'): 294 | return 295 | 296 | elif headfunc.startswith(b'rel') or headfunc.startswith(b'objc') or headfunc.startswith(b'subjc') or headfunc.startswith(b'neb'): 297 | headfunc = b'vkon_sub' 298 | 299 | # ignore number/person information 300 | elif headfunc.startswith(b'subj'): 301 | headfunc = b'subj' 302 | 303 | if word['func'] == b'cj' and headfunc == b'csubj': 304 | headfunc = b'subj' 305 | 306 | if word['func'] == b'kon' and word['tag'] == b'KON' or word['tag'] == b'$,': 307 | word['func'] += b'_'+ headfunc 308 | word['proj_func'] += b'_'+ headfunc 309 | else: 310 | word['func'] = headfunc 311 | word['proj_func'] = headfunc 312 | 313 | 314 | def gmod_conversion(word, sentence, spans): 315 | '''distinguish between premodifying and postmodifying genitive modifiers 316 | premodifying are typically named entities without articles (Peters X) 317 | postmodifying are typically noun phrases with articles (X der Firma) 318 | 319 | ''' 320 | if int(word['pos']) > int(word['proj_head']): 321 | info = b'post' 322 | else: 323 | info = b'pre' 324 | 325 | word['func'] += b'_'+ info 326 | word['proj_func'] += b'_'+ info 327 | 328 | def pred_conversion(word, sentence, spans): 329 | '''distinguish between adverbial and nominal predicates''' 330 | 331 | info = b'' 332 | if word['tag2'] in [b'ADJD',b'ADV',b'PWAV']: 333 | info = b'_adv' 334 | elif word['tag2'] in [b'NE', b'NN', b'FM', b'PIS', b'PPER', b'PWS', b'ADJA']: 335 | info = b'_nn' 336 | 337 | word['func'] += info 338 | word['proj_func'] += info 339 | 340 | def get_morphology(word): 341 | morph_info = word['morph'].split(b'|') 342 | morph_dict = {} 343 | 344 | tag = word['tag2'] 345 | 346 | try: 347 | morph_dict['case'] = morph_info[CASE_POSITION[tag]].lower() 348 | except (IndexError, KeyError): 349 | morph_dict['case'] = b'_' 350 | 351 | try: 352 | morph_dict['gender'] = morph_info[GENDER_POSITION[tag]].lower() 353 | except (IndexError, KeyError): 354 | morph_dict['gender'] = b'_' 355 | 356 | try: 357 | morph_dict['number'] = morph_info[NUMBER_POSITION[tag]].lower() 358 | except (IndexError, KeyError): 359 | morph_dict['number'] = b'_' 360 | 361 | try: 362 | morph_dict['person'] = morph_info[PERSON_POSITION[tag]].lower() 363 | except (IndexError, KeyError): 364 | morph_dict['person'] = b'_' 365 | 366 | return morph_dict 367 | 368 | 369 | def get_spans(sentence): 370 | spans = {} 371 | dominates = defaultdict(set) 372 | for i,w in enumerate(sentence): 373 | dominates[i].add(i) 374 | head = int(w['proj_head'])-1 375 | while head != -1: 376 | if i in dominates[head]: 377 | break 378 | dominates[head].add(i) 379 | head = int(sentence[head]['proj_head'])-1 380 | 381 | return dominates 382 | 383 | def get_dependents_for_word(word, dependents): 384 | return dependents[int(word['pos'])-1] 385 | 386 | CONVERSIONS = {b'aux':aux_conversion 387 | ,b'root':root_conversion 388 | ,b'obji':obji_conversion 389 | ,b'pn':pn_conversion 390 | ,b'det':np_conversion 391 | ,b'attr':np_conversion 392 | ,b'subj':subj_conversion 393 | ,b'kon':kon_conversion 394 | ,b'cj':kon_conversion 395 | ,b'gmod':gmod_conversion 396 | ,b'pred':pred_conversion 397 | } 398 | 399 | if __name__ == '__main__': 400 | if 
sys.version_info >= (3,0,0): 401 | sys.stdin = sys.stdin.buffer 402 | sys.stdout = sys.stdout.buffer 403 | sys.stderr = sys.stderr.buffer 404 | 405 | # conversions used for WMT 14 406 | if '--wmt14' in sys.argv: 407 | CONVERSIONS = {b'root':root_conversion 408 | ,b'kon':kon_conversion 409 | ,b'cj':kon_conversion 410 | ,b'gmod':gmod_conversion} 411 | 412 | if '--wmt15' in sys.argv: 413 | CONVERSIONS = {b'root':root_conversion 414 | ,b'kon':kon_conversion 415 | ,b'cj':kon_conversion 416 | ,b'gmod':gmod_conversion 417 | ,b'subj':subj_coord_conversion 418 | ,b'obji':obji_conversion} 419 | 420 | if '--coord-subj' in sys.argv: 421 | CONVERSIONS[b'subj'] = subj_coord_conversion 422 | 423 | if '--obji' in sys.argv: 424 | CONVERSIONS[b'obji'] = obji_conversion 425 | 426 | for arg in sys.argv[1:]: 427 | if arg.startswith('--disable_'): 428 | disabled_class = arg.split('_',1)[1].encode('UTF-8') 429 | del CONVERSIONS[disabled_class] 430 | 431 | main(sys.stdin) 432 | -------------------------------------------------------------------------------- /example/toy_example_2015_4.config: -------------------------------------------------------------------------------- 1 | ################################################ 2 | ### CONFIGURATION FILE FOR AN SMT EXPERIMENT ### 3 | ################################################ 4 | 5 | [GENERAL] 6 | 7 | ###### you need to set these paths to match your environemnt 8 | ###### 9 | 10 | moses-src-dir = /home/rsennrich/tools/mosesdecoder 11 | wmt2014-scripts = /home/rsennrich/smtworkspace/wmt2014-scripts 12 | parzu-path = /home/rsennrich/ParZu # https://github.com/rsennrich/ParZu 13 | zmorge-model = /home/rsennrich/zmorge/zmorge-20141224-smor_newlemma.a #get this (or a newer version) from http://kitt.ifi.uzh.ch/kitt/zmorge/ 14 | srilm-dir = /home/rsennrich/tools/srilm/bin/i686-m64/ 15 | external-bin-dir = ~/bin 16 | nplm-dir = /home/rsennrich/tools/nplm-github/ 17 | maltparser = /home/rsennrich/tools/maltparser-1.8.1/maltparser-1.8.1.jar 18 | corenlp = /home/rsennrich/tools/stanford-corenlp-full-2014-10-31 19 | 20 | ###### no further changes should be required to run the toy example 21 | ###### (but feel free to experiment with different settings, or change the training/test data) 22 | 23 | moses-script-dir = $moses-src-dir/scripts 24 | moses-bin-dir = $moses-src-dir/bin 25 | toy-data = $wmt2014-scripts/example/data 26 | working-dir = $wmt2014-scripts/example/working-dir 27 | decoder = $moses-src-dir/bin/moses 28 | 29 | input-tokenizer = "$moses-script-dir/tokenizer/normalize-punctuation.perl $input-extension | $moses-script-dir/tokenizer/tokenizer.perl -l $input-extension -penn" 30 | output-tokenizer = "$moses-script-dir/tokenizer/normalize-punctuation.perl $output-extension | $moses-script-dir/tokenizer/tokenizer.perl -l $output-extension" 31 | input-truecaser = $moses-script-dir/recaser/truecase.perl 32 | output-truecaser = $moses-script-dir/recaser/truecase.perl 33 | detruecaser = $moses-script-dir/recaser/detruecase.perl 34 | 35 | input-parser = "$moses-script-dir/tokenizer/deescape-special-chars.perl \ 36 | | python $moses-script-dir/training/wrappers/parse-en-stanford.py --stanford $corenlp \ 37 | | java -jar $maltparser -c pproj -m proj -pp baseline -pcr head \ 38 | | python $moses-script-dir/training/wrappers/conll2mosesxml.py" 39 | 40 | # parsing pipeline used for WMT 2014 41 | output-parser = "$moses-script-dir/tokenizer/deescape-special-chars.perl | $parzu-path/parzu -i tokenized_lines --projective | $wmt2014-scripts/enrich_labelset.py --wmt15 | 
$moses-script-dir/training/wrappers/conll2mosesxml.py" 42 | 43 | # also parse tuning/evaluation reference files 44 | mock-output-parser-references = $output-parser 45 | mock-output-parser-lm = $output-parser 46 | 47 | # SAMT relaxation for soft source-syntactic constraints 48 | input-parse-relaxer = "$moses-src-dir/bin/relax-parse --SAMT 2" 49 | 50 | # head binarization 51 | output-parse-relaxer = "$wmt2014-scripts/emnlp2015/binarize.py head" 52 | 53 | inputtype = 3 54 | 55 | # hybrid compound splitting (described in Sennrich, Williams and Huck, 2015) 56 | output-splitter = "$wmt2014-scripts/hybrid_compound_splitter.py -smor $zmorge-model -write-filler -no-truecase -q -syntax -dependency -fewest" 57 | 58 | # sed instructions unsplit the split compunds from output-splitter 59 | detokenizer = "$moses-script-dir/tokenizer/detokenizer.perl -l $output-extension | sed -r 's/ \@(\S*?)\@ /\1/g' | sed -r 's/\@\@ //g'" 60 | 61 | input-extension = en 62 | output-extension = de 63 | pair-extension = de-en 64 | 65 | generic-parallelizer = $moses-script-dir/ems/support/generic-multicore-parallelizer.perl 66 | 67 | jobs = 10 68 | 69 | ################################################################# 70 | # PARALLEL CORPUS PREPARATION: 71 | # create a tokenized, sentence-aligned corpus, ready for training 72 | 73 | [CORPUS] 74 | 75 | cores = 10 76 | 77 | ### tools to use to prepare the data 78 | # 79 | #tokenizer = 80 | #lowercaser = 81 | 82 | ### long sentences are filtered out, since they slow down GIZA++ 83 | # and are a less reliable source of data. set here the maximum 84 | # length of a sentence 85 | # 86 | max-sentence-length = 80 87 | 88 | ### GIZA++ does not allow sentence pairs of highly uneven length. 89 | # since uneven sentence length is an indicator of a misalignment, 90 | # we set a maximum ratio of 3 (this also gives us room for compoudn splitting) 91 | # 92 | cleaner = "$moses-script-dir/training/clean-corpus-n.perl -ratio 3" 93 | 94 | [CORPUS:parallelA] 95 | raw-stem = $toy-data/parallelA.$pair-extension 96 | 97 | [CORPUS:parallelB] 98 | raw-stem = $toy-data/parallelB.$pair-extension 99 | 100 | [CORPUS:parallelC] 101 | # if you do your own parsing (or wanna re-use other data, like http://statmt.org/rsennrich/parsed_wmt/ ), 102 | # you can add parsed corpora to your system like this 103 | clean-parsed-stem = $toy-data/parallelC.$pair-extension.parsed_both 104 | 105 | ################################################################# 106 | # LANGUAGE MODEL TRAINING 107 | 108 | [LM] 109 | 110 | cores = 10 111 | 112 | ### tool to be used for language model training 113 | # for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh) 114 | # 115 | lm-training = $srilm-dir/ngram-count 116 | settings = "-interpolate -kndiscount -unk" 117 | order = 5 118 | 119 | ### script to use for binary table format 120 | # (default: no binarization) 121 | # 122 | #lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm 123 | 124 | # kenlm, also set type to 8 125 | #lm-binarizer = $moses-src-dir/kenlm/build_binary 126 | #type = 8 127 | 128 | ### script to create quantized language model format 129 | # (default: no quantization) 130 | # 131 | #lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm 132 | 133 | ### tools to use to prepare the data 134 | # 135 | #tokenizer = 136 | #lowercaser = 137 | 138 | ### each language model to be used has its own section here 139 | 140 | ### if corpus preparation should be skipped, 141 | # point to the prepared language model 142 | # 143 | #lm = 144 | 145 | # internal 
settings for RDLM 146 | # IMPORTANT: update rdlm-working-dir when training a new RDLM to avoid overwriting old files 147 | rdlm-left-context = 3 148 | rdlm-right-context = 3 149 | rdlm-up-context = 2 150 | rdlm-working-dir = 1 151 | 152 | [LM:parallelA] 153 | raw-corpus = $toy-data/parallelA.$pair-extension.$output-extension 154 | 155 | [LM:parallelB] 156 | raw-corpus = $toy-data/parallelB.$pair-extension.$output-extension 157 | 158 | [LM:parallelC] 159 | # if you do your own parsing (or wanna re-use other data, like http://statmt.org/rsennrich/parsed_wmt/ ), 160 | # you can add parsed corpora to your system like this 161 | mock-parsed-corpus = $toy-data/parallelC.$pair-extension.parsed_both.$output-extension 162 | 163 | [LM:monolingualA] 164 | raw-corpus = $toy-data/monolingualA.$output-extension 165 | 166 | ### Relational Dependency LM trained on concatenation of other training corpora [head model] 167 | [LM:RDLM] 168 | 169 | ### define which corpora to concatenate 170 | # we use -split here because we do not want to strip away syntactic markup 171 | # 172 | concatenate-files-split = [LM:{parallelA,parallelB,monolingualA}:split-corpus] 173 | 174 | ### tell INTERPOLATED-LM to ignore this model 175 | # 176 | exclude-from-interpolation = true 177 | 178 | ### syntactic = true indicate that custom-training should take file with syntactic markup as input (requires mock-output-parser-lm) 179 | # 180 | syntactic = true 181 | 182 | ### training command 183 | # 184 | custom-training = "mkdir -p $working-dir/lm/rdlm/rdlm_head$rdlm-working-dir && $moses-script-dir/training/rdlm/train_rdlm.py \ 185 | --nplm-home $nplm-dir --working-dir $working-dir/lm/rdlm/rdlm_head$rdlm-working-dir \ 186 | --output-dir $working-dir/lm/rdlm/rdlm_head$rdlm-working-dir --output-model rdlm_head \ 187 | --mode head --output-vocab-size 500000 --noise 100 --left-context-size $rdlm-left-context \ 188 | --right-context-size $rdlm-right-context --up-context-size $rdlm-up-context \ 189 | --epochs 10 --mmap" 190 | 191 | # we train two RDLMs, but only need one entry in the config, so we leave this empty 192 | config-feature-line = " " 193 | config-weight-line = " " 194 | 195 | 196 | ### Relational Dependency LM trained on concatenation of other training corpora [label model] 197 | [LM:RDLM2] 198 | 199 | ### define which corpora to concatenate 200 | # we use -split here because we do not want to strip away syntactic markup 201 | # 202 | split-corpus = [LM:RDLM:split-corpus] 203 | 204 | ### tell INTERPOLATED-LM to ignore this model 205 | # 206 | exclude-from-interpolation = true 207 | 208 | ### syntactic = true indicate that custom-training should take file with syntactic markup as input (requires mock-output-parser-lm) 209 | # 210 | syntactic = true 211 | 212 | ### training command 213 | # 214 | custom-training = "mkdir -p $working-dir/lm/rdlm/rdlm_label$rdlm-working-dir && $moses-script-dir/training/rdlm/train_rdlm.py \ 215 | --nplm-home $nplm-dir --working-dir $working-dir/lm/rdlm/rdlm_label$rdlm-working-dir \ 216 | --output-dir $working-dir/lm/rdlm/rdlm_label$rdlm-working-dir --output-model rdlm_label \ 217 | --mode label --output-vocab-size 75 --noise 50 --left-context-size $rdlm-left-context \ 218 | --right-context-size $rdlm-right-context --up-context-size $rdlm-up-context \ 219 | --epochs 10 --mmap" 220 | 221 | ### manually specify feature and weight lines for moses.ini (required for custom-training) 222 | # 223 | config-feature-line = "RDLM path_head_lm=$working-dir/lm/rdlm/rdlm_head$rdlm-working-dir/rdlm_head.model.nplm 
path_label_lm=$working-dir/lm/rdlm/rdlm_label$rdlm-working-dir/rdlm_label.model.nplm backoff=true premultiply=true context_left=$rdlm-left-context context_right=$rdlm-right-context context_up=$rdlm-up-context binarized=full" 224 | config-weight-line = "RDLM0= 0.1 0.1" 225 | 226 | 227 | ################################################################# 228 | # INTERPOLATING LANGUAGE MODELS 229 | 230 | [INTERPOLATED-LM] 231 | 232 | # if multiple language models are used, these may be combined 233 | # by optimizing perplexity on a tuning set 234 | # see, for instance [Koehn and Schwenk, IJCNLP 2008] 235 | 236 | ### script to interpolate language models 237 | # if commented out, no interpolation is performed 238 | # 239 | script = $moses-script-dir/ems/support/interpolate-lm.perl 240 | 241 | ### tuning set 242 | # you may use the same set that is used for mert tuning (reference set) 243 | # 244 | raw-tuning = $toy-data/newstest2012.$output-extension 245 | 246 | ### script to use for binary table format for irstlm or kenlm 247 | # kenlm, also set type to 8 248 | lm-binarizer = $moses-src-dir/bin/build_binary 249 | type = 8 250 | 251 | ################################################################# 252 | # TRANSLATION MODEL TRAINING 253 | 254 | [TRAINING] 255 | 256 | ### training script to be used: either a legacy script or 257 | # current moses training script (default) 258 | # 259 | script = $moses-script-dir/training/train-model.perl 260 | 261 | ### general options 262 | # 263 | training-options = "-mgiza -mgiza-cpus 8 -sort-buffer-size 10G -sort-compress gzip -cores 16 -alt-direct-rule-score-2 --ghkm-tree-fragment" 264 | 265 | ### symmetrization method to obtain word alignments from giza output 266 | # (commonly used: grow-diag-final-and) 267 | # 268 | alignment-symmetrization-method = grow-diag-final-and 269 | 270 | run-giza-in-parts = 5 271 | 272 | ### if word alignment (giza symmetrization) should be skipped, 273 | # point to word alignment files 274 | # 275 | # word-alignment = 276 | 277 | ### hierarchical rule set 278 | # 279 | hierarchical-rule-set = true 280 | use-ghkm = true 281 | use-pcfg-feature = true 282 | use-unknown-word-soft-matches = true 283 | dont-tune-glue-grammar = true 284 | ghkm-source-labels = true 285 | 286 | extract-settings = "--UnknownWordMinRelFreq 0.01 --MaxNodes 40 --MaxRuleDepth 7 --MaxRuleSize 7 --AllowUnary" 287 | score-settings = " --GoodTuring --LowCountFeature --MinCountHierarchical 2 --MinScore 2:0.0001" 288 | 289 | 290 | ### if phrase extraction should be skipped, 291 | # point to stem for extract files 292 | # 293 | # extracted-phrases = 294 | 295 | ### if phrase table training should be skipped, 296 | # point to phrase translation table 297 | # 298 | # phrase-translation-table = 299 | 300 | ### if training should be skipped, 301 | # point to a configuration file that contains 302 | # pointers to all relevant model files 303 | # config = 304 | 305 | ####################################################### TUNING: finding good weights for model components 306 | 307 | [TUNING] 308 | 309 | ### instead of tuning with this setting, old weights may be recycled 310 | # specify here an old configuration file with matching weights 311 | # 312 | #weight-config = 313 | 314 | ### tuning script to be used 315 | # 316 | tuning-script = $moses-script-dir/training/mert-moses.pl 317 | tuning-settings = "-mertdir $moses-src-dir/bin --batch-mira --return-best-dev -maximum-iterations 25 --threads 16 -batch-mira-args='--sctype BLEU,HWCM'" 318 | 319 | ### specify the corpus 
used for tuning 320 | # it should contain 100s if not 1000s of sentences 321 | # 322 | raw-input = $toy-data/newstest2012.$input-extension 323 | # tokenized-input = 324 | # factorized-input = 325 | # input = 326 | 327 | inputtype = 3 328 | 329 | raw-reference = $toy-data/newstest2012.$output-extension 330 | # tokenized-reference = 331 | # factorized-reference = 332 | # reference = 333 | 334 | ### size of n-best list used (typically 100) 335 | # 336 | nbest = 1000 337 | 338 | ### ranges for weights for random initialization 339 | # if not specified, the tuning script will use generic ranges 340 | # it is not clear, if this matters 341 | # 342 | # lambda = 343 | 344 | ### additional flags for the decoder 345 | # 346 | decoder-settings = "-feature-overwrite 'TranslationModel0 table-limit=100' -threads 8 -max-chart-span 50 -rule-limit 50 -n-best-trees" 347 | 348 | ### if tuning should be skipped, specify this here 349 | # and also point to a configuration file that contains 350 | # pointers to all relevant model files 351 | # 352 | 353 | 354 | ######################################################### 355 | ## RECASER: restore case, this part only trains the model 356 | 357 | [RECASING] 358 | 359 | #decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm 360 | 361 | ### training data 362 | # raw input needs to be still tokenized, 363 | # also also tokenized input may be specified 364 | # 365 | #tokenized = [LM:europarl:tokenized-corpus] 366 | 367 | # recase-config = 368 | 369 | #lm-training = $moses-src-dir/srilm/bin/i686/ngram-count 370 | 371 | ####################################################### 372 | ## TRUECASER: train model to truecase corpora and input 373 | 374 | [TRUECASER] 375 | 376 | ### script to train truecaser models 377 | # 378 | trainer = $moses-script-dir/recaser/train-truecaser.perl 379 | 380 | ### training data 381 | # raw input needs to be still tokenized, 382 | # also also tokenized input may be specified 383 | # 384 | # tokenized-stem = $working-dir/data/ep+nc 385 | 386 | ### trained model 387 | # 388 | #truecase-model = 389 | 390 | ############################################################ 391 | ## EVALUATION: translating a test set using the tuned system 392 | 393 | [EVALUATION] 394 | 395 | ### number of jobs (if parallel execution of testing) 396 | # 397 | jobs = 10 398 | 399 | filter-settings = " " 400 | 401 | 402 | ### prepare system output for scoring 403 | # this may include detokenization and wrapping output in sgm 404 | # (needed for nist-bleu, ter, meteor) 405 | # 406 | #recaser = $moses-script-dir/recaser/recase.perl 407 | wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension" 408 | # output-sgm = 409 | 410 | ### should output be scored case-sensitive (default: no)? 
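# (note: independently of this switch, the 'nist-bleu-c' entry below already
# reports a case-sensitive BLEU score by calling mteval-v13a.pl with -c)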
411 | # 412 | # case-sensitive = yes 413 | 414 | ### BLEU 415 | # 416 | nist-bleu = $moses-script-dir/generic/mteval-v13a.pl 417 | nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c" 418 | # multi-bleu = $edinburgh-script-dir/multi-bleu.perl 419 | # ibm-bleu = 420 | 421 | ### TER: translation error rate (BBN metric) based on edit distance 422 | # 423 | # ter = $edinburgh-script-dir/tercom_v6a.pl 424 | 425 | ### METEOR: gives credit to stem / worknet synonym matches 426 | # 427 | # meteor = 428 | 429 | ### Analysis: carry out various forms of analysis on the output 430 | # 431 | analysis = $moses-script-dir/ems/support/analysis.perl 432 | #analyze-coverage = yes 433 | report-segmentation = yes 434 | 435 | 436 | [EVALUATION:newstest2013] 437 | decoder-settings = "-feature-overwrite 'TranslationModel0 table-limit=100' -threads 8 -max-chart-span 50 -rule-limit 100" 438 | input-sgm = $toy-data/newstest2013-src.$input-extension.sgm 439 | wrapping-frame = $input-sgm 440 | reference-sgm = $toy-data/newstest2013-ref.$output-extension.sgm 441 | 442 | [REPORTING] 443 | 444 | ### what to do with result (default: store in file evaluation/report) 445 | # 446 | # email = pkoehn@inf.ed.ac.uk 447 | 448 | -------------------------------------------------------------------------------- /example/toy_example_2015_5.config: -------------------------------------------------------------------------------- 1 | ################################################ 2 | ### CONFIGURATION FILE FOR AN SMT EXPERIMENT ### 3 | ################################################ 4 | 5 | [GENERAL] 6 | 7 | ###### you need to set these paths to match your environemnt 8 | ###### 9 | 10 | moses-src-dir = /home/rsennrich/tools/mosesdecoder 11 | wmt2014-scripts = /home/rsennrich/smtworkspace/wmt2014-scripts 12 | parzu-path = /home/rsennrich/ParZu # https://github.com/rsennrich/ParZu 13 | zmorge-model = /home/rsennrich/zmorge/zmorge-20141224-smor_newlemma.a #get this (or a newer version) from http://kitt.ifi.uzh.ch/kitt/zmorge/ 14 | srilm-dir = /home/rsennrich/tools/srilm/bin/i686-m64/ 15 | external-bin-dir = ~/bin 16 | nplm-dir = /home/rsennrich/tools/nplm-github/ 17 | maltparser = /home/rsennrich/tools/maltparser-1.8.1/maltparser-1.8.1.jar 18 | corenlp = /home/rsennrich/tools/stanford-corenlp-full-2014-10-31 19 | 20 | ###### no further changes should be required to run the toy example 21 | ###### (but feel free to experiment with different settings, or change the training/test data) 22 | 23 | moses-script-dir = $moses-src-dir/scripts 24 | moses-bin-dir = $moses-src-dir/bin 25 | toy-data = $wmt2014-scripts/example/data 26 | working-dir = $wmt2014-scripts/example/working-dir 27 | decoder = $moses-src-dir/bin/moses 28 | 29 | input-tokenizer = "$moses-script-dir/tokenizer/normalize-punctuation.perl $input-extension | $moses-script-dir/tokenizer/tokenizer.perl -l $input-extension -penn" 30 | output-tokenizer = "$moses-script-dir/tokenizer/normalize-punctuation.perl $output-extension | $moses-script-dir/tokenizer/tokenizer.perl -l $output-extension" 31 | input-truecaser = $moses-script-dir/recaser/truecase.perl 32 | output-truecaser = $moses-script-dir/recaser/truecase.perl 33 | detruecaser = $moses-script-dir/recaser/detruecase.perl 34 | 35 | input-parser = "$moses-script-dir/tokenizer/deescape-special-chars.perl \ 36 | | python $moses-script-dir/training/wrappers/parse-en-stanford.py --stanford $corenlp \ 37 | | java -jar $maltparser -c pproj -m proj -pp baseline -pcr head \ 38 | | python 
$moses-script-dir/training/wrappers/conll2mosesxml.py" 39 | 40 | # parsing pipeline used for WMT 2014 41 | output-parser = "$moses-script-dir/tokenizer/deescape-special-chars.perl | $parzu-path/parzu -i tokenized_lines --projective | $wmt2014-scripts/enrich_labelset.py --wmt15 | $moses-script-dir/training/wrappers/conll2mosesxml.py" 42 | 43 | # also parse tuning/evaluation reference files 44 | mock-output-parser-references = $output-parser 45 | mock-output-parser-lm = $output-parser 46 | 47 | # SAMT relaxation for soft source-syntactic constraints 48 | input-parse-relaxer = "$moses-src-dir/bin/relax-parse --SAMT 2" 49 | 50 | # head binarization 51 | output-parse-relaxer = "$wmt2014-scripts/emnlp2015/binarize.py head" 52 | 53 | inputtype = 3 54 | 55 | # hybrid compound splitting (described in Sennrich, Williams and Huck, 2015) 56 | output-splitter = "$wmt2014-scripts/hybrid_compound_splitter.py -smor $zmorge-model -write-filler -no-truecase -q -syntax -dependency -fewest" 57 | 58 | # sed instructions unsplit the split compunds from output-splitter 59 | detokenizer = "$moses-script-dir/tokenizer/detokenizer.perl -l $output-extension | sed -r 's/ \@(\S*?)\@ /\1/g' | sed -r 's/\@\@ //g'" 60 | 61 | input-extension = en 62 | output-extension = de 63 | pair-extension = de-en 64 | 65 | generic-parallelizer = $moses-script-dir/ems/support/generic-multicore-parallelizer.perl 66 | 67 | jobs = 10 68 | 69 | ################################################################# 70 | # PARALLEL CORPUS PREPARATION: 71 | # create a tokenized, sentence-aligned corpus, ready for training 72 | 73 | [CORPUS] 74 | 75 | cores = 10 76 | 77 | ### tools to use to prepare the data 78 | # 79 | #tokenizer = 80 | #lowercaser = 81 | 82 | ### long sentences are filtered out, since they slow down GIZA++ 83 | # and are a less reliable source of data. set here the maximum 84 | # length of a sentence 85 | # 86 | max-sentence-length = 80 87 | 88 | ### GIZA++ does not allow sentence pairs of highly uneven length. 
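# (worked example for the -ratio 3 setting below: a pair with 10 source tokens
# and 35 target tokens is discarded, since 35/10 > 3)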
89 | # since uneven sentence length is an indicator of a misalignment, 90 | # we set a maximum ratio of 3 (this also gives us room for compoudn splitting) 91 | # 92 | cleaner = "$moses-script-dir/training/clean-corpus-n.perl -ratio 3" 93 | 94 | [CORPUS:parallelA] 95 | raw-stem = $toy-data/parallelA.$pair-extension 96 | 97 | [CORPUS:parallelB] 98 | raw-stem = $toy-data/parallelB.$pair-extension 99 | 100 | [CORPUS:parallelC] 101 | # if you do your own parsing (or wanna re-use other data, like http://statmt.org/rsennrich/parsed_wmt/ ), 102 | # you can add parsed corpora to your system like this 103 | clean-parsed-stem = $toy-data/parallelC.$pair-extension.parsed_both 104 | 105 | ################################################################# 106 | # LANGUAGE MODEL TRAINING 107 | 108 | [LM] 109 | 110 | cores = 10 111 | 112 | ### tool to be used for language model training 113 | # for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh) 114 | # 115 | lm-training = $srilm-dir/ngram-count 116 | settings = "-interpolate -kndiscount -unk" 117 | order = 5 118 | 119 | ### script to use for binary table format 120 | # (default: no binarization) 121 | # 122 | #lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm 123 | 124 | # kenlm, also set type to 8 125 | #lm-binarizer = $moses-src-dir/kenlm/build_binary 126 | #type = 8 127 | 128 | ### script to create quantized language model format 129 | # (default: no quantization) 130 | # 131 | #lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm 132 | 133 | ### tools to use to prepare the data 134 | # 135 | #tokenizer = 136 | #lowercaser = 137 | 138 | ### each language model to be used has its own section here 139 | 140 | ### if corpus preparation should be skipped, 141 | # point to the prepared language model 142 | # 143 | #lm = 144 | 145 | # internal settings for RDLM 146 | # IMPORTANT: update rdlm-working-dir when training a new RDLM to avoid overwriting old files 147 | rdlm-left-context = 3 148 | rdlm-right-context = 3 149 | rdlm-up-context = 2 150 | rdlm-working-dir = 1 151 | 152 | [LM:parallelA] 153 | raw-corpus = $toy-data/parallelA.$pair-extension.$output-extension 154 | 155 | [LM:parallelB] 156 | raw-corpus = $toy-data/parallelB.$pair-extension.$output-extension 157 | 158 | [LM:parallelC] 159 | # if you do your own parsing (or wanna re-use other data, like http://statmt.org/rsennrich/parsed_wmt/ ), 160 | # you can add parsed corpora to your system like this 161 | mock-parsed-corpus = $toy-data/parallelC.$pair-extension.parsed_both.$output-extension 162 | 163 | [LM:monolingualA] 164 | raw-corpus = $toy-data/monolingualA.$output-extension 165 | 166 | ### Relational Dependency LM trained on concatenation of other training corpora [head model] 167 | [LM:RDLM] 168 | 169 | ### define which corpora to concatenate 170 | # we use -split here because we do not want to strip away syntactic markup 171 | # 172 | concatenate-files-split = [LM:{parallelA,parallelB,monolingualA}:split-corpus] 173 | 174 | ### tell INTERPOLATED-LM to ignore this model 175 | # 176 | exclude-from-interpolation = true 177 | 178 | ### syntactic = true indicate that custom-training should take file with syntactic markup as input (requires mock-output-parser-lm) 179 | # 180 | syntactic = true 181 | 182 | ### training command 183 | # 184 | custom-training = "mkdir -p $working-dir/lm/rdlm/rdlm_head$rdlm-working-dir && $moses-script-dir/training/rdlm/train_rdlm.py \ 185 | --nplm-home $nplm-dir --working-dir $working-dir/lm/rdlm/rdlm_head$rdlm-working-dir \ 186 | --output-dir 
$working-dir/lm/rdlm/rdlm_head$rdlm-working-dir --output-model rdlm_head \ 187 | --mode head --output-vocab-size 500000 --noise 100 --left-context-size $rdlm-left-context \ 188 | --right-context-size $rdlm-right-context --up-context-size $rdlm-up-context \ 189 | --epochs 10 --mmap" 190 | 191 | # we train two RDLMs, but only need one entry in the config, so we leave this empty 192 | config-feature-line = " " 193 | config-weight-line = " " 194 | 195 | 196 | ### Relational Dependency LM trained on concatenation of other training corpora [label model] 197 | [LM:RDLM2] 198 | 199 | ### define which corpora to concatenate 200 | # we use -split here because we do not want to strip away syntactic markup 201 | # 202 | split-corpus = [LM:RDLM:split-corpus] 203 | 204 | ### tell INTERPOLATED-LM to ignore this model 205 | # 206 | exclude-from-interpolation = true 207 | 208 | ### syntactic = true indicate that custom-training should take file with syntactic markup as input (requires mock-output-parser-lm) 209 | # 210 | syntactic = true 211 | 212 | ### training command 213 | # 214 | custom-training = "mkdir -p $working-dir/lm/rdlm/rdlm_label$rdlm-working-dir && $moses-script-dir/training/rdlm/train_rdlm.py \ 215 | --nplm-home $nplm-dir --working-dir $working-dir/lm/rdlm/rdlm_label$rdlm-working-dir \ 216 | --output-dir $working-dir/lm/rdlm/rdlm_label$rdlm-working-dir --output-model rdlm_label \ 217 | --mode label --output-vocab-size 75 --noise 50 --left-context-size $rdlm-left-context \ 218 | --right-context-size $rdlm-right-context --up-context-size $rdlm-up-context \ 219 | --epochs 10 --mmap" 220 | 221 | ### manually specify feature and weight lines for moses.ini (required for custom-training) 222 | # 223 | config-feature-line = "RDLM path_head_lm=$working-dir/lm/rdlm/rdlm_head$rdlm-working-dir/rdlm_head.model.nplm path_label_lm=$working-dir/lm/rdlm/rdlm_label$rdlm-working-dir/rdlm_label.model.nplm backoff=true premultiply=true context_left=$rdlm-left-context context_right=$rdlm-right-context context_up=$rdlm-up-context binarized=full" 224 | config-weight-line = "RDLM0= 0.1 0.1" 225 | 226 | 227 | ### 5-gram Neural Network LM 228 | [LM:NPLM] 229 | 230 | ### define which corpora to concatenate 231 | # we use -split here because we do not want to strip away syntactic markup 232 | # 233 | concatenate-files = [LM:{parallelA,parallelB,monolingualA}:stripped-corpus] 234 | 235 | ### tell INTERPOLATED-LM to ignore this model 236 | # 237 | exclude-from-interpolation = true 238 | 239 | # internal settings for NPLM 240 | # IMPORTANT: update nplm-working-dir when training a new NPLM to avoid overwriting old files 241 | order = 5 242 | nplm-working-dir = 1 243 | 244 | ### training command 245 | # 246 | custom-training = "mkdir -p $working-dir/lm/nplm/nplm$nplm-working-dir && $moses-script-dir/training/train-neurallm.py \ 247 | --nplm-home $nplm-dir --working-dir $working-dir/lm/nplm/nplm$nplm-working-dir \ 248 | --output-dir $working-dir/lm/nplm/nplm$nplm-working-dir --output-model nplm \ 249 | --vocab-size 500000 --noise 100 --order $order \ 250 | --epochs 10" 251 | 252 | # we train two RDLMs, but only need one entry in the config, so we leave this empty 253 | config-feature-line = "NeuralLM path=$working-dir/lm/nplm/nplm$nplm-working-dir/nplm.model.nplm order=$order" 254 | config-weight-line = "NeuralLM0= 0.1" 255 | 256 | ################################################################# 257 | # INTERPOLATING LANGUAGE MODELS 258 | 259 | [INTERPOLATED-LM] 260 | 261 | # if multiple language models are used, these may 
be combined 262 | # by optimizing perplexity on a tuning set 263 | # see, for instance [Koehn and Schwenk, IJCNLP 2008] 264 | 265 | ### script to interpolate language models 266 | # if commented out, no interpolation is performed 267 | # 268 | script = $moses-script-dir/ems/support/interpolate-lm.perl 269 | 270 | ### tuning set 271 | # you may use the same set that is used for mert tuning (reference set) 272 | # 273 | raw-tuning = $toy-data/newstest2012.$output-extension 274 | 275 | ### script to use for binary table format for irstlm or kenlm 276 | # kenlm, also set type to 8 277 | lm-binarizer = $moses-src-dir/bin/build_binary 278 | type = 8 279 | 280 | ################################################################# 281 | # TRANSLATION MODEL TRAINING 282 | 283 | [TRAINING] 284 | 285 | ### training script to be used: either a legacy script or 286 | # current moses training script (default) 287 | # 288 | script = $moses-script-dir/training/train-model.perl 289 | 290 | ### general options 291 | # 292 | training-options = "-mgiza -mgiza-cpus 8 -sort-buffer-size 10G -sort-compress gzip -cores 16 -alt-direct-rule-score-2 --ghkm-tree-fragment" 293 | 294 | ### symmetrization method to obtain word alignments from giza output 295 | # (commonly used: grow-diag-final-and) 296 | # 297 | alignment-symmetrization-method = grow-diag-final-and 298 | 299 | run-giza-in-parts = 5 300 | 301 | ### if word alignment (giza symmetrization) should be skipped, 302 | # point to word alignment files 303 | # 304 | # word-alignment = 305 | 306 | ### hierarchical rule set 307 | # 308 | hierarchical-rule-set = true 309 | use-ghkm = true 310 | use-pcfg-feature = true 311 | use-unknown-word-soft-matches = true 312 | dont-tune-glue-grammar = true 313 | ghkm-source-labels = true 314 | 315 | extract-settings = "--UnknownWordMinRelFreq 0.01 --MaxNodes 40 --MaxRuleDepth 7 --MaxRuleSize 7 --AllowUnary" 316 | score-settings = " --GoodTuring --LowCountFeature --MinCountHierarchical 2 --MinScore 2:0.0001" 317 | 318 | 319 | ### if phrase extraction should be skipped, 320 | # point to stem for extract files 321 | # 322 | # extracted-phrases = 323 | 324 | ### if phrase table training should be skipped, 325 | # point to phrase translation table 326 | # 327 | # phrase-translation-table = 328 | 329 | ### if training should be skipped, 330 | # point to a configuration file that contains 331 | # pointers to all relevant model files 332 | # config = 333 | 334 | ####################################################### TUNING: finding good weights for model components 335 | 336 | [TUNING] 337 | 338 | ### instead of tuning with this setting, old weights may be recycled 339 | # specify here an old configuration file with matching weights 340 | # 341 | #weight-config = 342 | 343 | ### tuning script to be used 344 | # 345 | tuning-script = $moses-script-dir/training/mert-moses.pl 346 | tuning-settings = "-mertdir $moses-src-dir/bin --batch-mira --return-best-dev -maximum-iterations 25 --threads 16 -batch-mira-args='--sctype BLEU,HWCM'" 347 | 348 | ### specify the corpus used for tuning 349 | # it should contain 100s if not 1000s of sentences 350 | # 351 | raw-input = $toy-data/newstest2012.$input-extension 352 | # tokenized-input = 353 | # factorized-input = 354 | # input = 355 | 356 | inputtype = 3 357 | 358 | raw-reference = $toy-data/newstest2012.$output-extension 359 | # tokenized-reference = 360 | # factorized-reference = 361 | # reference = 362 | 363 | ### size of n-best list used (typically 100) 364 | # 365 | nbest = 1000 366 | 367 | ### 
ranges for weights for random initialization 368 | # if not specified, the tuning script will use generic ranges 369 | # it is not clear, if this matters 370 | # 371 | # lambda = 372 | 373 | ### additional flags for the decoder 374 | # 375 | decoder-settings = "-feature-overwrite 'TranslationModel0 table-limit=100' -threads 8 -max-chart-span 50 -rule-limit 50 -n-best-trees" 376 | 377 | ### if tuning should be skipped, specify this here 378 | # and also point to a configuration file that contains 379 | # pointers to all relevant model files 380 | # 381 | 382 | 383 | ######################################################### 384 | ## RECASER: restore case, this part only trains the model 385 | 386 | [RECASING] 387 | 388 | #decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm 389 | 390 | ### training data 391 | # raw input needs to be still tokenized, 392 | # also also tokenized input may be specified 393 | # 394 | #tokenized = [LM:europarl:tokenized-corpus] 395 | 396 | # recase-config = 397 | 398 | #lm-training = $moses-src-dir/srilm/bin/i686/ngram-count 399 | 400 | ####################################################### 401 | ## TRUECASER: train model to truecase corpora and input 402 | 403 | [TRUECASER] 404 | 405 | ### script to train truecaser models 406 | # 407 | trainer = $moses-script-dir/recaser/train-truecaser.perl 408 | 409 | ### training data 410 | # raw input needs to be still tokenized, 411 | # also also tokenized input may be specified 412 | # 413 | # tokenized-stem = $working-dir/data/ep+nc 414 | 415 | ### trained model 416 | # 417 | #truecase-model = 418 | 419 | ############################################################ 420 | ## EVALUATION: translating a test set using the tuned system 421 | 422 | [EVALUATION] 423 | 424 | ### number of jobs (if parallel execution of testing) 425 | # 426 | jobs = 10 427 | 428 | filter-settings = " " 429 | 430 | 431 | ### prepare system output for scoring 432 | # this may include detokenization and wrapping output in sgm 433 | # (needed for nist-bleu, ter, meteor) 434 | # 435 | #recaser = $moses-script-dir/recaser/recase.perl 436 | wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension" 437 | # output-sgm = 438 | 439 | ### should output be scored case-sensitive (default: no)? 
440 | # 441 | # case-sensitive = yes 442 | 443 | ### BLEU 444 | # 445 | nist-bleu = $moses-script-dir/generic/mteval-v13a.pl 446 | nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c" 447 | # multi-bleu = $edinburgh-script-dir/multi-bleu.perl 448 | # ibm-bleu = 449 | 450 | ### TER: translation error rate (BBN metric) based on edit distance 451 | # 452 | # ter = $edinburgh-script-dir/tercom_v6a.pl 453 | 454 | ### METEOR: gives credit to stem / worknet synonym matches 455 | # 456 | # meteor = 457 | 458 | ### Analysis: carry out various forms of analysis on the output 459 | # 460 | analysis = $moses-script-dir/ems/support/analysis.perl 461 | #analyze-coverage = yes 462 | report-segmentation = yes 463 | 464 | 465 | [EVALUATION:newstest2013] 466 | decoder-settings = "-feature-overwrite 'TranslationModel0 table-limit=100' -threads 8 -max-chart-span 50 -rule-limit 100" 467 | input-sgm = $toy-data/newstest2013-src.$input-extension.sgm 468 | wrapping-frame = $input-sgm 469 | reference-sgm = $toy-data/newstest2013-ref.$output-extension.sgm 470 | 471 | [REPORTING] 472 | 473 | ### what to do with result (default: store in file evaluation/report) 474 | # 475 | # email = pkoehn@inf.ed.ac.uk 476 | 477 | -------------------------------------------------------------------------------- /example/toy_example_2015_6.config: -------------------------------------------------------------------------------- 1 | ################################################ 2 | ### CONFIGURATION FILE FOR AN SMT EXPERIMENT ### 3 | ################################################ 4 | 5 | [GENERAL] 6 | 7 | ###### you need to set these paths to match your environemnt 8 | ###### 9 | 10 | moses-src-dir = /home/rsennrich/tools/mosesdecoder 11 | wmt2014-scripts = /home/rsennrich/smtworkspace/wmt2014-scripts 12 | parzu-path = /home/rsennrich/ParZu # https://github.com/rsennrich/ParZu 13 | zmorge-model = /home/rsennrich/zmorge/zmorge-20141224-smor_newlemma.a #get this (or a newer version) from http://kitt.ifi.uzh.ch/kitt/zmorge/ 14 | srilm-dir = /home/rsennrich/tools/srilm/bin/i686-m64/ 15 | external-bin-dir = ~/bin 16 | nplm-dir = /home/rsennrich/tools/nplm-github/ 17 | maltparser = /home/rsennrich/tools/maltparser-1.8.1/maltparser-1.8.1.jar 18 | corenlp = /home/rsennrich/tools/stanford-corenlp-full-2014-10-31 19 | 20 | # IMPORTANT: update run-id to avoid decoder output (Ttree file) being overwritten 21 | run-id = 1 22 | 23 | ###### no further changes should be required to run the toy example 24 | ###### (but feel free to experiment with different settings, or change the training/test data) 25 | 26 | moses-script-dir = $moses-src-dir/scripts 27 | moses-bin-dir = $moses-src-dir/bin 28 | toy-data = $wmt2014-scripts/example/data 29 | working-dir = $wmt2014-scripts/example/working-dir 30 | decoder = $moses-src-dir/bin/moses 31 | 32 | input-tokenizer = "$moses-script-dir/tokenizer/normalize-punctuation.perl $input-extension | $moses-script-dir/tokenizer/tokenizer.perl -l $input-extension -penn" 33 | output-tokenizer = "$moses-script-dir/tokenizer/normalize-punctuation.perl $output-extension | $moses-script-dir/tokenizer/tokenizer.perl -l $output-extension" 34 | input-truecaser = $moses-script-dir/recaser/truecase.perl 35 | output-truecaser = $moses-script-dir/recaser/truecase.perl 36 | detruecaser = $moses-script-dir/recaser/detruecase.perl 37 | 38 | input-parser = "$moses-script-dir/tokenizer/deescape-special-chars.perl \ 39 | | python $moses-script-dir/training/wrappers/parse-en-stanford.py --stanford $corenlp --java 
/home/rsennrich/tools/openjdk/OBF_DROP_DIR/openjdk8/j2re-image/bin/java \ 40 | | java -jar $maltparser -c pproj -m proj -pp baseline -pcr head \ 41 | | python $moses-script-dir/training/wrappers/conll2mosesxml.py" 42 | 43 | # parsing pipeline used for WMT 2014 44 | output-parser = "$moses-script-dir/tokenizer/deescape-special-chars.perl | $parzu-path/parzu -i tokenized_lines --projective | $wmt2014-scripts/enrich_labelset.py --wmt15 | $moses-script-dir/training/wrappers/conll2mosesxml.py" 45 | 46 | # also parse tuning/evaluation reference files 47 | mock-output-parser-references = $output-parser 48 | mock-output-parser-lm = $output-parser 49 | 50 | # SAMT relaxation for soft source-syntactic constraints 51 | input-parse-relaxer = "$moses-src-dir/bin/relax-parse --SAMT 2" 52 | 53 | # head binarization 54 | output-parse-relaxer = "$wmt2014-scripts/emnlp2015/binarize.py head" 55 | 56 | inputtype = 3 57 | 58 | # hyphen splitting on input 59 | input-splitter = "$wmt2014-scripts/emnlp2015/hyphen-splitter.py -syntax" 60 | 61 | # hybrid compound splitting and particle verb restructuring (described in Sennrich and Haddow, 2015) 62 | output-splitter = "$wmt2014-scripts/emnlp2015/split_and_restructure.sh $wmt2014-scripts $zmorge-model" 63 | 64 | # sed instructions unsplit the split compunds from output-splitter 65 | detokenizer = "$moses-script-dir/tokenizer/detokenizer.perl -l $output-extension | sed -r 's/ \@(\S*?)\@ /\1/g' | sed -r 's/\@\@ //g'" 66 | 67 | input-extension = en 68 | output-extension = de 69 | pair-extension = de-en 70 | 71 | generic-parallelizer = $moses-script-dir/ems/support/generic-multicore-parallelizer.perl 72 | 73 | jobs = 10 74 | 75 | ################################################################# 76 | # PARALLEL CORPUS PREPARATION: 77 | # create a tokenized, sentence-aligned corpus, ready for training 78 | 79 | [CORPUS] 80 | 81 | cores = 10 82 | 83 | ### tools to use to prepare the data 84 | # 85 | #tokenizer = 86 | #lowercaser = 87 | 88 | ### long sentences are filtered out, since they slow down GIZA++ 89 | # and are a less reliable source of data. set here the maximum 90 | # length of a sentence 91 | # 92 | max-sentence-length = 80 93 | 94 | ### GIZA++ does not allow sentence pairs of highly uneven length. 
95 | # since uneven sentence length is an indicator of a misalignment, 96 | # we set a maximum ratio of 3 (this also gives us room for compoudn splitting) 97 | # 98 | cleaner = "$moses-script-dir/training/clean-corpus-n.perl -ratio 3" 99 | 100 | [CORPUS:parallelA] 101 | raw-stem = $toy-data/parallelA.$pair-extension 102 | 103 | [CORPUS:parallelB] 104 | raw-stem = $toy-data/parallelB.$pair-extension 105 | 106 | [CORPUS:parallelC] 107 | # if you do your own parsing (or wanna re-use other data, like http://statmt.org/rsennrich/parsed_wmt/ ), 108 | # you can add parsed corpora to your system like this 109 | clean-parsed-stem = $toy-data/parallelC.$pair-extension.parsed_both 110 | 111 | ################################################################# 112 | # LANGUAGE MODEL TRAINING 113 | 114 | [LM] 115 | 116 | cores = 10 117 | 118 | ### tool to be used for language model training 119 | # for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh) 120 | # 121 | lm-training = $srilm-dir/ngram-count 122 | settings = "-interpolate -kndiscount -unk" 123 | order = 5 124 | 125 | ### script to use for binary table format 126 | # (default: no binarization) 127 | # 128 | #lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm 129 | 130 | # kenlm, also set type to 8 131 | #lm-binarizer = $moses-src-dir/kenlm/build_binary 132 | #type = 8 133 | 134 | ### script to create quantized language model format 135 | # (default: no quantization) 136 | # 137 | #lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm 138 | 139 | ### tools to use to prepare the data 140 | # 141 | #tokenizer = 142 | #lowercaser = 143 | 144 | ### each language model to be used has its own section here 145 | 146 | ### if corpus preparation should be skipped, 147 | # point to the prepared language model 148 | # 149 | #lm = 150 | 151 | # internal settings for RDLM 152 | # IMPORTANT: update rdlm-working-dir when training a new RDLM to avoid overwriting old files 153 | rdlm-left-context = 3 154 | rdlm-right-context = 3 155 | rdlm-up-context = 2 156 | rdlm-working-dir = 1 157 | 158 | [LM:parallelA] 159 | raw-corpus = $toy-data/parallelA.$pair-extension.$output-extension 160 | 161 | [LM:parallelB] 162 | raw-corpus = $toy-data/parallelB.$pair-extension.$output-extension 163 | 164 | [LM:parallelC] 165 | # if you do your own parsing (or wanna re-use other data, like http://statmt.org/rsennrich/parsed_wmt/ ), 166 | # you can add parsed corpora to your system like this 167 | mock-parsed-corpus = $toy-data/parallelC.$pair-extension.parsed_both.$output-extension 168 | 169 | [LM:monolingualA] 170 | raw-corpus = $toy-data/monolingualA.$output-extension 171 | 172 | ### Relational Dependency LM trained on concatenation of other training corpora [head model] 173 | [LM:RDLM] 174 | 175 | ### define which corpora to concatenate 176 | # we use -split here because we do not want to strip away syntactic markup 177 | # 178 | concatenate-files-split = [LM:{parallelA,parallelB,monolingualA}:split-corpus] 179 | 180 | ### tell INTERPOLATED-LM to ignore this model 181 | # 182 | exclude-from-interpolation = true 183 | 184 | ### syntactic = true indicate that custom-training should take file with syntactic markup as input (requires mock-output-parser-lm) 185 | # 186 | syntactic = true 187 | 188 | ### training command 189 | # 190 | custom-training = "mkdir -p $working-dir/lm/rdlm/rdlm_head$rdlm-working-dir && $moses-script-dir/training/rdlm/train_rdlm.py \ 191 | --nplm-home $nplm-dir --working-dir $working-dir/lm/rdlm/rdlm_head$rdlm-working-dir \ 192 | --output-dir 
$working-dir/lm/rdlm/rdlm_head$rdlm-working-dir --output-model rdlm_head \ 193 | --mode head --output-vocab-size 500000 --noise 100 --left-context-size $rdlm-left-context \ 194 | --right-context-size $rdlm-right-context --up-context-size $rdlm-up-context \ 195 | --epochs 10 --mmap" 196 | 197 | # we train two RDLMs, but only need one entry in the config, so we leave this empty 198 | config-feature-line = " " 199 | config-weight-line = " " 200 | 201 | 202 | ### Relational Dependency LM trained on concatenation of other training corpora [label model] 203 | [LM:RDLM2] 204 | 205 | ### define which corpora to concatenate 206 | # we use -split here because we do not want to strip away syntactic markup 207 | # 208 | split-corpus = [LM:RDLM:split-corpus] 209 | 210 | ### tell INTERPOLATED-LM to ignore this model 211 | # 212 | exclude-from-interpolation = true 213 | 214 | ### syntactic = true indicate that custom-training should take file with syntactic markup as input (requires mock-output-parser-lm) 215 | # 216 | syntactic = true 217 | 218 | ### training command 219 | # 220 | custom-training = "mkdir -p $working-dir/lm/rdlm/rdlm_label$rdlm-working-dir && $moses-script-dir/training/rdlm/train_rdlm.py \ 221 | --nplm-home $nplm-dir --working-dir $working-dir/lm/rdlm/rdlm_label$rdlm-working-dir \ 222 | --output-dir $working-dir/lm/rdlm/rdlm_label$rdlm-working-dir --output-model rdlm_label \ 223 | --mode label --output-vocab-size 75 --noise 50 --left-context-size $rdlm-left-context \ 224 | --right-context-size $rdlm-right-context --up-context-size $rdlm-up-context \ 225 | --epochs 10 --mmap" 226 | 227 | ### manually specify feature and weight lines for moses.ini (required for custom-training) 228 | # 229 | config-feature-line = "RDLM path_head_lm=$working-dir/lm/rdlm/rdlm_head$rdlm-working-dir/rdlm_head.model.nplm path_label_lm=$working-dir/lm/rdlm/rdlm_label$rdlm-working-dir/rdlm_label.model.nplm backoff=true premultiply=true context_left=$rdlm-left-context context_right=$rdlm-right-context context_up=$rdlm-up-context binarized=full" 230 | config-weight-line = "RDLM0= 0.1 0.1" 231 | 232 | 233 | ### 5-gram Neural Network LM 234 | [LM:NPLM] 235 | 236 | ### define which corpora to concatenate 237 | # we use -split here because we do not want to strip away syntactic markup 238 | # 239 | concatenate-files = [LM:{parallelA,parallelB,monolingualA}:stripped-corpus] 240 | 241 | ### tell INTERPOLATED-LM to ignore this model 242 | # 243 | exclude-from-interpolation = true 244 | 245 | # internal settings for NPLM 246 | # IMPORTANT: update nplm-working-dir when training a new NPLM to avoid overwriting old files 247 | order = 5 248 | nplm-working-dir = 1 249 | 250 | ### training command 251 | # 252 | custom-training = "mkdir -p $working-dir/lm/nplm/nplm$nplm-working-dir && $moses-script-dir/training/train-neurallm.py \ 253 | --nplm-home $nplm-dir --working-dir $working-dir/lm/nplm/nplm$nplm-working-dir \ 254 | --output-dir $working-dir/lm/nplm/nplm$nplm-working-dir --output-model nplm \ 255 | --vocab-size 500000 --noise 100 --order $order \ 256 | --epochs 10" 257 | 258 | # we train two RDLMs, but only need one entry in the config, so we leave this empty 259 | config-feature-line = "NeuralLM path=$working-dir/lm/nplm/nplm$nplm-working-dir/nplm.model.nplm order=$order" 260 | config-weight-line = "NeuralLM0= 0.1" 261 | 262 | ################################################################# 263 | # INTERPOLATING LANGUAGE MODELS 264 | 265 | [INTERPOLATED-LM] 266 | 267 | # if multiple language models are used, these may 
be combined 268 | # by optimizing perplexity on a tuning set 269 | # see, for instance [Koehn and Schwenk, IJCNLP 2008] 270 | 271 | ### script to interpolate language models 272 | # if commented out, no interpolation is performed 273 | # 274 | script = $moses-script-dir/ems/support/interpolate-lm.perl 275 | 276 | ### tuning set 277 | # you may use the same set that is used for mert tuning (reference set) 278 | # 279 | raw-tuning = $toy-data/newstest2012.$output-extension 280 | 281 | ### script to use for binary table format for irstlm or kenlm 282 | # kenlm, also set type to 8 283 | lm-binarizer = $moses-src-dir/bin/build_binary 284 | type = 8 285 | 286 | ################################################################# 287 | # TRANSLATION MODEL TRAINING 288 | 289 | [TRAINING] 290 | 291 | ### training script to be used: either a legacy script or 292 | # current moses training script (default) 293 | # 294 | script = $moses-script-dir/training/train-model.perl 295 | 296 | ### general options 297 | # 298 | training-options = "-mgiza -mgiza-cpus 8 -sort-buffer-size 10G -sort-compress gzip -cores 16 -alt-direct-rule-score-2 --ghkm-tree-fragment" 299 | 300 | ### symmetrization method to obtain word alignments from giza output 301 | # (commonly used: grow-diag-final-and) 302 | # 303 | alignment-symmetrization-method = grow-diag-final-and 304 | 305 | run-giza-in-parts = 5 306 | 307 | ### if word alignment (giza symmetrization) should be skipped, 308 | # point to word alignment files 309 | # 310 | # word-alignment = 311 | 312 | ### hierarchical rule set 313 | # 314 | hierarchical-rule-set = true 315 | use-ghkm = true 316 | use-pcfg-feature = true 317 | use-unknown-word-soft-matches = true 318 | dont-tune-glue-grammar = true 319 | ghkm-source-labels = true 320 | 321 | extract-settings = "--UnknownWordMinRelFreq 0.01 --MaxNodes 40 --MaxRuleDepth 7 --MaxRuleSize 7 --AllowUnary" 322 | score-settings = " --GoodTuring --LowCountFeature --MinCountHierarchical 2 --MinScore 2:0.0001" 323 | 324 | 325 | ### if phrase extraction should be skipped, 326 | # point to stem for extract files 327 | # 328 | # extracted-phrases = 329 | 330 | ### if phrase table training should be skipped, 331 | # point to phrase translation table 332 | # 333 | # phrase-translation-table = 334 | 335 | ### if training should be skipped, 336 | # point to a configuration file that contains 337 | # pointers to all relevant model files 338 | # config = 339 | 340 | ####################################################### TUNING: finding good weights for model components 341 | 342 | [TUNING] 343 | 344 | ### instead of tuning with this setting, old weights may be recycled 345 | # specify here an old configuration file with matching weights 346 | # 347 | # weight-config = 348 | 349 | ### tuning script to be used 350 | # 351 | tuning-script = $moses-script-dir/training/mert-moses.pl 352 | tuning-settings = "-mertdir $moses-src-dir/bin --batch-mira --return-best-dev -maximum-iterations 25 --threads 16 -batch-mira-args='--sctype BLEU,HWCM'" 353 | 354 | ### specify the corpus used for tuning 355 | # it should contain 100s if not 1000s of sentences 356 | # 357 | raw-input = $toy-data/newstest2012.$input-extension 358 | # tokenized-input = 359 | # factorized-input = 360 | # input = 361 | 362 | inputtype = 3 363 | 364 | raw-reference = $toy-data/newstest2012.$output-extension 365 | # tokenized-reference = 366 | # factorized-reference = 367 | # reference = 368 | 369 | ### size of n-best list used (typically 100) 370 | # 371 | nbest = 1000 372 | 373 | 
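# note: the decoder-settings below pass -n-best-trees, so that the n-best list
# also contains output trees; the HWCM metric selected in tuning-settings
# presumably relies on these trees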
### ranges for weights for random initialization 374 | # if not specified, the tuning script will use generic ranges 375 | # it is not clear whether this matters 376 | # 377 | # lambda = 378 | 379 | ### additional flags for the decoder 380 | # 381 | decoder-settings = "-feature-overwrite 'TranslationModel0 table-limit=100' -threads 8 -max-chart-span 50 -rule-limit 50 -n-best-trees" 382 | 383 | ### if tuning should be skipped, specify this here 384 | # and also point to a configuration file that contains 385 | # pointers to all relevant model files 386 | # 387 | 388 | 389 | ######################################################### 390 | ## RECASER: restore case, this part only trains the model 391 | 392 | [RECASING] 393 | 394 | #decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm 395 | 396 | ### training data 397 | # raw input still needs to be tokenized; 398 | # alternatively, already tokenized input may be specified 399 | # 400 | #tokenized = [LM:europarl:tokenized-corpus] 401 | 402 | # recase-config = 403 | 404 | #lm-training = $moses-src-dir/srilm/bin/i686/ngram-count 405 | 406 | ####################################################### 407 | ## TRUECASER: train model to truecase corpora and input 408 | 409 | [TRUECASER] 410 | 411 | ### script to train truecaser models 412 | # 413 | trainer = $moses-script-dir/recaser/train-truecaser.perl 414 | 415 | ### training data 416 | # raw input still needs to be tokenized; 417 | # alternatively, already tokenized input may be specified 418 | # 419 | # tokenized-stem = $working-dir/data/ep+nc 420 | 421 | ### trained model 422 | # 423 | #truecase-model = 424 | 425 | ############################################################ 426 | ## EVALUATION: translating a test set using the tuned system 427 | 428 | [EVALUATION] 429 | 430 | ### number of jobs (if parallel execution of testing) 431 | # 432 | jobs = 10 433 | 434 | filter-settings = " " 435 | 436 | 437 | ### prepare system output for scoring 438 | # this may include detokenization and wrapping output in sgm 439 | # (needed for nist-bleu, ter, meteor) 440 | # 441 | #recaser = $moses-script-dir/recaser/recase.perl 442 | wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension" 443 | # output-sgm = 444 | 445 | ### should output be scored case-sensitive (default: no)? 
446 | # 447 | # case-sensitive = yes 448 | 449 | ### BLEU 450 | # 451 | nist-bleu = $moses-script-dir/generic/mteval-v13a.pl 452 | nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c" 453 | # multi-bleu = $edinburgh-script-dir/multi-bleu.perl 454 | # ibm-bleu = 455 | 456 | ### TER: translation error rate (BBN metric) based on edit distance 457 | # 458 | # ter = $edinburgh-script-dir/tercom_v6a.pl 459 | 460 | ### METEOR: gives credit to stem / WordNet synonym matches 461 | # 462 | # meteor = 463 | 464 | ### Analysis: carry out various forms of analysis on the output 465 | # 466 | analysis = $moses-script-dir/ems/support/analysis.perl 467 | #analyze-coverage = yes 468 | report-segmentation = yes 469 | 470 | 471 | [EVALUATION:newstest2013] 472 | decoder-settings = "-feature-overwrite 'TranslationModel0 table-limit=10' -threads 1 -max-chart-span 10 -rule-limit 10 -Ttree $working-dir/evaluation/newstest2013.output.tree.$run-id" 473 | input-sgm = $toy-data/newstest2013-src.$input-extension.sgm 474 | wrapping-frame = $input-sgm 475 | reference-sgm = $toy-data/newstest2013-ref.$output-extension.sgm 476 | 477 | # ugly hack: to post-process particle verbs, we read tree output (produced with -Ttree) instead of string output; particle verb restructuring is made part of the detruecasing step. 478 | detruecaser = "$wmt2014-scripts/emnlp2015/detruecase_ptkvz.sh $wmt2014-scripts $working-dir/evaluation/newstest2013.output.tree.$run-id | $moses-script-dir/recaser/detruecase.perl" 479 | 480 | [REPORTING] 481 | 482 | ### what to do with result (default: store in file evaluation/report) 483 | # 484 | # email = pkoehn@inf.ed.ac.uk 485 | 486 | -------------------------------------------------------------------------------- /hybrid_compound_splitter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | # This script implements hybrid compound splitting as described in 6 | # Fritzinger & Fraser 2010: How to Avoid Burning Ducks: Combining Linguistic Analysis and Corpus Statistics for German Compound Processing 7 | # the variant without a morphology tool corresponds to Koehn & Knight 2003: Empirical Methods for Compound Splitting 8 | 9 | # As SMOR morphology, I recommend the most recent version of Zmorge: zmorge-{version}-smor_newlemma.a at http://kitt.ifi.uzh.ch/kitt/zmorge/ 10 | # In hybrid mode, the script requires SFST (fst-mor). 11 | 12 | # A syntactic representation of split compounds as described in: 13 | # Rico Sennrich, Philip Williams, Matthias Huck (2015): 14 | # A tree does not make a well-formed sentence: Improving syntactic string-to-tree statistical machine translation with more linguistic knowledge. 15 | # In: Computer Speech & Language 32(1), 27-45. 
16 | # can be generated (given a corpus in the Moses XML format) with the following commands: 17 | # hybrid_compound_splitter.py -train -syntax -corpus INPUT_FILE -model MODEL_FILE 18 | # hybrid_compound_splitter.py -write-filler -no-truecase -q -syntax -smor zmorge-{version}-smor_newlemma.a -model MODEL_FILE < INPUT_FILE > OUTPUT_FILE 19 | 20 | from __future__ import division, unicode_literals 21 | import sys 22 | import os 23 | import re 24 | import pprint 25 | import json 26 | import codecs 27 | import argparse 28 | from collections import defaultdict 29 | from operator import mul 30 | 31 | from lxml import etree as ET 32 | 33 | try: 34 | import pexpect 35 | except ImportError: 36 | sys.stderr.write('Error: this script requires Pexpect >= 3.0\n') 37 | sys.exit(1) 38 | 39 | if pexpect.__version__ < 3: 40 | sys.stderr.write('Error: this script requires Pexpect >= 3.0. Version {0} found\n'.format(pexpect.__version__)) 41 | sys.exit(1) 42 | 43 | if sys.version_info >= (3, 0): 44 | from functools import reduce 45 | 46 | JUNCTURES = ['', 's', 'es', '-'] # only allow these junctures in unsupervised mode (ignored in hybrid mode) 47 | SMOR_SPLIT = ['NN', 'NE', 'ADJ'] # only split these word classes with SMOR 48 | MIN_SIZE = 4 49 | MIN_COUNT = 5 50 | MAX_COUNT = 5 51 | MAX_SPLIT_HYPOTHESES = 1000 # break if there are too many ways to split a word 52 | 53 | SMOR_ENCODING = 'UTF-8' 54 | 55 | 56 | class FstWrapper(): 57 | def __init__(self, smor_binary, smor_model): 58 | self.child = pexpect.spawnu(smor_binary + ' ' + smor_model) 59 | self.child.delaybeforesend = 0 60 | self.child.expect(["analyze> ", pexpect.EOF], timeout=600) 61 | before = self.child.before 62 | if self.child.terminated: 63 | raise RuntimeError(before) 64 | 65 | def analyse(self, word): 66 | word = word.strip() 67 | if word == "" or word == "q" or word == "\x7f": 68 | return [] 69 | self.child.sendline(word) 70 | try: 71 | self.child.expect(["analyze> ", pexpect.EOF]) 72 | except pexpect.TIMEOUT: 73 | sys.stderr.write('Warning: timeout while waiting for fst-mor\n') 74 | sys.stderr.write('String: {0}'.format(word)) 75 | return [] 76 | result = self.child.before.split("\r\n")[1:-1] 77 | if len(result) == 1 and re.match("^no result for ", result[0]): 78 | result = [] 79 | return result 80 | 81 | 82 | class SMORSplitter(object): 83 | 84 | def __init__(self, smor_model, no_truecase): 85 | 86 | self.smor = FstWrapper('fst-mor', smor_model) 87 | self.data = defaultdict(set) 88 | self.re_mainclass = re.compile(r'<\+(.*?)>') 89 | self.re_any = re.compile(r'<([^#~-]+?)>') 90 | self.re_nn = re.compile(r'<#>') 91 | self.re_morph = re.compile(r'<([#~-])>') 92 | self.re_fugenlaut = re.compile(r'<->') 93 | self.re_segment = re.compile(r'<([A-Z#~]*?)>') 94 | self.re_hyphenation = re.compile(r'\{(.+?)\}-(?:)?') 95 | self.re_last = re.compile(r'(.+?)<\+',re.UNICODE) 96 | self.no_truecase = no_truecase 97 | 98 | 99 | def convert(self, analyses): 100 | """convert SMOR output into list of morphemes""" 101 | 102 | for word, lines in analyses: 103 | cache = [] 104 | for line in lines: 105 | 106 | if line.startswith('no result'): 107 | continue 108 | 109 | if not line: 110 | continue 111 | 112 | try: 113 | pos = self.re_mainclass.search(line).group(1) 114 | except AttributeError: 115 | continue 116 | 117 | if pos == 'V' and '' in line: 118 | continue 119 | elif pos == 'PUNCT': 120 | continue 121 | 122 | #score number of morphemes; heuristic adopted from SFST 123 | segments = len(self.re_segment.findall(line)) 124 | if line.startswith(''): 125 | if 
self.no_truecase: 126 | continue 127 | else: 128 | segments -= 1 129 | elif '' in line: 130 | continue 131 | 132 | # convert markup of hyphenated words into markup of compounds (with '-' as juncture element which is lost if we split, but kept if we don't) 133 | # {ABC}-Abwehr<+NN> -> ABC<->-<#>Abwehr<+NN> 134 | line = self.re_hyphenation.sub(r'\1<->-<#>', line) 135 | 136 | main = self.re_last.search(line).group(1) 137 | parts = self.re_any.sub('',main) 138 | 139 | cache.append((word,segments,parts,pos)) 140 | 141 | self.get_best(cache) 142 | 143 | 144 | def get_best(self,cache): 145 | if cache: 146 | for best in cache: #currently, process all segmentations. possible modification: only use 'best' segmentation, i.e. the one with the fewest morphemes 147 | 148 | #only split nouns 149 | if best[3] in SMOR_SPLIT: 150 | 151 | wordform = best[0] 152 | lemma = best[2] 153 | if not '<#>' in lemma: 154 | continue 155 | if '<~>' in lemma: 156 | lemma = lemma.replace('<~>','') 157 | stem = ''.join(lemma.split('<#>')[:-1]) 158 | stem = self.re_morph.sub('',stem) 159 | 160 | # restore inflected ending from analysis 161 | try: 162 | ending = best[0].split(stem)[1] 163 | split = lemma.split('<#>')[:-1] + [ending] 164 | except: 165 | split = lemma.split('<#>') 166 | 167 | # keep inflection of ending 168 | split[-1] = self.re_morph.sub('',split[-1]) 169 | 170 | for i, item in enumerate(split): 171 | root, fuge = item, '' 172 | items = item.split('<->') 173 | 174 | if len(items) == 2: 175 | root, fuge = items 176 | elif len(items) > 2: 177 | root = ''.join(items[:-1]) 178 | fuge = items[-1] 179 | 180 | root = self.re_morph.sub('', root) 181 | split[i] = (root, fuge) 182 | 183 | self.data[best[0]].add(tuple(split)) 184 | 185 | 186 | def analyze(self, words_in): 187 | """get all new words from input line and send them to SMOR for analysis""" 188 | 189 | todo = [] 190 | 191 | for word in words_in: 192 | if not word in self.data: 193 | 194 | self.data[word] = set([((word,''),)]) 195 | todo.append(word) 196 | 197 | analyses = [(word, self.smor.analyse(word)) for word in todo] 198 | self.convert(analyses) 199 | 200 | 201 | 202 | def train_model(in_obj, out_path, syntax): 203 | 204 | freq = defaultdict(int) 205 | 206 | re_syntax_splitter = re.compile(r'((?:\s*(?:<[^<>]*>)+\s*)|(?:(?)\s+(?!<)))') 207 | 208 | for line in in_obj: 209 | if syntax and '<' in line: 210 | words = [word for word in re_syntax_splitter.split(line) if word and not word == ' ' and not word.startswith('<')] 211 | else: 212 | words = line.split() 213 | for word in words: 214 | freq[word] += 1 215 | 216 | write_model(freq, out_path) 217 | 218 | 219 | def write_model(model, file_path): 220 | 221 | if sys.version_info < (3, 0): 222 | file_obj = codecs.getwriter('UTF-8')(open(args.model, 'w')) 223 | else: 224 | file_obj = open(args.model, 'w', encoding='UTF-8') 225 | 226 | file_obj.write('# -*- coding: utf-8 -*-\n\n') 227 | file_obj.write('from __future__ import unicode_literals\n\n') 228 | file_obj.write('model = ') 229 | json.dump(model,file_obj, indent=2) 230 | file_obj.close() 231 | 232 | 233 | def generate_decompositions(splits, memory = False, write_juncture = False): 234 | 235 | if not memory: 236 | memory = [] 237 | 238 | for start in splits[-1].keys(): 239 | if start == 0: 240 | yield [splits[-1][start]] + memory 241 | else: 242 | if write_juncture: 243 | juncture, segment, new_start = splits[-1][start] 244 | new_memory = [(juncture, -1), (segment, new_start)] + memory 245 | else: 246 | new_memory = [splits[-1][start]] + memory 247 | for 
decomposition in generate_decompositions(splits[:start+1], new_memory, write_juncture = write_juncture): 248 | yield decomposition 249 | 250 | 251 | def get_unsupervised_splits(word, freq, truecase, fst_server=None, write_juncture=False, no_truecase=False): 252 | reachable = [{} for i in range(len(word)+1)] 253 | for end in range(MIN_SIZE, len(word)+1): 254 | for start in range(0, end-MIN_SIZE+1): 255 | 256 | if start and not reachable[start]: # no split ending in this position 257 | continue 258 | 259 | for juncture in JUNCTURES: 260 | 261 | if start == 0 and juncture: 262 | continue 263 | 264 | if word[start:start+len(juncture)] != juncture: 265 | continue 266 | 267 | subword_orig = word[start+len(juncture):end] 268 | subword = subword_orig.lower() 269 | if subword not in freq or freq[subword] < MIN_COUNT: 270 | continue 271 | 272 | if VERBOSE: 273 | sys.stderr.write('\tmatching word {0} .. {1} ({2}){3} {4}\n'.format(start, end, juncture, subword, freq[subword])) 274 | 275 | if subword in truecase: 276 | subword = truecase[subword] 277 | 278 | if no_truecase: 279 | subword_out = subword_orig 280 | else: 281 | subword_out = subword 282 | 283 | if not start in reachable[end] or freq[subword] > reachable[end][start][1]: 284 | if write_juncture and not start == 0: 285 | juncture_out = '@' + juncture + '@' 286 | reachable[end][start] = (juncture_out, subword_out, freq[subword]) 287 | else: 288 | reachable[end][start] = (subword_out, freq[subword]) 289 | 290 | #no split found 291 | if not reachable[-1]: 292 | return 293 | 294 | for decomposition in generate_decompositions(reachable, write_juncture = write_juncture): 295 | yield decomposition 296 | 297 | def join_compounds(compounds, freq, truecase, write_junctures, no_truecase, memory = False): 298 | 299 | if not memory: 300 | memory = [] 301 | 302 | for j in range(1, len(compounds)+1): 303 | 304 | if j == 1: 305 | subword_orig = compounds[0][0] 306 | subword = subword_orig.lower() 307 | else: 308 | prefix = ''.join([''.join(f) for f in compounds[:j-1]]) 309 | suffix = compounds[j-1][0] 310 | subword_orig = prefix + suffix 311 | subword = subword_orig.lower() 312 | 313 | if subword not in freq or freq[subword] < MIN_COUNT: 314 | continue 315 | 316 | if VERBOSE: 317 | sys.stderr.write('\tmatching word {0} {1}\n'.format(subword, freq[subword])) 318 | 319 | if no_truecase: 320 | subword_out = subword_orig 321 | else: 322 | if subword in truecase: 323 | subword = truecase[subword] 324 | subword_out = subword 325 | 326 | new_element = [(subword_out, freq[subword])] 327 | 328 | if j == len(compounds): 329 | yield memory + new_element 330 | else: 331 | if write_junctures: 332 | new_element.append(('@' + compounds[j-1][1] + '@', -1)) 333 | for compound in join_compounds(compounds[j:], freq, truecase, write_junctures, no_truecase, memory + new_element): 334 | yield compound 335 | 336 | 337 | def get_FST_splits(word, freq, truecase, fst_server, write_junctures, no_truecase): 338 | 339 | for split in fst_server.data[word]: 340 | for compound in join_compounds(split, freq, truecase, write_junctures, no_truecase): 341 | yield compound 342 | 343 | 344 | def create_compound_xml(element, wordlist, write_junctures, merge_junctures, dependency, initial=False): 345 | 346 | # separate last segment, then recursively label remainder as compound modifier 347 | if initial: 348 | juncture = '' 349 | dep = ET.Element('tree') 350 | dep.set('label', 'SEGMENT') 351 | dep.text = wordlist[-1] 352 | remainder = wordlist[:-1] 353 | if remainder: 354 | 
create_compound_xml(element, remainder, write_junctures, merge_junctures, dependency) 355 | element.append(dep) 356 | return 357 | 358 | if write_junctures or merge_junctures: 359 | juncture = wordlist[-1] 360 | word = wordlist[-2] 361 | remainder = wordlist[:-2] 362 | else: 363 | word = wordlist[-1] 364 | remainder = wordlist[:-1] 365 | 366 | head = ET.Element('tree') 367 | head.set('label', 'comp_mod') 368 | element.append(head) 369 | 370 | if merge_junctures: 371 | dep1 = ET.Element('tree') 372 | dep1.set('label', 'SEGMENT+JUNC') 373 | dep1.text = word + juncture[1:-1] + '@@' 374 | else: 375 | dep1 = ET.Element('tree') 376 | dep1.set('label', 'SEGMENT') 377 | dep1.text = word 378 | 379 | if remainder: 380 | create_compound_xml(head, remainder, write_junctures, merge_junctures, dependency) 381 | 382 | head.append(dep1) 383 | 384 | if write_junctures: 385 | dep2 = ET.Element('tree') 386 | dep2.set('label', 'JUNC') 387 | dep2.text = juncture 388 | if dependency: 389 | dep3 = ET.Element('tree') 390 | dep3.set('label', 'junc') 391 | dep3.append(dep2) 392 | head.append(dep3) 393 | else: 394 | head.append(dep2) 395 | 396 | 397 | def apply_model(file_obj, freq, fst_server, split_function, write_junctures, merge_junctures, syntax, no_truecase, dependency): 398 | 399 | re_syntax_splitter = re.compile(r'((?:\s*(?:<[^<>]*>)+\s*)|(?:(?)\s+(?!<)))') 400 | truecase = {} 401 | 402 | for word in list(freq): 403 | word_lc = word.lower() 404 | if word_lc in freq and freq[word_lc] > freq[word]: 405 | continue 406 | 407 | freq[word_lc] = freq[word] 408 | if word_lc != word and not no_truecase: 409 | truecase[word_lc] = word 410 | 411 | for line in file_obj: 412 | 413 | # only do syntactic processing if option syntax is used and we see '<' in line 414 | write_syntax = syntax 415 | if write_syntax and not '<' in line: 416 | write_syntax = False 417 | 418 | if write_syntax: 419 | words_in = re_syntax_splitter.split(line) 420 | words_in_clean = [word for word in words_in if word and not word.startswith('<') and not word == ' '] 421 | else: 422 | words_in = line.split() 423 | words_in_clean = words_in 424 | 425 | if fst_server: 426 | fst_server.analyze(words_in_clean) 427 | 428 | words = [] 429 | for word in words_in: 430 | 431 | if write_syntax: 432 | if not word: 433 | continue 434 | if word == ' ' or word.startswith('<'): 435 | words.append(word) 436 | continue 437 | 438 | word_lc = word.lower() 439 | if VERBOSE: 440 | sys.stderr.write('considering {0} ({1})...\n'.format(word, word_lc)) 441 | 442 | if word_lc in freq and freq[word_lc] >= MAX_COUNT: 443 | words.append(word) 444 | if VERBOSE: 445 | sys.stderr.write('\tfrequent word ({0}>{1}), skipping\n'.format(freq[word_lc], MAX_COUNT)) 446 | continue 447 | 448 | best_split = word 449 | best_score = 1 450 | 451 | for i, decomposition in enumerate(split_function(word, freq, truecase, fst_server, write_junctures or merge_junctures, no_truecase)): 452 | 453 | if i >= MAX_SPLIT_HYPOTHESES: 454 | break 455 | 456 | split_list, scores = zip(*decomposition) 457 | scores = [score for score in scores if score != -1] #ignoring 458 | total = reduce(mul, scores) 459 | score = total ** (1/len(scores)) 460 | if FEWEST: 461 | score = (-len(scores),score) 462 | split = ' '.join(split_list) 463 | 464 | if VERBOSE: 465 | sys.stderr.write('\t split: {0} ({1} ** 1/{2}) = {3}\n'.format(split, total, len(scores), score)) 466 | 467 | if score > best_score: 468 | best_split = split 469 | best_score = score 470 | 471 | if write_syntax and len(best_split.split()) > 1: 472 | head = 
ET.Element('x') 473 | create_compound_xml(head, best_split.split(), write_junctures, merge_junctures, dependency, initial=True) 474 | best_split = ET.tostring(head, encoding="UTF-8")[3:-4].decode("UTF-8") 475 | if dependency: 476 | words[-1] = words[-1].rsplit('<',1)[0] 477 | best_split = best_split.rsplit('<',1)[0] 478 | 479 | if merge_junctures: 480 | merged_best_split = [] 481 | for item in best_split.split(): 482 | if merged_best_split and len(item) > 1 and item[0] == item[-1] == "@": 483 | merged_best_split[-1] += item[1:-1] + "@@" 484 | else: 485 | merged_best_split.append(item) 486 | best_split = ' '.join(merged_best_split) 487 | 488 | words.append(best_split) 489 | 490 | if write_syntax: 491 | sys.stdout.write(''.join(words)) 492 | else: 493 | sys.stdout.write(' '.join(words) + '\n') 494 | 495 | 496 | def parse_arguments(): 497 | 498 | help_text = "compound splitter\n" 499 | help_text += " train: python hybrid_compound_splitter.py -train -corpus txt-file -model new-model\n" 500 | help_text += " apply: python hybrid_compound_splitter.py -model trained-model < in > out\n" 501 | 502 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=help_text) 503 | 504 | general = parser.add_argument_group('general options') 505 | 506 | general.add_argument('-model', metavar='MODEL', required=True, 507 | help='path to statistical decompounding model. Will be overwritten if -train is active.') 508 | general.add_argument('-corpus', type=argparse.FileType('r'), default=sys.stdin, metavar='PATH', 509 | help='input text (default: standard input).') 510 | general.add_argument('-train', action="store_true", 511 | help='train model on input text. MODEL will be overwritten.') 512 | general.add_argument('-syntax', action="store_true", 513 | help='input/output is syntactic tree') 514 | general.add_argument('-q', action="store_true", 515 | help='quiet mode.') 516 | 517 | application = parser.add_argument_group('application options') 518 | 519 | application.add_argument('-min-size', type=int, 520 | help='minimum word size [don\'t split into short words] (default {0})'.format(MIN_SIZE), default=MIN_SIZE) 521 | application.add_argument('-min-count', type=int, 522 | help='minimum word count [don\'t split into rare words] (default {0})'.format(MIN_COUNT), default=MIN_COUNT) 523 | application.add_argument('-max-count', type=int, 524 | help='maximum word count [don\'t split up frequent words] (default {0})'.format(MAX_COUNT), default=MAX_COUNT) 525 | application.add_argument('-fewest', action="store_true", 526 | help='prefer option with fewest splits (that meets all other constraints)') 527 | application.add_argument('-module', action="store_true", 528 | help='load model as Python module - quicker, but model file needs to end in *.py and be in same folder as script.') 529 | application.add_argument('-smor', metavar='PATH', 530 | help='perform hybrid compound splitting (with SMOR morphology). 
Default: purely corpus-based compound splitting.') 531 | application.add_argument('-no-truecase', action='store_true', 532 | help='leave segments in original case') 533 | application.add_argument('-dependency', action='store_true', 534 | help='dependency-like representation of compounds (ensure that every nonterminal in compound representation has exactly one preterminal)') 535 | 536 | filler = application.add_mutually_exclusive_group() 537 | 538 | filler.add_argument('-write-filler', action="store_true", dest='write_junctures', 539 | help='write filler elements (surrounded by @@)') 540 | filler.add_argument('-merge-filler', action="store_true", dest='merge_junctures', 541 | help='write filler elements (concatenated with preceding segment, ending in @@)') 542 | 543 | args = parser.parse_args() 544 | 545 | return args 546 | 547 | if __name__ == '__main__': 548 | 549 | args = parse_arguments() 550 | 551 | VERBOSE = not args.q 552 | MIN_SIZE = args.min_size 553 | MIN_COUNT = args.min_count 554 | MAX_COUNT = args.max_count 555 | FEWEST = args.fewest 556 | 557 | if sys.version_info < (3, 0): 558 | sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) 559 | args.corpus = codecs.getreader('UTF-8')(args.corpus) 560 | sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) 561 | 562 | if args.train: 563 | train_model(args.corpus, args.model, args.syntax) 564 | 565 | else: 566 | if args.module: 567 | if args.model.endswith('.py'): 568 | args.model = args.model[:-3] 569 | model = __import__(args.model) 570 | 571 | else: 572 | if sys.version_info < (3, 0): 573 | file_obj = codecs.getreader('UTF-8')(open(args.model, 'r')) 574 | else: 575 | file_obj = open(args.model, 'r', encoding='UTF-8') 576 | start = file_obj.read(100) 577 | offset = start.find('{') 578 | file_obj.seek(offset) 579 | model = {} 580 | model['model'] = json.load(file_obj) 581 | model = argparse.Namespace(**model) 582 | 583 | if args.smor: 584 | smor_server = SMORSplitter(args.smor, args.no_truecase) 585 | split_function = get_FST_splits 586 | else: 587 | smor_server = None 588 | split_function = get_unsupervised_splits 589 | 590 | 591 | apply_model(args.corpus, model.model, smor_server, split_function, args.write_junctures, args.merge_junctures, args.syntax, args.no_truecase, args.dependency) 592 | --------------------------------------------------------------------------------
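The [INTERPOLATED-LM] section of the EMS configuration above combines several language models by choosing interpolation weights that minimize perplexity on a tuning set. The following is a minimal, illustrative sketch of that idea only, with toy probabilities and weights; it is not the actual interpolate-lm.perl implementation.

import math

# per-token probabilities that two toy language models assign to a tiny tuning set
p_lm1 = [0.20, 0.05, 0.10, 0.01]
p_lm2 = [0.10, 0.12, 0.02, 0.08]

def perplexity(token_probs):
    # 2 ** (average negative log2 probability per token)
    return 2 ** (-sum(math.log(p, 2) for p in token_probs) / len(token_probs))

def interpolate(weight, probs1, probs2):
    # linear interpolation: p(w|h) = weight * p1(w|h) + (1 - weight) * p2(w|h)
    return [weight * p1 + (1 - weight) * p2 for p1, p2 in zip(probs1, probs2)]

for weight in (0.0, 0.25, 0.5, 0.75, 1.0):
    print(weight, round(perplexity(interpolate(weight, p_lm1, p_lm2)), 2))

# the weight with the lowest tuning-set perplexity is the one an interpolation
# script would keep; with these toy numbers an intermediate weight wins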
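Turning to hybrid_compound_splitter.py: in purely corpus-based mode it ranks candidate splits by the geometric mean of the segments' corpus frequencies, following Koehn & Knight (2003). Below is a minimal, self-contained sketch of that scoring; the function name geometric_mean_score and the word counts are invented for illustration and are not part of the script.

from functools import reduce
from operator import mul

# hypothetical corpus frequencies standing in for counts collected with -train
toy_freq = {'aktion': 852, 'plan': 710, 'aktionsplan': 4}

def geometric_mean_score(segments, freq):
    # geometric mean of segment frequencies; in this sketch, unknown segments
    # get frequency 0 and rule the split out
    counts = [freq.get(segment, 0) for segment in segments]
    if 0 in counts:
        return 0.0
    return reduce(mul, counts) ** (1.0 / len(counts))

candidates = [['aktionsplan'], ['aktion', 'plan']]
for candidate in candidates:
    print('{0}: {1:.1f}'.format(' '.join(candidate), geometric_mean_score(candidate, toy_freq)))

# prints roughly:
#   aktionsplan: 4.0
#   aktion plan: 777.8
# so the two-part split wins, as it would in apply_model(); with -fewest, candidates
# are ranked primarily by how few segments they have, and frequency only breaks ties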
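The -write-filler and -merge-filler options of the same script differ only in how the German filler element (Fugenelement) is written into the output. Here is a small sketch of the merge step used in apply_model(); the example word and the helper name merge_fillers are illustrative only.

def merge_fillers(tokens):
    # glue '@s@'-style filler tokens onto the preceding segment, marking it with '@@'
    merged = []
    for item in tokens:
        if merged and len(item) > 1 and item[0] == item[-1] == '@':
            merged[-1] += item[1:-1] + '@@'
        else:
            merged.append(item)
    return merged

print(' '.join(merge_fillers(['arbeit', '@s@', 'markt'])))
# -write-filler output:  arbeit @s@ markt   (filler kept as a separate token)
# -merge-filler output:  arbeits@@ markt    (filler glued to the preceding segment)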
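Finally, the statistical model that -train produces via write_model() is a Python-syntax file whose payload is a JSON object, which is why apply mode can either import it as a module (-module, if the file ends in .py and sits next to the script) or skip to the first '{' and hand the rest to the JSON parser. A sketch with an invented file name and toy frequencies:

import json

toy_frequencies = {'arbeit': 1234, 'markt': 567, 'arbeitsmarkt': 12}

# write a model file in the same layout as write_model(); 'toy-split-model.py'
# is a placeholder name, not a file shipped with the repository
with open('toy-split-model.py', 'w') as model_file:
    model_file.write('# -*- coding: utf-8 -*-\n\n')
    model_file.write('from __future__ import unicode_literals\n\n')
    model_file.write('model = ')
    json.dump(toy_frequencies, model_file, indent=2)

# read it back roughly the way apply mode does without -module:
# find the first '{' and parse the remainder as JSON
with open('toy-split-model.py') as model_file:
    text = model_file.read()
    frequencies = json.loads(text[text.index('{'):])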