├── example
│   ├── working-dir
│   │   └── .ignore
│   ├── data
│   │   ├── parallelC.de-en.parsed_target.en
│   │   ├── parallelC.de-en.parsed_both.de
│   │   ├── parallelC.de-en.parsed_both.en
│   │   ├── parallelC.de-en.parsed_target.de
│   │   ├── parallelC.de-en.en
│   │   └── parallelC.de-en.de
│   ├── README.md
│   ├── toy_example.config
│   ├── toy_example_2015.config
│   ├── toy_example_2015_2.config
│   ├── toy_example_2015_3.config
│   ├── toy_example_2015_4.config
│   ├── toy_example_2015_5.config
│   └── toy_example_2015_6.config
├── emnlp2015
│   ├── split_and_restructure.sh
│   ├── detruecase_ptkvz.sh
│   ├── oov_filter.py
│   ├── unbinarize.py
│   ├── fst_wrapper.py
│   ├── binarize.py
│   ├── separable_prefix_postprocessing.py
│   ├── separable_prefix.py
│   └── hyphen-splitter.py
├── README.md
├── enrich_labelset.py
└── hybrid_compound_splitter.py

/example/working-dir/.ignore:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/example/data/parallelC.de-en.parsed_target.en:
--------------------------------------------------------------------------------
1 | parallelC.de-en.en
--------------------------------------------------------------------------------
/example/data/parallelC.de-en.parsed_both.de:
--------------------------------------------------------------------------------
1 | parallelC.de-en.parsed.de
--------------------------------------------------------------------------------
/example/data/parallelC.de-en.parsed_both.en:
--------------------------------------------------------------------------------
1 | parallelC.de-en.parsed.en
--------------------------------------------------------------------------------
/example/data/parallelC.de-en.parsed_target.de:
--------------------------------------------------------------------------------
1 | parallelC.de-en.parsed.de
--------------------------------------------------------------------------------
/emnlp2015/split_and_restructure.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | #perform compound splitting and particle verb restructuring
4 | 
5 | script_dir=$1
6 | shift
7 | smor=$1
8 | shift
9 | 
10 | $script_dir/hybrid_compound_splitter.py \
11 |     -smor $smor \
12 |     -write-filler -no-truecase -q -syntax -fewest -dependency $@ \
13 |     | $script_dir/emnlp2015/hyphen-splitter.py -syntax \
14 |     | $script_dir/emnlp2015/separable_prefix.py $smor
--------------------------------------------------------------------------------
/emnlp2015/detruecase_ptkvz.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # EMS hack: do post-processing of particle verbs in detruecase step;
4 | # instead of string translation output, we need tree output that we take from -Ttree file.
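# usage sketch, inferred from the code below (paths are illustrative):
#   ./detruecase_ptkvz.sh /path/to/wmt2014-scripts decoder-tree-output
# the second argument is the -Ttree file mentioned above; the lines marked "Full Tree" are extracted from it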
5 | 6 | script_dir=$1 7 | shift 8 | 9 | grep "Full Tree" $1 | cut -f 2- -d ":" | cut -f "2-" -d " " | \ 10 | python3 $script_dir/emnlp2015/unbinarize.py | \ 11 | python $script_dir/emnlp2015/separable_prefix_postprocessing.py 12 | -------------------------------------------------------------------------------- /emnlp2015/oov_filter.py: -------------------------------------------------------------------------------- 1 | # filter out all phrases in a phrase table that contain words that are not in 2 | # the provided vocabulary file 3 | 4 | # usage: python oov_filter.py vocabulary_file < phrase_table_in > phrase_table_out 5 | 6 | import sys 7 | 8 | vocab = open(sys.argv[1]).readlines() 9 | vocab = set([item.strip() for item in vocab]) 10 | 11 | discarded = open('discarded','w') 12 | 13 | count = 0 14 | dcount = 0 15 | for line in sys.stdin: 16 | count += 1 17 | linesplit = line.split('|||') 18 | for word in linesplit[1].split()[:-1]: 19 | if word.startswith('['): 20 | continue 21 | elif word not in vocab: 22 | discarded.write(line) 23 | dcount += 1 24 | break 25 | else: 26 | print line, 27 | 28 | sys.stderr.write('{0} out of {1} lines discarded\n'.format(dcount, count)) 29 | -------------------------------------------------------------------------------- /emnlp2015/unbinarize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | 6 | from __future__ import print_function, unicode_literals 7 | import sys 8 | import tree 9 | import re 10 | 11 | whitespace = re.compile('\s+') 12 | 13 | def get_unbinarized_children(t, children=None): 14 | 15 | if children is None: 16 | children = [] 17 | 18 | for child in t: 19 | if isinstance(child, tree.Tree) and child.node.startswith('^'): 20 | get_unbinarized_children(child, children) 21 | else: 22 | children.append(child) 23 | 24 | if not isinstance(t, tree.Tree) or t.node.startswith('^'): 25 | return 26 | else: 27 | t[:] = children 28 | for child in t: 29 | get_unbinarized_children(child) 30 | 31 | 32 | 33 | if __name__ == '__main__': 34 | for line in sys.stdin: 35 | t = tree.Tree(line) 36 | get_unbinarized_children(t) 37 | print(whitespace.sub(' ',t.__str__())) -------------------------------------------------------------------------------- /emnlp2015/fst_wrapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # Author: Beat Kunz 4 | 5 | from __future__ import unicode_literals, print_function 6 | import sys 7 | import re 8 | import pexpect 9 | 10 | 11 | class FstWrapper(): 12 | def __init__(self, smor_binary, smor_model): 13 | self.child = pexpect.spawnu(smor_binary + ' ' + smor_model) 14 | self.child.delaybeforesend = 0 15 | self.child.expect(["analyze> ", pexpect.EOF], timeout=600) 16 | self.morAnalyseMode = True 17 | before = self.child.before 18 | if self.child.terminated: 19 | raise RuntimeError(before) 20 | 21 | def analyse(self, word): 22 | word = word.strip() 23 | if word == "" or word == "q" or word == "\x7f": 24 | return [] 25 | # if not in analyse mode, go to it 26 | if self.morAnalyseMode == False: 27 | # print "Was not in analyse mode => toggle to it!" 
28 | self.toggleMorMode() 29 | self.child.sendline("") # "" is used in the fst-mor to toggle between analyse/generate 30 | self.child.expect(["analyze> ", pexpect.EOF]) 31 | self.child.before 32 | self.child.sendline(word) 33 | try: 34 | self.child.expect(["analyze> ", pexpect.EOF]) 35 | except pexpect.TIMEOUT: 36 | sys.stderr.write('Warning: timeout while waiting for fst-mor\n') 37 | sys.stderr.write('String: {0}'.format(word)) 38 | return [] 39 | result = self.child.before.split("\r\n")[1:-1] 40 | if len(result) == 1 and re.match("^no result for ", result[0]): 41 | result = [] 42 | return result 43 | 44 | def generate(self, word): 45 | word = word.strip() 46 | if word == "" or word == "q": 47 | return [] 48 | # if not in analyse mode, go to it 49 | if self.morAnalyseMode == True: 50 | # print "Was not in generate mode => toggle to it!" 51 | self.toggleMorMode() 52 | self.child.sendline("") # "" is used in the fst-mor to toggle between analyse/generate 53 | self.child.expect(["generate> ", pexpect.EOF]) 54 | self.child.before 55 | self.child.sendline(word) 56 | try: 57 | self.child.expect(["generate> ", pexpect.EOF]) 58 | except pexpect.TIMEOUT: 59 | sys.stderr.write('Warning: timeout while waiting for fst-mor\n') 60 | sys.stderr.write('String: {0}'.format(word)) 61 | return [] 62 | result = self.child.before.split("\r\n")[1:-1] 63 | if len(result) == 1 and re.match("^no result for ", result[0]): 64 | result = [] 65 | return result 66 | 67 | # if you just want to play around you can use this function 68 | def openShell(self): 69 | 70 | while True: 71 | input_string = raw_input("input<<<<") 72 | if input_string == "": 73 | self.toggleMorMode() 74 | self.child.sendline(input_string) 75 | if self.morAnalyseMode == True: 76 | self.child.expect(["analyze> ", pexpect.EOF]) 77 | else: 78 | self.child.expect(["generate> ", pexpect.EOF]) 79 | 80 | def toggleMorMode(self): 81 | self.morAnalyseMode = not self.morAnalyseMode -------------------------------------------------------------------------------- /example/README.md: -------------------------------------------------------------------------------- 1 | Toy EMS Config for String-to-Tree SMT System 2 | ============================================ 3 | 4 | The EMS configuration file `toy_example.config` documents the settings used to 5 | build a string-to-tree SMT system like our submission to WMT 2014. 6 | 7 | The configuration only uses some toy data that is provided with this repository, 8 | but a full-scale system can be implemented by replacing references to 9 | `parallelA`, `parallelB` and `monolingualA` with real data sets, and changing 10 | the tuning and evaluation sets. 11 | 12 | Main differences from the WMT 2014 submission: 13 | 14 | - this config does not include syntactic constraints 15 | - this config does not filter the tuning set to short sentences 16 | 17 | The file `toy_example_2015.config` shows the base configuration of the WMT 2015 submissions. 18 | It includes tuning on the head-word chain metric (HWCM), and some updated settings. 
19 | 20 | - `toy_example_2015_2.config` adds head binarization 21 | - `toy_example_2015_3.config` adds a relational dependency language model 22 | - `toy_example_2015_4.config` adds source-syntactic constraints 23 | - `toy_example_2015_5.config` adds a 5-gram neural language model 24 | - `toy_example_2015_6.config` slighly modifies compound splitting, and adds particle verb restructuring 25 | 26 | [on real-sized data, some steps (such as parsing and training neural networks on all monolingual data) 27 | may take a long time, and you may want to consider to manually distribute the workload over many machines, 28 | and/or to only parse the parallel data and train neural networks on a subset of data, and/or for fewer epochs.] 29 | 30 | `toy_example_2015_5.config` contains all models of our official WMT 2015 submission (uedin-syntax); our submission contains two manual "hacks" not automated by EMS: 31 | - we remove all virtual nodes from the tree binarization (those starting in "^") from `model/unknown-word-soft-matches.*` 32 | [this means that unknown words are not allowed to match those nodes; RDLM produces lots of warnings if these matches are allowed] 33 | - we remove all rule table entries from `model/phrase-table.*` whose target side contains words that are not in the vocabulary of RDLM and NPLM. 34 | [this avoids problems with poor probability estimates for those translations] 35 | (see `emnlp2015/oov_filter.py`) 36 | 37 | 38 | 39 | Instructions 40 | ------------ 41 | 42 | 1. download and install all required software 43 | 44 | - mosesdecoder (http://statmt.org/moses/) 45 | - ParZu (https://github.com/rsennrich/ParZu) 46 | - mgiza (https://github.com/moses-smt/mgiza) 47 | - SRILM (http://www.speech.sri.com/projects/srilm/) [LM training could also be done with other tools, but SRILM is still used for interpolation] 48 | 49 | for some configs, also install the following: 50 | - NPLM (https://github.com/rsennrich/nplm/) for RDLM and NPLM toy_example_2015_{3,5} 51 | if you use NPLM, (re-)compile Moses with the option "--with-nplm=" 52 | - Stanford CoreNLP (http://nlp.stanford.edu/software/corenlp.shtml) for English parsing for toy_example_2015_4 53 | - Maltparser (http://www.maltparser.org/) for projectivization of English parse trees for toy_example_2015_4 54 | 55 | 2. set the paths in the first 20 lines of `toy_example.config` 56 | 57 | 3. run EMS with the example configuration. Models etc. are written to `working-dir` 58 | 59 | /path/to/mosesdecoder/scripts/ems/experiment.perl --config toy_example.config --exec 60 | 61 | 62 | Common issues 63 | ------------- 64 | 65 | these configs were tested with moses commit 5d8af9c (29 May 2015), and 89d16a4 (31 July 2015). 
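If RDLM produces warnings about soft matches against virtual nodes, or rules with out-of-vocabulary target words receive poor scores, the two manual steps described further above can be approximated as follows. This is only a sketch: the run number, file names and the vocabulary file are illustrative, and the vocabulary file is assumed to contain one word per line (the format read by `emnlp2015/oov_filter.py`).

    # drop soft matches for virtual nodes (labels containing "^") introduced by binarization
    grep -vF '^' model/unknown-word-soft-matches.1 > model/unknown-word-soft-matches.1.filtered

    # keep only rule table entries whose target-side words are in the RDLM/NPLM vocabulary
    zcat model/phrase-table.1.gz | python /path/to/wmt2014-scripts/emnlp2015/oov_filter.py neural-vocab.txt | gzip > model/phrase-table.1.filtered.gz

(`oov_filter.py` also writes the discarded entries to a file named `discarded` in the current directory.)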
66 | -------------------------------------------------------------------------------- /emnlp2015/binarize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | # perform deterministic head binarization of trees that were converted from dependency format (with mosesdecoder/scripts/training/wrappers/conll2mosesxml.py): 6 | # right-binarization of the head and its pre-modifiers, followed by left-binarization of all post-modifiers 7 | 8 | from __future__ import print_function, unicode_literals 9 | import sys 10 | import codecs 11 | from collections import defaultdict 12 | 13 | try: 14 | from lxml import etree as ET 15 | except ImportError: 16 | from xml.etree import cElementTree as ET 17 | 18 | def escape_xml(element): 19 | 20 | if element.text: 21 | element.text = element.text.replace('\'',''') 22 | element.text = element.text.replace('"','"') 23 | 24 | for child in element: 25 | escape_xml(child) 26 | 27 | def escape_text(s): 28 | 29 | s = s.replace('|','|') # factor separator 30 | s = s.replace('[','[') # syntax non-terminal 31 | s = s.replace(']',']') # syntax non-terminal 32 | 33 | s = s.replace('&apos;',''') # lxml is buggy if input is escaped 34 | s = s.replace('&quot;','"') # lxml is buggy if input is escaped 35 | 36 | return s 37 | 38 | # assume dependency structure where each nonterminal has exactly one pre-terminal child, which is the head of the structure. 39 | def find_head(xml): 40 | for i, child in enumerate(xml): 41 | if len(child) == 0: 42 | return i 43 | # if no head found, we pick the last child (which results in right-binarization of tree) 44 | return len(xml)-1 45 | 46 | def binarize(xml, mode): 47 | 48 | for child in xml: 49 | binarize(child, mode) 50 | 51 | if len(xml) > 2 and mode == 'head': 52 | head_position = find_head(xml) 53 | # right-binarize head position and everything before it 54 | while head_position > 0 and len(xml) > 2: 55 | head_position -= 1 56 | virtual_node = ET.Element('tree') 57 | if head_position > 0: 58 | # prefix '^i' marks that we expect more siblings on the left (and possibly on the right) 59 | virtual_node.set('label', '^i' + xml.get('label')) 60 | else: 61 | # prefix '^l' marks that we reached beginning of structure and have more siblings on the right 62 | virtual_node.set('label', '^l' + xml.get('label')) 63 | virtual_node.append(xml[head_position]) 64 | virtual_node.append(xml[head_position]) 65 | xml.insert(head_position, virtual_node) 66 | # left-binarize the rest 67 | while len(xml) > 2: 68 | virtual_node = ET.Element('tree') 69 | virtual_node.set('label', '^l' + xml.get('label')) 70 | virtual_node.append(xml[0]) 71 | virtual_node.append(xml[0]) 72 | xml.insert(0, virtual_node) 73 | 74 | else: 75 | while len(xml) > 2: 76 | virtual_node = ET.Element('tree') 77 | virtual_node.set('label', '^' + xml.get('label')) 78 | if mode == 'left': 79 | virtual_node.append(xml[0]) 80 | virtual_node.append(xml[0]) 81 | xml.insert(0, virtual_node) 82 | elif mode == 'right': 83 | virtual_node.append(xml[-2]) 84 | virtual_node.append(xml[-1]) 85 | xml.append(virtual_node) 86 | 87 | if __name__ == '__main__': 88 | 89 | if sys.version_info < (3, 0): 90 | sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) 91 | sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) 92 | sys.stdin = codecs.getreader('UTF-8')(sys.stdin) 93 | 94 | mode = sys.argv[1] 95 | 96 | for line in sys.stdin: 97 | if line == '\n': 98 | sys.stdout.write(line) 99 | continue 100 | xml 
= ET.fromstring(line) 101 | binarize(xml, mode) 102 | escape_xml(xml) 103 | sys.stdout.write(escape_text(ET.tostring(xml, encoding="UTF-8").decode("UTF-8") + '\n')) -------------------------------------------------------------------------------- /emnlp2015/separable_prefix_postprocessing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | # restore original representation of particle verbs. 6 | # described in Rico Sennrich and Barry Haddow (2015). A Joint Dependency Model of Morphological and Syntactic Structure for Statistical Machine Translation. Proceedings of EMNLP. 7 | 8 | from __future__ import unicode_literals 9 | import sys 10 | import codecs 11 | import tree 12 | 13 | def first_leaf(node): 14 | if isinstance(node, tree.Tree) and len(node): 15 | return first_leaf(node[0]) 16 | else: 17 | return node 18 | 19 | def last_leaf(node): 20 | if isinstance(node, tree.Tree) and len(node): 21 | return last_leaf(node[-1]) 22 | else: 23 | return node 24 | 25 | def comma_enclosure(node): 26 | comma = False 27 | if len(node): 28 | if first_leaf(node).strip() == b',' and not node.node.startswith('kon'): 29 | comma = True 30 | if comma and len(node) > 1 and last_leaf(node).strip() != b',': 31 | node.append(tree.Tree(b'[comma [$, ,]]')) 32 | return 33 | elif isinstance(node, tree.Tree) and len(node): 34 | comma_enclosure(node[-1]) 35 | 36 | def convert_ptkvz(node): 37 | 38 | part = None 39 | avz = None 40 | v_pos = None 41 | 42 | for i,child in list(enumerate(node)): 43 | if isinstance(child, tree.Tree): 44 | convert_ptkvz(child) 45 | 46 | if child.node == b'avz': 47 | for grandchild in child: 48 | if grandchild.node == b'PTKVZ': 49 | avz = grandchild 50 | avz_pos = i 51 | 52 | elif child.node == b'part': 53 | for grandchild in child: 54 | if grandchild.node == b'PTKZU': 55 | part = grandchild 56 | part_pos = i 57 | 58 | elif child.node.startswith(b'V'): 59 | v_pos = i 60 | if avz is not None: 61 | # infinitive with zu-prefix and 62 | if child.node == b'VVINF' and part is not None and avz is not None and part_pos == i-2 and avz_pos == i-1: 63 | child[0] = avz[0] + part[0] + child[0] 64 | del node[part_pos] 65 | del node[part_pos] 66 | avz = None 67 | child.node = b'VVIZU' 68 | 69 | elif child.node in [b'VVINF', b'VVPP'] and avz is not None and avz_pos == i-1: 70 | child[0] = avz[0] + child[0] 71 | del node[avz_pos] 72 | avz = None 73 | 74 | # we are not in main clause, so we should concatenate prefix and verb 75 | elif avz is not None and (node.node in [b'objc', b'subjc', b'neb', b'rel', b'aux', b'root', b'vkon_sub'] or node.node.startswith(b'obji') or node.node.startswith(b'kon')) and avz_pos == i-1: 76 | child[0] = avz[0] + child[0] 77 | avz = None 78 | del node[avz_pos] 79 | 80 | # identify end field by fact that subordinated clause follows 81 | elif avz is not None and v_pos is not None and (child.node in [b'objc', b'obji', b'subjc', b'rel', b'neb', b'vroot', b'comma', b'aux'] or child.node.startswith(b'kon') or child.node.startswith(b'obji')): 82 | node.insert(i, avz) 83 | del node[avz_pos] 84 | avz = None 85 | comma_enclosure(node[i-1]) 86 | 87 | # we insert avz as last dependent if we haven't already 88 | if v_pos is not None and avz is not None: 89 | node.append(avz) 90 | del node[avz_pos] 91 | comma_enclosure(node[-2]) 92 | 93 | 94 | 95 | if __name__ == '__main__': 96 | 97 | for line in sys.stdin: 98 | my_tree = tree.Tree(line) 99 | convert_ptkvz(my_tree) 100 | if '--tree' 
in sys.argv: 101 | sys.stdout.write(my_tree._pprint_flat(nodesep=b'', parens=b'[]', quotes=False) + b'\n') 102 | else: 103 | sys.stdout.write(b' '.join([leaf for leaf in my_tree.leaves() if leaf not in [b'', b'']]) + b'\n') -------------------------------------------------------------------------------- /emnlp2015/separable_prefix.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | # normalize representation of German particle verbs to common representation 6 | # described in Rico Sennrich and Barry Haddow (2015). A Joint Dependency Model of Morphological and Syntactic Structure for Statistical Machine Translation. Proceedings of EMNLP. 7 | 8 | from __future__ import print_function, unicode_literals 9 | import sys 10 | import codecs 11 | from collections import defaultdict 12 | 13 | import fst_wrapper 14 | 15 | from lxml import etree as ET 16 | 17 | def get_text(element, text): 18 | if element.text: 19 | text.append(element.text) 20 | for child in element: 21 | get_text(child, text) 22 | if element.tail: 23 | text.append(element.tail) 24 | 25 | def strip_xml(xml): 26 | text_list = [] 27 | get_text(xml, text_list) 28 | text = ' '.join([t.strip() for t in text_list]) 29 | return text 30 | 31 | def escape_xml(element): 32 | 33 | if element.text: 34 | element.text = element.text.replace('\'',''') 35 | element.text = element.text.replace('"','"') 36 | 37 | for child in element: 38 | escape_xml(child) 39 | 40 | def escape_text(s): 41 | 42 | s = s.replace('|','|') # factor separator 43 | s = s.replace('[','[') # syntax non-terminal 44 | s = s.replace(']',']') # syntax non-terminal 45 | 46 | s = s.replace('&apos;',''') # lxml is buggy if input is escaped 47 | s = s.replace('&quot;','"') # lxml is buggy if input is escaped 48 | 49 | return s 50 | 51 | def convert_ptkvz(xml): 52 | 53 | vvfin = None 54 | avz = None 55 | 56 | offset = 0 57 | for i, child in list(enumerate(xml)): 58 | # separate prefix from verbs 59 | if child.get('label').startswith('VV') and child.text: 60 | split = has_vpart(child.text.strip()) 61 | if split: 62 | avz = ET.Element('tree') 63 | avz.set('label', 'avz') 64 | ptkvz = ET.Element('tree') 65 | ptkvz.set('label', 'PTKVZ') 66 | ptkvz.text = split[0] 67 | avz.append(ptkvz) 68 | xml.insert(i+offset,avz) 69 | child.text = split[1] 70 | if split[1].startswith('zu') and split[2]: 71 | part = ET.Element('tree') 72 | part.set('label', 'part') 73 | ptkzu = ET.Element('tree') 74 | ptkzu.set('label', 'PTKZU') 75 | ptkzu.text = 'zu' 76 | part.append(ptkzu) 77 | xml.insert(i+offset,part) 78 | offset += 1 79 | child.text = split[1][2:] 80 | child.set('label', 'VVINF') 81 | offset += 1 82 | 83 | if child.get('label') == 'VVFIN': 84 | vvfin = child 85 | vvfin_pos = i+offset 86 | elif child.get('label') == 'avz': 87 | avz = child 88 | break 89 | 90 | # verb has separated prefix: reorder 91 | if vvfin is not None and avz is not None: 92 | xml.insert(vvfin_pos, avz) 93 | 94 | # recursion 95 | for child in xml: 96 | convert_ptkvz(child) 97 | 98 | 99 | def has_vpart(word): 100 | if word in smor_cache: 101 | return smor_cache[word] 102 | else: 103 | analyses = sorted(smor.analyse(word), key=lambda x: x.count('<')) 104 | analyses = [x for x in analyses if '<+V>' in x] 105 | if analyses and all('<#>' in line for line in analyses): 106 | prefix_len = analyses[0].index('<#>') 107 | if analyses[0].startswith(''): 108 | prefix_len -= 5 109 | has_zu = "" in analyses[0] 110 | 
smor_cache[word] = word[:prefix_len], word[prefix_len:], has_zu 111 | return word[:prefix_len], word[prefix_len:], has_zu 112 | else: 113 | smor_cache[word] = False 114 | return False 115 | 116 | 117 | if __name__ == '__main__': 118 | 119 | if '-train' in sys.argv: 120 | sys.exit(0) 121 | 122 | smor = fst_wrapper.FstWrapper('fst-mor', sys.argv[1]) 123 | smor_cache = {} 124 | 125 | if sys.version_info < (3, 0): 126 | sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) 127 | sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) 128 | sys.stdin = codecs.getreader('UTF-8')(sys.stdin) 129 | 130 | for line in sys.stdin: 131 | if line == '\n': 132 | sys.stdout.write(line) 133 | continue 134 | xml = ET.fromstring(line) 135 | convert_ptkvz(xml) 136 | escape_xml(xml) 137 | sys.stdout.write(escape_text(ET.tostring(xml, encoding="UTF-8").decode("UTF-8") + '\n')) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Scripts for Edinburgh English-German syntax system for WMT 2014 and WMT 2015 2 | ============================================================================ 3 | 4 | This repository contains scripts and an example config used for the Edinburgh syntax submission (UEDIN-SYNTAX) for the English-German 5 | shared translation task at the 2014 and 2015 Workshops on Statistical Machine Translation (http://www.statmt.org/wmt14/ http://www.statmt.org/wmt15/). 6 | 7 | The scripts will facilitate the reproduction of our results, and may be useful for people who want to use ParZu (or a different parser with the dependency format by Kilian Foth) for SMT, 8 | or string-to-tree systems in general. The hybrid compound splitter can also be used for phrase-based systems, and with German as source language. 9 | 10 | CONTENTS 11 | -------- 12 | 13 | - hybrid_compound_splitter.py 14 | 15 | compound splitter for German (hybrid of finite-state and corpus-based methods as described in Fritzinger & Fraser (2010)), 16 | with a novel syntactic representation of split compounds for simple compound merging after string-to-tree translation. 17 | The syntactic representation of split compounds is treebank independent and described in Sennrich, Williams and Huck (2014). 18 | 19 | The system to WMT 2014 used the following commands for training/applying the compound splitter: 20 | 21 | `hybrid_compound_splitter.py -train -syntax -corpus INPUT_FILE -model MODEL_FILE` 22 | `hybrid_compound_splitter.py -write-filler -no-truecase -q -syntax -smor zmorge-{version}-smor_newlemma.a -model MODEL_FILE < INPUT_FILE > OUTPUT_FILE` 23 | 24 | In a string-to-tree system with a syntactic representation of compounds, 25 | just apply the following regex substitution to the output for compound merging: 26 | 27 | `s/ \@(.*?)\@ /\1/g;` 28 | 29 | - enrich_labelset.py 30 | 31 | modification of ParZu dependency label set for SMT, splitting up overgeneral labels into distinct subtypes. 32 | This script can be applied to ParZu output in CONLL format (before conversion into moses format 33 | with the script included in mosesdecoder under `scripts/training/wrappers/conll2mosesxml.py`). 34 | 35 | Use command line option `--wmt14` to activate the modifications used for the submission. 
36 | Assuming you have the (German-side) tokenized corpus as `INPUT_FILE`, the Moses parsed files are generated as follows: 37 | 38 | ``` 39 | /path/to/mosesdecoder/scripts/tokenizer/deescape-special-chars.perl < INPUT_FILE | \ 40 | /path/to/ParZu/parzu -i tokenized_lines --projective | \ 41 | enrich_labelset.py --wmt14 | \ 42 | /path/to/mosesdecoder/scripts/training/wrappers/conll2mosesxml.py 43 | ``` 44 | 45 | - emnlp2015/* 46 | 47 | scripts used for tree binarization, verb particle restructuring, and (a modified) compound splitting. 48 | The techniques are described in Sennrich and Haddow (2015). 49 | 50 | - example/toy_example*.config 51 | 52 | toy configs for the moses experimental management system (EMS) that document good settings for training 53 | string-to-tree system, and automates the integration of ParZu, compound splitting, tuning on a syntactic 54 | metric, a relational dependency language model, and other models into the training process. 55 | The different toy examples also document our submissions to the WMT 2014/5 shared translation tasks. 56 | 57 | To facilitate reproduction of our results, parses of the German WMT data sets have been released: 58 | http://statmt.org/rsennrich/parsed_wmt/ 59 | 60 | LICENSE 61 | ------- 62 | 63 | The scripts are available under the LGPL v2. 64 | 65 | PUBLICATIONS 66 | ------------ 67 | 68 | The Edinburgh syntax submission to WMT 2014 is described in: 69 | 70 | Philip Williams, Rico Sennrich, Maria Nadejde, Matthias Huck, Eva Hasler and Philipp Koehn (2014): 71 | Edinburgh's Syntax-Based Systems at WMT 2014. In: Proceedings of the Ninth Workshop on Statistical Machine Translation. 72 | 73 | More details are provided in: 74 | 75 | Rico Sennrich, Philip Williams, Matthias Huck (2015): 76 | A tree does not make a well-formed sentence: Improving syntactic string-to-tree statistical machine translation with more linguistic knowledge. 77 | In: Computer Speech & Language, 32(1):27-45. Hybrid Machine Translation: integration of linguistics and statistics. 78 | 79 | The Edinburgh syntax submission to WMT 2015 is described in: 80 | 81 | Philip Williams, Rico Sennrich, Maria Nadejde, Matthias Huck and Philipp Koehn (2015): 82 | Edinburgh's Syntax-Based Systems at WMT 2015. 83 | In: Proceedings of the Tenth Workshop on Statistical Machine Translation. Lisbon, Portugal, pp. 199-209. 84 | 85 | More details are provided in: 86 | 87 | Rico Sennrich (2015): 88 | Modelling and Optimizing on Syntactic N-Grams for Statistical Machine Translation. 89 | In: Transactions of the Association for Computational Linguistics 3, 169--182. 90 | 91 | Rico Sennrich and Barry Haddow (2015): 92 | A Joint Dependency Model of Morphological and Syntactic Structure for Statistical Machine Translation. 93 | In: Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing. Lisbon, Portugal, pp. 2081-2087. -------------------------------------------------------------------------------- /emnlp2015/hyphen-splitter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | # hyphen splitter: splits all hyphenated words, and with option -syntax, creates a hierarchical tree in moses XML format. 
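# illustrative behaviour of the plain (non-syntax) mode, following the regex substitution below:
#   "Test-Datei"  ->  "Test @-@ Datei"    (default)
#   "Test-Datei"  ->  "Test-@@ Datei"     (with -merge-filler)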
6 | 7 | from __future__ import division, unicode_literals 8 | import sys 9 | import re 10 | import codecs 11 | import argparse 12 | 13 | from lxml import etree as ET 14 | 15 | def create_compound_xml(element, wordlist, merge_junctures, dependency, initial=False): 16 | 17 | # separate last segment, then recursively label remainder as compound modifier 18 | if initial: 19 | juncture = '' 20 | dep = ET.Element('tree') 21 | dep.set('label', 'SEGMENT') 22 | dep.text = wordlist[-1] 23 | remainder = wordlist[:-1] 24 | if remainder: 25 | create_compound_xml(element, remainder, merge_junctures, dependency) 26 | element.append(dep) 27 | return 28 | 29 | juncture = wordlist[-1] 30 | word = wordlist[-2] 31 | remainder = wordlist[:-2] 32 | 33 | head = ET.Element('tree') 34 | head.set('label', 'comp_mod') 35 | element.append(head) 36 | 37 | dep1 = ET.Element('tree') 38 | dep1.text = word 39 | if merge_junctures: 40 | dep1.set('label', 'SEGMENT+JUNC') 41 | else: 42 | dep1.set('label', 'SEGMENT') 43 | 44 | if remainder: 45 | create_compound_xml(head, remainder, merge_junctures, dependency) 46 | 47 | head.append(dep1) 48 | 49 | dep2 = ET.Element('tree') 50 | dep2.set('label', 'JUNC') 51 | dep2.text = juncture 52 | dep3 = ET.Element('tree') 53 | dep3.set('label', 'junc') 54 | dep3.append(dep2) 55 | head.append(dep3) 56 | 57 | 58 | def main(file_obj, merge_junctures, syntax, dependency): 59 | 60 | re_syntax_splitter = re.compile(r'((?:\s*(?:<[^<>]*>)+\s*)|(?:(?)\s+(?!<)))') 61 | re_hyphen_splitter = re.compile(r'(\S+?)\-(?=\S)') 62 | 63 | for line in file_obj: 64 | 65 | # only do syntactic processing if option syntax is used and we see '<' in line 66 | write_syntax = syntax 67 | if write_syntax and not '<' in line: 68 | write_syntax = False 69 | 70 | if write_syntax: 71 | words_in = re_syntax_splitter.split(line) 72 | words_in_clean = [word for word in words_in if word and not word.startswith('<') and not word == ' '] 73 | else: 74 | words_in = line.split() 75 | words_in_clean = words_in 76 | 77 | words = [] 78 | for word in words_in: 79 | 80 | if not word: 81 | continue 82 | if word == ' ' or (write_syntax and word.startswith('<')) or word == '@-@': 83 | words.append(word) 84 | continue 85 | 86 | if merge_junctures: 87 | word = re_hyphen_splitter.sub(r'\1-@@ ', word) 88 | else: 89 | word = re_hyphen_splitter.sub(r'\1 @-@ ', word) 90 | 91 | if write_syntax and len(word.split()) > 1: 92 | head = ET.Element('x') 93 | create_compound_xml(head, word.split(), merge_junctures, dependency, initial=True) 94 | word = ET.tostring(head, encoding="UTF-8")[3:-4].decode("UTF-8") 95 | word = word.rsplit('<',1)[0] 96 | words[-1] = words[-1].rsplit('<',1)[0] 97 | 98 | words.append(word) 99 | 100 | if write_syntax: 101 | sys.stdout.write(''.join(words)) 102 | else: 103 | sys.stdout.write(' '.join(words) + '\n') 104 | 105 | 106 | def parse_arguments(): 107 | 108 | help_text = "hyphen splitter\n" 109 | 110 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=help_text) 111 | 112 | general = parser.add_argument_group('general options') 113 | 114 | general.add_argument('-model', metavar='MODEL', 115 | help='path to statistical model. Currently ignored.') 116 | general.add_argument('-corpus', type=argparse.FileType('r'), default=sys.stdin, metavar='PATH', 117 | help='input text (default: standard input).') 118 | general.add_argument('-train', action="store_true", 119 | help='train model on input text. 
Currently ignored.') 120 | general.add_argument('-syntax', action="store_true", 121 | help='input/output is syntactic tree') 122 | general.add_argument('-q', action="store_true", 123 | help='quiet mode.') 124 | general.add_argument('-dependency', action='store_true', 125 | help='dependency-like representation of compounds (ensure that every nonterminal in compound representation has exactly one preterminal)') 126 | 127 | general.add_argument('-merge-filler', action="store_true", dest='merge_junctures', 128 | help='concatenate hyphens with preceding word ("Test-@@ Datei" instead of "Test @-@ Datei")') 129 | 130 | args = parser.parse_args() 131 | 132 | return args 133 | 134 | if __name__ == '__main__': 135 | 136 | args = parse_arguments() 137 | 138 | VERBOSE = not args.q 139 | 140 | if sys.version_info < (3, 0): 141 | sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) 142 | args.corpus = codecs.getreader('UTF-8')(args.corpus) 143 | sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) 144 | 145 | if args.train: 146 | sys.exit(0) 147 | 148 | else: 149 | main(args.corpus, args.merge_junctures, args.syntax, args.dependency) 150 | -------------------------------------------------------------------------------- /example/data/parallelC.de-en.en: -------------------------------------------------------------------------------- 1 | The ECB wants to hold inflation to under two percent , or somewhere in that vicinity . 2 | They also predict that the ECB will cut interest rates twice during the course of 2008 . 3 | The infection was detected in a man who had been hospitalized after a major accident . 4 | About two-thirds of the infections -LRB- 683 cases -RRB- were caused by MRSAs , whose official name is a methicillin -LRB- oxacillin -RRB- resistant Staphylococcus aureus . 5 | One-third of all people carry the Staphylococcus aureus -LRB- SA -RRB- bacteria in their noses without it ever causing an infection . 6 | Analysts suggest that people who buy real estate in Bulgaria would also like to change their lifestyles . 7 | Hungary 's six percent 8 | The ministries of health , labour and local government organized more puritan dances . 9 | All of these factors influence the political stability of the region , '' it adds . 10 | The initiative triggered heated debate among the Bulgarian public , it added . 11 | March 15 is a national holiday commemorating the revolution of 1848 while October 23 commemorates the revolution of 1956 . 12 | B.Zs. : Of course it feels good to see more and more people at our concerts . 13 | That song is about much more . 14 | A Crisis Not Only in the U.S . 15 | `` All I ever wanted was to gain some money for my bank , '' he claims in his testimony . 16 | In the summer he managed to gain a half billion Euro from this American crisis . 17 | He entered Société Générale in the 2000 , immediately after he had finished his university studies in Lyon . 18 | Two years later , he was already an assistant to the broker , and in 2005 he became an independent broker with an annual income of $ 90,000 . 19 | `` As early as during my first interview in 2005 , I knew they despised me . 20 | But soon after Kerviel entered his career , the first big success followed . 21 | And that was the trigger mechanism - he wanted more . 22 | `` It 's like a snowball , '' he told the police . 23 | In a single day , he lost 1 billion Euro . 24 | When he came to the office on Monday , he was no longer employed by the bank . 25 | And a few days later , the bank announced it had lost 5 billion Euro because of him . 
26 | `` I took only four days out of the last year . 27 | A broker who does not take vacations is a broker who does not want anybody to look into his records , '' Kerviel concluded . 28 | Czech Republic has chance to get 365 billion . 29 | This is roughly half the total sum the Czech Republic can acquire from Brussels during the years 2007 to 2013 . 30 | The Transportation program is the biggest operational program , it represents almost 22 percent of the resources which the Czech Republic can obtain from the funds . 31 | Companies should obtain financial resources for the introduction of new technologies , for the industrial research , but also for the introduction of more effective methods of organization . 32 | These companies were imaginary and the accounts were actually held by his relatives and closest adherents . 33 | Each governance begins and ends with the individual chapters and items of the budget , like it or not . 34 | The budget is a substantial part of politics , but it is covered by the struggle for power between the government and the opposition . 35 | The politicians ' quarrels are much more interesting for the public than straight numbers . 36 | Our economy has already been passing through the phase of growth for a few years , as is the rest of the world , about which CSSD keeps boasting . 37 | This has been a unique opportunity for its governments to put public financing in order . 38 | Other countries , with a few exceptions such as Hungary , Albania , Pakistan or Egypt , have managed the money much more reasonably than our republic . 39 | We are among the countries which are , measured globally , endangered by poverty to the least extent . 40 | More than $ 100 billion will enter the monetary markets by means of public sales . 41 | Banks have already lost $ 60 billion due to the non-repaid loans , especially in the U.S . 42 | That will lower the consumption and send the American economics into a recession , with possible impact on the economic growth of the whole world . 43 | `` It will have the same effect as another decrease in the interest rates , '' Greg Salvaggio from Tempus Consulting told Reuters . 44 | The Czech president emphasized during the presentation of his book that its topic is very important for him , which is why he is so engaged in it . 45 | Today is the last day for people to apply for the renewal so that the authorities could manage to issue the new license by the end of the year . 46 | The validity of the licenses issued 2001-2004 will end in December 2013 . 47 | He can not get any more . 48 | In exchange , the gang surrendered 30 percent of the loot to Opava and Koňařík . 49 | The hospital 's attorney , Ondřej Dostál , therefore radically disapproves of Veleba 's proposition . 50 | `` We reject such a claim , it is too high , '' Dostál said after today 's meeting . 51 | We will find out if it has any informative value at all . 52 | Of raising money and devising the projects , '' he said . 53 | According to the court , Polívka used the appellation `` the Wallachian king '' for the first time in his TV show . 54 | Nevertheless , at the time it was not related to the intention to build a fictitious kingdom . 55 | The collaboration of currently quarrelling parties continued until the year 2000 , when Harabiš organized an official coronation in Vsetín for Polívka . 56 | In 2002 Polívka brought a lawsuit against Harabiš and his company , which the court overruled today . 57 | The king should be elected for a definite time span . 58 | Even Mr. 
Polívka can sign up , '' added Harabiš . 59 | The Test of Peugeot 207 SW 1.6 VTi : Melting Prejudices 60 | To tell the truth , I have never rooted for Peugeot 206 too much . 61 | But now there is its successor , the Peugeot 207 on the market , and it is far better . 62 | The tested wagon was equipped with a four-cylinder petrol engine with 1.6 l capacity and 88kW power . 63 | Nevertheless , we shall bear in mind that we have tested a fairly equipped car with a spacious interior and a very good engine . 64 | The complaints come from both sides . 65 | The above mentioned cases indicate that the mood between the locals and the immigrants at British schools is beginning to be strained . 66 | For its visitors it is really an island of freedom . 67 | Golden and lily-white beaches , high waves or a sea calm as a pond . 68 | Kite-surfing , golf and roulette . 69 | But there is just one island where you find much more , and that island is Cuba , the biggest and most beautiful of the West Indies . 70 | And then the American era came . 71 | The Hotel Nacional was built at the very end of the 1920s . 72 | Winston Churchill , Ava Gardner , Frank Sinatra . 73 | Ernest Hemingway bought a homestead in San Francisco de Paula , not far from Havana , and he had a yacht in Cojímar . 74 | The movie stars and politicians from all over America were leaving their signatures on the walls of Bodeguita del Medio . 75 | An ancient city which was established shortly after the Spaniards had colonized the island . 76 | Fame and fortune were brought there with the slave and sugar trades . 77 | The colonial Spanish architecture , deeply influenced by the later French colonists , has not changed much during four centuries . 78 | Nothing was being built there , therefore nothing was being torn down either , everybody was glad to have a roof over their head . 79 | In the evening , it is appropriate go to Casa de la Música , which becomes full of life at nightfall and the music abates there at dawn . 80 | According to the U.S. , the low number of soldiers and the lack of necessary equipment is influencing the course of the mission in Afghanistan . 81 | According to Secretary of Defense Robert Gates , NATO should design a concrete plan for the following years . 82 | `` The development in Afghanistan is tangible , but armed forces can only be part of the solution . 83 | The number of violent acts rose 27 percent compared with last year , as much as 60 percent in the southern Helmand province . 84 | The NATO countries have about 40 thousand soldiers in Afghanistan , about a quarter of whom are in the southern territory . 85 | `` We will have to think hard about the concrete goals for the coming era and the ways to reach them , '' said Gates . 86 | `` I am not happy about what we have on the ground and in the air in Afghanistan . 87 | There are some quarrels among the countries involved over whether all the allies are doing their best in Afghanistan . 88 | The German Bild fought back and printed a photo of a British unit during a rugby match in the southern part of Afghanistan . 89 | The total debt rose by the end of September to 842.7 billion crowns . 90 | During all of last year , the state incurred debts of more than 111 billion crowns . 91 | The average due date of the national debt rose to 6.5 years . 92 | Better than Expected 93 | Surely , Muhammad is the name of the Muslim prophet , but first of all it 's a very common name in Sudan . 
94 | Then , the children created a book for the class with the bear 's picture followed by the legend `` My name is Muhammad . '' 95 | Certainly , it 's a matter of interpretation . 96 | Troika mediators , which gathers the European Union , USA and Russia , will go for the last time to Serbia and Kosovo on Monday . 97 | Americans and Europeans consider that their mediation will end on the 10th December . 98 | The Russians , who have already blocked in the Security Council the granting of independence to Kosovo , are insisting on continuing negotiations beyond this deadline . 99 | `` Russia has been and remains committed to a negotiated solution , a compromise solution , '' said Botsane-Khartchenko . 100 | Pristina is ready to proclaim it unilaterally if the Security Council wo n't be in its favour . 101 | -------------------------------------------------------------------------------- /example/data/parallelC.de-en.de: -------------------------------------------------------------------------------- 1 | Die EZB ist bestrebt , die Inflationsrate unter zwei Prozent , oder zumindest knapp an der zwei-Prozent-Marke zu halten . 2 | Für 2008 rechnen Experten damit , dass die EZB die Zinsen zweimal senken wird . 3 | Das gegen Antibiotika resistente Bakterium wurde in einem männlichen Patienten gefunden , der nach einem schweren Unfall ins Krankenhaus eingeliefert wurde . 4 | Etwa zwei Drittel der Infektionen ( 683 Fälle ) wurden vom MRSA , dem gegen Methicillin-Oxacillin resistenten Staphylococcus aureus ausgelöst . 5 | Das Staphylococcus aureus ( SA ) Bakterium trägt etwa ein Drittel der Menschen in ihrer Nase , ohne dass es eine Krankheit verursachen würde . 6 | Analysten zufolge streben Menschen , die in Bulgarien Immobilien erwerben , auch eine Veränderung ihres Lebenswandels an . 7 | Ungarns sechs Prozent 8 | Die Ministerien für Gesundheit , Arbeit , und Selbstverwaltung bereiten sich jeweils auf puritanische Feier vor . 9 | Das alles hat Auswirkungen auf die politische Stabilität der Region " - hieß es . 10 | Und weiter : Die Initiative hätte eine heftige Debatte in der bulgarischen Öffentlichkeit losgetreten . 11 | Die Staatsfeiertage sind der 15. März , der an die Revolution von 1848 erinnert , und der 23. Oktober , der Gedenktag der Revolution von 1956 . 12 | B. Zs . : Natürlich ist es ein gutes Gefühl , zu sehen , dass immer mehr zu unseren Konzerten kommen . 13 | In diesem Lied geht es um mehr . 14 | Krise nicht nur in Amerika 15 | " Alles , was ich wollte , war für meine Bank Geld zu verdienen " , behauptet er in seiner Aussage . 16 | Im Sommer war es ihm gelungen , gerade an der amerikanischen Krise eine halbe Milliarde Euro zu verdienen . 17 | In die Société Générale trat er im Jahr 2000 , unmittelbar nachdem er sein Studium an der Universität Lyon beendet hatte , ein . 18 | Nach zwei Jahren war er bereits Assistent eines Maklers und im Jahr 2005 wurde er selbstständiger Makler mit einem Jahresgehalt von 90000 Dollar . 19 | " Schon bei meinem ersten Gespräch im Jahr 2005 wusste ich , dass man mich beobachtete . 20 | Aber schon bald nach seinem Eintritt kam der erste große Erfolg . 21 | Und das war der Auslöser - jetzt wollte er mehr . 22 | " Das ist wie ein Schneeball " , sagte er den Polizisten . 23 | An einem einzigen Tag verlor er eine Milliarde Euro . 24 | Als er am Montag zur Arbeit kam , war er schon nicht mehr Angestellter der Bank . 25 | Und ein paar Tage später verkündete die Bank , dass sie seinetwegen fünf Milliarden Euro verloren habe . 
26 | " Ich habe nur vier Tage vom Vorjahr genommen . 27 | Ein Makler , der keinen Urlaub nimmt , ist einer , der nicht will , dass man ihm in die Karten schaut " , sagte Kerviel abschließend . 28 | Tschechien hat die Chance , zu 365 Milliarden zu kommen 29 | Es geht hier ungefähr um die Hälfte der Summe , die Tschechien in den Jahren 2007 bis 2013 überhaupt aus Brüssel erhalten kann . 30 | Das Verkehrsprogramm ist das größte Entwicklungsprogramm und umfasst bis zu 22 Prozent der Mittel , die Tschechien aus dem Fonds erhalten kann . 31 | Die Firmen sollen dadurch Finanzen für die Einführung neuer Technologien , für Unternehmensforschung , aber auch für die Einführung effektiverer Organisationsmethoden gewinnen . 32 | Diese Firmen waren fiktiv und die Konten gehörten in Wirklichkeit seinen Verwandten und engsten Anhängern . 33 | Alles Regieren beginnt und endet mit den einzelnen Kapiteln und Posten des Haushalts , ob einem das gefällt oder nicht . 34 | Der Haushalt ist ein wesentlicher Bestandteil der Politik , wird aber verdeckt durch den Machtkampf der Regierung und der Opposition . 35 | Ein Streit unter Politikern ist für die Öffentlichkeit viel interessanter als nüchterne Zahlen . 36 | unsere Wirtschaft schon seit einigen Jahren , wie der Rest der Welt , eine Wachstumsphase durchläuft , deren sich die ČSSD ohne Unterlass rühmt . 37 | Das stellte eine einmalige Möglichkeit für ihre Regierungen dar , die öffentlichen Finanzen in Ordnung zu bringen . 38 | Andere Länder , abgesehen von ein paar Ausnahmen wie Ungarn , Albanien , Pakistan oder Ägypten , haben viel vernünftiger gewirtschaftet als unsere Republik . 39 | Wir gehören zu den Ländern , die , im globalen Maßstab gesehen , am wenigsten von Armut bedroht sind . 40 | So werden mehr als 100 Milliarden Dollar vermittels Auktionen auf die Währungsmärkte gelangen . 41 | Die Banken haben wegen nicht bedienter Kredite schon um 60 Milliarden Dollar verloren , vor allem in den Vereinigten Staaten . 42 | Das würde den Konsum verringern und die amerikanische Wirtschaft in eine Rezession stürzen , möglicherweise mit Auswirkungen auf das Wirtschaftswachstum auf der ganzen Welt . 43 | " Das wird den gleichen Effekt haben wie eine weitere Absenkung der Zinssätze " , sagte Greg Salvaggio von Tempus Consulting der Agentur Reuters . 44 | Der tschechische Präsident betonte bei der Vorstellung seines Buches , dass dessen Thema für ihn sehr wichtig sei , und dass er sich dafür deshalb so engagiere . 45 | Heute ist der letzte Tag , wo man den Umtausch beantragen kann , damit die Behörden es schaffen , bis zum Ende des Jahres einen neuen Führerschein auszustellen . 46 | Die Gültigkeit von Führerscheinen , die 2001 bis 2004 ausgestellt wurden , endet im Dezember 2013 . 47 | Zu mehr kann er nicht verurteilt werden . 48 | Die Bande überließ dafür Opava und Koňařík 30 Prozent der Beute . 49 | Der Rechtsvertreter des Krankenhauses , Ondřej Dostál , ist deshalb mit dem Vorschlag Velebas ganz und gar nicht einverstanden . 50 | " Einen solchen Vorschlag lehnen wir ab , er liegt um ein Vielfaches zu hoch " , sagte Dostál nach der heutigen Verhandlung . 51 | Wir werden feststellen , ob es überhaupt Aussagewert hat . 52 | Geldbeschaffung und Konzepte der Aktivitäten " , sagte er . 53 | Dem Gericht zufolge benutzte Polívka die Bezeichnung " Walachischer König " zum ersten Mal in einer seiner Fernsehsendungen . 54 | Das hing aber damals nicht mit der Absicht zusammen , ein fiktives Königreich zu begründen . 
55 | Die Zusammenarbeit der heute zerstrittenen Parteien dauerte bis zum Jahr 2000 , als Harabiš für Polívka in Vsetín die offizielle Krönung organisierte . 56 | 2002 reichte dann Polívka gegen Harabiš und seine Firma die Klage ein , die das Gericht heute abwies . 57 | Der König sollte für eine bestimmte Zeit gewählt werden . 58 | Auch Herr Polívka kann sich dabei ruhig anmelden " , fügte Harabiš hinzu . 59 | Test Peugeot 207 SW 1.6 VTi : Voruteile schwinden 60 | Um die Wahrheit zu sagen , ich war nie ein besonderer Fan des Peugeot 206 . 61 | Doch jetzt ist sein Nachfolger , der Peugeot 207 , auf dem Markt , und der ist um vieles besser . 62 | Unser Testwagen war mit einem Vierzylinder-Benzinmotor mit Hubraum 1,6 Liter und einer Leistung von 88 kWp ausgestattet . 63 | Wir müssen uns aber daran erinnern , dass wir ein gut ausgestattetes Auto mit einem geräumigen Inneren und einem sehr guten Motor getestet haben . 64 | Beschwerden kommen von beiden Seiten . 65 | Die angeführten Fälle bezeugen , dass die Stimmung zwischen Einheimischen und Zuwanderern an britischen Schulen immer angespannter wird . 66 | Für Besucher ist es wirklich eine Insel der Freiheit . 67 | Goldene und weiße Strände , hohe Wellen oder ein Meer , das ruhig ist wie ein Teich . 68 | Kitesurfing , Golf und Roulette . 69 | Nur auf einer Insel finden Sie aber noch viel mehr , und diese Insel ist Kuba , die größte und schönste der Großen Antillen . 70 | Und dann kam die amerikanische Epoche . 71 | Das Hotel Nacional entstand ganz am Ende des 20. Jahrhunderts . 72 | Winston Churchill , Ava Gardner , Frank Sinatra . 73 | Ernest Hemingway kaufte sich ein Anwesen in San Francisco de Paula , in der Nähe Havannas , und in Cojímar hatte er seine Jacht liegen . 74 | An den Wänden der Bodeguita del Medio hinterließen Stars des Showbusiness und Politiker aus ganz Amerika ihre Unterschriften . 75 | Eine uralte Stadt , die kurz , nachdem die Spanier die Insel kolonisiert hatten , gegründet wurde . 76 | Ruhm und Reichtum brachte ihr der Handel mit Sklaven und Zucker . 77 | Die spanische Kolonialarchitektur , stark beeinflusst von den späteren französischen Siedlern , hat sich seit vierhundert Jahren kaum verändert . 78 | Nichts wurde dort gebaut , aber deshalb auch nichts abgerissen , jeder war froh , dass er ein Dach über dem Kopf hatte . 79 | Am Abend sollte man dann in die Casa de la Música gehen , die sich nach der Dunkelheit belebt , und wo die Musik bis zum Morgen nicht schweigt . 80 | Zu wenig Soldaten und der Mangel an nötiger Ausrüstung beeinflusst den Vereinigten Staaten zufolge den Verlauf der Mission in Afghanistan . 81 | Die NATO sollte nach den Worten von Verteidigungsminister Robert Gates einen konkreten Plan für die nächsten Jahre entwerfen . 82 | " In Afghanistan ist eine Vorwärtsentwicklung spürbar , aber Militärmacht kann nur ein Teil der Lösung sein . 83 | Die Anzahl von Gewaltaktionen ist im Vergleich zum Vorjahr um 27 Prozent gestiegen , in der südlichen Provinz Helmand sogar um sechzig Prozent . 84 | Die NATO-Länder haben in Afghanistan etwa 40000 Soldaten stationiert , davon etwa ein Viertel im südlichen Territorium . 85 | " Wir werden ernsthaft über die konkreten Ziele für die nächste Zeit und über die Art und Weise nachdenken müssen , wie wir sie erreichen können " , sagte Gates . 86 | " Ich bin nicht erfreut darüber , was wir in Afghanistan zu Lande und in der Luft zur Verfügung haben . 
87 | Unter den beteiligten Ländern herrscht in den letzten Monaten auch Streit darüber , ob alle Verbündeten in Afghanistan ihr Bestes geben . 88 | Die deutsche Bildzeitung ging zum Gegenangriff über und druckte ein Bild einer britischen Einheit bei einem Rugby-Wettkampf im südlichen Teil Afghanistans ab . 89 | Die Gesamtschuld stieg bis Ende September auf 842,7 Milliarden Kronen . 90 | Über das gesamte vergangene Jahr verschuldete sich der Staat um mehr als 111 Milliarden Kronen . 91 | Die durchschnittliche Fälligkeit der Staatsschulden stieg auf 6,5 Jahre . 92 | Besser als erwartet 93 | Sicherlich , Mohammed ist der Name des islamischen Propheten , aber vor allem auch ein im Sudan sehr häufiger Vorname . 94 | Anschließend haben die Kinder ein Buch für die Klasse erstellt , mit dem Foto des Bären darauf und der Bildunterschrift “ Mein Name ist Mohammed “ . 95 | Dies ist selbstverständlich eine Frage der Interpretation . 96 | Die Vermittler der Troika , die die Europäische Union , die USA und Russland in sich vereint , werden sich am Montag ein letztes Mal in Serbien und im Kosovo treffen . 97 | Amerikaner und Europäer rechnen damit , dass ihr Vermittlungsauftrag am 10. Dezember enden wird . 98 | Die Russen , die bereits die Bewilligung der Unabhängigkeit des Kosovo beim Sicherheitsrat blockiert hatten , bestehen auf einer Fortsetzung der Verhandlungen über dieses Enddatum hinaus . 99 | „ Russland war und bleibt Fürsprecher einer ausgehandelten Lösung , einer einvernehmlichen Lösung “ , sagte Botsane-Khartchenko . 100 | Pristina zeigt sich bereit dazu , diese einseitig zu verkünden , wenn der Sicherheitsrat nicht zu deren Gunsten entscheiden sollte . 101 | -------------------------------------------------------------------------------- /example/toy_example.config: -------------------------------------------------------------------------------- 1 | ################################################ 2 | ### CONFIGURATION FILE FOR AN SMT EXPERIMENT ### 3 | ################################################ 4 | 5 | [GENERAL] 6 | 7 | ###### you need to set these paths to match your environemnt 8 | ###### 9 | 10 | moses-src-dir = /data/tools/mosesdecoder 11 | wmt2014-scripts = /data/smtworkspace/wmt2014-scripts 12 | parzu-path = /data/ParZu # https://github.com/rsennrich/ParZu 13 | zmorge-model = /data/zmorge/zmorge-20141224-smor_newlemma.a #get this (or a newer version) from http://kitt.ifi.uzh.ch/kitt/zmorge/ 14 | srilm-dir = /data/tools/srilm/bin/i686-m64/ 15 | external-bin-dir = ~/bin 16 | 17 | ###### no further changes should be required to run the toy example 18 | ###### (but feel free to experiment with different settings, or change the training/test data) 19 | 20 | moses-script-dir = $moses-src-dir/scripts 21 | moses-bin-dir = $moses-src-dir/bin 22 | toy-data = $wmt2014-scripts/example/data 23 | working-dir = $wmt2014-scripts/example/working-dir 24 | decoder = $moses-src-dir/bin/moses 25 | 26 | input-tokenizer = "$moses-script-dir/tokenizer/normalize-punctuation.perl $input-extension | $moses-script-dir/tokenizer/tokenizer.perl -l $input-extension -penn" 27 | output-tokenizer = "$moses-script-dir/tokenizer/normalize-punctuation.perl $output-extension | $moses-script-dir/tokenizer/tokenizer.perl -l $output-extension" 28 | input-truecaser = $moses-script-dir/recaser/truecase.perl 29 | output-truecaser = $moses-script-dir/recaser/truecase.perl 30 | detruecaser = $moses-script-dir/recaser/detruecase.perl 31 | 32 | # parsing pipeline used for WMT 2014 33 | output-parser = 
"$moses-script-dir/tokenizer/deescape-special-chars.perl | $parzu-path/parzu -i tokenized_lines --projective | $wmt2014-scripts/enrich_labelset.py --wmt14 | $moses-script-dir/training/wrappers/conll2mosesxml.py" 34 | 35 | # hybrid compound splitting (described in Sennrich, Williams and Huck, 2015) 36 | output-splitter = "$wmt2014-scripts/hybrid_compound_splitter.py -smor $zmorge-model -write-filler -no-truecase -q -syntax" 37 | 38 | # sed instructions unsplit the split compunds from output-splitter 39 | detokenizer = "$moses-script-dir/tokenizer/detokenizer.perl -l $output-extension | sed -r 's/ \@(\S*?)\@ /\1/g' | sed -r 's/\@\@ //g'" 40 | 41 | input-extension = en 42 | output-extension = de 43 | pair-extension = de-en 44 | 45 | generic-parallelizer = $moses-script-dir/ems/support/generic-multicore-parallelizer.perl 46 | 47 | jobs = 10 48 | 49 | ################################################################# 50 | # PARALLEL CORPUS PREPARATION: 51 | # create a tokenized, sentence-aligned corpus, ready for training 52 | 53 | [CORPUS] 54 | 55 | cores = 10 56 | 57 | ### tools to use to prepare the data 58 | # 59 | #tokenizer = 60 | #lowercaser = 61 | 62 | ### long sentences are filtered out, since they slow down GIZA++ 63 | # and are a less reliable source of data. set here the maximum 64 | # length of a sentence 65 | # 66 | max-sentence-length = 80 67 | 68 | ### GIZA++ does not allow sentence pairs of highly uneven length. 69 | # since uneven sentence length is an indicator of a misalignment, 70 | # we set a maximum ratio of 3 (this also gives us room for compoudn splitting) 71 | # 72 | cleaner = "$moses-script-dir/training/clean-corpus-n.perl -ratio 3" 73 | 74 | [CORPUS:parallelA] 75 | raw-stem = $toy-data/parallelA.$pair-extension 76 | 77 | [CORPUS:parallelB] 78 | raw-stem = $toy-data/parallelB.$pair-extension 79 | 80 | 81 | ################################################################# 82 | # LANGUAGE MODEL TRAINING 83 | 84 | [LM] 85 | 86 | cores = 10 87 | 88 | ### tool to be used for language model training 89 | # for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh) 90 | # 91 | lm-training = $srilm-dir/ngram-count 92 | settings = "-interpolate -kndiscount -unk" 93 | order = 5 94 | 95 | ### script to use for binary table format 96 | # (default: no binarization) 97 | # 98 | #lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm 99 | 100 | # kenlm, also set type to 8 101 | #lm-binarizer = $moses-src-dir/kenlm/build_binary 102 | #type = 8 103 | 104 | ### script to create quantized language model format 105 | # (default: no quantization) 106 | # 107 | #lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm 108 | 109 | ### tools to use to prepare the data 110 | # 111 | #tokenizer = 112 | #lowercaser = 113 | 114 | ### each language model to be used has its own section here 115 | 116 | ### if corpus preparation should be skipped, 117 | # point to the prepared language model 118 | # 119 | #lm = 120 | 121 | [LM:parallelA] 122 | raw-corpus = $toy-data/parallelA.$pair-extension.$output-extension 123 | 124 | [LM:parallelB] 125 | raw-corpus = $toy-data/parallelB.$pair-extension.$output-extension 126 | 127 | [LM:monolingualA] 128 | raw-corpus = $toy-data/monolingualA.$output-extension 129 | 130 | ################################################################# 131 | # INTERPOLATING LANGUAGE MODELS 132 | 133 | [INTERPOLATED-LM] 134 | 135 | # if multiple language models are used, these may be combined 136 | # by optimizing perplexity on a tuning set 137 | # see, for instance [Koehn and 
Schwenk, IJCNLP 2008] 138 | 139 | ### script to interpolate language models 140 | # if commented out, no interpolation is performed 141 | # 142 | script = $moses-script-dir/ems/support/interpolate-lm.perl 143 | 144 | ### tuning set 145 | # you may use the same set that is used for mert tuning (reference set) 146 | # 147 | raw-tuning = $toy-data/newstest2012.$output-extension 148 | 149 | ### script to use for binary table format for irstlm or kenlm 150 | # kenlm, also set type to 8 151 | lm-binarizer = $moses-src-dir/bin/build_binary 152 | type = 8 153 | 154 | ################################################################# 155 | # TRANSLATION MODEL TRAINING 156 | 157 | [TRAINING] 158 | 159 | ### training script to be used: either a legacy script or 160 | # current moses training script (default) 161 | # 162 | script = $moses-script-dir/training/train-model.perl 163 | 164 | ### general options 165 | # 166 | training-options = "-mgiza -mgiza-cpus 8 -sort-buffer-size 10G -sort-compress gzip -cores 16 -alt-direct-rule-score-2 --ghkm-tree-fragment" 167 | 168 | ### symmetrization method to obtain word alignments from giza output 169 | # (commonly used: grow-diag-final-and) 170 | # 171 | alignment-symmetrization-method = grow-diag-final-and 172 | 173 | run-giza-in-parts = 5 174 | 175 | ### if word alignment (giza symmetrization) should be skipped, 176 | # point to word alignment files 177 | # 178 | # word-alignment = 179 | 180 | ### hierarchical rule set 181 | # 182 | hierarchical-rule-set = true 183 | use-ghkm = true 184 | use-pcfg-feature = true 185 | use-unknown-word-soft-matches = true 186 | dont-tune-glue-grammar = true 187 | 188 | extract-settings = "--UnknownWordMinRelFreq 0.01 --MaxNodes 20 --MaxRuleDepth 5 --MaxRuleSize 5" 189 | score-settings = " --GoodTuring --LowCountFeature --MinCountHierarchical 2" 190 | 191 | 192 | ### if phrase extraction should be skipped, 193 | # point to stem for extract files 194 | # 195 | # extracted-phrases = 196 | 197 | ### if phrase table training should be skipped, 198 | # point to phrase translation table 199 | # 200 | # phrase-translation-table = 201 | 202 | ### if training should be skipped, 203 | # point to a configuration file that contains 204 | # pointers to all relevant model files 205 | # config = 206 | 207 | ####################################################### TUNING: finding good weights for model components 208 | 209 | [TUNING] 210 | 211 | ### instead of tuning with this setting, old weights may be recycled 212 | # specify here an old configuration file with matching weights 213 | # 214 | #weight-config = 215 | 216 | ### tuning script to be used 217 | # 218 | tuning-script = $moses-script-dir/training/mert-moses.pl 219 | tuning-settings = "-mertdir $moses-src-dir/bin --batch-mira --return-best-dev -maximum-iterations 25 --threads 16 -batch-mira-args='--sctype BLEU'" 220 | 221 | ### specify the corpus used for tuning 222 | # it should contain 100s if not 1000s of sentences 223 | # 224 | raw-input = $toy-data/newstest2012.$input-extension 225 | # tokenized-input = 226 | # factorized-input = 227 | # input = 228 | 229 | inputtype = 3 230 | 231 | raw-reference = $toy-data/newstest2012.$output-extension 232 | # tokenized-reference = 233 | # factorized-reference = 234 | # reference = 235 | 236 | ### size of n-best list used (typically 100) 237 | # 238 | nbest = 1000 239 | 240 | ### ranges for weights for random initialization 241 | # if not specified, the tuning script will use generic ranges 242 | # it is not clear, if this matters 243 | # 244 | 
# lambda = 245 | 246 | ### additional flags for the decoder 247 | # 248 | decoder-settings = "-feature-overwrite 'TranslationModel0 table-limit=100' -threads 8 -max-chart-span 50 -rule-limit 50" 249 | 250 | ### if tuning should be skipped, specify this here 251 | # and also point to a configuration file that contains 252 | # pointers to all relevant model files 253 | # 254 | 255 | 256 | ######################################################### 257 | ## RECASER: restore case, this part only trains the model 258 | 259 | [RECASING] 260 | 261 | #decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm 262 | 263 | ### training data 264 | # raw input needs to be still tokenized, 265 | # also also tokenized input may be specified 266 | # 267 | #tokenized = [LM:europarl:tokenized-corpus] 268 | 269 | # recase-config = 270 | 271 | #lm-training = $moses-src-dir/srilm/bin/i686/ngram-count 272 | 273 | ####################################################### 274 | ## TRUECASER: train model to truecase corpora and input 275 | 276 | [TRUECASER] 277 | 278 | ### script to train truecaser models 279 | # 280 | trainer = $moses-script-dir/recaser/train-truecaser.perl 281 | 282 | ### training data 283 | # raw input needs to be still tokenized, 284 | # also also tokenized input may be specified 285 | # 286 | # tokenized-stem = $working-dir/data/ep+nc 287 | 288 | ### trained model 289 | # 290 | #truecase-model = 291 | 292 | ############################################################ 293 | ## EVALUATION: translating a test set using the tuned system 294 | 295 | [EVALUATION] 296 | 297 | ### number of jobs (if parallel execution of testing) 298 | # 299 | jobs = 10 300 | 301 | filter-settings = " " 302 | 303 | 304 | ### prepare system output for scoring 305 | # this may include detokenization and wrapping output in sgm 306 | # (needed for nist-bleu, ter, meteor) 307 | # 308 | #recaser = $moses-script-dir/recaser/recase.perl 309 | wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension" 310 | # output-sgm = 311 | 312 | ### should output be scored case-sensitive (default: no)? 
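# (case-sensitive scoring corresponds to the -c flag of mteval-v13a.pl; cf. nist-bleu-c below)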
313 | # 314 | # case-sensitive = yes 315 | 316 | ### BLEU 317 | # 318 | nist-bleu = $moses-script-dir/generic/mteval-v13a.pl 319 | nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c" 320 | # multi-bleu = $edinburgh-script-dir/multi-bleu.perl 321 | # ibm-bleu = 322 | 323 | ### TER: translation error rate (BBN metric) based on edit distance 324 | # 325 | # ter = $edinburgh-script-dir/tercom_v6a.pl 326 | 327 | ### METEOR: gives credit to stem / worknet synonym matches 328 | # 329 | # meteor = 330 | 331 | ### Analysis: carry out various forms of analysis on the output 332 | # 333 | analysis = $moses-script-dir/ems/support/analysis.perl 334 | #analyze-coverage = yes 335 | report-segmentation = yes 336 | 337 | 338 | [EVALUATION:newstest2013] 339 | decoder-settings = "-feature-overwrite 'TranslationModel0 table-limit=100' -threads 8 -max-chart-span 50 -rule-limit 100" 340 | input-sgm = $toy-data/newstest2013-src.$input-extension.sgm 341 | wrapping-frame = $input-sgm 342 | reference-sgm = $toy-data/newstest2013-ref.$output-extension.sgm 343 | 344 | [REPORTING] 345 | 346 | ### what to do with result (default: store in file evaluation/report) 347 | # 348 | # email = pkoehn@inf.ed.ac.uk 349 | 350 | -------------------------------------------------------------------------------- /example/toy_example_2015.config: -------------------------------------------------------------------------------- 1 | ################################################ 2 | ### CONFIGURATION FILE FOR AN SMT EXPERIMENT ### 3 | ################################################ 4 | 5 | [GENERAL] 6 | 7 | ###### you need to set these paths to match your environemnt 8 | ###### 9 | 10 | moses-src-dir = /home/rsennrich/tools/mosesdecoder 11 | wmt2014-scripts = /home/rsennrich/smtworkspace/wmt2014-scripts 12 | parzu-path = /home/rsennrich/ParZu # https://github.com/rsennrich/ParZu 13 | zmorge-model = /home/rsennrich/zmorge/zmorge-20141224-smor_newlemma.a #get this (or a newer version) from http://kitt.ifi.uzh.ch/kitt/zmorge/ 14 | srilm-dir = /home/rsennrich/tools/srilm/bin/i686-m64/ 15 | external-bin-dir = ~/bin 16 | 17 | ###### no further changes should be required to run the toy example 18 | ###### (but feel free to experiment with different settings, or change the training/test data) 19 | 20 | moses-script-dir = $moses-src-dir/scripts 21 | moses-bin-dir = $moses-src-dir/bin 22 | toy-data = $wmt2014-scripts/example/data 23 | working-dir = $wmt2014-scripts/example/working-dir 24 | decoder = $moses-src-dir/bin/moses 25 | 26 | input-tokenizer = "$moses-script-dir/tokenizer/normalize-punctuation.perl $input-extension | $moses-script-dir/tokenizer/tokenizer.perl -l $input-extension -penn" 27 | output-tokenizer = "$moses-script-dir/tokenizer/normalize-punctuation.perl $output-extension | $moses-script-dir/tokenizer/tokenizer.perl -l $output-extension" 28 | input-truecaser = $moses-script-dir/recaser/truecase.perl 29 | output-truecaser = $moses-script-dir/recaser/truecase.perl 30 | detruecaser = $moses-script-dir/recaser/detruecase.perl 31 | 32 | # parsing pipeline used for WMT 2014 33 | output-parser = "$moses-script-dir/tokenizer/deescape-special-chars.perl | $parzu-path/parzu -i tokenized_lines --projective | $wmt2014-scripts/enrich_labelset.py --wmt15 | $moses-script-dir/training/wrappers/conll2mosesxml.py" 34 | 35 | # also parse tuning/evaluation reference files 36 | mock-output-parser-references = $output-parser 37 | 38 | # hybrid compound splitting (described in Sennrich, Williams and Huck, 2015) 39 | output-splitter = 
"$wmt2014-scripts/hybrid_compound_splitter.py -smor $zmorge-model -write-filler -no-truecase -q -syntax -dependency -fewest" 40 | 41 | # sed instructions unsplit the split compunds from output-splitter 42 | detokenizer = "$moses-script-dir/tokenizer/detokenizer.perl -l $output-extension | sed -r 's/ \@(\S*?)\@ /\1/g' | sed -r 's/\@\@ //g'" 43 | 44 | input-extension = en 45 | output-extension = de 46 | pair-extension = de-en 47 | 48 | generic-parallelizer = $moses-script-dir/ems/support/generic-multicore-parallelizer.perl 49 | 50 | jobs = 10 51 | 52 | ################################################################# 53 | # PARALLEL CORPUS PREPARATION: 54 | # create a tokenized, sentence-aligned corpus, ready for training 55 | 56 | [CORPUS] 57 | 58 | cores = 10 59 | 60 | ### tools to use to prepare the data 61 | # 62 | #tokenizer = 63 | #lowercaser = 64 | 65 | ### long sentences are filtered out, since they slow down GIZA++ 66 | # and are a less reliable source of data. set here the maximum 67 | # length of a sentence 68 | # 69 | max-sentence-length = 80 70 | 71 | ### GIZA++ does not allow sentence pairs of highly uneven length. 72 | # since uneven sentence length is an indicator of a misalignment, 73 | # we set a maximum ratio of 3 (this also gives us room for compoudn splitting) 74 | # 75 | cleaner = "$moses-script-dir/training/clean-corpus-n.perl -ratio 3" 76 | 77 | [CORPUS:parallelA] 78 | raw-stem = $toy-data/parallelA.$pair-extension 79 | 80 | [CORPUS:parallelB] 81 | raw-stem = $toy-data/parallelB.$pair-extension 82 | 83 | [CORPUS:parallelC] 84 | # if you do your own parsing (or wanna re-use other data, like http://statmt.org/rsennrich/parsed_wmt/ ), 85 | # you can add parsed corpora to your system like this 86 | clean-parsed-stem = $toy-data/parallelC.$pair-extension.parsed_target 87 | 88 | ################################################################# 89 | # LANGUAGE MODEL TRAINING 90 | 91 | [LM] 92 | 93 | cores = 10 94 | 95 | ### tool to be used for language model training 96 | # for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh) 97 | # 98 | lm-training = $srilm-dir/ngram-count 99 | settings = "-interpolate -kndiscount -unk" 100 | order = 5 101 | 102 | ### script to use for binary table format 103 | # (default: no binarization) 104 | # 105 | #lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm 106 | 107 | # kenlm, also set type to 8 108 | #lm-binarizer = $moses-src-dir/kenlm/build_binary 109 | #type = 8 110 | 111 | ### script to create quantized language model format 112 | # (default: no quantization) 113 | # 114 | #lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm 115 | 116 | ### tools to use to prepare the data 117 | # 118 | #tokenizer = 119 | #lowercaser = 120 | 121 | ### each language model to be used has its own section here 122 | 123 | ### if corpus preparation should be skipped, 124 | # point to the prepared language model 125 | # 126 | #lm = 127 | 128 | [LM:parallelA] 129 | raw-corpus = $toy-data/parallelA.$pair-extension.$output-extension 130 | 131 | [LM:parallelB] 132 | raw-corpus = $toy-data/parallelB.$pair-extension.$output-extension 133 | 134 | [LM:parallelC] 135 | raw-corpus = $toy-data/parallelC.$pair-extension.$output-extension 136 | 137 | [LM:monolingualA] 138 | raw-corpus = $toy-data/monolingualA.$output-extension 139 | 140 | ################################################################# 141 | # INTERPOLATING LANGUAGE MODELS 142 | 143 | [INTERPOLATED-LM] 144 | 145 | # if multiple language models are used, these may be combined 146 | # by 
optimizing perplexity on a tuning set 147 | # see, for instance [Koehn and Schwenk, IJCNLP 2008] 148 | 149 | ### script to interpolate language models 150 | # if commented out, no interpolation is performed 151 | # 152 | script = $moses-script-dir/ems/support/interpolate-lm.perl 153 | 154 | ### tuning set 155 | # you may use the same set that is used for mert tuning (reference set) 156 | # 157 | raw-tuning = $toy-data/newstest2012.$output-extension 158 | 159 | ### script to use for binary table format for irstlm or kenlm 160 | # kenlm, also set type to 8 161 | lm-binarizer = $moses-src-dir/bin/build_binary 162 | type = 8 163 | 164 | ################################################################# 165 | # TRANSLATION MODEL TRAINING 166 | 167 | [TRAINING] 168 | 169 | ### training script to be used: either a legacy script or 170 | # current moses training script (default) 171 | # 172 | script = $moses-script-dir/training/train-model.perl 173 | 174 | ### general options 175 | # 176 | training-options = "-mgiza -mgiza-cpus 8 -sort-buffer-size 10G -sort-compress gzip -cores 16 -alt-direct-rule-score-2 --ghkm-tree-fragment" 177 | 178 | ### symmetrization method to obtain word alignments from giza output 179 | # (commonly used: grow-diag-final-and) 180 | # 181 | alignment-symmetrization-method = grow-diag-final-and 182 | 183 | run-giza-in-parts = 5 184 | 185 | ### if word alignment (giza symmetrization) should be skipped, 186 | # point to word alignment files 187 | # 188 | # word-alignment = 189 | 190 | ### hierarchical rule set 191 | # 192 | hierarchical-rule-set = true 193 | use-ghkm = true 194 | use-pcfg-feature = true 195 | use-unknown-word-soft-matches = true 196 | dont-tune-glue-grammar = true 197 | 198 | extract-settings = "--UnknownWordMinRelFreq 0.01 --MaxNodes 20 --MaxRuleDepth 5 --MaxRuleSize 5 --AllowUnary" 199 | score-settings = " --GoodTuring --LowCountFeature --MinCountHierarchical 2 --MinScore 2:0.0001" 200 | 201 | 202 | ### if phrase extraction should be skipped, 203 | # point to stem for extract files 204 | # 205 | # extracted-phrases = 206 | 207 | ### if phrase table training should be skipped, 208 | # point to phrase translation table 209 | # 210 | # phrase-translation-table = 211 | 212 | ### if training should be skipped, 213 | # point to a configuration file that contains 214 | # pointers to all relevant model files 215 | # config = 216 | 217 | ####################################################### TUNING: finding good weights for model components 218 | 219 | [TUNING] 220 | 221 | ### instead of tuning with this setting, old weights may be recycled 222 | # specify here an old configuration file with matching weights 223 | # 224 | #weight-config = 225 | 226 | ### tuning script to be used 227 | # 228 | tuning-script = $moses-script-dir/training/mert-moses.pl 229 | tuning-settings = "-mertdir $moses-src-dir/bin --batch-mira --return-best-dev -maximum-iterations 25 --threads 16 -batch-mira-args='--sctype BLEU,HWCM'" 230 | 231 | ### specify the corpus used for tuning 232 | # it should contain 100s if not 1000s of sentences 233 | # 234 | raw-input = $toy-data/newstest2012.$input-extension 235 | # tokenized-input = 236 | # factorized-input = 237 | # input = 238 | 239 | inputtype = 3 240 | 241 | raw-reference = $toy-data/newstest2012.$output-extension 242 | # tokenized-reference = 243 | # factorized-reference = 244 | # reference = 245 | 246 | ### size of n-best list used (typically 100) 247 | # 248 | nbest = 1000 249 | 250 | ### ranges for weights for random initialization 251 | # 
if not specified, the tuning script will use generic ranges 252 | # it is not clear, if this matters 253 | # 254 | # lambda = 255 | 256 | ### additional flags for the decoder 257 | # 258 | decoder-settings = "-feature-overwrite 'TranslationModel0 table-limit=100' -threads 8 -max-chart-span 50 -rule-limit 50 -n-best-trees" 259 | 260 | ### if tuning should be skipped, specify this here 261 | # and also point to a configuration file that contains 262 | # pointers to all relevant model files 263 | # 264 | 265 | 266 | ######################################################### 267 | ## RECASER: restore case, this part only trains the model 268 | 269 | [RECASING] 270 | 271 | #decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm 272 | 273 | ### training data 274 | # raw input needs to be still tokenized, 275 | # also also tokenized input may be specified 276 | # 277 | #tokenized = [LM:europarl:tokenized-corpus] 278 | 279 | # recase-config = 280 | 281 | #lm-training = $moses-src-dir/srilm/bin/i686/ngram-count 282 | 283 | ####################################################### 284 | ## TRUECASER: train model to truecase corpora and input 285 | 286 | [TRUECASER] 287 | 288 | ### script to train truecaser models 289 | # 290 | trainer = $moses-script-dir/recaser/train-truecaser.perl 291 | 292 | ### training data 293 | # raw input needs to be still tokenized, 294 | # also also tokenized input may be specified 295 | # 296 | # tokenized-stem = $working-dir/data/ep+nc 297 | 298 | ### trained model 299 | # 300 | #truecase-model = 301 | 302 | ############################################################ 303 | ## EVALUATION: translating a test set using the tuned system 304 | 305 | [EVALUATION] 306 | 307 | ### number of jobs (if parallel execution of testing) 308 | # 309 | jobs = 10 310 | 311 | filter-settings = " " 312 | 313 | 314 | ### prepare system output for scoring 315 | # this may include detokenization and wrapping output in sgm 316 | # (needed for nist-bleu, ter, meteor) 317 | # 318 | #recaser = $moses-script-dir/recaser/recase.perl 319 | wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension" 320 | # output-sgm = 321 | 322 | ### should output be scored case-sensitive (default: no)? 
323 | # 324 | # case-sensitive = yes 325 | 326 | ### BLEU 327 | # 328 | nist-bleu = $moses-script-dir/generic/mteval-v13a.pl 329 | nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c" 330 | # multi-bleu = $edinburgh-script-dir/multi-bleu.perl 331 | # ibm-bleu = 332 | 333 | ### TER: translation error rate (BBN metric) based on edit distance 334 | # 335 | # ter = $edinburgh-script-dir/tercom_v6a.pl 336 | 337 | ### METEOR: gives credit to stem / worknet synonym matches 338 | # 339 | # meteor = 340 | 341 | ### Analysis: carry out various forms of analysis on the output 342 | # 343 | analysis = $moses-script-dir/ems/support/analysis.perl 344 | #analyze-coverage = yes 345 | report-segmentation = yes 346 | 347 | 348 | [EVALUATION:newstest2013] 349 | decoder-settings = "-feature-overwrite 'TranslationModel0 table-limit=100' -threads 8 -max-chart-span 50 -rule-limit 100" 350 | input-sgm = $toy-data/newstest2013-src.$input-extension.sgm 351 | wrapping-frame = $input-sgm 352 | reference-sgm = $toy-data/newstest2013-ref.$output-extension.sgm 353 | 354 | [REPORTING] 355 | 356 | ### what to do with result (default: store in file evaluation/report) 357 | # 358 | # email = pkoehn@inf.ed.ac.uk 359 | 360 | -------------------------------------------------------------------------------- /example/toy_example_2015_2.config: -------------------------------------------------------------------------------- 1 | ################################################ 2 | ### CONFIGURATION FILE FOR AN SMT EXPERIMENT ### 3 | ################################################ 4 | 5 | [GENERAL] 6 | 7 | ###### you need to set these paths to match your environemnt 8 | ###### 9 | 10 | moses-src-dir = /home/rsennrich/tools/mosesdecoder 11 | wmt2014-scripts = /home/rsennrich/smtworkspace/wmt2014-scripts 12 | parzu-path = /home/rsennrich/ParZu # https://github.com/rsennrich/ParZu 13 | zmorge-model = /home/rsennrich/zmorge/zmorge-20141224-smor_newlemma.a #get this (or a newer version) from http://kitt.ifi.uzh.ch/kitt/zmorge/ 14 | srilm-dir = /home/rsennrich/tools/srilm/bin/i686-m64/ 15 | external-bin-dir = ~/bin 16 | 17 | ###### no further changes should be required to run the toy example 18 | ###### (but feel free to experiment with different settings, or change the training/test data) 19 | 20 | moses-script-dir = $moses-src-dir/scripts 21 | moses-bin-dir = $moses-src-dir/bin 22 | toy-data = $wmt2014-scripts/example/data 23 | working-dir = $wmt2014-scripts/example/working-dir 24 | decoder = $moses-src-dir/bin/moses 25 | 26 | input-tokenizer = "$moses-script-dir/tokenizer/normalize-punctuation.perl $input-extension | $moses-script-dir/tokenizer/tokenizer.perl -l $input-extension -penn" 27 | output-tokenizer = "$moses-script-dir/tokenizer/normalize-punctuation.perl $output-extension | $moses-script-dir/tokenizer/tokenizer.perl -l $output-extension" 28 | input-truecaser = $moses-script-dir/recaser/truecase.perl 29 | output-truecaser = $moses-script-dir/recaser/truecase.perl 30 | detruecaser = $moses-script-dir/recaser/detruecase.perl 31 | 32 | # parsing pipeline used for WMT 2014 33 | output-parser = "$moses-script-dir/tokenizer/deescape-special-chars.perl | $parzu-path/parzu -i tokenized_lines --projective | $wmt2014-scripts/enrich_labelset.py --wmt15 | $moses-script-dir/training/wrappers/conll2mosesxml.py" 34 | 35 | # also parse tuning/evaluation reference files 36 | mock-output-parser-references = $output-parser 37 | 38 | # head binarization 39 | output-parse-relaxer = "$wmt2014-scripts/emnlp2015/binarize.py head" 40 | 41 | # 
hybrid compound splitting (described in Sennrich, Williams and Huck, 2015) 42 | output-splitter = "$wmt2014-scripts/hybrid_compound_splitter.py -smor $zmorge-model -write-filler -no-truecase -q -syntax -dependency -fewest" 43 | 44 | # sed instructions unsplit the split compunds from output-splitter 45 | detokenizer = "$moses-script-dir/tokenizer/detokenizer.perl -l $output-extension | sed -r 's/ \@(\S*?)\@ /\1/g' | sed -r 's/\@\@ //g'" 46 | 47 | input-extension = en 48 | output-extension = de 49 | pair-extension = de-en 50 | 51 | generic-parallelizer = $moses-script-dir/ems/support/generic-multicore-parallelizer.perl 52 | 53 | jobs = 10 54 | 55 | ################################################################# 56 | # PARALLEL CORPUS PREPARATION: 57 | # create a tokenized, sentence-aligned corpus, ready for training 58 | 59 | [CORPUS] 60 | 61 | cores = 10 62 | 63 | ### tools to use to prepare the data 64 | # 65 | #tokenizer = 66 | #lowercaser = 67 | 68 | ### long sentences are filtered out, since they slow down GIZA++ 69 | # and are a less reliable source of data. set here the maximum 70 | # length of a sentence 71 | # 72 | max-sentence-length = 80 73 | 74 | ### GIZA++ does not allow sentence pairs of highly uneven length. 75 | # since uneven sentence length is an indicator of a misalignment, 76 | # we set a maximum ratio of 3 (this also gives us room for compoudn splitting) 77 | # 78 | cleaner = "$moses-script-dir/training/clean-corpus-n.perl -ratio 3" 79 | 80 | [CORPUS:parallelA] 81 | raw-stem = $toy-data/parallelA.$pair-extension 82 | 83 | [CORPUS:parallelB] 84 | raw-stem = $toy-data/parallelB.$pair-extension 85 | 86 | [CORPUS:parallelC] 87 | # if you do your own parsing (or wanna re-use other data, like http://statmt.org/rsennrich/parsed_wmt/ ), 88 | # you can add parsed corpora to your system like this 89 | clean-parsed-stem = $toy-data/parallelC.$pair-extension.parsed_target 90 | 91 | ################################################################# 92 | # LANGUAGE MODEL TRAINING 93 | 94 | [LM] 95 | 96 | cores = 10 97 | 98 | ### tool to be used for language model training 99 | # for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh) 100 | # 101 | lm-training = $srilm-dir/ngram-count 102 | settings = "-interpolate -kndiscount -unk" 103 | order = 5 104 | 105 | ### script to use for binary table format 106 | # (default: no binarization) 107 | # 108 | #lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm 109 | 110 | # kenlm, also set type to 8 111 | #lm-binarizer = $moses-src-dir/kenlm/build_binary 112 | #type = 8 113 | 114 | ### script to create quantized language model format 115 | # (default: no quantization) 116 | # 117 | #lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm 118 | 119 | ### tools to use to prepare the data 120 | # 121 | #tokenizer = 122 | #lowercaser = 123 | 124 | ### each language model to be used has its own section here 125 | 126 | ### if corpus preparation should be skipped, 127 | # point to the prepared language model 128 | # 129 | #lm = 130 | 131 | [LM:parallelA] 132 | raw-corpus = $toy-data/parallelA.$pair-extension.$output-extension 133 | 134 | [LM:parallelB] 135 | raw-corpus = $toy-data/parallelB.$pair-extension.$output-extension 136 | 137 | [LM:parallelC] 138 | raw-corpus = $toy-data/parallelC.$pair-extension.$output-extension 139 | 140 | [LM:monolingualA] 141 | raw-corpus = $toy-data/monolingualA.$output-extension 142 | 143 | ################################################################# 144 | # INTERPOLATING LANGUAGE MODELS 145 | 146 | 
[INTERPOLATED-LM] 147 | 148 | # if multiple language models are used, these may be combined 149 | # by optimizing perplexity on a tuning set 150 | # see, for instance [Koehn and Schwenk, IJCNLP 2008] 151 | 152 | ### script to interpolate language models 153 | # if commented out, no interpolation is performed 154 | # 155 | script = $moses-script-dir/ems/support/interpolate-lm.perl 156 | 157 | ### tuning set 158 | # you may use the same set that is used for mert tuning (reference set) 159 | # 160 | raw-tuning = $toy-data/newstest2012.$output-extension 161 | 162 | ### script to use for binary table format for irstlm or kenlm 163 | # kenlm, also set type to 8 164 | lm-binarizer = $moses-src-dir/bin/build_binary 165 | type = 8 166 | 167 | ################################################################# 168 | # TRANSLATION MODEL TRAINING 169 | 170 | [TRAINING] 171 | 172 | ### training script to be used: either a legacy script or 173 | # current moses training script (default) 174 | # 175 | script = $moses-script-dir/training/train-model.perl 176 | 177 | ### general options 178 | # 179 | training-options = "-mgiza -mgiza-cpus 8 -sort-buffer-size 10G -sort-compress gzip -cores 16 -alt-direct-rule-score-2 --ghkm-tree-fragment" 180 | 181 | ### symmetrization method to obtain word alignments from giza output 182 | # (commonly used: grow-diag-final-and) 183 | # 184 | alignment-symmetrization-method = grow-diag-final-and 185 | 186 | run-giza-in-parts = 5 187 | 188 | ### if word alignment (giza symmetrization) should be skipped, 189 | # point to word alignment files 190 | # 191 | # word-alignment = 192 | 193 | ### hierarchical rule set 194 | # 195 | hierarchical-rule-set = true 196 | use-ghkm = true 197 | use-pcfg-feature = true 198 | use-unknown-word-soft-matches = true 199 | dont-tune-glue-grammar = true 200 | 201 | extract-settings = "--UnknownWordMinRelFreq 0.01 --MaxNodes 40 --MaxRuleDepth 7 --MaxRuleSize 7 --AllowUnary" 202 | score-settings = " --GoodTuring --LowCountFeature --MinCountHierarchical 2 --MinScore 2:0.0001" 203 | 204 | 205 | ### if phrase extraction should be skipped, 206 | # point to stem for extract files 207 | # 208 | # extracted-phrases = 209 | 210 | ### if phrase table training should be skipped, 211 | # point to phrase translation table 212 | # 213 | # phrase-translation-table = 214 | 215 | ### if training should be skipped, 216 | # point to a configuration file that contains 217 | # pointers to all relevant model files 218 | # config = 219 | 220 | ####################################################### TUNING: finding good weights for model components 221 | 222 | [TUNING] 223 | 224 | ### instead of tuning with this setting, old weights may be recycled 225 | # specify here an old configuration file with matching weights 226 | # 227 | #weight-config = 228 | 229 | ### tuning script to be used 230 | # 231 | tuning-script = $moses-script-dir/training/mert-moses.pl 232 | tuning-settings = "-mertdir $moses-src-dir/bin --batch-mira --return-best-dev -maximum-iterations 25 --threads 16 -batch-mira-args='--sctype BLEU,HWCM'" 233 | 234 | ### specify the corpus used for tuning 235 | # it should contain 100s if not 1000s of sentences 236 | # 237 | raw-input = $toy-data/newstest2012.$input-extension 238 | # tokenized-input = 239 | # factorized-input = 240 | # input = 241 | 242 | inputtype = 3 243 | 244 | raw-reference = $toy-data/newstest2012.$output-extension 245 | # tokenized-reference = 246 | # factorized-reference = 247 | # reference = 248 | 249 | ### size of n-best list used (typically 
100) 250 | # 251 | nbest = 1000 252 | 253 | ### ranges for weights for random initialization 254 | # if not specified, the tuning script will use generic ranges 255 | # it is not clear, if this matters 256 | # 257 | # lambda = 258 | 259 | ### additional flags for the decoder 260 | # 261 | decoder-settings = "-feature-overwrite 'TranslationModel0 table-limit=100' -threads 8 -max-chart-span 50 -rule-limit 50 -n-best-trees" 262 | 263 | ### if tuning should be skipped, specify this here 264 | # and also point to a configuration file that contains 265 | # pointers to all relevant model files 266 | # 267 | 268 | 269 | ######################################################### 270 | ## RECASER: restore case, this part only trains the model 271 | 272 | [RECASING] 273 | 274 | #decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm 275 | 276 | ### training data 277 | # raw input needs to be still tokenized, 278 | # also also tokenized input may be specified 279 | # 280 | #tokenized = [LM:europarl:tokenized-corpus] 281 | 282 | # recase-config = 283 | 284 | #lm-training = $moses-src-dir/srilm/bin/i686/ngram-count 285 | 286 | ####################################################### 287 | ## TRUECASER: train model to truecase corpora and input 288 | 289 | [TRUECASER] 290 | 291 | ### script to train truecaser models 292 | # 293 | trainer = $moses-script-dir/recaser/train-truecaser.perl 294 | 295 | ### training data 296 | # raw input needs to be still tokenized, 297 | # also also tokenized input may be specified 298 | # 299 | # tokenized-stem = $working-dir/data/ep+nc 300 | 301 | ### trained model 302 | # 303 | #truecase-model = 304 | 305 | ############################################################ 306 | ## EVALUATION: translating a test set using the tuned system 307 | 308 | [EVALUATION] 309 | 310 | ### number of jobs (if parallel execution of testing) 311 | # 312 | jobs = 10 313 | 314 | filter-settings = " " 315 | 316 | 317 | ### prepare system output for scoring 318 | # this may include detokenization and wrapping output in sgm 319 | # (needed for nist-bleu, ter, meteor) 320 | # 321 | #recaser = $moses-script-dir/recaser/recase.perl 322 | wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension" 323 | # output-sgm = 324 | 325 | ### should output be scored case-sensitive (default: no)? 
326 | # 327 | # case-sensitive = yes 328 | 329 | ### BLEU 330 | # 331 | nist-bleu = $moses-script-dir/generic/mteval-v13a.pl 332 | nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c" 333 | # multi-bleu = $edinburgh-script-dir/multi-bleu.perl 334 | # ibm-bleu = 335 | 336 | ### TER: translation error rate (BBN metric) based on edit distance 337 | # 338 | # ter = $edinburgh-script-dir/tercom_v6a.pl 339 | 340 | ### METEOR: gives credit to stem / worknet synonym matches 341 | # 342 | # meteor = 343 | 344 | ### Analysis: carry out various forms of analysis on the output 345 | # 346 | analysis = $moses-script-dir/ems/support/analysis.perl 347 | #analyze-coverage = yes 348 | report-segmentation = yes 349 | 350 | 351 | [EVALUATION:newstest2013] 352 | decoder-settings = "-feature-overwrite 'TranslationModel0 table-limit=100' -threads 8 -max-chart-span 50 -rule-limit 100" 353 | input-sgm = $toy-data/newstest2013-src.$input-extension.sgm 354 | wrapping-frame = $input-sgm 355 | reference-sgm = $toy-data/newstest2013-ref.$output-extension.sgm 356 | 357 | [REPORTING] 358 | 359 | ### what to do with result (default: store in file evaluation/report) 360 | # 361 | # email = pkoehn@inf.ed.ac.uk 362 | 363 | -------------------------------------------------------------------------------- /example/toy_example_2015_3.config: -------------------------------------------------------------------------------- 1 | ################################################ 2 | ### CONFIGURATION FILE FOR AN SMT EXPERIMENT ### 3 | ################################################ 4 | 5 | [GENERAL] 6 | 7 | ###### you need to set these paths to match your environemnt 8 | ###### 9 | 10 | moses-src-dir = /home/rsennrich/tools/mosesdecoder 11 | wmt2014-scripts = /home/rsennrich/smtworkspace/wmt2014-scripts 12 | parzu-path = /home/rsennrich/ParZu # https://github.com/rsennrich/ParZu 13 | zmorge-model = /home/rsennrich/zmorge/zmorge-20141224-smor_newlemma.a #get this (or a newer version) from http://kitt.ifi.uzh.ch/kitt/zmorge/ 14 | srilm-dir = /home/rsennrich/tools/srilm/bin/i686-m64/ 15 | external-bin-dir = ~/bin 16 | nplm-dir = /home/rsennrich/tools/nplm-github/ 17 | 18 | ###### no further changes should be required to run the toy example 19 | ###### (but feel free to experiment with different settings, or change the training/test data) 20 | 21 | moses-script-dir = $moses-src-dir/scripts 22 | moses-bin-dir = $moses-src-dir/bin 23 | toy-data = $wmt2014-scripts/example/data 24 | working-dir = $wmt2014-scripts/example/working-dir 25 | decoder = $moses-src-dir/bin/moses 26 | 27 | input-tokenizer = "$moses-script-dir/tokenizer/normalize-punctuation.perl $input-extension | $moses-script-dir/tokenizer/tokenizer.perl -l $input-extension -penn" 28 | output-tokenizer = "$moses-script-dir/tokenizer/normalize-punctuation.perl $output-extension | $moses-script-dir/tokenizer/tokenizer.perl -l $output-extension" 29 | input-truecaser = $moses-script-dir/recaser/truecase.perl 30 | output-truecaser = $moses-script-dir/recaser/truecase.perl 31 | detruecaser = $moses-script-dir/recaser/detruecase.perl 32 | 33 | # parsing pipeline used for WMT 2014 34 | output-parser = "$moses-script-dir/tokenizer/deescape-special-chars.perl | $parzu-path/parzu -i tokenized_lines --projective | $wmt2014-scripts/enrich_labelset.py --wmt15 | $moses-script-dir/training/wrappers/conll2mosesxml.py" 35 | 36 | # also parse tuning/evaluation reference files 37 | mock-output-parser-references = $output-parser 38 | mock-output-parser-lm = $output-parser 39 | 40 | # head 
binarization 41 | output-parse-relaxer = "$wmt2014-scripts/emnlp2015/binarize.py head" 42 | 43 | # hybrid compound splitting (described in Sennrich, Williams and Huck, 2015) 44 | output-splitter = "$wmt2014-scripts/hybrid_compound_splitter.py -smor $zmorge-model -write-filler -no-truecase -q -syntax -dependency -fewest" 45 | 46 | # sed instructions unsplit the split compunds from output-splitter 47 | detokenizer = "$moses-script-dir/tokenizer/detokenizer.perl -l $output-extension | sed -r 's/ \@(\S*?)\@ /\1/g' | sed -r 's/\@\@ //g'" 48 | 49 | input-extension = en 50 | output-extension = de 51 | pair-extension = de-en 52 | 53 | generic-parallelizer = $moses-script-dir/ems/support/generic-multicore-parallelizer.perl 54 | 55 | jobs = 10 56 | 57 | ################################################################# 58 | # PARALLEL CORPUS PREPARATION: 59 | # create a tokenized, sentence-aligned corpus, ready for training 60 | 61 | [CORPUS] 62 | 63 | cores = 10 64 | 65 | ### tools to use to prepare the data 66 | # 67 | #tokenizer = 68 | #lowercaser = 69 | 70 | ### long sentences are filtered out, since they slow down GIZA++ 71 | # and are a less reliable source of data. set here the maximum 72 | # length of a sentence 73 | # 74 | max-sentence-length = 80 75 | 76 | ### GIZA++ does not allow sentence pairs of highly uneven length. 77 | # since uneven sentence length is an indicator of a misalignment, 78 | # we set a maximum ratio of 3 (this also gives us room for compoudn splitting) 79 | # 80 | cleaner = "$moses-script-dir/training/clean-corpus-n.perl -ratio 3" 81 | 82 | [CORPUS:parallelA] 83 | raw-stem = $toy-data/parallelA.$pair-extension 84 | 85 | [CORPUS:parallelB] 86 | raw-stem = $toy-data/parallelB.$pair-extension 87 | 88 | [CORPUS:parallelC] 89 | # if you do your own parsing (or wanna re-use other data, like http://statmt.org/rsennrich/parsed_wmt/ ), 90 | # you can add parsed corpora to your system like this 91 | clean-parsed-stem = $toy-data/parallelC.$pair-extension.parsed_target 92 | 93 | ################################################################# 94 | # LANGUAGE MODEL TRAINING 95 | 96 | [LM] 97 | 98 | cores = 10 99 | 100 | ### tool to be used for language model training 101 | # for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh) 102 | # 103 | lm-training = $srilm-dir/ngram-count 104 | settings = "-interpolate -kndiscount -unk" 105 | order = 5 106 | 107 | ### script to use for binary table format 108 | # (default: no binarization) 109 | # 110 | #lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm 111 | 112 | # kenlm, also set type to 8 113 | #lm-binarizer = $moses-src-dir/kenlm/build_binary 114 | #type = 8 115 | 116 | ### script to create quantized language model format 117 | # (default: no quantization) 118 | # 119 | #lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm 120 | 121 | ### tools to use to prepare the data 122 | # 123 | #tokenizer = 124 | #lowercaser = 125 | 126 | ### each language model to be used has its own section here 127 | 128 | ### if corpus preparation should be skipped, 129 | # point to the prepared language model 130 | # 131 | #lm = 132 | 133 | # internal settings for RDLM 134 | # IMPORTANT: update rdlm-working-dir when training a new RDLM to avoid overwriting old files 135 | rdlm-left-context = 3 136 | rdlm-right-context = 3 137 | rdlm-up-context = 2 138 | rdlm-working-dir = 1 139 | 140 | [LM:parallelA] 141 | raw-corpus = $toy-data/parallelA.$pair-extension.$output-extension 142 | 143 | [LM:parallelB] 144 | raw-corpus = 
$toy-data/parallelB.$pair-extension.$output-extension 145 | 146 | [LM:parallelC] 147 | # if you do your own parsing (or wanna re-use other data, like http://statmt.org/rsennrich/parsed_wmt/ ), 148 | # you can add parsed corpora to your system like this 149 | mock-parsed-corpus = $toy-data/parallelC.$pair-extension.parsed_target.$output-extension 150 | 151 | [LM:monolingualA] 152 | raw-corpus = $toy-data/monolingualA.$output-extension 153 | 154 | ### Relational Dependency LM trained on concatenation of other training corpora [head model] 155 | [LM:RDLM] 156 | 157 | ### define which corpora to concatenate 158 | # we use -split here because we do not want to strip away syntactic markup 159 | # 160 | concatenate-files-split = [LM:{parallelA,parallelB,monolingualA}:split-corpus] 161 | 162 | ### tell INTERPOLATED-LM to ignore this model 163 | # 164 | exclude-from-interpolation = true 165 | 166 | ### syntactic = true indicate that custom-training should take file with syntactic markup as input (requires mock-output-parser-lm) 167 | # 168 | syntactic = true 169 | 170 | ### training command 171 | # 172 | custom-training = "mkdir -p $working-dir/lm/rdlm/rdlm_head$rdlm-working-dir && $moses-script-dir/training/rdlm/train_rdlm.py \ 173 | --nplm-home $nplm-dir --working-dir $working-dir/lm/rdlm/rdlm_head$rdlm-working-dir \ 174 | --output-dir $working-dir/lm/rdlm/rdlm_head$rdlm-working-dir --output-model rdlm_head \ 175 | --mode head --output-vocab-size 500000 --noise 100 --left-context-size $rdlm-left-context \ 176 | --right-context-size $rdlm-right-context --up-context-size $rdlm-up-context \ 177 | --epochs 10 --mmap" 178 | 179 | # we train two RDLMs, but only need one entry in the config, so we leave this empty 180 | config-feature-line = " " 181 | config-weight-line = " " 182 | 183 | 184 | ### Relational Dependency LM trained on concatenation of other training corpora [label model] 185 | [LM:RDLM2] 186 | 187 | ### define which corpora to concatenate 188 | # we use -split here because we do not want to strip away syntactic markup 189 | # 190 | split-corpus = [LM:RDLM:split-corpus] 191 | 192 | ### tell INTERPOLATED-LM to ignore this model 193 | # 194 | exclude-from-interpolation = true 195 | 196 | ### syntactic = true indicate that custom-training should take file with syntactic markup as input (requires mock-output-parser-lm) 197 | # 198 | syntactic = true 199 | 200 | ### training command 201 | # 202 | custom-training = "mkdir -p $working-dir/lm/rdlm/rdlm_label$rdlm-working-dir && $moses-script-dir/training/rdlm/train_rdlm.py \ 203 | --nplm-home $nplm-dir --working-dir $working-dir/lm/rdlm/rdlm_label$rdlm-working-dir \ 204 | --output-dir $working-dir/lm/rdlm/rdlm_label$rdlm-working-dir --output-model rdlm_label \ 205 | --mode label --output-vocab-size 75 --noise 50 --left-context-size $rdlm-left-context \ 206 | --right-context-size $rdlm-right-context --up-context-size $rdlm-up-context \ 207 | --epochs 10 --mmap" 208 | 209 | ### manually specify feature and weight lines for moses.ini (required for custom-training) 210 | # 211 | config-feature-line = "RDLM path_head_lm=$working-dir/lm/rdlm/rdlm_head$rdlm-working-dir/rdlm_head.model.nplm path_label_lm=$working-dir/lm/rdlm/rdlm_label$rdlm-working-dir/rdlm_label.model.nplm backoff=true premultiply=true context_left=$rdlm-left-context context_right=$rdlm-right-context context_up=$rdlm-up-context binarized=full" 212 | config-weight-line = "RDLM0= 0.1 0.1" 213 | 214 | 215 | ################################################################# 216 | # 
INTERPOLATING LANGUAGE MODELS 217 | 218 | [INTERPOLATED-LM] 219 | 220 | # if multiple language models are used, these may be combined 221 | # by optimizing perplexity on a tuning set 222 | # see, for instance [Koehn and Schwenk, IJCNLP 2008] 223 | 224 | ### script to interpolate language models 225 | # if commented out, no interpolation is performed 226 | # 227 | script = $moses-script-dir/ems/support/interpolate-lm.perl 228 | 229 | ### tuning set 230 | # you may use the same set that is used for mert tuning (reference set) 231 | # 232 | raw-tuning = $toy-data/newstest2012.$output-extension 233 | 234 | ### script to use for binary table format for irstlm or kenlm 235 | # kenlm, also set type to 8 236 | lm-binarizer = $moses-src-dir/bin/build_binary 237 | type = 8 238 | 239 | ################################################################# 240 | # TRANSLATION MODEL TRAINING 241 | 242 | [TRAINING] 243 | 244 | ### training script to be used: either a legacy script or 245 | # current moses training script (default) 246 | # 247 | script = $moses-script-dir/training/train-model.perl 248 | 249 | ### general options 250 | # 251 | training-options = "-mgiza -mgiza-cpus 8 -sort-buffer-size 10G -sort-compress gzip -cores 16 -alt-direct-rule-score-2 --ghkm-tree-fragment" 252 | 253 | ### symmetrization method to obtain word alignments from giza output 254 | # (commonly used: grow-diag-final-and) 255 | # 256 | alignment-symmetrization-method = grow-diag-final-and 257 | 258 | run-giza-in-parts = 5 259 | 260 | ### if word alignment (giza symmetrization) should be skipped, 261 | # point to word alignment files 262 | # 263 | # word-alignment = 264 | 265 | ### hierarchical rule set 266 | # 267 | hierarchical-rule-set = true 268 | use-ghkm = true 269 | use-pcfg-feature = true 270 | use-unknown-word-soft-matches = true 271 | dont-tune-glue-grammar = true 272 | 273 | extract-settings = "--UnknownWordMinRelFreq 0.01 --MaxNodes 40 --MaxRuleDepth 7 --MaxRuleSize 7 --AllowUnary" 274 | score-settings = " --GoodTuring --LowCountFeature --MinCountHierarchical 2 --MinScore 2:0.0001" 275 | 276 | 277 | ### if phrase extraction should be skipped, 278 | # point to stem for extract files 279 | # 280 | # extracted-phrases = 281 | 282 | ### if phrase table training should be skipped, 283 | # point to phrase translation table 284 | # 285 | # phrase-translation-table = 286 | 287 | ### if training should be skipped, 288 | # point to a configuration file that contains 289 | # pointers to all relevant model files 290 | # config = 291 | 292 | ####################################################### TUNING: finding good weights for model components 293 | 294 | [TUNING] 295 | 296 | ### instead of tuning with this setting, old weights may be recycled 297 | # specify here an old configuration file with matching weights 298 | # 299 | #weight-config = 300 | 301 | ### tuning script to be used 302 | # 303 | tuning-script = $moses-script-dir/training/mert-moses.pl 304 | tuning-settings = "-mertdir $moses-src-dir/bin --batch-mira --return-best-dev -maximum-iterations 25 --threads 16 -batch-mira-args='--sctype BLEU,HWCM'" 305 | 306 | ### specify the corpus used for tuning 307 | # it should contain 100s if not 1000s of sentences 308 | # 309 | raw-input = $toy-data/newstest2012.$input-extension 310 | # tokenized-input = 311 | # factorized-input = 312 | # input = 313 | 314 | inputtype = 3 315 | 316 | raw-reference = $toy-data/newstest2012.$output-extension 317 | # tokenized-reference = 318 | # factorized-reference = 319 | # reference = 320 | 321 
| ### size of n-best list used (typically 100) 322 | # 323 | nbest = 1000 324 | 325 | ### ranges for weights for random initialization 326 | # if not specified, the tuning script will use generic ranges 327 | # it is not clear, if this matters 328 | # 329 | # lambda = 330 | 331 | ### additional flags for the decoder 332 | # 333 | decoder-settings = "-feature-overwrite 'TranslationModel0 table-limit=100' -threads 8 -max-chart-span 50 -rule-limit 50 -n-best-trees" 334 | 335 | ### if tuning should be skipped, specify this here 336 | # and also point to a configuration file that contains 337 | # pointers to all relevant model files 338 | # 339 | 340 | 341 | ######################################################### 342 | ## RECASER: restore case, this part only trains the model 343 | 344 | [RECASING] 345 | 346 | #decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm 347 | 348 | ### training data 349 | # raw input needs to be still tokenized, 350 | # also also tokenized input may be specified 351 | # 352 | #tokenized = [LM:europarl:tokenized-corpus] 353 | 354 | # recase-config = 355 | 356 | #lm-training = $moses-src-dir/srilm/bin/i686/ngram-count 357 | 358 | ####################################################### 359 | ## TRUECASER: train model to truecase corpora and input 360 | 361 | [TRUECASER] 362 | 363 | ### script to train truecaser models 364 | # 365 | trainer = $moses-script-dir/recaser/train-truecaser.perl 366 | 367 | ### training data 368 | # raw input needs to be still tokenized, 369 | # also also tokenized input may be specified 370 | # 371 | # tokenized-stem = $working-dir/data/ep+nc 372 | 373 | ### trained model 374 | # 375 | #truecase-model = 376 | 377 | ############################################################ 378 | ## EVALUATION: translating a test set using the tuned system 379 | 380 | [EVALUATION] 381 | 382 | ### number of jobs (if parallel execution of testing) 383 | # 384 | jobs = 10 385 | 386 | filter-settings = " " 387 | 388 | 389 | ### prepare system output for scoring 390 | # this may include detokenization and wrapping output in sgm 391 | # (needed for nist-bleu, ter, meteor) 392 | # 393 | #recaser = $moses-script-dir/recaser/recase.perl 394 | wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension" 395 | # output-sgm = 396 | 397 | ### should output be scored case-sensitive (default: no)? 
398 | # 399 | # case-sensitive = yes 400 | 401 | ### BLEU 402 | # 403 | nist-bleu = $moses-script-dir/generic/mteval-v13a.pl 404 | nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c" 405 | # multi-bleu = $edinburgh-script-dir/multi-bleu.perl 406 | # ibm-bleu = 407 | 408 | ### TER: translation error rate (BBN metric) based on edit distance 409 | # 410 | # ter = $edinburgh-script-dir/tercom_v6a.pl 411 | 412 | ### METEOR: gives credit to stem / worknet synonym matches 413 | # 414 | # meteor = 415 | 416 | ### Analysis: carry out various forms of analysis on the output 417 | # 418 | analysis = $moses-script-dir/ems/support/analysis.perl 419 | #analyze-coverage = yes 420 | report-segmentation = yes 421 | 422 | 423 | [EVALUATION:newstest2013] 424 | decoder-settings = "-feature-overwrite 'TranslationModel0 table-limit=100' -threads 8 -max-chart-span 50 -rule-limit 100" 425 | input-sgm = $toy-data/newstest2013-src.$input-extension.sgm 426 | wrapping-frame = $input-sgm 427 | reference-sgm = $toy-data/newstest2013-ref.$output-extension.sgm 428 | 429 | [REPORTING] 430 | 431 | ### what to do with result (default: store in file evaluation/report) 432 | # 433 | # email = pkoehn@inf.ed.ac.uk 434 | 435 | -------------------------------------------------------------------------------- /enrich_labelset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | # this script modifies the ParZu grammar output to a representation that is more suitable for SMT: 6 | # ambiguous labels are split, and optionally enriched with morphological information. The script also restructures coordinations. 7 | # The modifications (the subset used for the WMT 2014 shared translation task EN-DE) are described in: 8 | # Rico Sennrich, Philip Williams, Matthias Huck (2015): 9 | # A tree does not make a well-formed sentence: Improving syntactic string-to-tree statistical machine translation with more linguistic knowledge. 10 | # In: Computer Speech & Language 32(1), 27-45. 
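#
# Expected input (one token per line, blank line between sentences) is ParZu's
# ten-column CoNLL-style output; the columns are read into the fields listed in
# KEYWORDS below (pos, word, lemma, tag, tag2, morph, head, func, proj_head,
# proj_func), and the converted annotation is written to stdout in the same order.
#
# In the example configs (example/toy_example*.config) the script sits in the
# output-parser pipeline between ParZu and conll2mosesxml.py, e.g. (paths abbreviated):
#
#   ... | parzu -i tokenized_lines --projective | enrich_labelset.py --wmt15 | conll2mosesxml.py
#
# (the WMT 2014 configuration passes --wmt14 instead of --wmt15)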
11 | 12 | from __future__ import print_function, unicode_literals 13 | import sys 14 | import codecs 15 | from collections import defaultdict 16 | 17 | #at which point in the morphological output is case information stored 18 | CASE_POSITION = {b'ADJA':2 19 | ,b'PPER':3 20 | ,b'ART':2 21 | ,b'APPRART':0 22 | ,b'APPR':0 23 | ,b'APPO':0 24 | ,b'PRF':2 25 | ,b'NN':1 26 | ,b'FM':1 27 | ,b'NE':1 28 | ,b'PIS':1 29 | ,b'PIAT':1 30 | ,b'PDS':1 31 | ,b'PIDAT':1 32 | ,b'PPOSS':1 33 | ,b'PPOSAT':1 34 | ,b'PRELS':1 35 | ,b'PRELAT':1 36 | ,b'PWS':1 37 | ,b'PWAT':1 38 | } 39 | 40 | 41 | 42 | GENDER_POSITION = {b'ADJA':1 43 | ,b'PPER':2 44 | ,b'ART':1 45 | ,b'NN':0 46 | ,b'FM':0 47 | ,b'NE':0 48 | ,b'PIS':0 49 | ,b'PIAT':0 50 | ,b'PDS':0 51 | ,b'PIDAT':0 52 | ,b'PPOSS':0 53 | ,b'PRELS':0 54 | ,b'PPOSAT':0 55 | ,b'PRELAT':0 56 | ,b'PWS':0 57 | ,b'PWAT':0 58 | } 59 | 60 | 61 | NUMBER_POSITION = {b'ADJA':3 62 | ,b'PPER':1 63 | ,b'ART':3 64 | ,b'PRF':1 65 | ,b'NN':2 66 | ,b'FM':2 67 | ,b'NE':2 68 | ,b'PIS':2 69 | ,b'PIAT':2 70 | ,b'PDS':2 71 | ,b'PIDAT':2 72 | ,b'PPOSS':2 73 | ,b'PPOSAT':2 74 | ,b'PRELS':2 75 | ,b'PRELAT':2 76 | ,b'PWS':2 77 | ,b'PWAT':2 78 | ,b'VVFIN':1 79 | ,b'VAFIN':1 80 | ,b'VMFIN':1 81 | } 82 | 83 | 84 | PERSON_POSITION = {b'PPER':0 85 | ,b'VVFIN':0 86 | ,b'VAFIN':0 87 | ,b'VMFIN':0 88 | } 89 | 90 | 91 | KEYWORDS = ['pos','word','lemma','tag','tag2','morph','head','func', 'proj_head', 'proj_func'] 92 | def create_named_dict(values): 93 | return dict(zip(KEYWORDS,values)) 94 | 95 | def sorted_values(named_dict): 96 | return [named_dict[keyword] for keyword in KEYWORDS] 97 | 98 | def write(sentence): 99 | for word in sentence: 100 | sys.stdout.write(b'\t'.join(sorted_values(word)) + b'\n') 101 | sys.stdout.write(b'\n') 102 | 103 | def main(fobj_in): 104 | sentence = [] 105 | for line in fobj_in: 106 | 107 | if line == b"\n": 108 | convert(sentence) 109 | write(sentence) 110 | sentence = [] 111 | continue 112 | 113 | word = create_named_dict(line.split()) 114 | sentence.append(word) 115 | 116 | 117 | def convert(sentence): 118 | 119 | spans = get_spans(sentence) 120 | for word in sentence: 121 | 122 | if word['func'] != word['proj_func']: 123 | sys.stderr.write('Whoops, better check why label and projective label are different\n') 124 | sys.stderr.write(b'\t'.join(sorted_values(word)) + b'\n') 125 | sys.exit(1) 126 | 127 | if word['func'] in CONVERSIONS: 128 | CONVERSIONS[word['func']](word, sentence, spans) 129 | 130 | def get_head(word, sentence): 131 | head_position = int(word['proj_head']) 132 | if head_position: 133 | return sentence[head_position-1] 134 | 135 | def comma_is_kon(word, sentence, spans): 136 | '''if comma joins two coordinated elements, mark this with a new function, 137 | then make it the head of the element to the right, and the dependent of the element to the left. 138 | this allows for recursive addition of new coordinated elements. 
139 | 140 | ''' 141 | 142 | if not 'kon' in CONVERSIONS: 143 | return 144 | 145 | head = get_head(word, sentence) 146 | if head and head['func'] == b'kon' and int(word['proj_head']) > int(word['pos']) and head['tag'] != b'KON': 147 | # make sure projectivity isn't violated 148 | if not any(int(w['proj_head']) > int(word['pos']) or int(w['proj_head']) < int(head['proj_head']) for w in sentence[int(head['proj_head']):int(word['pos'])-1]): 149 | word['proj_head'] = head['proj_head'] 150 | head['proj_head'] = word['pos'] 151 | word['proj_func'] = b'kon' 152 | word['func'] = b'kon' 153 | kon_conversion(word, sentence, spans) 154 | return word['func'] 155 | 156 | def aux_conversion(word, sentence, spans): 157 | '''distinguish between past participle and infinitive auxiliary verbs to avoid overgeneralization.''' 158 | morph_info = b'' 159 | if word['tag2'].endswith(b'PP'): 160 | morph_info = b'_pp' 161 | elif word['tag2'].endswith(b'INF'): 162 | if any(w['tag'] == b'PTKZU' and w['head'] == word['pos'] for w in sentence): 163 | morph_info = b'_izu' 164 | else: 165 | morph_info = b'_inf' 166 | elif word['tag2'].endswith(b'IZU'): 167 | morph_info = b'_izu' 168 | 169 | word['func'] += morph_info 170 | word['proj_func'] += morph_info 171 | 172 | 173 | def root_conversion(word, sentence, spans): 174 | '''distinguish between five types of structures that receive label 'root': 175 | punct: full stops, question marks etc. 176 | comma: commas 177 | bracket: quotation marks, hyphens, and brackets 178 | vroot: full verb; root of a successful parse 179 | root: everything else; typically root of partial trees. 180 | 181 | ''' 182 | morph_info = word['func'] 183 | if word['tag2'] == b'$.': 184 | morph_info = b'punct' 185 | elif word['tag2'] == b'$(': 186 | morph_info = b'bracket' 187 | elif word['tag2'] in [b'VVFIN',b'VMFIN',b'VAFIN']: 188 | # try to only give label 'vroot' to main clause roots, not to verb-last structures that remain unattached in parse 189 | midfield_labels = set(['subj','obja','subjc','adv','pred','pp','objp']) 190 | aux_labels = set(['aux','aux_pp','aux_inf','aux_vvizu']) 191 | direct_dependents_left = [w for w in sentence[:int(word['pos'])] if w['proj_head'] == word['pos'] and w['tag2'] not in ['$,','$(']] 192 | direct_dependents_right = [w for w in sentence[int(word['pos']):] if w['proj_head'] == word['pos']] 193 | if (len(direct_dependents_left) < 2 and not any(w['proj_func'] in aux_labels for w in direct_dependents_left)) or any(w['proj_func'] in midfield_labels for w in direct_dependents_right): 194 | morph_info = b'vroot' 195 | elif word['tag2'] == b'$,': 196 | morph_info = comma_is_kon(word, sentence, spans) 197 | if not morph_info: 198 | morph_info = b'comma' 199 | 200 | # mark remaining roots that cover the full sentence (or anything between two punctuation marks) with 'sroot' 201 | if morph_info == b'root': 202 | dependents = sorted(get_dependents_for_word(word, spans)) 203 | if dependents[0] == 0 or sentence[dependents[0]-1]['tag2'] == b'$.' or (sentence[dependents[0]-1]['tag2'] == b'$(' and (dependents[0]-1 == 0 or sentence[dependents[0]-2]['tag2'] == b'$.')): 204 | if dependents[-1]+1 == len(sentence) or sentence[dependents[-1]+1]['tag2'] == b'$.' 
or (sentence[dependents[-1]+1]['tag2'] == b'$(' and (dependents[-1]+2 == len(sentence) or sentence[dependents[-1]+2]['tag2'] == b'$.')): 205 | morph_info = b'sroot' 206 | 207 | word['func'] = morph_info 208 | word['proj_func'] = morph_info 209 | 210 | 211 | def obji_conversion(word, sentence, spans): 212 | '''distinguish between infinitive with 'zu' and bare infinitive 213 | examples: 214 | ich lasse ihn schlafen/obji_bare 215 | ich bitte ihn, zu schlafen/obji_zu 216 | ''' 217 | morph_info = b'' 218 | if word['tag2'] == b'VVIZU': 219 | morph_info = b'_zu' 220 | elif any(w['tag'] == b'PTKZU' and w['head'] == word['pos'] for w in sentence): 221 | morph_info = b'_zu' 222 | else: 223 | morph_info = b'_bare' 224 | 225 | word['func'] += morph_info 226 | word['proj_func'] += morph_info 227 | 228 | dependents = sorted(get_dependents_for_word(word, spans)) 229 | if sentence[dependents[0]]['proj_func'] == b'comma': 230 | word['func'] += b'_comma' 231 | word['proj_func'] += b'_comma' 232 | 233 | def pn_conversion(word, sentence, spans): 234 | '''add grammatical case to prepositional noun''' 235 | head = get_head(word, sentence) 236 | case = get_morphology(head)['case'] 237 | 238 | if case != b'_': 239 | word['func'] += b'_'+ case 240 | word['proj_func'] += b'_'+ case 241 | 242 | 243 | def np_conversion(word, sentence, spans): 244 | '''enforce agreement within NP (case, number, gender)''' 245 | morph_dict = get_morphology(word) 246 | 247 | # gender doesn't matter for plural agreement 248 | if morph_dict['number'] == 'pl': 249 | morph_dict['gender'] = b'_' 250 | 251 | morph_info = morph_dict['gender'] + b'-' + morph_dict['case'] + b'-' + morph_dict['number'] 252 | 253 | if morph_info != b'_-_-_': 254 | word['func'] += b'_'+ morph_info 255 | word['proj_func'] += b'_'+ morph_info 256 | 257 | 258 | def subj_coord_conversion(word, sentence, spans): 259 | '''mark coordinated subjects (which do not need to agree with verb in number)''' 260 | if any(w['proj_func'] == 'kon' and w['proj_head'] == word['pos'] for w in sentence): 261 | word['func'] = b'csubj' 262 | word['proj_func'] = b'csubj' 263 | 264 | def subj_conversion(word, sentence, spans): 265 | '''enforce agreement between subject and verb (person/number)''' 266 | 267 | head = get_head(word, sentence) 268 | morph_dict = get_morphology(head) 269 | 270 | morph_info = morph_dict['person'] + b'-' + morph_dict['number'] 271 | 272 | if morph_info != b'_-_': 273 | word['func'] += b'_'+ morph_info 274 | word['proj_func'] += b'_'+ morph_info 275 | 276 | 277 | def kon_conversion(word, sentence, spans): 278 | ''' 279 | let elements in coordination copy the label of the first element, 280 | and mark commas and conjunctions with label that specifies what type of structure is coordinated. 
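    illustrative sketch (hypothetical parse): in "Hunde und Katzen schlafen",
    if "Hunde" carries the label 'subj', the conjunction "und" (label 'kon',
    tag KON) becomes 'kon_subj' and the conjunct "Katzen" (label 'cj') copies
    the label 'subj'. coordinated subordinate clauses (rel/objc/subjc/neb heads)
    are collapsed to 'vkon_sub', and comparative clauses ('kom') are left unchanged.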
281 | 282 | ''' 283 | head = get_head(word, sentence) 284 | while head and (head['func'].startswith(b'kon') or head['func'].startswith(b'app') or head['func'].startswith(b'cj')): 285 | head = get_head(head, sentence) 286 | 287 | if head: 288 | headfunc = head['func'] 289 | else: 290 | headfunc = b'root' 291 | 292 | # ignore comparative clause 293 | if headfunc.startswith(b'kom'): 294 | return 295 | 296 | elif headfunc.startswith(b'rel') or headfunc.startswith(b'objc') or headfunc.startswith(b'subjc') or headfunc.startswith(b'neb'): 297 | headfunc = b'vkon_sub' 298 | 299 | # ignore number/person information 300 | elif headfunc.startswith(b'subj'): 301 | headfunc = b'subj' 302 | 303 | if word['func'] == b'cj' and headfunc == b'csubj': 304 | headfunc = b'subj' 305 | 306 | if word['func'] == b'kon' and word['tag'] == b'KON' or word['tag'] == b'$,': 307 | word['func'] += b'_'+ headfunc 308 | word['proj_func'] += b'_'+ headfunc 309 | else: 310 | word['func'] = headfunc 311 | word['proj_func'] = headfunc 312 | 313 | 314 | def gmod_conversion(word, sentence, spans): 315 | '''distinguish between premodifying and postmodifying genitive modifiers 316 | premodifying are typically named entities without articles (Peters X) 317 | postmodifying are typically noun phrases with articles (X der Firma) 318 | 319 | ''' 320 | if int(word['pos']) > int(word['proj_head']): 321 | info = b'post' 322 | else: 323 | info = b'pre' 324 | 325 | word['func'] += b'_'+ info 326 | word['proj_func'] += b'_'+ info 327 | 328 | def pred_conversion(word, sentence, spans): 329 | '''distinguish between adverbial and nominal predicates''' 330 | 331 | info = b'' 332 | if word['tag2'] in [b'ADJD',b'ADV',b'PWAV']: 333 | info = b'_adv' 334 | elif word['tag2'] in [b'NE', b'NN', b'FM', b'PIS', b'PPER', b'PWS', b'ADJA']: 335 | info = b'_nn' 336 | 337 | word['func'] += info 338 | word['proj_func'] += info 339 | 340 | def get_morphology(word): 341 | morph_info = word['morph'].split(b'|') 342 | morph_dict = {} 343 | 344 | tag = word['tag2'] 345 | 346 | try: 347 | morph_dict['case'] = morph_info[CASE_POSITION[tag]].lower() 348 | except (IndexError, KeyError): 349 | morph_dict['case'] = b'_' 350 | 351 | try: 352 | morph_dict['gender'] = morph_info[GENDER_POSITION[tag]].lower() 353 | except (IndexError, KeyError): 354 | morph_dict['gender'] = b'_' 355 | 356 | try: 357 | morph_dict['number'] = morph_info[NUMBER_POSITION[tag]].lower() 358 | except (IndexError, KeyError): 359 | morph_dict['number'] = b'_' 360 | 361 | try: 362 | morph_dict['person'] = morph_info[PERSON_POSITION[tag]].lower() 363 | except (IndexError, KeyError): 364 | morph_dict['person'] = b'_' 365 | 366 | return morph_dict 367 | 368 | 369 | def get_spans(sentence): 370 | spans = {} 371 | dominates = defaultdict(set) 372 | for i,w in enumerate(sentence): 373 | dominates[i].add(i) 374 | head = int(w['proj_head'])-1 375 | while head != -1: 376 | if i in dominates[head]: 377 | break 378 | dominates[head].add(i) 379 | head = int(sentence[head]['proj_head'])-1 380 | 381 | return dominates 382 | 383 | def get_dependents_for_word(word, dependents): 384 | return dependents[int(word['pos'])-1] 385 | 386 | CONVERSIONS = {b'aux':aux_conversion 387 | ,b'root':root_conversion 388 | ,b'obji':obji_conversion 389 | ,b'pn':pn_conversion 390 | ,b'det':np_conversion 391 | ,b'attr':np_conversion 392 | ,b'subj':subj_conversion 393 | ,b'kon':kon_conversion 394 | ,b'cj':kon_conversion 395 | ,b'gmod':gmod_conversion 396 | ,b'pred':pred_conversion 397 | } 398 | 399 | if __name__ == '__main__': 400 | if 
sys.version_info >= (3,0,0): 401 | sys.stdin = sys.stdin.buffer 402 | sys.stdout = sys.stdout.buffer 403 | sys.stderr = sys.stderr.buffer 404 | 405 | # conversions used for WMT 14 406 | if '--wmt14' in sys.argv: 407 | CONVERSIONS = {b'root':root_conversion 408 | ,b'kon':kon_conversion 409 | ,b'cj':kon_conversion 410 | ,b'gmod':gmod_conversion} 411 | 412 | if '--wmt15' in sys.argv: 413 | CONVERSIONS = {b'root':root_conversion 414 | ,b'kon':kon_conversion 415 | ,b'cj':kon_conversion 416 | ,b'gmod':gmod_conversion 417 | ,b'subj':subj_coord_conversion 418 | ,b'obji':obji_conversion} 419 | 420 | if '--coord-subj' in sys.argv: 421 | CONVERSIONS[b'subj'] = subj_coord_conversion 422 | 423 | if '--obji' in sys.argv: 424 | CONVERSIONS[b'obji'] = obji_conversion 425 | 426 | for arg in sys.argv[1:]: 427 | if arg.startswith('--disable_'): 428 | disabled_class = arg.split('_',1)[1].encode('UTF-8') 429 | del CONVERSIONS[disabled_class] 430 | 431 | main(sys.stdin) 432 | -------------------------------------------------------------------------------- /example/toy_example_2015_4.config: -------------------------------------------------------------------------------- 1 | ################################################ 2 | ### CONFIGURATION FILE FOR AN SMT EXPERIMENT ### 3 | ################################################ 4 | 5 | [GENERAL] 6 | 7 | ###### you need to set these paths to match your environemnt 8 | ###### 9 | 10 | moses-src-dir = /home/rsennrich/tools/mosesdecoder 11 | wmt2014-scripts = /home/rsennrich/smtworkspace/wmt2014-scripts 12 | parzu-path = /home/rsennrich/ParZu # https://github.com/rsennrich/ParZu 13 | zmorge-model = /home/rsennrich/zmorge/zmorge-20141224-smor_newlemma.a #get this (or a newer version) from http://kitt.ifi.uzh.ch/kitt/zmorge/ 14 | srilm-dir = /home/rsennrich/tools/srilm/bin/i686-m64/ 15 | external-bin-dir = ~/bin 16 | nplm-dir = /home/rsennrich/tools/nplm-github/ 17 | maltparser = /home/rsennrich/tools/maltparser-1.8.1/maltparser-1.8.1.jar 18 | corenlp = /home/rsennrich/tools/stanford-corenlp-full-2014-10-31 19 | 20 | ###### no further changes should be required to run the toy example 21 | ###### (but feel free to experiment with different settings, or change the training/test data) 22 | 23 | moses-script-dir = $moses-src-dir/scripts 24 | moses-bin-dir = $moses-src-dir/bin 25 | toy-data = $wmt2014-scripts/example/data 26 | working-dir = $wmt2014-scripts/example/working-dir 27 | decoder = $moses-src-dir/bin/moses 28 | 29 | input-tokenizer = "$moses-script-dir/tokenizer/normalize-punctuation.perl $input-extension | $moses-script-dir/tokenizer/tokenizer.perl -l $input-extension -penn" 30 | output-tokenizer = "$moses-script-dir/tokenizer/normalize-punctuation.perl $output-extension | $moses-script-dir/tokenizer/tokenizer.perl -l $output-extension" 31 | input-truecaser = $moses-script-dir/recaser/truecase.perl 32 | output-truecaser = $moses-script-dir/recaser/truecase.perl 33 | detruecaser = $moses-script-dir/recaser/detruecase.perl 34 | 35 | input-parser = "$moses-script-dir/tokenizer/deescape-special-chars.perl \ 36 | | python $moses-script-dir/training/wrappers/parse-en-stanford.py --stanford $corenlp \ 37 | | java -jar $maltparser -c pproj -m proj -pp baseline -pcr head \ 38 | | python $moses-script-dir/training/wrappers/conll2mosesxml.py" 39 | 40 | # parsing pipeline used for WMT 2014 41 | output-parser = "$moses-script-dir/tokenizer/deescape-special-chars.perl | $parzu-path/parzu -i tokenized_lines --projective | $wmt2014-scripts/enrich_labelset.py --wmt15 | 
$moses-script-dir/training/wrappers/conll2mosesxml.py" 42 | 43 | # also parse tuning/evaluation reference files 44 | mock-output-parser-references = $output-parser 45 | mock-output-parser-lm = $output-parser 46 | 47 | # SAMT relaxation for soft source-syntactic constraints 48 | input-parse-relaxer = "$moses-src-dir/bin/relax-parse --SAMT 2" 49 | 50 | # head binarization 51 | output-parse-relaxer = "$wmt2014-scripts/emnlp2015/binarize.py head" 52 | 53 | inputtype = 3 54 | 55 | # hybrid compound splitting (described in Sennrich, Williams and Huck, 2015) 56 | output-splitter = "$wmt2014-scripts/hybrid_compound_splitter.py -smor $zmorge-model -write-filler -no-truecase -q -syntax -dependency -fewest" 57 | 58 | # sed instructions unsplit the split compunds from output-splitter 59 | detokenizer = "$moses-script-dir/tokenizer/detokenizer.perl -l $output-extension | sed -r 's/ \@(\S*?)\@ /\1/g' | sed -r 's/\@\@ //g'" 60 | 61 | input-extension = en 62 | output-extension = de 63 | pair-extension = de-en 64 | 65 | generic-parallelizer = $moses-script-dir/ems/support/generic-multicore-parallelizer.perl 66 | 67 | jobs = 10 68 | 69 | ################################################################# 70 | # PARALLEL CORPUS PREPARATION: 71 | # create a tokenized, sentence-aligned corpus, ready for training 72 | 73 | [CORPUS] 74 | 75 | cores = 10 76 | 77 | ### tools to use to prepare the data 78 | # 79 | #tokenizer = 80 | #lowercaser = 81 | 82 | ### long sentences are filtered out, since they slow down GIZA++ 83 | # and are a less reliable source of data. set here the maximum 84 | # length of a sentence 85 | # 86 | max-sentence-length = 80 87 | 88 | ### GIZA++ does not allow sentence pairs of highly uneven length. 89 | # since uneven sentence length is an indicator of a misalignment, 90 | # we set a maximum ratio of 3 (this also gives us room for compoudn splitting) 91 | # 92 | cleaner = "$moses-script-dir/training/clean-corpus-n.perl -ratio 3" 93 | 94 | [CORPUS:parallelA] 95 | raw-stem = $toy-data/parallelA.$pair-extension 96 | 97 | [CORPUS:parallelB] 98 | raw-stem = $toy-data/parallelB.$pair-extension 99 | 100 | [CORPUS:parallelC] 101 | # if you do your own parsing (or wanna re-use other data, like http://statmt.org/rsennrich/parsed_wmt/ ), 102 | # you can add parsed corpora to your system like this 103 | clean-parsed-stem = $toy-data/parallelC.$pair-extension.parsed_both 104 | 105 | ################################################################# 106 | # LANGUAGE MODEL TRAINING 107 | 108 | [LM] 109 | 110 | cores = 10 111 | 112 | ### tool to be used for language model training 113 | # for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh) 114 | # 115 | lm-training = $srilm-dir/ngram-count 116 | settings = "-interpolate -kndiscount -unk" 117 | order = 5 118 | 119 | ### script to use for binary table format 120 | # (default: no binarization) 121 | # 122 | #lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm 123 | 124 | # kenlm, also set type to 8 125 | #lm-binarizer = $moses-src-dir/kenlm/build_binary 126 | #type = 8 127 | 128 | ### script to create quantized language model format 129 | # (default: no quantization) 130 | # 131 | #lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm 132 | 133 | ### tools to use to prepare the data 134 | # 135 | #tokenizer = 136 | #lowercaser = 137 | 138 | ### each language model to be used has its own section here 139 | 140 | ### if corpus preparation should be skipped, 141 | # point to the prepared language model 142 | # 143 | #lm = 144 | 145 | # internal 
settings for RDLM 146 | # IMPORTANT: update rdlm-working-dir when training a new RDLM to avoid overwriting old files 147 | rdlm-left-context = 3 148 | rdlm-right-context = 3 149 | rdlm-up-context = 2 150 | rdlm-working-dir = 1 151 | 152 | [LM:parallelA] 153 | raw-corpus = $toy-data/parallelA.$pair-extension.$output-extension 154 | 155 | [LM:parallelB] 156 | raw-corpus = $toy-data/parallelB.$pair-extension.$output-extension 157 | 158 | [LM:parallelC] 159 | # if you do your own parsing (or wanna re-use other data, like http://statmt.org/rsennrich/parsed_wmt/ ), 160 | # you can add parsed corpora to your system like this 161 | mock-parsed-corpus = $toy-data/parallelC.$pair-extension.parsed_both.$output-extension 162 | 163 | [LM:monolingualA] 164 | raw-corpus = $toy-data/monolingualA.$output-extension 165 | 166 | ### Relational Dependency LM trained on concatenation of other training corpora [head model] 167 | [LM:RDLM] 168 | 169 | ### define which corpora to concatenate 170 | # we use -split here because we do not want to strip away syntactic markup 171 | # 172 | concatenate-files-split = [LM:{parallelA,parallelB,monolingualA}:split-corpus] 173 | 174 | ### tell INTERPOLATED-LM to ignore this model 175 | # 176 | exclude-from-interpolation = true 177 | 178 | ### syntactic = true indicate that custom-training should take file with syntactic markup as input (requires mock-output-parser-lm) 179 | # 180 | syntactic = true 181 | 182 | ### training command 183 | # 184 | custom-training = "mkdir -p $working-dir/lm/rdlm/rdlm_head$rdlm-working-dir && $moses-script-dir/training/rdlm/train_rdlm.py \ 185 | --nplm-home $nplm-dir --working-dir $working-dir/lm/rdlm/rdlm_head$rdlm-working-dir \ 186 | --output-dir $working-dir/lm/rdlm/rdlm_head$rdlm-working-dir --output-model rdlm_head \ 187 | --mode head --output-vocab-size 500000 --noise 100 --left-context-size $rdlm-left-context \ 188 | --right-context-size $rdlm-right-context --up-context-size $rdlm-up-context \ 189 | --epochs 10 --mmap" 190 | 191 | # we train two RDLMs, but only need one entry in the config, so we leave this empty 192 | config-feature-line = " " 193 | config-weight-line = " " 194 | 195 | 196 | ### Relational Dependency LM trained on concatenation of other training corpora [label model] 197 | [LM:RDLM2] 198 | 199 | ### define which corpora to concatenate 200 | # we use -split here because we do not want to strip away syntactic markup 201 | # 202 | split-corpus = [LM:RDLM:split-corpus] 203 | 204 | ### tell INTERPOLATED-LM to ignore this model 205 | # 206 | exclude-from-interpolation = true 207 | 208 | ### syntactic = true indicate that custom-training should take file with syntactic markup as input (requires mock-output-parser-lm) 209 | # 210 | syntactic = true 211 | 212 | ### training command 213 | # 214 | custom-training = "mkdir -p $working-dir/lm/rdlm/rdlm_label$rdlm-working-dir && $moses-script-dir/training/rdlm/train_rdlm.py \ 215 | --nplm-home $nplm-dir --working-dir $working-dir/lm/rdlm/rdlm_label$rdlm-working-dir \ 216 | --output-dir $working-dir/lm/rdlm/rdlm_label$rdlm-working-dir --output-model rdlm_label \ 217 | --mode label --output-vocab-size 75 --noise 50 --left-context-size $rdlm-left-context \ 218 | --right-context-size $rdlm-right-context --up-context-size $rdlm-up-context \ 219 | --epochs 10 --mmap" 220 | 221 | ### manually specify feature and weight lines for moses.ini (required for custom-training) 222 | # 223 | config-feature-line = "RDLM path_head_lm=$working-dir/lm/rdlm/rdlm_head$rdlm-working-dir/rdlm_head.model.nplm 
path_label_lm=$working-dir/lm/rdlm/rdlm_label$rdlm-working-dir/rdlm_label.model.nplm backoff=true premultiply=true context_left=$rdlm-left-context context_right=$rdlm-right-context context_up=$rdlm-up-context binarized=full" 224 | config-weight-line = "RDLM0= 0.1 0.1" 225 | 226 | 227 | ################################################################# 228 | # INTERPOLATING LANGUAGE MODELS 229 | 230 | [INTERPOLATED-LM] 231 | 232 | # if multiple language models are used, these may be combined 233 | # by optimizing perplexity on a tuning set 234 | # see, for instance [Koehn and Schwenk, IJCNLP 2008] 235 | 236 | ### script to interpolate language models 237 | # if commented out, no interpolation is performed 238 | # 239 | script = $moses-script-dir/ems/support/interpolate-lm.perl 240 | 241 | ### tuning set 242 | # you may use the same set that is used for mert tuning (reference set) 243 | # 244 | raw-tuning = $toy-data/newstest2012.$output-extension 245 | 246 | ### script to use for binary table format for irstlm or kenlm 247 | # kenlm, also set type to 8 248 | lm-binarizer = $moses-src-dir/bin/build_binary 249 | type = 8 250 | 251 | ################################################################# 252 | # TRANSLATION MODEL TRAINING 253 | 254 | [TRAINING] 255 | 256 | ### training script to be used: either a legacy script or 257 | # current moses training script (default) 258 | # 259 | script = $moses-script-dir/training/train-model.perl 260 | 261 | ### general options 262 | # 263 | training-options = "-mgiza -mgiza-cpus 8 -sort-buffer-size 10G -sort-compress gzip -cores 16 -alt-direct-rule-score-2 --ghkm-tree-fragment" 264 | 265 | ### symmetrization method to obtain word alignments from giza output 266 | # (commonly used: grow-diag-final-and) 267 | # 268 | alignment-symmetrization-method = grow-diag-final-and 269 | 270 | run-giza-in-parts = 5 271 | 272 | ### if word alignment (giza symmetrization) should be skipped, 273 | # point to word alignment files 274 | # 275 | # word-alignment = 276 | 277 | ### hierarchical rule set 278 | # 279 | hierarchical-rule-set = true 280 | use-ghkm = true 281 | use-pcfg-feature = true 282 | use-unknown-word-soft-matches = true 283 | dont-tune-glue-grammar = true 284 | ghkm-source-labels = true 285 | 286 | extract-settings = "--UnknownWordMinRelFreq 0.01 --MaxNodes 40 --MaxRuleDepth 7 --MaxRuleSize 7 --AllowUnary" 287 | score-settings = " --GoodTuring --LowCountFeature --MinCountHierarchical 2 --MinScore 2:0.0001" 288 | 289 | 290 | ### if phrase extraction should be skipped, 291 | # point to stem for extract files 292 | # 293 | # extracted-phrases = 294 | 295 | ### if phrase table training should be skipped, 296 | # point to phrase translation table 297 | # 298 | # phrase-translation-table = 299 | 300 | ### if training should be skipped, 301 | # point to a configuration file that contains 302 | # pointers to all relevant model files 303 | # config = 304 | 305 | ####################################################### TUNING: finding good weights for model components 306 | 307 | [TUNING] 308 | 309 | ### instead of tuning with this setting, old weights may be recycled 310 | # specify here an old configuration file with matching weights 311 | # 312 | #weight-config = 313 | 314 | ### tuning script to be used 315 | # 316 | tuning-script = $moses-script-dir/training/mert-moses.pl 317 | tuning-settings = "-mertdir $moses-src-dir/bin --batch-mira --return-best-dev -maximum-iterations 25 --threads 16 -batch-mira-args='--sctype BLEU,HWCM'" 318 | 319 | ### specify the corpus 
used for tuning 320 | # it should contain 100s if not 1000s of sentences 321 | # 322 | raw-input = $toy-data/newstest2012.$input-extension 323 | # tokenized-input = 324 | # factorized-input = 325 | # input = 326 | 327 | inputtype = 3 328 | 329 | raw-reference = $toy-data/newstest2012.$output-extension 330 | # tokenized-reference = 331 | # factorized-reference = 332 | # reference = 333 | 334 | ### size of n-best list used (typically 100) 335 | # 336 | nbest = 1000 337 | 338 | ### ranges for weights for random initialization 339 | # if not specified, the tuning script will use generic ranges 340 | # it is not clear, if this matters 341 | # 342 | # lambda = 343 | 344 | ### additional flags for the decoder 345 | # 346 | decoder-settings = "-feature-overwrite 'TranslationModel0 table-limit=100' -threads 8 -max-chart-span 50 -rule-limit 50 -n-best-trees" 347 | 348 | ### if tuning should be skipped, specify this here 349 | # and also point to a configuration file that contains 350 | # pointers to all relevant model files 351 | # 352 | 353 | 354 | ######################################################### 355 | ## RECASER: restore case, this part only trains the model 356 | 357 | [RECASING] 358 | 359 | #decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm 360 | 361 | ### training data 362 | # raw input needs to be still tokenized, 363 | # also also tokenized input may be specified 364 | # 365 | #tokenized = [LM:europarl:tokenized-corpus] 366 | 367 | # recase-config = 368 | 369 | #lm-training = $moses-src-dir/srilm/bin/i686/ngram-count 370 | 371 | ####################################################### 372 | ## TRUECASER: train model to truecase corpora and input 373 | 374 | [TRUECASER] 375 | 376 | ### script to train truecaser models 377 | # 378 | trainer = $moses-script-dir/recaser/train-truecaser.perl 379 | 380 | ### training data 381 | # raw input needs to be still tokenized, 382 | # also also tokenized input may be specified 383 | # 384 | # tokenized-stem = $working-dir/data/ep+nc 385 | 386 | ### trained model 387 | # 388 | #truecase-model = 389 | 390 | ############################################################ 391 | ## EVALUATION: translating a test set using the tuned system 392 | 393 | [EVALUATION] 394 | 395 | ### number of jobs (if parallel execution of testing) 396 | # 397 | jobs = 10 398 | 399 | filter-settings = " " 400 | 401 | 402 | ### prepare system output for scoring 403 | # this may include detokenization and wrapping output in sgm 404 | # (needed for nist-bleu, ter, meteor) 405 | # 406 | #recaser = $moses-script-dir/recaser/recase.perl 407 | wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension" 408 | # output-sgm = 409 | 410 | ### should output be scored case-sensitive (default: no)? 
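# (note: independently of this switch, the 'nist-bleu-c' entry below already
# reports a case-sensitive BLEU score by calling mteval-v13a.pl with -c)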
411 | # 412 | # case-sensitive = yes 413 | 414 | ### BLEU 415 | # 416 | nist-bleu = $moses-script-dir/generic/mteval-v13a.pl 417 | nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c" 418 | # multi-bleu = $edinburgh-script-dir/multi-bleu.perl 419 | # ibm-bleu = 420 | 421 | ### TER: translation error rate (BBN metric) based on edit distance 422 | # 423 | # ter = $edinburgh-script-dir/tercom_v6a.pl 424 | 425 | ### METEOR: gives credit to stem / worknet synonym matches 426 | # 427 | # meteor = 428 | 429 | ### Analysis: carry out various forms of analysis on the output 430 | # 431 | analysis = $moses-script-dir/ems/support/analysis.perl 432 | #analyze-coverage = yes 433 | report-segmentation = yes 434 | 435 | 436 | [EVALUATION:newstest2013] 437 | decoder-settings = "-feature-overwrite 'TranslationModel0 table-limit=100' -threads 8 -max-chart-span 50 -rule-limit 100" 438 | input-sgm = $toy-data/newstest2013-src.$input-extension.sgm 439 | wrapping-frame = $input-sgm 440 | reference-sgm = $toy-data/newstest2013-ref.$output-extension.sgm 441 | 442 | [REPORTING] 443 | 444 | ### what to do with result (default: store in file evaluation/report) 445 | # 446 | # email = pkoehn@inf.ed.ac.uk 447 | 448 | -------------------------------------------------------------------------------- /example/toy_example_2015_5.config: -------------------------------------------------------------------------------- 1 | ################################################ 2 | ### CONFIGURATION FILE FOR AN SMT EXPERIMENT ### 3 | ################################################ 4 | 5 | [GENERAL] 6 | 7 | ###### you need to set these paths to match your environemnt 8 | ###### 9 | 10 | moses-src-dir = /home/rsennrich/tools/mosesdecoder 11 | wmt2014-scripts = /home/rsennrich/smtworkspace/wmt2014-scripts 12 | parzu-path = /home/rsennrich/ParZu # https://github.com/rsennrich/ParZu 13 | zmorge-model = /home/rsennrich/zmorge/zmorge-20141224-smor_newlemma.a #get this (or a newer version) from http://kitt.ifi.uzh.ch/kitt/zmorge/ 14 | srilm-dir = /home/rsennrich/tools/srilm/bin/i686-m64/ 15 | external-bin-dir = ~/bin 16 | nplm-dir = /home/rsennrich/tools/nplm-github/ 17 | maltparser = /home/rsennrich/tools/maltparser-1.8.1/maltparser-1.8.1.jar 18 | corenlp = /home/rsennrich/tools/stanford-corenlp-full-2014-10-31 19 | 20 | ###### no further changes should be required to run the toy example 21 | ###### (but feel free to experiment with different settings, or change the training/test data) 22 | 23 | moses-script-dir = $moses-src-dir/scripts 24 | moses-bin-dir = $moses-src-dir/bin 25 | toy-data = $wmt2014-scripts/example/data 26 | working-dir = $wmt2014-scripts/example/working-dir 27 | decoder = $moses-src-dir/bin/moses 28 | 29 | input-tokenizer = "$moses-script-dir/tokenizer/normalize-punctuation.perl $input-extension | $moses-script-dir/tokenizer/tokenizer.perl -l $input-extension -penn" 30 | output-tokenizer = "$moses-script-dir/tokenizer/normalize-punctuation.perl $output-extension | $moses-script-dir/tokenizer/tokenizer.perl -l $output-extension" 31 | input-truecaser = $moses-script-dir/recaser/truecase.perl 32 | output-truecaser = $moses-script-dir/recaser/truecase.perl 33 | detruecaser = $moses-script-dir/recaser/detruecase.perl 34 | 35 | input-parser = "$moses-script-dir/tokenizer/deescape-special-chars.perl \ 36 | | python $moses-script-dir/training/wrappers/parse-en-stanford.py --stanford $corenlp \ 37 | | java -jar $maltparser -c pproj -m proj -pp baseline -pcr head \ 38 | | python 
$moses-script-dir/training/wrappers/conll2mosesxml.py" 39 | 40 | # parsing pipeline used for WMT 2014 41 | output-parser = "$moses-script-dir/tokenizer/deescape-special-chars.perl | $parzu-path/parzu -i tokenized_lines --projective | $wmt2014-scripts/enrich_labelset.py --wmt15 | $moses-script-dir/training/wrappers/conll2mosesxml.py" 42 | 43 | # also parse tuning/evaluation reference files 44 | mock-output-parser-references = $output-parser 45 | mock-output-parser-lm = $output-parser 46 | 47 | # SAMT relaxation for soft source-syntactic constraints 48 | input-parse-relaxer = "$moses-src-dir/bin/relax-parse --SAMT 2" 49 | 50 | # head binarization 51 | output-parse-relaxer = "$wmt2014-scripts/emnlp2015/binarize.py head" 52 | 53 | inputtype = 3 54 | 55 | # hybrid compound splitting (described in Sennrich, Williams and Huck, 2015) 56 | output-splitter = "$wmt2014-scripts/hybrid_compound_splitter.py -smor $zmorge-model -write-filler -no-truecase -q -syntax -dependency -fewest" 57 | 58 | # sed instructions unsplit the split compunds from output-splitter 59 | detokenizer = "$moses-script-dir/tokenizer/detokenizer.perl -l $output-extension | sed -r 's/ \@(\S*?)\@ /\1/g' | sed -r 's/\@\@ //g'" 60 | 61 | input-extension = en 62 | output-extension = de 63 | pair-extension = de-en 64 | 65 | generic-parallelizer = $moses-script-dir/ems/support/generic-multicore-parallelizer.perl 66 | 67 | jobs = 10 68 | 69 | ################################################################# 70 | # PARALLEL CORPUS PREPARATION: 71 | # create a tokenized, sentence-aligned corpus, ready for training 72 | 73 | [CORPUS] 74 | 75 | cores = 10 76 | 77 | ### tools to use to prepare the data 78 | # 79 | #tokenizer = 80 | #lowercaser = 81 | 82 | ### long sentences are filtered out, since they slow down GIZA++ 83 | # and are a less reliable source of data. set here the maximum 84 | # length of a sentence 85 | # 86 | max-sentence-length = 80 87 | 88 | ### GIZA++ does not allow sentence pairs of highly uneven length. 
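# (worked example for the -ratio 3 setting below: a pair with 10 source tokens
# and 35 target tokens is discarded, since 35/10 > 3)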
89 | # since uneven sentence length is an indicator of a misalignment, 90 | # we set a maximum ratio of 3 (this also gives us room for compoudn splitting) 91 | # 92 | cleaner = "$moses-script-dir/training/clean-corpus-n.perl -ratio 3" 93 | 94 | [CORPUS:parallelA] 95 | raw-stem = $toy-data/parallelA.$pair-extension 96 | 97 | [CORPUS:parallelB] 98 | raw-stem = $toy-data/parallelB.$pair-extension 99 | 100 | [CORPUS:parallelC] 101 | # if you do your own parsing (or wanna re-use other data, like http://statmt.org/rsennrich/parsed_wmt/ ), 102 | # you can add parsed corpora to your system like this 103 | clean-parsed-stem = $toy-data/parallelC.$pair-extension.parsed_both 104 | 105 | ################################################################# 106 | # LANGUAGE MODEL TRAINING 107 | 108 | [LM] 109 | 110 | cores = 10 111 | 112 | ### tool to be used for language model training 113 | # for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh) 114 | # 115 | lm-training = $srilm-dir/ngram-count 116 | settings = "-interpolate -kndiscount -unk" 117 | order = 5 118 | 119 | ### script to use for binary table format 120 | # (default: no binarization) 121 | # 122 | #lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm 123 | 124 | # kenlm, also set type to 8 125 | #lm-binarizer = $moses-src-dir/kenlm/build_binary 126 | #type = 8 127 | 128 | ### script to create quantized language model format 129 | # (default: no quantization) 130 | # 131 | #lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm 132 | 133 | ### tools to use to prepare the data 134 | # 135 | #tokenizer = 136 | #lowercaser = 137 | 138 | ### each language model to be used has its own section here 139 | 140 | ### if corpus preparation should be skipped, 141 | # point to the prepared language model 142 | # 143 | #lm = 144 | 145 | # internal settings for RDLM 146 | # IMPORTANT: update rdlm-working-dir when training a new RDLM to avoid overwriting old files 147 | rdlm-left-context = 3 148 | rdlm-right-context = 3 149 | rdlm-up-context = 2 150 | rdlm-working-dir = 1 151 | 152 | [LM:parallelA] 153 | raw-corpus = $toy-data/parallelA.$pair-extension.$output-extension 154 | 155 | [LM:parallelB] 156 | raw-corpus = $toy-data/parallelB.$pair-extension.$output-extension 157 | 158 | [LM:parallelC] 159 | # if you do your own parsing (or wanna re-use other data, like http://statmt.org/rsennrich/parsed_wmt/ ), 160 | # you can add parsed corpora to your system like this 161 | mock-parsed-corpus = $toy-data/parallelC.$pair-extension.parsed_both.$output-extension 162 | 163 | [LM:monolingualA] 164 | raw-corpus = $toy-data/monolingualA.$output-extension 165 | 166 | ### Relational Dependency LM trained on concatenation of other training corpora [head model] 167 | [LM:RDLM] 168 | 169 | ### define which corpora to concatenate 170 | # we use -split here because we do not want to strip away syntactic markup 171 | # 172 | concatenate-files-split = [LM:{parallelA,parallelB,monolingualA}:split-corpus] 173 | 174 | ### tell INTERPOLATED-LM to ignore this model 175 | # 176 | exclude-from-interpolation = true 177 | 178 | ### syntactic = true indicate that custom-training should take file with syntactic markup as input (requires mock-output-parser-lm) 179 | # 180 | syntactic = true 181 | 182 | ### training command 183 | # 184 | custom-training = "mkdir -p $working-dir/lm/rdlm/rdlm_head$rdlm-working-dir && $moses-script-dir/training/rdlm/train_rdlm.py \ 185 | --nplm-home $nplm-dir --working-dir $working-dir/lm/rdlm/rdlm_head$rdlm-working-dir \ 186 | --output-dir 
$working-dir/lm/rdlm/rdlm_head$rdlm-working-dir --output-model rdlm_head \ 187 | --mode head --output-vocab-size 500000 --noise 100 --left-context-size $rdlm-left-context \ 188 | --right-context-size $rdlm-right-context --up-context-size $rdlm-up-context \ 189 | --epochs 10 --mmap" 190 | 191 | # we train two RDLMs, but only need one entry in the config, so we leave this empty 192 | config-feature-line = " " 193 | config-weight-line = " " 194 | 195 | 196 | ### Relational Dependency LM trained on concatenation of other training corpora [label model] 197 | [LM:RDLM2] 198 | 199 | ### define which corpora to concatenate 200 | # we use -split here because we do not want to strip away syntactic markup 201 | # 202 | split-corpus = [LM:RDLM:split-corpus] 203 | 204 | ### tell INTERPOLATED-LM to ignore this model 205 | # 206 | exclude-from-interpolation = true 207 | 208 | ### syntactic = true indicate that custom-training should take file with syntactic markup as input (requires mock-output-parser-lm) 209 | # 210 | syntactic = true 211 | 212 | ### training command 213 | # 214 | custom-training = "mkdir -p $working-dir/lm/rdlm/rdlm_label$rdlm-working-dir && $moses-script-dir/training/rdlm/train_rdlm.py \ 215 | --nplm-home $nplm-dir --working-dir $working-dir/lm/rdlm/rdlm_label$rdlm-working-dir \ 216 | --output-dir $working-dir/lm/rdlm/rdlm_label$rdlm-working-dir --output-model rdlm_label \ 217 | --mode label --output-vocab-size 75 --noise 50 --left-context-size $rdlm-left-context \ 218 | --right-context-size $rdlm-right-context --up-context-size $rdlm-up-context \ 219 | --epochs 10 --mmap" 220 | 221 | ### manually specify feature and weight lines for moses.ini (required for custom-training) 222 | # 223 | config-feature-line = "RDLM path_head_lm=$working-dir/lm/rdlm/rdlm_head$rdlm-working-dir/rdlm_head.model.nplm path_label_lm=$working-dir/lm/rdlm/rdlm_label$rdlm-working-dir/rdlm_label.model.nplm backoff=true premultiply=true context_left=$rdlm-left-context context_right=$rdlm-right-context context_up=$rdlm-up-context binarized=full" 224 | config-weight-line = "RDLM0= 0.1 0.1" 225 | 226 | 227 | ### 5-gram Neural Network LM 228 | [LM:NPLM] 229 | 230 | ### define which corpora to concatenate 231 | # we use -split here because we do not want to strip away syntactic markup 232 | # 233 | concatenate-files = [LM:{parallelA,parallelB,monolingualA}:stripped-corpus] 234 | 235 | ### tell INTERPOLATED-LM to ignore this model 236 | # 237 | exclude-from-interpolation = true 238 | 239 | # internal settings for NPLM 240 | # IMPORTANT: update nplm-working-dir when training a new NPLM to avoid overwriting old files 241 | order = 5 242 | nplm-working-dir = 1 243 | 244 | ### training command 245 | # 246 | custom-training = "mkdir -p $working-dir/lm/nplm/nplm$nplm-working-dir && $moses-script-dir/training/train-neurallm.py \ 247 | --nplm-home $nplm-dir --working-dir $working-dir/lm/nplm/nplm$nplm-working-dir \ 248 | --output-dir $working-dir/lm/nplm/nplm$nplm-working-dir --output-model nplm \ 249 | --vocab-size 500000 --noise 100 --order $order \ 250 | --epochs 10" 251 | 252 | # we train two RDLMs, but only need one entry in the config, so we leave this empty 253 | config-feature-line = "NeuralLM path=$working-dir/lm/nplm/nplm$nplm-working-dir/nplm.model.nplm order=$order" 254 | config-weight-line = "NeuralLM0= 0.1" 255 | 256 | ################################################################# 257 | # INTERPOLATING LANGUAGE MODELS 258 | 259 | [INTERPOLATED-LM] 260 | 261 | # if multiple language models are used, these may 
be combined 262 | # by optimizing perplexity on a tuning set 263 | # see, for instance [Koehn and Schwenk, IJCNLP 2008] 264 | 265 | ### script to interpolate language models 266 | # if commented out, no interpolation is performed 267 | # 268 | script = $moses-script-dir/ems/support/interpolate-lm.perl 269 | 270 | ### tuning set 271 | # you may use the same set that is used for mert tuning (reference set) 272 | # 273 | raw-tuning = $toy-data/newstest2012.$output-extension 274 | 275 | ### script to use for binary table format for irstlm or kenlm 276 | # kenlm, also set type to 8 277 | lm-binarizer = $moses-src-dir/bin/build_binary 278 | type = 8 279 | 280 | ################################################################# 281 | # TRANSLATION MODEL TRAINING 282 | 283 | [TRAINING] 284 | 285 | ### training script to be used: either a legacy script or 286 | # current moses training script (default) 287 | # 288 | script = $moses-script-dir/training/train-model.perl 289 | 290 | ### general options 291 | # 292 | training-options = "-mgiza -mgiza-cpus 8 -sort-buffer-size 10G -sort-compress gzip -cores 16 -alt-direct-rule-score-2 --ghkm-tree-fragment" 293 | 294 | ### symmetrization method to obtain word alignments from giza output 295 | # (commonly used: grow-diag-final-and) 296 | # 297 | alignment-symmetrization-method = grow-diag-final-and 298 | 299 | run-giza-in-parts = 5 300 | 301 | ### if word alignment (giza symmetrization) should be skipped, 302 | # point to word alignment files 303 | # 304 | # word-alignment = 305 | 306 | ### hierarchical rule set 307 | # 308 | hierarchical-rule-set = true 309 | use-ghkm = true 310 | use-pcfg-feature = true 311 | use-unknown-word-soft-matches = true 312 | dont-tune-glue-grammar = true 313 | ghkm-source-labels = true 314 | 315 | extract-settings = "--UnknownWordMinRelFreq 0.01 --MaxNodes 40 --MaxRuleDepth 7 --MaxRuleSize 7 --AllowUnary" 316 | score-settings = " --GoodTuring --LowCountFeature --MinCountHierarchical 2 --MinScore 2:0.0001" 317 | 318 | 319 | ### if phrase extraction should be skipped, 320 | # point to stem for extract files 321 | # 322 | # extracted-phrases = 323 | 324 | ### if phrase table training should be skipped, 325 | # point to phrase translation table 326 | # 327 | # phrase-translation-table = 328 | 329 | ### if training should be skipped, 330 | # point to a configuration file that contains 331 | # pointers to all relevant model files 332 | # config = 333 | 334 | ####################################################### TUNING: finding good weights for model components 335 | 336 | [TUNING] 337 | 338 | ### instead of tuning with this setting, old weights may be recycled 339 | # specify here an old configuration file with matching weights 340 | # 341 | #weight-config = 342 | 343 | ### tuning script to be used 344 | # 345 | tuning-script = $moses-script-dir/training/mert-moses.pl 346 | tuning-settings = "-mertdir $moses-src-dir/bin --batch-mira --return-best-dev -maximum-iterations 25 --threads 16 -batch-mira-args='--sctype BLEU,HWCM'" 347 | 348 | ### specify the corpus used for tuning 349 | # it should contain 100s if not 1000s of sentences 350 | # 351 | raw-input = $toy-data/newstest2012.$input-extension 352 | # tokenized-input = 353 | # factorized-input = 354 | # input = 355 | 356 | inputtype = 3 357 | 358 | raw-reference = $toy-data/newstest2012.$output-extension 359 | # tokenized-reference = 360 | # factorized-reference = 361 | # reference = 362 | 363 | ### size of n-best list used (typically 100) 364 | # 365 | nbest = 1000 366 | 367 | ### 
ranges for weights for random initialization 368 | # if not specified, the tuning script will use generic ranges 369 | # it is not clear, if this matters 370 | # 371 | # lambda = 372 | 373 | ### additional flags for the decoder 374 | # 375 | decoder-settings = "-feature-overwrite 'TranslationModel0 table-limit=100' -threads 8 -max-chart-span 50 -rule-limit 50 -n-best-trees" 376 | 377 | ### if tuning should be skipped, specify this here 378 | # and also point to a configuration file that contains 379 | # pointers to all relevant model files 380 | # 381 | 382 | 383 | ######################################################### 384 | ## RECASER: restore case, this part only trains the model 385 | 386 | [RECASING] 387 | 388 | #decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm 389 | 390 | ### training data 391 | # raw input needs to be still tokenized, 392 | # also also tokenized input may be specified 393 | # 394 | #tokenized = [LM:europarl:tokenized-corpus] 395 | 396 | # recase-config = 397 | 398 | #lm-training = $moses-src-dir/srilm/bin/i686/ngram-count 399 | 400 | ####################################################### 401 | ## TRUECASER: train model to truecase corpora and input 402 | 403 | [TRUECASER] 404 | 405 | ### script to train truecaser models 406 | # 407 | trainer = $moses-script-dir/recaser/train-truecaser.perl 408 | 409 | ### training data 410 | # raw input needs to be still tokenized, 411 | # also also tokenized input may be specified 412 | # 413 | # tokenized-stem = $working-dir/data/ep+nc 414 | 415 | ### trained model 416 | # 417 | #truecase-model = 418 | 419 | ############################################################ 420 | ## EVALUATION: translating a test set using the tuned system 421 | 422 | [EVALUATION] 423 | 424 | ### number of jobs (if parallel execution of testing) 425 | # 426 | jobs = 10 427 | 428 | filter-settings = " " 429 | 430 | 431 | ### prepare system output for scoring 432 | # this may include detokenization and wrapping output in sgm 433 | # (needed for nist-bleu, ter, meteor) 434 | # 435 | #recaser = $moses-script-dir/recaser/recase.perl 436 | wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension" 437 | # output-sgm = 438 | 439 | ### should output be scored case-sensitive (default: no)? 
440 | # 441 | # case-sensitive = yes 442 | 443 | ### BLEU 444 | # 445 | nist-bleu = $moses-script-dir/generic/mteval-v13a.pl 446 | nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c" 447 | # multi-bleu = $edinburgh-script-dir/multi-bleu.perl 448 | # ibm-bleu = 449 | 450 | ### TER: translation error rate (BBN metric) based on edit distance 451 | # 452 | # ter = $edinburgh-script-dir/tercom_v6a.pl 453 | 454 | ### METEOR: gives credit to stem / worknet synonym matches 455 | # 456 | # meteor = 457 | 458 | ### Analysis: carry out various forms of analysis on the output 459 | # 460 | analysis = $moses-script-dir/ems/support/analysis.perl 461 | #analyze-coverage = yes 462 | report-segmentation = yes 463 | 464 | 465 | [EVALUATION:newstest2013] 466 | decoder-settings = "-feature-overwrite 'TranslationModel0 table-limit=100' -threads 8 -max-chart-span 50 -rule-limit 100" 467 | input-sgm = $toy-data/newstest2013-src.$input-extension.sgm 468 | wrapping-frame = $input-sgm 469 | reference-sgm = $toy-data/newstest2013-ref.$output-extension.sgm 470 | 471 | [REPORTING] 472 | 473 | ### what to do with result (default: store in file evaluation/report) 474 | # 475 | # email = pkoehn@inf.ed.ac.uk 476 | 477 | -------------------------------------------------------------------------------- /example/toy_example_2015_6.config: -------------------------------------------------------------------------------- 1 | ################################################ 2 | ### CONFIGURATION FILE FOR AN SMT EXPERIMENT ### 3 | ################################################ 4 | 5 | [GENERAL] 6 | 7 | ###### you need to set these paths to match your environemnt 8 | ###### 9 | 10 | moses-src-dir = /home/rsennrich/tools/mosesdecoder 11 | wmt2014-scripts = /home/rsennrich/smtworkspace/wmt2014-scripts 12 | parzu-path = /home/rsennrich/ParZu # https://github.com/rsennrich/ParZu 13 | zmorge-model = /home/rsennrich/zmorge/zmorge-20141224-smor_newlemma.a #get this (or a newer version) from http://kitt.ifi.uzh.ch/kitt/zmorge/ 14 | srilm-dir = /home/rsennrich/tools/srilm/bin/i686-m64/ 15 | external-bin-dir = ~/bin 16 | nplm-dir = /home/rsennrich/tools/nplm-github/ 17 | maltparser = /home/rsennrich/tools/maltparser-1.8.1/maltparser-1.8.1.jar 18 | corenlp = /home/rsennrich/tools/stanford-corenlp-full-2014-10-31 19 | 20 | # IMPORTANT: update run-id to avoid decoder output (Ttree file) being overwritten 21 | run-id = 1 22 | 23 | ###### no further changes should be required to run the toy example 24 | ###### (but feel free to experiment with different settings, or change the training/test data) 25 | 26 | moses-script-dir = $moses-src-dir/scripts 27 | moses-bin-dir = $moses-src-dir/bin 28 | toy-data = $wmt2014-scripts/example/data 29 | working-dir = $wmt2014-scripts/example/working-dir 30 | decoder = $moses-src-dir/bin/moses 31 | 32 | input-tokenizer = "$moses-script-dir/tokenizer/normalize-punctuation.perl $input-extension | $moses-script-dir/tokenizer/tokenizer.perl -l $input-extension -penn" 33 | output-tokenizer = "$moses-script-dir/tokenizer/normalize-punctuation.perl $output-extension | $moses-script-dir/tokenizer/tokenizer.perl -l $output-extension" 34 | input-truecaser = $moses-script-dir/recaser/truecase.perl 35 | output-truecaser = $moses-script-dir/recaser/truecase.perl 36 | detruecaser = $moses-script-dir/recaser/detruecase.perl 37 | 38 | input-parser = "$moses-script-dir/tokenizer/deescape-special-chars.perl \ 39 | | python $moses-script-dir/training/wrappers/parse-en-stanford.py --stanford $corenlp --java 
/home/rsennrich/tools/openjdk/OBF_DROP_DIR/openjdk8/j2re-image/bin/java \ 40 | | java -jar $maltparser -c pproj -m proj -pp baseline -pcr head \ 41 | | python $moses-script-dir/training/wrappers/conll2mosesxml.py" 42 | 43 | # parsing pipeline used for WMT 2014 44 | output-parser = "$moses-script-dir/tokenizer/deescape-special-chars.perl | $parzu-path/parzu -i tokenized_lines --projective | $wmt2014-scripts/enrich_labelset.py --wmt15 | $moses-script-dir/training/wrappers/conll2mosesxml.py" 45 | 46 | # also parse tuning/evaluation reference files 47 | mock-output-parser-references = $output-parser 48 | mock-output-parser-lm = $output-parser 49 | 50 | # SAMT relaxation for soft source-syntactic constraints 51 | input-parse-relaxer = "$moses-src-dir/bin/relax-parse --SAMT 2" 52 | 53 | # head binarization 54 | output-parse-relaxer = "$wmt2014-scripts/emnlp2015/binarize.py head" 55 | 56 | inputtype = 3 57 | 58 | # hyphen splitting on input 59 | input-splitter = "$wmt2014-scripts/emnlp2015/hyphen-splitter.py -syntax" 60 | 61 | # hybrid compound splitting and particle verb restructuring (described in Sennrich and Haddow, 2015) 62 | output-splitter = "$wmt2014-scripts/emnlp2015/split_and_restructure.sh $wmt2014-scripts $zmorge-model" 63 | 64 | # sed instructions unsplit the split compunds from output-splitter 65 | detokenizer = "$moses-script-dir/tokenizer/detokenizer.perl -l $output-extension | sed -r 's/ \@(\S*?)\@ /\1/g' | sed -r 's/\@\@ //g'" 66 | 67 | input-extension = en 68 | output-extension = de 69 | pair-extension = de-en 70 | 71 | generic-parallelizer = $moses-script-dir/ems/support/generic-multicore-parallelizer.perl 72 | 73 | jobs = 10 74 | 75 | ################################################################# 76 | # PARALLEL CORPUS PREPARATION: 77 | # create a tokenized, sentence-aligned corpus, ready for training 78 | 79 | [CORPUS] 80 | 81 | cores = 10 82 | 83 | ### tools to use to prepare the data 84 | # 85 | #tokenizer = 86 | #lowercaser = 87 | 88 | ### long sentences are filtered out, since they slow down GIZA++ 89 | # and are a less reliable source of data. set here the maximum 90 | # length of a sentence 91 | # 92 | max-sentence-length = 80 93 | 94 | ### GIZA++ does not allow sentence pairs of highly uneven length. 
95 | # since uneven sentence length is an indicator of a misalignment, 96 | # we set a maximum ratio of 3 (this also gives us room for compoudn splitting) 97 | # 98 | cleaner = "$moses-script-dir/training/clean-corpus-n.perl -ratio 3" 99 | 100 | [CORPUS:parallelA] 101 | raw-stem = $toy-data/parallelA.$pair-extension 102 | 103 | [CORPUS:parallelB] 104 | raw-stem = $toy-data/parallelB.$pair-extension 105 | 106 | [CORPUS:parallelC] 107 | # if you do your own parsing (or wanna re-use other data, like http://statmt.org/rsennrich/parsed_wmt/ ), 108 | # you can add parsed corpora to your system like this 109 | clean-parsed-stem = $toy-data/parallelC.$pair-extension.parsed_both 110 | 111 | ################################################################# 112 | # LANGUAGE MODEL TRAINING 113 | 114 | [LM] 115 | 116 | cores = 10 117 | 118 | ### tool to be used for language model training 119 | # for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh) 120 | # 121 | lm-training = $srilm-dir/ngram-count 122 | settings = "-interpolate -kndiscount -unk" 123 | order = 5 124 | 125 | ### script to use for binary table format 126 | # (default: no binarization) 127 | # 128 | #lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm 129 | 130 | # kenlm, also set type to 8 131 | #lm-binarizer = $moses-src-dir/kenlm/build_binary 132 | #type = 8 133 | 134 | ### script to create quantized language model format 135 | # (default: no quantization) 136 | # 137 | #lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm 138 | 139 | ### tools to use to prepare the data 140 | # 141 | #tokenizer = 142 | #lowercaser = 143 | 144 | ### each language model to be used has its own section here 145 | 146 | ### if corpus preparation should be skipped, 147 | # point to the prepared language model 148 | # 149 | #lm = 150 | 151 | # internal settings for RDLM 152 | # IMPORTANT: update rdlm-working-dir when training a new RDLM to avoid overwriting old files 153 | rdlm-left-context = 3 154 | rdlm-right-context = 3 155 | rdlm-up-context = 2 156 | rdlm-working-dir = 1 157 | 158 | [LM:parallelA] 159 | raw-corpus = $toy-data/parallelA.$pair-extension.$output-extension 160 | 161 | [LM:parallelB] 162 | raw-corpus = $toy-data/parallelB.$pair-extension.$output-extension 163 | 164 | [LM:parallelC] 165 | # if you do your own parsing (or wanna re-use other data, like http://statmt.org/rsennrich/parsed_wmt/ ), 166 | # you can add parsed corpora to your system like this 167 | mock-parsed-corpus = $toy-data/parallelC.$pair-extension.parsed_both.$output-extension 168 | 169 | [LM:monolingualA] 170 | raw-corpus = $toy-data/monolingualA.$output-extension 171 | 172 | ### Relational Dependency LM trained on concatenation of other training corpora [head model] 173 | [LM:RDLM] 174 | 175 | ### define which corpora to concatenate 176 | # we use -split here because we do not want to strip away syntactic markup 177 | # 178 | concatenate-files-split = [LM:{parallelA,parallelB,monolingualA}:split-corpus] 179 | 180 | ### tell INTERPOLATED-LM to ignore this model 181 | # 182 | exclude-from-interpolation = true 183 | 184 | ### syntactic = true indicate that custom-training should take file with syntactic markup as input (requires mock-output-parser-lm) 185 | # 186 | syntactic = true 187 | 188 | ### training command 189 | # 190 | custom-training = "mkdir -p $working-dir/lm/rdlm/rdlm_head$rdlm-working-dir && $moses-script-dir/training/rdlm/train_rdlm.py \ 191 | --nplm-home $nplm-dir --working-dir $working-dir/lm/rdlm/rdlm_head$rdlm-working-dir \ 192 | --output-dir 
$working-dir/lm/rdlm/rdlm_head$rdlm-working-dir --output-model rdlm_head \ 193 | --mode head --output-vocab-size 500000 --noise 100 --left-context-size $rdlm-left-context \ 194 | --right-context-size $rdlm-right-context --up-context-size $rdlm-up-context \ 195 | --epochs 10 --mmap" 196 | 197 | # we train two RDLMs, but only need one entry in the config, so we leave this empty 198 | config-feature-line = " " 199 | config-weight-line = " " 200 | 201 | 202 | ### Relational Dependency LM trained on concatenation of other training corpora [label model] 203 | [LM:RDLM2] 204 | 205 | ### define which corpora to concatenate 206 | # we use -split here because we do not want to strip away syntactic markup 207 | # 208 | split-corpus = [LM:RDLM:split-corpus] 209 | 210 | ### tell INTERPOLATED-LM to ignore this model 211 | # 212 | exclude-from-interpolation = true 213 | 214 | ### syntactic = true indicate that custom-training should take file with syntactic markup as input (requires mock-output-parser-lm) 215 | # 216 | syntactic = true 217 | 218 | ### training command 219 | # 220 | custom-training = "mkdir -p $working-dir/lm/rdlm/rdlm_label$rdlm-working-dir && $moses-script-dir/training/rdlm/train_rdlm.py \ 221 | --nplm-home $nplm-dir --working-dir $working-dir/lm/rdlm/rdlm_label$rdlm-working-dir \ 222 | --output-dir $working-dir/lm/rdlm/rdlm_label$rdlm-working-dir --output-model rdlm_label \ 223 | --mode label --output-vocab-size 75 --noise 50 --left-context-size $rdlm-left-context \ 224 | --right-context-size $rdlm-right-context --up-context-size $rdlm-up-context \ 225 | --epochs 10 --mmap" 226 | 227 | ### manually specify feature and weight lines for moses.ini (required for custom-training) 228 | # 229 | config-feature-line = "RDLM path_head_lm=$working-dir/lm/rdlm/rdlm_head$rdlm-working-dir/rdlm_head.model.nplm path_label_lm=$working-dir/lm/rdlm/rdlm_label$rdlm-working-dir/rdlm_label.model.nplm backoff=true premultiply=true context_left=$rdlm-left-context context_right=$rdlm-right-context context_up=$rdlm-up-context binarized=full" 230 | config-weight-line = "RDLM0= 0.1 0.1" 231 | 232 | 233 | ### 5-gram Neural Network LM 234 | [LM:NPLM] 235 | 236 | ### define which corpora to concatenate 237 | # we use -split here because we do not want to strip away syntactic markup 238 | # 239 | concatenate-files = [LM:{parallelA,parallelB,monolingualA}:stripped-corpus] 240 | 241 | ### tell INTERPOLATED-LM to ignore this model 242 | # 243 | exclude-from-interpolation = true 244 | 245 | # internal settings for NPLM 246 | # IMPORTANT: update nplm-working-dir when training a new NPLM to avoid overwriting old files 247 | order = 5 248 | nplm-working-dir = 1 249 | 250 | ### training command 251 | # 252 | custom-training = "mkdir -p $working-dir/lm/nplm/nplm$nplm-working-dir && $moses-script-dir/training/train-neurallm.py \ 253 | --nplm-home $nplm-dir --working-dir $working-dir/lm/nplm/nplm$nplm-working-dir \ 254 | --output-dir $working-dir/lm/nplm/nplm$nplm-working-dir --output-model nplm \ 255 | --vocab-size 500000 --noise 100 --order $order \ 256 | --epochs 10" 257 | 258 | # we train two RDLMs, but only need one entry in the config, so we leave this empty 259 | config-feature-line = "NeuralLM path=$working-dir/lm/nplm/nplm$nplm-working-dir/nplm.model.nplm order=$order" 260 | config-weight-line = "NeuralLM0= 0.1" 261 | 262 | ################################################################# 263 | # INTERPOLATING LANGUAGE MODELS 264 | 265 | [INTERPOLATED-LM] 266 | 267 | # if multiple language models are used, these may 
be combined 268 | # by optimizing perplexity on a tuning set 269 | # see, for instance [Koehn and Schwenk, IJCNLP 2008] 270 | 271 | ### script to interpolate language models 272 | # if commented out, no interpolation is performed 273 | # 274 | script = $moses-script-dir/ems/support/interpolate-lm.perl 275 | 276 | ### tuning set 277 | # you may use the same set that is used for mert tuning (reference set) 278 | # 279 | raw-tuning = $toy-data/newstest2012.$output-extension 280 | 281 | ### script to use for binary table format for irstlm or kenlm 282 | # kenlm, also set type to 8 283 | lm-binarizer = $moses-src-dir/bin/build_binary 284 | type = 8 285 | 286 | ################################################################# 287 | # TRANSLATION MODEL TRAINING 288 | 289 | [TRAINING] 290 | 291 | ### training script to be used: either a legacy script or 292 | # current moses training script (default) 293 | # 294 | script = $moses-script-dir/training/train-model.perl 295 | 296 | ### general options 297 | # 298 | training-options = "-mgiza -mgiza-cpus 8 -sort-buffer-size 10G -sort-compress gzip -cores 16 -alt-direct-rule-score-2 --ghkm-tree-fragment" 299 | 300 | ### symmetrization method to obtain word alignments from giza output 301 | # (commonly used: grow-diag-final-and) 302 | # 303 | alignment-symmetrization-method = grow-diag-final-and 304 | 305 | run-giza-in-parts = 5 306 | 307 | ### if word alignment (giza symmetrization) should be skipped, 308 | # point to word alignment files 309 | # 310 | # word-alignment = 311 | 312 | ### hierarchical rule set 313 | # 314 | hierarchical-rule-set = true 315 | use-ghkm = true 316 | use-pcfg-feature = true 317 | use-unknown-word-soft-matches = true 318 | dont-tune-glue-grammar = true 319 | ghkm-source-labels = true 320 | 321 | extract-settings = "--UnknownWordMinRelFreq 0.01 --MaxNodes 40 --MaxRuleDepth 7 --MaxRuleSize 7 --AllowUnary" 322 | score-settings = " --GoodTuring --LowCountFeature --MinCountHierarchical 2 --MinScore 2:0.0001" 323 | 324 | 325 | ### if phrase extraction should be skipped, 326 | # point to stem for extract files 327 | # 328 | # extracted-phrases = 329 | 330 | ### if phrase table training should be skipped, 331 | # point to phrase translation table 332 | # 333 | # phrase-translation-table = 334 | 335 | ### if training should be skipped, 336 | # point to a configuration file that contains 337 | # pointers to all relevant model files 338 | # config = 339 | 340 | ####################################################### TUNING: finding good weights for model components 341 | 342 | [TUNING] 343 | 344 | ### instead of tuning with this setting, old weights may be recycled 345 | # specify here an old configuration file with matching weights 346 | # 347 | # weight-config = 348 | 349 | ### tuning script to be used 350 | # 351 | tuning-script = $moses-script-dir/training/mert-moses.pl 352 | tuning-settings = "-mertdir $moses-src-dir/bin --batch-mira --return-best-dev -maximum-iterations 25 --threads 16 -batch-mira-args='--sctype BLEU,HWCM'" 353 | 354 | ### specify the corpus used for tuning 355 | # it should contain 100s if not 1000s of sentences 356 | # 357 | raw-input = $toy-data/newstest2012.$input-extension 358 | # tokenized-input = 359 | # factorized-input = 360 | # input = 361 | 362 | inputtype = 3 363 | 364 | raw-reference = $toy-data/newstest2012.$output-extension 365 | # tokenized-reference = 366 | # factorized-reference = 367 | # reference = 368 | 369 | ### size of n-best list used (typically 100) 370 | # 371 | nbest = 1000 372 | 373 | 
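# note: the decoder-settings below pass -n-best-trees, so that the n-best list
# also contains output trees; the HWCM metric selected in tuning-settings
# presumably relies on these trees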
### ranges for weights for random initialization 374 | # if not specified, the tuning script will use generic ranges 375 | # it is not clear whether this matters 376 | # 377 | # lambda = 378 | 379 | ### additional flags for the decoder 380 | # 381 | decoder-settings = "-feature-overwrite 'TranslationModel0 table-limit=100' -threads 8 -max-chart-span 50 -rule-limit 50 -n-best-trees" 382 | 383 | ### if tuning should be skipped, specify this here 384 | # and also point to a configuration file that contains 385 | # pointers to all relevant model files 386 | # 387 | 388 | 389 | ######################################################### 390 | ## RECASER: restore case, this part only trains the model 391 | 392 | [RECASING] 393 | 394 | #decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm 395 | 396 | ### training data 397 | # raw input still needs to be tokenized; 398 | # alternatively, already tokenized input may be specified 399 | # 400 | #tokenized = [LM:europarl:tokenized-corpus] 401 | 402 | # recase-config = 403 | 404 | #lm-training = $moses-src-dir/srilm/bin/i686/ngram-count 405 | 406 | ####################################################### 407 | ## TRUECASER: train model to truecase corpora and input 408 | 409 | [TRUECASER] 410 | 411 | ### script to train truecaser models 412 | # 413 | trainer = $moses-script-dir/recaser/train-truecaser.perl 414 | 415 | ### training data 416 | # raw input still needs to be tokenized; 417 | # alternatively, already tokenized input may be specified 418 | # 419 | # tokenized-stem = $working-dir/data/ep+nc 420 | 421 | ### trained model 422 | # 423 | #truecase-model = 424 | 425 | ############################################################ 426 | ## EVALUATION: translating a test set using the tuned system 427 | 428 | [EVALUATION] 429 | 430 | ### number of jobs (if parallel execution of testing) 431 | # 432 | jobs = 10 433 | 434 | filter-settings = " " 435 | 436 | 437 | ### prepare system output for scoring 438 | # this may include detokenization and wrapping output in sgm 439 | # (needed for nist-bleu, ter, meteor) 440 | # 441 | #recaser = $moses-script-dir/recaser/recase.perl 442 | wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension" 443 | # output-sgm = 444 | 445 | ### should output be scored case-sensitive (default: no)? 
446 | # 447 | # case-sensitive = yes 448 | 449 | ### BLEU 450 | # 451 | nist-bleu = $moses-script-dir/generic/mteval-v13a.pl 452 | nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c" 453 | # multi-bleu = $edinburgh-script-dir/multi-bleu.perl 454 | # ibm-bleu = 455 | 456 | ### TER: translation error rate (BBN metric) based on edit distance 457 | # 458 | # ter = $edinburgh-script-dir/tercom_v6a.pl 459 | 460 | ### METEOR: gives credit to stem / WordNet synonym matches 461 | # 462 | # meteor = 463 | 464 | ### Analysis: carry out various forms of analysis on the output 465 | # 466 | analysis = $moses-script-dir/ems/support/analysis.perl 467 | #analyze-coverage = yes 468 | report-segmentation = yes 469 | 470 | 471 | [EVALUATION:newstest2013] 472 | decoder-settings = "-feature-overwrite 'TranslationModel0 table-limit=10' -threads 1 -max-chart-span 10 -rule-limit 10 -Ttree $working-dir/evaluation/newstest2013.output.tree.$run-id" 473 | input-sgm = $toy-data/newstest2013-src.$input-extension.sgm 474 | wrapping-frame = $input-sgm 475 | reference-sgm = $toy-data/newstest2013-ref.$output-extension.sgm 476 | 477 | # ugly hack: to post-process particle verbs, we read tree output (produced with -Ttree) instead of string output; particle verb restructuring is made part of the detruecasing step. 478 | detruecaser = "$wmt2014-scripts/emnlp2015/detruecase_ptkvz.sh $wmt2014-scripts $working-dir/evaluation/newstest2013.output.tree.$run-id | $moses-script-dir/recaser/detruecase.perl" 479 | 480 | [REPORTING] 481 | 482 | ### what to do with result (default: store in file evaluation/report) 483 | # 484 | # email = pkoehn@inf.ed.ac.uk 485 | 486 | -------------------------------------------------------------------------------- /hybrid_compound_splitter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | # This script implements hybrid compound splitting as described in 6 | # Fritzinger & Fraser 2010: How to Avoid Burning Ducks: Combining Linguistic Analysis and Corpus Statistics for German Compound Processing 7 | # the variant without a morphology tool corresponds to Koehn & Knight 2003: Empirical Methods for Compound Splitting 8 | 9 | # As SMOR morphology, I recommend the most recent version of Zmorge: zmorge-{version}-smor_newlemma.a at http://kitt.ifi.uzh.ch/kitt/zmorge/ 10 | # In hybrid mode, the script requires SFST (fst-mor). 11 | 12 | # A syntactic representation of split compounds as described in: 13 | # Rico Sennrich, Philip Williams, Matthias Huck (2015): 14 | # A tree does not make a well-formed sentence: Improving syntactic string-to-tree statistical machine translation with more linguistic knowledge. 15 | # In: Computer Speech & Language 32(1), 27-45. 
16 | # can be generated (given a corpus in the Moses XML format) with the following commands: 17 | # hybrid_compound_splitter.py -train -syntax -corpus INPUT_FILE -model MODEL_FILE 18 | # hybrid_compound_splitter.py -write-filler -no-truecase -q -syntax -smor zmorge-{version}-smor_newlemma.a -model MODEL_FILE < INPUT_FILE > OUTPUT_FILE 19 | 20 | from __future__ import division, unicode_literals 21 | import sys 22 | import os 23 | import re 24 | import pprint 25 | import json 26 | import codecs 27 | import argparse 28 | from collections import defaultdict 29 | from operator import mul 30 | 31 | from lxml import etree as ET 32 | 33 | try: 34 | import pexpect 35 | except ImportError: 36 | sys.stderr.write('Error: this script requires Pexpect >= 3.0\n') 37 | sys.exit(1) 38 | 39 | if pexpect.__version__ < 3: 40 | sys.stderr.write('Error: this script requires Pexpect >= 3.0. Version {0} found\n'.format(pexpect.__version__)) 41 | sys.exit(1) 42 | 43 | if sys.version_info >= (3, 0): 44 | from functools import reduce 45 | 46 | JUNCTURES = ['', 's', 'es', '-'] # only allow these junctures in unsupervised mode (ignored in hybrid mode) 47 | SMOR_SPLIT = ['NN', 'NE', 'ADJ'] # only split these word classes with SMOR 48 | MIN_SIZE = 4 49 | MIN_COUNT = 5 50 | MAX_COUNT = 5 51 | MAX_SPLIT_HYPOTHESES = 1000 # break if there are too many ways to split a word 52 | 53 | SMOR_ENCODING = 'UTF-8' 54 | 55 | 56 | class FstWrapper(): 57 | def __init__(self, smor_binary, smor_model): 58 | self.child = pexpect.spawnu(smor_binary + ' ' + smor_model) 59 | self.child.delaybeforesend = 0 60 | self.child.expect(["analyze> ", pexpect.EOF], timeout=600) 61 | before = self.child.before 62 | if self.child.terminated: 63 | raise RuntimeError(before) 64 | 65 | def analyse(self, word): 66 | word = word.strip() 67 | if word == "" or word == "q" or word == "\x7f": 68 | return [] 69 | self.child.sendline(word) 70 | try: 71 | self.child.expect(["analyze> ", pexpect.EOF]) 72 | except pexpect.TIMEOUT: 73 | sys.stderr.write('Warning: timeout while waiting for fst-mor\n') 74 | sys.stderr.write('String: {0}'.format(word)) 75 | return [] 76 | result = self.child.before.split("\r\n")[1:-1] 77 | if len(result) == 1 and re.match("^no result for ", result[0]): 78 | result = [] 79 | return result 80 | 81 | 82 | class SMORSplitter(object): 83 | 84 | def __init__(self, smor_model, no_truecase): 85 | 86 | self.smor = FstWrapper('fst-mor', smor_model) 87 | self.data = defaultdict(set) 88 | self.re_mainclass = re.compile(r'<\+(.*?)>') 89 | self.re_any = re.compile(r'<([^#~-]+?)>') 90 | self.re_nn = re.compile(r'<#>') 91 | self.re_morph = re.compile(r'<([#~-])>') 92 | self.re_fugenlaut = re.compile(r'<->') 93 | self.re_segment = re.compile(r'<([A-Z#~]*?)>') 94 | self.re_hyphenation = re.compile(r'\{(.+?)\}-(?:)?') 95 | self.re_last = re.compile(r'(.+?)<\+',re.UNICODE) 96 | self.no_truecase = no_truecase 97 | 98 | 99 | def convert(self, analyses): 100 | """convert SMOR output into list of morphemes""" 101 | 102 | for word, lines in analyses: 103 | cache = [] 104 | for line in lines: 105 | 106 | if line.startswith('no result'): 107 | continue 108 | 109 | if not line: 110 | continue 111 | 112 | try: 113 | pos = self.re_mainclass.search(line).group(1) 114 | except AttributeError: 115 | continue 116 | 117 | if pos == 'V' and '' in line: 118 | continue 119 | elif pos == 'PUNCT': 120 | continue 121 | 122 | #score number of morphemes; heuristic adopted from SFST 123 | segments = len(self.re_segment.findall(line)) 124 | if line.startswith(''): 125 | if 
self.no_truecase: 126 | continue 127 | else: 128 | segments -= 1 129 | elif '' in line: 130 | continue 131 | 132 | # convert markup of hyphenated words into markup of compounds (with '-' as juncture element which is lost if we split, but kept if we don't) 133 | # {ABC}-Abwehr<+NN> -> ABC<->-<#>Abwehr<+NN> 134 | line = self.re_hyphenation.sub(r'\1<->-<#>', line) 135 | 136 | main = self.re_last.search(line).group(1) 137 | parts = self.re_any.sub('',main) 138 | 139 | cache.append((word,segments,parts,pos)) 140 | 141 | self.get_best(cache) 142 | 143 | 144 | def get_best(self,cache): 145 | if cache: 146 | for best in cache: #currently, process all segmentations. possible modification: only use 'best' segmentation, i.e. the one with the fewest morphemes 147 | 148 | #only split nouns 149 | if best[3] in SMOR_SPLIT: 150 | 151 | wordform = best[0] 152 | lemma = best[2] 153 | if not '<#>' in lemma: 154 | continue 155 | if '<~>' in lemma: 156 | lemma = lemma.replace('<~>','') 157 | stem = ''.join(lemma.split('<#>')[:-1]) 158 | stem = self.re_morph.sub('',stem) 159 | 160 | # restore inflected ending from analysis 161 | try: 162 | ending = best[0].split(stem)[1] 163 | split = lemma.split('<#>')[:-1] + [ending] 164 | except: 165 | split = lemma.split('<#>') 166 | 167 | # keep inflection of ending 168 | split[-1] = self.re_morph.sub('',split[-1]) 169 | 170 | for i, item in enumerate(split): 171 | root, fuge = item, '' 172 | items = item.split('<->') 173 | 174 | if len(items) == 2: 175 | root, fuge = items 176 | elif len(items) > 2: 177 | root = ''.join(items[:-1]) 178 | fuge = items[-1] 179 | 180 | root = self.re_morph.sub('', root) 181 | split[i] = (root, fuge) 182 | 183 | self.data[best[0]].add(tuple(split)) 184 | 185 | 186 | def analyze(self, words_in): 187 | """get all new words from input line and send them to SMOR for analysis""" 188 | 189 | todo = [] 190 | 191 | for word in words_in: 192 | if not word in self.data: 193 | 194 | self.data[word] = set([((word,''),)]) 195 | todo.append(word) 196 | 197 | analyses = [(word, self.smor.analyse(word)) for word in todo] 198 | self.convert(analyses) 199 | 200 | 201 | 202 | def train_model(in_obj, out_path, syntax): 203 | 204 | freq = defaultdict(int) 205 | 206 | re_syntax_splitter = re.compile(r'((?:\s*(?:<[^<>]*>)+\s*)|(?:(?)\s+(?!<)))') 207 | 208 | for line in in_obj: 209 | if syntax and '<' in line: 210 | words = [word for word in re_syntax_splitter.split(line) if word and not word == ' ' and not word.startswith('<')] 211 | else: 212 | words = line.split() 213 | for word in words: 214 | freq[word] += 1 215 | 216 | write_model(freq, out_path) 217 | 218 | 219 | def write_model(model, file_path): 220 | 221 | if sys.version_info < (3, 0): 222 | file_obj = codecs.getwriter('UTF-8')(open(args.model, 'w')) 223 | else: 224 | file_obj = open(args.model, 'w', encoding='UTF-8') 225 | 226 | file_obj.write('# -*- coding: utf-8 -*-\n\n') 227 | file_obj.write('from __future__ import unicode_literals\n\n') 228 | file_obj.write('model = ') 229 | json.dump(model,file_obj, indent=2) 230 | file_obj.close() 231 | 232 | 233 | def generate_decompositions(splits, memory = False, write_juncture = False): 234 | 235 | if not memory: 236 | memory = [] 237 | 238 | for start in splits[-1].keys(): 239 | if start == 0: 240 | yield [splits[-1][start]] + memory 241 | else: 242 | if write_juncture: 243 | juncture, segment, new_start = splits[-1][start] 244 | new_memory = [(juncture, -1), (segment, new_start)] + memory 245 | else: 246 | new_memory = [splits[-1][start]] + memory 247 | for 
decomposition in generate_decompositions(splits[:start+1], new_memory, write_juncture = write_juncture): 248 | yield decomposition 249 | 250 | 251 | def get_unsupervised_splits(word, freq, truecase, fst_server=None, write_juncture=False, no_truecase=False): 252 | reachable = [{} for i in range(len(word)+1)] 253 | for end in range(MIN_SIZE, len(word)+1): 254 | for start in range(0, end-MIN_SIZE+1): 255 | 256 | if start and not reachable[start]: # no split ending in this position 257 | continue 258 | 259 | for juncture in JUNCTURES: 260 | 261 | if start == 0 and juncture: 262 | continue 263 | 264 | if word[start:start+len(juncture)] != juncture: 265 | continue 266 | 267 | subword_orig = word[start+len(juncture):end] 268 | subword = subword_orig.lower() 269 | if subword not in freq or freq[subword] < MIN_COUNT: 270 | continue 271 | 272 | if VERBOSE: 273 | sys.stderr.write('\tmatching word {0} .. {1} ({2}){3} {4}\n'.format(start, end, juncture, subword, freq[subword])) 274 | 275 | if subword in truecase: 276 | subword = truecase[subword] 277 | 278 | if no_truecase: 279 | subword_out = subword_orig 280 | else: 281 | subword_out = subword 282 | 283 | if not start in reachable[end] or freq[subword] > reachable[end][start][1]: 284 | if write_juncture and not start == 0: 285 | juncture_out = '@' + juncture + '@' 286 | reachable[end][start] = (juncture_out, subword_out, freq[subword]) 287 | else: 288 | reachable[end][start] = (subword_out, freq[subword]) 289 | 290 | #no split found 291 | if not reachable[-1]: 292 | return 293 | 294 | for decomposition in generate_decompositions(reachable, write_juncture = write_juncture): 295 | yield decomposition 296 | 297 | def join_compounds(compounds, freq, truecase, write_junctures, no_truecase, memory = False): 298 | 299 | if not memory: 300 | memory = [] 301 | 302 | for j in range(1, len(compounds)+1): 303 | 304 | if j == 1: 305 | subword_orig = compounds[0][0] 306 | subword = subword_orig.lower() 307 | else: 308 | prefix = ''.join([''.join(f) for f in compounds[:j-1]]) 309 | suffix = compounds[j-1][0] 310 | subword_orig = prefix + suffix 311 | subword = subword_orig.lower() 312 | 313 | if subword not in freq or freq[subword] < MIN_COUNT: 314 | continue 315 | 316 | if VERBOSE: 317 | sys.stderr.write('\tmatching word {0} {1}\n'.format(subword, freq[subword])) 318 | 319 | if no_truecase: 320 | subword_out = subword_orig 321 | else: 322 | if subword in truecase: 323 | subword = truecase[subword] 324 | subword_out = subword 325 | 326 | new_element = [(subword_out, freq[subword])] 327 | 328 | if j == len(compounds): 329 | yield memory + new_element 330 | else: 331 | if write_junctures: 332 | new_element.append(('@' + compounds[j-1][1] + '@', -1)) 333 | for compound in join_compounds(compounds[j:], freq, truecase, write_junctures, no_truecase, memory + new_element): 334 | yield compound 335 | 336 | 337 | def get_FST_splits(word, freq, truecase, fst_server, write_junctures, no_truecase): 338 | 339 | for split in fst_server.data[word]: 340 | for compound in join_compounds(split, freq, truecase, write_junctures, no_truecase): 341 | yield compound 342 | 343 | 344 | def create_compound_xml(element, wordlist, write_junctures, merge_junctures, dependency, initial=False): 345 | 346 | # separate last segment, then recursively label remainder as compound modifier 347 | if initial: 348 | juncture = '' 349 | dep = ET.Element('tree') 350 | dep.set('label', 'SEGMENT') 351 | dep.text = wordlist[-1] 352 | remainder = wordlist[:-1] 353 | if remainder: 354 | 
create_compound_xml(element, remainder, write_junctures, merge_junctures, dependency) 355 | element.append(dep) 356 | return 357 | 358 | if write_junctures or merge_junctures: 359 | juncture = wordlist[-1] 360 | word = wordlist[-2] 361 | remainder = wordlist[:-2] 362 | else: 363 | word = wordlist[-1] 364 | remainder = wordlist[:-1] 365 | 366 | head = ET.Element('tree') 367 | head.set('label', 'comp_mod') 368 | element.append(head) 369 | 370 | if merge_junctures: 371 | dep1 = ET.Element('tree') 372 | dep1.set('label', 'SEGMENT+JUNC') 373 | dep1.text = word + juncture[1:-1] + '@@' 374 | else: 375 | dep1 = ET.Element('tree') 376 | dep1.set('label', 'SEGMENT') 377 | dep1.text = word 378 | 379 | if remainder: 380 | create_compound_xml(head, remainder, write_junctures, merge_junctures, dependency) 381 | 382 | head.append(dep1) 383 | 384 | if write_junctures: 385 | dep2 = ET.Element('tree') 386 | dep2.set('label', 'JUNC') 387 | dep2.text = juncture 388 | if dependency: 389 | dep3 = ET.Element('tree') 390 | dep3.set('label', 'junc') 391 | dep3.append(dep2) 392 | head.append(dep3) 393 | else: 394 | head.append(dep2) 395 | 396 | 397 | def apply_model(file_obj, freq, fst_server, split_function, write_junctures, merge_junctures, syntax, no_truecase, dependency): 398 | 399 | re_syntax_splitter = re.compile(r'((?:\s*(?:<[^<>]*>)+\s*)|(?:(?)\s+(?!<)))') 400 | truecase = {} 401 | 402 | for word in list(freq): 403 | word_lc = word.lower() 404 | if word_lc in freq and freq[word_lc] > freq[word]: 405 | continue 406 | 407 | freq[word_lc] = freq[word] 408 | if word_lc != word and not no_truecase: 409 | truecase[word_lc] = word 410 | 411 | for line in file_obj: 412 | 413 | # only do syntactic processing if option syntax is used and we see '<' in line 414 | write_syntax = syntax 415 | if write_syntax and not '<' in line: 416 | write_syntax = False 417 | 418 | if write_syntax: 419 | words_in = re_syntax_splitter.split(line) 420 | words_in_clean = [word for word in words_in if word and not word.startswith('<') and not word == ' '] 421 | else: 422 | words_in = line.split() 423 | words_in_clean = words_in 424 | 425 | if fst_server: 426 | fst_server.analyze(words_in_clean) 427 | 428 | words = [] 429 | for word in words_in: 430 | 431 | if write_syntax: 432 | if not word: 433 | continue 434 | if word == ' ' or word.startswith('<'): 435 | words.append(word) 436 | continue 437 | 438 | word_lc = word.lower() 439 | if VERBOSE: 440 | sys.stderr.write('considering {0} ({1})...\n'.format(word, word_lc)) 441 | 442 | if word_lc in freq and freq[word_lc] >= MAX_COUNT: 443 | words.append(word) 444 | if VERBOSE: 445 | sys.stderr.write('\tfrequent word ({0}>{1}), skipping\n'.format(freq[word_lc], MAX_COUNT)) 446 | continue 447 | 448 | best_split = word 449 | best_score = 1 450 | 451 | for i, decomposition in enumerate(split_function(word, freq, truecase, fst_server, write_junctures or merge_junctures, no_truecase)): 452 | 453 | if i >= MAX_SPLIT_HYPOTHESES: 454 | break 455 | 456 | split_list, scores = zip(*decomposition) 457 | scores = [score for score in scores if score != -1] #ignoring 458 | total = reduce(mul, scores) 459 | score = total ** (1/len(scores)) 460 | if FEWEST: 461 | score = (-len(scores),score) 462 | split = ' '.join(split_list) 463 | 464 | if VERBOSE: 465 | sys.stderr.write('\t split: {0} ({1} ** 1/{2}) = {3}\n'.format(split, total, len(scores), score)) 466 | 467 | if score > best_score: 468 | best_split = split 469 | best_score = score 470 | 471 | if write_syntax and len(best_split.split()) > 1: 472 | head = 
ET.Element('x') 473 | create_compound_xml(head, best_split.split(), write_junctures, merge_junctures, dependency, initial=True) 474 | best_split = ET.tostring(head, encoding="UTF-8")[3:-4].decode("UTF-8") 475 | if dependency: 476 | words[-1] = words[-1].rsplit('<',1)[0] 477 | best_split = best_split.rsplit('<',1)[0] 478 | 479 | if merge_junctures: 480 | merged_best_split = [] 481 | for item in best_split.split(): 482 | if merged_best_split and len(item) > 1 and item[0] == item[-1] == "@": 483 | merged_best_split[-1] += item[1:-1] + "@@" 484 | else: 485 | merged_best_split.append(item) 486 | best_split = ' '.join(merged_best_split) 487 | 488 | words.append(best_split) 489 | 490 | if write_syntax: 491 | sys.stdout.write(''.join(words)) 492 | else: 493 | sys.stdout.write(' '.join(words) + '\n') 494 | 495 | 496 | def parse_arguments(): 497 | 498 | help_text = "compound splitter\n" 499 | help_text += " train: python hybrid_compound_splitter.py -train -corpus txt-file -model new-model\n" 500 | help_text += " apply: python hybrid_compound_splitter.py -model trained-model < in > out\n" 501 | 502 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=help_text) 503 | 504 | general = parser.add_argument_group('general options') 505 | 506 | general.add_argument('-model', metavar='MODEL', required=True, 507 | help='path to statistical decompounding model. Will be overwritten if -train is active.') 508 | general.add_argument('-corpus', type=argparse.FileType('r'), default=sys.stdin, metavar='PATH', 509 | help='input text (default: standard input).') 510 | general.add_argument('-train', action="store_true", 511 | help='train model on input text. MODEL will be overwritten.') 512 | general.add_argument('-syntax', action="store_true", 513 | help='input/output is syntactic tree') 514 | general.add_argument('-q', action="store_true", 515 | help='quiet mode.') 516 | 517 | application = parser.add_argument_group('application options') 518 | 519 | application.add_argument('-min-size', type=int, 520 | help='minimum word size [don\'t split into short words] (default {0})'.format(MIN_SIZE), default=MIN_SIZE) 521 | application.add_argument('-min-count', type=int, 522 | help='minimum word count [don\'t split into rare words] (default {0})'.format(MIN_COUNT), default=MIN_COUNT) 523 | application.add_argument('-max-count', type=int, 524 | help='maximum word count [don\'t split up frequent words] (default {0})'.format(MAX_COUNT), default=MAX_COUNT) 525 | application.add_argument('-fewest', action="store_true", 526 | help='prefer option with fewest splits (that meets all other constraints)') 527 | application.add_argument('-module', action="store_true", 528 | help='load model as Python module - quicker, but model file needs to end in *.py and be in same folder as script.') 529 | application.add_argument('-smor', metavar='PATH', 530 | help='perform hybrid compound splitting (with SMOR morphology). 
Default: purely corpus-based compound splitting.') 531 | application.add_argument('-no-truecase', action='store_true', 532 | help='leave segments in original case') 533 | application.add_argument('-dependency', action='store_true', 534 | help='dependency-like representation of compounds (ensure that every nonterminal in compound representation has exactly one preterminal)') 535 | 536 | filler = application.add_mutually_exclusive_group() 537 | 538 | filler.add_argument('-write-filler', action="store_true", dest='write_junctures', 539 | help='write filler elements (surrounded by @@)') 540 | filler.add_argument('-merge-filler', action="store_true", dest='merge_junctures', 541 | help='write filler elements (concatenated with preceding segment, ending in @@)') 542 | 543 | args = parser.parse_args() 544 | 545 | return args 546 | 547 | if __name__ == '__main__': 548 | 549 | args = parse_arguments() 550 | 551 | VERBOSE = not args.q 552 | MIN_SIZE = args.min_size 553 | MIN_COUNT = args.min_count 554 | MAX_COUNT = args.max_count 555 | FEWEST = args.fewest 556 | 557 | if sys.version_info < (3, 0): 558 | sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) 559 | args.corpus = codecs.getreader('UTF-8')(args.corpus) 560 | sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) 561 | 562 | if args.train: 563 | train_model(args.corpus, args.model, args.syntax) 564 | 565 | else: 566 | if args.module: 567 | if args.model.endswith('.py'): 568 | args.model = args.model[:-3] 569 | model = __import__(args.model) 570 | 571 | else: 572 | if sys.version_info < (3, 0): 573 | file_obj = codecs.getreader('UTF-8')(open(args.model, 'r')) 574 | else: 575 | file_obj = open(args.model, 'r', encoding='UTF-8') 576 | start = file_obj.read(100) 577 | offset = start.find('{') 578 | file_obj.seek(offset) 579 | model = {} 580 | model['model'] = json.load(file_obj) 581 | model = argparse.Namespace(**model) 582 | 583 | if args.smor: 584 | smor_server = SMORSplitter(args.smor, args.no_truecase) 585 | split_function = get_FST_splits 586 | else: 587 | smor_server = None 588 | split_function = get_unsupervised_splits 589 | 590 | 591 | apply_model(args.corpus, model.model, smor_server, split_function, args.write_junctures, args.merge_junctures, args.syntax, args.no_truecase, args.dependency) 592 | --------------------------------------------------------------------------------
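The [INTERPOLATED-LM] section of the EMS configuration above combines several language models by choosing interpolation weights that minimize perplexity on a tuning set. The following is a minimal, illustrative sketch of that idea only, with toy probabilities and weights; it is not the actual interpolate-lm.perl implementation.

import math

# per-token probabilities that two toy language models assign to a tiny tuning set
p_lm1 = [0.20, 0.05, 0.10, 0.01]
p_lm2 = [0.10, 0.12, 0.02, 0.08]

def perplexity(token_probs):
    # 2 ** (average negative log2 probability per token)
    return 2 ** (-sum(math.log(p, 2) for p in token_probs) / len(token_probs))

def interpolate(weight, probs1, probs2):
    # linear interpolation: p(w|h) = weight * p1(w|h) + (1 - weight) * p2(w|h)
    return [weight * p1 + (1 - weight) * p2 for p1, p2 in zip(probs1, probs2)]

for weight in (0.0, 0.25, 0.5, 0.75, 1.0):
    print(weight, round(perplexity(interpolate(weight, p_lm1, p_lm2)), 2))

# the weight with the lowest tuning-set perplexity is the one an interpolation
# script would keep; with these toy numbers an intermediate weight wins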
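Turning to hybrid_compound_splitter.py: in purely corpus-based mode it ranks candidate splits by the geometric mean of the segments' corpus frequencies, following Koehn & Knight (2003). Below is a minimal, self-contained sketch of that scoring; the function name geometric_mean_score and the word counts are invented for illustration and are not part of the script.

from functools import reduce
from operator import mul

# hypothetical corpus frequencies standing in for counts collected with -train
toy_freq = {'aktion': 852, 'plan': 710, 'aktionsplan': 4}

def geometric_mean_score(segments, freq):
    # geometric mean of segment frequencies; in this sketch, unknown segments
    # get frequency 0 and rule the split out
    counts = [freq.get(segment, 0) for segment in segments]
    if 0 in counts:
        return 0.0
    return reduce(mul, counts) ** (1.0 / len(counts))

candidates = [['aktionsplan'], ['aktion', 'plan']]
for candidate in candidates:
    print('{0}: {1:.1f}'.format(' '.join(candidate), geometric_mean_score(candidate, toy_freq)))

# prints roughly:
#   aktionsplan: 4.0
#   aktion plan: 777.8
# so the two-part split wins, as it would in apply_model(); with -fewest, candidates
# are ranked primarily by how few segments they have, and frequency only breaks ties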
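The -write-filler and -merge-filler options of the same script differ only in how the German filler element (Fugenelement) is written into the output. Here is a small sketch of the merge step used in apply_model(); the example word and the helper name merge_fillers are illustrative only.

def merge_fillers(tokens):
    # glue '@s@'-style filler tokens onto the preceding segment, marking it with '@@'
    merged = []
    for item in tokens:
        if merged and len(item) > 1 and item[0] == item[-1] == '@':
            merged[-1] += item[1:-1] + '@@'
        else:
            merged.append(item)
    return merged

print(' '.join(merge_fillers(['arbeit', '@s@', 'markt'])))
# -write-filler output:  arbeit @s@ markt   (filler kept as a separate token)
# -merge-filler output:  arbeits@@ markt    (filler glued to the preceding segment)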
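Finally, the statistical model that -train produces via write_model() is a Python-syntax file whose payload is a JSON object, which is why apply mode can either import it as a module (-module, if the file ends in .py and sits next to the script) or skip to the first '{' and hand the rest to the JSON parser. A sketch with an invented file name and toy frequencies:

import json

toy_frequencies = {'arbeit': 1234, 'markt': 567, 'arbeitsmarkt': 12}

# write a model file in the same layout as write_model(); 'toy-split-model.py'
# is a placeholder name, not a file shipped with the repository
with open('toy-split-model.py', 'w') as model_file:
    model_file.write('# -*- coding: utf-8 -*-\n\n')
    model_file.write('from __future__ import unicode_literals\n\n')
    model_file.write('model = ')
    json.dump(toy_frequencies, model_file, indent=2)

# read it back roughly the way apply mode does without -module:
# find the first '{' and parse the remainder as JSON
with open('toy-split-model.py') as model_file:
    text = model_file.read()
    frequencies = json.loads(text[text.index('{'):])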