├── .gitignore ├── LICENSE.md ├── README.md ├── bin ├── nmt-bpe-apply ├── nmt-bpe-learn ├── nmt-build-dict ├── nmt-coco-metrics ├── nmt-extract ├── nmt-rescore ├── nmt-test-lm ├── nmt-train ├── nmt-translate ├── nmt-translate-client ├── nmt-translate-factors └── nmt-translate-server ├── docs ├── _config.yml ├── index.md ├── logo.pdf ├── logo.png └── pages │ └── config.md ├── examples ├── README.md ├── ted-factors │ ├── README.md │ ├── attention_factors-ted-en-fr.conf │ └── data │ │ └── README.md └── wmt16-mmt-task2 │ ├── README.md │ ├── data │ ├── README.md │ ├── fix-corpus-bugs.patch │ ├── split_all.txt │ ├── split_test.txt │ ├── split_train.txt │ ├── split_val.txt │ ├── test.1.de │ ├── test.1.en │ ├── test.2.de │ ├── test.2.en │ ├── test.3.de │ ├── test.3.en │ ├── test.4.de │ ├── test.4.en │ ├── test.5.de │ ├── test.5.en │ ├── train.1.de │ ├── train.1.en │ ├── train.2.de │ ├── train.2.en │ ├── train.3.de │ ├── train.3.en │ ├── train.4.de │ ├── train.4.en │ ├── train.5.de │ ├── train.5.en │ ├── val.1.de │ ├── val.1.en │ ├── val.2.de │ ├── val.2.en │ ├── val.3.de │ ├── val.3.en │ ├── val.4.de │ ├── val.4.en │ ├── val.5.de │ └── val.5.en │ ├── scripts │ ├── 01-tokenize.sh │ └── 02-prepare.py │ ├── wmt16-mmt-task2-monomodal.conf │ └── wmt16-mmt-task2-multimodal.conf ├── nmtpy ├── __init__.py ├── cleanup.py ├── cocoeval │ ├── README.md │ ├── __init__.py │ ├── bleu │ │ ├── LICENSE.bleu │ │ ├── __init__.py │ │ ├── bleu.py │ │ └── bleu_scorer.py │ ├── cider │ │ ├── __init__.py │ │ ├── cider.py │ │ └── cider_scorer.py │ ├── meteor │ │ ├── __init__.py │ │ └── meteor.py │ └── rouge │ │ ├── __init__.py │ │ └── rouge.py ├── config.py ├── defaults.py ├── external │ ├── data │ │ └── README.md │ ├── meteor-1.5.jar │ └── multi-bleu.perl ├── filters.py ├── iterators │ ├── __init__.py │ ├── bitext.py │ ├── factors.py │ ├── fusion.py │ ├── homogeneous.py │ ├── iterator.py │ ├── mnmt.py │ ├── text.py │ └── wmt.py ├── layers.py ├── logger.py ├── mainloop.py ├── metrics │ ├── __init__.py │ ├── bleu.py │ ├── external.py │ ├── factors2wordbleu.py │ ├── meteor.py │ ├── metric.py │ └── mtevalbleu.py ├── models │ ├── README.md │ ├── __init__.py │ ├── attention.py │ ├── attention_factors.py │ ├── attention_wmt.py │ ├── basefnmt.py │ ├── basefusion.py │ ├── basemodel.py │ ├── dcu_multimodal.py │ ├── fusion_concat_dep_dep.py │ ├── fusion_concat_dep_ind.py │ ├── fusion_concat_ind_dep.py │ ├── fusion_concat_ind_ind.py │ ├── fusion_sum_dep_dep.py │ ├── fusion_sum_dep_ind.py │ ├── fusion_sum_ind_dep.py │ ├── fusion_sum_ind_ind.py │ ├── mnmt_ctxmul.py │ ├── mnmt_decinit.py │ ├── mnmt_decinitctxtrgmul.py │ ├── mnmt_encdecinit.py │ ├── mnmt_encdecinitctxtrgmul.py │ ├── mnmt_trgmul.py │ ├── mnmt_yemb_mulimg.py │ └── rnnlm.py ├── nmtutils.py ├── optimizers.py ├── sysutils.py └── textutils.py ├── patches └── 00-theano-advancedinctensor.patch ├── scripts ├── README.md ├── get-meteor-data.sh ├── modify-npz ├── prep-charnmt.sh ├── snaprun └── update-npz └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *egg-info 3 | nmtpy/external/data/*gz 4 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | ## MIT License 2 | 3 | Copyright (c) 2017 - University of Le Mans - Language and Speech Technology (LST) Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation 
files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -- 24 | 25 | **nmtpy** includes code from the following projects which have their own licenses: 26 | 27 | - [dl4mt-tutorial](https://github.com/nyu-dl/dl4mt-tutorial) [[BSD-3-Clause](https://github.com/nyu-dl/dl4mt-tutorial/blob/master/LICENSE)] 28 | - Ensembling and alignment collection from [nematus](https://github.com/rsennrich/nematus) [Same as dl4mt-tutorial] 29 | - Scripts from [subword-nmt](https://github.com/rsennrich/subword-nmt) [[MIT](https://github.com/rsennrich/subword-nmt/blob/master/LICENSE)] 30 | - `multi-bleu.perl` from [mosesdecoder](https://github.com/moses-smt/mosesdecoder) [[LGPL-2.1](https://github.com/moses-smt/mosesdecoder/blob/master/COPYING)] 31 | - METEOR v1.5 JAR from [meteor](https://github.com/cmu-mtlab/meteor) [[LGPL-2.1](https://github.com/cmu-mtlab/meteor/blob/master/COPYING)] 32 | - Sorted data iterator, coco eval script and LSTM from [arctic-captions](https://github.com/kelvinxu/arctic-captions) [Revised BSD-3-Clause] 33 | - `pycocoevalcap` from [coco-caption](https://github.com/tylin/coco-caption) [[BSD-2-Clause](https://github.com/tylin/coco-caption/blob/master/license.txt)] 34 | -------------------------------------------------------------------------------- /bin/nmt-bpe-apply: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | """Use operations learned with nmt-bpe-learn to encode a new text. 6 | The text will not be smaller, but use only a fixed vocabulary, with rare words 7 | encoded as variable-length sequences of subword units. 8 | 9 | Reference: 10 | Rico Sennrich, Barry Haddow and Alexandra Birch (2015). Neural Machine Translation of Rare Words with Subword Units. 11 | Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany. 
12 | """ 13 | 14 | import re 15 | import sys 16 | import codecs 17 | import argparse 18 | 19 | class BPE(object): 20 | 21 | def __init__(self, codes, separator='@@', skiptags=False): 22 | 23 | with codecs.open(codes.name, encoding='utf-8') as codes: 24 | self.bpe_codes = [tuple(item.split()) for item in codes] 25 | 26 | # some hacking to deal with duplicates (only consider first instance) 27 | self.bpe_codes = dict([(code,i) for (i,code) in reversed(list(enumerate(self.bpe_codes)))]) 28 | 29 | self.separator = separator 30 | self.skiptags = skiptags 31 | 32 | def segment(self, sentence): 33 | """segment single sentence (whitespace-tokenized string) with BPE encoding""" 34 | 35 | output = [] 36 | for word in sentence.split(): 37 | if self.skiptags and re.match('<.*?:.*>', word): 38 | output.append(word) 39 | else: 40 | new_word = encode(word, self.bpe_codes) 41 | 42 | for item in new_word[:-1]: 43 | output.append(item + self.separator) 44 | output.append(new_word[-1]) 45 | 46 | return ' '.join(output) 47 | 48 | def get_pairs(word): 49 | """Return set of symbol pairs in a word. 50 | 51 | word is represented as tuple of symbols (symbols being variable-length strings) 52 | """ 53 | pairs = set() 54 | prev_char = word[0] 55 | for char in word[1:]: 56 | pairs.add((prev_char, char)) 57 | prev_char = char 58 | return pairs 59 | 60 | def encode(orig, bpe_codes, cache={}): 61 | """Encode word based on list of BPE merge operations, which are applied consecutively 62 | """ 63 | 64 | if orig in cache: 65 | return cache[orig] 66 | 67 | word = tuple(orig) + ('',) 68 | pairs = get_pairs(word) 69 | 70 | while True: 71 | bigram = min(pairs, key = lambda pair: bpe_codes.get(pair, float('inf'))) 72 | if bigram not in bpe_codes: 73 | break 74 | first, second = bigram 75 | new_word = [] 76 | i = 0 77 | while i < len(word): 78 | try: 79 | j = word.index(first, i) 80 | new_word.extend(word[i:j]) 81 | i = j 82 | except: 83 | new_word.extend(word[i:]) 84 | break 85 | 86 | if word[i] == first and i < len(word)-1 and word[i+1] == second: 87 | new_word.append(first+second) 88 | i += 2 89 | else: 90 | new_word.append(word[i]) 91 | i += 1 92 | new_word = tuple(new_word) 93 | word = new_word 94 | if len(word) == 1: 95 | break 96 | else: 97 | pairs = get_pairs(word) 98 | 99 | # don't print end-of-word symbols 100 | if word[-1] == '': 101 | word = word[:-1] 102 | elif word[-1].endswith(''): 103 | word = word[:-1] + (word[-1].replace('',''),) 104 | 105 | cache[orig] = word 106 | return word 107 | 108 | 109 | if __name__ == '__main__': 110 | parser = argparse.ArgumentParser( 111 | formatter_class=argparse.RawDescriptionHelpFormatter, 112 | description="learn BPE-based word segmentation") 113 | 114 | parser.add_argument( 115 | '--input', '-i', type=argparse.FileType('r'), default=sys.stdin, 116 | metavar='PATH', 117 | help="Input file (default: standard input).") 118 | parser.add_argument( 119 | '--codes', '-c', type=argparse.FileType('r'), metavar='PATH', 120 | required=True, 121 | help="File with BPE codes (created by nmt-bpe-learn).") 122 | parser.add_argument( 123 | '--output', '-o', type=argparse.FileType('w'), default=sys.stdout, 124 | metavar='PATH', 125 | help="Output file (default: standard output)") 126 | parser.add_argument( 127 | '--skiptags', '-k', action='store_true', default=False, 128 | help="Skip morphological tags (default: False)") 129 | parser.add_argument( 130 | '--separator', '-s', type=str, default='@@', metavar='STR', 131 | help="Separator between non-final subword units (default: '%(default)s'))") 132 
| 133 | args = parser.parse_args() 134 | 135 | bpe = BPE(args.codes, args.separator, args.skiptags) 136 | 137 | for line in args.input: 138 | args.output.write(bpe.segment(line).strip()) 139 | args.output.write('\n') 140 | -------------------------------------------------------------------------------- /bin/nmt-bpe-learn: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | """Use byte pair encoding (BPE) to learn a variable-length encoding of the vocabulary in a text. 6 | Unlike the original BPE, it does not compress the plain text, but can be used to reduce the vocabulary 7 | of a text to a configurable number of symbols, with only a small increase in the number of tokens. 8 | 9 | Reference: 10 | Rico Sennrich, Barry Haddow and Alexandra Birch (2016). Neural Machine Translation of Rare Words with Subword Units. 11 | Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany. 12 | """ 13 | 14 | import sys 15 | import re 16 | import copy 17 | import argparse 18 | from collections import defaultdict, Counter 19 | 20 | def get_vocabulary(fobj): 21 | """Read text and return dictionary that encodes vocabulary 22 | """ 23 | vocab = Counter() 24 | sys.stderr.write('Reading file {0}\n'.format(fobj.name)) 25 | for line in fobj: 26 | for word in line.split(): 27 | vocab[word] += 1 28 | sys.stderr.write('Done.\n') 29 | return vocab 30 | 31 | def update_pair_statistics(pair, changed, stats, indices): 32 | """Minimally update the indices and frequency of symbol pairs 33 | 34 | if we merge a pair of symbols, only pairs that overlap with occurrences 35 | of this pair are affected, and need to be updated. 
36 | """ 37 | stats[pair] = 0 38 | indices[pair] = defaultdict(int) 39 | first, second = pair 40 | new_pair = first+second 41 | for j, word, old_word, freq in changed: 42 | 43 | # find all instances of pair, and update frequency/indices around it 44 | i = 0 45 | while True: 46 | try: 47 | i = old_word.index(first, i) 48 | except ValueError: 49 | break 50 | if i < len(old_word)-1 and old_word[i+1] == second: 51 | if i: 52 | prev = old_word[i-1:i+1] 53 | stats[prev] -= freq 54 | indices[prev][j] -= 1 55 | if i < len(old_word)-2: 56 | # don't double-count consecutive pairs 57 | if old_word[i+2] != first or i >= len(old_word)-3 or old_word[i+3] != second: 58 | nex = old_word[i+1:i+3] 59 | stats[nex] -= freq 60 | indices[nex][j] -= 1 61 | i += 2 62 | else: 63 | i += 1 64 | 65 | i = 0 66 | while True: 67 | try: 68 | i = word.index(new_pair, i) 69 | except ValueError: 70 | break 71 | if i: 72 | prev = word[i-1:i+1] 73 | stats[prev] += freq 74 | indices[prev][j] += 1 75 | # don't double-count consecutive pairs 76 | if i < len(word)-1 and word[i+1] != new_pair: 77 | nex = word[i:i+2] 78 | stats[nex] += freq 79 | indices[nex][j] += 1 80 | i += 1 81 | 82 | 83 | def get_pair_statistics(vocab): 84 | """Count frequency of all symbol pairs, and create index""" 85 | 86 | # data structure of pair frequencies 87 | stats = defaultdict(int) 88 | 89 | #index from pairs to words 90 | indices = defaultdict(lambda: defaultdict(int)) 91 | 92 | for i, (word, freq) in enumerate(vocab): 93 | prev_char = word[0] 94 | for char in word[1:]: 95 | stats[prev_char, char] += freq 96 | indices[prev_char, char][i] += 1 97 | prev_char = char 98 | 99 | return stats, indices 100 | 101 | 102 | def replace_pair(pair, vocab, indices): 103 | """Replace all occurrences of a symbol pair ('A', 'B') with a new symbol 'AB'""" 104 | first, second = pair 105 | pair_str = ''.join(pair) 106 | pair_str = pair_str.replace('\\','\\\\') 107 | changes = [] 108 | pattern = re.compile(r'(?',) ,y) for (x,y) in vocab.items()]) 161 | sorted_vocab = sorted(vocab.items(), key=lambda x: x[1], reverse=True) 162 | 163 | stats, indices = get_pair_statistics(sorted_vocab) 164 | big_stats = copy.deepcopy(stats) 165 | # threshold is inspired by Zipfian assumption, but should only affect speed 166 | threshold = max(stats.values()) / 10 167 | for i in range(args.symbols): 168 | if stats: 169 | most_frequent = max(stats, key=lambda x: (stats[x], x)) 170 | 171 | # we probably missed the best pair because of pruning; go back to full statistics 172 | if not stats or (i and stats[most_frequent] < threshold): 173 | prune_stats(stats, big_stats, threshold) 174 | stats = copy.deepcopy(big_stats) 175 | most_frequent = max(stats, key=lambda x: (stats[x], x)) 176 | # threshold is inspired by Zipfian assumption, but should only affect speed 177 | threshold = stats[most_frequent] * i/(i+10000.0) 178 | prune_stats(stats, big_stats, threshold) 179 | 180 | if stats[most_frequent] < 2: 181 | sys.stderr.write('no pair has frequency > 1. 
Stopping\n') 182 | break 183 | 184 | if args.verbose: 185 | sys.stderr.write('pair {0}: {1} {2} -> {1}{2} (frequency {3})\n'.format(i, most_frequent[0], most_frequent[1], stats[most_frequent])) 186 | args.output.write('{0} {1}\n'.format(*most_frequent)) 187 | changes = replace_pair(most_frequent, sorted_vocab, indices) 188 | update_pair_statistics(most_frequent, changes, stats, indices) 189 | stats[most_frequent] = 0 190 | if not i % 100: 191 | prune_stats(stats, big_stats, threshold) 192 | -------------------------------------------------------------------------------- /bin/nmt-build-dict: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import sys 6 | import argparse 7 | import pickle as pkl 8 | from collections import OrderedDict 9 | 10 | import numpy as np 11 | 12 | def freqs_to_dict(token_freqs, min_freq): 13 | # Get list of tokens 14 | tokens = list(token_freqs.keys()) 15 | 16 | # Collect their frequencies in a numpy array 17 | freqs = np.array(list(token_freqs.values())) 18 | 19 | tokendict = OrderedDict() 20 | tokendict[''] = 0 21 | tokendict[''] = 1 22 | 23 | # Sort in descending order of frequency 24 | sorted_idx = np.argsort(freqs) 25 | if min_freq > 0: 26 | sorted_tokens = [tokens[ii] for ii in sorted_idx[::-1] if freqs[ii] >= min_freq] 27 | else: 28 | sorted_tokens = [tokens[ii] for ii in sorted_idx[::-1]] 29 | 30 | # Start inserting from index 2 31 | for ii, ww in enumerate(sorted_tokens): 32 | tokendict[ww] = ii + 2 33 | 34 | return tokendict 35 | 36 | def get_freqs(fname, cumul_dict=None): 37 | # We'll first count frequencies 38 | if cumul_dict is not None: 39 | # Let's accumulate frequencies 40 | token_freqs = cumul_dict 41 | else: 42 | token_freqs = OrderedDict() 43 | 44 | print("Reading file %s" % filename) 45 | with open(filename) as f: 46 | idx = 0 47 | for line in f: 48 | line = line.strip() 49 | if line: 50 | # Collect frequencies 51 | for w in line.split(' '): 52 | if w not in token_freqs: 53 | token_freqs[w] = 0 54 | token_freqs[w] += 1 55 | 56 | if (idx+1) % 10000 == 0: 57 | print('\r%d sentences processed' % (idx + 1), end=' ') 58 | sys.stdout.flush() 59 | 60 | idx += 1 61 | 62 | print('\r%d sentences processed' % (idx)) 63 | # Remove already available and if any 64 | if '' in token_freqs: 65 | del token_freqs[''] 66 | if '' in token_freqs: 67 | del token_freqs[''] 68 | 69 | return token_freqs 70 | 71 | def write_dict(fname, vocab): 72 | print("Dumping vocabulary (%d tokens) to %s..." % (len(vocab), fname)) 73 | with open(fname, 'wb') as f: 74 | pkl.dump(vocab, f) 75 | 76 | if __name__ == '__main__': 77 | parser = argparse.ArgumentParser(prog='build_dictionary') 78 | parser.add_argument('-o', '--output-dir', type=str, default='.', help='Output directory') 79 | parser.add_argument('-s', '--single' , type=str, default=None,help='Name of the single vocabulary file. 
(default: Disabled)') 80 | parser.add_argument('-m', '--min-freq' , type=int, default=0, help='Filter out tokens occuring < m times.') 81 | parser.add_argument('files', type=str , nargs='+', help='Text files to create dictionaries.') 82 | args = parser.parse_args() 83 | 84 | # In case it is needed 85 | all_freqs = OrderedDict() 86 | 87 | for filename in args.files: 88 | filename = os.path.abspath(os.path.expanduser(filename)) 89 | 90 | if args.single: 91 | # Get cumulative frequencies 92 | all_freqs = get_freqs(filename, all_freqs) 93 | 94 | else: 95 | # Get frequencies 96 | freqs = get_freqs(filename) 97 | # Build dictionary from frequencies 98 | tokendict = freqs_to_dict(freqs, args.min_freq) 99 | 100 | vocab_fname = os.path.basename(filename) 101 | if args.min_freq > 0: 102 | vocab_fname += "-min%d" % args.min_freq 103 | vocab_fname = os.path.join(args.output_dir, vocab_fname) 104 | vocab_fname += ".vocab.pkl" 105 | 106 | write_dict(vocab_fname, tokendict) 107 | 108 | if args.single: 109 | tokendict = freqs_to_dict(all_freqs, args.min_freq) 110 | write_dict(args.single, tokendict) 111 | -------------------------------------------------------------------------------- /bin/nmt-coco-metrics: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Computes the BLEU, ROUGE, METEOR, and CIDER 6 | using the COCO metrics scripts 7 | """ 8 | import os 9 | import argparse 10 | from collections import OrderedDict 11 | 12 | # Script taken and adapted from Kelvin Xu's arctic-captions project 13 | # https://github.com/kelvinxu/arctic-captions 14 | 15 | from nmtpy.cocoeval.bleu.bleu import Bleu 16 | from nmtpy.cocoeval.rouge.rouge import Rouge 17 | from nmtpy.cocoeval.cider.cider import Cider 18 | from nmtpy.cocoeval.meteor.meteor import Meteor 19 | 20 | def print_table(results, sort_by='METEOR'): 21 | cols = ['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4', 22 | 'METEOR', 'METEOR (norm)', 'CIDEr', 'ROUGE_L'] 23 | for col in cols: 24 | print('|{:^15}|'.format(col), end='') 25 | print() 26 | 27 | results = sorted(results.items(), key=lambda x: x[1][sort_by]) 28 | 29 | for sysname, result in results: 30 | if len(results) > 1: 31 | print(sysname) 32 | for col in cols: 33 | print('|{:^15,.3f}|'.format(result[col]), end='') 34 | print() 35 | 36 | if __name__ == '__main__': 37 | parser = argparse.ArgumentParser(description="Compute BLEU, METEOR, ROUGE and CIDEr for single or multiple references.") 38 | 39 | parser.add_argument("-w", "--write", action='store_true', help='Create a per-hypothesisscore file containing the results.') 40 | parser.add_argument("-l", "--language", default='en', help='Hypothesis language (default: en)') 41 | parser.add_argument("-s", "--systems", type=str, help="Per-system hypothesis file(s)", nargs='+') 42 | parser.add_argument("-r", "--refs", type=argparse.FileType('r'), help="Path to all the reference files", nargs='+') 43 | 44 | args = parser.parse_args() 45 | 46 | # List of scorers 47 | scorers = [ 48 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), 49 | (Meteor(args.language), ["METEOR"]), 50 | (Meteor(args.language, norm=True), ["METEOR (norm)"]), 51 | (Cider(), ["CIDEr"]), 52 | (Rouge(), ["ROUGE_L"]), 53 | ] 54 | 55 | results = OrderedDict() 56 | 57 | # Read multiple reference files 58 | raw_refs = [list(map(str.strip, r)) for r in zip(*args.refs)] 59 | refs = {idx: rr for idx, rr in enumerate(raw_refs)} 60 | 61 | # Ranking of multiple systems is possible 62 | for hypfile in 
args.systems: 63 | with open(hypfile) as f: 64 | # List of hypothesis sentences for this system 65 | hypo = {idx: [line.strip()] for (idx, line) in enumerate(f)} 66 | 67 | result = OrderedDict() 68 | 69 | for scorer, method in scorers: 70 | score, _ = scorer.compute_score(refs, hypo) 71 | if score: 72 | if not isinstance(score, list): 73 | score = [score] 74 | for m, s in zip(method, score): 75 | result[m] = float('%.3f' % s) 76 | 77 | if args.write: 78 | with open("%s.score" % hypfile, 'w') as f: 79 | f.write("%s\n" % result) 80 | results[os.path.basename(hypfile)] = result 81 | 82 | print_table(results) 83 | -------------------------------------------------------------------------------- /bin/nmt-extract: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """Extract several features from a trained model.""" 4 | 5 | import sys 6 | import argparse 7 | 8 | import numpy as np 9 | 10 | from nmtpy.sysutils import get_param_dict 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser(prog='nmt-extract') 14 | parser.add_argument('-o', '--output', type=str, help="Output .npz file.") 15 | parser.add_argument('-n', '--init', action='store_true', help="Make output file compatible with nmt-train '--init' argument.") 16 | parser.add_argument('-m', '--model', type=str, help="Model's .npz file from which weights will be extracted.") 17 | parser.add_argument('-w', '--which', nargs='+', required=True, help='Space separated list of to-be-extracted weight keys.') 18 | 19 | args = parser.parse_args() 20 | 21 | try: 22 | params = get_param_dict(args.model) 23 | except KeyError as ke: 24 | print('%s does not contain model parameters. Did you train model do at least 1 validation?' % args.model) 25 | sys.exit(1) 26 | 27 | extracted_weights = {} 28 | for key in args.which: 29 | try: 30 | extracted_weights[key] = params[key] 31 | print("Extracted '%s' with shape=%s" % (key, params[key].shape)) 32 | except KeyError as ke: 33 | print("'%s' not found in model's .npz file, aborting." % key) 34 | sys.exit(1) 35 | 36 | if args.init: 37 | # You can use output file to init a new model with pre-trained weights 38 | # extracted here. 
39 | np.savez(args.output, tparams=extracted_weights, opts={}) 40 | else: 41 | np.savez(args.output, **extracted_weights) 42 | -------------------------------------------------------------------------------- /bin/nmt-rescore: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Rescores translations using an nmtpy NMT model.""" 5 | 6 | import os 7 | import sys 8 | import time 9 | import argparse 10 | import tempfile 11 | import importlib 12 | 13 | import numpy as np 14 | 15 | from nmtpy.logger import Logger 16 | from nmtpy.sysutils import * 17 | 18 | def is_nbest(trg_file): 19 | """Checks whether trg_file is in N-best format.""" 20 | with open(trg_file) as tf: 21 | return ' ||| ' in tf.readline().strip() 22 | 23 | def process_files(src_file, trg_file): 24 | with open(trg_file) as tf: 25 | # Read source sentences as they are 26 | src_sents = open(src_file).read().strip().split('\n') 27 | 28 | new_sf = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.rescore.src') 29 | new_tf = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.rescore.trg') 30 | 31 | for line in tf: 32 | idx, hyp, score = line.split(' ||| ') 33 | new_sf.write('%s\n' % src_sents[int(idx)]) 34 | new_tf.write('%s\n' % hyp) 35 | 36 | new_sf.close() 37 | new_tf.close() 38 | 39 | return (new_sf.name, new_tf.name) 40 | 41 | def write_rescore_file(trg_file, out_file, nmt_scores, nbest): 42 | """Append scores to trg_file's last column and save it as out_file.""" 43 | with open(out_file, 'w') as of: 44 | with open(trg_file) as tf: 45 | for idx, (scores, line) in enumerate(zip(nmt_scores, tf)): 46 | # generate score string 47 | score = ' '.join(["%.6f" % s for s in scores]) 48 | if nbest: 49 | of.write("%s %s\n" % (line.strip(), score)) 50 | else: 51 | of.write("%d ||| %s ||| %s\n" % (idx, line.strip(), score)) 52 | 53 | if __name__ == "__main__": 54 | parser = argparse.ArgumentParser(prog='nmt-rescore') 55 | parser.add_argument('-b', '--batch-size' ,default=128, type=int, help="Batch size to use during forward-pass.") 56 | parser.add_argument('-d', '--device' ,default='auto', type=str, help="Automatically selects GPU or CPU if no GPU available. (cpu or gpuX can also be given.)") 57 | parser.add_argument('-s', '--src-file' ,required=True, type=str, help="File containing 1 source sentence per line.") 58 | parser.add_argument('-t', '--trg-file' ,required=True, type=str, help="Translations file in plain text or n-best format.") 59 | parser.add_argument('-o', '--out-file' ,required=True, type=str, help="Output file for rescored translations.") 60 | parser.add_argument('-m', '--models' ,required=True, type=str, help="Model .npz file(s) to be used for (ensemble) rescoring.", 61 | nargs='+') 62 | 63 | # Setup the logger 64 | Logger.setup(timestamp=False) 65 | log = Logger.get() 66 | 67 | args = parser.parse_args() 68 | 69 | ##################################### 70 | # Set device for Theano if not forced 71 | ##################################### 72 | # NOTE: Very slow on CPU compared to GPU! 73 | if 'THEANO_FLAGS' not in os.environ: 74 | dev = get_device(args.device) 75 | log.info('Using device: %s' % dev) 76 | os.environ['THEANO_FLAGS'] = "device=%s" % dev 77 | 78 | if args.device == 'cpu': 79 | # This is to avoid thread explosion. Allow 80 | # each process to use a single thread. 
81 | os.environ["OMP_NUM_THREADS"] = "1" 82 | os.environ["MKL_NUM_THREADS"] = "1" 83 | 84 | # Print information 85 | log.info("Source file: %s" % args.src_file) 86 | log.info("Target file: %s" % args.trg_file) 87 | log.info("%d models given for rescoring" % len(args.models)) 88 | 89 | # Load model options from first model 90 | model_options = get_model_options(args.models[0]) 91 | 92 | # Import the module 93 | Model = importlib.import_module("nmtpy.models.%s" % model_options['model_type']).Model 94 | 95 | # Create the model, seed is not used. 96 | model = Model(seed=1, logger=None, **model_options) 97 | 98 | # Load the first model 99 | model.load(args.models[0]) 100 | 101 | # Disable dropout 102 | model.set_dropout(False) 103 | 104 | # Build graph 105 | log.info('Building computation graph...') 106 | model.build() 107 | 108 | # Set batch size 109 | model.batch_size = args.batch_size 110 | log.info('Batch size: %d' % model.batch_size) 111 | 112 | remove_temp_files = [] 113 | 114 | # Copy filenames 115 | src_file, trg_file = args.src_file, args.trg_file 116 | 117 | is_trg_nbest = is_nbest(args.trg_file) 118 | 119 | if is_trg_nbest: 120 | log.info('Target is n-best') 121 | # Process src and trg files accordingly 122 | src_file, trg_file = process_files(args.src_file, args.trg_file) 123 | remove_temp_files.extend([src_file, trg_file]) 124 | 125 | model.data['valid_src'] = src_file 126 | model.data['valid_trg'] = trg_file 127 | 128 | log.info('Loading data') 129 | model.load_valid_data() 130 | iterator = model.valid_iterator 131 | 132 | # Score array per each model 133 | scores = [[] for i in range(len(args.models))] 134 | 135 | start = time.time() 136 | for idx, modelfile in enumerate(args.models): 137 | log.info('Rescoring with %s' % os.path.basename(modelfile)) 138 | 139 | # Load model weights for anything except first one (hacky) 140 | if idx > 0: 141 | model.update_shared_variables(get_param_dict(modelfile)) 142 | 143 | for i, data in enumerate(iterator): 144 | norm = data['y_mask'].sum(0) 145 | scores[idx].extend(model.f_log_probs(*list(data.values())) / norm) 146 | 147 | if (i + 1) % 10 == 0: 148 | log.info('%d samples completed.' % (len(scores[idx]))) 149 | 150 | log.info('Rescoring done in %.3f seconds.' 
% (time.time() - start)) 151 | 152 | # Convert scores to numpy array and transpose 153 | scores = np.array(scores, dtype='float32').T 154 | 155 | # Write final file 156 | write_rescore_file(args.trg_file, args.out_file, scores, is_trg_nbest) 157 | 158 | # Remove n-best related temporary files 159 | for file_ in remove_temp_files: 160 | os.unlink(file_) 161 | 162 | # Report success 163 | sys.exit(0) 164 | -------------------------------------------------------------------------------- /bin/nmt-test-lm: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Score a source file using a model.""" 3 | 4 | import os 5 | import sys 6 | import time 7 | import argparse 8 | import importlib 9 | from multiprocessing import Process, Queue, cpu_count 10 | 11 | from collections import OrderedDict 12 | 13 | import numpy as np 14 | 15 | from nmtpy.logger import Logger 16 | from nmtpy.config import Config 17 | from nmtpy.sysutils import * 18 | from nmtpy.iterators.bitext import BiTextIterator 19 | import nmtpy.cleanup as cleanup 20 | 21 | Logger.setup() 22 | log = Logger.get() 23 | 24 | """Worker process which does calculate logprobs from data send through the model.""" 25 | def test_model(queue, rqueue, pid, model): 26 | 27 | while True: 28 | req = queue.get() 29 | # We should avoid this 30 | if req is None: 31 | break 32 | 33 | # Get data from queue 34 | idx, data_dict = req[0], req[1] 35 | # Calculate validation loss 36 | curr_loss, sentlen = model.val_loss(data_dict) 37 | 38 | score=0 39 | rqueue.put((idx, score, sentlen, curr_loss)) 40 | 41 | 42 | """Tester starts worker processes, delegates source iterator 43 | to them, waits for the results.""" 44 | class Tester(object): 45 | def __init__(self, args): 46 | # Always lists provided by argparse (nargs:'+') 47 | self.src_files = args.src_files 48 | self.ref_files = args.ref_files 49 | 50 | self.n_jobs = args.n_jobs 51 | self.model_file = args.model 52 | 53 | # Not used 54 | self.seed = 1234 55 | 56 | self.utf8 = False 57 | 58 | # Create worker process pool 59 | self.processes = [None] * self.n_jobs 60 | 61 | def set_model_options(self): 62 | model_options = get_model_options(self.model_file) 63 | 64 | # Import the module 65 | self.__class = importlib.import_module("nmtpy.models.%s" % model_options['model_type']).Model 66 | 67 | # Create the model 68 | self.model = self.__class(seed=self.seed, logger=None, **model_options) 69 | self.model.load(self.model_file) 70 | self.model.set_dropout(False) 71 | 72 | # invert dictionary 73 | self.ref_idict = dict([[v,k] for k,v in self.model.src_dict.items()]) 74 | 75 | # Normal test mode 76 | if self.src_files is not None: 77 | self.model.data['valid_src'] = self.src_files[0] 78 | 79 | self.model.load_valid_data() 80 | self.iterator = self.model.valid_iterator 81 | self.n_sentences = self.iterator.n_samples 82 | log.info('I will test %d samples' % self.n_sentences) 83 | 84 | if self.src_files is None: 85 | self.src_files = listify(self.model.data['valid_src']) 86 | log.info("No test data given, assuming validation dataset.") 87 | 88 | # Print information 89 | log.info("Source file(s)") 90 | for f in self.src_files: 91 | log.info(" %s" % f) 92 | 93 | # It's possible that we don't have any reference files, e.g. for test sets. 
94 | if self.ref_files: 95 | log.info("Reference file(s)") 96 | for f in self.ref_files: 97 | log.info(" %s" % f) 98 | 99 | def start(self): 100 | # create input and output queues for processes 101 | write_queue = Queue() 102 | read_queue = Queue() 103 | # Create processes 104 | self.model.build_sampler() 105 | self.model.build() 106 | for idx in range(self.n_jobs): 107 | self.processes[idx] = Process(target=test_model, args=(write_queue, read_queue, idx, self.model)) 108 | self.processes[idx].start() 109 | cleanup.register_proc(self.processes[idx].pid) 110 | 111 | cleanup.register_handler() 112 | 113 | # Send data to worker processes 114 | for idx in range(self.n_sentences): 115 | sample = next(self.iterator) 116 | write_queue.put((idx, sample)) 117 | 118 | log.info("Distributed %d sentences to worker processes." % self.n_sentences) 119 | 120 | # Receive the results 121 | self.sentences = [None] * self.n_sentences 122 | self.log_probs = [None] * self.n_sentences 123 | 124 | t = time.time() 125 | sum_sentlen = 0 126 | sum_logprob = 0. 127 | 128 | for i in range(self.n_sentences): 129 | # Get response from worker 130 | resp = read_queue.get() 131 | 132 | # This is the sample id of the processed sample 133 | idx = resp[0] 134 | sentlens, logprobs = resp[2], resp[3] 135 | 136 | sum_sentlen += sentlens 137 | sum_logprob += sum(logprobs) 138 | self.log_probs[idx] = logprobs 139 | 140 | # Print progress 141 | if (i+1) % 100 == 0: 142 | t = time.time() - t 143 | log.info("%d/%d sentences completed (%.2f seconds)" % ((i+1), self.n_sentences, t)) 144 | t = time.time() 145 | 146 | log.info("Test Perplexity: %.4f" % np.exp(sum_logprob/sum_sentlen)) 147 | # Stop workers 148 | for idx in range(self.n_jobs): 149 | write_queue.put(None) 150 | self.processes[idx].terminate() 151 | cleanup.unregister_proc(self.processes[idx].pid) 152 | 153 | #Write sentence and logprobs for rescoring purpose 154 | def write_logprobs(self, filename, dump_scores=False): 155 | def __encode(s): 156 | return s.encode('utf-8') if self.utf8 else s 157 | 158 | with open(filename, 'w') as f: 159 | log.info("Writing output file...") 160 | for idx, lp in enumerate(self.log_probs): 161 | logprobs_array=''.join(map(str,' '.join(str(i[0]) for i in lp))) 162 | f.write(__encode("%d ||| %s ||| %f\n"%(idx, logprobs_array,sum(lp)))) 163 | 164 | if __name__ == "__main__": 165 | parser = argparse.ArgumentParser(prog='lm-test') 166 | parser.add_argument('-j', '--n-jobs' , type=int, default=8, 167 | help="Number of processes (default: 8, 0: Auto)") 168 | 169 | parser.add_argument('-m', '--model' , type=str, help="Model file", required=True) 170 | parser.add_argument('-o', '--saveto' , type=str, help="Output test file (if not given, only metrics will be printed)", 171 | default=None) 172 | parser.add_argument('-s', '--score' , action='store_true', help="Print scores of each sentence") 173 | 174 | parser.add_argument('-S', '--src-files' , type=str, help="Source data file (default: validation set)", 175 | nargs='+', default=None) 176 | parser.add_argument('-R', '--ref-files' , type=str, help="One or multiple reference files (default: validation set)", 177 | nargs='+', 178 | default=None) 179 | 180 | args = parser.parse_args() 181 | 182 | if args.n_jobs == 0: 183 | # Auto infer CPU number 184 | args.n_jobs = (cpu_count() / 2) - 1 185 | 186 | # This is to avoid thread explosion. Allow 187 | # each process to use a single thread. 
188 | os.environ["OMP_NUM_THREADS"] = "1" 189 | os.environ["MKL_NUM_THREADS"] = "1" 190 | os.environ["OPENBLAS_NUM_THREADS"] = "1" 191 | 192 | # Force CPU 193 | os.environ["THEANO_FLAGS"] = "device=cpu" 194 | 195 | # Create tester object 196 | tester = Tester(args) 197 | tester.set_model_options() 198 | tester.start() 199 | out_file = args.saveto 200 | 201 | tester.write_logprobs(out_file, args.score) 202 | 203 | sys.exit(0) 204 | -------------------------------------------------------------------------------- /bin/nmt-translate-client: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Client for nmt-translate server 5 | # Send the content of an input file to the server and print the resulting translation 6 | 7 | import sys 8 | import re 9 | import http.client 10 | import argparse 11 | 12 | parser = argparse.ArgumentParser(description='nmt-translate client') 13 | parser.add_argument('inputfile', help='text to translate') 14 | parser.add_argument('-s', '--server', dest='HTTPserver', help='nmt-translate server adress (localhost:30060)', nargs='?', default="localhost:30060") 15 | args = parser.parse_args() 16 | 17 | if '@' in args.HTTPserver: 18 | urlbase,proxy = args.HTTPserver.split('@') 19 | else: 20 | urlbase = args.HTTPserver 21 | proxy = None 22 | if proxy: 23 | connectionaddress = proxy 24 | else: 25 | connectionaddress = args.HTTPserver 26 | 27 | # request to translation server 28 | def translate(text): 29 | # start HTTP connection (a simple TCP connection could not pass firewall) 30 | r=0 31 | try: 32 | conn = http.client.HTTPConnection(connectionaddress) 33 | conn.request('GET', urlbase, text.encode('utf8')) 34 | r = conn.getresponse() 35 | response=r.read() 36 | return response.decode('utf8') 37 | except Exception as e: 38 | message = "Failed to connect: "+str(e) 39 | if r: 40 | message += "Error %d" % r.status 41 | print(message) 42 | return None 43 | 44 | # open input file 45 | try: 46 | f = open(args.inputfile, 'r') 47 | inputText = f.read().strip() 48 | print("source: %s" % inputText) 49 | except IOError: 50 | print("Failed to open input file (%s)" % args.inputfile) 51 | sys.exit(1) 52 | 53 | # map input file format to translation model format 54 | # ex: Les onze prétendants à l'Elysée s'affrontent mardi ==> l e s | o n z e | p r é t e n d a n t s | à | l e l y s é e | s a f f r o n t e n t | m a r d i 55 | inputText = re.sub("[^\w\s]|[0-9]", "", inputText.lower()) # clean extra spaces and digits + lowercase 56 | inputText = re.sub('\s+', '|', inputText) # use pipe as word separator (for grapheme-to-phoneme conversion) 57 | inputText = " ".join(inputText) # tokenize: separate letters by spaces 58 | 59 | # send translation request 60 | rep=translate("%s"%inputText) 61 | 62 | if (not rep): 63 | print ("Failed to translate: "+ str(rep)) 64 | else: 65 | print ("target: %s" % rep) 66 | 67 | -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | Introduction 2 | ------------ 3 | 4 | **nmtpy** is a suite of Python tools, primarily based on the starter code provided in [dl4mt-tutorial](https://github.com/nyu-dl/dl4mt-tutorial) 5 | for training neural machine 
translation models using Theano. 6 | 7 | The basic motivation behind forking **dl4mt-tutorial** was to create a framework where it would be 8 | easy to implement a new model by *merely* creating a new Python file. 9 | 10 | Features 11 | ----- 12 | 13 | ### General 14 | - No shell script, everything is in Python 15 | - Overhaul object-oriented refactoring of the code: clear separation of API and scripts that interface with the API 16 | - INI style configuration files to define everything regarding a training experiment 17 | - Transparent cleanup mechanism to kill stale processes, remove temporary files 18 | - Simultaneous logging of training details to stdout and log file 19 | 20 | - Supports out-of-the-box BLEU, METEOR and COCO eval metrics 21 | - Includes [subword-nmt](https://github.com/rsennrich/subword-nmt) utilities for training and applying BPE model 22 | - Plugin-like text filters for hypothesis post-processing (Example: BPE, Compound, Desegment) 23 | - Early-stopping and checkpointing based on perplexity, BLEU or METEOR 24 | - Ability to add new metrics easily 25 | - Single `.npz` file to store everything about a training experiment 26 | - Automatic free GPU selection and reservation using `nvidia-smi` 27 | - Shuffling between epochs 28 | - Simple shuffle 29 | - [Homogeneous batches of same-length samples](https://github.com/kelvinxu/arctic-captions) to improve training speed 30 | - Improved parallel translation decoding on CPU 31 | - Forced decoding i.e. rescoring using NMT 32 | - Export decoding informations into `json` for further visualization of attention weights 33 | 34 | ### Training 35 | - Improved numerical stability and reproducibility 36 | - Glorot/Xavier, He, Orthogonal weight initializations 37 | - Efficient SGD, Adadelta, RMSProp and ADAM 38 | - Single forward/backward theano function without intermediate variables 39 | - Initialization of a model with weights from another nmtpy model 40 | - Ability to freeze pre-trained weights 41 | - Several recurrent blocks: 42 | - GRU, Conditional GRU (CGRU) and LSTM 43 | - Multimodal attentive CGRU variants 44 | - [Layer Normalization](https://github.com/ryankiros/layer-norm) support for GRU 45 | - [Tied target embeddings](https://arxiv.org/abs/1608.05859) 46 | - Simple/Non-recurrent Dropout, L2 weight decay 47 | - Training and validation loss normalization for comparable perplexities 48 | -------------------------------------------------------------------------------- /docs/logo.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lium-lst/nmtpy/dc0a1618f217d5117d6abeacdc15a22443561acf/docs/logo.pdf -------------------------------------------------------------------------------- /docs/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lium-lst/nmtpy/dc0a1618f217d5117d6abeacdc15a22443561acf/docs/logo.png -------------------------------------------------------------------------------- /docs/pages/config.md: -------------------------------------------------------------------------------- 1 | Configuration Files 2 | -- 3 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | ## WMT Shared Task on Multimodal Translation 4 | 5 | - [WMT17 Multimodal Translation (Task 1)](https://github.com/lium-lst/wmt17-mmt) 6 | - [WMT16 Cross-lingual Image 
Description Generation (Task 2)](wmt16-mmt-task2) : Monomodal and Multimodal 7 | 8 | **Note:** All textual data provided in the `data/` folders of the above examples are the courtesy of the following work 9 | and can be downloaded from [here](http://www.statmt.org/wmt17/multimodal-task.html). 10 | 11 | If you use `fusion_*` multimodal architectures in your work, please cite the following 12 | article: 13 | 14 | ``` 15 | @article{caglayan2016multimodal, 16 | title={Multimodal Attention for Neural Machine Translation}, 17 | author={Caglayan, Ozan and Barrault, Lo{\"\i}c and Bougares, Fethi}, 18 | journal={arXiv preprint arXiv:1609.03976}, 19 | year={2016} 20 | } 21 | ``` 22 | 23 | ### Getting the Image Features 24 | 25 | For multimodal baselines, you will need the convolutional features extracted 26 | from a pre-trained ResNet-50. You can download these files from the links below: 27 | 28 | - [flickr30k_ResNets50_blck4_train.fp16.npy.xz](http://www-lium.univ-lemans.fr/sites/default/files/NMTPY/flickr30k_ResNets50_blck4_train.fp16.npy.xz) (6GB) 29 | - [flickr30k_ResNets50_blck4_val.fp16.npy.xz](http://www-lium.univ-lemans.fr/sites/default/files/NMTPY/flickr30k_ResNets50_blck4_val.fp16.npy.xz) (214M) 30 | - [flickr30k_ResNets50_blck4_test.fp16.npy.xz](http://www-lium.univ-lemans.fr/sites/default/files/NMTPY/flickr30k_ResNets50_blck4_test.fp16.npy.xz) (211M) 31 | 32 | After downloading the files, extract them using the following command: 33 | 34 | ``` 35 | xz -d 36 | ``` 37 | 38 | Each `.npy` file contains 14x14x1024 convolutional feature maps for each image 39 | which are extracted from **res4f_relu** layer of a ResNet-50 trained on ImageNet: 40 | (The `fp16` suffix means that the `dtype` is `float16`.) 41 | 42 | ``` 43 | >> valfeats = numpy.load('flickr30k_ResNets50_blck4_val.fp16.npy') 44 | >> valfeats.shape 45 | (1014, 196, 1024) 46 | # 1014: n_samples 47 | # 196: flattened 14x14 into 196 for convenience 48 | # 1024: n_feature_maps 49 | ``` 50 | 51 | For more information about the image features, please refer to: 52 | 53 | ``` 54 | @article{caglayan2016does, 55 | title={Does Multimodality Help Human and Machine for Translation and Image Captioning?}, 56 | author={Caglayan, Ozan and Aransa, Walid and Wang, Yaxing and Masana, Marc and Garc{\'\i}a-Mart{\'\i}nez, Mercedes and Bougares, Fethi and Barrault, Lo{\"\i}c and van de Weijer, Joost}, 57 | journal={arXiv preprint arXiv:1605.09186}, 58 | year={2016} 59 | } 60 | ``` 61 | 62 | ## Factored Neural Machine Translation system 63 | 64 | The Factored NMT models defined by basefnmt.py are based on the NMT architecture and extended to be able to generate several output symbols at the same time (Figure http://www-lium.univ-lemans.fr/~garcia/fnmt_archi.pdf). 65 | 66 | Folder `ted-factors` contains examples of how to use this system. 67 | 68 | ### Citation: 69 | 70 | If you use fnmt system in your work, please cite the following: 71 | 72 | ``` 73 | @inproceedings{garcia-martinez2016fnmt, 74 | title={Factored Neural Machine Translation Architectures}, 75 | author={Garc{\'\i}a-Mart{\'\i}nez, Mercedes and Barrault, Lo{\"\i}c and Bougares, Fethi}, 76 | booktitle={Proceedings of the International Workshop on Spoken Language Translation (IWSLT)}, 77 | year={2016}, 78 | url={'http://workshop2016.iwslt.org/downloads/IWSLT_2016_paper_2.pdf'} 79 | } 80 | ``` 81 | 82 | Contact: Mercedes.Garcia_Martinez@univ-lemans.fr. 
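Returning to the image features described in *Getting the Image Features* above: the second axis of each `.npy` file is the 14x14 grid flattened to 196 positions, so it can be reshaped back whenever the spatial layout is needed. A minimal sketch (plain NumPy, nothing nmtpy-specific; the file name assumes you decompressed the validation features with `xz -d` as shown above):

```python
import numpy as np

# Convolutional features: (n_samples, 196, 1024), stored as float16
feats = np.load('flickr30k_ResNets50_blck4_val.fp16.npy')
print(feats.shape, feats.dtype)

# Recover the 14x14 spatial grid of 1024-dim feature vectors for the first image
first_image = feats[0].reshape(14, 14, 1024)
print(first_image.shape)   # (14, 14, 1024)
```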
83 | -------------------------------------------------------------------------------- /examples/ted-factors/README.md: -------------------------------------------------------------------------------- 1 | # Factored Neural Machine Translation system 2 | 3 | The Factored NMT models defined by ```basefnmt.py``` are based on the NMT architecture and extended to be able to generate several output symbols at the same time (Figure http://www-lium.univ-lemans.fr/~garcia/fnmt_archi.pdf). 4 | 5 | The decoder has been modified respect to the baseline model with the following items: 6 | 7 | - Specialized iterator named ```factors.py``` that handles multiple inputs and outputs text streams. 8 | - Additional softmax and embedding for the 2nd output. 9 | - Concatenation of the embeddings of the generated tokens at previous timestep to feedback the generation of the current token. 10 | - Sum of costs coming from each output. 11 | - Constriction of the length of the 2nd output sequence to be equal to the length of the 1st output sequence. 12 | Firstly, we included a new mask excluding the end of sequence (\tm{EOS}) symbols to avoid shorter sequences. 13 | Secondly, we limited the maximum length of the 2nd output sequence to the length of the 1st output sequence. 14 | - The beam search has been modified to be able to handle the multiple outputs. 15 | Once we obtain the hypothesis from lemmas (1st output) and factors (2nd output) at stage 1 of the Figure http://www-lium.univ-lemans.fr/~garcia/beamsearch.pdf, the cross product of those output spaces is performed. 16 | Afterwards, we keep the beam size best combinations for each hypothesis. 17 | Finally, the number of samples is reduced again to the beam size. 18 | - Translation generation executed by ```nmt-translate-factors``` which can handle multiple outputs. 19 | - Optionally, \tm{factors2wordbleu.py} metric is available to evaluate with BLEU the combination of the several outputs. 20 | A script detailed in the configuration file is necessary to apply this metric. 21 | 22 | ## TED data 23 | 24 | - Download [examples-ted-data.tar.bz2](http://www-lium.univ-lemans.fr/~garcia/examples-ted-data.tar.bz2) and extract it into the `data/` folder. 25 | 26 | - Build the vocabulary dictionaries for each train file: 27 | 28 | `nmt-build-dict train_file` 29 | 30 | - Option factors enable the factored system. 31 | Factors parameter gets as argument `evalf` which will evaluate the model just with the first output or a script to combine the 2 outputs as desired. 32 | 33 | This script will need as arguments `lang, first_output_hyp_file, second_output_hyp_file, reference_file` in this order and will print the corresponding BLEU score. 34 | 35 | ## FNMT Training 36 | 37 | Run `nmt-train -c attention_factors-ted-en-fr.conf` to train a FNMT on this corpus. 38 | 39 | ## FNMT Translation 40 | 41 | When the training is over, you can translate the test set using the following command: 42 | 43 | ``` 44 | nmt-translate-factors -m ~/nmtpy/models/ \ 45 | -S ~/nmtpy/examples/ted-factors/data/dev.en \ 46 | -R ~/nmtpy/examples/ted-factors/data/dev.fr \ 47 | ~/nmtpy/examples/ted-factors/data/dev.lemma.fr \ 48 | ~/nmtpy/examples/ted-factors/data/dev.factors.fr \ 49 | -o trans_dev.lemma.fr trans_dev.factors.fr \ 50 | -fa evalf 51 | ``` 52 | The option -R needs the references of the word-level, first output and second output, repectively. 53 | 54 | In -fa option you can include your script to combine both outputs if desired instead of evalf option. 
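For reference, such a script only needs to respect the calling convention described above (`lang first_output_hyp_file second_output_hyp_file reference_file`) and print a BLEU score. The sketch below is **not** part of nmtpy and only illustrates that contract: `combine()` is a placeholder that simply glues each lemma to its factor tag and should be replaced by your own lemma+factors word-generation logic, and the scoring step assumes `multi-bleu.perl` from mosesdecoder is available in your `$PATH`.

```python
#!/usr/bin/env python
# Hypothetical '-fa' script, called as:
#   script.py <lang> <first_output_hyp_file> <second_output_hyp_file> <reference_file>
# and expected to print a BLEU score.
import sys
import subprocess

def combine(lemma_line, factor_line):
    # Placeholder recombination: pair each lemma with its factor tag.
    # Replace with your own lemma+factors -> word generation.
    return ' '.join('%s|%s' % (l, f)
                    for l, f in zip(lemma_line.split(), factor_line.split()))

if __name__ == '__main__':
    # 'lang' is part of the calling convention but unused in this sketch
    lang, hyp1_file, hyp2_file, ref_file = sys.argv[1:5]

    # Recombine the two hypothesis streams into word-level hypotheses
    comb_file = hyp1_file + '.words'
    with open(hyp1_file) as f1, open(hyp2_file) as f2, open(comb_file, 'w') as fo:
        for lemmas, factors in zip(f1, f2):
            fo.write(combine(lemmas.strip(), factors.strip()) + '\n')

    # Score against the word-level reference (multi-bleu.perl assumed in $PATH)
    with open(comb_file) as hyp:
        out = subprocess.check_output(['multi-bleu.perl', ref_file], stdin=hyp)

    # multi-bleu.perl prints e.g. 'BLEU = 25.30, ...': keep just the score
    print(out.decode('utf-8').split(',')[0].split('=')[1].strip())
```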
55 | 56 | 57 | ## Citation: 58 | If you use `fnmt` system in your work, please cite the following: 59 | 60 | ``` 61 | @inproceedings{garcia-martinez2016fnmt, 62 | title={Factored Neural Machine Translation Architectures}, 63 | author={Garc{\'\i}a-Mart{\'\i}nez, Mercedes and Barrault, Lo{\"\i}c and Bougares, Fethi}, 64 | booktitle={arXiv preprint arXiv:1605.09186}, 65 | year={2016} 66 | } 67 | ``` 68 | 69 | More info: 70 | http://workshop2016.iwslt.org/downloads/IWSLT_2016_paper_2.pdf 71 | 72 | Contact: Mercedes.Garcia_Martinez@univ-lemans.fr. 73 | 74 | 75 | -------------------------------------------------------------------------------- /examples/ted-factors/attention_factors-ted-en-fr.conf: -------------------------------------------------------------------------------- 1 | [training] 2 | # Main .py file which will be used for the model 3 | model_type: attention_factors 4 | # how much validation period will we wait 5 | # to do early stopping 6 | patience: 10 7 | # Maximum number of epochs before stopping training 8 | max_epochs: 20 9 | # Validation start in terms of epochs 10 | # validation frequency in terms of minibatch updates 11 | valid_start: 2 12 | valid_freq: 5000 13 | # Save the hypothesis file for each validation 14 | valid_save_hyp: True 15 | # 0: no, otherwise weight decay factor 16 | decay_c: 0 17 | # -1: no, otherwise maximum gradient norm 18 | clip_c: 1.0 19 | seed: 1234 20 | 21 | [model] 22 | # Using the same embedding for output and previous 23 | tied_emb: 2way 24 | layer_norm: False 25 | # Sort batches by target length or not 26 | shuffle_mode: None 27 | # 0: no, otherwise dropout probability 28 | dropout: 0.0 29 | 30 | # Embedding vector dimension 31 | embedding_dim: 620 32 | 33 | # RNN's hidden layer dimension 34 | rnn_dim: 1000 35 | enc_type: gru 36 | dec_type: gru_cond 37 | 38 | # Number of jobs while translating 39 | njobs: 15 40 | 41 | # adadelta, adam, sgd or rmsprop 42 | optimizer: adadelta 43 | 44 | # Learning rate (only for SGD) 45 | lrate: 1 46 | 47 | # batch size 48 | batch_size: 80 49 | 50 | #Normalization of the cost 51 | norm_cost: False 52 | 53 | # Use BLEU as additional validation metric 54 | valid_metric: bleu 55 | 56 | # Script to combine output factors or 'evalf' to evaluate just with the first output 57 | factors: evalf 58 | 59 | weight_init: xavier 60 | 61 | # 0: use all vocabulary, otherwise upper limit as integer 62 | n_words_src: 30000 63 | n_words_trg1: 30000 64 | n_words_trg2: 0 65 | 66 | # Where to save model params, weights and training log file 67 | save_path: ~/nmtpy/models 68 | 69 | [model.dicts] 70 | src: ~/nmtpy/examples/ted-factors/data/train.en.vocab.pkl 71 | trg1: ~/nmtpy/examples/ted-factors/data/train.lemma.fr.vocab.pkl 72 | trg2: ~/nmtpy/examples/ted-factors/data/train.factors.fr.vocab.pkl 73 | 74 | [model.data] 75 | train_src: ~/nmtpy/examples/ted-factors/data/train.en 76 | train_trg1: ~/nmtpy/examples/ted-factors/data/train.lemma.fr 77 | train_trg2: ~/nmtpy/examples/ted-factors/data/train.factors.fr 78 | valid_src: ~/nmtpy/examples/ted-factors/data/dev.en 79 | valid_trg: ~/nmtpy/examples/ted-factors/data/dev.fr 80 | valid_trg1: ~/nmtpy/examples/ted-factors/data/dev.lemma.fr 81 | valid_trg2: ~/nmtpy/examples/ted-factors/data/dev.factors.fr 82 | 83 | -------------------------------------------------------------------------------- /examples/ted-factors/data/README.md: -------------------------------------------------------------------------------- 1 | Extract the downloaded `examples-ted-data.tar.bz2` inside here. 
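Once the archive is extracted here and the dictionaries have been built with `nmt-build-dict` (see the parent README), the resulting `.vocab.pkl` files referenced by `attention_factors-ted-en-fr.conf` can be sanity-checked directly. A small sketch, assuming the layout produced by `nmt-build-dict`: an ordered token-to-index mapping with two reserved special tokens at indices 0 and 1, and the remaining tokens sorted by decreasing frequency from index 2.

```python
import pickle

# One of the dictionaries listed under [model.dicts] in the example configuration
with open('train.lemma.fr.vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)

print('vocabulary size:', len(vocab))

# First entries: the two reserved tokens, then the most frequent tokens
for token, idx in list(vocab.items())[:5]:
    print(idx, token)
```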
2 | -------------------------------------------------------------------------------- /examples/wmt16-mmt-task2/README.md: -------------------------------------------------------------------------------- 1 | # WMT16 Shared task on Multimodal Translation 2 | ## Task 2 - Cross-lingual Image Description Generation 3 | 4 | ### Multi30k Dataset 5 | 6 | A copy of the original text files for Task 2 are available under `data/`. These files are downloaded 7 | from [WMT16 Multimodal Task](http://www.statmt.org/wmt16/multimodal-task.html) webpage. 8 | 9 | (**Note:** If you would like to fix some mistakes in the corpora, you can apply [this patch](data/fix-corpus-bugs.patch) before proceeding. 10 | 11 | ### Normalization and Tokenization 12 | 13 | Make sure that the following scripts from the `mosesdecoder` project are in your `$PATH`: 14 | - tokenizer.perl 15 | - normalize-punctuation.perl 16 | 17 | Run `scripts/01-tokenize.sh ~/nmtpy/data/wmt16-task2` to: 18 | - Normalize punctuations 19 | - Tokenize 20 | 21 | train, val and test files from `data/` and save them under `~/nmtpy/data/wmt16-task2`. 22 | **Note that** the output folder is in accordance with the configuration file 23 | `wmt16-task2-monomodal.conf` so if you use another output folder, change the configuration 24 | file as well. 25 | 26 | ### Preparing Data 27 | 28 | `scripts/02-prepare.py` is a Python script that consumes all the tokenized data produces: 29 | - processed text files 30 | - `pkl` files to be used by `WMTIterator` 31 | - `nmtpy` dictionary files for source and target vocabularies 32 | 33 | You can run the following command to prepare above files: 34 | ``` 35 | ODIR=~/nmtpy/data/wmt16-task2 36 | scripts/02-prepare.py -i data/split_all.txt \ 37 | -t $ODIR/train.*.en -T $ODIR/train.*.de \ 38 | -v $ODIR/val.*.en -V $ODIR/val.*.de \ 39 | -e $ODIR/test.*.en -E $ODIR/test.*.de \ 40 | -l -s -d 5 -o $ODIR # lowercase, strippunct, minwordoccurrence5 41 | ``` 42 | 43 | The produced `.pkl` data files contain a `list` of samples for each of the train/val/test sets 44 | where a sample is represented with: 45 | - `ssplit`: An integer between 0-4 representing from which file the source sentence came from 46 | - `tsplit`: An integer between 0-4 representing from which file the source sentence came from 47 | - `imgid`: An integer between 0-(N-1) representing the order of the image for a set containing N images 48 | - `imgname`: The name of the JPG image file 49 | - `swords`: List of source words 50 | - `twords`: List of target words 51 | 52 | Let's see with a concrete example: 53 | ```bash 54 | cd $ODIR 55 | ipython 56 | ``` 57 | 58 | ```python 59 | ... 60 | In [1]: import pickle 61 | 62 | In [2]: v = pickle.load(open('flickr_30k_align.valid.pkl')) 63 | 64 | In [3]: len(v) 65 | Out[3]: 25350 66 | 67 | In [4]: v[0] 68 | Out[4]: 69 | [0, 70 | 0, 71 | 0, 72 | '1018148011.jpg', 73 | [u'a', 74 | u'group', 75 | u'of', 76 | u'people', 77 | u'stand', 78 | u'in', 79 | u'the', 80 | u'back', 81 | u'of', 82 | u'a', 83 | u'truck', 84 | u'filled', 85 | u'with', 86 | u'cotton'], 87 | [u'baumwolllager', u'mit', u'lkw']] 88 | ``` 89 | 90 | A clarification should be made about the number of samples in a set: since we have 5 source and 5 target sentences for each image, the script generates `5x5=25` comparable pairs for a single image. Since the validation set contains 1014 images, this makes a total of `25*1014=25350` samples. 
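A short sketch to check this layout on the validation pickle, using the field order shown in the sample above (`ssplit, tsplit, imgid, imgname, swords, twords`):

```python
import pickle
from collections import Counter

with open('flickr_30k_align.valid.pkl', 'rb') as f:
    samples = pickle.load(f)

print(len(samples))                         # 25350 = 25 * 1014

# 25 source/target combinations per image
per_image = Counter(s[3] for s in samples)  # s[3] is imgname
print(len(per_image))                       # 1014 distinct images
print(set(per_image.values()))              # {25}

# The 5 aligned description pairs per image share the same split index on both sides
pairs = [s for s in samples if s[0] == s[1]]
print(len(pairs))                           # 5 * 1014 = 5070
```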
91 | 92 | During training, you can select whether you would like to use: 93 | - All 25 comparable pairs for an image (`data_mode:all`) 94 | - 5 comparable pairs for an image (**default:** `data_mode:pairs`) 95 | - `(.1.en, .1.de), (.2.en, .2.de), ..., (.5.en, .5.de)` 96 | - Just one pair from the first pair of files: `.1.en -> .1.de` (`data_mode:single`) 97 | 98 | During early-stopping, we use by default `single` for validation to only consider the description pairs from `.1.en, .1.de` resulting in 1014 images-captions. 99 | 100 | ### Train a Monomodal NMT 101 | 102 | Run `nmt-train -c wmt16-task2-monomodal.conf` to train a monomodal NMT on this 103 | corpus. When the training is over, you can translate the test set using the following command: 104 | 105 | ``` 106 | nmt-translate -m ~/nmtpy/models/wmt16-mmt-task2-monomodal/ 107 | -S ~/nmtpy/data/wmt16-task2/flickr_30k_align.test.pkl -v pairs \ 108 | -o test_monomodal.tok.de 109 | ``` 110 | 111 | The flag `-v pairs` will generate 5 hypotheses for each image using each source description and 112 | pick the one having the maximum likelihood based on NMT score. 113 | 114 | ### Train a Multimodal NMT 115 | 116 | #### Image Features 117 | 118 | You need to [download](../README.md) ResNet-50 convolutional feature files, uncompress them and save 119 | under `~/nmtpy/data/wmt16-task2`. 120 | 121 | Run `nmt-train -c wmt16-task2-multimodal.conf` to train a `fusion_concat_dep_ind` architecture. 122 | When the training is over, you can translate the test set with the following command: 123 | 124 | ``` 125 | nmt-translate -m ~/nmtpy/models/wmt16-mmt-task2-multimodal/ \ 126 | -S ~/nmtpy/data/wmt16-task2/flickr_30k_align.test.pkl \ 127 | ~/nmtpy/data/wmt16-task2/flickr30k_ResNets50_blck4_test.fp16.npy -v pairs \ 128 | -o test_multimodal.tok.de 129 | -------------------------------------------------------------------------------- /examples/wmt16-mmt-task2/data/README.md: -------------------------------------------------------------------------------- 1 | Multi30k dataset 2 | --- 3 | 4 | This is a reorganized folder containing extracted and renamed files from 5 | the original WMT16 Multimodal Translation Task 2 train/dev/test splits, namely 6 | the Multi30k dataset. 7 | 8 | Original files can be downloaded from [here](http://www.statmt.org/wmt16/multimodal-task.html) 9 | 10 | The files are organized as follows: 11 | - `train.[1-5].{en,de}`: 5 splits of training set each having 29K sentences 12 | - `val.[1-5].{en,de}`: 5 splits of dev set each having 1014 sentences 13 | - `test.[1-5].{en,de}`: 5 splits of test set each having 1000 sentences 14 | - `split_*.txt`: Text files containing sentence to image name mapping for each sets. 15 | 16 | (The patch file `fix-corpus-bugs.patch` can be applied on top of these files to fix some bugs 17 | in the corpus.) 
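A typical invocation would be something like the following (a sketch only; the `-p` strip level and working directory depend on how the paths inside the patch are written, so adjust them if `patch` cannot locate the files):

```
cd examples/wmt16-mmt-task2
patch -p1 < data/fix-corpus-bugs.patch
```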
18 | -------------------------------------------------------------------------------- /examples/wmt16-mmt-task2/scripts/01-tokenize.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | OUT=$1 3 | 4 | if [ -z $OUT ]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | mkdir $OUT &> /dev/null 10 | 11 | for lang in en de; do 12 | for f in $(ls --color=none data/*$lang); do 13 | fname=`basename $f` 14 | fname=${fname/\.$lang/} 15 | echo "Normalizing punctuation and tokenizing $f" 16 | cat $f | normalize-punctuation.perl -l $lang | tokenizer.perl -threads 8 -l $lang > $OUT/"$fname.norm.tok.$lang" 17 | done 18 | done 19 | -------------------------------------------------------------------------------- /examples/wmt16-mmt-task2/wmt16-mmt-task2-monomodal.conf: -------------------------------------------------------------------------------- 1 | [training] 2 | # This is the attention NMT with WMT iterator 3 | model_type: attention_wmt 4 | patience: 10 5 | max_epochs: 100 6 | valid_freq: 0 7 | valid_metric: meteor 8 | decay_c: 1e-5 9 | clip_c: 5 10 | seed: 1234 11 | 12 | [model] 13 | tied_emb: 2way 14 | layer_norm: True 15 | shuffle_mode: trglen 16 | embedding_dim: 100 17 | rnn_dim: 100 18 | 19 | optimizer: adam 20 | lrate: 0.0004 21 | weight_init: xavier 22 | batch_size: 32 23 | 24 | n_words_src: 0 25 | 26 | # Only the most 10K, other -> UNK 27 | n_words_trg: 10000 28 | 29 | save_path: ~/nmtpy/models 30 | 31 | [model.dicts] 32 | src: ~/nmtpy/data/wmt16-task2/train_src.pkl 33 | trg: ~/nmtpy/data/wmt16-task2/train_trg.pkl 34 | 35 | [model.data] 36 | train_src: ~/nmtpy/data/wmt16-task2/flickr_30k_align.train.pkl 37 | valid_src: ~/nmtpy/data/wmt16-task2/flickr_30k_align.valid.pkl 38 | valid_trg: ~/nmtpy/data/wmt16-task2/valid.*.tok.lc.nopunct.de 39 | -------------------------------------------------------------------------------- /examples/wmt16-mmt-task2/wmt16-mmt-task2-multimodal.conf: -------------------------------------------------------------------------------- 1 | [training] 2 | model_type: fusion_concat_dep_ind 3 | patience: 10 4 | max_epochs: 100 5 | valid_freq: 0 6 | valid_metric: meteor 7 | decay_c: 1e-5 8 | clip_c: 5 9 | seed: 1234 10 | 11 | [model] 12 | tied_emb: False 13 | layer_norm: False 14 | shuffle_mode: trglen 15 | embedding_dim: 620 16 | rnn_dim: 1000 17 | conv_dim: 1024 18 | 19 | optimizer: adam 20 | lrate: 0.0004 21 | weight_init: xavier 22 | batch_size: 32 23 | 24 | n_words_src: 0 25 | 26 | # Only the most 10K, other -> UNK 27 | n_words_trg: 10000 28 | 29 | save_path: ~/nmtpy/models 30 | 31 | [model.dicts] 32 | src: ~/nmtpy/data/wmt16-task2/train_src.pkl 33 | trg: ~/nmtpy/data/wmt16-task2/train_trg.pkl 34 | 35 | [model.data] 36 | train_img: ~/nmtpy/data/wmt16-task2/flickr30k_ResNets50_blck4_train.fp16.npy 37 | valid_img: ~/nmtpy/data/wmt16-task2/flickr30k_ResNets50_blck4_val.fp16.npy 38 | train_src: ~/nmtpy/data/wmt16-task2/flickr_30k_align.train.pkl 39 | valid_src: ~/nmtpy/data/wmt16-task2/flickr_30k_align.valid.pkl 40 | valid_trg: ~/nmtpy/data/wmt16-task2/valid.*.tok.lc.nopunct.de 41 | -------------------------------------------------------------------------------- /nmtpy/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '1.0.0' 2 | -------------------------------------------------------------------------------- /nmtpy/cleanup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import 
sys 4 | import signal 5 | import atexit 6 | import traceback 7 | 8 | temp_files = set() 9 | subprocesses = set() 10 | 11 | def register_tmp_file(f): 12 | """Add new temp file to global set.""" 13 | temp_files.add(f) 14 | 15 | def register_proc(pid): 16 | """Add new process to global set.""" 17 | subprocesses.add(pid) 18 | 19 | def unregister_proc(pid): 20 | """Remove given PID from global set.""" 21 | subprocesses.remove(pid) 22 | 23 | def cleanup(): 24 | """Cleanup registered temp files and kill PIDs.""" 25 | for f in temp_files: 26 | try: 27 | os.unlink(f) 28 | except: 29 | pass 30 | 31 | for p in subprocesses: 32 | try: 33 | os.kill(p, signal.SIGTERM) 34 | except: 35 | pass 36 | 37 | def signal_handler(signum, frame): 38 | """Let Python call this when SIGINT or SIGTERM caught.""" 39 | cleanup() 40 | sys.exit(0) 41 | 42 | def register_exception_handler(logger, quit_on_exception=False): 43 | """Setup exception handler.""" 44 | 45 | def exception_handler(exctype, value, tb): 46 | """Let Python call this when an exception is uncaught.""" 47 | logger.info(''.join(traceback.format_exception(exctype, value, tb))) 48 | 49 | def exception_handler_quits(exctype, value, tb): 50 | """Let Python call this when an exception is uncaught.""" 51 | logger.info(''.join(traceback.format_exception(exctype, value, tb))) 52 | sys.exit(1) 53 | 54 | if quit_on_exception: 55 | sys.excepthook = exception_handler_quits 56 | else: 57 | sys.excepthook = exception_handler 58 | 59 | def register_handler(logger, _atexit=True, _signals=True, exception_quits=False): 60 | """Register atexit and signal handlers.""" 61 | if _atexit: 62 | # Register exit handler 63 | atexit.register(cleanup) 64 | 65 | if _signals: 66 | # Register SIGINT and SIGTERM 67 | signal.signal(signal.SIGINT, signal_handler) 68 | signal.signal(signal.SIGTERM, signal_handler) 69 | 70 | register_exception_handler(logger, exception_quits) 71 | -------------------------------------------------------------------------------- /nmtpy/cocoeval/README.md: -------------------------------------------------------------------------------- 1 | pycocoevalcap 2 | --- 3 | 4 | This is a copy from 5 | https://github.com/tylin/coco-caption/tree/master/pycocoevalcap 6 | -------------------------------------------------------------------------------- /nmtpy/cocoeval/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /nmtpy/cocoeval/bleu/LICENSE.bleu: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Xinlei Chen, Hao Fang, Tsung-Yi Lin, and Ramakrishna Vedantam 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /nmtpy/cocoeval/bleu/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /nmtpy/cocoeval/bleu/bleu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # File Name : bleu.py 3 | # 4 | # Description : Wrapper for BLEU scorer. 5 | # 6 | # Creation Date : 06-01-2015 7 | # Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT 8 | # Authors : Hao Fang and Tsung-Yi Lin 9 | 10 | from .bleu_scorer import BleuScorer 11 | 12 | class Bleu: 13 | def __init__(self, n=4): 14 | # default compute Blue score up to 4 15 | self._n = n 16 | self._hypo_for_image = {} 17 | self.ref_for_image = {} 18 | 19 | def compute_score(self, gts, res): 20 | 21 | bleu_scorer = BleuScorer(n=self._n) 22 | for id in sorted(gts.keys()): 23 | hypo = res[id] 24 | ref = gts[id] 25 | 26 | # Sanity check. 27 | assert(type(hypo) is list) 28 | assert(len(hypo) == 1) 29 | assert(type(ref) is list) 30 | assert(len(ref) >= 1) 31 | 32 | bleu_scorer += (hypo[0], ref) 33 | 34 | #score, scores = bleu_scorer.compute_score(option='shortest') 35 | #score, scores = bleu_scorer.compute_score(option='average', verbose=1) 36 | score, scores = bleu_scorer.compute_score(option='closest', verbose=0) 37 | 38 | # return (bleu, bleu_info) 39 | return score, scores 40 | 41 | def method(self): 42 | return "Bleu" 43 | -------------------------------------------------------------------------------- /nmtpy/cocoeval/cider/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /nmtpy/cocoeval/cider/cider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Filename: cider.py 3 | # 4 | # Description: Describes the class to compute the CIDEr (Consensus-Based Image Description Evaluation) Metric 5 | # by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726) 6 | # 7 | # Creation Date: Sun Feb 8 14:16:54 2015 8 | # 9 | # Authors: Ramakrishna Vedantam and Tsung-Yi Lin 10 | 11 | from .cider_scorer import CiderScorer 12 | 13 | class Cider(object): 14 | """ 15 | Main Class to compute the CIDEr metric 16 | 17 | """ 18 | def __init__(self, test=None, refs=None, n=4, sigma=6.0): 19 | # set cider to sum over 1 to 4-grams 20 | self._n = n 21 | # set the standard deviation parameter for gaussian penalty 22 | self._sigma = sigma 23 | 24 | def compute_score(self, gts, res): 25 | """ 26 | Main function to compute CIDEr score 27 | :param hypo_for_image (dict) : dictionary with key and value 28 | ref_for_image (dict) : dictionary with key and value 29 | :return: cider (float) : computed CIDEr score for the corpus 30 | """ 31 | 32 | cider_scorer = CiderScorer(n=self._n, sigma=self._sigma) 33 | 34 | for id in sorted(gts.keys()): 35 | hypo = res[id] 36 | ref = gts[id] 37 | 38 | # Sanity check. 
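            # (i.e. gts[id] must be a non-empty list of reference sentences and
            #  res[id] a single-element list holding the hypothesis string)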
39 | assert(type(hypo) is list) 40 | assert(len(hypo) == 1) 41 | assert(type(ref) is list) 42 | assert(len(ref) > 0) 43 | 44 | cider_scorer += (hypo[0], ref) 45 | 46 | (score, scores) = cider_scorer.compute_score() 47 | 48 | return score, scores 49 | 50 | def method(self): 51 | return "CIDEr" -------------------------------------------------------------------------------- /nmtpy/cocoeval/cider/cider_scorer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Tsung-Yi Lin 3 | # Ramakrishna Vedantam 4 | 5 | import copy 6 | from collections import defaultdict 7 | import numpy as np 8 | import math 9 | 10 | def precook(s, n=4, out=False): 11 | """ 12 | Takes a string as input and returns an object that can be given to 13 | either cook_refs or cook_test. This is optional: cook_refs and cook_test 14 | can take string arguments as well. 15 | :param s: string : sentence to be converted into ngrams 16 | :param n: int : number of ngrams for which representation is calculated 17 | :return: term frequency vector for occuring ngrams 18 | """ 19 | words = s.split() 20 | counts = defaultdict(int) 21 | for k in range(1,n+1): 22 | for i in range(len(words)-k+1): 23 | ngram = tuple(words[i:i+k]) 24 | counts[ngram] += 1 25 | return counts 26 | 27 | def cook_refs(refs, n=4): ## lhuang: oracle will call with "average" 28 | '''Takes a list of reference sentences for a single segment 29 | and returns an object that encapsulates everything that BLEU 30 | needs to know about them. 31 | :param refs: list of string : reference sentences for some image 32 | :param n: int : number of ngrams for which (ngram) representation is calculated 33 | :return: result (list of dict) 34 | ''' 35 | return [precook(ref, n) for ref in refs] 36 | 37 | def cook_test(test, n=4): 38 | '''Takes a test sentence and returns an object that 39 | encapsulates everything that BLEU needs to know about it. 40 | :param test: list of string : hypothesis sentence for some image 41 | :param n: int : number of ngrams for which (ngram) representation is calculated 42 | :return: result (dict) 43 | ''' 44 | return precook(test, n, True) 45 | 46 | class CiderScorer(object): 47 | """CIDEr scorer. 48 | """ 49 | 50 | def copy(self): 51 | ''' copy the refs.''' 52 | new = CiderScorer(n=self.n) 53 | new.ctest = copy.copy(self.ctest) 54 | new.crefs = copy.copy(self.crefs) 55 | return new 56 | 57 | def __init__(self, test=None, refs=None, n=4, sigma=6.0): 58 | ''' singular instance ''' 59 | self.n = n 60 | self.sigma = sigma 61 | self.crefs = [] 62 | self.ctest = [] 63 | self.document_frequency = defaultdict(float) 64 | self.cook_append(test, refs) 65 | self.ref_len = None 66 | 67 | def cook_append(self, test, refs): 68 | '''called by constructor and __iadd__ to avoid creating new instances.''' 69 | 70 | if refs is not None: 71 | self.crefs.append(cook_refs(refs)) 72 | if test is not None: 73 | self.ctest.append(cook_test(test)) ## N.B.: -1 74 | else: 75 | self.ctest.append(None) # lens of crefs and ctest have to match 76 | 77 | def size(self): 78 | assert len(self.crefs) == len(self.ctest), "refs/test mismatch! 
%d<>%d" % (len(self.crefs), len(self.ctest)) 79 | return len(self.crefs) 80 | 81 | def __iadd__(self, other): 82 | '''add an instance (e.g., from another sentence).''' 83 | 84 | if type(other) is tuple: 85 | ## avoid creating new CiderScorer instances 86 | self.cook_append(other[0], other[1]) 87 | else: 88 | self.ctest.extend(other.ctest) 89 | self.crefs.extend(other.crefs) 90 | 91 | return self 92 | def compute_doc_freq(self): 93 | ''' 94 | Compute term frequency for reference data. 95 | This will be used to compute idf (inverse document frequency later) 96 | The term frequency is stored in the object 97 | :return: None 98 | ''' 99 | for refs in self.crefs: 100 | # refs, k ref captions of one image 101 | for ngram in set([ngram for ref in refs for (ngram,count) in ref.items()]): 102 | self.document_frequency[ngram] += 1 103 | # maxcounts[ngram] = max(maxcounts.get(ngram,0), count) 104 | 105 | def compute_cider(self): 106 | def counts2vec(cnts): 107 | """ 108 | Function maps counts of ngram to vector of tfidf weights. 109 | The function returns vec, an array of dictionary that store mapping of n-gram and tf-idf weights. 110 | The n-th entry of array denotes length of n-grams. 111 | :param cnts: 112 | :return: vec (array of dict), norm (array of float), length (int) 113 | """ 114 | vec = [defaultdict(float) for _ in range(self.n)] 115 | length = 0 116 | norm = [0.0 for _ in range(self.n)] 117 | for (ngram,term_freq) in cnts.items(): 118 | # give word count 1 if it doesn't appear in reference corpus 119 | df = np.log(max(1.0, self.document_frequency[ngram])) 120 | # ngram index 121 | n = len(ngram)-1 122 | # tf (term_freq) * idf (precomputed idf) for n-grams 123 | vec[n][ngram] = float(term_freq)*(self.ref_len - df) 124 | # compute norm for the vector. the norm will be used for computing similarity 125 | norm[n] += pow(vec[n][ngram], 2) 126 | 127 | if n == 1: 128 | length += term_freq 129 | norm = [np.sqrt(n) for n in norm] 130 | return vec, norm, length 131 | 132 | def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref): 133 | ''' 134 | Compute the cosine similarity of two vectors. 
135 | :param vec_hyp: array of dictionary for vector corresponding to hypothesis 136 | :param vec_ref: array of dictionary for vector corresponding to reference 137 | :param norm_hyp: array of float for vector corresponding to hypothesis 138 | :param norm_ref: array of float for vector corresponding to reference 139 | :param length_hyp: int containing length of hypothesis 140 | :param length_ref: int containing length of reference 141 | :return: array of score for each n-grams cosine similarity 142 | ''' 143 | delta = float(length_hyp - length_ref) 144 | # measure consine similarity 145 | val = np.array([0.0 for _ in range(self.n)]) 146 | for n in range(self.n): 147 | # ngram 148 | for (ngram,count) in vec_hyp[n].items(): 149 | # vrama91 : added clipping 150 | val[n] += min(vec_hyp[n][ngram], vec_ref[n][ngram]) * vec_ref[n][ngram] 151 | 152 | if (norm_hyp[n] != 0) and (norm_ref[n] != 0): 153 | val[n] /= (norm_hyp[n]*norm_ref[n]) 154 | 155 | assert(not math.isnan(val[n])) 156 | # vrama91: added a length based gaussian penalty 157 | val[n] *= np.e**(-(delta**2)/(2*self.sigma**2)) 158 | return val 159 | 160 | # compute log reference length 161 | self.ref_len = np.log(float(len(self.crefs))) 162 | 163 | scores = [] 164 | for test, refs in zip(self.ctest, self.crefs): 165 | # compute vector for test captions 166 | vec, norm, length = counts2vec(test) 167 | # compute vector for ref captions 168 | score = np.array([0.0 for _ in range(self.n)]) 169 | for ref in refs: 170 | vec_ref, norm_ref, length_ref = counts2vec(ref) 171 | score += sim(vec, vec_ref, norm, norm_ref, length, length_ref) 172 | # change by vrama91 - mean of ngram scores, instead of sum 173 | score_avg = np.mean(score) 174 | # divide by number of references 175 | score_avg /= len(refs) 176 | # multiply score by 10 177 | score_avg *= 10.0 178 | # append score of an image to the score list 179 | scores.append(score_avg) 180 | return scores 181 | 182 | def compute_score(self, option=None, verbose=0): 183 | # compute idf 184 | self.compute_doc_freq() 185 | # assert to check document frequency 186 | assert(len(self.ctest) >= max(self.document_frequency.values())) 187 | # compute cider score 188 | score = self.compute_cider() 189 | return np.mean(np.array(score)), np.array(score) -------------------------------------------------------------------------------- /nmtpy/cocoeval/meteor/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /nmtpy/cocoeval/meteor/meteor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Python wrapper for METEOR implementation, by Xinlei Chen 3 | # Acknowledge Michael Denkowski for the generous discussion and help 4 | 5 | import os 6 | import threading 7 | import subprocess 8 | import pkg_resources 9 | 10 | METEOR_JAR = pkg_resources.resource_filename('nmtpy', 'external/meteor-1.5.jar') 11 | 12 | class Meteor(object): 13 | def __init__(self, language, norm=False): 14 | self.meteor_cmd = ['java', '-jar', '-Xmx2G', METEOR_JAR, '-', '-', '-stdio', '-l', language] 15 | self.env = os.environ 16 | self.env['LC_ALL'] = 'en_US.UTF_8' 17 | 18 | if norm: 19 | self.meteor_cmd.append('-norm') 20 | 21 | self.meteor_p = subprocess.Popen(self.meteor_cmd, stdin=subprocess.PIPE, \ 22 | stdout=subprocess.PIPE, stderr=subprocess.PIPE, 23 | env=self.env, universal_newlines=True, bufsize=1) 24 | # Used to guarantee 
thread safety 25 | self.lock = threading.Lock() 26 | 27 | def method(self): 28 | return "METEOR" 29 | 30 | def compute_score(self, gts, res): 31 | imgIds = sorted(list(gts.keys())) 32 | scores = [] 33 | 34 | eval_line = 'EVAL' 35 | self.lock.acquire() 36 | for i in imgIds: 37 | assert(len(res[i]) == 1) 38 | 39 | hypothesis_str = res[i][0].replace('|||', '').replace(' ', ' ') 40 | score_line = ' ||| '.join(('SCORE', ' ||| '.join(gts[i]), hypothesis_str)) 41 | 42 | # We obtained --> SCORE ||| reference 1 words ||| reference n words ||| hypothesis words 43 | self.meteor_p.stdin.write(score_line + '\n') 44 | stat = self.meteor_p.stdout.readline().strip() 45 | eval_line += ' ||| {}'.format(stat) 46 | 47 | # Send to METEOR 48 | self.meteor_p.stdin.write(eval_line + '\n') 49 | 50 | # Collect segment scores 51 | for i in range(len(imgIds)): 52 | score = float(self.meteor_p.stdout.readline().strip()) 53 | scores.append(score) 54 | 55 | # Final score 56 | final_score = 100*float(self.meteor_p.stdout.readline().strip()) 57 | self.lock.release() 58 | 59 | return final_score, scores 60 | 61 | def __del__(self): 62 | self.lock.acquire() 63 | self.meteor_p.stdin.close() 64 | self.meteor_p.wait() 65 | self.lock.release() 66 | -------------------------------------------------------------------------------- /nmtpy/cocoeval/rouge/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'vrama91' 2 | -------------------------------------------------------------------------------- /nmtpy/cocoeval/rouge/rouge.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # File Name : rouge.py 3 | # 4 | # Description : Computes ROUGE-L metric as described by Lin and Hovey (2004) 5 | # 6 | # Creation Date : 2015-01-07 06:03 7 | # Author : Ramakrishna Vedantam 8 | 9 | import numpy as np 10 | 11 | def my_lcs(string, sub): 12 | """ 13 | Calculates longest common subsequence for a pair of tokenized strings 14 | :param string : list of str : tokens from a string split using whitespace 15 | :param sub : list of str : shorter string, also split using whitespace 16 | :returns: length (list of int): length of the longest common subsequence between the two strings 17 | 18 | Note: my_lcs only gives length of the longest common subsequence, not the actual LCS 19 | """ 20 | if(len(string)< len(sub)): 21 | sub, string = string, sub 22 | 23 | lengths = [[0 for i in range(0,len(sub)+1)] for j in range(0,len(string)+1)] 24 | 25 | for j in range(1,len(sub)+1): 26 | for i in range(1,len(string)+1): 27 | if(string[i-1] == sub[j-1]): 28 | lengths[i][j] = lengths[i-1][j-1] + 1 29 | else: 30 | lengths[i][j] = max(lengths[i-1][j] , lengths[i][j-1]) 31 | 32 | return lengths[len(string)][len(sub)] 33 | 34 | class Rouge(): 35 | ''' 36 | Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set 37 | 38 | ''' 39 | def __init__(self): 40 | # vrama91: updated the value below based on discussion with Hovey 41 | self.beta = 1.2 42 | 43 | def calc_score(self, candidate, refs): 44 | """ 45 | Compute ROUGE-L score given one candidate and references for an image 46 | :param candidate: str : candidate sentence to be evaluated 47 | :param refs: list of str : COCO reference sentences for the particular image to be evaluated 48 | :returns score: int (ROUGE-L score for the candidate evaluated against references) 49 | """ 50 | assert(len(candidate)==1) 51 | assert(len(refs)>0) 52 | prec = [] 53 | rec = [] 54 | 55 | 
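        # ROUGE-L: for every reference r, prec = LCS(c, r) / len(c) and
        # rec = LCS(c, r) / len(r); the score is the F-measure over the best
        # precision/recall: F = (1 + beta^2) * P * R / (R + beta^2 * P), beta = 1.2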
# split into tokens 56 | token_c = candidate[0].split(" ") 57 | 58 | for reference in refs: 59 | # split into tokens 60 | token_r = reference.split(" ") 61 | # compute the longest common subsequence 62 | lcs = my_lcs(token_r, token_c) 63 | prec.append(lcs/float(len(token_c))) 64 | rec.append(lcs/float(len(token_r))) 65 | 66 | prec_max = max(prec) 67 | rec_max = max(rec) 68 | 69 | if(prec_max!=0 and rec_max !=0): 70 | score = ((1 + self.beta**2)*prec_max*rec_max)/float(rec_max + self.beta**2*prec_max) 71 | else: 72 | score = 0.0 73 | return score 74 | 75 | def compute_score(self, gts, res): 76 | """ 77 | Computes Rouge-L score given a set of reference and candidate sentences for the dataset 78 | Invoked by evaluate_captions.py 79 | :param hypo_for_image: dict : candidate / test sentences with "image name" key and "tokenized sentences" as values 80 | :param ref_for_image: dict : reference MS-COCO sentences with "image name" key and "tokenized sentences" as values 81 | :returns: average_score: float (mean ROUGE-L score computed by averaging scores for all the images) 82 | """ 83 | score = [] 84 | for id in sorted(gts.keys()): 85 | hypo = res[id] 86 | ref = gts[id] 87 | 88 | score.append(self.calc_score(hypo, ref)) 89 | 90 | # Sanity check. 91 | assert(type(hypo) is list) 92 | assert(len(hypo) == 1) 93 | assert(type(ref) is list) 94 | assert(len(ref) > 0) 95 | 96 | average_score = np.mean(np.array(score)) 97 | return average_score, np.array(score) 98 | 99 | def method(self): 100 | return "Rouge" 101 | -------------------------------------------------------------------------------- /nmtpy/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import glob 4 | 5 | from configparser import SafeConfigParser 6 | from argparse import Namespace 7 | from ast import literal_eval 8 | 9 | def _parse_value(value): 10 | # Check for boolean or None 11 | if value.capitalize().startswith(('False', 'True', 'None')): 12 | return eval(value.capitalize(), {}, {}) 13 | 14 | # Check for path, files 15 | elif value.startswith(('~', '/', '../', './')): 16 | real_path = os.path.realpath(os.path.expanduser(value)) 17 | if '*' in real_path: 18 | # Resolve wildcards if any 19 | files = glob.glob(real_path) 20 | if len(files) == 0: 21 | raise Exception('%s did not match any file.' % value) 22 | # Return list if multiple, single file if not 23 | return sorted(files) if len(files) > 1 else files[0] 24 | else: 25 | return real_path 26 | 27 | else: 28 | # Detect strings, floats and ints 29 | try: 30 | # If this fails, this is a string 31 | literal = literal_eval(value) 32 | except Exception as ve: 33 | return value 34 | else: 35 | # Did not fail => literal is a float or int now 36 | return literal 37 | 38 | def _get_section_dict(l): 39 | """l is a list of key-value tuples returned by ConfigParser.items(). 
40 | Convert it to a dictionary after inferring value types.""" 41 | return {key : _parse_value(value) for key,value in l} 42 | 43 | def _update_dict(d, defs): 44 | """Update d with key-values from defs IF key misses from d.""" 45 | for k,v in list(defs.items()): 46 | if k not in d: 47 | d[k] = v 48 | return d 49 | 50 | class Config(SafeConfigParser, object): 51 | """Custom parser inheriting from SafeConfigParser.""" 52 | 53 | def __init__(self, filename, trdefs=None, mddefs=None, override=None): 54 | # Call parent's __init__() 55 | super(self.__class__, self).__init__() 56 | 57 | # Use values from defaults.py when missing 58 | self._trdefs = trdefs if trdefs else {} 59 | self._mddefs = mddefs if mddefs else {} 60 | 61 | # dict that will override 62 | # this can contain both model and training args unfortunately. 63 | self._override = _get_section_dict(list(override.items())) \ 64 | if override else {} 65 | 66 | # Parse the file, raise if error 67 | if len(self.read(filename)) == 0: 68 | raise Exception('Could not parse configuration file.') 69 | 70 | def parse(self): 71 | """Parse everything and return 2 Namespace objects.""" 72 | # Convert training and model sections to dictionary 73 | trdict = _get_section_dict(self.items('training')) \ 74 | if 'training' in self.sections() else {} 75 | mddict = _get_section_dict(self.items('model')) \ 76 | if 'model' in self.sections() else {} 77 | 78 | # Update parsed sections with missing defaults 79 | trdict = _update_dict(trdict, self._trdefs) 80 | mddict = _update_dict(mddict, self._mddefs) 81 | 82 | for key, value in list(self._override.items()): 83 | assert not (key in trdict and key in mddict) 84 | if key in trdict: 85 | trdict[key] = value 86 | else: 87 | # everything else goes to model args 88 | mddict[key] = value 89 | 90 | # Finally merge model.* subsections into model 91 | for section in self.sections(): 92 | if section.startswith('model.'): 93 | subsection = section.split('.')[-1] 94 | mddict[subsection] = _get_section_dict(self.items(section)) 95 | 96 | return (Namespace(**trdict), Namespace(**mddict)) 97 | -------------------------------------------------------------------------------- /nmtpy/defaults.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Default data types 4 | INT = 'int64' 5 | FLOAT = 'float32' 6 | 7 | MODEL_DEFAULTS = { 8 | 'weight_init': 'xavier', # Can be a float for the scale of normal initialization, "xavier" or "he". 9 | 'batch_size': 32, # Training batch size 10 | 'optimizer': 'adam', # adadelta, sgd, rmsprop, adam 11 | 'lrate': None, # Initial learning rate. Defaults for each optimizer is different so value 12 | # will be initialized when building optimizer if None. 
13 | } 14 | 15 | TRAIN_DEFAULTS = { 16 | 'init': None, # Pretrained model .npz file 17 | 'device_id': 'auto', # 18 | 'seed': 1234, # RNG seed 19 | 'clip_c': 5., # Clip gradients above clip_c 20 | 'decay_c': 0., # L2 penalty factor 21 | 'patience': 10, # Early stopping patience 22 | 'patience_delta': 0., # Absolute difference that will be taken into account as improvement for valid metric 23 | 'max_epochs': 100, # Max number of epochs to train 24 | 'max_iteration': int(1e6), # Max number of updates to train 25 | 'valid_metric': 'bleu', # one or more metrics separated by comma, 1st one used for early-stopping 26 | 'valid_start': 1, # Epoch which validation will start 27 | 'valid_njobs': 16, # # of parallel CPU tasks to do beam-search 28 | 'valid_beam': 12, # Allow changing beam size during validation 29 | 'valid_freq': 0, # 0: End of epochs 30 | 'valid_save_hyp': False, # Save each output of validation to separate files 31 | 'snapshot_freq': 0, # Checkpoint frequency for resuming in terms of number of iterations 32 | 'disp_freq': 10, # Display training statistics after each disp_freq minibatches 33 | 'save_best_n': 4, # Always keep a set of 4 best validation models on disk 34 | 'save_timestamp': False, # Creates a subfolder for each experiment with timestamp prefix 35 | } 36 | -------------------------------------------------------------------------------- /nmtpy/external/data/README.md: -------------------------------------------------------------------------------- 1 | Download METEOR paraphrase files to here using `scripts/get-meteor-data.sh` 2 | before running `python setup.py install` 3 | -------------------------------------------------------------------------------- /nmtpy/external/meteor-1.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lium-lst/nmtpy/dc0a1618f217d5117d6abeacdc15a22443561acf/nmtpy/external/meteor-1.5.jar -------------------------------------------------------------------------------- /nmtpy/external/multi-bleu.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 5 | # 6 | # This is a verbatim copy of the original multi-bleu.perl at: 7 | # 8 | # commit ec71c2397bb8316110efb91067dfb4c66b843cf3 9 | # Author: Ulrich Germann 10 | # Date: Tue Nov 10 01:16:17 2015 +0000 11 | # Allow multiple reference files to be specified on the command line; handle gzipped reference files. 
12 | 13 | # $Id$ 14 | use warnings; 15 | use strict; 16 | 17 | my $lowercase = 0; 18 | if ($ARGV[0] eq "-lc") { 19 | $lowercase = 1; 20 | shift; 21 | } 22 | 23 | my $stem = $ARGV[0]; 24 | if (!defined $stem) { 25 | print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n"; 26 | print STDERR "Reads the references from reference or reference0, reference1, ...\n"; 27 | exit(1); 28 | } 29 | 30 | $stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0"; 31 | 32 | my @REF; 33 | my $ref=0; 34 | while(-e "$stem$ref") { 35 | &add_to_ref("$stem$ref",\@REF); 36 | $ref++; 37 | } 38 | &add_to_ref($stem,\@REF) if -e $stem; 39 | die("ERROR: could not find reference file $stem") unless scalar @REF; 40 | 41 | # add additional references explicitly specified on the command line 42 | shift; 43 | foreach my $stem (@ARGV) { 44 | &add_to_ref($stem,\@REF) if -e $stem; 45 | } 46 | 47 | 48 | 49 | sub add_to_ref { 50 | my ($file,$REF) = @_; 51 | my $s=0; 52 | if ($file =~ /.gz$/) { 53 | open(REF,"gzip -dc $file|") or die "Can't read $file"; 54 | } else { 55 | open(REF,$file) or die "Can't read $file"; 56 | } 57 | while() { 58 | chop; 59 | push @{$$REF[$s++]}, $_; 60 | } 61 | close(REF); 62 | } 63 | 64 | my(@CORRECT,@TOTAL,$length_translation,$length_reference); 65 | my $s=0; 66 | while() { 67 | chop; 68 | $_ = lc if $lowercase; 69 | my @WORD = split; 70 | my %REF_NGRAM = (); 71 | my $length_translation_this_sentence = scalar(@WORD); 72 | my ($closest_diff,$closest_length) = (9999,9999); 73 | foreach my $reference (@{$REF[$s]}) { 74 | # print "$s $_ <=> $reference\n"; 75 | $reference = lc($reference) if $lowercase; 76 | my @WORD = split(' ',$reference); 77 | my $length = scalar(@WORD); 78 | my $diff = abs($length_translation_this_sentence-$length); 79 | if ($diff < $closest_diff) { 80 | $closest_diff = $diff; 81 | $closest_length = $length; 82 | # print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n"; 83 | } elsif ($diff == $closest_diff) { 84 | $closest_length = $length if $length < $closest_length; 85 | # from two references with the same closeness to me 86 | # take the *shorter* into account, not the "first" one. 87 | } 88 | for(my $n=1;$n<=4;$n++) { 89 | my %REF_NGRAM_N = (); 90 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 91 | my $ngram = "$n"; 92 | for(my $w=0;$w<$n;$w++) { 93 | $ngram .= " ".$WORD[$start+$w]; 94 | } 95 | $REF_NGRAM_N{$ngram}++; 96 | } 97 | foreach my $ngram (keys %REF_NGRAM_N) { 98 | if (!defined($REF_NGRAM{$ngram}) || 99 | $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) { 100 | $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram}; 101 | # print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}
\n"; 102 | } 103 | } 104 | } 105 | } 106 | $length_translation += $length_translation_this_sentence; 107 | $length_reference += $closest_length; 108 | for(my $n=1;$n<=4;$n++) { 109 | my %T_NGRAM = (); 110 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 111 | my $ngram = "$n"; 112 | for(my $w=0;$w<$n;$w++) { 113 | $ngram .= " ".$WORD[$start+$w]; 114 | } 115 | $T_NGRAM{$ngram}++; 116 | } 117 | foreach my $ngram (keys %T_NGRAM) { 118 | $ngram =~ /^(\d+) /; 119 | my $n = $1; 120 | # my $corr = 0; 121 | # print "$i e $ngram $T_NGRAM{$ngram}
\n"; 122 | $TOTAL[$n] += $T_NGRAM{$ngram}; 123 | if (defined($REF_NGRAM{$ngram})) { 124 | if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) { 125 | $CORRECT[$n] += $T_NGRAM{$ngram}; 126 | # $corr = $T_NGRAM{$ngram}; 127 | # print "$i e correct1 $T_NGRAM{$ngram}
\n"; 128 | } 129 | else { 130 | $CORRECT[$n] += $REF_NGRAM{$ngram}; 131 | # $corr = $REF_NGRAM{$ngram}; 132 | # print "$i e correct2 $REF_NGRAM{$ngram}
\n"; 133 | } 134 | } 135 | # $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram}; 136 | # print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n" 137 | } 138 | } 139 | $s++; 140 | } 141 | my $brevity_penalty = 1; 142 | my $bleu = 0; 143 | 144 | my @bleu=(); 145 | 146 | for(my $n=1;$n<=4;$n++) { 147 | if (defined ($TOTAL[$n])){ 148 | $bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0; 149 | # print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n"; 150 | }else{ 151 | $bleu[$n]=0; 152 | } 153 | } 154 | 155 | if ($length_reference==0){ 156 | printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n"; 157 | exit(1); 158 | } 159 | 160 | if ($length_translation<$length_reference) { 161 | $brevity_penalty = exp(1-$length_reference/$length_translation); 162 | } 163 | $bleu = $brevity_penalty * exp((my_log( $bleu[1] ) + 164 | my_log( $bleu[2] ) + 165 | my_log( $bleu[3] ) + 166 | my_log( $bleu[4] ) ) / 4) ; 167 | printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n", 168 | 100*$bleu, 169 | 100*$bleu[1], 170 | 100*$bleu[2], 171 | 100*$bleu[3], 172 | 100*$bleu[4], 173 | $brevity_penalty, 174 | $length_translation / $length_reference, 175 | $length_translation, 176 | $length_reference; 177 | 178 | sub my_log { 179 | return -9999999999 unless $_[0]; 180 | return log($_[0]); 181 | } 182 | -------------------------------------------------------------------------------- /nmtpy/filters.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | 4 | class Filter(object): 5 | """Common Filter class for post-processing sentences.""" 6 | def __call__(self, inp): 7 | if isinstance(inp, str): 8 | # Apply to single sentence 9 | return self.process(inp) 10 | else: 11 | # Assume a sequence and apply to each 12 | return [self.process(e) for e in inp] 13 | 14 | def process(self, s): 15 | # Derived classes should implement this method 16 | return s 17 | 18 | class CompoundFilter(Filter): 19 | """Filters out fillers from compound splitted sentences.""" 20 | def process(self, s): 21 | return s.replace(" @@ ", "").replace(" @@", "").replace(" @", "").replace("@ ", "") 22 | 23 | class BPEFilter(Filter): 24 | """Filters out fillers from BPE applied sentences.""" 25 | def process(self, s): 26 | # The first replace misses lines ending with @@ 27 | # like 'foo@@ bar Hotel@@' 28 | return s.replace("@@ ", "").replace("@@", "") 29 | 30 | class DesegmentFilter(Filter): 31 | """Converts Turkish segmentations of to normal form.""" 32 | def process(self, s): 33 | return re.sub(' *<.*?:(.*?)>', '\\1', s) 34 | 35 | class Char2Words(Filter): 36 | """Converts a space delimited character sequence to 37 | normal word form. 
The output will be non-tokenized.""" 38 | def process(self, s): 39 | return s.replace(' ', '').replace('', ' ').strip() 40 | 41 | def get_filter(name): 42 | filters = { 43 | "bpe" : BPEFilter(), 44 | "char2words" : Char2Words(), 45 | "compound" : CompoundFilter(), 46 | "desegment" : DesegmentFilter(), 47 | } 48 | return filters.get(name, None) 49 | -------------------------------------------------------------------------------- /nmtpy/iterators/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /nmtpy/iterators/bitext.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | 4 | from ..sysutils import fopen 5 | from .iterator import Iterator 6 | from .homogeneous import HomogeneousData 7 | 8 | """Parallel text iterator for translation data.""" 9 | class BiTextIterator(Iterator): 10 | def __init__(self, batch_size, seed=1234, mask=True, shuffle_mode=None, logger=None, **kwargs): 11 | super(BiTextIterator, self).__init__(batch_size, seed, mask, shuffle_mode, logger) 12 | 13 | assert 'srcfile' in kwargs, "Missing argument srcfile" 14 | assert 'trgfile' in kwargs, "Missing argument trgfile" 15 | assert 'srcdict' in kwargs, "Missing argument srcdict" 16 | assert 'trgdict' in kwargs, "Missing argument trgdict" 17 | assert batch_size > 1, "Batch size should be > 1" 18 | 19 | self._print('Shuffle mode: %s' % shuffle_mode) 20 | 21 | self.srcfile = kwargs['srcfile'] 22 | self.trgfile = kwargs['trgfile'] 23 | self.srcdict = kwargs['srcdict'] 24 | self.trgdict = kwargs['trgdict'] 25 | 26 | self.n_words_src = kwargs.get('n_words_src', 0) 27 | self.n_words_trg = kwargs.get('n_words_trg', 0) 28 | 29 | self.src_name = kwargs.get('src_name', 'x') 30 | self.trg_name = kwargs.get('trg_name', 'y') 31 | 32 | self._keys = [self.src_name] 33 | if self.mask: 34 | self._keys.append("%s_mask" % self.src_name) 35 | 36 | self._keys.append(self.trg_name) 37 | if self.mask: 38 | self._keys.append("%s_mask" % self.trg_name) 39 | 40 | def read(self): 41 | seqs = [] 42 | sf = fopen(self.srcfile, 'r') 43 | tf = fopen(self.trgfile, 'r') 44 | 45 | src_unks = 0 46 | trg_unks = 0 47 | 48 | for idx, (sline, tline) in enumerate(zip(sf, tf)): 49 | sline = sline.strip() 50 | tline = tline.strip() 51 | 52 | # Exception if empty line found 53 | if sline == "" or tline == "": 54 | continue 55 | 56 | sseq = [self.srcdict.get(w, 1) for w in sline.split(' ')] 57 | tseq = [self.trgdict.get(w, 1) for w in tline.split(' ')] 58 | 59 | # if given limit vocabulary 60 | if self.n_words_src > 0: 61 | sseq = [w if w < self.n_words_src else 1 for w in sseq] 62 | 63 | # if given limit vocabulary 64 | if self.n_words_trg > 0: 65 | tseq = [w if w < self.n_words_trg else 1 for w in tseq] 66 | 67 | src_unks += sseq.count(1) 68 | trg_unks += tseq.count(1) 69 | 70 | # Append sequences to the list 71 | seqs.append((sseq, tseq)) 72 | 73 | sf.close() 74 | tf.close() 75 | 76 | # Save sequences 77 | self._seqs = seqs 78 | self.n_unks_src = src_unks 79 | self.n_unks_trg = trg_unks 80 | 81 | # Number of training samples 82 | self.n_samples = len(self._seqs) 83 | 84 | # Set batch processor function 85 | self._process_batch = (lambda idxs: self.mask_seqs(idxs)) 86 | 87 | if self.shuffle_mode == 'trglen': 88 | # Homogeneous batches ordered by target sequence length 89 | # Get an iterator over sample idxs 90 | self._iter = HomogeneousData(self._seqs, 
self.batch_size, trg_pos=1) 91 | else: 92 | self.rewind() 93 | 94 | def rewind(self): 95 | if self.shuffle_mode != 'trglen': 96 | # Fill in the _idxs list for sample order 97 | if self.shuffle_mode == 'simple': 98 | # Simple shuffle 99 | self._idxs = np.random.permutation(self.n_samples).tolist() 100 | elif self.shuffle_mode is None: 101 | # Ordered 102 | self._idxs = np.arange(self.n_samples).tolist() 103 | 104 | self._iter = [] 105 | for i in range(0, self.n_samples, self.batch_size): 106 | self._iter.append(self._idxs[i:i + self.batch_size]) 107 | self._iter = iter(self._iter) 108 | 109 | def mask_seqs(self, idxs): 110 | """Prepares a list of padded tensors with their masks for the given sample idxs.""" 111 | src, src_mask = Iterator.mask_data([self._seqs[i][0] for i in idxs]) 112 | trg, trg_mask = Iterator.mask_data([self._seqs[i][1] for i in idxs]) 113 | return (src, src_mask, trg, trg_mask) 114 | -------------------------------------------------------------------------------- /nmtpy/iterators/fusion.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pickle 3 | import numpy as np 4 | 5 | from ..sysutils import listify 6 | from ..nmtutils import sent_to_idx 7 | from .iterator import Iterator 8 | from .homogeneous import HomogeneousData 9 | from ..defaults import INT, FLOAT 10 | 11 | # This is an iterator specifically to be used by the .pkl 12 | # corpora files created for WMT17 Shared Task on Multimodal Machine Translation 13 | # Each element of the list that is pickled is in the following format: 14 | # [src_split_idx, trg_split_idx, imgid, imgname, src_words, trg_words] 15 | 16 | # Shorthand for positional access 17 | SSPLIT, TSPLIT, IMGID, IMGNAME, STOKENS, TTOKENS = range(6) 18 | 19 | class FusionIterator(Iterator): 20 | def __init__(self, batch_size, seed=1234, mask=True, shuffle_mode=None, logger=None, **kwargs): 21 | super(FusionIterator, self).__init__(batch_size, seed, mask, shuffle_mode, logger) 22 | 23 | assert 'pklfile' in kwargs, "Missing argument pklfile" 24 | 25 | # pkl file containing the data 26 | self.pklfile = kwargs['pklfile'] 27 | 28 | # Don't use mask when batch_size == 1 which means we're doing 29 | # translation with nmt-translate 30 | if self.batch_size == 1: 31 | self.mask = False 32 | 33 | # Will be set after reading the data 34 | self.src_avail = False 35 | self.trg_avail = False 36 | 37 | # Source word dictionary 38 | # This may not be available in image captioning 39 | self.srcdict = kwargs.get('srcdict', None) 40 | # This may not be available during validation 41 | self.trgdict = kwargs.get('trgdict', None) 42 | 43 | # Short-list sizes 44 | self.n_words_src = kwargs.get('n_words_src', 0) 45 | self.n_words_trg = kwargs.get('n_words_trg', 0) 46 | 47 | # How do we refer to symbolic data variables? 
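        # (these names become the keys of the OrderedDict minibatches yielded by
        #  the iterator, along with the derived '<name>_mask' / '<name>_img' keys)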
48 | self.src_name = kwargs.get('src_name', 'x') 49 | self.trg_name = kwargs.get('trg_name', 'y') 50 | 51 | # Image features file 52 | # (n_samples, flattened_spatial, n_maps) 53 | self.imgfile = kwargs.get('imgfile', None) 54 | 55 | if self.srcdict: 56 | self._keys = [self.src_name] 57 | if self.mask: 58 | self._keys.append("%s_mask" % self.src_name) 59 | 60 | # We have images in the middle 61 | if self.imgfile: 62 | self._keys.append("%s_img" % self.src_name) 63 | 64 | if self.trgdict: 65 | self._keys.append(self.trg_name) 66 | if self.mask: 67 | self._keys.append("%s_mask" % self.trg_name) 68 | 69 | def read(self): 70 | # Load image features file if any 71 | if self.imgfile is not None: 72 | self._print('Loading image file...') 73 | self.img_feats = np.load(self.imgfile) 74 | 75 | # Move n_samples to middle dimension 76 | # -> 196 x n_samples x 1024 for res4f_relu 77 | self.img_feats = self.img_feats.transpose(1, 0, 2) 78 | 79 | # (w*h, n, c) 80 | self.img_shape = tuple((self.img_feats.shape[0], -1, self.img_feats.shape[-1])) 81 | self._print('Done.') 82 | 83 | # Load the corpora 84 | with open(self.pklfile, 'rb') as f: 85 | self._print('Loading pkl file...') 86 | self._seqs = pickle.load(f) 87 | self._print('Done.') 88 | 89 | # Introspect the pickle by looking the first sample 90 | ss = self._seqs[0] 91 | 92 | # we may not have them in pickle or we may not 93 | # want to use target sentences by giving its vocab None 94 | if ss[TTOKENS] is not None and self.trgdict: 95 | self.trg_avail = True 96 | 97 | # Same for source side 98 | if ss[STOKENS] is not None and self.srcdict: 99 | self.src_avail = True 100 | 101 | # We now have a list of samples 102 | self.n_samples = len(self._seqs) 103 | 104 | # Depending on mode, we can have multiple sentences per image so 105 | # let's store the number of actual images as well. 
106 | # n_unique_samples <= n_samples 107 | self.n_unique_images = len(set([s[IMGNAME] for s in self._seqs])) 108 | 109 | # Some statistics 110 | total_src_words = [] 111 | total_trg_words = [] 112 | 113 | # Let's map the sentences once to idx's 114 | for sample in self._seqs: 115 | if self.src_avail: 116 | sample[STOKENS] = sent_to_idx(self.srcdict, sample[STOKENS], self.n_words_src) 117 | total_src_words.extend(sample[STOKENS]) 118 | if self.trg_avail: 119 | sample[TTOKENS] = sent_to_idx(self.trgdict, sample[TTOKENS], self.n_words_trg) 120 | total_trg_words.extend(sample[TTOKENS]) 121 | 122 | if self.src_avail: 123 | self.unk_src = total_src_words.count(1) 124 | self.total_src_words = len(total_src_words) 125 | if self.trg_avail: 126 | self.unk_trg = total_trg_words.count(1) 127 | self.total_trg_words = len(total_trg_words) 128 | 129 | # Set batch processor function 130 | # idxs can be a list of single element as well 131 | self._process_batch = (lambda idxs: self.mask_seqs(idxs)) 132 | 133 | # Homogeneous batches ordered by target sequence length 134 | # Get an iterator over sample idxs 135 | if self.batch_size > 1: 136 | # Training 137 | self._iter = HomogeneousData(self._seqs, self.batch_size, trg_pos=TTOKENS) 138 | else: 139 | # Test-set 140 | self._iter = iter([[i] for i in np.arange(self.n_samples)]) 141 | 142 | def mask_seqs(self, idxs): 143 | """Pad if necessary and return padded batches or single samples.""" 144 | data = [] 145 | 146 | # Let's fetch batch samples first 147 | batch = [self._seqs[i] for i in idxs] 148 | 149 | if self.src_avail: 150 | data += Iterator.mask_data([b[STOKENS] for b in batch], get_mask=self.mask) 151 | 152 | # Source image features 153 | if self.imgfile is not None: 154 | x_img = self.img_feats[:, [b[IMGID] for b in batch], :] 155 | 156 | # Reshape accordingly 157 | x_img.shape = self.img_shape 158 | data += [x_img] 159 | 160 | if self.trg_avail: 161 | data += Iterator.mask_data([b[TTOKENS] for b in batch], get_mask=self.mask) 162 | 163 | return data 164 | 165 | def rewind(self): 166 | # Done automatically within homogeneous iterator 167 | pass 168 | -------------------------------------------------------------------------------- /nmtpy/iterators/homogeneous.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import copy 4 | 5 | # Iterator that randomly fetches samples with same target 6 | # length to be efficient in terms of RNN underlyings. 
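# Bucketing minibatches by target length means each batch needs little or no
# padding, so fewer masked timesteps are wasted during training.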
7 | # Code from https://github.com/kelvinxu/arctic-captions 8 | class HomogeneousData(object): 9 | def __init__(self, data, batch_size, trg_pos): 10 | self.batch_size = batch_size 11 | self.data = data 12 | self.trg_pos = trg_pos 13 | 14 | self.prepare() 15 | self.reset() 16 | 17 | def prepare(self): 18 | # find all target sequence lengths 19 | self.lengths = [len(cc[self.trg_pos]) for cc in self.data] 20 | 21 | # Compute unique lengths 22 | self.len_unique = np.unique(self.lengths) 23 | 24 | # indices of unique lengths 25 | self.len_indices = dict() 26 | self.len_counts = dict() 27 | 28 | # For each length, find the sample idxs and their counts 29 | for ll in self.len_unique: 30 | self.len_indices[ll] = np.where(self.lengths == ll)[0] 31 | self.len_counts[ll] = len(self.len_indices[ll]) 32 | 33 | def reset(self): 34 | self.len_curr_counts = copy.copy(self.len_counts) 35 | 36 | # Randomize length order 37 | self.len_unique = np.random.permutation(self.len_unique) 38 | self.len_indices_pos = dict() 39 | for ll in self.len_unique: 40 | # Randomize sample order for a specific length 41 | self.len_indices[ll] = np.random.permutation(self.len_indices[ll]) 42 | # Set initial position for this length to 0 43 | self.len_indices_pos[ll] = 0 44 | 45 | self.len_idx = -1 46 | 47 | def __next__(self): 48 | fin_unique_len = 0 49 | while True: 50 | # What is the length idx for this batch? 51 | self.len_idx = (self.len_idx + 1) % len(self.len_unique) 52 | # Current candidate length 53 | self.cur_len = self.len_unique[self.len_idx] 54 | # Do we have samples left for this length? 55 | if self.len_curr_counts[self.cur_len] > 0: 56 | break 57 | 58 | # All samples for a length exhausted, increment counter 59 | fin_unique_len += 1 60 | 61 | # Is this the end for this epoch? 
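            # (the same condition is checked again right after this loop in order
            #  to reset the iterator state and raise StopIteration)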
62 | if fin_unique_len >= len(self.len_unique): 63 | break 64 | 65 | # All data consumed 66 | if fin_unique_len >= len(self.len_unique): 67 | self.reset() 68 | raise StopIteration() 69 | 70 | # batch_size or what is left for this length 71 | curr_batch_size = np.minimum(self.batch_size, self.len_curr_counts[self.cur_len]) 72 | # Get current position for the batch 73 | curr_pos = self.len_indices_pos[self.cur_len] 74 | 75 | # get the indices for the current batch 76 | curr_indices = self.len_indices[self.cur_len][curr_pos:curr_pos+curr_batch_size] 77 | 78 | # Increment next position 79 | self.len_indices_pos[self.cur_len] += curr_batch_size 80 | # Decrement used sample count 81 | self.len_curr_counts[self.cur_len] -= curr_batch_size 82 | 83 | # Return batch indices from here 84 | return curr_indices 85 | 86 | def __iter__(self): 87 | return self 88 | -------------------------------------------------------------------------------- /nmtpy/iterators/iterator.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import random 3 | 4 | from abc import ABCMeta, abstractmethod 5 | from collections import OrderedDict 6 | 7 | import numpy as np 8 | from ..defaults import INT, FLOAT 9 | 10 | class Iterator(object, metaclass=ABCMeta): 11 | """Base Iterator class.""" 12 | 13 | @staticmethod 14 | def mask_data(seqs, get_mask=True): 15 | """Pads sequences with EOS (0) for minibatch processing.""" 16 | lengths = [len(s) for s in seqs] 17 | maxlen = np.max(lengths) + 1 18 | 19 | # Shape is (t_steps, samples) 20 | x = np.zeros((maxlen, len(seqs))).astype(INT) 21 | x_mask = np.zeros_like(x).astype(FLOAT) 22 | 23 | for idx, s_x in enumerate(seqs): 24 | x[:lengths[idx], idx] = s_x 25 | x_mask[:lengths[idx] + 1, idx] = 1. 26 | 27 | if get_mask: 28 | return [x, x_mask] 29 | else: 30 | return [x] 31 | 32 | def _print(self, msg): 33 | if self._logger: 34 | self._logger.info(msg) 35 | 36 | def __init__(self, batch_size, seed=1234, mask=True, shuffle_mode=None, logger=None): 37 | self.n_samples = 0 38 | self.seed = seed 39 | self.mask = mask 40 | self._logger = logger 41 | self.batch_size = batch_size 42 | self._keys = [] 43 | self._idxs = [] 44 | self._seqs = [] 45 | self._iter = None 46 | self._minibatches = [] 47 | 48 | self.shuffle_mode = shuffle_mode 49 | if self.shuffle_mode: 50 | # Set random seed 51 | random.seed(self.seed) 52 | 53 | # This can be set by child classes for processing 54 | # a list of idxs into the actual minibatch 55 | self._process_batch = lambda x: x 56 | 57 | def __len__(self): 58 | """Returns number of samples.""" 59 | return self.n_samples 60 | 61 | def __iter__(self): 62 | return self 63 | 64 | def __next__(self): 65 | """Returns the next set of data from the iterator.""" 66 | try: 67 | data = self._process_batch(next(self._iter)) 68 | except StopIteration as si: 69 | self.rewind() 70 | raise 71 | else: 72 | # Lookup the keys and return an ordered dict of the current minibatch 73 | return OrderedDict([(k, data[i]) for i,k in enumerate(self._keys)]) 74 | 75 | # May or may not be used. 
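    # Subclasses that precompute padded minibatches (e.g. TextIterator) override
    # this; the other iterators build their batch order directly in read()/rewind().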
76 | def prepare_batches(self): 77 | """Prepare self.__iter.""" 78 | pass 79 | 80 | @abstractmethod 81 | def read(self): 82 | """Read the data and put in into self.__seqs.""" 83 | pass 84 | 85 | @abstractmethod 86 | def rewind(self): 87 | pass 88 | -------------------------------------------------------------------------------- /nmtpy/iterators/mnmt.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pickle 3 | import numpy as np 4 | 5 | from ..sysutils import listify 6 | from ..nmtutils import sent_to_idx 7 | from .iterator import Iterator 8 | from .homogeneous import HomogeneousData 9 | from ..defaults import INT, FLOAT 10 | 11 | # This is an iterator specifically to be used by the .pkl 12 | # corpora files created for WMT17 Shared Task on Multimodal Machine Translation 13 | # Each element of the list that is pickled is in the following format: 14 | # [src_split_idx, trg_split_idx, imgid, imgname, src_words, trg_words] 15 | 16 | # Shorthand for positional access 17 | SSPLIT, TSPLIT, IMGID, IMGNAME, STOKENS, TTOKENS = range(6) 18 | 19 | class MNMTIterator(Iterator): 20 | def __init__(self, batch_size, seed=1234, mask=True, shuffle_mode=None, logger=None, **kwargs): 21 | super(MNMTIterator, self).__init__(batch_size, seed, mask, shuffle_mode, logger) 22 | 23 | assert 'pklfile' in kwargs, "Missing argument pklfile" 24 | 25 | # pkl file containing the data 26 | self.pklfile = kwargs['pklfile'] 27 | 28 | self._print('Shuffle mode: %s' % shuffle_mode) 29 | 30 | # Don't use mask when batch_size == 1 which means we're doing 31 | # translation with nmt-translate 32 | if self.batch_size == 1: 33 | self.mask = False 34 | 35 | # Will be set after reading the data 36 | self.src_avail = False 37 | self.trg_avail = False 38 | 39 | # Source word dictionary 40 | # This may not be available in image captioning 41 | self.srcdict = kwargs.get('srcdict', None) 42 | # This may not be available during validation 43 | self.trgdict = kwargs.get('trgdict', None) 44 | 45 | # Short-list sizes 46 | self.n_words_src = kwargs.get('n_words_src', 0) 47 | self.n_words_trg = kwargs.get('n_words_trg', 0) 48 | 49 | # How do we refer to symbolic data variables? 
50 | self.src_name = kwargs.get('src_name', 'x') 51 | self.trg_name = kwargs.get('trg_name', 'y') 52 | 53 | # Image features file 54 | # (n_samples, flattened_spatial, n_maps) 55 | self.imgfile = kwargs.get('imgfile', None) 56 | 57 | def read(self): 58 | # Load image features file if any 59 | if self.imgfile is not None: 60 | self._print('Loading image file...') 61 | self.img_feats = np.load(self.imgfile) 62 | 63 | # Load the corpora 64 | with open(self.pklfile, 'rb') as f: 65 | self._print('Loading pkl file...') 66 | self._seqs = pickle.load(f) 67 | 68 | # Introspect the pickle by looking the first sample 69 | ss = self._seqs[0] 70 | 71 | # we may not have them in pickle or we may not 72 | # want to use target sentences by giving its vocab None 73 | if ss[TTOKENS] is not None and self.trgdict: 74 | self.trg_avail = True 75 | 76 | # Same for source side 77 | if ss[STOKENS] is not None and self.srcdict: 78 | self.src_avail = True 79 | 80 | if self.src_avail: 81 | self._keys = [self.src_name] 82 | if self.mask: 83 | self._keys.append("%s_mask" % self.src_name) 84 | 85 | # We have images in the middle 86 | if self.imgfile: 87 | self._keys.append("%s_img" % self.src_name) 88 | 89 | if self.trg_avail: 90 | self._keys.append(self.trg_name) 91 | if self.mask: 92 | self._keys.append("%s_mask" % self.trg_name) 93 | 94 | # We now have a list of samples 95 | self.n_samples = len(self._seqs) 96 | 97 | # Depending on mode, we can have multiple sentences per image so 98 | # let's store the number of actual images as well. 99 | # n_unique_samples <= n_samples 100 | self.n_unique_images = len(set([s[IMGNAME] for s in self._seqs])) 101 | 102 | # Some statistics 103 | total_src_words = [] 104 | total_trg_words = [] 105 | 106 | # Let's map the sentences once to idx's 107 | for sample in self._seqs: 108 | if self.src_avail: 109 | sample[STOKENS] = sent_to_idx(self.srcdict, sample[STOKENS], self.n_words_src) 110 | total_src_words.extend(sample[STOKENS]) 111 | if self.trg_avail: 112 | sample[TTOKENS] = sent_to_idx(self.trgdict, sample[TTOKENS], self.n_words_trg) 113 | total_trg_words.extend(sample[TTOKENS]) 114 | 115 | if self.src_avail: 116 | self.n_unks_src = total_src_words.count(1) 117 | self.total_src_words = len(total_src_words) 118 | if self.trg_avail: 119 | self.n_unks_trg = total_trg_words.count(1) 120 | self.total_trg_words = len(total_trg_words) 121 | 122 | # Set batch processor function 123 | # idxs can be a list of single element as well 124 | self._process_batch = (lambda idxs: self.mask_seqs(idxs)) 125 | 126 | # Homogeneous batches ordered by target sequence length 127 | # Get an iterator over sample idxs 128 | if self.batch_size > 1 and self.shuffle_mode == 'trglen': 129 | # Training 130 | self._iter = HomogeneousData(self._seqs, self.batch_size, trg_pos=TTOKENS) 131 | else: 132 | # Handles both bsize = 1 and > 1. 
Test-set mode 133 | self._idxs = np.arange(self.n_samples) 134 | self._iter = [] 135 | for i in range(0, self.n_samples, self.batch_size): 136 | self._iter.append(self._idxs[i:i + self.batch_size]) 137 | self._iter = iter(self._iter) 138 | 139 | def mask_seqs(self, idxs): 140 | """Pad if necessary and return padded batches or single samples.""" 141 | data = [] 142 | 143 | # Let's fetch batch samples first 144 | batch = [self._seqs[i] for i in idxs] 145 | 146 | if self.src_avail: 147 | data += Iterator.mask_data([b[STOKENS] for b in batch], get_mask=self.mask) 148 | 149 | # Source image features 150 | if self.imgfile is not None: 151 | x_img = self.img_feats[[b[IMGID] for b in batch], ...] 152 | data += [x_img] 153 | 154 | if self.trg_avail: 155 | data += Iterator.mask_data([b[TTOKENS] for b in batch], get_mask=self.mask) 156 | 157 | return data 158 | 159 | def rewind(self): 160 | if self.shuffle_mode != 'trglen': 161 | # Handles both bsize = 1 and > 1. Test-set mode 162 | self._idxs = np.arange(self.n_samples) 163 | self._iter = [] 164 | for i in range(0, self.n_samples, self.batch_size): 165 | self._iter.append(self._idxs[i:i + self.batch_size]) 166 | self._iter = iter(self._iter) 167 | -------------------------------------------------------------------------------- /nmtpy/iterators/text.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | 4 | from ..sysutils import fopen 5 | from .iterator import Iterator 6 | 7 | """Text iterator for monolingual data.""" 8 | class TextIterator(Iterator): 9 | def __init__(self, batch_size, seed=1234, mask=True, shuffle_mode=None, logger=None, **kwargs): 10 | super(TextIterator, self).__init__(batch_size, seed, mask, shuffle_mode, logger) 11 | 12 | assert 'file' in kwargs, "Missing argument file" 13 | assert 'dict' in kwargs, "Missing argument dict" 14 | 15 | self.__file = kwargs['file'] 16 | self.__dict = kwargs['dict'] 17 | self.__n_words = kwargs.get('n_words', 0) 18 | self.name = kwargs.get('name', 'x') 19 | 20 | self._keys = [self.name] 21 | if self.mask: 22 | self._keys.append('%s_mask' % self.name) 23 | 24 | def read(self): 25 | seqs = [] 26 | with fopen(self.__file, 'r') as f: 27 | for idx, line in enumerate(f): 28 | line = line.strip() 29 | 30 | # Skip empty lines 31 | if line == "": 32 | print('Warning: empty line in %s' % self.__file) 33 | else: 34 | line = line.split(" ") 35 | 36 | seq = [self.__dict.get(w, 1) for w in line] 37 | 38 | # if given limit vocabulary 39 | if self.__n_words > 0: 40 | seq = [w if w < self.__n_words else 1 for w in seq] 41 | # Append the sequence 42 | seqs += [seq] 43 | 44 | self._seqs = seqs 45 | self.n_samples = len(self._seqs) 46 | self._idxs = np.arange(self.n_samples) 47 | 48 | if not self._minibatches: 49 | self.prepare_batches() 50 | self.rewind() 51 | 52 | def prepare_batches(self): 53 | self._minibatches = [] 54 | 55 | for i in range(0, self.n_samples, self.batch_size): 56 | batch_idxs = self._idxs[i:i + self.batch_size] 57 | x, x_mask = Iterator.mask_data([self._seqs[i] for i in batch_idxs]) 58 | self._minibatches.append((x, x_mask)) 59 | 60 | def rewind(self): 61 | """Recreate the iterator.""" 62 | if self.shuffle_mode == 'simple': 63 | self._idxs = np.random.permutation(self.n_samples) 64 | self.prepare_batches() 65 | 66 | self._iter = iter(self._minibatches) 67 | -------------------------------------------------------------------------------- /nmtpy/iterators/wmt.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pickle 3 | import numpy as np 4 | 5 | from ..nmtutils import sent_to_idx 6 | from .iterator import Iterator 7 | from .homogeneous import HomogeneousData 8 | 9 | # This is an iterator specifically to be used by the .pkl 10 | # corpora files created for WMT16 Shared Task on Multimodal Machine Translation 11 | # Each element of the list that is pickled is in the following format: 12 | # [src_split_idx, trg_split_idx, imgid, imgname, src_words, trg_words] 13 | 14 | class WMTIterator(Iterator): 15 | def __init__(self, batch_size, seed=1234, mask=True, shuffle_mode=None, logger=None, **kwargs): 16 | super(WMTIterator, self).__init__(batch_size, seed, mask, shuffle_mode, logger) 17 | 18 | assert 'pklfile' in kwargs, "Missing argument pklfile" 19 | assert 'srcdict' in kwargs, "Missing argument srcdict" 20 | 21 | self._print('Shuffle mode: %s' % shuffle_mode) 22 | 23 | # Short-list sizes 24 | self.n_words_src = kwargs.get('n_words_src', 0) 25 | self.n_words_trg = kwargs.get('n_words_trg', 0) 26 | 27 | # How do we refer to symbolic data variables? 28 | self.src_name = kwargs.get('src_name', 'x') 29 | self.trg_name = kwargs.get('trg_name', 'y') 30 | 31 | # How do we use the multimodal data? (Numbers in parens are for Task 2) 32 | # 'all' : All combinations (~725K parallel) 33 | # 'single' : Take only the first pair e.g., train0.en->train0.de (~29K parallel) 34 | # 'pairs' : Take only one-to-one pairs e.g., train_i.en->train_i.de (~145K parallel) 35 | self.mode = kwargs.get('mode', 'pairs') 36 | 37 | # pkl file which contains a list of samples 38 | self.pklfile = kwargs['pklfile'] 39 | # Resnet-50 image features file 40 | self.imgfile = kwargs.get('imgfile', None) 41 | self.img_avail = self.imgfile is not None 42 | 43 | self.trg_avail = False 44 | 45 | # Source word dictionary and short-list limit 46 | # This may not be available if the task is image -> description (Not implemented) 47 | self.srcdict = kwargs['srcdict'] 48 | # This may not be available during validation 49 | self.trgdict = kwargs.get('trgdict', None) 50 | 51 | # Don't use mask when batch_size == 1 which means we're doing 52 | # translation with nmt-translate 53 | if self.batch_size == 1: 54 | self.mask = False 55 | 56 | self._keys = [self.src_name] 57 | if self.mask: 58 | self._keys.append("%s_mask" % self.src_name) 59 | 60 | # We have images in the middle 61 | if self.imgfile: 62 | self._keys.append("%s_img" % self.src_name) 63 | 64 | # Target may not be available during validation 65 | if self.trgdict: 66 | self._keys.append(self.trg_name) 67 | if self.mask: 68 | self._keys.append("%s_mask" % self.trg_name) 69 | 70 | def read(self): 71 | # Load image features file if any 72 | if self.img_avail: 73 | self._print('Loading image file...') 74 | self.img_feats = np.load(self.imgfile) 75 | self._print('Done.') 76 | 77 | # Load the corpora 78 | with open(self.pklfile, 'rb') as f: 79 | self._print('Loading pkl file...') 80 | self._seqs = pickle.load(f) 81 | self._print('Done.') 82 | 83 | # Check for what is available 84 | ss = self._seqs[0] 85 | # If no split idxs are found, its Task 1, set mode to 'all' 86 | if ss[0] is None and ss[1] is None: 87 | self.mode = 'all' 88 | 89 | if ss[5] is not None and self.trgdict: 90 | self.trg_avail = True 91 | 92 | if self.mode == 'single': 93 | # Just take the first src-trg pair. 
Useful for validation 94 | if ss[1] is not None: 95 | self._seqs = [s for s in self._seqs if (s[0] == s[1] == 0)] 96 | else: 97 | self._seqs = [s for s in self._seqs if (s[0] == 0)] 98 | 99 | elif ss[1] is not None and self.mode == 'pairs': 100 | # Take the pairs with split idx's equal 101 | self._seqs = [s for s in self._seqs if s[0] == s[1]] 102 | 103 | # We now have a list of samples 104 | self.n_samples = len(self._seqs) 105 | 106 | # Depending on mode, we can have multiple sentences per image so 107 | # let's store the number of actual images as well. 108 | # n_unique_samples <= n_samples 109 | self.n_unique_images = len(set([s[3] for s in self._seqs])) 110 | 111 | # Some statistics 112 | total_src_words = [] 113 | total_trg_words = [] 114 | 115 | # Let's map the sentences once to idx's 116 | for sample in self._seqs: 117 | sample[4] = sent_to_idx(self.srcdict, sample[4], self.n_words_src) 118 | total_src_words.extend(sample[4]) 119 | if self.trg_avail: 120 | sample[5] = sent_to_idx(self.trgdict, sample[5], self.n_words_trg) 121 | total_trg_words.extend(sample[5]) 122 | 123 | self.unk_src = total_src_words.count(1) 124 | self.unk_trg = total_trg_words.count(1) 125 | self.total_src_words = len(total_src_words) 126 | self.total_trg_words = len(total_trg_words) 127 | 128 | ######################### 129 | # Prepare iteration stuff 130 | ######################### 131 | # Set batch processor function 132 | if self.batch_size == 1: 133 | self._process_batch = (lambda idxs: self.process_single(idxs[0])) 134 | else: 135 | self._process_batch = (lambda idxs: self.mask_seqs(idxs)) 136 | 137 | if self.shuffle_mode == 'trglen': 138 | # Homogeneous batches ordered by target sequence length 139 | # Get an iterator over sample idxs 140 | self._iter = HomogeneousData(self._seqs, self.batch_size, trg_pos=5) 141 | else: 142 | # For once keep it ordered 143 | self._idxs = np.arange(self.n_samples).tolist() 144 | self._iter = [] 145 | for i in range(0, self.n_samples, self.batch_size): 146 | self._iter.append(self._idxs[i:i + self.batch_size]) 147 | self._iter = iter(self._iter) 148 | 149 | def process_single(self, idx): 150 | data, _ = Iterator.mask_data([self._seqs[idx][4]]) 151 | data = [data] 152 | if self.img_avail: 153 | # Do this 196 x 1024 154 | data += [self.img_feats[self._seqs[idx][2]][:, None, :]] 155 | if self.trg_avail: 156 | trg, _ = Iterator.mask_data([self._seqs[idx][5]]) 157 | data.append(trg) 158 | return data 159 | 160 | def mask_seqs(self, idxs): 161 | """Prepares a list of padded tensors with their masks for the given sample idxs.""" 162 | data = list(Iterator.mask_data([self._seqs[i][4] for i in idxs])) 163 | # Source image features 164 | if self.img_avail: 165 | img_idxs = [self._seqs[i][2] for i in idxs] 166 | 167 | # Do this 196 x bsize x 1024 168 | x_img = self.img_feats[img_idxs].transpose(1, 0, 2) 169 | data += [x_img] 170 | 171 | if self.trg_avail: 172 | data += list(Iterator.mask_data([self._seqs[i][5] for i in idxs])) 173 | 174 | return data 175 | 176 | def rewind(self): 177 | if self.shuffle_mode != 'trglen': 178 | # Fill in the _idxs list for sample order 179 | if self.shuffle_mode == 'simple': 180 | # Simple shuffle 181 | self._idxs = np.random.permutation(self.n_samples).tolist() 182 | elif self.shuffle_mode is None: 183 | # Ordered 184 | self._idxs = np.arange(self.n_samples).tolist() 185 | 186 | self._iter = [] 187 | for i in range(0, self.n_samples, self.batch_size): 188 | self._iter.append(self._idxs[i:i + self.batch_size]) 189 | self._iter = iter(self._iter) 190 | 
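Usage sketch (not part of the package): the iterators above are all driven through the same protocol -- construct with keyword arguments, call read() once, then iterate to obtain one OrderedDict per minibatch, keyed by the names derived from src_name/trg_name. The paths, vocabularies and feature shapes below are hypothetical placeholders.

from nmtpy.iterators.wmt import WMTIterator

# Hypothetical word->index vocabularies (0 is <eos>, 1 is <unk>)
src_vocab = {'<eos>': 0, '<unk>': 1, 'a': 2, 'man': 3}
trg_vocab = {'<eos>': 0, '<unk>': 1, 'ein': 2, 'mann': 3}

train_it = WMTIterator(batch_size=32,
                       shuffle_mode='trglen',               # homogeneous batches by target length
                       pklfile='task2-train.pkl',           # hypothetical WMT16 Task 2 pickle
                       imgfile='task2-train-resnet50.npy',  # hypothetical (n_samples, 196, 1024) features
                       srcdict=src_vocab, trgdict=trg_vocab,
                       mode='pairs')
train_it.read()    # load pickle + features, map tokens to indices, build the iterator

for batch in train_it:                         # one OrderedDict per minibatch
    x, x_mask = batch['x'], batch['x_mask']    # (src_steps, bsize) indices / float mask
    x_img     = batch['x_img']                 # (196, bsize, 1024) image annotations
    y, y_mask = batch['y'], batch['y_mask']    # (trg_steps, bsize)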
-------------------------------------------------------------------------------- /nmtpy/logger.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | def singleton(cls): 5 | instances = {} 6 | def get_instance(): 7 | if cls not in instances: 8 | instances[cls] = cls() 9 | return instances[cls] 10 | return get_instance() 11 | 12 | @singleton 13 | class Logger(object): 14 | """Logs to stdout and to file simultaneously.""" 15 | def __init__(self): 16 | pass 17 | 18 | def setup(self, log_file=None, timestamp=True): 19 | _format = '%(message)s' 20 | if timestamp: 21 | _format = '%(asctime)s ' + _format 22 | 23 | self.formatter = logging.Formatter(_format) 24 | self._logger = logging.getLogger('nmtpy') 25 | self._logger.setLevel(logging.DEBUG) 26 | self._ch = logging.StreamHandler() 27 | self._ch.setFormatter(self.formatter) 28 | self._logger.addHandler(self._ch) 29 | 30 | if log_file: 31 | self._fh = logging.FileHandler(log_file, mode='w') 32 | self._fh.setFormatter(self.formatter) 33 | self._logger.addHandler(self._fh) 34 | 35 | def get(self): 36 | return self._logger 37 | -------------------------------------------------------------------------------- /nmtpy/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import operator 4 | import numpy as np 5 | 6 | from .bleu import MultiBleuScorer 7 | from .meteor import METEORScorer 8 | from .factors2wordbleu import Factors2word 9 | from .mtevalbleu import MTEvalV13aBLEUScorer 10 | from .external import ExternalScorer 11 | 12 | comparators = { 13 | 'bleu' : (max, operator.gt, 0), 14 | 'bleu_v13a' : (max, operator.gt, 0), 15 | 'meteor' : (max, operator.gt, 0), 16 | 'cider' : (max, operator.gt, 0), 17 | 'rouge' : (max, operator.gt, 0), 18 | 'loss' : (min, operator.lt, -1), 19 | 'ter' : (min, operator.lt, -1), 20 | } 21 | 22 | def get_scorer(scorer): 23 | scorers = { 24 | 'meteor' : METEORScorer, 25 | 'bleu' : MultiBleuScorer, 26 | 'bleu_v13a' : MTEvalV13aBLEUScorer, 27 | 'factors2word': Factors2word, 28 | } 29 | 30 | if scorer in scorers: 31 | # A defined metric 32 | return scorers[scorer]() 33 | elif scorer.startswith(('/', '~')): 34 | # External script 35 | return ExternalScorer(os.path.expanduser(scorer)) 36 | 37 | def is_last_best(name, history, min_delta): 38 | """Checks whether the last element is the best score so far 39 | by taking into account an absolute improvement threshold min_delta.""" 40 | if len(history) == 1: 41 | # If first validation, return True to save it 42 | return True 43 | 44 | new_value = history[-1] 45 | 46 | # bigger is better 47 | if name.startswith(('bleu', 'meteor', 'cider', 'rouge')): 48 | cur_best = max(history[:-1]) 49 | return new_value > cur_best and abs(new_value - cur_best) >= (min_delta - 1e-5) 50 | # lower is better 51 | elif name in ['loss', 'px', 'ter']: 52 | cur_best = min(history[:-1]) 53 | return new_value < cur_best and abs(new_value - cur_best) >= (min_delta - 1e-5) 54 | 55 | def find_best(name, history): 56 | """Returns the best idx and value for the given metric.""" 57 | history = np.array(history) 58 | if name.startswith(('bleu', 'meteor', 'cider', 'rouge')): 59 | best_idx = np.argmax(history) 60 | elif name in ['loss', 'px', 'ter']: 61 | best_idx = np.argmin(history) 62 | 63 | # Validation periods start from 1 64 | best_val = history[best_idx] 65 | return (best_idx + 1), best_val 66 | 
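A small sketch of how these helpers behave (the scores are made-up values and the external script path is a hypothetical example, not shipped with nmtpy):

from nmtpy.metrics import get_scorer, is_last_best, find_best

history = [24.1, 24.9, 25.3]                   # e.g. BLEU after each validation

# BLEU-like metrics are "bigger is better"; ask for >= 0.2 absolute improvement
if is_last_best('bleu', history, 0.2):         # 25.3 - 24.9 >= 0.2 -> True
    print('New best BLEU, checkpoint should be saved')

best_idx, best_val = find_best('bleu', history)
print(best_idx, best_val)                      # 3 25.3  (validation periods are 1-based)

# get_scorer() returns a built-in scorer by name, or wraps an external script
scorer = get_scorer('bleu')                    # MultiBleuScorer instance
# scorer = get_scorer('~/bin/my-metric.sh')    # hypothetical external scorer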
-------------------------------------------------------------------------------- /nmtpy/metrics/bleu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import subprocess 3 | import pkg_resources 4 | 5 | from .metric import Metric 6 | 7 | BLEU_SCRIPT = pkg_resources.resource_filename('nmtpy', 'external/multi-bleu.perl') 8 | 9 | class BLEUScore(Metric): 10 | def __init__(self, score=None): 11 | super(BLEUScore, self).__init__(score) 12 | self.name = "BLEU" 13 | if score: 14 | self.score = float(score.split()[2][:-1]) 15 | self.score_str = score.replace('BLEU = ', '') 16 | 17 | """MultiBleuScorer class.""" 18 | class MultiBleuScorer(object): 19 | def __init__(self, lowercase=False): 20 | # For multi-bleu.perl we give the reference(s) files as argv, 21 | # while the candidate translations are read from stdin. 22 | self.lowercase = lowercase 23 | self.__cmdline = [BLEU_SCRIPT] 24 | if self.lowercase: 25 | self.__cmdline.append("-lc") 26 | 27 | def compute(self, refs, hypfile): 28 | cmdline = self.__cmdline[:] 29 | 30 | # Make reference files a list 31 | refs = [refs] if isinstance(refs, str) else refs 32 | cmdline.extend(refs) 33 | 34 | hypstring = None 35 | with open(hypfile, "r") as fhyp: 36 | hypstring = fhyp.read().rstrip() 37 | 38 | score = subprocess.run(cmdline, stdout=subprocess.PIPE, 39 | input=hypstring, universal_newlines=True).stdout.splitlines() 40 | if len(score) == 0: 41 | return BLEUScore() 42 | else: 43 | return BLEUScore(score[0].rstrip("\n")) 44 | -------------------------------------------------------------------------------- /nmtpy/metrics/external.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import subprocess 4 | 5 | from .metric import Metric 6 | 7 | class ExternalScore(Metric): 8 | def __init__(self, score=None): 9 | super(ExternalScore, self).__init__(score) 10 | # This should be overriden once the score is received 11 | # So the script should behave exactly as it is documented below. 12 | self.name = 'External' 13 | 14 | if score: 15 | # Parse score line of format: 16 | # METRIC = SCORE, ...... 17 | name, rest = score.split('=', 1) 18 | self.score_str = rest.strip() 19 | self.name = name.strip() 20 | score, rest = rest.split(',', 1) 21 | self.score = float(score.strip()) 22 | 23 | class ExternalScorer(object): 24 | """An external scorer that calls arbitrary script for metric computation. 25 | - The script should be runnable as it-is 26 | - It should consume the hypotheses from stdin and receive 27 | a variable number of references as cmdline arguments. 
28 | - The script should output a "single line" to stdout with the 29 | following format: 30 | METRICNAME = SCORE, 31 | 32 | Example: 33 | $ custombleu.perl ref1 ref2 ref3 < hyps (Higher better) 34 | BLEU = 23.43, (ref_len=xxx,hyp_len=xxx,penalty=xxx) 35 | $ wer.py ref1 < hyps (Lower better) 36 | WER = 32.42, (....)""" 37 | 38 | def __init__(self, script): 39 | self.__cmdline = [script] 40 | 41 | def compute(self, refs, hypfile): 42 | cmdline = self.__cmdline[:] 43 | 44 | # Make reference files a list and add to command 45 | refs = [refs] if isinstance(refs, str) else refs 46 | cmdline.extend(refs) 47 | 48 | # Read hypotheses 49 | with open(hypfile, "r") as fhyp: 50 | hypstring = fhyp.read().rstrip() 51 | 52 | # Run script 53 | score = subprocess.run(cmdline, stdout=subprocess.PIPE, 54 | input=hypstring, 55 | universal_newlines=True).stdout.splitlines() 56 | if len(score) == 0: 57 | return ExternalScore() 58 | else: 59 | return ExternalScore(score[0].strip()) 60 | -------------------------------------------------------------------------------- /nmtpy/metrics/factors2wordbleu.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | from ..sysutils import find_executable 4 | from .bleu import BLEUScore 5 | 6 | """Factors2word class.""" 7 | class Factors2word(object): 8 | def __init__(self): 9 | pass 10 | 11 | def compute(self, script, hyp_file, hyp_mult_file, ref): 12 | script = find_executable(script) 13 | lang = ref.split('.')[-1] 14 | cmdline = [script, lang, hyp_file, hyp_mult_file, ref] 15 | 16 | hypstring = None 17 | with open(hyp_file, "r") as fhyp: 18 | hypstring = fhyp.read().rstrip() 19 | 20 | out = subprocess.run(cmdline, stdout=subprocess.PIPE, 21 | input=hypstring, universal_newlines=True).stdout.splitlines() 22 | # TODO: this -1 gives many problems, in future we will return just BLEU and avoid those problems 23 | score = out[-1].splitlines() 24 | if len(score) == 0: 25 | return BLEUScore() 26 | else: 27 | return BLEUScore(score[0].rstrip("\n")) 28 | -------------------------------------------------------------------------------- /nmtpy/metrics/meteor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import subprocess 4 | import pkg_resources 5 | 6 | from ..sysutils import get_temp_file 7 | from .metric import Metric 8 | 9 | METEOR_JAR = pkg_resources.resource_filename('nmtpy', 'external/meteor-1.5.jar') 10 | 11 | class METEORScore(Metric): 12 | def __init__(self, score=None): 13 | super(METEORScore, self).__init__(score) 14 | self.name = "METEOR" 15 | self.score = (100*score) if score else 0. 
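        # (Added note) the METEOR jar reports its final score in [0, 1]
        # (e.g. "Final score: 0.3203..."), so it is scaled by 100 above to sit
        # on the same 0-100 scale as the BLEU scores.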
16 | self.score_str = "%.3f" % self.score 17 | 18 | class METEORScorer(object): 19 | def __init__(self): 20 | self.__cmdline = ["java", "-Xmx2G", "-jar", METEOR_JAR] 21 | 22 | def compute(self, refs, hyps, language="auto", norm=False): 23 | cmdline = self.__cmdline[:] 24 | 25 | if isinstance(hyps, list): 26 | # Create a temporary file 27 | with get_temp_file(suffix=".hyps") as tmpf: 28 | for hyp in hyps: 29 | tmpf.write("%s\n" % hyp) 30 | 31 | cmdline.append(tmpf.name) 32 | 33 | elif isinstance(hyps, str): 34 | cmdline.append(hyps) 35 | 36 | # Make reference files a list 37 | refs = [refs] if isinstance(refs, str) else refs 38 | n_refs = len(refs) 39 | if n_refs > 1: 40 | # Multiple references 41 | # FIXME: METEOR can consume everything from stdin 42 | tmpff = get_temp_file(suffix=".refs") 43 | fname = tmpff.name 44 | tmpff.close() 45 | os.system('paste -d"\\n" %s > %s' % (" ".join(refs), fname)) 46 | cmdline.append(fname) 47 | else: 48 | cmdline.append(refs[0]) 49 | 50 | if language == "auto": 51 | # Take the extension of the 1st reference file, e.g. ".de" 52 | language = os.path.splitext(refs[0])[-1][1:] 53 | 54 | cmdline.extend(["-l", language]) 55 | if norm: 56 | cmdline.append("-norm") 57 | 58 | if n_refs > 1: 59 | # Multiple references 60 | cmdline.extend(["-r", str(n_refs)]) 61 | 62 | score = subprocess.run(cmdline, stdout=subprocess.PIPE, 63 | universal_newlines=True).stdout.splitlines() 64 | if len(score) == 0: 65 | return METEORScore() 66 | else: 67 | # Final score: 0.320320320320 68 | return METEORScore(float(score[-1].split(":")[-1].strip())) 69 | -------------------------------------------------------------------------------- /nmtpy/metrics/metric.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from functools import total_ordering 3 | 4 | @total_ordering 5 | class Metric(object): 6 | def __init__(self, score=None): 7 | self.score_str = "0.0" 8 | self.score = 0. 
9 | self.name = "" 10 | 11 | def __eq__(self, other): 12 | return self.score == other.score 13 | 14 | def __lt__(self, other): 15 | return self.score < other.score 16 | 17 | def __repr__(self): 18 | return "%s = %s" % (self.name, self.score_str) 19 | -------------------------------------------------------------------------------- /nmtpy/metrics/mtevalbleu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import math 4 | 5 | from collections import defaultdict 6 | 7 | from .metric import Metric 8 | 9 | # This is an exact reimplementation of mteval-v13a.pl 10 | # It currently only works for single reference 11 | 12 | LOG_2 = math.log(2) 13 | 14 | def score_segment(tst_words, ref_words, ref_ngram_freqs, max_order): 15 | # Create initial lists 16 | match_cnt = [0 for i in range(max_order)] 17 | tst_cnt = [0 for i in range(max_order)] 18 | ref_cnt = [0 for i in range(max_order)] 19 | tst_info = [0 for i in range(max_order)] 20 | ref_info = [0 for i in range(max_order)] 21 | 22 | ref_ngrams_max = {} 23 | 24 | # Get the ngram counts for the test segment 25 | tst_ngrams = words_to_ngrams(tst_words, max_order) 26 | len_tst = len(tst_words) 27 | for i in range(max_order): 28 | tst_cnt[i] = (len_tst - i) if i < len_tst else 0 29 | 30 | ########### 31 | # Reference 32 | ########### 33 | ref_ngrams = words_to_ngrams(ref_words, max_order) 34 | len_ref = len(ref_words) 35 | for ngram_words, freq in ref_ngrams.items(): 36 | # Counts of ngrams for this sentence 37 | ref_info[len(ngram_words) - 1] += ref_ngram_freqs[ngram_words] 38 | 39 | # Update the maximum count of this ngram 40 | # Shorter=>ref_ngrams_max[ngram_words] = max(ref_ngrams_max.get(ngram_words, -1), ref_ngrams[ngram_words]) 41 | if ngram_words in ref_ngrams_max: 42 | ref_ngrams_max[ngram_words] = max(ref_ngrams_max[ngram_words], freq) 43 | else: 44 | ref_ngrams_max[ngram_words] = freq 45 | 46 | # Update reference ngram counts 47 | for i in range(max_order): 48 | ref_cnt[i] = (len_ref - i) if i < len_ref else 0 49 | 50 | for ngram_words, freq in tst_ngrams.items(): 51 | if ngram_words in ref_ngrams_max: 52 | m = min(freq, ref_ngrams_max[ngram_words]) 53 | l = len(ngram_words) - 1 54 | tst_info[l] += ref_ngram_freqs[ngram_words] * m 55 | match_cnt[l] += m 56 | 57 | return len_ref, match_cnt, tst_cnt, ref_cnt, tst_info, ref_info 58 | 59 | def score_system(ref_segs, tst_segs, max_order): 60 | ref_ngram_freqs = compute_ngram_info(ref_segs, max_order) 61 | 62 | # 0-based indexing in contrast to perl version 63 | cum_match = [0 for i in range(max_order)] 64 | cum_tst_cnt = [0 for i in range(max_order)] 65 | cum_ref_cnt = [0 for i in range(max_order)] 66 | cum_tst_info = [0 for i in range(max_order)] 67 | cum_ref_info = [0 for i in range(max_order)] 68 | cum_ref_len = 0 69 | 70 | # Score each segment and keep statistics 71 | for tst, ref in zip(tst_segs, ref_segs): 72 | ref_len, match_cnt, tst_cnt, ref_cnt, tst_info, ref_info = score_segment(tst, ref, ref_ngram_freqs, max_order) 73 | 74 | # Sum ref length 75 | cum_ref_len += ref_len 76 | 77 | for i in range(max_order): 78 | cum_match[i] += match_cnt[i] 79 | cum_tst_cnt[i] += tst_cnt[i] 80 | cum_ref_cnt[i] += ref_cnt[i] 81 | cum_tst_info[i] += tst_info[i] 82 | cum_ref_info[i] += ref_info[i] 83 | 84 | # Compute length score 85 | exp_len_score = math.exp(min(0, 1 - cum_ref_len / cum_tst_cnt[0])) \ 86 | if cum_tst_cnt[0] > 0 else 0 87 | 88 | # For further length ratio computation 89 | tst_vs_ref_ratio = (cum_tst_cnt[0], 
cum_ref_len, math.log(exp_len_score)) 90 | 91 | return bleu_score(cum_ref_len, cum_match, cum_tst_cnt, exp_len_score, max_order), tst_vs_ref_ratio 92 | 93 | def read_file(filename, tokenizer, is_cased): 94 | """Read simple plain text file.""" 95 | sents = [] 96 | with open(filename) as f: 97 | for line in f: 98 | sents.append(tokenizer(line, is_cased)) 99 | return sents 100 | 101 | def words_to_ngrams(words, max_order): 102 | """Convert a sequence of words to an ngram count dict.""" 103 | d = defaultdict(int) 104 | 105 | # Iterate over word indices as start pointers 106 | for i in range(len(words)): 107 | # Sliding windows 108 | for j in range(min(max_order, len(words) - i)): 109 | # Increment counter, keep keys as tuples 110 | d[tuple(words[i: i+j+1])] += 1 111 | 112 | return d 113 | 114 | def compute_ngram_info(ref_segs, max_order): 115 | tot_words = 0 116 | 117 | # Segment-wise frequencies 118 | ngram_count = defaultdict(int) 119 | 120 | ngram_info = {} 121 | 122 | for words in ref_segs: 123 | tot_words += len(words) 124 | # Get frequencies and add them to ngramcpunt 125 | for key, value in words_to_ngrams(words, max_order).items(): 126 | ngram_count[key] += value 127 | 128 | for ngram_words, freq in ngram_count.items(): 129 | if len(ngram_words) == 1: 130 | # ngram is unigram => corpus frequency 131 | denom = tot_words 132 | else: 133 | # n-gram is n-gram => n-gram frequency 134 | denom = ngram_count[ngram_words[:-1]] 135 | 136 | ngram_info[ngram_words] = -math.log(freq / denom) / LOG_2 137 | return ngram_info 138 | 139 | def bleu_score(ref_len, matching_ngrams, tst_ngrams, exp_len_score, max_order): 140 | score = 0 141 | iscore = 0 142 | smooth = 1 143 | 144 | ind_scores = [] 145 | cum_scores = [] 146 | 147 | for i in range(max_order): 148 | if tst_ngrams[i] == 0: 149 | iscore = 0 150 | elif matching_ngrams[i] == 0: 151 | smooth *= 2 152 | iscore = math.log(1 / (smooth * tst_ngrams[i])) 153 | else: 154 | iscore = math.log(matching_ngrams[i] / tst_ngrams[i]) 155 | 156 | ind_scores.append(math.exp(iscore)) 157 | score += iscore 158 | cum_scores.append(math.exp(score / (i+1)) * exp_len_score) 159 | 160 | return ind_scores, cum_scores 161 | 162 | def tokenizer(s, is_cased): 163 | s = s.strip() 164 | 165 | # language-independent part: 166 | if '' in s: 167 | s = re.sub('', '', s) 168 | 169 | # language-dependent part (assuming Western languages): 170 | s = " %s " % s 171 | if not is_cased: 172 | s = s.lower() 173 | 174 | # tokenize punctuation 175 | s = re.sub('([\{-\~\[-\` -\&\(-\+\:-\@\/])', ' \\1 ', s) 176 | 177 | # tokenize period and comma unless preceded by a digit 178 | s = re.sub('([^0-9])([\.,])', '\\1 \\2 ', s) 179 | 180 | # tokenize period and comma unless followed by a digit 181 | s = re.sub('([\.,])([^0-9])', ' \\1 \\2', s) 182 | 183 | # tokenize dash when preceded by a digit 184 | if '-' in s: 185 | s = re.sub('([0-9])(-)', '\\1 \\2 ', s) 186 | 187 | # Strip multiple spaces 188 | # Strip trailing and leading spaces 189 | return re.sub('\s+', ' ', s).strip().split() 190 | 191 | ####################### 192 | class BLEUScore(Metric): 193 | def __init__(self, score=None): 194 | super(BLEUScore, self).__init__(score) 195 | self.name = "BLEU_v13a" 196 | if score: 197 | self.score = float(score.split()[0][:-1]) 198 | self.score_str = score 199 | 200 | class MTEvalV13aBLEUScorer(object): 201 | def __init__(self): 202 | pass 203 | 204 | def compute(self, refs, hyp_file): 205 | # Make reference files a list 206 | refs = [refs] if isinstance(refs, str) else refs 207 | 208 | # Take the 
first one 209 | ref_file = refs[0] 210 | 211 | # Read (detokenized) files and tokenize them 212 | ref_segs = read_file(ref_file, tokenizer, True) 213 | tst_segs = read_file(hyp_file, tokenizer, True) 214 | 215 | assert(len(ref_segs) == len(tst_segs)) 216 | 217 | (ind_scores, cum_scores), ratios = score_system(ref_segs, tst_segs, 4) 218 | 219 | bleu_str = "%.2f, %s (ratio=%.3f, hyp_len=%d, ref_len=%d)" % ( 220 | cum_scores[3]*100, 221 | "/".join([("%.1f" % (s*100)) for s in ind_scores[:4]]), 222 | (ratios[0]/ratios[1]), ratios[0], ratios[1]) 223 | return BLEUScore(bleu_str) 224 | -------------------------------------------------------------------------------- /nmtpy/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lium-lst/nmtpy/dc0a1618f217d5117d6abeacdc15a22443561acf/nmtpy/models/__init__.py -------------------------------------------------------------------------------- /nmtpy/models/attention_wmt.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from ..iterators.fusion import FusionIterator 3 | from .attention import Model as Attention 4 | 5 | # Same model as attention but using FusionIterator 6 | # Purpose was to train a monomodal system using the same .pkl 7 | # files prepared for multimodal Task 2 system. 8 | # 9 | # FIXME: Not tested since WMT16 Task 2 experiments, probably broken 10 | class Model(Attention): 11 | def __init__(self, seed, logger, **kwargs): 12 | # Call parent's init first 13 | super(Model, self).__init__(seed, logger, **kwargs) 14 | 15 | self.data_mode = kwargs.pop('data_mode', 'pairs') 16 | 17 | def load_valid_data(self, from_translate=False, data_mode='single'): 18 | if from_translate: 19 | self.valid_ref_files = self.data['valid_trg'] 20 | if isinstance(self.valid_ref_files, str): 21 | self.valid_ref_files = list([self.valid_ref_files]) 22 | 23 | self.valid_iterator = FusionIterator( 24 | mask=False, 25 | batch_size=1, 26 | pklfile=self.data['valid_src'], 27 | srcdict=self.src_dict, n_words_src=self.n_words_src, 28 | mode=data_mode) 29 | else: 30 | # Take the first validation item for NLL computation 31 | self.valid_iterator = FusionIterator( 32 | batch_size=self.batch_size, 33 | pklfile=self.data['valid_src'], 34 | trgdict=self.trg_dict, srcdict=self.src_dict, 35 | n_words_trg=self.n_words_trg, n_words_src=self.n_words_src, 36 | mode='single') # Override the given parameter 37 | 38 | self.valid_iterator.read() 39 | 40 | def load_data(self): 41 | self.train_iterator = FusionIterator( 42 | batch_size=self.batch_size, 43 | shuffle_mode=self.shuffle_mode, 44 | logger=self._logger, 45 | pklfile=self.data['train_src'], 46 | trgdict=self.trg_dict, srcdict=self.src_dict, 47 | n_words_trg=self.n_words_trg, n_words_src=self.n_words_src, 48 | mode=self.data_mode) 49 | # Prepare batches 50 | self.train_iterator.read() 51 | self.load_valid_data() 52 | -------------------------------------------------------------------------------- /nmtpy/models/basemodel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from collections import OrderedDict 3 | 4 | from abc import ABCMeta, abstractmethod 5 | 6 | import theano 7 | import theano.tensor as tensor 8 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 9 | 10 | import numpy as np 11 | from ..nmtutils import unzip 12 | from ..sysutils import readable_size, get_temp_file, get_param_dict, 
get_valid_evaluation 13 | from ..defaults import INT, FLOAT 14 | from ..optimizers import get_optimizer 15 | 16 | class BaseModel(object, metaclass=ABCMeta): 17 | def __init__(self, **kwargs): 18 | # This will save all arguments as instance attributes 19 | self.__dict__.update(kwargs) 20 | 21 | # Will be set when set_dropout is first called 22 | self._use_dropout = None 23 | 24 | # Theano TRNG 25 | self._trng = None 26 | 27 | # Input tensor lists 28 | self.inputs = None 29 | 30 | # Theano variables 31 | self.f_log_probs = None 32 | self.f_init = None 33 | self.f_next = None 34 | 35 | # Model parameters, i.e. weights and biases 36 | self.initial_params = None 37 | self.tparams = None 38 | 39 | # Iterators 40 | self.train_iterator = None 41 | self.valid_iterator = None 42 | 43 | # A theano shared variable for lrate annealing 44 | self.learning_rate = None 45 | 46 | # Optimizer instance (will not be serialized) 47 | self.__opt = None 48 | 49 | @staticmethod 50 | def beam_search(inputs, f_inits, f_nexts, beam_size=12, maxlen=100, suppress_unks=False, **kwargs): 51 | # Override this from your classes 52 | pass 53 | 54 | def set_options(self, optdict): 55 | """Filter out None's and '__[a-zA-Z]' then store into self._options.""" 56 | self._options = OrderedDict() 57 | for k,v in optdict.items(): 58 | # Don't keep model attributes with _ prefix 59 | if v is not None and not k.startswith('_'): 60 | self._options[k] = v 61 | 62 | def set_trng(self, seed): 63 | """Set the seed for Theano RNG.""" 64 | if seed == 0: 65 | # No seed given, randomly pick the seed 66 | seed = np.random.randint(2**29) + 1 67 | self._trng = RandomStreams(seed) 68 | 69 | def set_dropout(self, val): 70 | """Set dropout indicator for activation scaling if dropout is available through configuration.""" 71 | if self._use_dropout is None: 72 | self._use_dropout = theano.shared(np.float64(0.).astype(FLOAT)) 73 | else: 74 | self._use_dropout.set_value(float(val)) 75 | 76 | def update_lrate(self, lrate): 77 | """Update learning rate.""" 78 | self.__opt.set_lrate(lrate) 79 | 80 | def get_nb_params(self): 81 | """Return the number of parameters of the model.""" 82 | return readable_size(sum([p.size for p in self.initial_params.values()])) 83 | 84 | def save(self, fname): 85 | """Save model parameters as .npz.""" 86 | kwargs = OrderedDict() 87 | kwargs['opts'] = self._options 88 | if self.tparams is not None: 89 | kwargs.update(unzip(self.tparams)) 90 | 91 | # Save each param as a separate argument into npz 92 | np.savez(fname, **kwargs) 93 | 94 | def load(self, params): 95 | """Restore .npz checkpoint file into model.""" 96 | self.tparams = OrderedDict() 97 | 98 | params = get_param_dict(params) 99 | 100 | for k,v in params.items(): 101 | self.tparams[k] = theano.shared(v.astype(FLOAT), name=k) 102 | 103 | def init_shared_variables(self): 104 | """Initialize the shared variables of the model.""" 105 | # Create tensor dict 106 | self.tparams = OrderedDict() 107 | 108 | # Fill it with initial random weights 109 | for kk, pp in self.initial_params.items(): 110 | self.tparams[kk] = theano.shared(pp, name=kk) 111 | 112 | def update_shared_variables(self, _from): 113 | """Reset some variables from _from dict.""" 114 | for kk in _from.keys(): 115 | self.tparams[kk].set_value(_from[kk]) 116 | 117 | def val_loss(self, mean=True): 118 | """Compute validation loss.""" 119 | probs = [] 120 | 121 | # dict of x, x_mask, y, y_mask 122 | for data in self.valid_iterator: 123 | # Don't fail if data doesn't contain y_mask. 
The loss won't 124 | # be normalized but the training will continue 125 | norm = data['y_mask'].sum(0) if 'y_mask' in data else 1 126 | log_probs = self.f_log_probs(*list(data.values())) / norm 127 | probs.extend(log_probs) 128 | 129 | if mean: 130 | return np.array(probs).mean() 131 | else: 132 | return np.array(probs) 133 | 134 | def get_l2_weight_decay(self, decay_c, skip_bias=True): 135 | """Return l2 weight decay regularization term.""" 136 | decay_c = theano.shared(np.float64(decay_c).astype(FLOAT), name='decay_c') 137 | weight_decay = 0. 138 | for kk, vv in self.tparams.items(): 139 | # Skip biases for L2 regularization 140 | if not skip_bias or (skip_bias and vv.get_value().ndim > 1): 141 | weight_decay += (vv ** 2).sum() 142 | weight_decay *= decay_c 143 | return weight_decay 144 | 145 | def get_clipped_grads(self, grads, clip_c): 146 | """Clip gradients a la Pascanu et al.""" 147 | g2 = 0. 148 | new_grads = [] 149 | for g in grads: 150 | g2 += (g**2).sum() 151 | for g in grads: 152 | new_grads.append(tensor.switch(g2 > (clip_c**2), 153 | g / tensor.sqrt(g2) * clip_c, 154 | g)) 155 | return new_grads 156 | 157 | def build_optimizer(self, cost, regcost, clip_c, dont_update=None, opt_history=None): 158 | """Build optimizer by optionally disabling learning for some weights.""" 159 | tparams = OrderedDict(self.tparams) 160 | 161 | # Filter out weights that we do not want to update during backprop 162 | if dont_update is not None: 163 | for key in list(tparams.keys()): 164 | if key in dont_update: 165 | del tparams[key] 166 | 167 | # Our final cost 168 | final_cost = cost.mean() 169 | 170 | # If we have a regularization cost, add it 171 | if regcost is not None: 172 | final_cost += regcost 173 | 174 | # Normalize cost w.r.t sentence lengths to correctly compute perplexity 175 | # Only active when y_mask is available 176 | if 'y_mask' in self.inputs: 177 | norm_cost = (cost / self.inputs['y_mask'].sum(0)).mean() 178 | if regcost is not None: 179 | norm_cost += regcost 180 | else: 181 | norm_cost = final_cost 182 | 183 | # Get gradients of cost with respect to variables 184 | # This uses final_cost which is not normalized w.r.t sentence lengths 185 | grads = tensor.grad(final_cost, wrt=list(tparams.values())) 186 | 187 | # Clip gradients if requested 188 | if clip_c > 0: 189 | grads = self.get_clipped_grads(grads, clip_c) 190 | 191 | # Create optimizer, self.lrate is passed from nmt-train 192 | self.__opt = get_optimizer(self.optimizer)(lr0=self.lrate) 193 | self.__opt.set_trng(self._trng) 194 | #TODO: parameterize this! 
self.__opt.set_gradient_noise(0.1) 195 | 196 | # Get updates 197 | updates = self.__opt.get_updates(tparams, grads, opt_history) 198 | 199 | # Compile forward/backward function 200 | self.train_batch = theano.function(list(self.inputs.values()), norm_cost, updates=updates) 201 | 202 | def run_beam_search(self, beam_size=12, n_jobs=8, metric='bleu', f_valid_out=None): 203 | """Save model under /tmp for passing it to nmt-translate.""" 204 | # Save model temporarily 205 | with get_temp_file(suffix=".npz", delete=True) as tmpf: 206 | self.save(tmpf.name) 207 | result = get_valid_evaluation(tmpf.name, 208 | beam_size=beam_size, 209 | n_jobs=n_jobs, 210 | metric=metric, 211 | f_valid_out=f_valid_out) 212 | 213 | # Return every available metric back 214 | return result 215 | 216 | def info(self): 217 | """Reimplement to show model specific information before training.""" 218 | pass 219 | 220 | ########################################################## 221 | # For all the abstract methods below, you can take a look 222 | # at attention.py to understand how they are implemented. 223 | # Remember that you NEED to implement these methods in your 224 | # own model. 225 | ########################################################## 226 | 227 | @abstractmethod 228 | def load_data(self): 229 | """Load and prepare your training and validation data.""" 230 | pass 231 | 232 | @abstractmethod 233 | def init_params(self): 234 | """Initialize the weights and biases of your network.""" 235 | pass 236 | 237 | @abstractmethod 238 | def build(self): 239 | """Build the computational graph of your network.""" 240 | pass 241 | 242 | @abstractmethod 243 | def build_sampler(self, **kwargs): 244 | """Build f_init() and f_next() for beam-search.""" 245 | pass 246 | -------------------------------------------------------------------------------- /nmtpy/models/fusion_sum_dep_ind.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | 4 | import theano 5 | import theano.tensor as tensor 6 | 7 | # Ours 8 | from ..layers import * 9 | from ..defaults import INT, FLOAT 10 | 11 | # Base fusion model 12 | from .basefusion import Model as Fusion 13 | 14 | class Model(Fusion): 15 | def __init__(self, seed, logger, **kwargs): 16 | # Call parent's init first 17 | super(Model, self).__init__(seed, logger, **kwargs) 18 | 19 | # Set architecture specific methods 20 | self.init_gru_decoder = init_gru_decoder_multi 21 | self.gru_decoder = gru_decoder_multi 22 | 23 | ########## Define layers here ########### 24 | def init_gru_decoder_multi(params, nin, dim, dimctx, scale=0.01, prefix='gru_decoder_multi'): 25 | # Init with usual gru_cond function 26 | params = param_init_gru_cond(params, nin, dim, dimctx, scale, prefix) 27 | 28 | # Add separate attention weights for the 2nd modality 29 | params[pp(prefix, 'Wc_att2')] = norm_weight(dimctx, dimctx, scale=scale) 30 | params[pp(prefix, 'b_att2')] = np.zeros((dimctx,)).astype(FLOAT) 31 | 32 | # attention: This gives the alpha's 33 | params[pp(prefix, 'U_att2')] = norm_weight(dimctx, 1, scale=scale) 34 | params[pp(prefix, 'c_att2')] = np.zeros((1,)).astype(FLOAT) 35 | 36 | return params 37 | 38 | def gru_decoder_multi(tparams, state_below, 39 | ctx1, ctx2, prefix='gru_decoder_multi', 40 | input_mask=None, one_step=False, 41 | init_state=None, ctx1_mask=None): 42 | if one_step: 43 | assert init_state, 'previous state must be provided' 44 | 45 | # Context 46 | # n_timesteps x n_samples x ctxdim 47 | assert ctx1 and 
ctx2, 'Contexts must be provided' 48 | assert ctx1.ndim == 3 and ctx2.ndim == 3, 'Contexts must be 3-d: #annotation x #sample x dim' 49 | 50 | # Number of padded source timesteps 51 | nsteps = state_below.shape[0] 52 | 53 | # Batch or single sample? 54 | n_samples = state_below.shape[1] if state_below.ndim == 3 else 1 55 | 56 | # if we have no mask, we assume all the inputs are valid 57 | # tensor.alloc(value, *shape) 58 | # input_mask: (n_steps, 1) filled with 1 59 | if input_mask is None: 60 | input_mask = tensor.alloc(1., nsteps, 1) 61 | 62 | # Infer RNN dimensionality 63 | dim = tparams[pp(prefix, 'Wcx')].shape[1] 64 | 65 | # initial/previous state 66 | # if not given, assume it's all zeros 67 | if init_state is None: 68 | init_state = tensor.alloc(0., n_samples, dim) 69 | 70 | # These two dot products are same with gru_layer, refer to the equations. 71 | # [W_r * X + b_r, W_z * X + b_z] 72 | state_below_ = tensor.dot(state_below, tparams[pp(prefix, 'W')]) + tparams[pp(prefix, 'b')] 73 | 74 | # input to compute the hidden state proposal 75 | # This is the [W*x]_j in the eq. 8 of the paper 76 | state_belowx = tensor.dot(state_below, tparams[pp(prefix, 'Wx')]) + tparams[pp(prefix, 'bx')] 77 | 78 | # Wc_att: dimctx -> dimctx 79 | # Linearly transform the contexts to another space with same dimensionality 80 | pctx1_ = tensor.dot(ctx1, tparams[pp(prefix, 'Wc_att')]) + tparams[pp(prefix, 'b_att')] 81 | pctx2_ = tensor.dot(ctx2, tparams[pp(prefix, 'Wc_att2')]) + tparams[pp(prefix, 'b_att2')] 82 | 83 | # Step function for the recurrence/scan 84 | # Sequences 85 | # --------- 86 | # m_ : mask 87 | # x_ : state_below_ 88 | # xx_ : state_belowx 89 | # outputs_info 90 | # ------------ 91 | # h_ : init_state, 92 | # ctx_ : need to be defined as it's returned by _step 93 | # alpha1_: need to be defined as it's returned by _step 94 | # alpha2_: need to be defined as it's returned by _step 95 | # non sequences 96 | # ------------- 97 | # pctx1_ : pctx1_ 98 | # pctx2_ : pctx2_ 99 | # cc1_ : ctx1 100 | # cc2_ : ctx2 101 | # and all the shared weights and biases.. 102 | def _step(m_, x_, xx_, 103 | h_, ctx_, alpha1_, alpha2_, # These ctx and alpha's are not used in the computations 104 | pctx1_, pctx2_, cc1_, cc2_, U, Wc, W_comb_att, U_att, c_att, U_att2, c_att2, 105 | Ux, Wcx, U_nl, Ux_nl, b_nl, bx_nl): 106 | 107 | # Do a step of classical GRU 108 | h1 = gru_step(m_, x_, xx_, h_, U, Ux) 109 | 110 | ########### 111 | # Attention 112 | ########### 113 | # h1 X W_comb_att 114 | # W_comb_att: dim -> dimctx 115 | # pstate_ should be 2D as we're working with unrolled timesteps 116 | pstate_ = tensor.dot(h1, W_comb_att) 117 | 118 | # Accumulate in pctx*__ and apply tanh() 119 | # This becomes the projected context(s) + the current hidden state 120 | # of the decoder, e.g. this is the information accumulating 121 | # into the returned original contexts with the knowledge of target 122 | # sentence decoding. 123 | pctx1__ = tanh(pctx1_ + pstate_[None, :, :]) 124 | pctx2__ = tanh(pctx2_ + pstate_[None, :, :]) 125 | 126 | # Affine transformation for alpha* = (pctx*__ X U_att) + c_att 127 | # We're now down to scalar alpha's for each accumulated 128 | # context (0th dim) in the pctx*__ 129 | # alpha1 should be n_timesteps, 1, 1 130 | alpha1 = tensor.dot(pctx1__, U_att) + c_att 131 | alpha2 = tensor.dot(pctx2__, U_att2) + c_att2 132 | 133 | # Drop the last dimension, e.g. 
(n_timesteps, 1) 134 | alpha1 = alpha1.reshape([alpha1.shape[0], alpha1.shape[1]]) 135 | alpha2 = alpha2.reshape([alpha2.shape[0], alpha2.shape[1]]) 136 | 137 | # Exponentiate alpha1 138 | alpha1 = tensor.exp(alpha1 - alpha1.max(0, keepdims=True)) 139 | alpha2 = tensor.exp(alpha2 - alpha2.max(0, keepdims=True)) 140 | 141 | # If there is a context mask, multiply with it to cancel unnecessary steps 142 | # We won't have a ctx_mask for image vectors 143 | if ctx1_mask: 144 | alpha1 = alpha1 * ctx1_mask 145 | 146 | # Normalize so that the sum makes 1 147 | alpha1 = alpha1 / alpha1.sum(0, keepdims=True) 148 | alpha2 = alpha2 / alpha2.sum(0, keepdims=True) 149 | 150 | # Compute the current context ctx*_ as the alpha-weighted sum of 151 | # the initial contexts ctx*'s 152 | ctx1_ = (cc1_ * alpha1[:, :, None]).sum(0) 153 | ctx2_ = (cc2_ * alpha2[:, :, None]).sum(0) 154 | # n_samples x ctxdim (2000) 155 | 156 | # Sum of contexts 157 | ctx_ = tanh(ctx1_ + ctx2_) 158 | 159 | ############################################ 160 | # ctx*_ and alpha computations are completed 161 | ############################################ 162 | 163 | #################################### 164 | # The below code is another GRU cell 165 | #################################### 166 | # Affine transformation: h1 X U_nl + b_nl 167 | # U_nl, b_nl: Stacked dim*2 168 | preact = tensor.dot(h1, U_nl) + b_nl 169 | 170 | # Transform the weighted context sum with Wc 171 | # and add it to preact 172 | # Wc: dimctx -> Stacked dim*2 173 | preact += tensor.dot(ctx_, Wc) 174 | 175 | # Apply sigmoid nonlinearity 176 | preact = sigmoid(preact) 177 | 178 | # Slice activations: New gates r2 and u2 179 | r2 = tensor_slice(preact, 0, dim) 180 | u2 = tensor_slice(preact, 1, dim) 181 | 182 | preactx = (tensor.dot(h1, Ux_nl) + bx_nl) * r2 183 | preactx += tensor.dot(ctx_, Wcx) 184 | 185 | # Candidate hidden 186 | h2_tilda = tanh(preactx) 187 | 188 | # Leaky integration between the new h2 and the 189 | # old h1 computed in line 285 190 | h2 = u2 * h2_tilda + (1. - u2) * h1 191 | h2 = m_[:, None] * h2 + (1. 
- m_)[:, None] * h1 192 | 193 | return h2, ctx_, alpha1.T, alpha2.T 194 | 195 | # Sequences are the input mask and the transformed target embeddings 196 | seqs = [input_mask, state_below_, state_belowx] 197 | 198 | # Create a list of shared parameters for easy parameter passing 199 | shared_vars = [tparams[pp(prefix, 'U')], 200 | tparams[pp(prefix, 'Wc')], 201 | tparams[pp(prefix, 'W_comb_att')], 202 | tparams[pp(prefix, 'U_att')], 203 | tparams[pp(prefix, 'c_att')], 204 | tparams[pp(prefix, 'U_att2')], 205 | tparams[pp(prefix, 'c_att2')], 206 | tparams[pp(prefix, 'Ux')], 207 | tparams[pp(prefix, 'Wcx')], 208 | tparams[pp(prefix, 'U_nl')], 209 | tparams[pp(prefix, 'Ux_nl')], 210 | tparams[pp(prefix, 'b_nl')], 211 | tparams[pp(prefix, 'bx_nl')]] 212 | 213 | if one_step: 214 | rval = _step(*(seqs + [init_state, None, None, None, pctx1_, pctx2_, ctx1, ctx2] + shared_vars)) 215 | else: 216 | outputs_info=[init_state, 217 | tensor.alloc(0., n_samples, ctx1.shape[2]), # ctxdim (ctx_) 218 | tensor.alloc(0., n_samples, ctx1.shape[0]), # n_timesteps (alpha1) 219 | tensor.alloc(0., n_samples, ctx2.shape[0])] # n_timesteps (alpha2) 220 | 221 | rval, updates = theano.scan(_step, 222 | sequences=seqs, 223 | outputs_info=outputs_info, 224 | non_sequences=[pctx1_, pctx2_, ctx1, ctx2] + shared_vars, 225 | name=pp(prefix, '_layers'), 226 | n_steps=nsteps, 227 | strict=True) 228 | return rval 229 | 230 | 231 | -------------------------------------------------------------------------------- /nmtpy/models/fusion_sum_ind_dep.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | 4 | import theano 5 | import theano.tensor as tensor 6 | 7 | # Ours 8 | from ..layers import * 9 | from ..defaults import INT, FLOAT 10 | 11 | # Base fusion model 12 | from .basefusion import Model as Fusion 13 | 14 | class Model(Fusion): 15 | def __init__(self, seed, logger, **kwargs): 16 | # Call parent's init first 17 | super(Model, self).__init__(seed, logger, **kwargs) 18 | 19 | # Set architecture specific methods 20 | self.init_gru_decoder = init_gru_decoder_multi 21 | self.gru_decoder = gru_decoder_multi 22 | 23 | ########## Define layers here ########### 24 | def init_gru_decoder_multi(params, nin, dim, dimctx, scale=0.01, prefix='gru_decoder_multi'): 25 | # Init with usual gru_cond function 26 | params = param_init_gru_cond(params, nin, dim, dimctx, scale, prefix) 27 | 28 | # Add separate attention weights for the 2nd modality 29 | params[pp(prefix, 'Wc_att2')] = norm_weight(dimctx, dimctx, scale=scale) 30 | params[pp(prefix, 'b_att2')] = np.zeros((dimctx,)).astype(FLOAT) 31 | 32 | # attention: This gives the alpha's 33 | params[pp(prefix, 'W_comb_att2')] = norm_weight(dim, dimctx, scale=scale) 34 | 35 | return params 36 | 37 | def gru_decoder_multi(tparams, state_below, 38 | ctx1, ctx2, prefix='gru_decoder_multi', 39 | input_mask=None, one_step=False, 40 | init_state=None, ctx1_mask=None): 41 | if one_step: 42 | assert init_state, 'previous state must be provided' 43 | 44 | # Context 45 | # n_timesteps x n_samples x ctxdim 46 | assert ctx1 and ctx2, 'Contexts must be provided' 47 | assert ctx1.ndim == 3 and ctx2.ndim == 3, 'Contexts must be 3-d: #annotation x #sample x dim' 48 | 49 | # Number of padded source timesteps 50 | nsteps = state_below.shape[0] 51 | 52 | # Batch or single sample? 
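    # (Added note) state_below holds the transformed target embeddings: 3-d
    # (n_timesteps, n_samples, dim) during training, or 2-d for a single
    # decoding step, in which case it is treated as one sample below.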
53 | n_samples = state_below.shape[1] if state_below.ndim == 3 else 1 54 | 55 | # if we have no mask, we assume all the inputs are valid 56 | # tensor.alloc(value, *shape) 57 | # input_mask: (n_steps, 1) filled with 1 58 | if input_mask is None: 59 | input_mask = tensor.alloc(1., nsteps, 1) 60 | 61 | # Infer RNN dimensionality 62 | dim = tparams[pp(prefix, 'Wcx')].shape[1] 63 | 64 | # initial/previous state 65 | # if not given, assume it's all zeros 66 | if init_state is None: 67 | init_state = tensor.alloc(0., n_samples, dim) 68 | 69 | # These two dot products are same with gru_layer, refer to the equations. 70 | # [W_r * X + b_r, W_z * X + b_z] 71 | state_below_ = tensor.dot(state_below, tparams[pp(prefix, 'W')]) + tparams[pp(prefix, 'b')] 72 | 73 | # input to compute the hidden state proposal 74 | # This is the [W*x]_j in the eq. 8 of the paper 75 | state_belowx = tensor.dot(state_below, tparams[pp(prefix, 'Wx')]) + tparams[pp(prefix, 'bx')] 76 | 77 | # Wc_att: dimctx -> dimctx 78 | # Linearly transform the contexts to another space with same dimensionality 79 | pctx1_ = tensor.dot(ctx1, tparams[pp(prefix, 'Wc_att')]) + tparams[pp(prefix, 'b_att')] 80 | pctx2_ = tensor.dot(ctx2, tparams[pp(prefix, 'Wc_att2')]) + tparams[pp(prefix, 'b_att2')] 81 | 82 | # Step function for the recurrence/scan 83 | # Sequences 84 | # --------- 85 | # m_ : mask 86 | # x_ : state_below_ 87 | # xx_ : state_belowx 88 | # outputs_info 89 | # ------------ 90 | # h_ : init_state, 91 | # ctx_ : need to be defined as it's returned by _step 92 | # alpha1_: need to be defined as it's returned by _step 93 | # alpha2_: need to be defined as it's returned by _step 94 | # non sequences 95 | # ------------- 96 | # pctx1_ : pctx1_ 97 | # pctx2_ : pctx2_ 98 | # cc1_ : ctx1 99 | # cc2_ : ctx2 100 | # and all the shared weights and biases.. 101 | def _step(m_, x_, xx_, 102 | h_, ctx_, alpha1_, alpha2_, # These ctx and alpha's are not used in the computations 103 | pctx1_, pctx2_, cc1_, cc2_, U, Wc, W_comb_att, W_comb_att2, U_att, c_att, 104 | Ux, Wcx, U_nl, Ux_nl, b_nl, bx_nl): 105 | 106 | # Do a step of classical GRU 107 | h1 = gru_step(m_, x_, xx_, h_, U, Ux) 108 | 109 | ########### 110 | # Attention 111 | ########### 112 | # h1 X W_comb_att 113 | # W_comb_att: dim -> dimctx 114 | # pstate_ should be 2D as we're working with unrolled timesteps 115 | pstate1_ = tensor.dot(h1, W_comb_att) 116 | pstate2_ = tensor.dot(h1, W_comb_att2) 117 | 118 | # Accumulate in pctx*__ and apply tanh() 119 | # This becomes the projected context(s) + the current hidden state 120 | # of the decoder, e.g. this is the information accumulating 121 | # into the returned original contexts with the knowledge of target 122 | # sentence decoding. 123 | pctx1__ = tanh(pctx1_ + pstate1_[None, :, :]) 124 | pctx2__ = tanh(pctx2_ + pstate2_[None, :, :]) 125 | 126 | # Affine transformation for alpha* = (pctx*__ X U_att) + c_att 127 | # We're now down to scalar alpha's for each accumulated 128 | # context (0th dim) in the pctx*__ 129 | # alpha1 should be n_timesteps, 1, 1 130 | alpha1 = tensor.dot(pctx1__, U_att) + c_att 131 | alpha2 = tensor.dot(pctx2__, U_att) + c_att 132 | 133 | # Drop the last dimension, e.g. 
(n_timesteps, 1) 134 | alpha1 = alpha1.reshape([alpha1.shape[0], alpha1.shape[1]]) 135 | alpha2 = alpha2.reshape([alpha2.shape[0], alpha2.shape[1]]) 136 | 137 | # Exponentiate alpha1 138 | alpha1 = tensor.exp(alpha1 - alpha1.max(0, keepdims=True)) 139 | alpha2 = tensor.exp(alpha2 - alpha2.max(0, keepdims=True)) 140 | 141 | # If there is a context mask, multiply with it to cancel unnecessary steps 142 | # We won't have a ctx_mask for image vectors 143 | if ctx1_mask: 144 | alpha1 = alpha1 * ctx1_mask 145 | 146 | # Normalize so that the sum makes 1 147 | alpha1 = alpha1 / alpha1.sum(0, keepdims=True) 148 | alpha2 = alpha2 / alpha2.sum(0, keepdims=True) 149 | 150 | # Compute the current context ctx*_ as the alpha-weighted sum of 151 | # the initial contexts ctx*'s 152 | ctx1_ = (cc1_ * alpha1[:, :, None]).sum(0) 153 | ctx2_ = (cc2_ * alpha2[:, :, None]).sum(0) 154 | # n_samples x ctxdim (2000) 155 | 156 | # Sum of contexts 157 | ctx_ = tanh(ctx1_ + ctx2_) 158 | 159 | ############################################ 160 | # ctx*_ and alpha computations are completed 161 | ############################################ 162 | 163 | #################################### 164 | # The below code is another GRU cell 165 | #################################### 166 | # Affine transformation: h1 X U_nl + b_nl 167 | # U_nl, b_nl: Stacked dim*2 168 | preact = tensor.dot(h1, U_nl) + b_nl 169 | 170 | # Transform the weighted context sum with Wc 171 | # and add it to preact 172 | # Wc: dimctx -> Stacked dim*2 173 | preact += tensor.dot(ctx_, Wc) 174 | 175 | # Apply sigmoid nonlinearity 176 | preact = sigmoid(preact) 177 | 178 | # Slice activations: New gates r2 and u2 179 | r2 = tensor_slice(preact, 0, dim) 180 | u2 = tensor_slice(preact, 1, dim) 181 | 182 | preactx = (tensor.dot(h1, Ux_nl) + bx_nl) * r2 183 | preactx += tensor.dot(ctx_, Wcx) 184 | 185 | # Candidate hidden 186 | h2_tilda = tanh(preactx) 187 | 188 | # Leaky integration between the new h2 and the 189 | # old h1 computed in line 285 190 | h2 = u2 * h2_tilda + (1. - u2) * h1 191 | h2 = m_[:, None] * h2 + (1. 
- m_)[:, None] * h1 192 | 193 | return h2, ctx_, alpha1.T, alpha2.T 194 | 195 | # Sequences are the input mask and the transformed target embeddings 196 | seqs = [input_mask, state_below_, state_belowx] 197 | 198 | # Create a list of shared parameters for easy parameter passing 199 | shared_vars = [tparams[pp(prefix, 'U')], 200 | tparams[pp(prefix, 'Wc')], 201 | tparams[pp(prefix, 'W_comb_att')], 202 | tparams[pp(prefix, 'W_comb_att2')], 203 | tparams[pp(prefix, 'U_att')], 204 | tparams[pp(prefix, 'c_att')], 205 | tparams[pp(prefix, 'Ux')], 206 | tparams[pp(prefix, 'Wcx')], 207 | tparams[pp(prefix, 'U_nl')], 208 | tparams[pp(prefix, 'Ux_nl')], 209 | tparams[pp(prefix, 'b_nl')], 210 | tparams[pp(prefix, 'bx_nl')]] 211 | 212 | if one_step: 213 | rval = _step(*(seqs + [init_state, None, None, None, pctx1_, pctx2_, ctx1, ctx2] + shared_vars)) 214 | else: 215 | outputs_info=[init_state, 216 | tensor.alloc(0., n_samples, ctx1.shape[2]), # ctxdim (ctx_) 217 | tensor.alloc(0., n_samples, ctx1.shape[0]), # n_timesteps (alpha1) 218 | tensor.alloc(0., n_samples, ctx2.shape[0])] # n_timesteps (alpha2) 219 | 220 | rval, updates = theano.scan(_step, 221 | sequences=seqs, 222 | outputs_info=outputs_info, 223 | non_sequences=[pctx1_, pctx2_, ctx1, ctx2] + shared_vars, 224 | name=pp(prefix, '_layers'), 225 | n_steps=nsteps, 226 | strict=True) 227 | return rval 228 | 229 | 230 | -------------------------------------------------------------------------------- /nmtpy/models/fusion_sum_ind_ind.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | 4 | import theano 5 | import theano.tensor as tensor 6 | 7 | # Ours 8 | from ..layers import * 9 | from ..defaults import INT, FLOAT 10 | 11 | # Base fusion model 12 | from .basefusion import Model as Fusion 13 | 14 | class Model(Fusion): 15 | def __init__(self, seed, logger, **kwargs): 16 | # Call parent's init first 17 | super(Model, self).__init__(seed, logger, **kwargs) 18 | 19 | # Set architecture specific methods 20 | self.init_gru_decoder = init_gru_decoder_multi 21 | self.gru_decoder = gru_decoder_multi 22 | 23 | ########## Define layers here ########### 24 | def init_gru_decoder_multi(params, nin, dim, dimctx, scale=0.01, prefix='gru_decoder_multi'): 25 | # Init with usual gru_cond function 26 | return param_init_gru_cond(params, nin, dim, dimctx, scale, prefix, False) 27 | 28 | def gru_decoder_multi(tparams, state_below, 29 | ctx1, ctx2, prefix='gru_decoder_multi', 30 | input_mask=None, one_step=False, 31 | init_state=None, ctx1_mask=None): 32 | if one_step: 33 | assert init_state, 'previous state must be provided' 34 | 35 | # Context 36 | # n_timesteps x n_samples x ctxdim 37 | assert ctx1 and ctx2, 'Contexts must be provided' 38 | assert ctx1.ndim == 3 and ctx2.ndim == 3, 'Contexts must be 3-d: #annotation x #sample x dim' 39 | 40 | # Number of padded source timesteps 41 | nsteps = state_below.shape[0] 42 | 43 | # Batch or single sample? 
44 | n_samples = state_below.shape[1] if state_below.ndim == 3 else 1 45 | 46 | # if we have no mask, we assume all the inputs are valid 47 | # tensor.alloc(value, *shape) 48 | # input_mask: (n_steps, 1) filled with 1 49 | if input_mask is None: 50 | input_mask = tensor.alloc(1., nsteps, 1) 51 | 52 | # Infer RNN dimensionality 53 | dim = tparams[pp(prefix, 'Wcx')].shape[1] 54 | 55 | # initial/previous state 56 | # if not given, assume it's all zeros 57 | if init_state is None: 58 | init_state = tensor.alloc(0., n_samples, dim) 59 | 60 | # These two dot products are same with gru_layer, refer to the equations. 61 | # [W_r * X + b_r, W_z * X + b_z] 62 | state_below_ = tensor.dot(state_below, tparams[pp(prefix, 'W')]) + tparams[pp(prefix, 'b')] 63 | 64 | # input to compute the hidden state proposal 65 | # This is the [W*x]_j in the eq. 8 of the paper 66 | state_belowx = tensor.dot(state_below, tparams[pp(prefix, 'Wx')]) + tparams[pp(prefix, 'bx')] 67 | 68 | # Wc_att: dimctx -> dimctx 69 | # Linearly transform the contexts to another space with same dimensionality 70 | pctx1_ = tensor.dot(ctx1, tparams[pp(prefix, 'Wc_att')]) + tparams[pp(prefix, 'b_att')] 71 | pctx2_ = tensor.dot(ctx2, tparams[pp(prefix, 'Wc_att')]) + tparams[pp(prefix, 'b_att')] 72 | 73 | # Step function for the recurrence/scan 74 | # Sequences 75 | # --------- 76 | # m_ : mask 77 | # x_ : state_below_ 78 | # xx_ : state_belowx 79 | # outputs_info 80 | # ------------ 81 | # h_ : init_state, 82 | # ctx_ : need to be defined as it's returned by _step 83 | # alpha1_: need to be defined as it's returned by _step 84 | # alpha2_: need to be defined as it's returned by _step 85 | # non sequences 86 | # ------------- 87 | # pctx1_ : pctx1_ 88 | # pctx2_ : pctx2_ 89 | # cc1_ : ctx1 90 | # cc2_ : ctx2 91 | # and all the shared weights and biases.. 92 | def _step(m_, x_, xx_, 93 | h_, ctx_, alpha1_, alpha2_, # These ctx and alpha's are not used in the computations 94 | pctx1_, pctx2_, cc1_, cc2_, U, Wc, W_comb_att, U_att, c_att, Ux, Wcx, U_nl, Ux_nl, b_nl, bx_nl): 95 | 96 | # Do a step of classical GRU 97 | h1 = gru_step(m_, x_, xx_, h_, U, Ux) 98 | 99 | ########### 100 | # Attention 101 | ########### 102 | # h1 X W_comb_att 103 | # W_comb_att: dim -> dimctx 104 | # pstate_ should be 2D as we're working with unrolled timesteps 105 | pstate_ = tensor.dot(h1, W_comb_att) 106 | 107 | # Accumulate in pctx*__ and apply tanh() 108 | # This becomes the projected context(s) + the current hidden state 109 | # of the decoder, e.g. this is the information accumulating 110 | # into the returned original contexts with the knowledge of target 111 | # sentence decoding. 112 | pctx1__ = tanh(pctx1_ + pstate_[None, :, :]) 113 | pctx2__ = tanh(pctx2_ + pstate_[None, :, :]) 114 | 115 | # Affine transformation for alpha* = (pctx*__ X U_att) + c_att 116 | # We're now down to scalar alpha's for each accumulated 117 | # context (0th dim) in the pctx*__ 118 | # alpha1 should be n_timesteps, 1, 1 119 | alpha1 = tensor.dot(pctx1__, U_att) + c_att 120 | alpha2 = tensor.dot(pctx2__, U_att) + c_att 121 | 122 | # Drop the last dimension, e.g. 
(n_timesteps, 1) 123 | alpha1 = alpha1.reshape([alpha1.shape[0], alpha1.shape[1]]) 124 | alpha2 = alpha2.reshape([alpha2.shape[0], alpha2.shape[1]]) 125 | 126 | # Exponentiate alpha1 127 | alpha1 = tensor.exp(alpha1 - alpha1.max(0, keepdims=True)) 128 | alpha2 = tensor.exp(alpha2 - alpha2.max(0, keepdims=True)) 129 | 130 | # If there is a context mask, multiply with it to cancel unnecessary steps 131 | # We won't have a ctx_mask for image vectors 132 | if ctx1_mask: 133 | alpha1 = alpha1 * ctx1_mask 134 | 135 | # Normalize so that the sum makes 1 136 | alpha1 = alpha1 / alpha1.sum(0, keepdims=True) 137 | alpha2 = alpha2 / alpha2.sum(0, keepdims=True) 138 | 139 | # Compute the current context ctx*_ as the alpha-weighted sum of 140 | # the initial contexts ctx*'s 141 | ctx1_ = (cc1_ * alpha1[:, :, None]).sum(0) 142 | ctx2_ = (cc2_ * alpha2[:, :, None]).sum(0) 143 | # n_samples x ctxdim (2000) 144 | 145 | # Sum of contexts 146 | ctx_ = tanh(ctx1_ + ctx2_) 147 | 148 | ############################################ 149 | # ctx*_ and alpha computations are completed 150 | ############################################ 151 | 152 | #################################### 153 | # The below code is another GRU cell 154 | #################################### 155 | # Affine transformation: h1 X U_nl + b_nl 156 | # U_nl, b_nl: Stacked dim*2 157 | preact = tensor.dot(h1, U_nl) + b_nl 158 | 159 | # Transform the weighted context sum with Wc 160 | # and add it to preact 161 | # Wc: dimctx -> Stacked dim*2 162 | preact += tensor.dot(ctx_, Wc) 163 | 164 | # Apply sigmoid nonlinearity 165 | preact = sigmoid(preact) 166 | 167 | # Slice activations: New gates r2 and u2 168 | r2 = tensor_slice(preact, 0, dim) 169 | u2 = tensor_slice(preact, 1, dim) 170 | 171 | preactx = (tensor.dot(h1, Ux_nl) + bx_nl) * r2 172 | preactx += tensor.dot(ctx_, Wcx) 173 | 174 | # Candidate hidden 175 | h2_tilda = tanh(preactx) 176 | 177 | # Leaky integration between the new h2 and the 178 | # old h1 computed in line 285 179 | h2 = u2 * h2_tilda + (1. - u2) * h1 180 | h2 = m_[:, None] * h2 + (1. 
- m_)[:, None] * h1 181 | 182 | return h2, ctx_, alpha1.T, alpha2.T 183 | 184 | # Sequences are the input mask and the transformed target embeddings 185 | seqs = [input_mask, state_below_, state_belowx] 186 | 187 | # Create a list of shared parameters for easy parameter passing 188 | shared_vars = [tparams[pp(prefix, 'U')], 189 | tparams[pp(prefix, 'Wc')], 190 | tparams[pp(prefix, 'W_comb_att')], 191 | tparams[pp(prefix, 'U_att')], 192 | tparams[pp(prefix, 'c_att')], 193 | tparams[pp(prefix, 'Ux')], 194 | tparams[pp(prefix, 'Wcx')], 195 | tparams[pp(prefix, 'U_nl')], 196 | tparams[pp(prefix, 'Ux_nl')], 197 | tparams[pp(prefix, 'b_nl')], 198 | tparams[pp(prefix, 'bx_nl')]] 199 | 200 | if one_step: 201 | rval = _step(*(seqs + [init_state, None, None, None, pctx1_, pctx2_, ctx1, ctx2] + shared_vars)) 202 | else: 203 | outputs_info=[init_state, 204 | tensor.alloc(0., n_samples, ctx1.shape[2]), # ctxdim (ctx_) 205 | tensor.alloc(0., n_samples, ctx1.shape[0]), # n_timesteps (alpha1) 206 | tensor.alloc(0., n_samples, ctx2.shape[0])] # n_timesteps (alpha2) 207 | 208 | rval, updates = theano.scan(_step, 209 | sequences=seqs, 210 | outputs_info=outputs_info, 211 | non_sequences=[pctx1_, pctx2_, ctx1, ctx2] + shared_vars, 212 | name=pp(prefix, '_layers'), 213 | n_steps=nsteps, 214 | strict=True) 215 | return rval 216 | 217 | 218 | -------------------------------------------------------------------------------- /nmtpy/models/mnmt_yemb_mulimg.py: -------------------------------------------------------------------------------- 1 | mnmt_trgmul.py -------------------------------------------------------------------------------- /nmtpy/models/rnnlm.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from collections import OrderedDict 3 | import numpy as np 4 | 5 | import theano 6 | import theano.tensor as tensor 7 | from ..layers import tanh, get_new_layer 8 | from ..defaults import INT, FLOAT 9 | from ..nmtutils import load_dictionary, norm_weight 10 | from ..iterators.text import TextIterator 11 | 12 | from .basemodel import BaseModel 13 | 14 | class Model(BaseModel): 15 | def __init__(self, seed, logger, **kwargs): 16 | # Call parent's init first 17 | super(Model, self).__init__(**kwargs) 18 | 19 | # Set the logger 20 | self._logger = logger 21 | 22 | # Load dictionaries 23 | dicts = kwargs.pop('dicts') 24 | 25 | # Let's default to GRU 26 | self.rnn_type = kwargs.pop('rnn_type', 'gru') 27 | 28 | self.src_dict, src_idict = load_dictionary(dicts['src']) 29 | self.n_words = min(self.n_words, len(self.src_dict)) \ 30 | if self.n_words > 0 else len(self.src_dict) 31 | 32 | self.set_options(self.__dict__) 33 | self.src_idict = src_idict 34 | self.set_trng(seed) 35 | self.set_dropout(False) 36 | 37 | def load_valid_data(self): 38 | self.valid_iterator = TextIterator( 39 | batch_size=1, 40 | mask=True, 41 | shuffle_mode=None, 42 | file=self.data['valid_src'], 43 | dict=self.src_dict, 44 | n_words=self.n_words, 45 | name='y') # This is important for the loss to be correctly normalized! 46 | self.valid_iterator.read() 47 | 48 | def load_data(self): 49 | self.train_iterator = TextIterator( 50 | batch_size=self.batch_size, 51 | mask=True, 52 | shuffle_mode=None, # or simple or trglen, not tested in rnnlm. 
53 | file=self.data['train_src'], 54 | dict=self.src_dict, 55 | n_words=self.n_words) 56 | 57 | self.train_iterator.read() 58 | self.load_valid_data() 59 | 60 | def init_params(self): 61 | params = OrderedDict() 62 | 63 | # encoder: ff tanh 64 | ######### 65 | # Forward encoder initializer 66 | #params = get_new_layer(self.enc_type)[0](params, prefix='encoder', nin=self.in_emb_dim, nout=self.rnn_dim) 67 | # embedding weights for encoder 68 | params['W_in_emb'] = norm_weight(self.n_words, self.in_emb_dim) 69 | 70 | # init_state, init_cell 71 | #params = get_new_layer('ff')[0](params, prefix='ff_state', nin=self.in_emb_dim, nout=self.rnn_dim) 72 | 73 | # recurrent layer: in_emb_dim to rnn_dim 74 | params = get_new_layer(self.rnn_type)[0](params, prefix='recurrent', nin=self.in_emb_dim, dim=self.rnn_dim) 75 | 76 | # generate target embedding 77 | params = get_new_layer('ff')[0](params, prefix='ff_logit_rnn' , nin=self.rnn_dim, nout=self.out_emb_dim, ortho=False) 78 | # output to input: out_emb_dim -> out_emb_dim 79 | params = get_new_layer('ff')[0](params, prefix='ff_logit_prev' , nin=self.out_emb_dim, nout=self.out_emb_dim, ortho=False) 80 | # prepare softmax: out_emb_dim -> n_words 81 | params = get_new_layer('ff')[0](params, prefix='ff_logit' , nin=self.out_emb_dim, nout=self.n_words) 82 | 83 | self.initial_params = params 84 | 85 | def build(self): 86 | # description string: #words x #samples 87 | x = tensor.matrix('x', dtype=INT) 88 | x_mask = tensor.matrix('x_mask', dtype=FLOAT) 89 | 90 | # Store tensors 91 | self.inputs = OrderedDict() 92 | self.inputs['x'] = x # Source words 93 | self.inputs['x_mask'] = x_mask # Source mask 94 | 95 | n_timesteps = x.shape[0] 96 | n_samples = x.shape[1] 97 | 98 | # input word embedding 99 | emb = self.tparams['W_in_emb'][x.flatten()] 100 | emb = emb.reshape([n_timesteps, n_samples, self.in_emb_dim]) 101 | #proj = get_new_layer(self.enc_type)[1](self.tparams, emb, prefix='encoder', mask=x_mask) 102 | # prepare outputs 103 | emb_shifted = tensor.zeros_like(emb) 104 | emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1]) 105 | emb = emb_shifted 106 | 107 | # pass through gru layer, recurrence here 108 | proj = get_new_layer(self.rnn_type)[1](self.tparams, emb, 109 | prefix='recurrent', mask=x_mask) 110 | 111 | proj_h = proj[0] 112 | 113 | # compute word probabilities 114 | # internal state of RNN 115 | logit_rnn = get_new_layer('ff')[1](self.tparams, proj_h, prefix='ff_logit_rnn', activ='linear') 116 | # previous output word embedding 117 | logit_prev = get_new_layer('ff')[1](self.tparams, emb, prefix='ff_logit_prev', activ='linear') 118 | logit = tanh(logit_rnn + logit_prev) 119 | 120 | 121 | logit = get_new_layer('ff')[1](self.tparams, logit, prefix='ff_logit', activ='linear') 122 | logit_shp = logit.shape 123 | 124 | # Apply logsoftmax (stable version) 125 | log_probs = -tensor.nnet.logsoftmax(logit.reshape([logit_shp[0]*logit_shp[1], logit_shp[2]])) 126 | 127 | # cost 128 | x_flat = x.flatten() 129 | x_flat_idx = tensor.arange(x_flat.shape[0]) * self.n_words + x_flat 130 | 131 | cost = log_probs.flatten()[x_flat_idx] 132 | cost = cost.reshape([x.shape[0], x.shape[1]]) 133 | cost = (cost * x_mask) 134 | 135 | #f_log_probs_detailled return the log probs array correponding to each word log probs 136 | self.f_log_probs_detailled = theano.function(list(self.inputs.values()), cost) 137 | cost = (cost * x_mask).sum(0) 138 | 139 | #f_log_probs return the sum of the sentence log probs 140 | self.f_log_probs = theano.function(list(self.inputs.values()), 
cost) 141 | 142 | return cost.mean() 143 | 144 | def val_loss(self, sentence=None): 145 | probs = [] 146 | if sentence is None: 147 | # Validation during training 148 | for data in self.valid_iterator: 149 | norm = data['y_mask'].sum(0) 150 | log_probs = sum(self.f_log_probs(*list(data.values()))) / norm 151 | probs.extend(log_probs) 152 | return np.array(probs).mean() 153 | else: 154 | # LM scoring, one sentence at a time 155 | norm = sentence['y_mask'].sum(0) 156 | log_probs = self.f_log_probs_detailled(*list(sentence.values())) 157 | probs.extend(log_probs) 158 | return np.array(probs), norm 159 | 160 | def build_sampler(self, **kwargs): 161 | # x: 1 x 1 162 | y = tensor.vector('y_sampler', dtype=INT) 163 | init_state = tensor.matrix('init_state', dtype=FLOAT) 164 | 165 | # if it's the first word, emb should be all zero 166 | emb = tensor.switch(y[:, None] < 0, 167 | tensor.alloc(0., 1, self.tparams['W_in_emb'].shape[1]), 168 | self.tparams['W_in_emb'][y]) 169 | 170 | # apply one step of gru layer 171 | proj = get_new_layer(self.rnn_type)[1](self.tparams, emb, 172 | prefix='recurrent', 173 | mask=None) 174 | next_state = proj[0] 175 | 176 | # compute the output probability dist and sample 177 | logit_rnn = get_new_layer('ff')[1](self.tparams, next_state, prefix='ff_logit_rnn', activ='linear') 178 | logit_prev = get_new_layer('ff')[1](self.tparams, emb, prefix='ff_logit_prev', activ='linear') 179 | logit = tensor.tanh(logit_rnn+logit_prev) 180 | logit = get_new_layer('ff')[1](self.tparams, logit, prefix='ff_logit', activ='linear') 181 | logit_shp = logit.shape 182 | 183 | # Apply logsoftmax (stable version) 184 | next_log_probs = -tensor.nnet.logsoftmax(logit.reshape([logit_shp[0]*logit_shp[1], logit_shp[2]])) 185 | 186 | # Sample from the softmax distribution 187 | next_probs = tensor.exp(next_log_probs) 188 | next_word = self._trng.multinomial(pvals=next_probs).argmax(1) 189 | 190 | # next word probability 191 | inps = [y, init_state] 192 | outs = [next_log_probs, next_word, next_state] 193 | self.f_next = theano.function(inps, outs, name='f_next') 194 | 195 | def gen_sample(tparams, f_next, options, trng=None, maxlen=30, argmax=False): 196 | sample = [] 197 | sample_score = 0 198 | 199 | # initial token is indicated by a -1 and initial state is zero 200 | next_w = -1 * np.ones((1,)).astype(INT) 201 | next_state = np.zeros((1, options['dim'])).astype(FLOAT) 202 | 203 | for ii in range(maxlen): 204 | inps = [next_w, next_state] 205 | ret = f_next(*inps) 206 | next_p, next_w, next_state = ret[0], ret[1], ret[2] 207 | 208 | if argmax: 209 | nw = next_p[0].argmax() 210 | else: 211 | nw = next_w[0] 212 | sample.append(nw) 213 | sample_score += next_p[0, nw] 214 | if nw == 0: 215 | break 216 | 217 | # Return the sampled word indices and their accumulated negative log-probability 218 | return sample, sample_score 219 | -------------------------------------------------------------------------------- /nmtpy/nmtutils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import pickle 4 | 5 | from collections import OrderedDict 6 | from .defaults import INT, FLOAT 7 | 8 | def invert_dictionary(d): 9 | return OrderedDict([(v,k) for k,v in d.items()]) 10 | 11 | def load_dictionary(fname): 12 | with open(fname, 'rb') as f: 13 | vocab = pickle.load(f) 14 | 15 | return vocab, invert_dictionary(vocab) 16 | 17 | # Function to convert idxs to sentence 18 | def idx_to_sent(ivocab, idxs, join=True): 19 | sent = [] 20 | for widx in idxs: 21 | if widx == 
0: 22 | break 23 | sent.append(ivocab.get(widx, "")) 24 | if join: 25 | return " ".join(sent) 26 | else: 27 | return sent 28 | 29 | # Function to convert sentence to idxs 30 | def sent_to_idx(vocab, tokens, limit=0): 31 | idxs = [] 32 | for word in tokens: 33 | # Get token, 1 if not available 34 | idx = vocab.get(word, 1) 35 | if limit > 0: 36 | idx = idx if idx < limit else 1 37 | idxs.append(idx) 38 | return idxs 39 | 40 | # push parameters to Theano shared variables 41 | def zipp(params, tparams): 42 | for kk, vv in params.items(): 43 | tparams[kk].set_value(vv) 44 | 45 | # pull parameters from Theano shared variables 46 | def unzip(zipped): 47 | new_params = OrderedDict() 48 | for kk, vv in zipped.items(): 49 | new_params[kk] = vv.get_value() 50 | return new_params 51 | 52 | # make prefix-appended name 53 | def pp(prefix, name): 54 | return '%s_%s' % (prefix, name) 55 | 56 | # orthogonal initialization for weights 57 | # Saxe, Andrew M., James L. McClelland, and Surya Ganguli. 58 | # "Exact solutions to the nonlinear dynamics of learning in deep 59 | # linear neural networks." arXiv preprint arXiv:1312.6120 (2013). 60 | def ortho_weight(ndim): 61 | W = np.random.randn(ndim, ndim) 62 | u, s, v = np.linalg.svd(W) 63 | return u.astype(FLOAT) 64 | 65 | # weight initializer, normal by default 66 | def norm_weight(nin, nout, scale=0.01, ortho=True): 67 | if scale == "xavier": 68 | # Glorot, Xavier, and Yoshua Bengio. "Understanding the difficulty of training deep feedforward neural networks." 69 | # International conference on artificial intelligence and statistics. 2010. 70 | # http://machinelearning.wustl.edu/mlpapers/paper_files/AISTATS2010_GlorotB10.pdf 71 | scale = 1. / np.sqrt(nin) 72 | elif scale == "he": 73 | # Claimed necessary for ReLU 74 | # Kaiming He et al. (2015) 75 | # Delving deep into rectifiers: Surpassing human-level performance on 76 | # imagenet classification. arXiv preprint arXiv:1502.01852. 77 | scale = 1. / np.sqrt(nin/2.) 78 | 79 | if nout == nin and ortho: 80 | W = ortho_weight(nin) 81 | else: 82 | W = scale * np.random.randn(nin, nout) 83 | return W.astype(FLOAT) 84 | -------------------------------------------------------------------------------- /nmtpy/optimizers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from collections import OrderedDict 3 | from abc import ABCMeta, abstractmethod 4 | 5 | import numpy as np 6 | 7 | import theano 8 | import theano.tensor as tensor 9 | 10 | from .defaults import FLOAT 11 | from .nmtutils import unzip 12 | 13 | 14 | def get_optimizer(name): 15 | optimizers = { 16 | 'sgd' : SGD, 17 | 'adam' : Adam, 18 | 'rmsprop' : RMSProp, 19 | 'adadelta' : Adadelta, 20 | } 21 | return optimizers[name] 22 | 23 | class Optimizer(object, metaclass=ABCMeta): 24 | def __init__(self, lr0): 25 | # Learning rate shared variable 26 | self.lr = theano.shared(np.float64(lr0).astype(FLOAT), name='lrate') 27 | 28 | # Theano shared variables for accumulator tensors 29 | self.history = OrderedDict() 30 | 31 | # Store grad variables given with get_updates() 32 | self.grads = None 33 | 34 | # Gradient noise per update 35 | self.grad_noise_factor = 0. 
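        # (Descriptive note: this factor stays at 0 unless set_gradient_noise() below is
        #  called; when it is > 0, Adam.get_updates() adds annealed Gaussian noise to the
        #  gradients before computing its moment updates.)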
36 | 37 | def set_trng(self, trng): 38 | """Save Theano RNG.""" 39 | self._trng = trng 40 | 41 | def set_gradient_noise(self, factor): 42 | """Set gradient noise factor.""" 43 | self.grad_noise_factor = factor 44 | 45 | def init_value(self, shape, name, history=None): 46 | """Initialize a variable with zero or last value.""" 47 | value = history[name] if history else np.zeros(shape, dtype=FLOAT) 48 | 49 | # Create the shared variable and store it 50 | self.history[name] = theano.shared(value, name) 51 | return self.history[name] 52 | 53 | def get_history(self): 54 | """Returns a dictionary of numpy tensors for history variables.""" 55 | return unzip(self.history) 56 | 57 | def set_lrate(self, lrate): 58 | """Update the internal lrate.""" 59 | self.lr.set_value(lrate) 60 | 61 | @abstractmethod 62 | def get_updates(self, tparams, grads, history=None): 63 | """Return updates list for params.""" 64 | pass 65 | 66 | 67 | ############################# 68 | # Stochastic Gradient Descent 69 | ############################# 70 | class SGD(Optimizer): 71 | def __init__(self, lr0=0.01): 72 | super(SGD, self).__init__(lr0) 73 | 74 | def get_updates(self, tparams, grads, history=None): 75 | self.grads = grads 76 | updates = [] 77 | for tparam, grad in zip(tparams.values(), grads): 78 | updates.append((tparam, tparam - self.lr * grad)) 79 | 80 | return updates 81 | 82 | ######### 83 | # RMSProp 84 | ######### 85 | class RMSProp(Optimizer): 86 | def __init__(self, lr0=0.001, rho=0.95, eps=1e-6): 87 | super(RMSProp, self).__init__(lr0) 88 | self.rho = rho 89 | self.eps = eps 90 | 91 | def get_updates(self, tparams, grads, history=None): 92 | self.grads = grads 93 | updates = [] 94 | for tparam, grad in zip(tparams.values(), grads): 95 | # Accumulate gradient squares 96 | v = self.init_value(tparam.get_value().shape, '%s_v' % tparam.name, history) 97 | 98 | # rho * past + (1 - rho) * current 99 | v_new = (self.rho * v) + (1. - self.rho) * grad**2 100 | 101 | updates.append((v, v_new)) 102 | updates.append((tparam, tparam - (self.lr * grad / tensor.sqrt(v_new + self.eps)))) 103 | 104 | return updates 105 | 106 | ########## 107 | # Adadelta 108 | ########## 109 | class Adadelta(Optimizer): 110 | def __init__(self, lr0=1., rho=0.95, eps=1e-6): 111 | super(Adadelta, self).__init__(lr0) 112 | self.rho = rho 113 | self.eps = eps 114 | 115 | def get_updates(self, tparams, grads, history=None): 116 | self.grads = grads 117 | updates = [] 118 | for tparam, grad in zip(tparams.values(), grads): 119 | v = self.init_value(tparam.get_value().shape, '%s_v' % tparam.name, history) 120 | u = self.init_value(tparam.get_value().shape, '%s_u' % tparam.name, history) 121 | 122 | # Accumulate gradient squares 123 | # rho * past + (1 - rho) * current 124 | v_new = (self.rho * v) + (1. - self.rho) * grad**2 125 | updates.append((v, v_new)) 126 | 127 | # Update rule 128 | up = (grad * tensor.sqrt(u + self.eps) / tensor.sqrt(v_new + self.eps)) 129 | updates.append((tparam, tparam - self.lr * up)) 130 | 131 | # Accumulate update magnitudes 132 | updates.append((u, self.rho * u + (1. 
- self.rho) * up**2)) 133 | 134 | return updates 135 | 136 | ###### 137 | # Adam 138 | ###### 139 | class Adam(Optimizer): 140 | def __init__(self, *args, lr0=0.0001, b1=0.9, b2=0.999, eps=1e-8): 141 | super().__init__(lr0) 142 | self.b1 = b1 143 | self.b2 = b2 144 | self.eps = eps 145 | 146 | def get_updates(self, tparams, grads, history=None): 147 | self.grads = grads 148 | updates = [] 149 | 150 | # Iteration counter, 'None' for shape creates a scalar 151 | i = self.init_value(None, 'i', history) 152 | 153 | i_t = i + 1. 154 | 155 | # Running learning-rate that will eventually -> lr0 156 | lr_t = self.lr * (tensor.sqrt(1. - self.b2**i_t) / (1. - self.b1**i_t)) 157 | 158 | # Increment iteration counter 159 | updates.append((i, i_t)) 160 | 161 | for tparam, grad in zip(tparams.values(), grads): 162 | m = self.init_value(tparam.get_value().shape, '%s_m' % tparam.name, history) 163 | v = self.init_value(tparam.get_value().shape, '%s_v' % tparam.name, history) 164 | 165 | if self.grad_noise_factor > 0: 166 | # Annealed Gaussian gradient noise ~ N(0, var) with var = factor / (1+t)**0.55 167 | var = self.grad_noise_factor / (i_t**0.55) 168 | noise = self._trng.normal(grad.shape, std=tensor.sqrt(var), dtype=FLOAT) 169 | grad += noise 170 | 171 | m_t = (self.b1 * m) + ((1. - self.b1) * grad) 172 | v_t = (self.b2 * v) + ((1. - self.b2) * grad**2) 173 | p_t = tparam - (lr_t * (m_t / (tensor.sqrt(v_t) + self.eps))) 174 | 175 | # Add updates 176 | updates.append((m, m_t)) 177 | updates.append((v, v_t)) 178 | updates.append((tparam, p_t)) 179 | 180 | return updates 181 | -------------------------------------------------------------------------------- /nmtpy/textutils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Text processing related functions""" 3 | 4 | def reduce_to_best(hyps, scores, n_unique_samples, avoid_unk=True): 5 | """Pick the best of each hypotheses group based on their scores.""" 6 | 7 | # Penalize hyps having <unk> inside (pairs is always defined so avoid_unk=False also works) 8 | pairs = [(p[0], p[1] + (100 if avoid_unk and "<unk>" in p[0][0] else 0)) 9 | for p in zip(hyps, scores)] 10 | 11 | # Group each sample's hypotheses 12 | groups = [pairs[i::n_unique_samples] for i in range(n_unique_samples)] 13 | 14 | # Now each element of "groups" contains e.g. 5 hypotheses and their scores 15 | # Sort them and get the first (smallest score) 16 | return [sorted(g, key=lambda x: x[1])[0][0] for g in groups] 17 | -------------------------------------------------------------------------------- /patches/00-theano-advancedinctensor.patch: -------------------------------------------------------------------------------- 1 | --- theano/sandbox/cuda/opt.py 2017-03-21 18:45:53.532335945 +0100 2 | +++ theano/sandbox/cuda/opt.py 2017-03-21 18:45:53.532335945 +0100 3 | @@ -1111,7 +1111,7 @@ 4 | 5 | gpu_op = GpuAdvancedIncSubtensor1(**props_dict) 6 | else: 7 | - gpu_op = GpuAdvancedIncSubtensor1_dev20(**props_dict) 8 | + gpu_op = GpuAdvancedIncSubtensor1(**props_dict) 9 | return [gpu_op(as_cuda_ndarray_variable(x), 10 | as_cuda_ndarray_variable(y), *coords)] 11 | 12 | @@ -1149,7 +1149,7 @@ 13 | if (compute_capability < 2 or y.ndim != 2 or x.ndim != 2): 14 | gpu_op = GpuAdvancedIncSubtensor1(**node.op._props_dict()) 15 | else: 16 | - gpu_op = GpuAdvancedIncSubtensor1_dev20(**node.op._props_dict()) 17 | + gpu_op = GpuAdvancedIncSubtensor1(**node.op._props_dict()) 18 | return [host_from_gpu(gpu_op(gpu_x, gpu_y, *coords))] 19 | return False 20 | 21 | --- theano/gpuarray/opt.py 2017-03-21 18:42:35.589317691 +0100 22 
| +++ theano/gpuarray/opt.py 2017-03-21 18:42:35.589317691 +0100 23 | @@ -68,8 +68,7 @@ 24 | from .subtensor import (GpuIncSubtensor, GpuSubtensor, 25 | GpuAdvancedSubtensor, 26 | GpuAdvancedSubtensor1, 27 | - GpuAdvancedIncSubtensor1, 28 | - GpuAdvancedIncSubtensor1_dev20) 29 | + GpuAdvancedIncSubtensor1) 30 | from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims 31 | from .reduction import GpuMaxAndArgmax 32 | from .linalg import (GpuCusolverSolve, cusolver_available) 33 | @@ -1056,7 +1055,7 @@ 34 | if compute_capability >= 2 and x.ndim == 1 and y.ndim == 0: 35 | x = x.dimshuffle(0, 'x') 36 | y = y.dimshuffle('x', 'x') 37 | - ret = GpuAdvancedIncSubtensor1_dev20( 38 | + ret = GpuAdvancedIncSubtensor1( 39 | set_instead_of_inc=set_instead_of_inc)(x, y, ilist) 40 | ret = GpuDimShuffle(ret.type.broadcastable, [0])(ret) 41 | return ret 42 | @@ -1064,15 +1063,14 @@ 43 | return GpuAdvancedIncSubtensor1( 44 | set_instead_of_inc=set_instead_of_inc) 45 | else: 46 | - return GpuAdvancedIncSubtensor1_dev20( 47 | + return GpuAdvancedIncSubtensor1( 48 | set_instead_of_inc=set_instead_of_inc) 49 | 50 | 51 | @register_inplace() 52 | -@local_optimizer([GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1_dev20]) 53 | +@local_optimizer([GpuAdvancedIncSubtensor1]) 54 | def local_advincsub1_gpua_inplace(node): 55 | - if isinstance(node.op, (GpuAdvancedIncSubtensor1, 56 | - GpuAdvancedIncSubtensor1_dev20)): 57 | + if isinstance(node.op, (GpuAdvancedIncSubtensor1,)): 58 | if not node.op.inplace: 59 | return [node.op.clone_inplace()(*node.inputs)] 60 | 61 | -------------------------------------------------------------------------------- /scripts/README.md: -------------------------------------------------------------------------------- 1 | Scripts 2 | ------- 3 | 4 | - `get-meteor-data.sh`: Used to download METEOR paraphrases prior to `nmtpy` installation. 5 | -------------------------------------------------------------------------------- /scripts/get-meteor-data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PREFIX="https://raw.githubusercontent.com/cmu-mtlab/meteor/master/data/paraphrase" 4 | SAVEDIR="nmtpy/external/data" 5 | 6 | for lang in cz de en es fr ru; do 7 | if [ ! -f "${SAVEDIR}/paraphrase-${lang}.gz" ]; then 8 | echo "Downloading $lang paraphrase data..." 9 | curl "${PREFIX}-${lang}.gz" -o "${SAVEDIR}/paraphrase-${lang}.gz" 10 | fi 11 | done 12 | -------------------------------------------------------------------------------- /scripts/modify-npz: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import sys 6 | import numpy as np 7 | 8 | from ast import literal_eval 9 | from collections import OrderedDict 10 | 11 | from nmtpy.sysutils import * 12 | 13 | os.environ['THEANO_FLAGS'] = 'device=cpu' 14 | 15 | def parse_value(value): 16 | try: 17 | return literal_eval(value) 18 | except ValueError as ve: 19 | return value 20 | 21 | if __name__ == '__main__': 22 | # Change a property inside the 'opts' dictionary 23 | # of npz files and write it back. 24 | # Useful to port old npz files to new nmtpy versions. 
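    # Illustrative invocation (file name and value are made up):
    #     modify-npz model_type:attention old-checkpoint.npz
    # Arguments containing ':' are parsed as key:value modifications; all other
    # arguments are treated as .npz checkpoints to load, modify and rewrite.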
25 | 26 | modifs, files = [], [] 27 | 28 | for param in sys.argv[1:]: 29 | if ":" in param: 30 | modifs.append(param.split(':')) 31 | else: 32 | files.append(param) 33 | 34 | for npzf in files: 35 | # Load the file 36 | npz = np.load(npzf) 37 | 38 | # Get the dict 39 | opts = get_model_options(npz) 40 | params = get_param_dict(npz) 41 | 42 | newfilename = npzf 43 | 44 | for key, value in modifs: 45 | opts[key] = parse_value(value) 46 | print('%s -> %s' % (key, opts[key])) 47 | 48 | if key == 'model_type': 49 | # If model_type changed, change the filename as well 50 | oldmodel, rest = npzf.split('-', 1) 51 | newfilename = '%s-%s' % (value, rest) 52 | 53 | print('Writing %s' % newfilename) 54 | 55 | params['opts'] = opts 56 | 57 | np.savez(newfilename, **params) 58 | -------------------------------------------------------------------------------- /scripts/prep-charnmt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Example script to show how to prepare 4 | # a char2char dataset from raw text corpora. 5 | # You need to use filter:char2words in .conf 6 | # to correctly post-process hypotheses after beam-search 7 | 8 | # Pipeline: 9 | # lowercase.perl -> word2char (sed) -> trim whitespace (awk) 10 | 11 | datadir=../ 12 | SL=en 13 | TL=de 14 | 15 | for dataset in train val test2016 test2017; do 16 | for lang in $SL $TL; do 17 | inputfile=${datadir}/${dataset}.${lang} 18 | if [ -f $inputfile ]; then 19 | echo $dataset, $lang 20 | lowercase.perl -l $SL < $inputfile | sed -e "s/./& /g;s/\ \ \ / /g" \ 21 | | awk '{$1=$1};1' > ${dataset}.lc.char.${lang} 22 | fi 23 | done 24 | done 25 | 26 | nmt-build-dict train* 27 | -------------------------------------------------------------------------------- /scripts/snaprun: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Usage: 4 | # 1. Put this script under your $USER/bin or $USER/.local/bin or whatever 5 | # 2. Set NMTPY to your nmtpy copy where you do your development 6 | # 3. Start training by prefixing your command with snaprun: 7 | # $ snaprun nmt-train -c foobar.conf 8 | 9 | # Source tree to use as nmtpy unless NMTPY= is set from cmdline 10 | NMTPY=${NMTPY:=~/git/nmtpy-merge} 11 | 12 | # Enter to $NMTPY 13 | pushd $NMTPY 14 | 15 | # Take the last commit SHA1 16 | SHA=`git rev-parse --short HEAD` 17 | 18 | SUFFIX=`uuidgen -t` 19 | 20 | # Export to the following folder 21 | SNAPSHOT="/tmp/nmtpy-${USER}-${SHA}-${SUFFIX}" 22 | 23 | # Take a snapshot of the source tree by cleaning unnecessary stuff 24 | rsync --exclude=*egg-info --exclude=*pycache* --exclude "*.git*" -a . $SNAPSHOT 25 | 26 | # Show code folder 27 | echo "Took snapshot under $SNAPSHOT" 28 | 29 | # Set PATH to use the new /bin to find nmt-* 30 | # and override default python search path list by giving this 31 | # new path as the first item. 
32 | export PATH=${SNAPSHOT}/bin:${PATH} 33 | export PYTHONPATH=$SNAPSHOT 34 | 35 | popd 36 | # Execute the given command 37 | $@ 38 | 39 | # Remove the folder 40 | rm -rf $SNAPSHOT 41 | -------------------------------------------------------------------------------- /scripts/update-npz: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import sys 6 | import numpy as np 7 | 8 | from ast import literal_eval 9 | from collections import OrderedDict 10 | 11 | from nmtpy.sysutils import * 12 | 13 | os.environ['THEANO_FLAGS'] = 'device=cpu' 14 | 15 | def parse_value(value): 16 | try: 17 | return literal_eval(value) 18 | except ValueError as ve: 19 | return value 20 | 21 | if __name__ == '__main__': 22 | # Update nmtpy model checkpoints to recent format 23 | # to fix problems of inference. 24 | 25 | for fname in sys.argv[1:]: 26 | do_write = False 27 | 28 | # Open the file 29 | npzf = np.load(fname) 30 | 31 | # Get option dictionary 32 | opts = npzf['opts'].tolist() 33 | tparams = OrderedDict() 34 | 35 | if 'tparams' in npzf.files: 36 | # Old format of saving parameters 37 | do_write = True 38 | tparams = npzf['tparams'].tolist() 39 | else: 40 | for key in npzf.files: 41 | if key != 'opts': 42 | tparams[key] = npzf[key] 43 | 44 | tparams['opts'] = opts 45 | 46 | # Close the file 47 | npzf.close() 48 | 49 | new_fname = fname 50 | ############### 51 | 52 | # attention_singledict is now included in main model 53 | if opts['model_type'] == 'attention_singledict': 54 | do_write = True 55 | opts['model_type'] = 'attention' 56 | opts['tied_emb'] = '3way' 57 | del opts['tied_trg_emb'] 58 | 59 | new_fname = fname.replace('attention_singledict', 'attention') 60 | else: 61 | if 'tied_trg_emb' in opts: 62 | opts['tied_emb'] = '2way' 63 | del opts['tied_trg_emb'] 64 | do_write = True 65 | 66 | if do_write: 67 | print('Writing %s' % new_fname) 68 | np.savez(new_fname, **tparams) 69 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import sys 4 | 5 | from setuptools import setup 6 | import nmtpy 7 | 8 | # Install pycocoevalcap metric scorers as well 9 | pycocometrics = ['bleu', 'meteor', 'cider', 'rouge'] 10 | pycocopackages = ['nmtpy.cocoeval.%s' % m for m in pycocometrics] 11 | 12 | if 'install' in sys.argv or 'develop' in sys.argv: 13 | if not os.path.exists('nmtpy/external/data/paraphrase-en.gz'): 14 | print('You need to run scripts/get-meteor-data.sh first.') 15 | sys.exit(1) 16 | 17 | setup( 18 | name='nmtpy', 19 | version=nmtpy.__version__, 20 | description='Neural Machine Translation Framework in Python', 21 | url='https://github.com/lium-lst/nmtpy', 22 | author='Ozan Çağlayan', 23 | author_email='ozancag@gmail.com', 24 | license='MIT', 25 | classifiers=[ 26 | 'Development Status :: 5 - Production/Stable', 27 | 'Intended Audience :: Science/Research', 28 | 'Topic :: Scientific/Engineering', 29 | 'License :: OSI Approved :: MIT License', 30 | 'Programming Language :: Python :: 3 :: Only', 31 | 'Programming Language :: Python :: 3.5', 32 | 'Operating System :: POSIX', 33 | ], 34 | keywords='nmt neural-mt translation deep-learning', 35 | packages=['nmtpy', 'nmtpy.models', 'nmtpy.iterators', 'nmtpy.metrics', 'nmtpy.cocoeval'] + pycocopackages, 36 | package_data={'' : ['external/meteor-1.5.jar', 'external/data/*gz', 
'external/multi-bleu.perl']}, # data files 37 | install_requires=[ 38 | 'numpy', 39 | 'theano', 40 | ], 41 | scripts=[ 42 | 'bin/nmt-train', 43 | 'bin/nmt-extract', 44 | 'bin/nmt-rescore', 45 | 'bin/nmt-translate', 46 | 'bin/nmt-translate-factors', # Factored NMT variant. 47 | 'bin/nmt-build-dict', 48 | 'bin/nmt-coco-metrics', 49 | 'bin/nmt-bpe-apply', 50 | 'bin/nmt-bpe-learn', 51 | ], 52 | zip_safe=False) 53 | --------------------------------------------------------------------------------