├── .gitignore ├── LICENSE.md ├── README.md ├── bin ├── nmt-bpe-apply ├── nmt-bpe-learn ├── nmt-build-dict ├── nmt-coco-metrics ├── nmt-extract ├── nmt-rescore ├── nmt-test-lm ├── nmt-train ├── nmt-translate ├── nmt-translate-client ├── nmt-translate-factors └── nmt-translate-server ├── docs ├── _config.yml ├── index.md ├── logo.pdf ├── logo.png └── pages │ └── config.md ├── examples ├── README.md ├── ted-factors │ ├── README.md │ ├── attention_factors-ted-en-fr.conf │ └── data │ │ └── README.md └── wmt16-mmt-task2 │ ├── README.md │ ├── data │ ├── README.md │ ├── fix-corpus-bugs.patch │ ├── split_all.txt │ ├── split_test.txt │ ├── split_train.txt │ ├── split_val.txt │ ├── test.1.de │ ├── test.1.en │ ├── test.2.de │ ├── test.2.en │ ├── test.3.de │ ├── test.3.en │ ├── test.4.de │ ├── test.4.en │ ├── test.5.de │ ├── test.5.en │ ├── train.1.de │ ├── train.1.en │ ├── train.2.de │ ├── train.2.en │ ├── train.3.de │ ├── train.3.en │ ├── train.4.de │ ├── train.4.en │ ├── train.5.de │ ├── train.5.en │ ├── val.1.de │ ├── val.1.en │ ├── val.2.de │ ├── val.2.en │ ├── val.3.de │ ├── val.3.en │ ├── val.4.de │ ├── val.4.en │ ├── val.5.de │ └── val.5.en │ ├── scripts │ ├── 01-tokenize.sh │ └── 02-prepare.py │ ├── wmt16-mmt-task2-monomodal.conf │ └── wmt16-mmt-task2-multimodal.conf ├── nmtpy ├── __init__.py ├── cleanup.py ├── cocoeval │ ├── README.md │ ├── __init__.py │ ├── bleu │ │ ├── LICENSE.bleu │ │ ├── __init__.py │ │ ├── bleu.py │ │ └── bleu_scorer.py │ ├── cider │ │ ├── __init__.py │ │ ├── cider.py │ │ └── cider_scorer.py │ ├── meteor │ │ ├── __init__.py │ │ └── meteor.py │ └── rouge │ │ ├── __init__.py │ │ └── rouge.py ├── config.py ├── defaults.py ├── external │ ├── data │ │ └── README.md │ ├── meteor-1.5.jar │ └── multi-bleu.perl ├── filters.py ├── iterators │ ├── __init__.py │ ├── bitext.py │ ├── factors.py │ ├── fusion.py │ ├── homogeneous.py │ ├── iterator.py │ ├── mnmt.py │ ├── text.py │ └── wmt.py ├── layers.py ├── logger.py ├── mainloop.py ├── metrics │ ├── __init__.py │ ├── bleu.py │ ├── external.py │ ├── factors2wordbleu.py │ ├── meteor.py │ ├── metric.py │ └── mtevalbleu.py ├── models │ ├── README.md │ ├── __init__.py │ ├── attention.py │ ├── attention_factors.py │ ├── attention_wmt.py │ ├── basefnmt.py │ ├── basefusion.py │ ├── basemodel.py │ ├── dcu_multimodal.py │ ├── fusion_concat_dep_dep.py │ ├── fusion_concat_dep_ind.py │ ├── fusion_concat_ind_dep.py │ ├── fusion_concat_ind_ind.py │ ├── fusion_sum_dep_dep.py │ ├── fusion_sum_dep_ind.py │ ├── fusion_sum_ind_dep.py │ ├── fusion_sum_ind_ind.py │ ├── mnmt_ctxmul.py │ ├── mnmt_decinit.py │ ├── mnmt_decinitctxtrgmul.py │ ├── mnmt_encdecinit.py │ ├── mnmt_encdecinitctxtrgmul.py │ ├── mnmt_trgmul.py │ ├── mnmt_yemb_mulimg.py │ └── rnnlm.py ├── nmtutils.py ├── optimizers.py ├── sysutils.py └── textutils.py ├── patches └── 00-theano-advancedinctensor.patch ├── scripts ├── README.md ├── get-meteor-data.sh ├── modify-npz ├── prep-charnmt.sh ├── snaprun └── update-npz └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *egg-info 3 | nmtpy/external/data/*gz 4 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | ## MIT License 2 | 3 | Copyright (c) 2017 - University of Le Mans - Language and Speech Technology (LST) Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation 
files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -- 24 | 25 | **nmtpy** includes code from the following projects which have their own licenses: 26 | 27 | - [dl4mt-tutorial](https://github.com/nyu-dl/dl4mt-tutorial) [[BSD-3-Clause](https://github.com/nyu-dl/dl4mt-tutorial/blob/master/LICENSE)] 28 | - Ensembling and alignment collection from [nematus](https://github.com/rsennrich/nematus) [Same as dl4mt-tutorial] 29 | - Scripts from [subword-nmt](https://github.com/rsennrich/subword-nmt) [[MIT](https://github.com/rsennrich/subword-nmt/blob/master/LICENSE)] 30 | - `multi-bleu.perl` from [mosesdecoder](https://github.com/moses-smt/mosesdecoder) [[LGPL-2.1](https://github.com/moses-smt/mosesdecoder/blob/master/COPYING)] 31 | - METEOR v1.5 JAR from [meteor](https://github.com/cmu-mtlab/meteor) [[LGPL-2.1](https://github.com/cmu-mtlab/meteor/blob/master/COPYING)] 32 | - Sorted data iterator, coco eval script and LSTM from [arctic-captions](https://github.com/kelvinxu/arctic-captions) [Revised BSD-3-Clause] 33 | - `pycocoevalcap` from [coco-caption](https://github.com/tylin/coco-caption) [[BSD-2-Clause](https://github.com/tylin/coco-caption/blob/master/license.txt)] 34 | -------------------------------------------------------------------------------- /bin/nmt-bpe-apply: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | """Use operations learned with nmt-bpe-learn to encode a new text. 6 | The text will not be smaller, but use only a fixed vocabulary, with rare words 7 | encoded as variable-length sequences of subword units. 8 | 9 | Reference: 10 | Rico Sennrich, Barry Haddow and Alexandra Birch (2015). Neural Machine Translation of Rare Words with Subword Units. 11 | Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany. 
12 | """ 13 | 14 | import re 15 | import sys 16 | import codecs 17 | import argparse 18 | 19 | class BPE(object): 20 | 21 | def __init__(self, codes, separator='@@', skiptags=False): 22 | 23 | with codecs.open(codes.name, encoding='utf-8') as codes: 24 | self.bpe_codes = [tuple(item.split()) for item in codes] 25 | 26 | # some hacking to deal with duplicates (only consider first instance) 27 | self.bpe_codes = dict([(code,i) for (i,code) in reversed(list(enumerate(self.bpe_codes)))]) 28 | 29 | self.separator = separator 30 | self.skiptags = skiptags 31 | 32 | def segment(self, sentence): 33 | """segment single sentence (whitespace-tokenized string) with BPE encoding""" 34 | 35 | output = [] 36 | for word in sentence.split(): 37 | if self.skiptags and re.match('<.*?:.*>', word): 38 | output.append(word) 39 | else: 40 | new_word = encode(word, self.bpe_codes) 41 | 42 | for item in new_word[:-1]: 43 | output.append(item + self.separator) 44 | output.append(new_word[-1]) 45 | 46 | return ' '.join(output) 47 | 48 | def get_pairs(word): 49 | """Return set of symbol pairs in a word. 50 | 51 | word is represented as tuple of symbols (symbols being variable-length strings) 52 | """ 53 | pairs = set() 54 | prev_char = word[0] 55 | for char in word[1:]: 56 | pairs.add((prev_char, char)) 57 | prev_char = char 58 | return pairs 59 | 60 | def encode(orig, bpe_codes, cache={}): 61 | """Encode word based on list of BPE merge operations, which are applied consecutively 62 | """ 63 | 64 | if orig in cache: 65 | return cache[orig] 66 | 67 | word = tuple(orig) + ('',) 68 | pairs = get_pairs(word) 69 | 70 | while True: 71 | bigram = min(pairs, key = lambda pair: bpe_codes.get(pair, float('inf'))) 72 | if bigram not in bpe_codes: 73 | break 74 | first, second = bigram 75 | new_word = [] 76 | i = 0 77 | while i < len(word): 78 | try: 79 | j = word.index(first, i) 80 | new_word.extend(word[i:j]) 81 | i = j 82 | except: 83 | new_word.extend(word[i:]) 84 | break 85 | 86 | if word[i] == first and i < len(word)-1 and word[i+1] == second: 87 | new_word.append(first+second) 88 | i += 2 89 | else: 90 | new_word.append(word[i]) 91 | i += 1 92 | new_word = tuple(new_word) 93 | word = new_word 94 | if len(word) == 1: 95 | break 96 | else: 97 | pairs = get_pairs(word) 98 | 99 | # don't print end-of-word symbols 100 | if word[-1] == '': 101 | word = word[:-1] 102 | elif word[-1].endswith(''): 103 | word = word[:-1] + (word[-1].replace('',''),) 104 | 105 | cache[orig] = word 106 | return word 107 | 108 | 109 | if __name__ == '__main__': 110 | parser = argparse.ArgumentParser( 111 | formatter_class=argparse.RawDescriptionHelpFormatter, 112 | description="learn BPE-based word segmentation") 113 | 114 | parser.add_argument( 115 | '--input', '-i', type=argparse.FileType('r'), default=sys.stdin, 116 | metavar='PATH', 117 | help="Input file (default: standard input).") 118 | parser.add_argument( 119 | '--codes', '-c', type=argparse.FileType('r'), metavar='PATH', 120 | required=True, 121 | help="File with BPE codes (created by nmt-bpe-learn).") 122 | parser.add_argument( 123 | '--output', '-o', type=argparse.FileType('w'), default=sys.stdout, 124 | metavar='PATH', 125 | help="Output file (default: standard output)") 126 | parser.add_argument( 127 | '--skiptags', '-k', action='store_true', default=False, 128 | help="Skip morphological tags (default: False)") 129 | parser.add_argument( 130 | '--separator', '-s', type=str, default='@@', metavar='STR', 131 | help="Separator between non-final subword units (default: '%(default)s'))") 132 
| 133 | args = parser.parse_args() 134 | 135 | bpe = BPE(args.codes, args.separator, args.skiptags) 136 | 137 | for line in args.input: 138 | args.output.write(bpe.segment(line).strip()) 139 | args.output.write('\n') 140 | -------------------------------------------------------------------------------- /bin/nmt-bpe-learn: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Author: Rico Sennrich 4 | 5 | """Use byte pair encoding (BPE) to learn a variable-length encoding of the vocabulary in a text. 6 | Unlike the original BPE, it does not compress the plain text, but can be used to reduce the vocabulary 7 | of a text to a configurable number of symbols, with only a small increase in the number of tokens. 8 | 9 | Reference: 10 | Rico Sennrich, Barry Haddow and Alexandra Birch (2016). Neural Machine Translation of Rare Words with Subword Units. 11 | Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany. 12 | """ 13 | 14 | import sys 15 | import re 16 | import copy 17 | import argparse 18 | from collections import defaultdict, Counter 19 | 20 | def get_vocabulary(fobj): 21 | """Read text and return dictionary that encodes vocabulary 22 | """ 23 | vocab = Counter() 24 | sys.stderr.write('Reading file {0}\n'.format(fobj.name)) 25 | for line in fobj: 26 | for word in line.split(): 27 | vocab[word] += 1 28 | sys.stderr.write('Done.\n') 29 | return vocab 30 | 31 | def update_pair_statistics(pair, changed, stats, indices): 32 | """Minimally update the indices and frequency of symbol pairs 33 | 34 | if we merge a pair of symbols, only pairs that overlap with occurrences 35 | of this pair are affected, and need to be updated. 
36 | """ 37 | stats[pair] = 0 38 | indices[pair] = defaultdict(int) 39 | first, second = pair 40 | new_pair = first+second 41 | for j, word, old_word, freq in changed: 42 | 43 | # find all instances of pair, and update frequency/indices around it 44 | i = 0 45 | while True: 46 | try: 47 | i = old_word.index(first, i) 48 | except ValueError: 49 | break 50 | if i < len(old_word)-1 and old_word[i+1] == second: 51 | if i: 52 | prev = old_word[i-1:i+1] 53 | stats[prev] -= freq 54 | indices[prev][j] -= 1 55 | if i < len(old_word)-2: 56 | # don't double-count consecutive pairs 57 | if old_word[i+2] != first or i >= len(old_word)-3 or old_word[i+3] != second: 58 | nex = old_word[i+1:i+3] 59 | stats[nex] -= freq 60 | indices[nex][j] -= 1 61 | i += 2 62 | else: 63 | i += 1 64 | 65 | i = 0 66 | while True: 67 | try: 68 | i = word.index(new_pair, i) 69 | except ValueError: 70 | break 71 | if i: 72 | prev = word[i-1:i+1] 73 | stats[prev] += freq 74 | indices[prev][j] += 1 75 | # don't double-count consecutive pairs 76 | if i < len(word)-1 and word[i+1] != new_pair: 77 | nex = word[i:i+2] 78 | stats[nex] += freq 79 | indices[nex][j] += 1 80 | i += 1 81 | 82 | 83 | def get_pair_statistics(vocab): 84 | """Count frequency of all symbol pairs, and create index""" 85 | 86 | # data structure of pair frequencies 87 | stats = defaultdict(int) 88 | 89 | #index from pairs to words 90 | indices = defaultdict(lambda: defaultdict(int)) 91 | 92 | for i, (word, freq) in enumerate(vocab): 93 | prev_char = word[0] 94 | for char in word[1:]: 95 | stats[prev_char, char] += freq 96 | indices[prev_char, char][i] += 1 97 | prev_char = char 98 | 99 | return stats, indices 100 | 101 | 102 | def replace_pair(pair, vocab, indices): 103 | """Replace all occurrences of a symbol pair ('A', 'B') with a new symbol 'AB'""" 104 | first, second = pair 105 | pair_str = ''.join(pair) 106 | pair_str = pair_str.replace('\\','\\\\') 107 | changes = [] 108 | pattern = re.compile(r'(?',) ,y) for (x,y) in vocab.items()]) 161 | sorted_vocab = sorted(vocab.items(), key=lambda x: x[1], reverse=True) 162 | 163 | stats, indices = get_pair_statistics(sorted_vocab) 164 | big_stats = copy.deepcopy(stats) 165 | # threshold is inspired by Zipfian assumption, but should only affect speed 166 | threshold = max(stats.values()) / 10 167 | for i in range(args.symbols): 168 | if stats: 169 | most_frequent = max(stats, key=lambda x: (stats[x], x)) 170 | 171 | # we probably missed the best pair because of pruning; go back to full statistics 172 | if not stats or (i and stats[most_frequent] < threshold): 173 | prune_stats(stats, big_stats, threshold) 174 | stats = copy.deepcopy(big_stats) 175 | most_frequent = max(stats, key=lambda x: (stats[x], x)) 176 | # threshold is inspired by Zipfian assumption, but should only affect speed 177 | threshold = stats[most_frequent] * i/(i+10000.0) 178 | prune_stats(stats, big_stats, threshold) 179 | 180 | if stats[most_frequent] < 2: 181 | sys.stderr.write('no pair has frequency > 1. 
Stopping\n') 182 | break 183 | 184 | if args.verbose: 185 | sys.stderr.write('pair {0}: {1} {2} -> {1}{2} (frequency {3})\n'.format(i, most_frequent[0], most_frequent[1], stats[most_frequent])) 186 | args.output.write('{0} {1}\n'.format(*most_frequent)) 187 | changes = replace_pair(most_frequent, sorted_vocab, indices) 188 | update_pair_statistics(most_frequent, changes, stats, indices) 189 | stats[most_frequent] = 0 190 | if not i % 100: 191 | prune_stats(stats, big_stats, threshold) 192 | -------------------------------------------------------------------------------- /bin/nmt-build-dict: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import sys 6 | import argparse 7 | import pickle as pkl 8 | from collections import OrderedDict 9 | 10 | import numpy as np 11 | 12 | def freqs_to_dict(token_freqs, min_freq): 13 | # Get list of tokens 14 | tokens = list(token_freqs.keys()) 15 | 16 | # Collect their frequencies in a numpy array 17 | freqs = np.array(list(token_freqs.values())) 18 | 19 | tokendict = OrderedDict() 20 | tokendict[''] = 0 21 | tokendict[''] = 1 22 | 23 | # Sort in descending order of frequency 24 | sorted_idx = np.argsort(freqs) 25 | if min_freq > 0: 26 | sorted_tokens = [tokens[ii] for ii in sorted_idx[::-1] if freqs[ii] >= min_freq] 27 | else: 28 | sorted_tokens = [tokens[ii] for ii in sorted_idx[::-1]] 29 | 30 | # Start inserting from index 2 31 | for ii, ww in enumerate(sorted_tokens): 32 | tokendict[ww] = ii + 2 33 | 34 | return tokendict 35 | 36 | def get_freqs(fname, cumul_dict=None): 37 | # We'll first count frequencies 38 | if cumul_dict is not None: 39 | # Let's accumulate frequencies 40 | token_freqs = cumul_dict 41 | else: 42 | token_freqs = OrderedDict() 43 | 44 | print("Reading file %s" % filename) 45 | with open(filename) as f: 46 | idx = 0 47 | for line in f: 48 | line = line.strip() 49 | if line: 50 | # Collect frequencies 51 | for w in line.split(' '): 52 | if w not in token_freqs: 53 | token_freqs[w] = 0 54 | token_freqs[w] += 1 55 | 56 | if (idx+1) % 10000 == 0: 57 | print('\r%d sentences processed' % (idx + 1), end=' ') 58 | sys.stdout.flush() 59 | 60 | idx += 1 61 | 62 | print('\r%d sentences processed' % (idx)) 63 | # Remove already available and if any 64 | if '' in token_freqs: 65 | del token_freqs[''] 66 | if '' in token_freqs: 67 | del token_freqs[''] 68 | 69 | return token_freqs 70 | 71 | def write_dict(fname, vocab): 72 | print("Dumping vocabulary (%d tokens) to %s..." % (len(vocab), fname)) 73 | with open(fname, 'wb') as f: 74 | pkl.dump(vocab, f) 75 | 76 | if __name__ == '__main__': 77 | parser = argparse.ArgumentParser(prog='build_dictionary') 78 | parser.add_argument('-o', '--output-dir', type=str, default='.', help='Output directory') 79 | parser.add_argument('-s', '--single' , type=str, default=None,help='Name of the single vocabulary file. 
(default: Disabled)') 80 | parser.add_argument('-m', '--min-freq' , type=int, default=0, help='Filter out tokens occuring < m times.') 81 | parser.add_argument('files', type=str , nargs='+', help='Text files to create dictionaries.') 82 | args = parser.parse_args() 83 | 84 | # In case it is needed 85 | all_freqs = OrderedDict() 86 | 87 | for filename in args.files: 88 | filename = os.path.abspath(os.path.expanduser(filename)) 89 | 90 | if args.single: 91 | # Get cumulative frequencies 92 | all_freqs = get_freqs(filename, all_freqs) 93 | 94 | else: 95 | # Get frequencies 96 | freqs = get_freqs(filename) 97 | # Build dictionary from frequencies 98 | tokendict = freqs_to_dict(freqs, args.min_freq) 99 | 100 | vocab_fname = os.path.basename(filename) 101 | if args.min_freq > 0: 102 | vocab_fname += "-min%d" % args.min_freq 103 | vocab_fname = os.path.join(args.output_dir, vocab_fname) 104 | vocab_fname += ".vocab.pkl" 105 | 106 | write_dict(vocab_fname, tokendict) 107 | 108 | if args.single: 109 | tokendict = freqs_to_dict(all_freqs, args.min_freq) 110 | write_dict(args.single, tokendict) 111 | -------------------------------------------------------------------------------- /bin/nmt-coco-metrics: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Computes the BLEU, ROUGE, METEOR, and CIDER 6 | using the COCO metrics scripts 7 | """ 8 | import os 9 | import argparse 10 | from collections import OrderedDict 11 | 12 | # Script taken and adapted from Kelvin Xu's arctic-captions project 13 | # https://github.com/kelvinxu/arctic-captions 14 | 15 | from nmtpy.cocoeval.bleu.bleu import Bleu 16 | from nmtpy.cocoeval.rouge.rouge import Rouge 17 | from nmtpy.cocoeval.cider.cider import Cider 18 | from nmtpy.cocoeval.meteor.meteor import Meteor 19 | 20 | def print_table(results, sort_by='METEOR'): 21 | cols = ['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4', 22 | 'METEOR', 'METEOR (norm)', 'CIDEr', 'ROUGE_L'] 23 | for col in cols: 24 | print('|{:^15}|'.format(col), end='') 25 | print() 26 | 27 | results = sorted(results.items(), key=lambda x: x[1][sort_by]) 28 | 29 | for sysname, result in results: 30 | if len(results) > 1: 31 | print(sysname) 32 | for col in cols: 33 | print('|{:^15,.3f}|'.format(result[col]), end='') 34 | print() 35 | 36 | if __name__ == '__main__': 37 | parser = argparse.ArgumentParser(description="Compute BLEU, METEOR, ROUGE and CIDEr for single or multiple references.") 38 | 39 | parser.add_argument("-w", "--write", action='store_true', help='Create a per-hypothesisscore file containing the results.') 40 | parser.add_argument("-l", "--language", default='en', help='Hypothesis language (default: en)') 41 | parser.add_argument("-s", "--systems", type=str, help="Per-system hypothesis file(s)", nargs='+') 42 | parser.add_argument("-r", "--refs", type=argparse.FileType('r'), help="Path to all the reference files", nargs='+') 43 | 44 | args = parser.parse_args() 45 | 46 | # List of scorers 47 | scorers = [ 48 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), 49 | (Meteor(args.language), ["METEOR"]), 50 | (Meteor(args.language, norm=True), ["METEOR (norm)"]), 51 | (Cider(), ["CIDEr"]), 52 | (Rouge(), ["ROUGE_L"]), 53 | ] 54 | 55 | results = OrderedDict() 56 | 57 | # Read multiple reference files 58 | raw_refs = [list(map(str.strip, r)) for r in zip(*args.refs)] 59 | refs = {idx: rr for idx, rr in enumerate(raw_refs)} 60 | 61 | # Ranking of multiple systems is possible 62 | for hypfile in 
args.systems: 63 | with open(hypfile) as f: 64 | # List of hypothesis sentences for this system 65 | hypo = {idx: [line.strip()] for (idx, line) in enumerate(f)} 66 | 67 | result = OrderedDict() 68 | 69 | for scorer, method in scorers: 70 | score, _ = scorer.compute_score(refs, hypo) 71 | if score: 72 | if not isinstance(score, list): 73 | score = [score] 74 | for m, s in zip(method, score): 75 | result[m] = float('%.3f' % s) 76 | 77 | if args.write: 78 | with open("%s.score" % hypfile, 'w') as f: 79 | f.write("%s\n" % result) 80 | results[os.path.basename(hypfile)] = result 81 | 82 | print_table(results) 83 | -------------------------------------------------------------------------------- /bin/nmt-extract: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """Extract several features from a trained model.""" 4 | 5 | import sys 6 | import argparse 7 | 8 | import numpy as np 9 | 10 | from nmtpy.sysutils import get_param_dict 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser(prog='nmt-extract') 14 | parser.add_argument('-o', '--output', type=str, help="Output .npz file.") 15 | parser.add_argument('-n', '--init', action='store_true', help="Make output file compatible with nmt-train '--init' argument.") 16 | parser.add_argument('-m', '--model', type=str, help="Model's .npz file from which weights will be extracted.") 17 | parser.add_argument('-w', '--which', nargs='+', required=True, help='Space separated list of to-be-extracted weight keys.') 18 | 19 | args = parser.parse_args() 20 | 21 | try: 22 | params = get_param_dict(args.model) 23 | except KeyError as ke: 24 | print('%s does not contain model parameters. Did you train model do at least 1 validation?' % args.model) 25 | sys.exit(1) 26 | 27 | extracted_weights = {} 28 | for key in args.which: 29 | try: 30 | extracted_weights[key] = params[key] 31 | print("Extracted '%s' with shape=%s" % (key, params[key].shape)) 32 | except KeyError as ke: 33 | print("'%s' not found in model's .npz file, aborting." % key) 34 | sys.exit(1) 35 | 36 | if args.init: 37 | # You can use output file to init a new model with pre-trained weights 38 | # extracted here. 
39 | np.savez(args.output, tparams=extracted_weights, opts={}) 40 | else: 41 | np.savez(args.output, **extracted_weights) 42 | -------------------------------------------------------------------------------- /bin/nmt-rescore: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Rescores translations using an nmtpy NMT model.""" 5 | 6 | import os 7 | import sys 8 | import time 9 | import argparse 10 | import tempfile 11 | import importlib 12 | 13 | import numpy as np 14 | 15 | from nmtpy.logger import Logger 16 | from nmtpy.sysutils import * 17 | 18 | def is_nbest(trg_file): 19 | """Checks whether trg_file is in N-best format.""" 20 | with open(trg_file) as tf: 21 | return ' ||| ' in tf.readline().strip() 22 | 23 | def process_files(src_file, trg_file): 24 | with open(trg_file) as tf: 25 | # Read source sentences as they are 26 | src_sents = open(src_file).read().strip().split('\n') 27 | 28 | new_sf = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.rescore.src') 29 | new_tf = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.rescore.trg') 30 | 31 | for line in tf: 32 | idx, hyp, score = line.split(' ||| ') 33 | new_sf.write('%s\n' % src_sents[int(idx)]) 34 | new_tf.write('%s\n' % hyp) 35 | 36 | new_sf.close() 37 | new_tf.close() 38 | 39 | return (new_sf.name, new_tf.name) 40 | 41 | def write_rescore_file(trg_file, out_file, nmt_scores, nbest): 42 | """Append scores to trg_file's last column and save it as out_file.""" 43 | with open(out_file, 'w') as of: 44 | with open(trg_file) as tf: 45 | for idx, (scores, line) in enumerate(zip(nmt_scores, tf)): 46 | # generate score string 47 | score = ' '.join(["%.6f" % s for s in scores]) 48 | if nbest: 49 | of.write("%s %s\n" % (line.strip(), score)) 50 | else: 51 | of.write("%d ||| %s ||| %s\n" % (idx, line.strip(), score)) 52 | 53 | if __name__ == "__main__": 54 | parser = argparse.ArgumentParser(prog='nmt-rescore') 55 | parser.add_argument('-b', '--batch-size' ,default=128, type=int, help="Batch size to use during forward-pass.") 56 | parser.add_argument('-d', '--device' ,default='auto', type=str, help="Automatically selects GPU or CPU if no GPU available. (cpu or gpuX can also be given.)") 57 | parser.add_argument('-s', '--src-file' ,required=True, type=str, help="File containing 1 source sentence per line.") 58 | parser.add_argument('-t', '--trg-file' ,required=True, type=str, help="Translations file in plain text or n-best format.") 59 | parser.add_argument('-o', '--out-file' ,required=True, type=str, help="Output file for rescored translations.") 60 | parser.add_argument('-m', '--models' ,required=True, type=str, help="Model .npz file(s) to be used for (ensemble) rescoring.", 61 | nargs='+') 62 | 63 | # Setup the logger 64 | Logger.setup(timestamp=False) 65 | log = Logger.get() 66 | 67 | args = parser.parse_args() 68 | 69 | ##################################### 70 | # Set device for Theano if not forced 71 | ##################################### 72 | # NOTE: Very slow on CPU compared to GPU! 73 | if 'THEANO_FLAGS' not in os.environ: 74 | dev = get_device(args.device) 75 | log.info('Using device: %s' % dev) 76 | os.environ['THEANO_FLAGS'] = "device=%s" % dev 77 | 78 | if args.device == 'cpu': 79 | # This is to avoid thread explosion. Allow 80 | # each process to use a single thread. 
81 | os.environ["OMP_NUM_THREADS"] = "1" 82 | os.environ["MKL_NUM_THREADS"] = "1" 83 | 84 | # Print information 85 | log.info("Source file: %s" % args.src_file) 86 | log.info("Target file: %s" % args.trg_file) 87 | log.info("%d models given for rescoring" % len(args.models)) 88 | 89 | # Load model options from first model 90 | model_options = get_model_options(args.models[0]) 91 | 92 | # Import the module 93 | Model = importlib.import_module("nmtpy.models.%s" % model_options['model_type']).Model 94 | 95 | # Create the model, seed is not used. 96 | model = Model(seed=1, logger=None, **model_options) 97 | 98 | # Load the first model 99 | model.load(args.models[0]) 100 | 101 | # Disable dropout 102 | model.set_dropout(False) 103 | 104 | # Build graph 105 | log.info('Building computation graph...') 106 | model.build() 107 | 108 | # Set batch size 109 | model.batch_size = args.batch_size 110 | log.info('Batch size: %d' % model.batch_size) 111 | 112 | remove_temp_files = [] 113 | 114 | # Copy filenames 115 | src_file, trg_file = args.src_file, args.trg_file 116 | 117 | is_trg_nbest = is_nbest(args.trg_file) 118 | 119 | if is_trg_nbest: 120 | log.info('Target is n-best') 121 | # Process src and trg files accordingly 122 | src_file, trg_file = process_files(args.src_file, args.trg_file) 123 | remove_temp_files.extend([src_file, trg_file]) 124 | 125 | model.data['valid_src'] = src_file 126 | model.data['valid_trg'] = trg_file 127 | 128 | log.info('Loading data') 129 | model.load_valid_data() 130 | iterator = model.valid_iterator 131 | 132 | # Score array per each model 133 | scores = [[] for i in range(len(args.models))] 134 | 135 | start = time.time() 136 | for idx, modelfile in enumerate(args.models): 137 | log.info('Rescoring with %s' % os.path.basename(modelfile)) 138 | 139 | # Load model weights for anything except first one (hacky) 140 | if idx > 0: 141 | model.update_shared_variables(get_param_dict(modelfile)) 142 | 143 | for i, data in enumerate(iterator): 144 | norm = data['y_mask'].sum(0) 145 | scores[idx].extend(model.f_log_probs(*list(data.values())) / norm) 146 | 147 | if (i + 1) % 10 == 0: 148 | log.info('%d samples completed.' % (len(scores[idx]))) 149 | 150 | log.info('Rescoring done in %.3f seconds.' 
% (time.time() - start)) 151 | 152 | # Convert scores to numpy array and transpose 153 | scores = np.array(scores, dtype='float32').T 154 | 155 | # Write final file 156 | write_rescore_file(args.trg_file, args.out_file, scores, is_trg_nbest) 157 | 158 | # Remove n-best related temporary files 159 | for file_ in remove_temp_files: 160 | os.unlink(file_) 161 | 162 | # Report success 163 | sys.exit(0) 164 | -------------------------------------------------------------------------------- /bin/nmt-test-lm: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Score a source file using a model.""" 3 | 4 | import os 5 | import sys 6 | import time 7 | import argparse 8 | import importlib 9 | from multiprocessing import Process, Queue, cpu_count 10 | 11 | from collections import OrderedDict 12 | 13 | import numpy as np 14 | 15 | from nmtpy.logger import Logger 16 | from nmtpy.config import Config 17 | from nmtpy.sysutils import * 18 | from nmtpy.iterators.bitext import BiTextIterator 19 | import nmtpy.cleanup as cleanup 20 | 21 | Logger.setup() 22 | log = Logger.get() 23 | 24 | """Worker process which does calculate logprobs from data send through the model.""" 25 | def test_model(queue, rqueue, pid, model): 26 | 27 | while True: 28 | req = queue.get() 29 | # We should avoid this 30 | if req is None: 31 | break 32 | 33 | # Get data from queue 34 | idx, data_dict = req[0], req[1] 35 | # Calculate validation loss 36 | curr_loss, sentlen = model.val_loss(data_dict) 37 | 38 | score=0 39 | rqueue.put((idx, score, sentlen, curr_loss)) 40 | 41 | 42 | """Tester starts worker processes, delegates source iterator 43 | to them, waits for the results.""" 44 | class Tester(object): 45 | def __init__(self, args): 46 | # Always lists provided by argparse (nargs:'+') 47 | self.src_files = args.src_files 48 | self.ref_files = args.ref_files 49 | 50 | self.n_jobs = args.n_jobs 51 | self.model_file = args.model 52 | 53 | # Not used 54 | self.seed = 1234 55 | 56 | self.utf8 = False 57 | 58 | # Create worker process pool 59 | self.processes = [None] * self.n_jobs 60 | 61 | def set_model_options(self): 62 | model_options = get_model_options(self.model_file) 63 | 64 | # Import the module 65 | self.__class = importlib.import_module("nmtpy.models.%s" % model_options['model_type']).Model 66 | 67 | # Create the model 68 | self.model = self.__class(seed=self.seed, logger=None, **model_options) 69 | self.model.load(self.model_file) 70 | self.model.set_dropout(False) 71 | 72 | # invert dictionary 73 | self.ref_idict = dict([[v,k] for k,v in self.model.src_dict.items()]) 74 | 75 | # Normal test mode 76 | if self.src_files is not None: 77 | self.model.data['valid_src'] = self.src_files[0] 78 | 79 | self.model.load_valid_data() 80 | self.iterator = self.model.valid_iterator 81 | self.n_sentences = self.iterator.n_samples 82 | log.info('I will test %d samples' % self.n_sentences) 83 | 84 | if self.src_files is None: 85 | self.src_files = listify(self.model.data['valid_src']) 86 | log.info("No test data given, assuming validation dataset.") 87 | 88 | # Print information 89 | log.info("Source file(s)") 90 | for f in self.src_files: 91 | log.info(" %s" % f) 92 | 93 | # It's possible that we don't have any reference files, e.g. for test sets. 
94 | if self.ref_files: 95 | log.info("Reference file(s)") 96 | for f in self.ref_files: 97 | log.info(" %s" % f) 98 | 99 | def start(self): 100 | # create input and output queues for processes 101 | write_queue = Queue() 102 | read_queue = Queue() 103 | # Create processes 104 | self.model.build_sampler() 105 | self.model.build() 106 | for idx in range(self.n_jobs): 107 | self.processes[idx] = Process(target=test_model, args=(write_queue, read_queue, idx, self.model)) 108 | self.processes[idx].start() 109 | cleanup.register_proc(self.processes[idx].pid) 110 | 111 | cleanup.register_handler() 112 | 113 | # Send data to worker processes 114 | for idx in range(self.n_sentences): 115 | sample = next(self.iterator) 116 | write_queue.put((idx, sample)) 117 | 118 | log.info("Distributed %d sentences to worker processes." % self.n_sentences) 119 | 120 | # Receive the results 121 | self.sentences = [None] * self.n_sentences 122 | self.log_probs = [None] * self.n_sentences 123 | 124 | t = time.time() 125 | sum_sentlen = 0 126 | sum_logprob = 0. 127 | 128 | for i in range(self.n_sentences): 129 | # Get response from worker 130 | resp = read_queue.get() 131 | 132 | # This is the sample id of the processed sample 133 | idx = resp[0] 134 | sentlens, logprobs = resp[2], resp[3] 135 | 136 | sum_sentlen += sentlens 137 | sum_logprob += sum(logprobs) 138 | self.log_probs[idx] = logprobs 139 | 140 | # Print progress 141 | if (i+1) % 100 == 0: 142 | t = time.time() - t 143 | log.info("%d/%d sentences completed (%.2f seconds)" % ((i+1), self.n_sentences, t)) 144 | t = time.time() 145 | 146 | log.info("Test Perplexity: %.4f" % np.exp(sum_logprob/sum_sentlen)) 147 | # Stop workers 148 | for idx in range(self.n_jobs): 149 | write_queue.put(None) 150 | self.processes[idx].terminate() 151 | cleanup.unregister_proc(self.processes[idx].pid) 152 | 153 | #Write sentence and logprobs for rescoring purpose 154 | def write_logprobs(self, filename, dump_scores=False): 155 | def __encode(s): 156 | return s.encode('utf-8') if self.utf8 else s 157 | 158 | with open(filename, 'w') as f: 159 | log.info("Writing output file...") 160 | for idx, lp in enumerate(self.log_probs): 161 | logprobs_array=''.join(map(str,' '.join(str(i[0]) for i in lp))) 162 | f.write(__encode("%d ||| %s ||| %f\n"%(idx, logprobs_array,sum(lp)))) 163 | 164 | if __name__ == "__main__": 165 | parser = argparse.ArgumentParser(prog='lm-test') 166 | parser.add_argument('-j', '--n-jobs' , type=int, default=8, 167 | help="Number of processes (default: 8, 0: Auto)") 168 | 169 | parser.add_argument('-m', '--model' , type=str, help="Model file", required=True) 170 | parser.add_argument('-o', '--saveto' , type=str, help="Output test file (if not given, only metrics will be printed)", 171 | default=None) 172 | parser.add_argument('-s', '--score' , action='store_true', help="Print scores of each sentence") 173 | 174 | parser.add_argument('-S', '--src-files' , type=str, help="Source data file (default: validation set)", 175 | nargs='+', default=None) 176 | parser.add_argument('-R', '--ref-files' , type=str, help="One or multiple reference files (default: validation set)", 177 | nargs='+', 178 | default=None) 179 | 180 | args = parser.parse_args() 181 | 182 | if args.n_jobs == 0: 183 | # Auto infer CPU number 184 | args.n_jobs = (cpu_count() / 2) - 1 185 | 186 | # This is to avoid thread explosion. Allow 187 | # each process to use a single thread. 
188 | os.environ["OMP_NUM_THREADS"] = "1" 189 | os.environ["MKL_NUM_THREADS"] = "1" 190 | os.environ["OPENBLAS_NUM_THREADS"] = "1" 191 | 192 | # Force CPU 193 | os.environ["THEANO_FLAGS"] = "device=cpu" 194 | 195 | # Create tester object 196 | tester = Tester(args) 197 | tester.set_model_options() 198 | tester.start() 199 | out_file = args.saveto 200 | 201 | tester.write_logprobs(out_file, args.score) 202 | 203 | sys.exit(0) 204 | -------------------------------------------------------------------------------- /bin/nmt-translate-client: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Client for nmt-translate server 5 | # Send the content of an input file to the server and print the resulting translation 6 | 7 | import sys 8 | import re 9 | import http.client 10 | import argparse 11 | 12 | parser = argparse.ArgumentParser(description='nmt-translate client') 13 | parser.add_argument('inputfile', help='text to translate') 14 | parser.add_argument('-s', '--server', dest='HTTPserver', help='nmt-translate server adress (localhost:30060)', nargs='?', default="localhost:30060") 15 | args = parser.parse_args() 16 | 17 | if '@' in args.HTTPserver: 18 | urlbase,proxy = args.HTTPserver.split('@') 19 | else: 20 | urlbase = args.HTTPserver 21 | proxy = None 22 | if proxy: 23 | connectionaddress = proxy 24 | else: 25 | connectionaddress = args.HTTPserver 26 | 27 | # request to translation server 28 | def translate(text): 29 | # start HTTP connection (a simple TCP connection could not pass firewall) 30 | r=0 31 | try: 32 | conn = http.client.HTTPConnection(connectionaddress) 33 | conn.request('GET', urlbase, text.encode('utf8')) 34 | r = conn.getresponse() 35 | response=r.read() 36 | return response.decode('utf8') 37 | except Exception as e: 38 | message = "Failed to connect: "+str(e) 39 | if r: 40 | message += "Error %d" % r.status 41 | print(message) 42 | return None 43 | 44 | # open input file 45 | try: 46 | f = open(args.inputfile, 'r') 47 | inputText = f.read().strip() 48 | print("source: %s" % inputText) 49 | except IOError: 50 | print("Failed to open input file (%s)" % args.inputfile) 51 | sys.exit(1) 52 | 53 | # map input file format to translation model format 54 | # ex: Les onze prétendants à l'Elysée s'affrontent mardi ==> l e s | o n z e | p r é t e n d a n t s | à | l e l y s é e | s a f f r o n t e n t | m a r d i 55 | inputText = re.sub("[^\w\s]|[0-9]", "", inputText.lower()) # clean extra spaces and digits + lowercase 56 | inputText = re.sub('\s+', '|', inputText) # use pipe as word separator (for grapheme-to-phoneme conversion) 57 | inputText = " ".join(inputText) # tokenize: separate letters by spaces 58 | 59 | # send translation request 60 | rep=translate("%s"%inputText) 61 | 62 | if (not rep): 63 | print ("Failed to translate: "+ str(rep)) 64 | else: 65 | print ("target: %s" % rep) 66 | 67 | -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | Introduction 2 | ------------ 3 | 4 | **nmtpy** is a suite of Python tools, primarily based on the starter code provided in [dl4mt-tutorial](https://github.com/nyu-dl/dl4mt-tutorial) 5 | for training neural machine 
translation models using Theano. 6 | 7 | The basic motivation behind forking **dl4mt-tutorial** was to create a framework where it would be 8 | easy to implement a new model by *merely* creating a new Python file. 9 | 10 | Features 11 | ----- 12 | 13 | ### General 14 | - No shell script, everything is in Python 15 | - Overhaul object-oriented refactoring of the code: clear separation of API and scripts that interface with the API 16 | - INI style configuration files to define everything regarding a training experiment 17 | - Transparent cleanup mechanism to kill stale processes, remove temporary files 18 | - Simultaneous logging of training details to stdout and log file 19 | 20 | - Supports out-of-the-box BLEU, METEOR and COCO eval metrics 21 | - Includes [subword-nmt](https://github.com/rsennrich/subword-nmt) utilities for training and applying BPE model 22 | - Plugin-like text filters for hypothesis post-processing (Example: BPE, Compound, Desegment) 23 | - Early-stopping and checkpointing based on perplexity, BLEU or METEOR 24 | - Ability to add new metrics easily 25 | - Single `.npz` file to store everything about a training experiment 26 | - Automatic free GPU selection and reservation using `nvidia-smi` 27 | - Shuffling between epochs 28 | - Simple shuffle 29 | - [Homogeneous batches of same-length samples](https://github.com/kelvinxu/arctic-captions) to improve training speed 30 | - Improved parallel translation decoding on CPU 31 | - Forced decoding i.e. rescoring using NMT 32 | - Export decoding informations into `json` for further visualization of attention weights 33 | 34 | ### Training 35 | - Improved numerical stability and reproducibility 36 | - Glorot/Xavier, He, Orthogonal weight initializations 37 | - Efficient SGD, Adadelta, RMSProp and ADAM 38 | - Single forward/backward theano function without intermediate variables 39 | - Initialization of a model with weights from another nmtpy model 40 | - Ability to freeze pre-trained weights 41 | - Several recurrent blocks: 42 | - GRU, Conditional GRU (CGRU) and LSTM 43 | - Multimodal attentive CGRU variants 44 | - [Layer Normalization](https://github.com/ryankiros/layer-norm) support for GRU 45 | - [Tied target embeddings](https://arxiv.org/abs/1608.05859) 46 | - Simple/Non-recurrent Dropout, L2 weight decay 47 | - Training and validation loss normalization for comparable perplexities 48 | -------------------------------------------------------------------------------- /docs/logo.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lium-lst/nmtpy/dc0a1618f217d5117d6abeacdc15a22443561acf/docs/logo.pdf -------------------------------------------------------------------------------- /docs/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lium-lst/nmtpy/dc0a1618f217d5117d6abeacdc15a22443561acf/docs/logo.png -------------------------------------------------------------------------------- /docs/pages/config.md: -------------------------------------------------------------------------------- 1 | Configuration Files 2 | -- 3 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | ## WMT Shared Task on Multimodal Translation 4 | 5 | - [WMT17 Multimodal Translation (Task 1)](https://github.com/lium-lst/wmt17-mmt) 6 | - [WMT16 Cross-lingual Image 
Description Generation (Task 2)](wmt16-mmt-task2) : Monomodal and Multimodal 7 | 8 | **Note:** All textual data provided in the `data/` folders of the above examples are the courtesy of the following work 9 | and can be downloaded from [here](http://www.statmt.org/wmt17/multimodal-task.html). 10 | 11 | If you use `fusion_*` multimodal architectures in your work, please cite the following 12 | article: 13 | 14 | ``` 15 | @article{caglayan2016multimodal, 16 | title={Multimodal Attention for Neural Machine Translation}, 17 | author={Caglayan, Ozan and Barrault, Lo{\"\i}c and Bougares, Fethi}, 18 | journal={arXiv preprint arXiv:1609.03976}, 19 | year={2016} 20 | } 21 | ``` 22 | 23 | ### Getting the Image Features 24 | 25 | For multimodal baselines, you will need the convolutional features extracted 26 | from a pre-trained ResNet-50. You can download these files from the links below: 27 | 28 | - [flickr30k_ResNets50_blck4_train.fp16.npy.xz](http://www-lium.univ-lemans.fr/sites/default/files/NMTPY/flickr30k_ResNets50_blck4_train.fp16.npy.xz) (6GB) 29 | - [flickr30k_ResNets50_blck4_val.fp16.npy.xz](http://www-lium.univ-lemans.fr/sites/default/files/NMTPY/flickr30k_ResNets50_blck4_val.fp16.npy.xz) (214M) 30 | - [flickr30k_ResNets50_blck4_test.fp16.npy.xz](http://www-lium.univ-lemans.fr/sites/default/files/NMTPY/flickr30k_ResNets50_blck4_test.fp16.npy.xz) (211M) 31 | 32 | After downloading the files, extract them using the following command: 33 | 34 | ``` 35 | xz -d 36 | ``` 37 | 38 | Each `.npy` file contains 14x14x1024 convolutional feature maps for each image 39 | which are extracted from **res4f_relu** layer of a ResNet-50 trained on ImageNet: 40 | (The `fp16` suffix means that the `dtype` is `float16`.) 41 | 42 | ``` 43 | >> valfeats = numpy.load('flickr30k_ResNets50_blck4_val.fp16.npy') 44 | >> valfeats.shape 45 | (1014, 196, 1024) 46 | # 1014: n_samples 47 | # 196: flattened 14x14 into 196 for convenience 48 | # 1024: n_feature_maps 49 | ``` 50 | 51 | For more information about the image features, please refer to: 52 | 53 | ``` 54 | @article{caglayan2016does, 55 | title={Does Multimodality Help Human and Machine for Translation and Image Captioning?}, 56 | author={Caglayan, Ozan and Aransa, Walid and Wang, Yaxing and Masana, Marc and Garc{\'\i}a-Mart{\'\i}nez, Mercedes and Bougares, Fethi and Barrault, Lo{\"\i}c and van de Weijer, Joost}, 57 | journal={arXiv preprint arXiv:1605.09186}, 58 | year={2016} 59 | } 60 | ``` 61 | 62 | ## Factored Neural Machine Translation system 63 | 64 | The Factored NMT models defined by basefnmt.py are based on the NMT architecture and extended to be able to generate several output symbols at the same time (Figure http://www-lium.univ-lemans.fr/~garcia/fnmt_archi.pdf). 65 | 66 | Folder `ted-factors` contains examples of how to use this system. 67 | 68 | ### Citation: 69 | 70 | If you use fnmt system in your work, please cite the following: 71 | 72 | ``` 73 | @inproceedings{garcia-martinez2016fnmt, 74 | title={Factored Neural Machine Translation Architectures}, 75 | author={Garc{\'\i}a-Mart{\'\i}nez, Mercedes and Barrault, Lo{\"\i}c and Bougares, Fethi}, 76 | booktitle={Proceedings of the International Workshop on Spoken Language Translation (IWSLT)}, 77 | year={2016}, 78 | url={'http://workshop2016.iwslt.org/downloads/IWSLT_2016_paper_2.pdf'} 79 | } 80 | ``` 81 | 82 | Contact: Mercedes.Garcia_Martinez@univ-lemans.fr. 
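Returning to the image features described in *Getting the Image Features* above: the second axis of each `.npy` file is the 14x14 grid flattened to 196 positions, so it can be reshaped back whenever the spatial layout is needed. A minimal sketch (plain NumPy, nothing nmtpy-specific; the file name assumes you decompressed the validation features with `xz -d` as shown above):

```python
import numpy as np

# Convolutional features: (n_samples, 196, 1024), stored as float16
feats = np.load('flickr30k_ResNets50_blck4_val.fp16.npy')
print(feats.shape, feats.dtype)

# Recover the 14x14 spatial grid of 1024-dim feature vectors for the first image
first_image = feats[0].reshape(14, 14, 1024)
print(first_image.shape)   # (14, 14, 1024)
```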
83 | -------------------------------------------------------------------------------- /examples/ted-factors/README.md: -------------------------------------------------------------------------------- 1 | # Factored Neural Machine Translation system 2 | 3 | The Factored NMT models defined by ```basefnmt.py``` are based on the NMT architecture and extended to be able to generate several output symbols at the same time (Figure http://www-lium.univ-lemans.fr/~garcia/fnmt_archi.pdf). 4 | 5 | The decoder has been modified respect to the baseline model with the following items: 6 | 7 | - Specialized iterator named ```factors.py``` that handles multiple inputs and outputs text streams. 8 | - Additional softmax and embedding for the 2nd output. 9 | - Concatenation of the embeddings of the generated tokens at previous timestep to feedback the generation of the current token. 10 | - Sum of costs coming from each output. 11 | - Constriction of the length of the 2nd output sequence to be equal to the length of the 1st output sequence. 12 | Firstly, we included a new mask excluding the end of sequence (\tm{EOS}) symbols to avoid shorter sequences. 13 | Secondly, we limited the maximum length of the 2nd output sequence to the length of the 1st output sequence. 14 | - The beam search has been modified to be able to handle the multiple outputs. 15 | Once we obtain the hypothesis from lemmas (1st output) and factors (2nd output) at stage 1 of the Figure http://www-lium.univ-lemans.fr/~garcia/beamsearch.pdf, the cross product of those output spaces is performed. 16 | Afterwards, we keep the beam size best combinations for each hypothesis. 17 | Finally, the number of samples is reduced again to the beam size. 18 | - Translation generation executed by ```nmt-translate-factors``` which can handle multiple outputs. 19 | - Optionally, \tm{factors2wordbleu.py} metric is available to evaluate with BLEU the combination of the several outputs. 20 | A script detailed in the configuration file is necessary to apply this metric. 21 | 22 | ## TED data 23 | 24 | - Download [examples-ted-data.tar.bz2](http://www-lium.univ-lemans.fr/~garcia/examples-ted-data.tar.bz2) and extract it into the `data/` folder. 25 | 26 | - Build the vocabulary dictionaries for each train file: 27 | 28 | `nmt-build-dict train_file` 29 | 30 | - Option factors enable the factored system. 31 | Factors parameter gets as argument `evalf` which will evaluate the model just with the first output or a script to combine the 2 outputs as desired. 32 | 33 | This script will need as arguments `lang, first_output_hyp_file, second_output_hyp_file, reference_file` in this order and will print the corresponding BLEU score. 34 | 35 | ## FNMT Training 36 | 37 | Run `nmt-train -c attention_factors-ted-en-fr.conf` to train a FNMT on this corpus. 38 | 39 | ## FNMT Translation 40 | 41 | When the training is over, you can translate the test set using the following command: 42 | 43 | ``` 44 | nmt-translate-factors -m ~/nmtpy/models/ \ 45 | -S ~/nmtpy/examples/ted-factors/data/dev.en \ 46 | -R ~/nmtpy/examples/ted-factors/data/dev.fr \ 47 | ~/nmtpy/examples/ted-factors/data/dev.lemma.fr \ 48 | ~/nmtpy/examples/ted-factors/data/dev.factors.fr \ 49 | -o trans_dev.lemma.fr trans_dev.factors.fr \ 50 | -fa evalf 51 | ``` 52 | The option -R needs the references of the word-level, first output and second output, repectively. 53 | 54 | In -fa option you can include your script to combine both outputs if desired instead of evalf option. 
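For reference, such a script only needs to respect the calling convention described above (`lang first_output_hyp_file second_output_hyp_file reference_file`) and print a BLEU score. The sketch below is **not** part of nmtpy and only illustrates that contract: `combine()` is a placeholder that simply glues each lemma to its factor tag and should be replaced by your own lemma+factors word-generation logic, and the scoring step assumes `multi-bleu.perl` from mosesdecoder is available in your `$PATH`.

```python
#!/usr/bin/env python
# Hypothetical '-fa' script, called as:
#   script.py <lang> <first_output_hyp_file> <second_output_hyp_file> <reference_file>
# and expected to print a BLEU score.
import sys
import subprocess

def combine(lemma_line, factor_line):
    # Placeholder recombination: pair each lemma with its factor tag.
    # Replace with your own lemma+factors -> word generation.
    return ' '.join('%s|%s' % (l, f)
                    for l, f in zip(lemma_line.split(), factor_line.split()))

if __name__ == '__main__':
    # 'lang' is part of the calling convention but unused in this sketch
    lang, hyp1_file, hyp2_file, ref_file = sys.argv[1:5]

    # Recombine the two hypothesis streams into word-level hypotheses
    comb_file = hyp1_file + '.words'
    with open(hyp1_file) as f1, open(hyp2_file) as f2, open(comb_file, 'w') as fo:
        for lemmas, factors in zip(f1, f2):
            fo.write(combine(lemmas.strip(), factors.strip()) + '\n')

    # Score against the word-level reference (multi-bleu.perl assumed in $PATH)
    with open(comb_file) as hyp:
        out = subprocess.check_output(['multi-bleu.perl', ref_file], stdin=hyp)

    # multi-bleu.perl prints e.g. 'BLEU = 25.30, ...': keep just the score
    print(out.decode('utf-8').split(',')[0].split('=')[1].strip())
```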
55 | 56 | 57 | ## Citation: 58 | If you use `fnmt` system in your work, please cite the following: 59 | 60 | ``` 61 | @inproceedings{garcia-martinez2016fnmt, 62 | title={Factored Neural Machine Translation Architectures}, 63 | author={Garc{\'\i}a-Mart{\'\i}nez, Mercedes and Barrault, Lo{\"\i}c and Bougares, Fethi}, 64 | booktitle={arXiv preprint arXiv:1605.09186}, 65 | year={2016} 66 | } 67 | ``` 68 | 69 | More info: 70 | http://workshop2016.iwslt.org/downloads/IWSLT_2016_paper_2.pdf 71 | 72 | Contact: Mercedes.Garcia_Martinez@univ-lemans.fr. 73 | 74 | 75 | -------------------------------------------------------------------------------- /examples/ted-factors/attention_factors-ted-en-fr.conf: -------------------------------------------------------------------------------- 1 | [training] 2 | # Main .py file which will be used for the model 3 | model_type: attention_factors 4 | # how much validation period will we wait 5 | # to do early stopping 6 | patience: 10 7 | # Maximum number of epochs before stopping training 8 | max_epochs: 20 9 | # Validation start in terms of epochs 10 | # validation frequency in terms of minibatch updates 11 | valid_start: 2 12 | valid_freq: 5000 13 | # Save the hypothesis file for each validation 14 | valid_save_hyp: True 15 | # 0: no, otherwise weight decay factor 16 | decay_c: 0 17 | # -1: no, otherwise maximum gradient norm 18 | clip_c: 1.0 19 | seed: 1234 20 | 21 | [model] 22 | # Using the same embedding for output and previous 23 | tied_emb: 2way 24 | layer_norm: False 25 | # Sort batches by target length or not 26 | shuffle_mode: None 27 | # 0: no, otherwise dropout probability 28 | dropout: 0.0 29 | 30 | # Embedding vector dimension 31 | embedding_dim: 620 32 | 33 | # RNN's hidden layer dimension 34 | rnn_dim: 1000 35 | enc_type: gru 36 | dec_type: gru_cond 37 | 38 | # Number of jobs while translating 39 | njobs: 15 40 | 41 | # adadelta, adam, sgd or rmsprop 42 | optimizer: adadelta 43 | 44 | # Learning rate (only for SGD) 45 | lrate: 1 46 | 47 | # batch size 48 | batch_size: 80 49 | 50 | #Normalization of the cost 51 | norm_cost: False 52 | 53 | # Use BLEU as additional validation metric 54 | valid_metric: bleu 55 | 56 | # Script to combine output factors or 'evalf' to evaluate just with the first output 57 | factors: evalf 58 | 59 | weight_init: xavier 60 | 61 | # 0: use all vocabulary, otherwise upper limit as integer 62 | n_words_src: 30000 63 | n_words_trg1: 30000 64 | n_words_trg2: 0 65 | 66 | # Where to save model params, weights and training log file 67 | save_path: ~/nmtpy/models 68 | 69 | [model.dicts] 70 | src: ~/nmtpy/examples/ted-factors/data/train.en.vocab.pkl 71 | trg1: ~/nmtpy/examples/ted-factors/data/train.lemma.fr.vocab.pkl 72 | trg2: ~/nmtpy/examples/ted-factors/data/train.factors.fr.vocab.pkl 73 | 74 | [model.data] 75 | train_src: ~/nmtpy/examples/ted-factors/data/train.en 76 | train_trg1: ~/nmtpy/examples/ted-factors/data/train.lemma.fr 77 | train_trg2: ~/nmtpy/examples/ted-factors/data/train.factors.fr 78 | valid_src: ~/nmtpy/examples/ted-factors/data/dev.en 79 | valid_trg: ~/nmtpy/examples/ted-factors/data/dev.fr 80 | valid_trg1: ~/nmtpy/examples/ted-factors/data/dev.lemma.fr 81 | valid_trg2: ~/nmtpy/examples/ted-factors/data/dev.factors.fr 82 | 83 | -------------------------------------------------------------------------------- /examples/ted-factors/data/README.md: -------------------------------------------------------------------------------- 1 | Extract the downloaded `examples-ted-data.tar.bz2` inside here. 
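Once the archive is extracted here and the dictionaries have been built with `nmt-build-dict` (see the parent README), the resulting `.vocab.pkl` files referenced by `attention_factors-ted-en-fr.conf` can be sanity-checked directly. A small sketch, assuming the layout produced by `nmt-build-dict`: an ordered token-to-index mapping with two reserved special tokens at indices 0 and 1, and the remaining tokens sorted by decreasing frequency from index 2.

```python
import pickle

# One of the dictionaries listed under [model.dicts] in the example configuration
with open('train.lemma.fr.vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)

print('vocabulary size:', len(vocab))

# First entries: the two reserved tokens, then the most frequent tokens
for token, idx in list(vocab.items())[:5]:
    print(idx, token)
```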
2 | -------------------------------------------------------------------------------- /examples/wmt16-mmt-task2/README.md: -------------------------------------------------------------------------------- 1 | # WMT16 Shared task on Multimodal Translation 2 | ## Task 2 - Cross-lingual Image Description Generation 3 | 4 | ### Multi30k Dataset 5 | 6 | A copy of the original text files for Task 2 are available under `data/`. These files are downloaded 7 | from [WMT16 Multimodal Task](http://www.statmt.org/wmt16/multimodal-task.html) webpage. 8 | 9 | (**Note:** If you would like to fix some mistakes in the corpora, you can apply [this patch](data/fix-corpus-bugs.patch) before proceeding. 10 | 11 | ### Normalization and Tokenization 12 | 13 | Make sure that the following scripts from the `mosesdecoder` project are in your `$PATH`: 14 | - tokenizer.perl 15 | - normalize-punctuation.perl 16 | 17 | Run `scripts/01-tokenize.sh ~/nmtpy/data/wmt16-task2` to: 18 | - Normalize punctuations 19 | - Tokenize 20 | 21 | train, val and test files from `data/` and save them under `~/nmtpy/data/wmt16-task2`. 22 | **Note that** the output folder is in accordance with the configuration file 23 | `wmt16-task2-monomodal.conf` so if you use another output folder, change the configuration 24 | file as well. 25 | 26 | ### Preparing Data 27 | 28 | `scripts/02-prepare.py` is a Python script that consumes all the tokenized data produces: 29 | - processed text files 30 | - `pkl` files to be used by `WMTIterator` 31 | - `nmtpy` dictionary files for source and target vocabularies 32 | 33 | You can run the following command to prepare above files: 34 | ``` 35 | ODIR=~/nmtpy/data/wmt16-task2 36 | scripts/02-prepare.py -i data/split_all.txt \ 37 | -t $ODIR/train.*.en -T $ODIR/train.*.de \ 38 | -v $ODIR/val.*.en -V $ODIR/val.*.de \ 39 | -e $ODIR/test.*.en -E $ODIR/test.*.de \ 40 | -l -s -d 5 -o $ODIR # lowercase, strippunct, minwordoccurrence5 41 | ``` 42 | 43 | The produced `.pkl` data files contain a `list` of samples for each of the train/val/test sets 44 | where a sample is represented with: 45 | - `ssplit`: An integer between 0-4 representing from which file the source sentence came from 46 | - `tsplit`: An integer between 0-4 representing from which file the source sentence came from 47 | - `imgid`: An integer between 0-(N-1) representing the order of the image for a set containing N images 48 | - `imgname`: The name of the JPG image file 49 | - `swords`: List of source words 50 | - `twords`: List of target words 51 | 52 | Let's see with a concrete example: 53 | ```bash 54 | cd $ODIR 55 | ipython 56 | ``` 57 | 58 | ```python 59 | ... 60 | In [1]: import pickle 61 | 62 | In [2]: v = pickle.load(open('flickr_30k_align.valid.pkl')) 63 | 64 | In [3]: len(v) 65 | Out[3]: 25350 66 | 67 | In [4]: v[0] 68 | Out[4]: 69 | [0, 70 | 0, 71 | 0, 72 | '1018148011.jpg', 73 | [u'a', 74 | u'group', 75 | u'of', 76 | u'people', 77 | u'stand', 78 | u'in', 79 | u'the', 80 | u'back', 81 | u'of', 82 | u'a', 83 | u'truck', 84 | u'filled', 85 | u'with', 86 | u'cotton'], 87 | [u'baumwolllager', u'mit', u'lkw']] 88 | ``` 89 | 90 | A clarification should be made about the number of samples in a set: since we have 5 source and 5 target sentences for each image, the script generates `5x5=25` comparable pairs for a single image. Since the validation set contains 1014 images, this makes a total of `25*1014=25350` samples. 
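A short sketch to check this layout on the validation pickle, using the field order shown in the sample above (`ssplit, tsplit, imgid, imgname, swords, twords`):

```python
import pickle
from collections import Counter

with open('flickr_30k_align.valid.pkl', 'rb') as f:
    samples = pickle.load(f)

print(len(samples))                         # 25350 = 25 * 1014

# 25 source/target combinations per image
per_image = Counter(s[3] for s in samples)  # s[3] is imgname
print(len(per_image))                       # 1014 distinct images
print(set(per_image.values()))              # {25}

# The 5 aligned description pairs per image share the same split index on both sides
pairs = [s for s in samples if s[0] == s[1]]
print(len(pairs))                           # 5 * 1014 = 5070
```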
91 | 92 | During training, you can select whether you would like to use: 93 | - All 25 comparable pairs for an image (`data_mode:all`) 94 | - 5 comparable pairs for an image (**default:** `data_mode:pairs`) 95 | - `(.1.en, .1.de), (.2.en, .2.de), ..., (.5.en, .5.de)` 96 | - Just one pair from the first pair of files: `.1.en -> .1.de` (`data_mode:single`) 97 | 98 | During early-stopping, we use by default `single` for validation to only consider the description pairs from `.1.en, .1.de` resulting in 1014 images-captions. 99 | 100 | ### Train a Monomodal NMT 101 | 102 | Run `nmt-train -c wmt16-task2-monomodal.conf` to train a monomodal NMT on this 103 | corpus. When the training is over, you can translate the test set using the following command: 104 | 105 | ``` 106 | nmt-translate -m ~/nmtpy/models/wmt16-mmt-task2-monomodal/ 107 | -S ~/nmtpy/data/wmt16-task2/flickr_30k_align.test.pkl -v pairs \ 108 | -o test_monomodal.tok.de 109 | ``` 110 | 111 | The flag `-v pairs` will generate 5 hypotheses for each image using each source description and 112 | pick the one having the maximum likelihood based on NMT score. 113 | 114 | ### Train a Multimodal NMT 115 | 116 | #### Image Features 117 | 118 | You need to [download](../README.md) ResNet-50 convolutional feature files, uncompress them and save 119 | under `~/nmtpy/data/wmt16-task2`. 120 | 121 | Run `nmt-train -c wmt16-task2-multimodal.conf` to train a `fusion_concat_dep_ind` architecture. 122 | When the training is over, you can translate the test set with the following command: 123 | 124 | ``` 125 | nmt-translate -m ~/nmtpy/models/wmt16-mmt-task2-multimodal/ \ 126 | -S ~/nmtpy/data/wmt16-task2/flickr_30k_align.test.pkl \ 127 | ~/nmtpy/data/wmt16-task2/flickr30k_ResNets50_blck4_test.fp16.npy -v pairs \ 128 | -o test_multimodal.tok.de 129 | -------------------------------------------------------------------------------- /examples/wmt16-mmt-task2/data/README.md: -------------------------------------------------------------------------------- 1 | Multi30k dataset 2 | --- 3 | 4 | This is a reorganized folder containing extracted and renamed files from 5 | the original WMT16 Multimodal Translation Task 2 train/dev/test splits, namely 6 | the Multi30k dataset. 7 | 8 | Original files can be downloaded from [here](http://www.statmt.org/wmt16/multimodal-task.html) 9 | 10 | The files are organized as follows: 11 | - `train.[1-5].{en,de}`: 5 splits of training set each having 29K sentences 12 | - `val.[1-5].{en,de}`: 5 splits of dev set each having 1014 sentences 13 | - `test.[1-5].{en,de}`: 5 splits of test set each having 1000 sentences 14 | - `split_*.txt`: Text files containing sentence to image name mapping for each sets. 15 | 16 | (The patch file `fix-corpus-bugs.patch` can be applied on top of these files to fix some bugs 17 | in the corpus.) 
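A typical invocation would be something like the following (a sketch only; the `-p` strip level and working directory depend on how the paths inside the patch are written, so adjust them if `patch` cannot locate the files):

```
cd examples/wmt16-mmt-task2
patch -p1 < data/fix-corpus-bugs.patch
```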
18 | -------------------------------------------------------------------------------- /examples/wmt16-mmt-task2/scripts/01-tokenize.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | OUT=$1 3 | 4 | if [ -z $OUT ]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | mkdir $OUT &> /dev/null 10 | 11 | for lang in en de; do 12 | for f in $(ls --color=none data/*$lang); do 13 | fname=`basename $f` 14 | fname=${fname/\.$lang/} 15 | echo "Normalizing punctuation and tokenizing $f" 16 | cat $f | normalize-punctuation.perl -l $lang | tokenizer.perl -threads 8 -l $lang > $OUT/"$fname.norm.tok.$lang" 17 | done 18 | done 19 | -------------------------------------------------------------------------------- /examples/wmt16-mmt-task2/wmt16-mmt-task2-monomodal.conf: -------------------------------------------------------------------------------- 1 | [training] 2 | # This is the attention NMT with WMT iterator 3 | model_type: attention_wmt 4 | patience: 10 5 | max_epochs: 100 6 | valid_freq: 0 7 | valid_metric: meteor 8 | decay_c: 1e-5 9 | clip_c: 5 10 | seed: 1234 11 | 12 | [model] 13 | tied_emb: 2way 14 | layer_norm: True 15 | shuffle_mode: trglen 16 | embedding_dim: 100 17 | rnn_dim: 100 18 | 19 | optimizer: adam 20 | lrate: 0.0004 21 | weight_init: xavier 22 | batch_size: 32 23 | 24 | n_words_src: 0 25 | 26 | # Only the most 10K, other -> UNK 27 | n_words_trg: 10000 28 | 29 | save_path: ~/nmtpy/models 30 | 31 | [model.dicts] 32 | src: ~/nmtpy/data/wmt16-task2/train_src.pkl 33 | trg: ~/nmtpy/data/wmt16-task2/train_trg.pkl 34 | 35 | [model.data] 36 | train_src: ~/nmtpy/data/wmt16-task2/flickr_30k_align.train.pkl 37 | valid_src: ~/nmtpy/data/wmt16-task2/flickr_30k_align.valid.pkl 38 | valid_trg: ~/nmtpy/data/wmt16-task2/valid.*.tok.lc.nopunct.de 39 | -------------------------------------------------------------------------------- /examples/wmt16-mmt-task2/wmt16-mmt-task2-multimodal.conf: -------------------------------------------------------------------------------- 1 | [training] 2 | model_type: fusion_concat_dep_ind 3 | patience: 10 4 | max_epochs: 100 5 | valid_freq: 0 6 | valid_metric: meteor 7 | decay_c: 1e-5 8 | clip_c: 5 9 | seed: 1234 10 | 11 | [model] 12 | tied_emb: False 13 | layer_norm: False 14 | shuffle_mode: trglen 15 | embedding_dim: 620 16 | rnn_dim: 1000 17 | conv_dim: 1024 18 | 19 | optimizer: adam 20 | lrate: 0.0004 21 | weight_init: xavier 22 | batch_size: 32 23 | 24 | n_words_src: 0 25 | 26 | # Only the most 10K, other -> UNK 27 | n_words_trg: 10000 28 | 29 | save_path: ~/nmtpy/models 30 | 31 | [model.dicts] 32 | src: ~/nmtpy/data/wmt16-task2/train_src.pkl 33 | trg: ~/nmtpy/data/wmt16-task2/train_trg.pkl 34 | 35 | [model.data] 36 | train_img: ~/nmtpy/data/wmt16-task2/flickr30k_ResNets50_blck4_train.fp16.npy 37 | valid_img: ~/nmtpy/data/wmt16-task2/flickr30k_ResNets50_blck4_val.fp16.npy 38 | train_src: ~/nmtpy/data/wmt16-task2/flickr_30k_align.train.pkl 39 | valid_src: ~/nmtpy/data/wmt16-task2/flickr_30k_align.valid.pkl 40 | valid_trg: ~/nmtpy/data/wmt16-task2/valid.*.tok.lc.nopunct.de 41 | -------------------------------------------------------------------------------- /nmtpy/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '1.0.0' 2 | -------------------------------------------------------------------------------- /nmtpy/cleanup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import 
sys 4 | import signal 5 | import atexit 6 | import traceback 7 | 8 | temp_files = set() 9 | subprocesses = set() 10 | 11 | def register_tmp_file(f): 12 | """Add new temp file to global set.""" 13 | temp_files.add(f) 14 | 15 | def register_proc(pid): 16 | """Add new process to global set.""" 17 | subprocesses.add(pid) 18 | 19 | def unregister_proc(pid): 20 | """Remove given PID from global set.""" 21 | subprocesses.remove(pid) 22 | 23 | def cleanup(): 24 | """Cleanup registered temp files and kill PIDs.""" 25 | for f in temp_files: 26 | try: 27 | os.unlink(f) 28 | except: 29 | pass 30 | 31 | for p in subprocesses: 32 | try: 33 | os.kill(p, signal.SIGTERM) 34 | except: 35 | pass 36 | 37 | def signal_handler(signum, frame): 38 | """Let Python call this when SIGINT or SIGTERM caught.""" 39 | cleanup() 40 | sys.exit(0) 41 | 42 | def register_exception_handler(logger, quit_on_exception=False): 43 | """Setup exception handler.""" 44 | 45 | def exception_handler(exctype, value, tb): 46 | """Let Python call this when an exception is uncaught.""" 47 | logger.info(''.join(traceback.format_exception(exctype, value, tb))) 48 | 49 | def exception_handler_quits(exctype, value, tb): 50 | """Let Python call this when an exception is uncaught.""" 51 | logger.info(''.join(traceback.format_exception(exctype, value, tb))) 52 | sys.exit(1) 53 | 54 | if quit_on_exception: 55 | sys.excepthook = exception_handler_quits 56 | else: 57 | sys.excepthook = exception_handler 58 | 59 | def register_handler(logger, _atexit=True, _signals=True, exception_quits=False): 60 | """Register atexit and signal handlers.""" 61 | if _atexit: 62 | # Register exit handler 63 | atexit.register(cleanup) 64 | 65 | if _signals: 66 | # Register SIGINT and SIGTERM 67 | signal.signal(signal.SIGINT, signal_handler) 68 | signal.signal(signal.SIGTERM, signal_handler) 69 | 70 | register_exception_handler(logger, exception_quits) 71 | -------------------------------------------------------------------------------- /nmtpy/cocoeval/README.md: -------------------------------------------------------------------------------- 1 | pycocoevalcap 2 | --- 3 | 4 | This is a copy from 5 | https://github.com/tylin/coco-caption/tree/master/pycocoevalcap 6 | -------------------------------------------------------------------------------- /nmtpy/cocoeval/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /nmtpy/cocoeval/bleu/LICENSE.bleu: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Xinlei Chen, Hao Fang, Tsung-Yi Lin, and Ramakrishna Vedantam 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /nmtpy/cocoeval/bleu/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /nmtpy/cocoeval/bleu/bleu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # File Name : bleu.py 3 | # 4 | # Description : Wrapper for BLEU scorer. 5 | # 6 | # Creation Date : 06-01-2015 7 | # Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT 8 | # Authors : Hao Fang and Tsung-Yi Lin 9 | 10 | from .bleu_scorer import BleuScorer 11 | 12 | class Bleu: 13 | def __init__(self, n=4): 14 | # default compute Blue score up to 4 15 | self._n = n 16 | self._hypo_for_image = {} 17 | self.ref_for_image = {} 18 | 19 | def compute_score(self, gts, res): 20 | 21 | bleu_scorer = BleuScorer(n=self._n) 22 | for id in sorted(gts.keys()): 23 | hypo = res[id] 24 | ref = gts[id] 25 | 26 | # Sanity check. 27 | assert(type(hypo) is list) 28 | assert(len(hypo) == 1) 29 | assert(type(ref) is list) 30 | assert(len(ref) >= 1) 31 | 32 | bleu_scorer += (hypo[0], ref) 33 | 34 | #score, scores = bleu_scorer.compute_score(option='shortest') 35 | #score, scores = bleu_scorer.compute_score(option='average', verbose=1) 36 | score, scores = bleu_scorer.compute_score(option='closest', verbose=0) 37 | 38 | # return (bleu, bleu_info) 39 | return score, scores 40 | 41 | def method(self): 42 | return "Bleu" 43 | -------------------------------------------------------------------------------- /nmtpy/cocoeval/cider/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /nmtpy/cocoeval/cider/cider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Filename: cider.py 3 | # 4 | # Description: Describes the class to compute the CIDEr (Consensus-Based Image Description Evaluation) Metric 5 | # by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726) 6 | # 7 | # Creation Date: Sun Feb 8 14:16:54 2015 8 | # 9 | # Authors: Ramakrishna Vedantam and Tsung-Yi Lin 10 | 11 | from .cider_scorer import CiderScorer 12 | 13 | class Cider(object): 14 | """ 15 | Main Class to compute the CIDEr metric 16 | 17 | """ 18 | def __init__(self, test=None, refs=None, n=4, sigma=6.0): 19 | # set cider to sum over 1 to 4-grams 20 | self._n = n 21 | # set the standard deviation parameter for gaussian penalty 22 | self._sigma = sigma 23 | 24 | def compute_score(self, gts, res): 25 | """ 26 | Main function to compute CIDEr score 27 | :param hypo_for_image (dict) : dictionary with key and value 28 | ref_for_image (dict) : dictionary with key and value 29 | :return: cider (float) : computed CIDEr score for the corpus 30 | """ 31 | 32 | cider_scorer = CiderScorer(n=self._n, sigma=self._sigma) 33 | 34 | for id in sorted(gts.keys()): 35 | hypo = res[id] 36 | ref = gts[id] 37 | 38 | # Sanity check. 
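            # (i.e. gts[id] must be a non-empty list of reference sentences and
            #  res[id] a single-element list holding the hypothesis string)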
39 | assert(type(hypo) is list) 40 | assert(len(hypo) == 1) 41 | assert(type(ref) is list) 42 | assert(len(ref) > 0) 43 | 44 | cider_scorer += (hypo[0], ref) 45 | 46 | (score, scores) = cider_scorer.compute_score() 47 | 48 | return score, scores 49 | 50 | def method(self): 51 | return "CIDEr" -------------------------------------------------------------------------------- /nmtpy/cocoeval/cider/cider_scorer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Tsung-Yi Lin 3 | # Ramakrishna Vedantam 4 | 5 | import copy 6 | from collections import defaultdict 7 | import numpy as np 8 | import math 9 | 10 | def precook(s, n=4, out=False): 11 | """ 12 | Takes a string as input and returns an object that can be given to 13 | either cook_refs or cook_test. This is optional: cook_refs and cook_test 14 | can take string arguments as well. 15 | :param s: string : sentence to be converted into ngrams 16 | :param n: int : number of ngrams for which representation is calculated 17 | :return: term frequency vector for occuring ngrams 18 | """ 19 | words = s.split() 20 | counts = defaultdict(int) 21 | for k in range(1,n+1): 22 | for i in range(len(words)-k+1): 23 | ngram = tuple(words[i:i+k]) 24 | counts[ngram] += 1 25 | return counts 26 | 27 | def cook_refs(refs, n=4): ## lhuang: oracle will call with "average" 28 | '''Takes a list of reference sentences for a single segment 29 | and returns an object that encapsulates everything that BLEU 30 | needs to know about them. 31 | :param refs: list of string : reference sentences for some image 32 | :param n: int : number of ngrams for which (ngram) representation is calculated 33 | :return: result (list of dict) 34 | ''' 35 | return [precook(ref, n) for ref in refs] 36 | 37 | def cook_test(test, n=4): 38 | '''Takes a test sentence and returns an object that 39 | encapsulates everything that BLEU needs to know about it. 40 | :param test: list of string : hypothesis sentence for some image 41 | :param n: int : number of ngrams for which (ngram) representation is calculated 42 | :return: result (dict) 43 | ''' 44 | return precook(test, n, True) 45 | 46 | class CiderScorer(object): 47 | """CIDEr scorer. 48 | """ 49 | 50 | def copy(self): 51 | ''' copy the refs.''' 52 | new = CiderScorer(n=self.n) 53 | new.ctest = copy.copy(self.ctest) 54 | new.crefs = copy.copy(self.crefs) 55 | return new 56 | 57 | def __init__(self, test=None, refs=None, n=4, sigma=6.0): 58 | ''' singular instance ''' 59 | self.n = n 60 | self.sigma = sigma 61 | self.crefs = [] 62 | self.ctest = [] 63 | self.document_frequency = defaultdict(float) 64 | self.cook_append(test, refs) 65 | self.ref_len = None 66 | 67 | def cook_append(self, test, refs): 68 | '''called by constructor and __iadd__ to avoid creating new instances.''' 69 | 70 | if refs is not None: 71 | self.crefs.append(cook_refs(refs)) 72 | if test is not None: 73 | self.ctest.append(cook_test(test)) ## N.B.: -1 74 | else: 75 | self.ctest.append(None) # lens of crefs and ctest have to match 76 | 77 | def size(self): 78 | assert len(self.crefs) == len(self.ctest), "refs/test mismatch! 
%d<>%d" % (len(self.crefs), len(self.ctest)) 79 | return len(self.crefs) 80 | 81 | def __iadd__(self, other): 82 | '''add an instance (e.g., from another sentence).''' 83 | 84 | if type(other) is tuple: 85 | ## avoid creating new CiderScorer instances 86 | self.cook_append(other[0], other[1]) 87 | else: 88 | self.ctest.extend(other.ctest) 89 | self.crefs.extend(other.crefs) 90 | 91 | return self 92 | def compute_doc_freq(self): 93 | ''' 94 | Compute term frequency for reference data. 95 | This will be used to compute idf (inverse document frequency later) 96 | The term frequency is stored in the object 97 | :return: None 98 | ''' 99 | for refs in self.crefs: 100 | # refs, k ref captions of one image 101 | for ngram in set([ngram for ref in refs for (ngram,count) in ref.items()]): 102 | self.document_frequency[ngram] += 1 103 | # maxcounts[ngram] = max(maxcounts.get(ngram,0), count) 104 | 105 | def compute_cider(self): 106 | def counts2vec(cnts): 107 | """ 108 | Function maps counts of ngram to vector of tfidf weights. 109 | The function returns vec, an array of dictionary that store mapping of n-gram and tf-idf weights. 110 | The n-th entry of array denotes length of n-grams. 111 | :param cnts: 112 | :return: vec (array of dict), norm (array of float), length (int) 113 | """ 114 | vec = [defaultdict(float) for _ in range(self.n)] 115 | length = 0 116 | norm = [0.0 for _ in range(self.n)] 117 | for (ngram,term_freq) in cnts.items(): 118 | # give word count 1 if it doesn't appear in reference corpus 119 | df = np.log(max(1.0, self.document_frequency[ngram])) 120 | # ngram index 121 | n = len(ngram)-1 122 | # tf (term_freq) * idf (precomputed idf) for n-grams 123 | vec[n][ngram] = float(term_freq)*(self.ref_len - df) 124 | # compute norm for the vector. the norm will be used for computing similarity 125 | norm[n] += pow(vec[n][ngram], 2) 126 | 127 | if n == 1: 128 | length += term_freq 129 | norm = [np.sqrt(n) for n in norm] 130 | return vec, norm, length 131 | 132 | def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref): 133 | ''' 134 | Compute the cosine similarity of two vectors. 
135 | :param vec_hyp: array of dictionary for vector corresponding to hypothesis 136 | :param vec_ref: array of dictionary for vector corresponding to reference 137 | :param norm_hyp: array of float for vector corresponding to hypothesis 138 | :param norm_ref: array of float for vector corresponding to reference 139 | :param length_hyp: int containing length of hypothesis 140 | :param length_ref: int containing length of reference 141 | :return: array of score for each n-grams cosine similarity 142 | ''' 143 | delta = float(length_hyp - length_ref) 144 | # measure consine similarity 145 | val = np.array([0.0 for _ in range(self.n)]) 146 | for n in range(self.n): 147 | # ngram 148 | for (ngram,count) in vec_hyp[n].items(): 149 | # vrama91 : added clipping 150 | val[n] += min(vec_hyp[n][ngram], vec_ref[n][ngram]) * vec_ref[n][ngram] 151 | 152 | if (norm_hyp[n] != 0) and (norm_ref[n] != 0): 153 | val[n] /= (norm_hyp[n]*norm_ref[n]) 154 | 155 | assert(not math.isnan(val[n])) 156 | # vrama91: added a length based gaussian penalty 157 | val[n] *= np.e**(-(delta**2)/(2*self.sigma**2)) 158 | return val 159 | 160 | # compute log reference length 161 | self.ref_len = np.log(float(len(self.crefs))) 162 | 163 | scores = [] 164 | for test, refs in zip(self.ctest, self.crefs): 165 | # compute vector for test captions 166 | vec, norm, length = counts2vec(test) 167 | # compute vector for ref captions 168 | score = np.array([0.0 for _ in range(self.n)]) 169 | for ref in refs: 170 | vec_ref, norm_ref, length_ref = counts2vec(ref) 171 | score += sim(vec, vec_ref, norm, norm_ref, length, length_ref) 172 | # change by vrama91 - mean of ngram scores, instead of sum 173 | score_avg = np.mean(score) 174 | # divide by number of references 175 | score_avg /= len(refs) 176 | # multiply score by 10 177 | score_avg *= 10.0 178 | # append score of an image to the score list 179 | scores.append(score_avg) 180 | return scores 181 | 182 | def compute_score(self, option=None, verbose=0): 183 | # compute idf 184 | self.compute_doc_freq() 185 | # assert to check document frequency 186 | assert(len(self.ctest) >= max(self.document_frequency.values())) 187 | # compute cider score 188 | score = self.compute_cider() 189 | return np.mean(np.array(score)), np.array(score) -------------------------------------------------------------------------------- /nmtpy/cocoeval/meteor/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /nmtpy/cocoeval/meteor/meteor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Python wrapper for METEOR implementation, by Xinlei Chen 3 | # Acknowledge Michael Denkowski for the generous discussion and help 4 | 5 | import os 6 | import threading 7 | import subprocess 8 | import pkg_resources 9 | 10 | METEOR_JAR = pkg_resources.resource_filename('nmtpy', 'external/meteor-1.5.jar') 11 | 12 | class Meteor(object): 13 | def __init__(self, language, norm=False): 14 | self.meteor_cmd = ['java', '-jar', '-Xmx2G', METEOR_JAR, '-', '-', '-stdio', '-l', language] 15 | self.env = os.environ 16 | self.env['LC_ALL'] = 'en_US.UTF_8' 17 | 18 | if norm: 19 | self.meteor_cmd.append('-norm') 20 | 21 | self.meteor_p = subprocess.Popen(self.meteor_cmd, stdin=subprocess.PIPE, \ 22 | stdout=subprocess.PIPE, stderr=subprocess.PIPE, 23 | env=self.env, universal_newlines=True, bufsize=1) 24 | # Used to guarantee 
thread safety 25 | self.lock = threading.Lock() 26 | 27 | def method(self): 28 | return "METEOR" 29 | 30 | def compute_score(self, gts, res): 31 | imgIds = sorted(list(gts.keys())) 32 | scores = [] 33 | 34 | eval_line = 'EVAL' 35 | self.lock.acquire() 36 | for i in imgIds: 37 | assert(len(res[i]) == 1) 38 | 39 | hypothesis_str = res[i][0].replace('|||', '').replace(' ', ' ') 40 | score_line = ' ||| '.join(('SCORE', ' ||| '.join(gts[i]), hypothesis_str)) 41 | 42 | # We obtained --> SCORE ||| reference 1 words ||| reference n words ||| hypothesis words 43 | self.meteor_p.stdin.write(score_line + '\n') 44 | stat = self.meteor_p.stdout.readline().strip() 45 | eval_line += ' ||| {}'.format(stat) 46 | 47 | # Send to METEOR 48 | self.meteor_p.stdin.write(eval_line + '\n') 49 | 50 | # Collect segment scores 51 | for i in range(len(imgIds)): 52 | score = float(self.meteor_p.stdout.readline().strip()) 53 | scores.append(score) 54 | 55 | # Final score 56 | final_score = 100*float(self.meteor_p.stdout.readline().strip()) 57 | self.lock.release() 58 | 59 | return final_score, scores 60 | 61 | def __del__(self): 62 | self.lock.acquire() 63 | self.meteor_p.stdin.close() 64 | self.meteor_p.wait() 65 | self.lock.release() 66 | -------------------------------------------------------------------------------- /nmtpy/cocoeval/rouge/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'vrama91' 2 | -------------------------------------------------------------------------------- /nmtpy/cocoeval/rouge/rouge.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # File Name : rouge.py 3 | # 4 | # Description : Computes ROUGE-L metric as described by Lin and Hovey (2004) 5 | # 6 | # Creation Date : 2015-01-07 06:03 7 | # Author : Ramakrishna Vedantam 8 | 9 | import numpy as np 10 | 11 | def my_lcs(string, sub): 12 | """ 13 | Calculates longest common subsequence for a pair of tokenized strings 14 | :param string : list of str : tokens from a string split using whitespace 15 | :param sub : list of str : shorter string, also split using whitespace 16 | :returns: length (list of int): length of the longest common subsequence between the two strings 17 | 18 | Note: my_lcs only gives length of the longest common subsequence, not the actual LCS 19 | """ 20 | if(len(string)< len(sub)): 21 | sub, string = string, sub 22 | 23 | lengths = [[0 for i in range(0,len(sub)+1)] for j in range(0,len(string)+1)] 24 | 25 | for j in range(1,len(sub)+1): 26 | for i in range(1,len(string)+1): 27 | if(string[i-1] == sub[j-1]): 28 | lengths[i][j] = lengths[i-1][j-1] + 1 29 | else: 30 | lengths[i][j] = max(lengths[i-1][j] , lengths[i][j-1]) 31 | 32 | return lengths[len(string)][len(sub)] 33 | 34 | class Rouge(): 35 | ''' 36 | Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set 37 | 38 | ''' 39 | def __init__(self): 40 | # vrama91: updated the value below based on discussion with Hovey 41 | self.beta = 1.2 42 | 43 | def calc_score(self, candidate, refs): 44 | """ 45 | Compute ROUGE-L score given one candidate and references for an image 46 | :param candidate: str : candidate sentence to be evaluated 47 | :param refs: list of str : COCO reference sentences for the particular image to be evaluated 48 | :returns score: int (ROUGE-L score for the candidate evaluated against references) 49 | """ 50 | assert(len(candidate)==1) 51 | assert(len(refs)>0) 52 | prec = [] 53 | rec = [] 54 | 55 | 
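        # ROUGE-L: for every reference r, prec = LCS(c, r) / len(c) and
        # rec = LCS(c, r) / len(r); the score is the F-measure over the best
        # precision/recall: F = (1 + beta^2) * P * R / (R + beta^2 * P), beta = 1.2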
# split into tokens 56 | token_c = candidate[0].split(" ") 57 | 58 | for reference in refs: 59 | # split into tokens 60 | token_r = reference.split(" ") 61 | # compute the longest common subsequence 62 | lcs = my_lcs(token_r, token_c) 63 | prec.append(lcs/float(len(token_c))) 64 | rec.append(lcs/float(len(token_r))) 65 | 66 | prec_max = max(prec) 67 | rec_max = max(rec) 68 | 69 | if(prec_max!=0 and rec_max !=0): 70 | score = ((1 + self.beta**2)*prec_max*rec_max)/float(rec_max + self.beta**2*prec_max) 71 | else: 72 | score = 0.0 73 | return score 74 | 75 | def compute_score(self, gts, res): 76 | """ 77 | Computes Rouge-L score given a set of reference and candidate sentences for the dataset 78 | Invoked by evaluate_captions.py 79 | :param hypo_for_image: dict : candidate / test sentences with "image name" key and "tokenized sentences" as values 80 | :param ref_for_image: dict : reference MS-COCO sentences with "image name" key and "tokenized sentences" as values 81 | :returns: average_score: float (mean ROUGE-L score computed by averaging scores for all the images) 82 | """ 83 | score = [] 84 | for id in sorted(gts.keys()): 85 | hypo = res[id] 86 | ref = gts[id] 87 | 88 | score.append(self.calc_score(hypo, ref)) 89 | 90 | # Sanity check. 91 | assert(type(hypo) is list) 92 | assert(len(hypo) == 1) 93 | assert(type(ref) is list) 94 | assert(len(ref) > 0) 95 | 96 | average_score = np.mean(np.array(score)) 97 | return average_score, np.array(score) 98 | 99 | def method(self): 100 | return "Rouge" 101 | -------------------------------------------------------------------------------- /nmtpy/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import glob 4 | 5 | from configparser import SafeConfigParser 6 | from argparse import Namespace 7 | from ast import literal_eval 8 | 9 | def _parse_value(value): 10 | # Check for boolean or None 11 | if value.capitalize().startswith(('False', 'True', 'None')): 12 | return eval(value.capitalize(), {}, {}) 13 | 14 | # Check for path, files 15 | elif value.startswith(('~', '/', '../', './')): 16 | real_path = os.path.realpath(os.path.expanduser(value)) 17 | if '*' in real_path: 18 | # Resolve wildcards if any 19 | files = glob.glob(real_path) 20 | if len(files) == 0: 21 | raise Exception('%s did not match any file.' % value) 22 | # Return list if multiple, single file if not 23 | return sorted(files) if len(files) > 1 else files[0] 24 | else: 25 | return real_path 26 | 27 | else: 28 | # Detect strings, floats and ints 29 | try: 30 | # If this fails, this is a string 31 | literal = literal_eval(value) 32 | except Exception as ve: 33 | return value 34 | else: 35 | # Did not fail => literal is a float or int now 36 | return literal 37 | 38 | def _get_section_dict(l): 39 | """l is a list of key-value tuples returned by ConfigParser.items(). 
40 | Convert it to a dictionary after inferring value types.""" 41 | return {key : _parse_value(value) for key,value in l} 42 | 43 | def _update_dict(d, defs): 44 | """Update d with key-values from defs IF key misses from d.""" 45 | for k,v in list(defs.items()): 46 | if k not in d: 47 | d[k] = v 48 | return d 49 | 50 | class Config(SafeConfigParser, object): 51 | """Custom parser inheriting from SafeConfigParser.""" 52 | 53 | def __init__(self, filename, trdefs=None, mddefs=None, override=None): 54 | # Call parent's __init__() 55 | super(self.__class__, self).__init__() 56 | 57 | # Use values from defaults.py when missing 58 | self._trdefs = trdefs if trdefs else {} 59 | self._mddefs = mddefs if mddefs else {} 60 | 61 | # dict that will override 62 | # this can contain both model and training args unfortunately. 63 | self._override = _get_section_dict(list(override.items())) \ 64 | if override else {} 65 | 66 | # Parse the file, raise if error 67 | if len(self.read(filename)) == 0: 68 | raise Exception('Could not parse configuration file.') 69 | 70 | def parse(self): 71 | """Parse everything and return 2 Namespace objects.""" 72 | # Convert training and model sections to dictionary 73 | trdict = _get_section_dict(self.items('training')) \ 74 | if 'training' in self.sections() else {} 75 | mddict = _get_section_dict(self.items('model')) \ 76 | if 'model' in self.sections() else {} 77 | 78 | # Update parsed sections with missing defaults 79 | trdict = _update_dict(trdict, self._trdefs) 80 | mddict = _update_dict(mddict, self._mddefs) 81 | 82 | for key, value in list(self._override.items()): 83 | assert not (key in trdict and key in mddict) 84 | if key in trdict: 85 | trdict[key] = value 86 | else: 87 | # everything else goes to model args 88 | mddict[key] = value 89 | 90 | # Finally merge model.* subsections into model 91 | for section in self.sections(): 92 | if section.startswith('model.'): 93 | subsection = section.split('.')[-1] 94 | mddict[subsection] = _get_section_dict(self.items(section)) 95 | 96 | return (Namespace(**trdict), Namespace(**mddict)) 97 | -------------------------------------------------------------------------------- /nmtpy/defaults.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Default data types 4 | INT = 'int64' 5 | FLOAT = 'float32' 6 | 7 | MODEL_DEFAULTS = { 8 | 'weight_init': 'xavier', # Can be a float for the scale of normal initialization, "xavier" or "he". 9 | 'batch_size': 32, # Training batch size 10 | 'optimizer': 'adam', # adadelta, sgd, rmsprop, adam 11 | 'lrate': None, # Initial learning rate. Defaults for each optimizer is different so value 12 | # will be initialized when building optimizer if None. 
13 | } 14 | 15 | TRAIN_DEFAULTS = { 16 | 'init': None, # Pretrained model .npz file 17 | 'device_id': 'auto', # 18 | 'seed': 1234, # RNG seed 19 | 'clip_c': 5., # Clip gradients above clip_c 20 | 'decay_c': 0., # L2 penalty factor 21 | 'patience': 10, # Early stopping patience 22 | 'patience_delta': 0., # Absolute difference that will be taken into account as improvement for valid metric 23 | 'max_epochs': 100, # Max number of epochs to train 24 | 'max_iteration': int(1e6), # Max number of updates to train 25 | 'valid_metric': 'bleu', # one or more metrics separated by comma, 1st one used for early-stopping 26 | 'valid_start': 1, # Epoch which validation will start 27 | 'valid_njobs': 16, # # of parallel CPU tasks to do beam-search 28 | 'valid_beam': 12, # Allow changing beam size during validation 29 | 'valid_freq': 0, # 0: End of epochs 30 | 'valid_save_hyp': False, # Save each output of validation to separate files 31 | 'snapshot_freq': 0, # Checkpoint frequency for resuming in terms of number of iterations 32 | 'disp_freq': 10, # Display training statistics after each disp_freq minibatches 33 | 'save_best_n': 4, # Always keep a set of 4 best validation models on disk 34 | 'save_timestamp': False, # Creates a subfolder for each experiment with timestamp prefix 35 | } 36 | -------------------------------------------------------------------------------- /nmtpy/external/data/README.md: -------------------------------------------------------------------------------- 1 | Download METEOR paraphrase files to here using `scripts/get-meteor-data.sh` 2 | before running `python setup.py install` 3 | -------------------------------------------------------------------------------- /nmtpy/external/meteor-1.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lium-lst/nmtpy/dc0a1618f217d5117d6abeacdc15a22443561acf/nmtpy/external/meteor-1.5.jar -------------------------------------------------------------------------------- /nmtpy/external/multi-bleu.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 5 | # 6 | # This is a verbatim copy of the original multi-bleu.perl at: 7 | # 8 | # commit ec71c2397bb8316110efb91067dfb4c66b843cf3 9 | # Author: Ulrich Germann 10 | # Date: Tue Nov 10 01:16:17 2015 +0000 11 | # Allow multiple reference files to be specified on the command line; handle gzipped reference files. 
12 | 13 | # $Id$ 14 | use warnings; 15 | use strict; 16 | 17 | my $lowercase = 0; 18 | if ($ARGV[0] eq "-lc") { 19 | $lowercase = 1; 20 | shift; 21 | } 22 | 23 | my $stem = $ARGV[0]; 24 | if (!defined $stem) { 25 | print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n"; 26 | print STDERR "Reads the references from reference or reference0, reference1, ...\n"; 27 | exit(1); 28 | } 29 | 30 | $stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0"; 31 | 32 | my @REF; 33 | my $ref=0; 34 | while(-e "$stem$ref") { 35 | &add_to_ref("$stem$ref",\@REF); 36 | $ref++; 37 | } 38 | &add_to_ref($stem,\@REF) if -e $stem; 39 | die("ERROR: could not find reference file $stem") unless scalar @REF; 40 | 41 | # add additional references explicitly specified on the command line 42 | shift; 43 | foreach my $stem (@ARGV) { 44 | &add_to_ref($stem,\@REF) if -e $stem; 45 | } 46 | 47 | 48 | 49 | sub add_to_ref { 50 | my ($file,$REF) = @_; 51 | my $s=0; 52 | if ($file =~ /.gz$/) { 53 | open(REF,"gzip -dc $file|") or die "Can't read $file"; 54 | } else { 55 | open(REF,$file) or die "Can't read $file"; 56 | } 57 | while() { 58 | chop; 59 | push @{$$REF[$s++]}, $_; 60 | } 61 | close(REF); 62 | } 63 | 64 | my(@CORRECT,@TOTAL,$length_translation,$length_reference); 65 | my $s=0; 66 | while() { 67 | chop; 68 | $_ = lc if $lowercase; 69 | my @WORD = split; 70 | my %REF_NGRAM = (); 71 | my $length_translation_this_sentence = scalar(@WORD); 72 | my ($closest_diff,$closest_length) = (9999,9999); 73 | foreach my $reference (@{$REF[$s]}) { 74 | # print "$s $_ <=> $reference\n"; 75 | $reference = lc($reference) if $lowercase; 76 | my @WORD = split(' ',$reference); 77 | my $length = scalar(@WORD); 78 | my $diff = abs($length_translation_this_sentence-$length); 79 | if ($diff < $closest_diff) { 80 | $closest_diff = $diff; 81 | $closest_length = $length; 82 | # print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n"; 83 | } elsif ($diff == $closest_diff) { 84 | $closest_length = $length if $length < $closest_length; 85 | # from two references with the same closeness to me 86 | # take the *shorter* into account, not the "first" one. 87 | } 88 | for(my $n=1;$n<=4;$n++) { 89 | my %REF_NGRAM_N = (); 90 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 91 | my $ngram = "$n"; 92 | for(my $w=0;$w<$n;$w++) { 93 | $ngram .= " ".$WORD[$start+$w]; 94 | } 95 | $REF_NGRAM_N{$ngram}++; 96 | } 97 | foreach my $ngram (keys %REF_NGRAM_N) { 98 | if (!defined($REF_NGRAM{$ngram}) || 99 | $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) { 100 | $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram}; 101 | # print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}
\n"; 102 | } 103 | } 104 | } 105 | } 106 | $length_translation += $length_translation_this_sentence; 107 | $length_reference += $closest_length; 108 | for(my $n=1;$n<=4;$n++) { 109 | my %T_NGRAM = (); 110 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 111 | my $ngram = "$n"; 112 | for(my $w=0;$w<$n;$w++) { 113 | $ngram .= " ".$WORD[$start+$w]; 114 | } 115 | $T_NGRAM{$ngram}++; 116 | } 117 | foreach my $ngram (keys %T_NGRAM) { 118 | $ngram =~ /^(\d+) /; 119 | my $n = $1; 120 | # my $corr = 0; 121 | # print "$i e $ngram $T_NGRAM{$ngram}
\n"; 122 | $TOTAL[$n] += $T_NGRAM{$ngram}; 123 | if (defined($REF_NGRAM{$ngram})) { 124 | if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) { 125 | $CORRECT[$n] += $T_NGRAM{$ngram}; 126 | # $corr = $T_NGRAM{$ngram}; 127 | # print "$i e correct1 $T_NGRAM{$ngram}
\n"; 128 | } 129 | else { 130 | $CORRECT[$n] += $REF_NGRAM{$ngram}; 131 | # $corr = $REF_NGRAM{$ngram}; 132 | # print "$i e correct2 $REF_NGRAM{$ngram}
\n"; 133 | } 134 | } 135 | # $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram}; 136 | # print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n" 137 | } 138 | } 139 | $s++; 140 | } 141 | my $brevity_penalty = 1; 142 | my $bleu = 0; 143 | 144 | my @bleu=(); 145 | 146 | for(my $n=1;$n<=4;$n++) { 147 | if (defined ($TOTAL[$n])){ 148 | $bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0; 149 | # print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n"; 150 | }else{ 151 | $bleu[$n]=0; 152 | } 153 | } 154 | 155 | if ($length_reference==0){ 156 | printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n"; 157 | exit(1); 158 | } 159 | 160 | if ($length_translation<$length_reference) { 161 | $brevity_penalty = exp(1-$length_reference/$length_translation); 162 | } 163 | $bleu = $brevity_penalty * exp((my_log( $bleu[1] ) + 164 | my_log( $bleu[2] ) + 165 | my_log( $bleu[3] ) + 166 | my_log( $bleu[4] ) ) / 4) ; 167 | printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n", 168 | 100*$bleu, 169 | 100*$bleu[1], 170 | 100*$bleu[2], 171 | 100*$bleu[3], 172 | 100*$bleu[4], 173 | $brevity_penalty, 174 | $length_translation / $length_reference, 175 | $length_translation, 176 | $length_reference; 177 | 178 | sub my_log { 179 | return -9999999999 unless $_[0]; 180 | return log($_[0]); 181 | } 182 | -------------------------------------------------------------------------------- /nmtpy/filters.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | 4 | class Filter(object): 5 | """Common Filter class for post-processing sentences.""" 6 | def __call__(self, inp): 7 | if isinstance(inp, str): 8 | # Apply to single sentence 9 | return self.process(inp) 10 | else: 11 | # Assume a sequence and apply to each 12 | return [self.process(e) for e in inp] 13 | 14 | def process(self, s): 15 | # Derived classes should implement this method 16 | return s 17 | 18 | class CompoundFilter(Filter): 19 | """Filters out fillers from compound splitted sentences.""" 20 | def process(self, s): 21 | return s.replace(" @@ ", "").replace(" @@", "").replace(" @", "").replace("@ ", "") 22 | 23 | class BPEFilter(Filter): 24 | """Filters out fillers from BPE applied sentences.""" 25 | def process(self, s): 26 | # The first replace misses lines ending with @@ 27 | # like 'foo@@ bar Hotel@@' 28 | return s.replace("@@ ", "").replace("@@", "") 29 | 30 | class DesegmentFilter(Filter): 31 | """Converts Turkish segmentations of to normal form.""" 32 | def process(self, s): 33 | return re.sub(' *<.*?:(.*?)>', '\\1', s) 34 | 35 | class Char2Words(Filter): 36 | """Converts a space delimited character sequence to 37 | normal word form. 
The output will be non-tokenized.""" 38 | def process(self, s): 39 | return s.replace(' ', '').replace('', ' ').strip() 40 | 41 | def get_filter(name): 42 | filters = { 43 | "bpe" : BPEFilter(), 44 | "char2words" : Char2Words(), 45 | "compound" : CompoundFilter(), 46 | "desegment" : DesegmentFilter(), 47 | } 48 | return filters.get(name, None) 49 | -------------------------------------------------------------------------------- /nmtpy/iterators/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /nmtpy/iterators/bitext.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | 4 | from ..sysutils import fopen 5 | from .iterator import Iterator 6 | from .homogeneous import HomogeneousData 7 | 8 | """Parallel text iterator for translation data.""" 9 | class BiTextIterator(Iterator): 10 | def __init__(self, batch_size, seed=1234, mask=True, shuffle_mode=None, logger=None, **kwargs): 11 | super(BiTextIterator, self).__init__(batch_size, seed, mask, shuffle_mode, logger) 12 | 13 | assert 'srcfile' in kwargs, "Missing argument srcfile" 14 | assert 'trgfile' in kwargs, "Missing argument trgfile" 15 | assert 'srcdict' in kwargs, "Missing argument srcdict" 16 | assert 'trgdict' in kwargs, "Missing argument trgdict" 17 | assert batch_size > 1, "Batch size should be > 1" 18 | 19 | self._print('Shuffle mode: %s' % shuffle_mode) 20 | 21 | self.srcfile = kwargs['srcfile'] 22 | self.trgfile = kwargs['trgfile'] 23 | self.srcdict = kwargs['srcdict'] 24 | self.trgdict = kwargs['trgdict'] 25 | 26 | self.n_words_src = kwargs.get('n_words_src', 0) 27 | self.n_words_trg = kwargs.get('n_words_trg', 0) 28 | 29 | self.src_name = kwargs.get('src_name', 'x') 30 | self.trg_name = kwargs.get('trg_name', 'y') 31 | 32 | self._keys = [self.src_name] 33 | if self.mask: 34 | self._keys.append("%s_mask" % self.src_name) 35 | 36 | self._keys.append(self.trg_name) 37 | if self.mask: 38 | self._keys.append("%s_mask" % self.trg_name) 39 | 40 | def read(self): 41 | seqs = [] 42 | sf = fopen(self.srcfile, 'r') 43 | tf = fopen(self.trgfile, 'r') 44 | 45 | src_unks = 0 46 | trg_unks = 0 47 | 48 | for idx, (sline, tline) in enumerate(zip(sf, tf)): 49 | sline = sline.strip() 50 | tline = tline.strip() 51 | 52 | # Exception if empty line found 53 | if sline == "" or tline == "": 54 | continue 55 | 56 | sseq = [self.srcdict.get(w, 1) for w in sline.split(' ')] 57 | tseq = [self.trgdict.get(w, 1) for w in tline.split(' ')] 58 | 59 | # if given limit vocabulary 60 | if self.n_words_src > 0: 61 | sseq = [w if w < self.n_words_src else 1 for w in sseq] 62 | 63 | # if given limit vocabulary 64 | if self.n_words_trg > 0: 65 | tseq = [w if w < self.n_words_trg else 1 for w in tseq] 66 | 67 | src_unks += sseq.count(1) 68 | trg_unks += tseq.count(1) 69 | 70 | # Append sequences to the list 71 | seqs.append((sseq, tseq)) 72 | 73 | sf.close() 74 | tf.close() 75 | 76 | # Save sequences 77 | self._seqs = seqs 78 | self.n_unks_src = src_unks 79 | self.n_unks_trg = trg_unks 80 | 81 | # Number of training samples 82 | self.n_samples = len(self._seqs) 83 | 84 | # Set batch processor function 85 | self._process_batch = (lambda idxs: self.mask_seqs(idxs)) 86 | 87 | if self.shuffle_mode == 'trglen': 88 | # Homogeneous batches ordered by target sequence length 89 | # Get an iterator over sample idxs 90 | self._iter = HomogeneousData(self._seqs, 
self.batch_size, trg_pos=1) 91 | else: 92 | self.rewind() 93 | 94 | def rewind(self): 95 | if self.shuffle_mode != 'trglen': 96 | # Fill in the _idxs list for sample order 97 | if self.shuffle_mode == 'simple': 98 | # Simple shuffle 99 | self._idxs = np.random.permutation(self.n_samples).tolist() 100 | elif self.shuffle_mode is None: 101 | # Ordered 102 | self._idxs = np.arange(self.n_samples).tolist() 103 | 104 | self._iter = [] 105 | for i in range(0, self.n_samples, self.batch_size): 106 | self._iter.append(self._idxs[i:i + self.batch_size]) 107 | self._iter = iter(self._iter) 108 | 109 | def mask_seqs(self, idxs): 110 | """Prepares a list of padded tensors with their masks for the given sample idxs.""" 111 | src, src_mask = Iterator.mask_data([self._seqs[i][0] for i in idxs]) 112 | trg, trg_mask = Iterator.mask_data([self._seqs[i][1] for i in idxs]) 113 | return (src, src_mask, trg, trg_mask) 114 | -------------------------------------------------------------------------------- /nmtpy/iterators/fusion.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pickle 3 | import numpy as np 4 | 5 | from ..sysutils import listify 6 | from ..nmtutils import sent_to_idx 7 | from .iterator import Iterator 8 | from .homogeneous import HomogeneousData 9 | from ..defaults import INT, FLOAT 10 | 11 | # This is an iterator specifically to be used by the .pkl 12 | # corpora files created for WMT17 Shared Task on Multimodal Machine Translation 13 | # Each element of the list that is pickled is in the following format: 14 | # [src_split_idx, trg_split_idx, imgid, imgname, src_words, trg_words] 15 | 16 | # Shorthand for positional access 17 | SSPLIT, TSPLIT, IMGID, IMGNAME, STOKENS, TTOKENS = range(6) 18 | 19 | class FusionIterator(Iterator): 20 | def __init__(self, batch_size, seed=1234, mask=True, shuffle_mode=None, logger=None, **kwargs): 21 | super(FusionIterator, self).__init__(batch_size, seed, mask, shuffle_mode, logger) 22 | 23 | assert 'pklfile' in kwargs, "Missing argument pklfile" 24 | 25 | # pkl file containing the data 26 | self.pklfile = kwargs['pklfile'] 27 | 28 | # Don't use mask when batch_size == 1 which means we're doing 29 | # translation with nmt-translate 30 | if self.batch_size == 1: 31 | self.mask = False 32 | 33 | # Will be set after reading the data 34 | self.src_avail = False 35 | self.trg_avail = False 36 | 37 | # Source word dictionary 38 | # This may not be available in image captioning 39 | self.srcdict = kwargs.get('srcdict', None) 40 | # This may not be available during validation 41 | self.trgdict = kwargs.get('trgdict', None) 42 | 43 | # Short-list sizes 44 | self.n_words_src = kwargs.get('n_words_src', 0) 45 | self.n_words_trg = kwargs.get('n_words_trg', 0) 46 | 47 | # How do we refer to symbolic data variables? 
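        # (these names become the keys of the OrderedDict minibatches yielded by
        #  the iterator, along with the derived '<name>_mask' / '<name>_img' keys)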
48 | self.src_name = kwargs.get('src_name', 'x') 49 | self.trg_name = kwargs.get('trg_name', 'y') 50 | 51 | # Image features file 52 | # (n_samples, flattened_spatial, n_maps) 53 | self.imgfile = kwargs.get('imgfile', None) 54 | 55 | if self.srcdict: 56 | self._keys = [self.src_name] 57 | if self.mask: 58 | self._keys.append("%s_mask" % self.src_name) 59 | 60 | # We have images in the middle 61 | if self.imgfile: 62 | self._keys.append("%s_img" % self.src_name) 63 | 64 | if self.trgdict: 65 | self._keys.append(self.trg_name) 66 | if self.mask: 67 | self._keys.append("%s_mask" % self.trg_name) 68 | 69 | def read(self): 70 | # Load image features file if any 71 | if self.imgfile is not None: 72 | self._print('Loading image file...') 73 | self.img_feats = np.load(self.imgfile) 74 | 75 | # Move n_samples to middle dimension 76 | # -> 196 x n_samples x 1024 for res4f_relu 77 | self.img_feats = self.img_feats.transpose(1, 0, 2) 78 | 79 | # (w*h, n, c) 80 | self.img_shape = tuple((self.img_feats.shape[0], -1, self.img_feats.shape[-1])) 81 | self._print('Done.') 82 | 83 | # Load the corpora 84 | with open(self.pklfile, 'rb') as f: 85 | self._print('Loading pkl file...') 86 | self._seqs = pickle.load(f) 87 | self._print('Done.') 88 | 89 | # Introspect the pickle by looking the first sample 90 | ss = self._seqs[0] 91 | 92 | # we may not have them in pickle or we may not 93 | # want to use target sentences by giving its vocab None 94 | if ss[TTOKENS] is not None and self.trgdict: 95 | self.trg_avail = True 96 | 97 | # Same for source side 98 | if ss[STOKENS] is not None and self.srcdict: 99 | self.src_avail = True 100 | 101 | # We now have a list of samples 102 | self.n_samples = len(self._seqs) 103 | 104 | # Depending on mode, we can have multiple sentences per image so 105 | # let's store the number of actual images as well. 
106 | # n_unique_samples <= n_samples 107 | self.n_unique_images = len(set([s[IMGNAME] for s in self._seqs])) 108 | 109 | # Some statistics 110 | total_src_words = [] 111 | total_trg_words = [] 112 | 113 | # Let's map the sentences once to idx's 114 | for sample in self._seqs: 115 | if self.src_avail: 116 | sample[STOKENS] = sent_to_idx(self.srcdict, sample[STOKENS], self.n_words_src) 117 | total_src_words.extend(sample[STOKENS]) 118 | if self.trg_avail: 119 | sample[TTOKENS] = sent_to_idx(self.trgdict, sample[TTOKENS], self.n_words_trg) 120 | total_trg_words.extend(sample[TTOKENS]) 121 | 122 | if self.src_avail: 123 | self.unk_src = total_src_words.count(1) 124 | self.total_src_words = len(total_src_words) 125 | if self.trg_avail: 126 | self.unk_trg = total_trg_words.count(1) 127 | self.total_trg_words = len(total_trg_words) 128 | 129 | # Set batch processor function 130 | # idxs can be a list of single element as well 131 | self._process_batch = (lambda idxs: self.mask_seqs(idxs)) 132 | 133 | # Homogeneous batches ordered by target sequence length 134 | # Get an iterator over sample idxs 135 | if self.batch_size > 1: 136 | # Training 137 | self._iter = HomogeneousData(self._seqs, self.batch_size, trg_pos=TTOKENS) 138 | else: 139 | # Test-set 140 | self._iter = iter([[i] for i in np.arange(self.n_samples)]) 141 | 142 | def mask_seqs(self, idxs): 143 | """Pad if necessary and return padded batches or single samples.""" 144 | data = [] 145 | 146 | # Let's fetch batch samples first 147 | batch = [self._seqs[i] for i in idxs] 148 | 149 | if self.src_avail: 150 | data += Iterator.mask_data([b[STOKENS] for b in batch], get_mask=self.mask) 151 | 152 | # Source image features 153 | if self.imgfile is not None: 154 | x_img = self.img_feats[:, [b[IMGID] for b in batch], :] 155 | 156 | # Reshape accordingly 157 | x_img.shape = self.img_shape 158 | data += [x_img] 159 | 160 | if self.trg_avail: 161 | data += Iterator.mask_data([b[TTOKENS] for b in batch], get_mask=self.mask) 162 | 163 | return data 164 | 165 | def rewind(self): 166 | # Done automatically within homogeneous iterator 167 | pass 168 | -------------------------------------------------------------------------------- /nmtpy/iterators/homogeneous.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import copy 4 | 5 | # Iterator that randomly fetches samples with same target 6 | # length to be efficient in terms of RNN underlyings. 
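# Bucketing minibatches by target length means each batch needs little or no
# padding, so fewer masked timesteps are wasted during training.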
7 | # Code from https://github.com/kelvinxu/arctic-captions 8 | class HomogeneousData(object): 9 | def __init__(self, data, batch_size, trg_pos): 10 | self.batch_size = batch_size 11 | self.data = data 12 | self.trg_pos = trg_pos 13 | 14 | self.prepare() 15 | self.reset() 16 | 17 | def prepare(self): 18 | # find all target sequence lengths 19 | self.lengths = [len(cc[self.trg_pos]) for cc in self.data] 20 | 21 | # Compute unique lengths 22 | self.len_unique = np.unique(self.lengths) 23 | 24 | # indices of unique lengths 25 | self.len_indices = dict() 26 | self.len_counts = dict() 27 | 28 | # For each length, find the sample idxs and their counts 29 | for ll in self.len_unique: 30 | self.len_indices[ll] = np.where(self.lengths == ll)[0] 31 | self.len_counts[ll] = len(self.len_indices[ll]) 32 | 33 | def reset(self): 34 | self.len_curr_counts = copy.copy(self.len_counts) 35 | 36 | # Randomize length order 37 | self.len_unique = np.random.permutation(self.len_unique) 38 | self.len_indices_pos = dict() 39 | for ll in self.len_unique: 40 | # Randomize sample order for a specific length 41 | self.len_indices[ll] = np.random.permutation(self.len_indices[ll]) 42 | # Set initial position for this length to 0 43 | self.len_indices_pos[ll] = 0 44 | 45 | self.len_idx = -1 46 | 47 | def __next__(self): 48 | fin_unique_len = 0 49 | while True: 50 | # What is the length idx for this batch? 51 | self.len_idx = (self.len_idx + 1) % len(self.len_unique) 52 | # Current candidate length 53 | self.cur_len = self.len_unique[self.len_idx] 54 | # Do we have samples left for this length? 55 | if self.len_curr_counts[self.cur_len] > 0: 56 | break 57 | 58 | # All samples for a length exhausted, increment counter 59 | fin_unique_len += 1 60 | 61 | # Is this the end for this epoch? 
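            # (the same condition is checked again right after this loop in order
            #  to reset the iterator state and raise StopIteration)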
62 | if fin_unique_len >= len(self.len_unique): 63 | break 64 | 65 | # All data consumed 66 | if fin_unique_len >= len(self.len_unique): 67 | self.reset() 68 | raise StopIteration() 69 | 70 | # batch_size or what is left for this length 71 | curr_batch_size = np.minimum(self.batch_size, self.len_curr_counts[self.cur_len]) 72 | # Get current position for the batch 73 | curr_pos = self.len_indices_pos[self.cur_len] 74 | 75 | # get the indices for the current batch 76 | curr_indices = self.len_indices[self.cur_len][curr_pos:curr_pos+curr_batch_size] 77 | 78 | # Increment next position 79 | self.len_indices_pos[self.cur_len] += curr_batch_size 80 | # Decrement used sample count 81 | self.len_curr_counts[self.cur_len] -= curr_batch_size 82 | 83 | # Return batch indices from here 84 | return curr_indices 85 | 86 | def __iter__(self): 87 | return self 88 | -------------------------------------------------------------------------------- /nmtpy/iterators/iterator.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import random 3 | 4 | from abc import ABCMeta, abstractmethod 5 | from collections import OrderedDict 6 | 7 | import numpy as np 8 | from ..defaults import INT, FLOAT 9 | 10 | class Iterator(object, metaclass=ABCMeta): 11 | """Base Iterator class.""" 12 | 13 | @staticmethod 14 | def mask_data(seqs, get_mask=True): 15 | """Pads sequences with EOS (0) for minibatch processing.""" 16 | lengths = [len(s) for s in seqs] 17 | maxlen = np.max(lengths) + 1 18 | 19 | # Shape is (t_steps, samples) 20 | x = np.zeros((maxlen, len(seqs))).astype(INT) 21 | x_mask = np.zeros_like(x).astype(FLOAT) 22 | 23 | for idx, s_x in enumerate(seqs): 24 | x[:lengths[idx], idx] = s_x 25 | x_mask[:lengths[idx] + 1, idx] = 1. 26 | 27 | if get_mask: 28 | return [x, x_mask] 29 | else: 30 | return [x] 31 | 32 | def _print(self, msg): 33 | if self._logger: 34 | self._logger.info(msg) 35 | 36 | def __init__(self, batch_size, seed=1234, mask=True, shuffle_mode=None, logger=None): 37 | self.n_samples = 0 38 | self.seed = seed 39 | self.mask = mask 40 | self._logger = logger 41 | self.batch_size = batch_size 42 | self._keys = [] 43 | self._idxs = [] 44 | self._seqs = [] 45 | self._iter = None 46 | self._minibatches = [] 47 | 48 | self.shuffle_mode = shuffle_mode 49 | if self.shuffle_mode: 50 | # Set random seed 51 | random.seed(self.seed) 52 | 53 | # This can be set by child classes for processing 54 | # a list of idxs into the actual minibatch 55 | self._process_batch = lambda x: x 56 | 57 | def __len__(self): 58 | """Returns number of samples.""" 59 | return self.n_samples 60 | 61 | def __iter__(self): 62 | return self 63 | 64 | def __next__(self): 65 | """Returns the next set of data from the iterator.""" 66 | try: 67 | data = self._process_batch(next(self._iter)) 68 | except StopIteration as si: 69 | self.rewind() 70 | raise 71 | else: 72 | # Lookup the keys and return an ordered dict of the current minibatch 73 | return OrderedDict([(k, data[i]) for i,k in enumerate(self._keys)]) 74 | 75 | # May or may not be used. 
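    # Subclasses that precompute padded minibatches (e.g. TextIterator) override
    # this; the other iterators build their batch order directly in read()/rewind().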
76 | def prepare_batches(self): 77 | """Prepare self.__iter.""" 78 | pass 79 | 80 | @abstractmethod 81 | def read(self): 82 | """Read the data and put in into self.__seqs.""" 83 | pass 84 | 85 | @abstractmethod 86 | def rewind(self): 87 | pass 88 | -------------------------------------------------------------------------------- /nmtpy/iterators/mnmt.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pickle 3 | import numpy as np 4 | 5 | from ..sysutils import listify 6 | from ..nmtutils import sent_to_idx 7 | from .iterator import Iterator 8 | from .homogeneous import HomogeneousData 9 | from ..defaults import INT, FLOAT 10 | 11 | # This is an iterator specifically to be used by the .pkl 12 | # corpora files created for WMT17 Shared Task on Multimodal Machine Translation 13 | # Each element of the list that is pickled is in the following format: 14 | # [src_split_idx, trg_split_idx, imgid, imgname, src_words, trg_words] 15 | 16 | # Shorthand for positional access 17 | SSPLIT, TSPLIT, IMGID, IMGNAME, STOKENS, TTOKENS = range(6) 18 | 19 | class MNMTIterator(Iterator): 20 | def __init__(self, batch_size, seed=1234, mask=True, shuffle_mode=None, logger=None, **kwargs): 21 | super(MNMTIterator, self).__init__(batch_size, seed, mask, shuffle_mode, logger) 22 | 23 | assert 'pklfile' in kwargs, "Missing argument pklfile" 24 | 25 | # pkl file containing the data 26 | self.pklfile = kwargs['pklfile'] 27 | 28 | self._print('Shuffle mode: %s' % shuffle_mode) 29 | 30 | # Don't use mask when batch_size == 1 which means we're doing 31 | # translation with nmt-translate 32 | if self.batch_size == 1: 33 | self.mask = False 34 | 35 | # Will be set after reading the data 36 | self.src_avail = False 37 | self.trg_avail = False 38 | 39 | # Source word dictionary 40 | # This may not be available in image captioning 41 | self.srcdict = kwargs.get('srcdict', None) 42 | # This may not be available during validation 43 | self.trgdict = kwargs.get('trgdict', None) 44 | 45 | # Short-list sizes 46 | self.n_words_src = kwargs.get('n_words_src', 0) 47 | self.n_words_trg = kwargs.get('n_words_trg', 0) 48 | 49 | # How do we refer to symbolic data variables? 
50 | self.src_name = kwargs.get('src_name', 'x') 51 | self.trg_name = kwargs.get('trg_name', 'y') 52 | 53 | # Image features file 54 | # (n_samples, flattened_spatial, n_maps) 55 | self.imgfile = kwargs.get('imgfile', None) 56 | 57 | def read(self): 58 | # Load image features file if any 59 | if self.imgfile is not None: 60 | self._print('Loading image file...') 61 | self.img_feats = np.load(self.imgfile) 62 | 63 | # Load the corpora 64 | with open(self.pklfile, 'rb') as f: 65 | self._print('Loading pkl file...') 66 | self._seqs = pickle.load(f) 67 | 68 | # Introspect the pickle by looking the first sample 69 | ss = self._seqs[0] 70 | 71 | # we may not have them in pickle or we may not 72 | # want to use target sentences by giving its vocab None 73 | if ss[TTOKENS] is not None and self.trgdict: 74 | self.trg_avail = True 75 | 76 | # Same for source side 77 | if ss[STOKENS] is not None and self.srcdict: 78 | self.src_avail = True 79 | 80 | if self.src_avail: 81 | self._keys = [self.src_name] 82 | if self.mask: 83 | self._keys.append("%s_mask" % self.src_name) 84 | 85 | # We have images in the middle 86 | if self.imgfile: 87 | self._keys.append("%s_img" % self.src_name) 88 | 89 | if self.trg_avail: 90 | self._keys.append(self.trg_name) 91 | if self.mask: 92 | self._keys.append("%s_mask" % self.trg_name) 93 | 94 | # We now have a list of samples 95 | self.n_samples = len(self._seqs) 96 | 97 | # Depending on mode, we can have multiple sentences per image so 98 | # let's store the number of actual images as well. 99 | # n_unique_samples <= n_samples 100 | self.n_unique_images = len(set([s[IMGNAME] for s in self._seqs])) 101 | 102 | # Some statistics 103 | total_src_words = [] 104 | total_trg_words = [] 105 | 106 | # Let's map the sentences once to idx's 107 | for sample in self._seqs: 108 | if self.src_avail: 109 | sample[STOKENS] = sent_to_idx(self.srcdict, sample[STOKENS], self.n_words_src) 110 | total_src_words.extend(sample[STOKENS]) 111 | if self.trg_avail: 112 | sample[TTOKENS] = sent_to_idx(self.trgdict, sample[TTOKENS], self.n_words_trg) 113 | total_trg_words.extend(sample[TTOKENS]) 114 | 115 | if self.src_avail: 116 | self.n_unks_src = total_src_words.count(1) 117 | self.total_src_words = len(total_src_words) 118 | if self.trg_avail: 119 | self.n_unks_trg = total_trg_words.count(1) 120 | self.total_trg_words = len(total_trg_words) 121 | 122 | # Set batch processor function 123 | # idxs can be a list of single element as well 124 | self._process_batch = (lambda idxs: self.mask_seqs(idxs)) 125 | 126 | # Homogeneous batches ordered by target sequence length 127 | # Get an iterator over sample idxs 128 | if self.batch_size > 1 and self.shuffle_mode == 'trglen': 129 | # Training 130 | self._iter = HomogeneousData(self._seqs, self.batch_size, trg_pos=TTOKENS) 131 | else: 132 | # Handles both bsize = 1 and > 1. 
Test-set mode 133 | self._idxs = np.arange(self.n_samples) 134 | self._iter = [] 135 | for i in range(0, self.n_samples, self.batch_size): 136 | self._iter.append(self._idxs[i:i + self.batch_size]) 137 | self._iter = iter(self._iter) 138 | 139 | def mask_seqs(self, idxs): 140 | """Pad if necessary and return padded batches or single samples.""" 141 | data = [] 142 | 143 | # Let's fetch batch samples first 144 | batch = [self._seqs[i] for i in idxs] 145 | 146 | if self.src_avail: 147 | data += Iterator.mask_data([b[STOKENS] for b in batch], get_mask=self.mask) 148 | 149 | # Source image features 150 | if self.imgfile is not None: 151 | x_img = self.img_feats[[b[IMGID] for b in batch], ...] 152 | data += [x_img] 153 | 154 | if self.trg_avail: 155 | data += Iterator.mask_data([b[TTOKENS] for b in batch], get_mask=self.mask) 156 | 157 | return data 158 | 159 | def rewind(self): 160 | if self.shuffle_mode != 'trglen': 161 | # Handles both bsize = 1 and > 1. Test-set mode 162 | self._idxs = np.arange(self.n_samples) 163 | self._iter = [] 164 | for i in range(0, self.n_samples, self.batch_size): 165 | self._iter.append(self._idxs[i:i + self.batch_size]) 166 | self._iter = iter(self._iter) 167 | -------------------------------------------------------------------------------- /nmtpy/iterators/text.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | 4 | from ..sysutils import fopen 5 | from .iterator import Iterator 6 | 7 | """Text iterator for monolingual data.""" 8 | class TextIterator(Iterator): 9 | def __init__(self, batch_size, seed=1234, mask=True, shuffle_mode=None, logger=None, **kwargs): 10 | super(TextIterator, self).__init__(batch_size, seed, mask, shuffle_mode, logger) 11 | 12 | assert 'file' in kwargs, "Missing argument file" 13 | assert 'dict' in kwargs, "Missing argument dict" 14 | 15 | self.__file = kwargs['file'] 16 | self.__dict = kwargs['dict'] 17 | self.__n_words = kwargs.get('n_words', 0) 18 | self.name = kwargs.get('name', 'x') 19 | 20 | self._keys = [self.name] 21 | if self.mask: 22 | self._keys.append('%s_mask' % self.name) 23 | 24 | def read(self): 25 | seqs = [] 26 | with fopen(self.__file, 'r') as f: 27 | for idx, line in enumerate(f): 28 | line = line.strip() 29 | 30 | # Skip empty lines 31 | if line == "": 32 | print('Warning: empty line in %s' % self.__file) 33 | else: 34 | line = line.split(" ") 35 | 36 | seq = [self.__dict.get(w, 1) for w in line] 37 | 38 | # if given limit vocabulary 39 | if self.__n_words > 0: 40 | seq = [w if w < self.__n_words else 1 for w in seq] 41 | # Append the sequence 42 | seqs += [seq] 43 | 44 | self._seqs = seqs 45 | self.n_samples = len(self._seqs) 46 | self._idxs = np.arange(self.n_samples) 47 | 48 | if not self._minibatches: 49 | self.prepare_batches() 50 | self.rewind() 51 | 52 | def prepare_batches(self): 53 | self._minibatches = [] 54 | 55 | for i in range(0, self.n_samples, self.batch_size): 56 | batch_idxs = self._idxs[i:i + self.batch_size] 57 | x, x_mask = Iterator.mask_data([self._seqs[i] for i in batch_idxs]) 58 | self._minibatches.append((x, x_mask)) 59 | 60 | def rewind(self): 61 | """Recreate the iterator.""" 62 | if self.shuffle_mode == 'simple': 63 | self._idxs = np.random.permutation(self.n_samples) 64 | self.prepare_batches() 65 | 66 | self._iter = iter(self._minibatches) 67 | -------------------------------------------------------------------------------- /nmtpy/iterators/wmt.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pickle 3 | import numpy as np 4 | 5 | from ..nmtutils import sent_to_idx 6 | from .iterator import Iterator 7 | from .homogeneous import HomogeneousData 8 | 9 | # This is an iterator specifically to be used by the .pkl 10 | # corpora files created for WMT16 Shared Task on Multimodal Machine Translation 11 | # Each element of the list that is pickled is in the following format: 12 | # [src_split_idx, trg_split_idx, imgid, imgname, src_words, trg_words] 13 | 14 | class WMTIterator(Iterator): 15 | def __init__(self, batch_size, seed=1234, mask=True, shuffle_mode=None, logger=None, **kwargs): 16 | super(WMTIterator, self).__init__(batch_size, seed, mask, shuffle_mode, logger) 17 | 18 | assert 'pklfile' in kwargs, "Missing argument pklfile" 19 | assert 'srcdict' in kwargs, "Missing argument srcdict" 20 | 21 | self._print('Shuffle mode: %s' % shuffle_mode) 22 | 23 | # Short-list sizes 24 | self.n_words_src = kwargs.get('n_words_src', 0) 25 | self.n_words_trg = kwargs.get('n_words_trg', 0) 26 | 27 | # How do we refer to symbolic data variables? 28 | self.src_name = kwargs.get('src_name', 'x') 29 | self.trg_name = kwargs.get('trg_name', 'y') 30 | 31 | # How do we use the multimodal data? (Numbers in parens are for Task 2) 32 | # 'all' : All combinations (~725K parallel) 33 | # 'single' : Take only the first pair e.g., train0.en->train0.de (~29K parallel) 34 | # 'pairs' : Take only one-to-one pairs e.g., train_i.en->train_i.de (~145K parallel) 35 | self.mode = kwargs.get('mode', 'pairs') 36 | 37 | # pkl file which contains a list of samples 38 | self.pklfile = kwargs['pklfile'] 39 | # Resnet-50 image features file 40 | self.imgfile = kwargs.get('imgfile', None) 41 | self.img_avail = self.imgfile is not None 42 | 43 | self.trg_avail = False 44 | 45 | # Source word dictionary and short-list limit 46 | # This may not be available if the task is image -> description (Not implemented) 47 | self.srcdict = kwargs['srcdict'] 48 | # This may not be available during validation 49 | self.trgdict = kwargs.get('trgdict', None) 50 | 51 | # Don't use mask when batch_size == 1 which means we're doing 52 | # translation with nmt-translate 53 | if self.batch_size == 1: 54 | self.mask = False 55 | 56 | self._keys = [self.src_name] 57 | if self.mask: 58 | self._keys.append("%s_mask" % self.src_name) 59 | 60 | # We have images in the middle 61 | if self.imgfile: 62 | self._keys.append("%s_img" % self.src_name) 63 | 64 | # Target may not be available during validation 65 | if self.trgdict: 66 | self._keys.append(self.trg_name) 67 | if self.mask: 68 | self._keys.append("%s_mask" % self.trg_name) 69 | 70 | def read(self): 71 | # Load image features file if any 72 | if self.img_avail: 73 | self._print('Loading image file...') 74 | self.img_feats = np.load(self.imgfile) 75 | self._print('Done.') 76 | 77 | # Load the corpora 78 | with open(self.pklfile, 'rb') as f: 79 | self._print('Loading pkl file...') 80 | self._seqs = pickle.load(f) 81 | self._print('Done.') 82 | 83 | # Check for what is available 84 | ss = self._seqs[0] 85 | # If no split idxs are found, its Task 1, set mode to 'all' 86 | if ss[0] is None and ss[1] is None: 87 | self.mode = 'all' 88 | 89 | if ss[5] is not None and self.trgdict: 90 | self.trg_avail = True 91 | 92 | if self.mode == 'single': 93 | # Just take the first src-trg pair. 
Useful for validation 94 | if ss[1] is not None: 95 | self._seqs = [s for s in self._seqs if (s[0] == s[1] == 0)] 96 | else: 97 | self._seqs = [s for s in self._seqs if (s[0] == 0)] 98 | 99 | elif ss[1] is not None and self.mode == 'pairs': 100 | # Take the pairs with split idx's equal 101 | self._seqs = [s for s in self._seqs if s[0] == s[1]] 102 | 103 | # We now have a list of samples 104 | self.n_samples = len(self._seqs) 105 | 106 | # Depending on mode, we can have multiple sentences per image so 107 | # let's store the number of actual images as well. 108 | # n_unique_samples <= n_samples 109 | self.n_unique_images = len(set([s[3] for s in self._seqs])) 110 | 111 | # Some statistics 112 | total_src_words = [] 113 | total_trg_words = [] 114 | 115 | # Let's map the sentences once to idx's 116 | for sample in self._seqs: 117 | sample[4] = sent_to_idx(self.srcdict, sample[4], self.n_words_src) 118 | total_src_words.extend(sample[4]) 119 | if self.trg_avail: 120 | sample[5] = sent_to_idx(self.trgdict, sample[5], self.n_words_trg) 121 | total_trg_words.extend(sample[5]) 122 | 123 | self.unk_src = total_src_words.count(1) 124 | self.unk_trg = total_trg_words.count(1) 125 | self.total_src_words = len(total_src_words) 126 | self.total_trg_words = len(total_trg_words) 127 | 128 | ######################### 129 | # Prepare iteration stuff 130 | ######################### 131 | # Set batch processor function 132 | if self.batch_size == 1: 133 | self._process_batch = (lambda idxs: self.process_single(idxs[0])) 134 | else: 135 | self._process_batch = (lambda idxs: self.mask_seqs(idxs)) 136 | 137 | if self.shuffle_mode == 'trglen': 138 | # Homogeneous batches ordered by target sequence length 139 | # Get an iterator over sample idxs 140 | self._iter = HomogeneousData(self._seqs, self.batch_size, trg_pos=5) 141 | else: 142 | # For once keep it ordered 143 | self._idxs = np.arange(self.n_samples).tolist() 144 | self._iter = [] 145 | for i in range(0, self.n_samples, self.batch_size): 146 | self._iter.append(self._idxs[i:i + self.batch_size]) 147 | self._iter = iter(self._iter) 148 | 149 | def process_single(self, idx): 150 | data, _ = Iterator.mask_data([self._seqs[idx][4]]) 151 | data = [data] 152 | if self.img_avail: 153 | # Do this 196 x 1024 154 | data += [self.img_feats[self._seqs[idx][2]][:, None, :]] 155 | if self.trg_avail: 156 | trg, _ = Iterator.mask_data([self._seqs[idx][5]]) 157 | data.append(trg) 158 | return data 159 | 160 | def mask_seqs(self, idxs): 161 | """Prepares a list of padded tensors with their masks for the given sample idxs.""" 162 | data = list(Iterator.mask_data([self._seqs[i][4] for i in idxs])) 163 | # Source image features 164 | if self.img_avail: 165 | img_idxs = [self._seqs[i][2] for i in idxs] 166 | 167 | # Do this 196 x bsize x 1024 168 | x_img = self.img_feats[img_idxs].transpose(1, 0, 2) 169 | data += [x_img] 170 | 171 | if self.trg_avail: 172 | data += list(Iterator.mask_data([self._seqs[i][5] for i in idxs])) 173 | 174 | return data 175 | 176 | def rewind(self): 177 | if self.shuffle_mode != 'trglen': 178 | # Fill in the _idxs list for sample order 179 | if self.shuffle_mode == 'simple': 180 | # Simple shuffle 181 | self._idxs = np.random.permutation(self.n_samples).tolist() 182 | elif self.shuffle_mode is None: 183 | # Ordered 184 | self._idxs = np.arange(self.n_samples).tolist() 185 | 186 | self._iter = [] 187 | for i in range(0, self.n_samples, self.batch_size): 188 | self._iter.append(self._idxs[i:i + self.batch_size]) 189 | self._iter = iter(self._iter) 190 | 
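Usage sketch (not part of the package): the iterators above are all driven through the same protocol -- construct with keyword arguments, call read() once, then iterate to obtain one OrderedDict per minibatch, keyed by the names derived from src_name/trg_name. The paths, vocabularies and feature shapes below are hypothetical placeholders.

from nmtpy.iterators.wmt import WMTIterator

# Hypothetical word->index vocabularies (0 is <eos>, 1 is <unk>)
src_vocab = {'<eos>': 0, '<unk>': 1, 'a': 2, 'man': 3}
trg_vocab = {'<eos>': 0, '<unk>': 1, 'ein': 2, 'mann': 3}

train_it = WMTIterator(batch_size=32,
                       shuffle_mode='trglen',               # homogeneous batches by target length
                       pklfile='task2-train.pkl',           # hypothetical WMT16 Task 2 pickle
                       imgfile='task2-train-resnet50.npy',  # hypothetical (n_samples, 196, 1024) features
                       srcdict=src_vocab, trgdict=trg_vocab,
                       mode='pairs')
train_it.read()    # load pickle + features, map tokens to indices, build the iterator

for batch in train_it:                         # one OrderedDict per minibatch
    x, x_mask = batch['x'], batch['x_mask']    # (src_steps, bsize) indices / float mask
    x_img     = batch['x_img']                 # (196, bsize, 1024) image annotations
    y, y_mask = batch['y'], batch['y_mask']    # (trg_steps, bsize)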
-------------------------------------------------------------------------------- /nmtpy/logger.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | def singleton(cls): 5 | instances = {} 6 | def get_instance(): 7 | if cls not in instances: 8 | instances[cls] = cls() 9 | return instances[cls] 10 | return get_instance() 11 | 12 | @singleton 13 | class Logger(object): 14 | """Logs to stdout and to file simultaneously.""" 15 | def __init__(self): 16 | pass 17 | 18 | def setup(self, log_file=None, timestamp=True): 19 | _format = '%(message)s' 20 | if timestamp: 21 | _format = '%(asctime)s ' + _format 22 | 23 | self.formatter = logging.Formatter(_format) 24 | self._logger = logging.getLogger('nmtpy') 25 | self._logger.setLevel(logging.DEBUG) 26 | self._ch = logging.StreamHandler() 27 | self._ch.setFormatter(self.formatter) 28 | self._logger.addHandler(self._ch) 29 | 30 | if log_file: 31 | self._fh = logging.FileHandler(log_file, mode='w') 32 | self._fh.setFormatter(self.formatter) 33 | self._logger.addHandler(self._fh) 34 | 35 | def get(self): 36 | return self._logger 37 | -------------------------------------------------------------------------------- /nmtpy/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import operator 4 | import numpy as np 5 | 6 | from .bleu import MultiBleuScorer 7 | from .meteor import METEORScorer 8 | from .factors2wordbleu import Factors2word 9 | from .mtevalbleu import MTEvalV13aBLEUScorer 10 | from .external import ExternalScorer 11 | 12 | comparators = { 13 | 'bleu' : (max, operator.gt, 0), 14 | 'bleu_v13a' : (max, operator.gt, 0), 15 | 'meteor' : (max, operator.gt, 0), 16 | 'cider' : (max, operator.gt, 0), 17 | 'rouge' : (max, operator.gt, 0), 18 | 'loss' : (min, operator.lt, -1), 19 | 'ter' : (min, operator.lt, -1), 20 | } 21 | 22 | def get_scorer(scorer): 23 | scorers = { 24 | 'meteor' : METEORScorer, 25 | 'bleu' : MultiBleuScorer, 26 | 'bleu_v13a' : MTEvalV13aBLEUScorer, 27 | 'factors2word': Factors2word, 28 | } 29 | 30 | if scorer in scorers: 31 | # A defined metric 32 | return scorers[scorer]() 33 | elif scorer.startswith(('/', '~')): 34 | # External script 35 | return ExternalScorer(os.path.expanduser(scorer)) 36 | 37 | def is_last_best(name, history, min_delta): 38 | """Checks whether the last element is the best score so far 39 | by taking into account an absolute improvement threshold min_delta.""" 40 | if len(history) == 1: 41 | # If first validation, return True to save it 42 | return True 43 | 44 | new_value = history[-1] 45 | 46 | # bigger is better 47 | if name.startswith(('bleu', 'meteor', 'cider', 'rouge')): 48 | cur_best = max(history[:-1]) 49 | return new_value > cur_best and abs(new_value - cur_best) >= (min_delta - 1e-5) 50 | # lower is better 51 | elif name in ['loss', 'px', 'ter']: 52 | cur_best = min(history[:-1]) 53 | return new_value < cur_best and abs(new_value - cur_best) >= (min_delta - 1e-5) 54 | 55 | def find_best(name, history): 56 | """Returns the best idx and value for the given metric.""" 57 | history = np.array(history) 58 | if name.startswith(('bleu', 'meteor', 'cider', 'rouge')): 59 | best_idx = np.argmax(history) 60 | elif name in ['loss', 'px', 'ter']: 61 | best_idx = np.argmin(history) 62 | 63 | # Validation periods start from 1 64 | best_val = history[best_idx] 65 | return (best_idx + 1), best_val 66 | 
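A small sketch of how these helpers behave (the scores are made-up values and the external script path is a hypothetical example, not shipped with nmtpy):

from nmtpy.metrics import get_scorer, is_last_best, find_best

history = [24.1, 24.9, 25.3]                   # e.g. BLEU after each validation

# BLEU-like metrics are "bigger is better"; ask for >= 0.2 absolute improvement
if is_last_best('bleu', history, 0.2):         # 25.3 - 24.9 >= 0.2 -> True
    print('New best BLEU, checkpoint should be saved')

best_idx, best_val = find_best('bleu', history)
print(best_idx, best_val)                      # 3 25.3  (validation periods are 1-based)

# get_scorer() returns a built-in scorer by name, or wraps an external script
scorer = get_scorer('bleu')                    # MultiBleuScorer instance
# scorer = get_scorer('~/bin/my-metric.sh')    # hypothetical external scorer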
-------------------------------------------------------------------------------- /nmtpy/metrics/bleu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import subprocess 3 | import pkg_resources 4 | 5 | from .metric import Metric 6 | 7 | BLEU_SCRIPT = pkg_resources.resource_filename('nmtpy', 'external/multi-bleu.perl') 8 | 9 | class BLEUScore(Metric): 10 | def __init__(self, score=None): 11 | super(BLEUScore, self).__init__(score) 12 | self.name = "BLEU" 13 | if score: 14 | self.score = float(score.split()[2][:-1]) 15 | self.score_str = score.replace('BLEU = ', '') 16 | 17 | """MultiBleuScorer class.""" 18 | class MultiBleuScorer(object): 19 | def __init__(self, lowercase=False): 20 | # For multi-bleu.perl we give the reference(s) files as argv, 21 | # while the candidate translations are read from stdin. 22 | self.lowercase = lowercase 23 | self.__cmdline = [BLEU_SCRIPT] 24 | if self.lowercase: 25 | self.__cmdline.append("-lc") 26 | 27 | def compute(self, refs, hypfile): 28 | cmdline = self.__cmdline[:] 29 | 30 | # Make reference files a list 31 | refs = [refs] if isinstance(refs, str) else refs 32 | cmdline.extend(refs) 33 | 34 | hypstring = None 35 | with open(hypfile, "r") as fhyp: 36 | hypstring = fhyp.read().rstrip() 37 | 38 | score = subprocess.run(cmdline, stdout=subprocess.PIPE, 39 | input=hypstring, universal_newlines=True).stdout.splitlines() 40 | if len(score) == 0: 41 | return BLEUScore() 42 | else: 43 | return BLEUScore(score[0].rstrip("\n")) 44 | -------------------------------------------------------------------------------- /nmtpy/metrics/external.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import subprocess 4 | 5 | from .metric import Metric 6 | 7 | class ExternalScore(Metric): 8 | def __init__(self, score=None): 9 | super(ExternalScore, self).__init__(score) 10 | # This should be overriden once the score is received 11 | # So the script should behave exactly as it is documented below. 12 | self.name = 'External' 13 | 14 | if score: 15 | # Parse score line of format: 16 | # METRIC = SCORE, ...... 17 | name, rest = score.split('=', 1) 18 | self.score_str = rest.strip() 19 | self.name = name.strip() 20 | score, rest = rest.split(',', 1) 21 | self.score = float(score.strip()) 22 | 23 | class ExternalScorer(object): 24 | """An external scorer that calls arbitrary script for metric computation. 25 | - The script should be runnable as it-is 26 | - It should consume the hypotheses from stdin and receive 27 | a variable number of references as cmdline arguments. 
28 | - The script should output a "single line" to stdout with the 29 | following format: 30 | METRICNAME = SCORE, 31 | 32 | Example: 33 | $ custombleu.perl ref1 ref2 ref3 < hyps (Higher better) 34 | BLEU = 23.43, (ref_len=xxx,hyp_len=xxx,penalty=xxx) 35 | $ wer.py ref1 < hyps (Lower better) 36 | WER = 32.42, (....)""" 37 | 38 | def __init__(self, script): 39 | self.__cmdline = [script] 40 | 41 | def compute(self, refs, hypfile): 42 | cmdline = self.__cmdline[:] 43 | 44 | # Make reference files a list and add to command 45 | refs = [refs] if isinstance(refs, str) else refs 46 | cmdline.extend(refs) 47 | 48 | # Read hypotheses 49 | with open(hypfile, "r") as fhyp: 50 | hypstring = fhyp.read().rstrip() 51 | 52 | # Run script 53 | score = subprocess.run(cmdline, stdout=subprocess.PIPE, 54 | input=hypstring, 55 | universal_newlines=True).stdout.splitlines() 56 | if len(score) == 0: 57 | return ExternalScore() 58 | else: 59 | return ExternalScore(score[0].strip()) 60 | -------------------------------------------------------------------------------- /nmtpy/metrics/factors2wordbleu.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | from ..sysutils import find_executable 4 | from .bleu import BLEUScore 5 | 6 | """Factors2word class.""" 7 | class Factors2word(object): 8 | def __init__(self): 9 | pass 10 | 11 | def compute(self, script, hyp_file, hyp_mult_file, ref): 12 | script = find_executable(script) 13 | lang = ref.split('.')[-1] 14 | cmdline = [script, lang, hyp_file, hyp_mult_file, ref] 15 | 16 | hypstring = None 17 | with open(hyp_file, "r") as fhyp: 18 | hypstring = fhyp.read().rstrip() 19 | 20 | out = subprocess.run(cmdline, stdout=subprocess.PIPE, 21 | input=hypstring, universal_newlines=True).stdout.splitlines() 22 | # TODO: this -1 gives many problems, in future we will return just BLEU and avoid those problems 23 | score = out[-1].splitlines() 24 | if len(score) == 0: 25 | return BLEUScore() 26 | else: 27 | return BLEUScore(score[0].rstrip("\n")) 28 | -------------------------------------------------------------------------------- /nmtpy/metrics/meteor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import subprocess 4 | import pkg_resources 5 | 6 | from ..sysutils import get_temp_file 7 | from .metric import Metric 8 | 9 | METEOR_JAR = pkg_resources.resource_filename('nmtpy', 'external/meteor-1.5.jar') 10 | 11 | class METEORScore(Metric): 12 | def __init__(self, score=None): 13 | super(METEORScore, self).__init__(score) 14 | self.name = "METEOR" 15 | self.score = (100*score) if score else 0. 
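        # (Added note) the METEOR jar reports its final score in [0, 1]
        # (e.g. "Final score: 0.3203..."), so it is scaled by 100 above to sit
        # on the same 0-100 scale as the BLEU scores.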
16 | self.score_str = "%.3f" % self.score 17 | 18 | class METEORScorer(object): 19 | def __init__(self): 20 | self.__cmdline = ["java", "-Xmx2G", "-jar", METEOR_JAR] 21 | 22 | def compute(self, refs, hyps, language="auto", norm=False): 23 | cmdline = self.__cmdline[:] 24 | 25 | if isinstance(hyps, list): 26 | # Create a temporary file 27 | with get_temp_file(suffix=".hyps") as tmpf: 28 | for hyp in hyps: 29 | tmpf.write("%s\n" % hyp) 30 | 31 | cmdline.append(tmpf.name) 32 | 33 | elif isinstance(hyps, str): 34 | cmdline.append(hyps) 35 | 36 | # Make reference files a list 37 | refs = [refs] if isinstance(refs, str) else refs 38 | n_refs = len(refs) 39 | if n_refs > 1: 40 | # Multiple references 41 | # FIXME: METEOR can consume everything from stdin 42 | tmpff = get_temp_file(suffix=".refs") 43 | fname = tmpff.name 44 | tmpff.close() 45 | os.system('paste -d"\\n" %s > %s' % (" ".join(refs), fname)) 46 | cmdline.append(fname) 47 | else: 48 | cmdline.append(refs[0]) 49 | 50 | if language == "auto": 51 | # Take the extension of the 1st reference file, e.g. ".de" 52 | language = os.path.splitext(refs[0])[-1][1:] 53 | 54 | cmdline.extend(["-l", language]) 55 | if norm: 56 | cmdline.append("-norm") 57 | 58 | if n_refs > 1: 59 | # Multiple references 60 | cmdline.extend(["-r", str(n_refs)]) 61 | 62 | score = subprocess.run(cmdline, stdout=subprocess.PIPE, 63 | universal_newlines=True).stdout.splitlines() 64 | if len(score) == 0: 65 | return METEORScore() 66 | else: 67 | # Final score: 0.320320320320 68 | return METEORScore(float(score[-1].split(":")[-1].strip())) 69 | -------------------------------------------------------------------------------- /nmtpy/metrics/metric.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from functools import total_ordering 3 | 4 | @total_ordering 5 | class Metric(object): 6 | def __init__(self, score=None): 7 | self.score_str = "0.0" 8 | self.score = 0. 
9 | self.name = "" 10 | 11 | def __eq__(self, other): 12 | return self.score == other.score 13 | 14 | def __lt__(self, other): 15 | return self.score < other.score 16 | 17 | def __repr__(self): 18 | return "%s = %s" % (self.name, self.score_str) 19 | -------------------------------------------------------------------------------- /nmtpy/metrics/mtevalbleu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import math 4 | 5 | from collections import defaultdict 6 | 7 | from .metric import Metric 8 | 9 | # This is an exact reimplementation of mteval-v13a.pl 10 | # It currently only works for single reference 11 | 12 | LOG_2 = math.log(2) 13 | 14 | def score_segment(tst_words, ref_words, ref_ngram_freqs, max_order): 15 | # Create initial lists 16 | match_cnt = [0 for i in range(max_order)] 17 | tst_cnt = [0 for i in range(max_order)] 18 | ref_cnt = [0 for i in range(max_order)] 19 | tst_info = [0 for i in range(max_order)] 20 | ref_info = [0 for i in range(max_order)] 21 | 22 | ref_ngrams_max = {} 23 | 24 | # Get the ngram counts for the test segment 25 | tst_ngrams = words_to_ngrams(tst_words, max_order) 26 | len_tst = len(tst_words) 27 | for i in range(max_order): 28 | tst_cnt[i] = (len_tst - i) if i < len_tst else 0 29 | 30 | ########### 31 | # Reference 32 | ########### 33 | ref_ngrams = words_to_ngrams(ref_words, max_order) 34 | len_ref = len(ref_words) 35 | for ngram_words, freq in ref_ngrams.items(): 36 | # Counts of ngrams for this sentence 37 | ref_info[len(ngram_words) - 1] += ref_ngram_freqs[ngram_words] 38 | 39 | # Update the maximum count of this ngram 40 | # Shorter=>ref_ngrams_max[ngram_words] = max(ref_ngrams_max.get(ngram_words, -1), ref_ngrams[ngram_words]) 41 | if ngram_words in ref_ngrams_max: 42 | ref_ngrams_max[ngram_words] = max(ref_ngrams_max[ngram_words], freq) 43 | else: 44 | ref_ngrams_max[ngram_words] = freq 45 | 46 | # Update reference ngram counts 47 | for i in range(max_order): 48 | ref_cnt[i] = (len_ref - i) if i < len_ref else 0 49 | 50 | for ngram_words, freq in tst_ngrams.items(): 51 | if ngram_words in ref_ngrams_max: 52 | m = min(freq, ref_ngrams_max[ngram_words]) 53 | l = len(ngram_words) - 1 54 | tst_info[l] += ref_ngram_freqs[ngram_words] * m 55 | match_cnt[l] += m 56 | 57 | return len_ref, match_cnt, tst_cnt, ref_cnt, tst_info, ref_info 58 | 59 | def score_system(ref_segs, tst_segs, max_order): 60 | ref_ngram_freqs = compute_ngram_info(ref_segs, max_order) 61 | 62 | # 0-based indexing in contrast to perl version 63 | cum_match = [0 for i in range(max_order)] 64 | cum_tst_cnt = [0 for i in range(max_order)] 65 | cum_ref_cnt = [0 for i in range(max_order)] 66 | cum_tst_info = [0 for i in range(max_order)] 67 | cum_ref_info = [0 for i in range(max_order)] 68 | cum_ref_len = 0 69 | 70 | # Score each segment and keep statistics 71 | for tst, ref in zip(tst_segs, ref_segs): 72 | ref_len, match_cnt, tst_cnt, ref_cnt, tst_info, ref_info = score_segment(tst, ref, ref_ngram_freqs, max_order) 73 | 74 | # Sum ref length 75 | cum_ref_len += ref_len 76 | 77 | for i in range(max_order): 78 | cum_match[i] += match_cnt[i] 79 | cum_tst_cnt[i] += tst_cnt[i] 80 | cum_ref_cnt[i] += ref_cnt[i] 81 | cum_tst_info[i] += tst_info[i] 82 | cum_ref_info[i] += ref_info[i] 83 | 84 | # Compute length score 85 | exp_len_score = math.exp(min(0, 1 - cum_ref_len / cum_tst_cnt[0])) \ 86 | if cum_tst_cnt[0] > 0 else 0 87 | 88 | # For further length ratio computation 89 | tst_vs_ref_ratio = (cum_tst_cnt[0], 
cum_ref_len, math.log(exp_len_score)) 90 | 91 | return bleu_score(cum_ref_len, cum_match, cum_tst_cnt, exp_len_score, max_order), tst_vs_ref_ratio 92 | 93 | def read_file(filename, tokenizer, is_cased): 94 | """Read simple plain text file.""" 95 | sents = [] 96 | with open(filename) as f: 97 | for line in f: 98 | sents.append(tokenizer(line, is_cased)) 99 | return sents 100 | 101 | def words_to_ngrams(words, max_order): 102 | """Convert a sequence of words to an ngram count dict.""" 103 | d = defaultdict(int) 104 | 105 | # Iterate over word indices as start pointers 106 | for i in range(len(words)): 107 | # Sliding windows 108 | for j in range(min(max_order, len(words) - i)): 109 | # Increment counter, keep keys as tuples 110 | d[tuple(words[i: i+j+1])] += 1 111 | 112 | return d 113 | 114 | def compute_ngram_info(ref_segs, max_order): 115 | tot_words = 0 116 | 117 | # Segment-wise frequencies 118 | ngram_count = defaultdict(int) 119 | 120 | ngram_info = {} 121 | 122 | for words in ref_segs: 123 | tot_words += len(words) 124 | # Get frequencies and add them to ngramcpunt 125 | for key, value in words_to_ngrams(words, max_order).items(): 126 | ngram_count[key] += value 127 | 128 | for ngram_words, freq in ngram_count.items(): 129 | if len(ngram_words) == 1: 130 | # ngram is unigram => corpus frequency 131 | denom = tot_words 132 | else: 133 | # n-gram is n-gram => n-gram frequency 134 | denom = ngram_count[ngram_words[:-1]] 135 | 136 | ngram_info[ngram_words] = -math.log(freq / denom) / LOG_2 137 | return ngram_info 138 | 139 | def bleu_score(ref_len, matching_ngrams, tst_ngrams, exp_len_score, max_order): 140 | score = 0 141 | iscore = 0 142 | smooth = 1 143 | 144 | ind_scores = [] 145 | cum_scores = [] 146 | 147 | for i in range(max_order): 148 | if tst_ngrams[i] == 0: 149 | iscore = 0 150 | elif matching_ngrams[i] == 0: 151 | smooth *= 2 152 | iscore = math.log(1 / (smooth * tst_ngrams[i])) 153 | else: 154 | iscore = math.log(matching_ngrams[i] / tst_ngrams[i]) 155 | 156 | ind_scores.append(math.exp(iscore)) 157 | score += iscore 158 | cum_scores.append(math.exp(score / (i+1)) * exp_len_score) 159 | 160 | return ind_scores, cum_scores 161 | 162 | def tokenizer(s, is_cased): 163 | s = s.strip() 164 | 165 | # language-independent part: 166 | if '' in s: 167 | s = re.sub('', '', s) 168 | 169 | # language-dependent part (assuming Western languages): 170 | s = " %s " % s 171 | if not is_cased: 172 | s = s.lower() 173 | 174 | # tokenize punctuation 175 | s = re.sub('([\{-\~\[-\` -\&\(-\+\:-\@\/])', ' \\1 ', s) 176 | 177 | # tokenize period and comma unless preceded by a digit 178 | s = re.sub('([^0-9])([\.,])', '\\1 \\2 ', s) 179 | 180 | # tokenize period and comma unless followed by a digit 181 | s = re.sub('([\.,])([^0-9])', ' \\1 \\2', s) 182 | 183 | # tokenize dash when preceded by a digit 184 | if '-' in s: 185 | s = re.sub('([0-9])(-)', '\\1 \\2 ', s) 186 | 187 | # Strip multiple spaces 188 | # Strip trailing and leading spaces 189 | return re.sub('\s+', ' ', s).strip().split() 190 | 191 | ####################### 192 | class BLEUScore(Metric): 193 | def __init__(self, score=None): 194 | super(BLEUScore, self).__init__(score) 195 | self.name = "BLEU_v13a" 196 | if score: 197 | self.score = float(score.split()[0][:-1]) 198 | self.score_str = score 199 | 200 | class MTEvalV13aBLEUScorer(object): 201 | def __init__(self): 202 | pass 203 | 204 | def compute(self, refs, hyp_file): 205 | # Make reference files a list 206 | refs = [refs] if isinstance(refs, str) else refs 207 | 208 | # Take the 
first one 209 | ref_file = refs[0] 210 | 211 | # Read (detokenized) files and tokenize them 212 | ref_segs = read_file(ref_file, tokenizer, True) 213 | tst_segs = read_file(hyp_file, tokenizer, True) 214 | 215 | assert(len(ref_segs) == len(tst_segs)) 216 | 217 | (ind_scores, cum_scores), ratios = score_system(ref_segs, tst_segs, 4) 218 | 219 | bleu_str = "%.2f, %s (ratio=%.3f, hyp_len=%d, ref_len=%d)" % ( 220 | cum_scores[3]*100, 221 | "/".join([("%.1f" % (s*100)) for s in ind_scores[:4]]), 222 | (ratios[0]/ratios[1]), ratios[0], ratios[1]) 223 | return BLEUScore(bleu_str) 224 | -------------------------------------------------------------------------------- /nmtpy/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lium-lst/nmtpy/dc0a1618f217d5117d6abeacdc15a22443561acf/nmtpy/models/__init__.py -------------------------------------------------------------------------------- /nmtpy/models/attention_wmt.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from ..iterators.fusion import FusionIterator 3 | from .attention import Model as Attention 4 | 5 | # Same model as attention but using FusionIterator 6 | # Purpose was to train a monomodal system using the same .pkl 7 | # files prepared for multimodal Task 2 system. 8 | # 9 | # FIXME: Not tested since WMT16 Task 2 experiments, probably broken 10 | class Model(Attention): 11 | def __init__(self, seed, logger, **kwargs): 12 | # Call parent's init first 13 | super(Model, self).__init__(seed, logger, **kwargs) 14 | 15 | self.data_mode = kwargs.pop('data_mode', 'pairs') 16 | 17 | def load_valid_data(self, from_translate=False, data_mode='single'): 18 | if from_translate: 19 | self.valid_ref_files = self.data['valid_trg'] 20 | if isinstance(self.valid_ref_files, str): 21 | self.valid_ref_files = list([self.valid_ref_files]) 22 | 23 | self.valid_iterator = FusionIterator( 24 | mask=False, 25 | batch_size=1, 26 | pklfile=self.data['valid_src'], 27 | srcdict=self.src_dict, n_words_src=self.n_words_src, 28 | mode=data_mode) 29 | else: 30 | # Take the first validation item for NLL computation 31 | self.valid_iterator = FusionIterator( 32 | batch_size=self.batch_size, 33 | pklfile=self.data['valid_src'], 34 | trgdict=self.trg_dict, srcdict=self.src_dict, 35 | n_words_trg=self.n_words_trg, n_words_src=self.n_words_src, 36 | mode='single') # Override the given parameter 37 | 38 | self.valid_iterator.read() 39 | 40 | def load_data(self): 41 | self.train_iterator = FusionIterator( 42 | batch_size=self.batch_size, 43 | shuffle_mode=self.shuffle_mode, 44 | logger=self._logger, 45 | pklfile=self.data['train_src'], 46 | trgdict=self.trg_dict, srcdict=self.src_dict, 47 | n_words_trg=self.n_words_trg, n_words_src=self.n_words_src, 48 | mode=self.data_mode) 49 | # Prepare batches 50 | self.train_iterator.read() 51 | self.load_valid_data() 52 | -------------------------------------------------------------------------------- /nmtpy/models/basemodel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from collections import OrderedDict 3 | 4 | from abc import ABCMeta, abstractmethod 5 | 6 | import theano 7 | import theano.tensor as tensor 8 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 9 | 10 | import numpy as np 11 | from ..nmtutils import unzip 12 | from ..sysutils import readable_size, get_temp_file, get_param_dict, 
get_valid_evaluation 13 | from ..defaults import INT, FLOAT 14 | from ..optimizers import get_optimizer 15 | 16 | class BaseModel(object, metaclass=ABCMeta): 17 | def __init__(self, **kwargs): 18 | # This will save all arguments as instance attributes 19 | self.__dict__.update(kwargs) 20 | 21 | # Will be set when set_dropout is first called 22 | self._use_dropout = None 23 | 24 | # Theano TRNG 25 | self._trng = None 26 | 27 | # Input tensor lists 28 | self.inputs = None 29 | 30 | # Theano variables 31 | self.f_log_probs = None 32 | self.f_init = None 33 | self.f_next = None 34 | 35 | # Model parameters, i.e. weights and biases 36 | self.initial_params = None 37 | self.tparams = None 38 | 39 | # Iterators 40 | self.train_iterator = None 41 | self.valid_iterator = None 42 | 43 | # A theano shared variable for lrate annealing 44 | self.learning_rate = None 45 | 46 | # Optimizer instance (will not be serialized) 47 | self.__opt = None 48 | 49 | @staticmethod 50 | def beam_search(inputs, f_inits, f_nexts, beam_size=12, maxlen=100, suppress_unks=False, **kwargs): 51 | # Override this from your classes 52 | pass 53 | 54 | def set_options(self, optdict): 55 | """Filter out None's and '__[a-zA-Z]' then store into self._options.""" 56 | self._options = OrderedDict() 57 | for k,v in optdict.items(): 58 | # Don't keep model attributes with _ prefix 59 | if v is not None and not k.startswith('_'): 60 | self._options[k] = v 61 | 62 | def set_trng(self, seed): 63 | """Set the seed for Theano RNG.""" 64 | if seed == 0: 65 | # No seed given, randomly pick the seed 66 | seed = np.random.randint(2**29) + 1 67 | self._trng = RandomStreams(seed) 68 | 69 | def set_dropout(self, val): 70 | """Set dropout indicator for activation scaling if dropout is available through configuration.""" 71 | if self._use_dropout is None: 72 | self._use_dropout = theano.shared(np.float64(0.).astype(FLOAT)) 73 | else: 74 | self._use_dropout.set_value(float(val)) 75 | 76 | def update_lrate(self, lrate): 77 | """Update learning rate.""" 78 | self.__opt.set_lrate(lrate) 79 | 80 | def get_nb_params(self): 81 | """Return the number of parameters of the model.""" 82 | return readable_size(sum([p.size for p in self.initial_params.values()])) 83 | 84 | def save(self, fname): 85 | """Save model parameters as .npz.""" 86 | kwargs = OrderedDict() 87 | kwargs['opts'] = self._options 88 | if self.tparams is not None: 89 | kwargs.update(unzip(self.tparams)) 90 | 91 | # Save each param as a separate argument into npz 92 | np.savez(fname, **kwargs) 93 | 94 | def load(self, params): 95 | """Restore .npz checkpoint file into model.""" 96 | self.tparams = OrderedDict() 97 | 98 | params = get_param_dict(params) 99 | 100 | for k,v in params.items(): 101 | self.tparams[k] = theano.shared(v.astype(FLOAT), name=k) 102 | 103 | def init_shared_variables(self): 104 | """Initialize the shared variables of the model.""" 105 | # Create tensor dict 106 | self.tparams = OrderedDict() 107 | 108 | # Fill it with initial random weights 109 | for kk, pp in self.initial_params.items(): 110 | self.tparams[kk] = theano.shared(pp, name=kk) 111 | 112 | def update_shared_variables(self, _from): 113 | """Reset some variables from _from dict.""" 114 | for kk in _from.keys(): 115 | self.tparams[kk].set_value(_from[kk]) 116 | 117 | def val_loss(self, mean=True): 118 | """Compute validation loss.""" 119 | probs = [] 120 | 121 | # dict of x, x_mask, y, y_mask 122 | for data in self.valid_iterator: 123 | # Don't fail if data doesn't contain y_mask. 
The loss won't 124 | # be normalized but the training will continue 125 | norm = data['y_mask'].sum(0) if 'y_mask' in data else 1 126 | log_probs = self.f_log_probs(*list(data.values())) / norm 127 | probs.extend(log_probs) 128 | 129 | if mean: 130 | return np.array(probs).mean() 131 | else: 132 | return np.array(probs) 133 | 134 | def get_l2_weight_decay(self, decay_c, skip_bias=True): 135 | """Return l2 weight decay regularization term.""" 136 | decay_c = theano.shared(np.float64(decay_c).astype(FLOAT), name='decay_c') 137 | weight_decay = 0. 138 | for kk, vv in self.tparams.items(): 139 | # Skip biases for L2 regularization 140 | if not skip_bias or (skip_bias and vv.get_value().ndim > 1): 141 | weight_decay += (vv ** 2).sum() 142 | weight_decay *= decay_c 143 | return weight_decay 144 | 145 | def get_clipped_grads(self, grads, clip_c): 146 | """Clip gradients a la Pascanu et al.""" 147 | g2 = 0. 148 | new_grads = [] 149 | for g in grads: 150 | g2 += (g**2).sum() 151 | for g in grads: 152 | new_grads.append(tensor.switch(g2 > (clip_c**2), 153 | g / tensor.sqrt(g2) * clip_c, 154 | g)) 155 | return new_grads 156 | 157 | def build_optimizer(self, cost, regcost, clip_c, dont_update=None, opt_history=None): 158 | """Build optimizer by optionally disabling learning for some weights.""" 159 | tparams = OrderedDict(self.tparams) 160 | 161 | # Filter out weights that we do not want to update during backprop 162 | if dont_update is not None: 163 | for key in list(tparams.keys()): 164 | if key in dont_update: 165 | del tparams[key] 166 | 167 | # Our final cost 168 | final_cost = cost.mean() 169 | 170 | # If we have a regularization cost, add it 171 | if regcost is not None: 172 | final_cost += regcost 173 | 174 | # Normalize cost w.r.t sentence lengths to correctly compute perplexity 175 | # Only active when y_mask is available 176 | if 'y_mask' in self.inputs: 177 | norm_cost = (cost / self.inputs['y_mask'].sum(0)).mean() 178 | if regcost is not None: 179 | norm_cost += regcost 180 | else: 181 | norm_cost = final_cost 182 | 183 | # Get gradients of cost with respect to variables 184 | # This uses final_cost which is not normalized w.r.t sentence lengths 185 | grads = tensor.grad(final_cost, wrt=list(tparams.values())) 186 | 187 | # Clip gradients if requested 188 | if clip_c > 0: 189 | grads = self.get_clipped_grads(grads, clip_c) 190 | 191 | # Create optimizer, self.lrate is passed from nmt-train 192 | self.__opt = get_optimizer(self.optimizer)(lr0=self.lrate) 193 | self.__opt.set_trng(self._trng) 194 | #TODO: parameterize this! 
self.__opt.set_gradient_noise(0.1) 195 | 196 | # Get updates 197 | updates = self.__opt.get_updates(tparams, grads, opt_history) 198 | 199 | # Compile forward/backward function 200 | self.train_batch = theano.function(list(self.inputs.values()), norm_cost, updates=updates) 201 | 202 | def run_beam_search(self, beam_size=12, n_jobs=8, metric='bleu', f_valid_out=None): 203 | """Save model under /tmp for passing it to nmt-translate.""" 204 | # Save model temporarily 205 | with get_temp_file(suffix=".npz", delete=True) as tmpf: 206 | self.save(tmpf.name) 207 | result = get_valid_evaluation(tmpf.name, 208 | beam_size=beam_size, 209 | n_jobs=n_jobs, 210 | metric=metric, 211 | f_valid_out=f_valid_out) 212 | 213 | # Return every available metric back 214 | return result 215 | 216 | def info(self): 217 | """Reimplement to show model specific information before training.""" 218 | pass 219 | 220 | ########################################################## 221 | # For all the abstract methods below, you can take a look 222 | # at attention.py to understand how they are implemented. 223 | # Remember that you NEED to implement these methods in your 224 | # own model. 225 | ########################################################## 226 | 227 | @abstractmethod 228 | def load_data(self): 229 | """Load and prepare your training and validation data.""" 230 | pass 231 | 232 | @abstractmethod 233 | def init_params(self): 234 | """Initialize the weights and biases of your network.""" 235 | pass 236 | 237 | @abstractmethod 238 | def build(self): 239 | """Build the computational graph of your network.""" 240 | pass 241 | 242 | @abstractmethod 243 | def build_sampler(self, **kwargs): 244 | """Build f_init() and f_next() for beam-search.""" 245 | pass 246 | -------------------------------------------------------------------------------- /nmtpy/models/fusion_sum_dep_ind.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | 4 | import theano 5 | import theano.tensor as tensor 6 | 7 | # Ours 8 | from ..layers import * 9 | from ..defaults import INT, FLOAT 10 | 11 | # Base fusion model 12 | from .basefusion import Model as Fusion 13 | 14 | class Model(Fusion): 15 | def __init__(self, seed, logger, **kwargs): 16 | # Call parent's init first 17 | super(Model, self).__init__(seed, logger, **kwargs) 18 | 19 | # Set architecture specific methods 20 | self.init_gru_decoder = init_gru_decoder_multi 21 | self.gru_decoder = gru_decoder_multi 22 | 23 | ########## Define layers here ########### 24 | def init_gru_decoder_multi(params, nin, dim, dimctx, scale=0.01, prefix='gru_decoder_multi'): 25 | # Init with usual gru_cond function 26 | params = param_init_gru_cond(params, nin, dim, dimctx, scale, prefix) 27 | 28 | # Add separate attention weights for the 2nd modality 29 | params[pp(prefix, 'Wc_att2')] = norm_weight(dimctx, dimctx, scale=scale) 30 | params[pp(prefix, 'b_att2')] = np.zeros((dimctx,)).astype(FLOAT) 31 | 32 | # attention: This gives the alpha's 33 | params[pp(prefix, 'U_att2')] = norm_weight(dimctx, 1, scale=scale) 34 | params[pp(prefix, 'c_att2')] = np.zeros((1,)).astype(FLOAT) 35 | 36 | return params 37 | 38 | def gru_decoder_multi(tparams, state_below, 39 | ctx1, ctx2, prefix='gru_decoder_multi', 40 | input_mask=None, one_step=False, 41 | init_state=None, ctx1_mask=None): 42 | if one_step: 43 | assert init_state, 'previous state must be provided' 44 | 45 | # Context 46 | # n_timesteps x n_samples x ctxdim 47 | assert ctx1 and 
ctx2, 'Contexts must be provided' 48 | assert ctx1.ndim == 3 and ctx2.ndim == 3, 'Contexts must be 3-d: #annotation x #sample x dim' 49 | 50 | # Number of padded source timesteps 51 | nsteps = state_below.shape[0] 52 | 53 | # Batch or single sample? 54 | n_samples = state_below.shape[1] if state_below.ndim == 3 else 1 55 | 56 | # if we have no mask, we assume all the inputs are valid 57 | # tensor.alloc(value, *shape) 58 | # input_mask: (n_steps, 1) filled with 1 59 | if input_mask is None: 60 | input_mask = tensor.alloc(1., nsteps, 1) 61 | 62 | # Infer RNN dimensionality 63 | dim = tparams[pp(prefix, 'Wcx')].shape[1] 64 | 65 | # initial/previous state 66 | # if not given, assume it's all zeros 67 | if init_state is None: 68 | init_state = tensor.alloc(0., n_samples, dim) 69 | 70 | # These two dot products are same with gru_layer, refer to the equations. 71 | # [W_r * X + b_r, W_z * X + b_z] 72 | state_below_ = tensor.dot(state_below, tparams[pp(prefix, 'W')]) + tparams[pp(prefix, 'b')] 73 | 74 | # input to compute the hidden state proposal 75 | # This is the [W*x]_j in the eq. 8 of the paper 76 | state_belowx = tensor.dot(state_below, tparams[pp(prefix, 'Wx')]) + tparams[pp(prefix, 'bx')] 77 | 78 | # Wc_att: dimctx -> dimctx 79 | # Linearly transform the contexts to another space with same dimensionality 80 | pctx1_ = tensor.dot(ctx1, tparams[pp(prefix, 'Wc_att')]) + tparams[pp(prefix, 'b_att')] 81 | pctx2_ = tensor.dot(ctx2, tparams[pp(prefix, 'Wc_att2')]) + tparams[pp(prefix, 'b_att2')] 82 | 83 | # Step function for the recurrence/scan 84 | # Sequences 85 | # --------- 86 | # m_ : mask 87 | # x_ : state_below_ 88 | # xx_ : state_belowx 89 | # outputs_info 90 | # ------------ 91 | # h_ : init_state, 92 | # ctx_ : need to be defined as it's returned by _step 93 | # alpha1_: need to be defined as it's returned by _step 94 | # alpha2_: need to be defined as it's returned by _step 95 | # non sequences 96 | # ------------- 97 | # pctx1_ : pctx1_ 98 | # pctx2_ : pctx2_ 99 | # cc1_ : ctx1 100 | # cc2_ : ctx2 101 | # and all the shared weights and biases.. 102 | def _step(m_, x_, xx_, 103 | h_, ctx_, alpha1_, alpha2_, # These ctx and alpha's are not used in the computations 104 | pctx1_, pctx2_, cc1_, cc2_, U, Wc, W_comb_att, U_att, c_att, U_att2, c_att2, 105 | Ux, Wcx, U_nl, Ux_nl, b_nl, bx_nl): 106 | 107 | # Do a step of classical GRU 108 | h1 = gru_step(m_, x_, xx_, h_, U, Ux) 109 | 110 | ########### 111 | # Attention 112 | ########### 113 | # h1 X W_comb_att 114 | # W_comb_att: dim -> dimctx 115 | # pstate_ should be 2D as we're working with unrolled timesteps 116 | pstate_ = tensor.dot(h1, W_comb_att) 117 | 118 | # Accumulate in pctx*__ and apply tanh() 119 | # This becomes the projected context(s) + the current hidden state 120 | # of the decoder, e.g. this is the information accumulating 121 | # into the returned original contexts with the knowledge of target 122 | # sentence decoding. 123 | pctx1__ = tanh(pctx1_ + pstate_[None, :, :]) 124 | pctx2__ = tanh(pctx2_ + pstate_[None, :, :]) 125 | 126 | # Affine transformation for alpha* = (pctx*__ X U_att) + c_att 127 | # We're now down to scalar alpha's for each accumulated 128 | # context (0th dim) in the pctx*__ 129 | # alpha1 should be n_timesteps, 1, 1 130 | alpha1 = tensor.dot(pctx1__, U_att) + c_att 131 | alpha2 = tensor.dot(pctx2__, U_att2) + c_att2 132 | 133 | # Drop the last dimension, e.g. 
(n_timesteps, 1) 134 | alpha1 = alpha1.reshape([alpha1.shape[0], alpha1.shape[1]]) 135 | alpha2 = alpha2.reshape([alpha2.shape[0], alpha2.shape[1]]) 136 | 137 | # Exponentiate alpha1 138 | alpha1 = tensor.exp(alpha1 - alpha1.max(0, keepdims=True)) 139 | alpha2 = tensor.exp(alpha2 - alpha2.max(0, keepdims=True)) 140 | 141 | # If there is a context mask, multiply with it to cancel unnecessary steps 142 | # We won't have a ctx_mask for image vectors 143 | if ctx1_mask: 144 | alpha1 = alpha1 * ctx1_mask 145 | 146 | # Normalize so that the sum makes 1 147 | alpha1 = alpha1 / alpha1.sum(0, keepdims=True) 148 | alpha2 = alpha2 / alpha2.sum(0, keepdims=True) 149 | 150 | # Compute the current context ctx*_ as the alpha-weighted sum of 151 | # the initial contexts ctx*'s 152 | ctx1_ = (cc1_ * alpha1[:, :, None]).sum(0) 153 | ctx2_ = (cc2_ * alpha2[:, :, None]).sum(0) 154 | # n_samples x ctxdim (2000) 155 | 156 | # Sum of contexts 157 | ctx_ = tanh(ctx1_ + ctx2_) 158 | 159 | ############################################ 160 | # ctx*_ and alpha computations are completed 161 | ############################################ 162 | 163 | #################################### 164 | # The below code is another GRU cell 165 | #################################### 166 | # Affine transformation: h1 X U_nl + b_nl 167 | # U_nl, b_nl: Stacked dim*2 168 | preact = tensor.dot(h1, U_nl) + b_nl 169 | 170 | # Transform the weighted context sum with Wc 171 | # and add it to preact 172 | # Wc: dimctx -> Stacked dim*2 173 | preact += tensor.dot(ctx_, Wc) 174 | 175 | # Apply sigmoid nonlinearity 176 | preact = sigmoid(preact) 177 | 178 | # Slice activations: New gates r2 and u2 179 | r2 = tensor_slice(preact, 0, dim) 180 | u2 = tensor_slice(preact, 1, dim) 181 | 182 | preactx = (tensor.dot(h1, Ux_nl) + bx_nl) * r2 183 | preactx += tensor.dot(ctx_, Wcx) 184 | 185 | # Candidate hidden 186 | h2_tilda = tanh(preactx) 187 | 188 | # Leaky integration between the new h2 and the 189 | # old h1 computed in line 285 190 | h2 = u2 * h2_tilda + (1. - u2) * h1 191 | h2 = m_[:, None] * h2 + (1. 
- m_)[:, None] * h1 192 | 193 | return h2, ctx_, alpha1.T, alpha2.T 194 | 195 | # Sequences are the input mask and the transformed target embeddings 196 | seqs = [input_mask, state_below_, state_belowx] 197 | 198 | # Create a list of shared parameters for easy parameter passing 199 | shared_vars = [tparams[pp(prefix, 'U')], 200 | tparams[pp(prefix, 'Wc')], 201 | tparams[pp(prefix, 'W_comb_att')], 202 | tparams[pp(prefix, 'U_att')], 203 | tparams[pp(prefix, 'c_att')], 204 | tparams[pp(prefix, 'U_att2')], 205 | tparams[pp(prefix, 'c_att2')], 206 | tparams[pp(prefix, 'Ux')], 207 | tparams[pp(prefix, 'Wcx')], 208 | tparams[pp(prefix, 'U_nl')], 209 | tparams[pp(prefix, 'Ux_nl')], 210 | tparams[pp(prefix, 'b_nl')], 211 | tparams[pp(prefix, 'bx_nl')]] 212 | 213 | if one_step: 214 | rval = _step(*(seqs + [init_state, None, None, None, pctx1_, pctx2_, ctx1, ctx2] + shared_vars)) 215 | else: 216 | outputs_info=[init_state, 217 | tensor.alloc(0., n_samples, ctx1.shape[2]), # ctxdim (ctx_) 218 | tensor.alloc(0., n_samples, ctx1.shape[0]), # n_timesteps (alpha1) 219 | tensor.alloc(0., n_samples, ctx2.shape[0])] # n_timesteps (alpha2) 220 | 221 | rval, updates = theano.scan(_step, 222 | sequences=seqs, 223 | outputs_info=outputs_info, 224 | non_sequences=[pctx1_, pctx2_, ctx1, ctx2] + shared_vars, 225 | name=pp(prefix, '_layers'), 226 | n_steps=nsteps, 227 | strict=True) 228 | return rval 229 | 230 | 231 | -------------------------------------------------------------------------------- /nmtpy/models/fusion_sum_ind_dep.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | 4 | import theano 5 | import theano.tensor as tensor 6 | 7 | # Ours 8 | from ..layers import * 9 | from ..defaults import INT, FLOAT 10 | 11 | # Base fusion model 12 | from .basefusion import Model as Fusion 13 | 14 | class Model(Fusion): 15 | def __init__(self, seed, logger, **kwargs): 16 | # Call parent's init first 17 | super(Model, self).__init__(seed, logger, **kwargs) 18 | 19 | # Set architecture specific methods 20 | self.init_gru_decoder = init_gru_decoder_multi 21 | self.gru_decoder = gru_decoder_multi 22 | 23 | ########## Define layers here ########### 24 | def init_gru_decoder_multi(params, nin, dim, dimctx, scale=0.01, prefix='gru_decoder_multi'): 25 | # Init with usual gru_cond function 26 | params = param_init_gru_cond(params, nin, dim, dimctx, scale, prefix) 27 | 28 | # Add separate attention weights for the 2nd modality 29 | params[pp(prefix, 'Wc_att2')] = norm_weight(dimctx, dimctx, scale=scale) 30 | params[pp(prefix, 'b_att2')] = np.zeros((dimctx,)).astype(FLOAT) 31 | 32 | # attention: This gives the alpha's 33 | params[pp(prefix, 'W_comb_att2')] = norm_weight(dim, dimctx, scale=scale) 34 | 35 | return params 36 | 37 | def gru_decoder_multi(tparams, state_below, 38 | ctx1, ctx2, prefix='gru_decoder_multi', 39 | input_mask=None, one_step=False, 40 | init_state=None, ctx1_mask=None): 41 | if one_step: 42 | assert init_state, 'previous state must be provided' 43 | 44 | # Context 45 | # n_timesteps x n_samples x ctxdim 46 | assert ctx1 and ctx2, 'Contexts must be provided' 47 | assert ctx1.ndim == 3 and ctx2.ndim == 3, 'Contexts must be 3-d: #annotation x #sample x dim' 48 | 49 | # Number of padded source timesteps 50 | nsteps = state_below.shape[0] 51 | 52 | # Batch or single sample? 
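    # (Added note) state_below holds the transformed target embeddings: 3-d
    # (n_timesteps, n_samples, dim) during training, or 2-d for a single
    # decoding step, in which case it is treated as one sample below.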
53 | n_samples = state_below.shape[1] if state_below.ndim == 3 else 1 54 | 55 | # if we have no mask, we assume all the inputs are valid 56 | # tensor.alloc(value, *shape) 57 | # input_mask: (n_steps, 1) filled with 1 58 | if input_mask is None: 59 | input_mask = tensor.alloc(1., nsteps, 1) 60 | 61 | # Infer RNN dimensionality 62 | dim = tparams[pp(prefix, 'Wcx')].shape[1] 63 | 64 | # initial/previous state 65 | # if not given, assume it's all zeros 66 | if init_state is None: 67 | init_state = tensor.alloc(0., n_samples, dim) 68 | 69 | # These two dot products are same with gru_layer, refer to the equations. 70 | # [W_r * X + b_r, W_z * X + b_z] 71 | state_below_ = tensor.dot(state_below, tparams[pp(prefix, 'W')]) + tparams[pp(prefix, 'b')] 72 | 73 | # input to compute the hidden state proposal 74 | # This is the [W*x]_j in the eq. 8 of the paper 75 | state_belowx = tensor.dot(state_below, tparams[pp(prefix, 'Wx')]) + tparams[pp(prefix, 'bx')] 76 | 77 | # Wc_att: dimctx -> dimctx 78 | # Linearly transform the contexts to another space with same dimensionality 79 | pctx1_ = tensor.dot(ctx1, tparams[pp(prefix, 'Wc_att')]) + tparams[pp(prefix, 'b_att')] 80 | pctx2_ = tensor.dot(ctx2, tparams[pp(prefix, 'Wc_att2')]) + tparams[pp(prefix, 'b_att2')] 81 | 82 | # Step function for the recurrence/scan 83 | # Sequences 84 | # --------- 85 | # m_ : mask 86 | # x_ : state_below_ 87 | # xx_ : state_belowx 88 | # outputs_info 89 | # ------------ 90 | # h_ : init_state, 91 | # ctx_ : need to be defined as it's returned by _step 92 | # alpha1_: need to be defined as it's returned by _step 93 | # alpha2_: need to be defined as it's returned by _step 94 | # non sequences 95 | # ------------- 96 | # pctx1_ : pctx1_ 97 | # pctx2_ : pctx2_ 98 | # cc1_ : ctx1 99 | # cc2_ : ctx2 100 | # and all the shared weights and biases.. 101 | def _step(m_, x_, xx_, 102 | h_, ctx_, alpha1_, alpha2_, # These ctx and alpha's are not used in the computations 103 | pctx1_, pctx2_, cc1_, cc2_, U, Wc, W_comb_att, W_comb_att2, U_att, c_att, 104 | Ux, Wcx, U_nl, Ux_nl, b_nl, bx_nl): 105 | 106 | # Do a step of classical GRU 107 | h1 = gru_step(m_, x_, xx_, h_, U, Ux) 108 | 109 | ########### 110 | # Attention 111 | ########### 112 | # h1 X W_comb_att 113 | # W_comb_att: dim -> dimctx 114 | # pstate_ should be 2D as we're working with unrolled timesteps 115 | pstate1_ = tensor.dot(h1, W_comb_att) 116 | pstate2_ = tensor.dot(h1, W_comb_att2) 117 | 118 | # Accumulate in pctx*__ and apply tanh() 119 | # This becomes the projected context(s) + the current hidden state 120 | # of the decoder, e.g. this is the information accumulating 121 | # into the returned original contexts with the knowledge of target 122 | # sentence decoding. 123 | pctx1__ = tanh(pctx1_ + pstate1_[None, :, :]) 124 | pctx2__ = tanh(pctx2_ + pstate2_[None, :, :]) 125 | 126 | # Affine transformation for alpha* = (pctx*__ X U_att) + c_att 127 | # We're now down to scalar alpha's for each accumulated 128 | # context (0th dim) in the pctx*__ 129 | # alpha1 should be n_timesteps, 1, 1 130 | alpha1 = tensor.dot(pctx1__, U_att) + c_att 131 | alpha2 = tensor.dot(pctx2__, U_att) + c_att 132 | 133 | # Drop the last dimension, e.g. 
(n_timesteps, 1) 134 | alpha1 = alpha1.reshape([alpha1.shape[0], alpha1.shape[1]]) 135 | alpha2 = alpha2.reshape([alpha2.shape[0], alpha2.shape[1]]) 136 | 137 | # Exponentiate alpha1 138 | alpha1 = tensor.exp(alpha1 - alpha1.max(0, keepdims=True)) 139 | alpha2 = tensor.exp(alpha2 - alpha2.max(0, keepdims=True)) 140 | 141 | # If there is a context mask, multiply with it to cancel unnecessary steps 142 | # We won't have a ctx_mask for image vectors 143 | if ctx1_mask: 144 | alpha1 = alpha1 * ctx1_mask 145 | 146 | # Normalize so that the sum makes 1 147 | alpha1 = alpha1 / alpha1.sum(0, keepdims=True) 148 | alpha2 = alpha2 / alpha2.sum(0, keepdims=True) 149 | 150 | # Compute the current context ctx*_ as the alpha-weighted sum of 151 | # the initial contexts ctx*'s 152 | ctx1_ = (cc1_ * alpha1[:, :, None]).sum(0) 153 | ctx2_ = (cc2_ * alpha2[:, :, None]).sum(0) 154 | # n_samples x ctxdim (2000) 155 | 156 | # Sum of contexts 157 | ctx_ = tanh(ctx1_ + ctx2_) 158 | 159 | ############################################ 160 | # ctx*_ and alpha computations are completed 161 | ############################################ 162 | 163 | #################################### 164 | # The below code is another GRU cell 165 | #################################### 166 | # Affine transformation: h1 X U_nl + b_nl 167 | # U_nl, b_nl: Stacked dim*2 168 | preact = tensor.dot(h1, U_nl) + b_nl 169 | 170 | # Transform the weighted context sum with Wc 171 | # and add it to preact 172 | # Wc: dimctx -> Stacked dim*2 173 | preact += tensor.dot(ctx_, Wc) 174 | 175 | # Apply sigmoid nonlinearity 176 | preact = sigmoid(preact) 177 | 178 | # Slice activations: New gates r2 and u2 179 | r2 = tensor_slice(preact, 0, dim) 180 | u2 = tensor_slice(preact, 1, dim) 181 | 182 | preactx = (tensor.dot(h1, Ux_nl) + bx_nl) * r2 183 | preactx += tensor.dot(ctx_, Wcx) 184 | 185 | # Candidate hidden 186 | h2_tilda = tanh(preactx) 187 | 188 | # Leaky integration between the new h2 and the 189 | # old h1 computed in line 285 190 | h2 = u2 * h2_tilda + (1. - u2) * h1 191 | h2 = m_[:, None] * h2 + (1. 
- m_)[:, None] * h1 192 | 193 | return h2, ctx_, alpha1.T, alpha2.T 194 | 195 | # Sequences are the input mask and the transformed target embeddings 196 | seqs = [input_mask, state_below_, state_belowx] 197 | 198 | # Create a list of shared parameters for easy parameter passing 199 | shared_vars = [tparams[pp(prefix, 'U')], 200 | tparams[pp(prefix, 'Wc')], 201 | tparams[pp(prefix, 'W_comb_att')], 202 | tparams[pp(prefix, 'W_comb_att2')], 203 | tparams[pp(prefix, 'U_att')], 204 | tparams[pp(prefix, 'c_att')], 205 | tparams[pp(prefix, 'Ux')], 206 | tparams[pp(prefix, 'Wcx')], 207 | tparams[pp(prefix, 'U_nl')], 208 | tparams[pp(prefix, 'Ux_nl')], 209 | tparams[pp(prefix, 'b_nl')], 210 | tparams[pp(prefix, 'bx_nl')]] 211 | 212 | if one_step: 213 | rval = _step(*(seqs + [init_state, None, None, None, pctx1_, pctx2_, ctx1, ctx2] + shared_vars)) 214 | else: 215 | outputs_info=[init_state, 216 | tensor.alloc(0., n_samples, ctx1.shape[2]), # ctxdim (ctx_) 217 | tensor.alloc(0., n_samples, ctx1.shape[0]), # n_timesteps (alpha1) 218 | tensor.alloc(0., n_samples, ctx2.shape[0])] # n_timesteps (alpha2) 219 | 220 | rval, updates = theano.scan(_step, 221 | sequences=seqs, 222 | outputs_info=outputs_info, 223 | non_sequences=[pctx1_, pctx2_, ctx1, ctx2] + shared_vars, 224 | name=pp(prefix, '_layers'), 225 | n_steps=nsteps, 226 | strict=True) 227 | return rval 228 | 229 | 230 | -------------------------------------------------------------------------------- /nmtpy/models/fusion_sum_ind_ind.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | 4 | import theano 5 | import theano.tensor as tensor 6 | 7 | # Ours 8 | from ..layers import * 9 | from ..defaults import INT, FLOAT 10 | 11 | # Base fusion model 12 | from .basefusion import Model as Fusion 13 | 14 | class Model(Fusion): 15 | def __init__(self, seed, logger, **kwargs): 16 | # Call parent's init first 17 | super(Model, self).__init__(seed, logger, **kwargs) 18 | 19 | # Set architecture specific methods 20 | self.init_gru_decoder = init_gru_decoder_multi 21 | self.gru_decoder = gru_decoder_multi 22 | 23 | ########## Define layers here ########### 24 | def init_gru_decoder_multi(params, nin, dim, dimctx, scale=0.01, prefix='gru_decoder_multi'): 25 | # Init with usual gru_cond function 26 | return param_init_gru_cond(params, nin, dim, dimctx, scale, prefix, False) 27 | 28 | def gru_decoder_multi(tparams, state_below, 29 | ctx1, ctx2, prefix='gru_decoder_multi', 30 | input_mask=None, one_step=False, 31 | init_state=None, ctx1_mask=None): 32 | if one_step: 33 | assert init_state, 'previous state must be provided' 34 | 35 | # Context 36 | # n_timesteps x n_samples x ctxdim 37 | assert ctx1 and ctx2, 'Contexts must be provided' 38 | assert ctx1.ndim == 3 and ctx2.ndim == 3, 'Contexts must be 3-d: #annotation x #sample x dim' 39 | 40 | # Number of padded source timesteps 41 | nsteps = state_below.shape[0] 42 | 43 | # Batch or single sample? 
44 | n_samples = state_below.shape[1] if state_below.ndim == 3 else 1 45 | 46 | # if we have no mask, we assume all the inputs are valid 47 | # tensor.alloc(value, *shape) 48 | # input_mask: (n_steps, 1) filled with 1 49 | if input_mask is None: 50 | input_mask = tensor.alloc(1., nsteps, 1) 51 | 52 | # Infer RNN dimensionality 53 | dim = tparams[pp(prefix, 'Wcx')].shape[1] 54 | 55 | # initial/previous state 56 | # if not given, assume it's all zeros 57 | if init_state is None: 58 | init_state = tensor.alloc(0., n_samples, dim) 59 | 60 | # These two dot products are same with gru_layer, refer to the equations. 61 | # [W_r * X + b_r, W_z * X + b_z] 62 | state_below_ = tensor.dot(state_below, tparams[pp(prefix, 'W')]) + tparams[pp(prefix, 'b')] 63 | 64 | # input to compute the hidden state proposal 65 | # This is the [W*x]_j in the eq. 8 of the paper 66 | state_belowx = tensor.dot(state_below, tparams[pp(prefix, 'Wx')]) + tparams[pp(prefix, 'bx')] 67 | 68 | # Wc_att: dimctx -> dimctx 69 | # Linearly transform the contexts to another space with same dimensionality 70 | pctx1_ = tensor.dot(ctx1, tparams[pp(prefix, 'Wc_att')]) + tparams[pp(prefix, 'b_att')] 71 | pctx2_ = tensor.dot(ctx2, tparams[pp(prefix, 'Wc_att')]) + tparams[pp(prefix, 'b_att')] 72 | 73 | # Step function for the recurrence/scan 74 | # Sequences 75 | # --------- 76 | # m_ : mask 77 | # x_ : state_below_ 78 | # xx_ : state_belowx 79 | # outputs_info 80 | # ------------ 81 | # h_ : init_state, 82 | # ctx_ : need to be defined as it's returned by _step 83 | # alpha1_: need to be defined as it's returned by _step 84 | # alpha2_: need to be defined as it's returned by _step 85 | # non sequences 86 | # ------------- 87 | # pctx1_ : pctx1_ 88 | # pctx2_ : pctx2_ 89 | # cc1_ : ctx1 90 | # cc2_ : ctx2 91 | # and all the shared weights and biases.. 92 | def _step(m_, x_, xx_, 93 | h_, ctx_, alpha1_, alpha2_, # These ctx and alpha's are not used in the computations 94 | pctx1_, pctx2_, cc1_, cc2_, U, Wc, W_comb_att, U_att, c_att, Ux, Wcx, U_nl, Ux_nl, b_nl, bx_nl): 95 | 96 | # Do a step of classical GRU 97 | h1 = gru_step(m_, x_, xx_, h_, U, Ux) 98 | 99 | ########### 100 | # Attention 101 | ########### 102 | # h1 X W_comb_att 103 | # W_comb_att: dim -> dimctx 104 | # pstate_ should be 2D as we're working with unrolled timesteps 105 | pstate_ = tensor.dot(h1, W_comb_att) 106 | 107 | # Accumulate in pctx*__ and apply tanh() 108 | # This becomes the projected context(s) + the current hidden state 109 | # of the decoder, e.g. this is the information accumulating 110 | # into the returned original contexts with the knowledge of target 111 | # sentence decoding. 112 | pctx1__ = tanh(pctx1_ + pstate_[None, :, :]) 113 | pctx2__ = tanh(pctx2_ + pstate_[None, :, :]) 114 | 115 | # Affine transformation for alpha* = (pctx*__ X U_att) + c_att 116 | # We're now down to scalar alpha's for each accumulated 117 | # context (0th dim) in the pctx*__ 118 | # alpha1 should be n_timesteps, 1, 1 119 | alpha1 = tensor.dot(pctx1__, U_att) + c_att 120 | alpha2 = tensor.dot(pctx2__, U_att) + c_att 121 | 122 | # Drop the last dimension, e.g. 
(n_timesteps, 1) 123 | alpha1 = alpha1.reshape([alpha1.shape[0], alpha1.shape[1]]) 124 | alpha2 = alpha2.reshape([alpha2.shape[0], alpha2.shape[1]]) 125 | 126 | # Exponentiate alpha1 127 | alpha1 = tensor.exp(alpha1 - alpha1.max(0, keepdims=True)) 128 | alpha2 = tensor.exp(alpha2 - alpha2.max(0, keepdims=True)) 129 | 130 | # If there is a context mask, multiply with it to cancel unnecessary steps 131 | # We won't have a ctx_mask for image vectors 132 | if ctx1_mask: 133 | alpha1 = alpha1 * ctx1_mask 134 | 135 | # Normalize so that the sum makes 1 136 | alpha1 = alpha1 / alpha1.sum(0, keepdims=True) 137 | alpha2 = alpha2 / alpha2.sum(0, keepdims=True) 138 | 139 | # Compute the current context ctx*_ as the alpha-weighted sum of 140 | # the initial contexts ctx*'s 141 | ctx1_ = (cc1_ * alpha1[:, :, None]).sum(0) 142 | ctx2_ = (cc2_ * alpha2[:, :, None]).sum(0) 143 | # n_samples x ctxdim (2000) 144 | 145 | # Sum of contexts 146 | ctx_ = tanh(ctx1_ + ctx2_) 147 | 148 | ############################################ 149 | # ctx*_ and alpha computations are completed 150 | ############################################ 151 | 152 | #################################### 153 | # The below code is another GRU cell 154 | #################################### 155 | # Affine transformation: h1 X U_nl + b_nl 156 | # U_nl, b_nl: Stacked dim*2 157 | preact = tensor.dot(h1, U_nl) + b_nl 158 | 159 | # Transform the weighted context sum with Wc 160 | # and add it to preact 161 | # Wc: dimctx -> Stacked dim*2 162 | preact += tensor.dot(ctx_, Wc) 163 | 164 | # Apply sigmoid nonlinearity 165 | preact = sigmoid(preact) 166 | 167 | # Slice activations: New gates r2 and u2 168 | r2 = tensor_slice(preact, 0, dim) 169 | u2 = tensor_slice(preact, 1, dim) 170 | 171 | preactx = (tensor.dot(h1, Ux_nl) + bx_nl) * r2 172 | preactx += tensor.dot(ctx_, Wcx) 173 | 174 | # Candidate hidden 175 | h2_tilda = tanh(preactx) 176 | 177 | # Leaky integration between the new h2 and the 178 | # old h1 computed in line 285 179 | h2 = u2 * h2_tilda + (1. - u2) * h1 180 | h2 = m_[:, None] * h2 + (1. 
- m_)[:, None] * h1 181 | 182 | return h2, ctx_, alpha1.T, alpha2.T 183 | 184 | # Sequences are the input mask and the transformed target embeddings 185 | seqs = [input_mask, state_below_, state_belowx] 186 | 187 | # Create a list of shared parameters for easy parameter passing 188 | shared_vars = [tparams[pp(prefix, 'U')], 189 | tparams[pp(prefix, 'Wc')], 190 | tparams[pp(prefix, 'W_comb_att')], 191 | tparams[pp(prefix, 'U_att')], 192 | tparams[pp(prefix, 'c_att')], 193 | tparams[pp(prefix, 'Ux')], 194 | tparams[pp(prefix, 'Wcx')], 195 | tparams[pp(prefix, 'U_nl')], 196 | tparams[pp(prefix, 'Ux_nl')], 197 | tparams[pp(prefix, 'b_nl')], 198 | tparams[pp(prefix, 'bx_nl')]] 199 | 200 | if one_step: 201 | rval = _step(*(seqs + [init_state, None, None, None, pctx1_, pctx2_, ctx1, ctx2] + shared_vars)) 202 | else: 203 | outputs_info=[init_state, 204 | tensor.alloc(0., n_samples, ctx1.shape[2]), # ctxdim (ctx_) 205 | tensor.alloc(0., n_samples, ctx1.shape[0]), # n_timesteps (alpha1) 206 | tensor.alloc(0., n_samples, ctx2.shape[0])] # n_timesteps (alpha2) 207 | 208 | rval, updates = theano.scan(_step, 209 | sequences=seqs, 210 | outputs_info=outputs_info, 211 | non_sequences=[pctx1_, pctx2_, ctx1, ctx2] + shared_vars, 212 | name=pp(prefix, '_layers'), 213 | n_steps=nsteps, 214 | strict=True) 215 | return rval 216 | 217 | 218 | -------------------------------------------------------------------------------- /nmtpy/models/mnmt_yemb_mulimg.py: -------------------------------------------------------------------------------- 1 | mnmt_trgmul.py -------------------------------------------------------------------------------- /nmtpy/models/rnnlm.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from collections import OrderedDict 3 | import numpy as np 4 | 5 | import theano 6 | import theano.tensor as tensor 7 | from ..layers import tanh, get_new_layer 8 | from ..defaults import INT, FLOAT 9 | from ..nmtutils import load_dictionary, norm_weight 10 | from ..iterators.text import TextIterator 11 | 12 | from .basemodel import BaseModel 13 | 14 | class Model(BaseModel): 15 | def __init__(self, seed, logger, **kwargs): 16 | # Call parent's init first 17 | super(Model, self).__init__(**kwargs) 18 | 19 | # Set the logger 20 | self._logger = logger 21 | 22 | # Load dictionaries 23 | dicts = kwargs.pop('dicts') 24 | 25 | # Let's default to GRU 26 | self.rnn_type = kwargs.pop('rnn_type', 'gru') 27 | 28 | self.src_dict, src_idict = load_dictionary(dicts['src']) 29 | self.n_words = min(self.n_words, len(self.src_dict)) \ 30 | if self.n_words > 0 else len(self.src_dict) 31 | 32 | self.set_options(self.__dict__) 33 | self.src_idict = src_idict 34 | self.set_trng(seed) 35 | self.set_dropout(False) 36 | 37 | def load_valid_data(self): 38 | self.valid_iterator = TextIterator( 39 | batch_size=1, 40 | mask=True, 41 | shuffle_mode=None, 42 | file=self.data['valid_src'], 43 | dict=self.src_dict, 44 | n_words=self.n_words, 45 | name='y') # This is important for the loss to be correctly normalized! 46 | self.valid_iterator.read() 47 | 48 | def load_data(self): 49 | self.train_iterator = TextIterator( 50 | batch_size=self.batch_size, 51 | mask=True, 52 | shuffle_mode=None, # or simple or trglen, not tested in rnnlm. 
53 | file=self.data['train_src'], 54 | dict=self.src_dict, 55 | n_words=self.n_words) 56 | 57 | self.train_iterator.read() 58 | self.load_valid_data() 59 | 60 | def init_params(self): 61 | params = OrderedDict() 62 | 63 | # encoder: ff tanh 64 | ######### 65 | # Forward encoder initializer 66 | #params = get_new_layer(self.enc_type)[0](params, prefix='encoder', nin=self.in_emb_dim, nout=self.rnn_dim) 67 | # embedding weights for encoder 68 | params['W_in_emb'] = norm_weight(self.n_words, self.in_emb_dim) 69 | 70 | # init_state, init_cell 71 | #params = get_new_layer('ff')[0](params, prefix='ff_state', nin=self.in_emb_dim, nout=self.rnn_dim) 72 | 73 | # recurrent layer: in_emb_dim to rnn_dim 74 | params = get_new_layer(self.rnn_type)[0](params, prefix='recurrent', nin=self.in_emb_dim, dim=self.rnn_dim) 75 | 76 | # generate target embedding 77 | params = get_new_layer('ff')[0](params, prefix='ff_logit_rnn' , nin=self.rnn_dim, nout=self.out_emb_dim, ortho=False) 78 | # output to input: out_emb_dim -> out_emb_dim 79 | params = get_new_layer('ff')[0](params, prefix='ff_logit_prev' , nin=self.out_emb_dim, nout=self.out_emb_dim, ortho=False) 80 | # prepare softmax: out_emb_dim -> n_words 81 | params = get_new_layer('ff')[0](params, prefix='ff_logit' , nin=self.out_emb_dim, nout=self.n_words) 82 | 83 | self.initial_params = params 84 | 85 | def build(self): 86 | # description string: #words x #samples 87 | x = tensor.matrix('x', dtype=INT) 88 | x_mask = tensor.matrix('x_mask', dtype=FLOAT) 89 | 90 | # Store tensors 91 | self.inputs = OrderedDict() 92 | self.inputs['x'] = x # Source words 93 | self.inputs['x_mask'] = x_mask # Source mask 94 | 95 | n_timesteps = x.shape[0] 96 | n_samples = x.shape[1] 97 | 98 | # input word embedding 99 | emb = self.tparams['W_in_emb'][x.flatten()] 100 | emb = emb.reshape([n_timesteps, n_samples, self.in_emb_dim]) 101 | #proj = get_new_layer(self.enc_type)[1](self.tparams, emb, prefix='encoder', mask=x_mask) 102 | # prepare outputs 103 | emb_shifted = tensor.zeros_like(emb) 104 | emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1]) 105 | emb = emb_shifted 106 | 107 | # pass through gru layer, recurrence here 108 | proj = get_new_layer(self.rnn_type)[1](self.tparams, emb, 109 | prefix='recurrent', mask=x_mask) 110 | 111 | proj_h = proj[0] 112 | 113 | # compute word probabilities 114 | # internal state of RNN 115 | logit_rnn = get_new_layer('ff')[1](self.tparams, proj_h, prefix='ff_logit_rnn', activ='linear') 116 | # previous output word embedding 117 | logit_prev = get_new_layer('ff')[1](self.tparams, emb, prefix='ff_logit_prev', activ='linear') 118 | logit = tanh(logit_rnn + logit_prev) 119 | 120 | 121 | logit = get_new_layer('ff')[1](self.tparams, logit, prefix='ff_logit', activ='linear') 122 | logit_shp = logit.shape 123 | 124 | # Apply logsoftmax (stable version) 125 | log_probs = -tensor.nnet.logsoftmax(logit.reshape([logit_shp[0]*logit_shp[1], logit_shp[2]])) 126 | 127 | # cost 128 | x_flat = x.flatten() 129 | x_flat_idx = tensor.arange(x_flat.shape[0]) * self.n_words + x_flat 130 | 131 | cost = log_probs.flatten()[x_flat_idx] 132 | cost = cost.reshape([x.shape[0], x.shape[1]]) 133 | cost = (cost * x_mask) 134 | 135 | #f_log_probs_detailled return the log probs array correponding to each word log probs 136 | self.f_log_probs_detailled = theano.function(list(self.inputs.values()), cost) 137 | cost = (cost * x_mask).sum(0) 138 | 139 | #f_log_probs return the sum of the sentence log probs 140 | self.f_log_probs = theano.function(list(self.inputs.values()), 
cost) 141 | 142 | return cost.mean() 143 | 144 | def val_loss(self, sentence=None): 145 | probs = [] 146 | if sentence is None: 147 | # Validation during training 148 | for data in self.valid_iterator: 149 | norm = data['y_mask'].sum(0) 150 | log_probs = sum(self.f_log_probs(*list(data.values()))) / norm 151 | probs.extend(log_probs) 152 | return np.array(probs).mean() 153 | else: 154 | # LM scoring, one sentence at a time 155 | norm = sentence['y_mask'].sum(0) 156 | log_probs = self.f_log_probs_detailled(*list(sentence.values())) 157 | probs.extend(log_probs) 158 | return np.array(probs), norm 159 | 160 | def build_sampler(self, **kwargs): 161 | # x: 1 x 1 162 | y = tensor.vector('y_sampler', dtype=INT) 163 | init_state = tensor.matrix('init_state', dtype=FLOAT) 164 | 165 | # if it's the first word, emb should be all zero 166 | emb = tensor.switch(y[:, None] < 0, 167 | tensor.alloc(0., 1, self.tparams['W_in_emb'].shape[1]), 168 | self.tparams['W_in_emb'][y]) 169 | 170 | # apply one step of gru layer 171 | proj = get_new_layer(self.rnn_type)[1](self.tparams, emb, 172 | prefix='recurrent', 173 | mask=None) 174 | next_state = proj[0] 175 | 176 | # compute the output probability dist and sample 177 | logit_rnn = get_new_layer('ff')[1](self.tparams, next_state, prefix='ff_logit_rnn', activ='linear') 178 | logit_prev = get_new_layer('ff')[1](self.tparams, emb, prefix='ff_logit_prev', activ='linear') 179 | logit = tensor.tanh(logit_rnn+logit_prev) 180 | logit = get_new_layer('ff')[1](self.tparams, logit, prefix='ff_logit', activ='linear') 181 | logit_shp = logit.shape 182 | 183 | # Apply logsoftmax (stable version) 184 | next_log_probs = -tensor.nnet.logsoftmax(logit.reshape([logit_shp[0]*logit_shp[1], logit_shp[2]])) 185 | 186 | # Sample from the softmax distribution 187 | next_probs = tensor.exp(next_log_probs) 188 | next_word = self._trng.multinomial(pvals=next_probs).argmax(1) 189 | 190 | # next word probability 191 | inps = [y, init_state] 192 | outs = [next_log_probs, next_word, next_state] 193 | self.f_next = theano.function(inps, outs, name='f_next') 194 | 195 | def gen_sample(tparams, f_next, options, trng=None, maxlen=30, argmax=False): 196 | sample = [] 197 | sample_score = 0 198 | 199 | # initial token is indicated by a -1 and initial state is zero 200 | next_w = -1 * np.ones((1,)).astype(INT) 201 | next_state = np.zeros((1, options['dim'])).astype(FLOAT) 202 | 203 | for ii in range(maxlen): 204 | inps = [next_w, next_state] 205 | ret = f_next(*inps) 206 | next_p, next_w, next_state = ret[0], ret[1], ret[2] 207 | 208 | if argmax: 209 | nw = next_p[0].argmax() 210 | else: 211 | nw = next_w[0] 212 | sample.append(nw) 213 | sample_score += next_p[0, nw] 214 | if nw == 0: 215 | break 216 | 217 | # Return the sampled word indices and their accumulated negative log-probability 218 | return sample, sample_score 219 | -------------------------------------------------------------------------------- /nmtpy/nmtutils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import pickle 4 | 5 | from collections import OrderedDict 6 | from .defaults import INT, FLOAT 7 | 8 | def invert_dictionary(d): 9 | return OrderedDict([(v,k) for k,v in d.items()]) 10 | 11 | def load_dictionary(fname): 12 | with open(fname, 'rb') as f: 13 | vocab = pickle.load(f) 14 | 15 | return vocab, invert_dictionary(vocab) 16 | 17 | # Function to convert idxs to sentence 18 | def idx_to_sent(ivocab, idxs, join=True): 19 | sent = [] 20 | for widx in idxs: 21 | if widx == 
0: 22 | break 23 | sent.append(ivocab.get(widx, "")) 24 | if join: 25 | return " ".join(sent) 26 | else: 27 | return sent 28 | 29 | # Function to convert sentence to idxs 30 | def sent_to_idx(vocab, tokens, limit=0): 31 | idxs = [] 32 | for word in tokens: 33 | # Get token, 1 if not available 34 | idx = vocab.get(word, 1) 35 | if limit > 0: 36 | idx = idx if idx < limit else 1 37 | idxs.append(idx) 38 | return idxs 39 | 40 | # push parameters to Theano shared variables 41 | def zipp(params, tparams): 42 | for kk, vv in params.items(): 43 | tparams[kk].set_value(vv) 44 | 45 | # pull parameters from Theano shared variables 46 | def unzip(zipped): 47 | new_params = OrderedDict() 48 | for kk, vv in zipped.items(): 49 | new_params[kk] = vv.get_value() 50 | return new_params 51 | 52 | # make prefix-appended name 53 | def pp(prefix, name): 54 | return '%s_%s' % (prefix, name) 55 | 56 | # orthogonal initialization for weights 57 | # Saxe, Andrew M., James L. McClelland, and Surya Ganguli. 58 | # "Exact solutions to the nonlinear dynamics of learning in deep 59 | # linear neural networks." arXiv preprint arXiv:1312.6120 (2013). 60 | def ortho_weight(ndim): 61 | W = np.random.randn(ndim, ndim) 62 | u, s, v = np.linalg.svd(W) 63 | return u.astype(FLOAT) 64 | 65 | # weight initializer, normal by default 66 | def norm_weight(nin, nout, scale=0.01, ortho=True): 67 | if scale == "xavier": 68 | # Glorot, Xavier, and Yoshua Bengio. "Understanding the difficulty of training deep feedforward neural networks." 69 | # International conference on artificial intelligence and statistics. 2010. 70 | # http://machinelearning.wustl.edu/mlpapers/paper_files/AISTATS2010_GlorotB10.pdf 71 | scale = 1. / np.sqrt(nin) 72 | elif scale == "he": 73 | # Claimed necessary for ReLU 74 | # Kaiming He et al. (2015) 75 | # Delving deep into rectifiers: Surpassing human-level performance on 76 | # imagenet classification. arXiv preprint arXiv:1502.01852. 77 | scale = 1. / np.sqrt(nin/2.) 78 | 79 | if nout == nin and ortho: 80 | W = ortho_weight(nin) 81 | else: 82 | W = scale * np.random.randn(nin, nout) 83 | return W.astype(FLOAT) 84 | -------------------------------------------------------------------------------- /nmtpy/optimizers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from collections import OrderedDict 3 | from abc import ABCMeta, abstractmethod 4 | 5 | import numpy as np 6 | 7 | import theano 8 | import theano.tensor as tensor 9 | 10 | from .defaults import FLOAT 11 | from .nmtutils import unzip 12 | 13 | 14 | def get_optimizer(name): 15 | optimizers = { 16 | 'sgd' : SGD, 17 | 'adam' : Adam, 18 | 'rmsprop' : RMSProp, 19 | 'adadelta' : Adadelta, 20 | } 21 | return optimizers[name] 22 | 23 | class Optimizer(object, metaclass=ABCMeta): 24 | def __init__(self, lr0): 25 | # Learning rate shared variable 26 | self.lr = theano.shared(np.float64(lr0).astype(FLOAT), name='lrate') 27 | 28 | # Theano shared variables for accumulator tensors 29 | self.history = OrderedDict() 30 | 31 | # Store grad variables given with get_updates() 32 | self.grads = None 33 | 34 | # Gradient noise per update 35 | self.grad_noise_factor = 0. 
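        # (Descriptive note: this factor stays at 0 unless set_gradient_noise() below is
        #  called; when it is > 0, Adam.get_updates() adds annealed Gaussian noise to the
        #  gradients before computing its moment updates.)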
36 | 37 | def set_trng(self, trng): 38 | """Save Theano RNG.""" 39 | self._trng = trng 40 | 41 | def set_gradient_noise(self, factor): 42 | """Set gradient noise factor.""" 43 | self.grad_noise_factor = factor 44 | 45 | def init_value(self, shape, name, history=None): 46 | """Initialize a variable with zero or last value.""" 47 | value = history[name] if history else np.zeros(shape, dtype=FLOAT) 48 | 49 | # Create the shared variable and store it 50 | self.history[name] = theano.shared(value, name) 51 | return self.history[name] 52 | 53 | def get_history(self): 54 | """Returns a dictionary of numpy tensors for history variables.""" 55 | return unzip(self.history) 56 | 57 | def set_lrate(self, lrate): 58 | """Update the internal lrate.""" 59 | self.lr.set_value(lrate) 60 | 61 | @abstractmethod 62 | def get_updates(self, tparams, grads, history=None): 63 | """Return updates list for params.""" 64 | pass 65 | 66 | 67 | ############################# 68 | # Stochastic Gradient Descent 69 | ############################# 70 | class SGD(Optimizer): 71 | def __init__(self, lr0=0.01): 72 | super(SGD, self).__init__(lr0) 73 | 74 | def get_updates(self, tparams, grads, history=None): 75 | self.grads = grads 76 | updates = [] 77 | for tparam, grad in zip(tparams.values(), grads): 78 | updates.append((tparam, tparam - self.lr * grad)) 79 | 80 | return updates 81 | 82 | ######### 83 | # RMSProp 84 | ######### 85 | class RMSProp(Optimizer): 86 | def __init__(self, lr0=0.001, rho=0.95, eps=1e-6): 87 | super(RMSProp, self).__init__(lr0) 88 | self.rho = rho 89 | self.eps = eps 90 | 91 | def get_updates(self, tparams, grads, history=None): 92 | self.grads = grads 93 | updates = [] 94 | for tparam, grad in zip(tparams.values(), grads): 95 | # Accumulate gradient squares 96 | v = self.init_value(tparam.get_value().shape, '%s_v' % tparam.name, history) 97 | 98 | # rho * past + (1 - rho) * current 99 | v_new = (self.rho * v) + (1. - self.rho) * grad**2 100 | 101 | updates.append((v, v_new)) 102 | updates.append((tparam, tparam - (self.lr * grad / tensor.sqrt(v_new + self.eps)))) 103 | 104 | return updates 105 | 106 | ########## 107 | # Adadelta 108 | ########## 109 | class Adadelta(Optimizer): 110 | def __init__(self, lr0=1., rho=0.95, eps=1e-6): 111 | super(Adadelta, self).__init__(lr0) 112 | self.rho = rho 113 | self.eps = eps 114 | 115 | def get_updates(self, tparams, grads, history=None): 116 | self.grads = grads 117 | updates = [] 118 | for tparam, grad in zip(tparams.values(), grads): 119 | v = self.init_value(tparam.get_value().shape, '%s_v' % tparam.name, history) 120 | u = self.init_value(tparam.get_value().shape, '%s_u' % tparam.name, history) 121 | 122 | # Accumulate gradient squares 123 | # rho * past + (1 - rho) * current 124 | v_new = (self.rho * v) + (1. - self.rho) * grad**2 125 | updates.append((v, v_new)) 126 | 127 | # Update rule 128 | up = (grad * tensor.sqrt(u + self.eps) / tensor.sqrt(v_new + self.eps)) 129 | updates.append((tparam, tparam - self.lr * up)) 130 | 131 | # Accumulate update magnitudes 132 | updates.append((u, self.rho * u + (1. 
- self.rho) * up**2)) 133 | 134 | return updates 135 | 136 | ###### 137 | # Adam 138 | ###### 139 | class Adam(Optimizer): 140 | def __init__(self, *args, lr0=0.0001, b1=0.9, b2=0.999, eps=1e-8): 141 | super().__init__(lr0) 142 | self.b1 = b1 143 | self.b2 = b2 144 | self.eps = eps 145 | 146 | def get_updates(self, tparams, grads, history=None): 147 | self.grads = grads 148 | updates = [] 149 | 150 | # Iteration counter, 'None' for shape creates a scalar 151 | i = self.init_value(None, 'i', history) 152 | 153 | i_t = i + 1. 154 | 155 | # Running learning-rate that will eventually -> lr0 156 | lr_t = self.lr * (tensor.sqrt(1. - self.b2**i_t) / (1. - self.b1**i_t)) 157 | 158 | # Increment iteration counter 159 | updates.append((i, i_t)) 160 | 161 | for tparam, grad in zip(tparams.values(), grads): 162 | m = self.init_value(tparam.get_value().shape, '%s_m' % tparam.name, history) 163 | v = self.init_value(tparam.get_value().shape, '%s_v' % tparam.name, history) 164 | 165 | if self.grad_noise_factor > 0: 166 | # Annealed Gaussian gradient noise ~ N(0, var) with var = factor / (1+t)**0.55 167 | var = self.grad_noise_factor / (i_t**0.55) 168 | noise = self._trng.normal(grad.shape, std=tensor.sqrt(var), dtype=FLOAT) 169 | grad += noise 170 | 171 | m_t = (self.b1 * m) + ((1. - self.b1) * grad) 172 | v_t = (self.b2 * v) + ((1. - self.b2) * grad**2) 173 | p_t = tparam - (lr_t * (m_t / (tensor.sqrt(v_t) + self.eps))) 174 | 175 | # Add updates 176 | updates.append((m, m_t)) 177 | updates.append((v, v_t)) 178 | updates.append((tparam, p_t)) 179 | 180 | return updates 181 | -------------------------------------------------------------------------------- /nmtpy/textutils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Text processing related functions""" 3 | 4 | def reduce_to_best(hyps, scores, n_unique_samples, avoid_unk=True): 5 | """Pick the best of each hypotheses group based on their scores.""" 6 | 7 | # Penalize hyps having <unk> inside (pairs is always defined so avoid_unk=False also works) 8 | pairs = [(p[0], p[1] + (100 if avoid_unk and "<unk>" in p[0][0] else 0)) 9 | for p in zip(hyps, scores)] 10 | 11 | # Group each sample's hypotheses 12 | groups = [pairs[i::n_unique_samples] for i in range(n_unique_samples)] 13 | 14 | # Now each element of "groups" contains e.g. 5 hypotheses and their scores 15 | # Sort them and get the first (smallest score) 16 | return [sorted(g, key=lambda x: x[1])[0][0] for g in groups] 17 | -------------------------------------------------------------------------------- /patches/00-theano-advancedinctensor.patch: -------------------------------------------------------------------------------- 1 | --- theano/sandbox/cuda/opt.py 2017-03-21 18:45:53.532335945 +0100 2 | +++ theano/sandbox/cuda/opt.py 2017-03-21 18:45:53.532335945 +0100 3 | @@ -1111,7 +1111,7 @@ 4 | 5 | gpu_op = GpuAdvancedIncSubtensor1(**props_dict) 6 | else: 7 | - gpu_op = GpuAdvancedIncSubtensor1_dev20(**props_dict) 8 | + gpu_op = GpuAdvancedIncSubtensor1(**props_dict) 9 | return [gpu_op(as_cuda_ndarray_variable(x), 10 | as_cuda_ndarray_variable(y), *coords)] 11 | 12 | @@ -1149,7 +1149,7 @@ 13 | if (compute_capability < 2 or y.ndim != 2 or x.ndim != 2): 14 | gpu_op = GpuAdvancedIncSubtensor1(**node.op._props_dict()) 15 | else: 16 | - gpu_op = GpuAdvancedIncSubtensor1_dev20(**node.op._props_dict()) 17 | + gpu_op = GpuAdvancedIncSubtensor1(**node.op._props_dict()) 18 | return [host_from_gpu(gpu_op(gpu_x, gpu_y, *coords))] 19 | return False 20 | 21 | --- theano/gpuarray/opt.py 2017-03-21 18:42:35.589317691 +0100 22 
| +++ theano/gpuarray/opt.py 2017-03-21 18:42:35.589317691 +0100 23 | @@ -68,8 +68,7 @@ 24 | from .subtensor import (GpuIncSubtensor, GpuSubtensor, 25 | GpuAdvancedSubtensor, 26 | GpuAdvancedSubtensor1, 27 | - GpuAdvancedIncSubtensor1, 28 | - GpuAdvancedIncSubtensor1_dev20) 29 | + GpuAdvancedIncSubtensor1) 30 | from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims 31 | from .reduction import GpuMaxAndArgmax 32 | from .linalg import (GpuCusolverSolve, cusolver_available) 33 | @@ -1056,7 +1055,7 @@ 34 | if compute_capability >= 2 and x.ndim == 1 and y.ndim == 0: 35 | x = x.dimshuffle(0, 'x') 36 | y = y.dimshuffle('x', 'x') 37 | - ret = GpuAdvancedIncSubtensor1_dev20( 38 | + ret = GpuAdvancedIncSubtensor1( 39 | set_instead_of_inc=set_instead_of_inc)(x, y, ilist) 40 | ret = GpuDimShuffle(ret.type.broadcastable, [0])(ret) 41 | return ret 42 | @@ -1064,15 +1063,14 @@ 43 | return GpuAdvancedIncSubtensor1( 44 | set_instead_of_inc=set_instead_of_inc) 45 | else: 46 | - return GpuAdvancedIncSubtensor1_dev20( 47 | + return GpuAdvancedIncSubtensor1( 48 | set_instead_of_inc=set_instead_of_inc) 49 | 50 | 51 | @register_inplace() 52 | -@local_optimizer([GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1_dev20]) 53 | +@local_optimizer([GpuAdvancedIncSubtensor1]) 54 | def local_advincsub1_gpua_inplace(node): 55 | - if isinstance(node.op, (GpuAdvancedIncSubtensor1, 56 | - GpuAdvancedIncSubtensor1_dev20)): 57 | + if isinstance(node.op, (GpuAdvancedIncSubtensor1,)): 58 | if not node.op.inplace: 59 | return [node.op.clone_inplace()(*node.inputs)] 60 | 61 | -------------------------------------------------------------------------------- /scripts/README.md: -------------------------------------------------------------------------------- 1 | Scripts 2 | ------- 3 | 4 | - `get-meteor-data.sh`: Used to download METEOR paraphrases prior to `nmtpy` installation. 5 | -------------------------------------------------------------------------------- /scripts/get-meteor-data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PREFIX="https://raw.githubusercontent.com/cmu-mtlab/meteor/master/data/paraphrase" 4 | SAVEDIR="nmtpy/external/data" 5 | 6 | for lang in cz de en es fr ru; do 7 | if [ ! -f "${SAVEDIR}/paraphrase-${lang}.gz" ]; then 8 | echo "Downloading $lang paraphrase data..." 9 | curl "${PREFIX}-${lang}.gz" -o "${SAVEDIR}/paraphrase-${lang}.gz" 10 | fi 11 | done 12 | -------------------------------------------------------------------------------- /scripts/modify-npz: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import sys 6 | import numpy as np 7 | 8 | from ast import literal_eval 9 | from collections import OrderedDict 10 | 11 | from nmtpy.sysutils import * 12 | 13 | os.environ['THEANO_FLAGS'] = 'device=cpu' 14 | 15 | def parse_value(value): 16 | try: 17 | return literal_eval(value) 18 | except ValueError as ve: 19 | return value 20 | 21 | if __name__ == '__main__': 22 | # Change a property inside the 'opts' dictionary 23 | # of npz files and write it back. 24 | # Useful to port old npz files to new nmtpy versions. 
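    # Illustrative invocation (file name and value are made up):
    #     modify-npz model_type:attention old-checkpoint.npz
    # Arguments containing ':' are parsed as key:value modifications; all other
    # arguments are treated as .npz checkpoints to load, modify and rewrite.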
25 | 26 | modifs, files = [], [] 27 | 28 | for param in sys.argv[1:]: 29 | if ":" in param: 30 | modifs.append(param.split(':')) 31 | else: 32 | files.append(param) 33 | 34 | for npzf in files: 35 | # Load the file 36 | npz = np.load(npzf) 37 | 38 | # Get the dict 39 | opts = get_model_options(npz) 40 | params = get_param_dict(npz) 41 | 42 | newfilename = npzf 43 | 44 | for key, value in modifs: 45 | opts[key] = parse_value(value) 46 | print('%s -> %s' % (key, opts[key])) 47 | 48 | if key == 'model_type': 49 | # If model_type changed, change the filename as well 50 | oldmodel, rest = npzf.split('-', 1) 51 | newfilename = '%s-%s' % (value, rest) 52 | 53 | print('Writing %s' % newfilename) 54 | 55 | params['opts'] = opts 56 | 57 | np.savez(newfilename, **params) 58 | -------------------------------------------------------------------------------- /scripts/prep-charnmt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Example script to show how to prepare 4 | # a char2char dataset from raw text corpora. 5 | # You need to use filter:char2words in .conf 6 | # to correctly post-process hypotheses after beam-search 7 | 8 | # Pipeline: 9 | # lowercase.perl -> word2char (sed) -> trim whitespace (awk) 10 | 11 | datadir=../ 12 | SL=en 13 | TL=de 14 | 15 | for dataset in train val test2016 test2017; do 16 | for lang in $SL $TL; do 17 | inputfile=${datadir}/${dataset}.${lang} 18 | if [ -f $inputfile ]; then 19 | echo $dataset, $lang 20 | lowercase.perl -l $SL < $inputfile | sed -e "s/./& /g;s/\ \ \ / /g" \ 21 | | awk '{$1=$1};1' > ${dataset}.lc.char.${lang} 22 | fi 23 | done 24 | done 25 | 26 | nmt-build-dict train* 27 | -------------------------------------------------------------------------------- /scripts/snaprun: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Usage: 4 | # 1. Put this script under your $USER/bin or $USER/.local/bin or whatever 5 | # 2. Set NMTPY to your nmtpy copy where you do your development 6 | # 3. Start training by prefixing your command with snaprun: 7 | # $ snaprun nmt-train -c foobar.conf 8 | 9 | # Source tree to use as nmtpy unless NMTPY= is set from cmdline 10 | NMTPY=${NMTPY:=~/git/nmtpy-merge} 11 | 12 | # Enter to $NMTPY 13 | pushd $NMTPY 14 | 15 | # Take the last commit SHA1 16 | SHA=`git rev-parse --short HEAD` 17 | 18 | SUFFIX=`uuidgen -t` 19 | 20 | # Export to the following folder 21 | SNAPSHOT="/tmp/nmtpy-${USER}-${SHA}-${SUFFIX}" 22 | 23 | # Take a snapshot of the source tree by cleaning unnecessary stuff 24 | rsync --exclude=*egg-info --exclude=*pycache* --exclude "*.git*" -a . $SNAPSHOT 25 | 26 | # Show code folder 27 | echo "Took snapshot under $SNAPSHOT" 28 | 29 | # Set PATH to use the new /bin to find nmt-* 30 | # and override default python search path list by giving this 31 | # new path as the first item. 
32 | export PATH=${SNAPSHOT}/bin:${PATH} 33 | export PYTHONPATH=$SNAPSHOT 34 | 35 | popd 36 | # Execute the given command 37 | $@ 38 | 39 | # Remove the folder 40 | rm -rf $SNAPSHOT 41 | -------------------------------------------------------------------------------- /scripts/update-npz: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import sys 6 | import numpy as np 7 | 8 | from ast import literal_eval 9 | from collections import OrderedDict 10 | 11 | from nmtpy.sysutils import * 12 | 13 | os.environ['THEANO_FLAGS'] = 'device=cpu' 14 | 15 | def parse_value(value): 16 | try: 17 | return literal_eval(value) 18 | except ValueError as ve: 19 | return value 20 | 21 | if __name__ == '__main__': 22 | # Update nmtpy model checkpoints to recent format 23 | # to fix problems of inference. 24 | 25 | for fname in sys.argv[1:]: 26 | do_write = False 27 | 28 | # Open the file 29 | npzf = np.load(fname) 30 | 31 | # Get option dictionary 32 | opts = npzf['opts'].tolist() 33 | tparams = OrderedDict() 34 | 35 | if 'tparams' in npzf.files: 36 | # Old format of saving parameters 37 | do_write = True 38 | tparams = npzf['tparams'].tolist() 39 | else: 40 | for key in npzf.files: 41 | if key != 'opts': 42 | tparams[key] = npzf[key] 43 | 44 | tparams['opts'] = opts 45 | 46 | # Close the file 47 | npzf.close() 48 | 49 | new_fname = fname 50 | ############### 51 | 52 | # attention_singledict is now included in main model 53 | if opts['model_type'] == 'attention_singledict': 54 | do_write = True 55 | opts['model_type'] = 'attention' 56 | opts['tied_emb'] = '3way' 57 | del opts['tied_trg_emb'] 58 | 59 | new_fname = fname.replace('attention_singledict', 'attention') 60 | else: 61 | if 'tied_trg_emb' in opts: 62 | opts['tied_emb'] = '2way' 63 | del opts['tied_trg_emb'] 64 | do_write = True 65 | 66 | if do_write: 67 | print('Writing %s' % new_fname) 68 | np.savez(new_fname, **tparams) 69 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import sys 4 | 5 | from setuptools import setup 6 | import nmtpy 7 | 8 | # Install pycocoevalcap metric scorers as well 9 | pycocometrics = ['bleu', 'meteor', 'cider', 'rouge'] 10 | pycocopackages = ['nmtpy.cocoeval.%s' % m for m in pycocometrics] 11 | 12 | if 'install' in sys.argv or 'develop' in sys.argv: 13 | if not os.path.exists('nmtpy/external/data/paraphrase-en.gz'): 14 | print('You need to run scripts/get-meteor-data.sh first.') 15 | sys.exit(1) 16 | 17 | setup( 18 | name='nmtpy', 19 | version=nmtpy.__version__, 20 | description='Neural Machine Translation Framework in Python', 21 | url='https://github.com/lium-lst/nmtpy', 22 | author='Ozan Çağlayan', 23 | author_email='ozancag@gmail.com', 24 | license='MIT', 25 | classifiers=[ 26 | 'Development Status :: 5 - Production/Stable', 27 | 'Intended Audience :: Science/Research', 28 | 'Topic :: Scientific/Engineering', 29 | 'License :: OSI Approved :: MIT License', 30 | 'Programming Language :: Python :: 3 :: Only', 31 | 'Programming Language :: Python :: 3.5', 32 | 'Operating System :: POSIX', 33 | ], 34 | keywords='nmt neural-mt translation deep-learning', 35 | packages=['nmtpy', 'nmtpy.models', 'nmtpy.iterators', 'nmtpy.metrics', 'nmtpy.cocoeval'] + pycocopackages, 36 | package_data={'' : ['external/meteor-1.5.jar', 'external/data/*gz', 
'external/multi-bleu.perl']}, # data files 37 | install_requires=[ 38 | 'numpy', 39 | 'theano', 40 | ], 41 | scripts=[ 42 | 'bin/nmt-train', 43 | 'bin/nmt-extract', 44 | 'bin/nmt-rescore', 45 | 'bin/nmt-translate', 46 | 'bin/nmt-translate-factors', # Factored NMT variant. 47 | 'bin/nmt-build-dict', 48 | 'bin/nmt-coco-metrics', 49 | 'bin/nmt-bpe-apply', 50 | 'bin/nmt-bpe-learn', 51 | ], 52 | zip_safe=False) 53 | --------------------------------------------------------------------------------