├── README.md
├── onmt
    ├── EarlyStop.py
    ├── Loss.py
    ├── ModelConstructor.py
    ├── Models.py
    ├── Optim.py
    ├── Trainer.py
    ├── TrainerMultimodal.py
    ├── Utils.py
    ├── VILoss.py
    ├── VI_Model1.py
    ├── __init__.py
    ├── __pycache__
    │   ├── EarlyStop.cpython-36.pyc
    │   ├── Loss.cpython-36.pyc
    │   ├── ModelConstructor.cpython-36.pyc
    │   ├── Models.cpython-36.pyc
    │   ├── Optim.cpython-36.pyc
    │   ├── Trainer.cpython-36.pyc
    │   ├── TrainerMultimodal.cpython-36.pyc
    │   ├── Utils.cpython-36.pyc
    │   ├── VILoss.cpython-36.pyc
    │   ├── VI_Model1.cpython-36.pyc
    │   └── __init__.cpython-36.pyc
    ├── io
    │   ├── AudioDataset.py
    │   ├── DatasetBase.py
    │   ├── IO.py
    │   ├── ImageDataset.py
    │   ├── TextDataset.py
    │   ├── __init__.py
    │   └── __pycache__
    │   │   ├── AudioDataset.cpython-36.pyc
    │   │   ├── DatasetBase.cpython-36.pyc
    │   │   ├── IO.cpython-36.pyc
    │   │   ├── ImageDataset.cpython-36.pyc
    │   │   ├── TextDataset.cpython-36.pyc
    │   │   └── __init__.cpython-36.pyc
    ├── modules
    │   ├── AudioEncoder.py
    │   ├── Conv2Conv.py
    │   ├── ConvMultiStepAttention.py
    │   ├── CopyGenerator.py
    │   ├── Dists.py
    │   ├── Embeddings.py
    │   ├── Gate.py
    │   ├── GlobalAttention.py
    │   ├── ImageEncoder.py
    │   ├── MultiHeadedAttn.py
    │   ├── NormalVariationalEncoder.py
    │   ├── SRU.py
    │   ├── StackedRNN.py
    │   ├── StructuredAttention.py
    │   ├── Transformer.py
    │   ├── UtilClass.py
    │   ├── WeightNorm.py
    │   ├── WordDropout.py
    │   ├── __init__.py
    │   └── __pycache__
    │   │   ├── AudioEncoder.cpython-36.pyc
    │   │   ├── Conv2Conv.cpython-36.pyc
    │   │   ├── ConvMultiStepAttention.cpython-36.pyc
    │   │   ├── CopyGenerator.cpython-36.pyc
    │   │   ├── Dists.cpython-36.pyc
    │   │   ├── Embeddings.cpython-36.pyc
    │   │   ├── Gate.cpython-36.pyc
    │   │   ├── GlobalAttention.cpython-36.pyc
    │   │   ├── ImageEncoder.cpython-36.pyc
    │   │   ├── MultiHeadedAttn.cpython-36.pyc
    │   │   ├── NormalVariationalEncoder.cpython-36.pyc
    │   │   ├── SRU.cpython-36.pyc
    │   │   ├── StackedRNN.cpython-36.pyc
    │   │   ├── StructuredAttention.cpython-36.pyc
    │   │   ├── Transformer.cpython-36.pyc
    │   │   ├── UtilClass.cpython-36.pyc
    │   │   ├── WeightNorm.cpython-36.pyc
    │   │   ├── WordDropout.cpython-36.pyc
    │   │   └── __init__.cpython-36.pyc
    └── translate
    │   ├── Beam.py
    │   ├── Translation.py
    │   ├── Translator.py
    │   ├── TranslatorMultimodalVI.py
    │   ├── __init__.py
    │   └── __pycache__
    │       ├── Beam.cpython-36.pyc
    │       ├── Translation.cpython-36.pyc
    │       ├── Translator.cpython-36.pyc
    │       ├── TranslatorMultimodalVI.cpython-36.pyc
    │       └── __init__.cpython-36.pyc
├── opts.py
├── preprocess.py
├── requirements.txt
├── run_additional_data.sh
├── run_translated_m30k_only.sh
├── setup.py
├── tools
    ├── multi-bleu.perl
    └── nonbreaking_prefixes
    │   ├── README.txt
    │   ├── nonbreaking_prefix.ca
    │   ├── nonbreaking_prefix.cs
    │   ├── nonbreaking_prefix.de
    │   ├── nonbreaking_prefix.el
    │   ├── nonbreaking_prefix.en
    │   ├── nonbreaking_prefix.es
    │   ├── nonbreaking_prefix.fi
    │   ├── nonbreaking_prefix.fr
    │   ├── nonbreaking_prefix.ga
    │   ├── nonbreaking_prefix.hu
    │   ├── nonbreaking_prefix.is
    │   ├── nonbreaking_prefix.it
    │   ├── nonbreaking_prefix.lt
    │   ├── nonbreaking_prefix.lv
    │   ├── nonbreaking_prefix.nl
    │   ├── nonbreaking_prefix.pl
    │   ├── nonbreaking_prefix.ro
    │   ├── nonbreaking_prefix.ru
    │   ├── nonbreaking_prefix.sk
    │   ├── nonbreaking_prefix.sl
    │   ├── nonbreaking_prefix.sv
    │   ├── nonbreaking_prefix.ta
    │   ├── nonbreaking_prefix.yue
    │   └── nonbreaking_prefix.zh
├── train.py
├── train_mm_vi_model1.py
├── translate.py
└── translate_mm_vi.py


/README.md:
--------------------------------------------------------------------------------
 1 | # variational\_mmt
 2 | 
 3 | ## TL-DR
 4 | 
 5 | This is the code base one should use to reproduce results reported in the ACL 2019 paper [Latent variable model for multi-modal translation](https://www.aclweb.org/anthology/papers/P/P19/P19-1642/).
 6 | We propose a conditional variational auto-encoder model for multi-modal translation,
 7 | i.e. to model the interaction between visual and textual features for multi-modal neural machine translation (MMT) through a latent variable model.
 8 | This latent variable can be seen as a multi-modal stochastic embedding of an image and its description in a foreign language.
 9 | It is used in a target-language decoder and also to predict image features.
10 | Importantly, our model formulation utilises visual and textual inputs during training but does not require that images be available at test time.
11 | Please refer to the paper for more details.
12 | 
13 | ## Before you start
14 | 
15 | Before you start, please ensure that:
16 | 
17 | - You have installed the right version of PyTorch and all the dependencies according to `requirements.txt`;
18 | - If you want to use your own version of the Multi30k data set, that you changed the respective variable names in the `run_*.sh` files as required.
19 | 
20 | If you want to use the exact version of the Multi30k data set used in the paper:
21 | 
22 | - download a tarball containing all files (PyTorch binaries and image features) for the translated Multi30k data set experiments [here](https://surfdrive.surf.nl/files/index.php/s/VmqtrhTipDv2djx). The tarball includes:
23 |     - `flickr30k_train_resnet50_cnn_features.hdf5`: training set image features, 29K examples.
24 |     - `flickr30k_valid_resnet50_cnn_features.hdf5`: validation set image features, 1,014 examples.
25 |     - `flickr30k_test_resnet50_cnn_features.hdf5`: 2016 test set image features, 1K examples.
26 |     - `flickr30k_test_2017_flickr_resnet50_cnn_features.hdf5`: 2017 test set image features, 1K examples.
27 |     - `flickr30k_test_2017_mscoco_resnet50_cnn_features.hdf5`: ambiguous MSCOCO test set image features, 461 examples.
28 |     - `m30k.{train,valid}.1.pt`, `m30k.vocab.pt`: PyTorch binaries containing sentences in training/validation sets and vocabulary.
29 |     - `{train,val,test_2016_flickr,test_2017_flickr,test_2017_mscoco}.lc.norm.tok.bpe-en-de-30000.{en,de}`: text files containing train/validation/test sets.
30 | - download a tarball containing all files (PyTorch binaries and image features) for the backtranslated comparable + translated Multi30k data set experiments [here](https://surfdrive.surf.nl/files/index.php/s/opHKSCmeJsGtL9Q). The tarball includes:
31 |     - `flickr30k_train_translated-5x-comparable-1x_resnet50_cnn_features.shuffled.hdf5`: this file contains features for 290,000 images, i.e. 29K translated Multi30k images five times each (145K) and 29K comparable Multi30k images also five times each (145K). We upsample images for the translated Multi30k to keep them about half of the images used when training the model in this setting.
32 |     - `concat-multi30k-translational-5times-comparable-1time-shuffled_correct.{train,valid}.1.pt`, `concat-multi30k-translational-5times-comparable-1time-shuffled_correct.vocab.pt`: PyTorch binaries containing sentences in training/validation sets and vocabulary.
33 | - ensure that variable names are correct in the corresponding `run_translated_m30k_only.sh` and `run_additional_data.sh` files. Image features were extracted as described in the paper, i.e. using a pretrained ResNet-50 convolutional neural network.
34 | 
35 | To train a model using only the translated Multi30k, you will use the shell script `run_translated_m30k_only.sh`; to train a model using the back-translated comparable + translated Multi30k, you will use `run_additional_data.sh`. However, before you run these scripts:
36 | - change the `DATA_PATH` and `MODEL_PATH` variables (in both `run_translated_m30k_only.sh` and `run_additional_data.sh`) so that they point, respectively, to the directory containing the training data (decompressed from the tarball mentioned above) and to the directory where you wish to store model checkpoints.
37 | 
38 | ## Training
39 | 
40 | To see how to call the `train_mm_vi_model1.py` script, please refer to the `run_*.sh` scripts or run `train_mm_vi_model1.py --help`.
41 | 
42 | ### Training a model on the translated Multi30k
43 | 
44 | To train a model using the Translated Multi30k data set only (~29K source/target/image triplets), run:
45 | ```bash
46 | run_translated_m30k_only.sh
47 | ```
48 | 
49 | This bash script assumes you have a GPU with at least 12GB of memory available (e.g. TitanX, 1080Ti) and sets all the hyperparameters needed to reproduce the results in the paper.
50 | 
51 | ### Training a model on the back-translated comparable and translated Multi30k
52 | 
53 | To train a model using the back-translated comparable Multi30k in addition to the translated Multi30k data set (total of ~145K source/target/image triplets), simply run:
54 | ```bash
55 | run_additional_data.sh
56 | ```
57 | 
58 | This bash script also assumes you have a GPU with at least 12GB of memory available (e.g. TitanX, 1080Ti) and sets all the hyperparameters needed to reproduce the results in the paper.
59 | 
60 | ## Decoding a translation
61 | 
62 | By calling the bash scripts above, you will not only train, but after finishing training will also decode the Multi30k's validation, test 2016, test 2017, and the ambiguous MSCOCO 2017 test set.
63 | By default, the model used to translate is the one selected according to best BLEU4 scores on the validation set.
64 | 
65 | To see how to use the `translate_mm_vi.py` script directly, please refer to the `run_*.sh` scripts or call `translate_mm_vi.py --help`.
66 | 
67 | ## Citation
68 | 
69 | If you use this code base, please consider citing our paper.
70 | 
71 |     @inproceedings{calixto-etal-2019-latent,
72 |         title = "Latent Variable Model for Multi-modal Translation",
73 |         author = "Calixto, Iacer and Rios, Miguel  and Aziz, Wilker",
74 |         booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics",
75 |         month = jul,
76 |         year = "2019",
77 |         address = "Florence, Italy",
78 |         publisher = "Association for Computational Linguistics",
79 |         url = "https://www.aclweb.org/anthology/P19-1642",
80 |         pages = "6392--6405",
81 |     }
82 | 
83 | 


--------------------------------------------------------------------------------
/onmt/Optim.py:
--------------------------------------------------------------------------------
  1 | import torch.optim as optim
  2 | from torch.nn.utils import clip_grad_norm
  3 | 
  4 | 
  5 | class Optim(object):
  6 |     """
  7 |     Controller class for optimization. Mostly a thin
  8 |     wrapper for `optim`, but also useful for implementing
  9 |     rate scheduling beyond what is currently available.
 10 |     Also implements necessary methods for training RNNs such
 11 |     as grad manipulations.
 12 | 
 13 |     Args:
 14 |       method (:obj:`str`): one of [sgd, adagrad, adadelta, adam]
 15 |       lr (float): learning rate
 16 |       lr_decay (float, optional): learning rate decay multiplier
 17 |       start_decay_at (int, optional): epoch to start learning rate decay
 18 |       beta1, beta2 (float, optional): parameters for adam
 19 |       adagrad_accum (float, optional): initialization parameter for adagrad
 20 |       decay_method (str, option): custom decay options
 21 |       warmup_steps (int, option): parameter for `noam` decay
 22 |       model_size (int, option): parameter for `noam` decay
 23 |     """
 24 |     # We use the default parameters for Adam that are suggested by
 25 |     # the original paper https://arxiv.org/pdf/1412.6980.pdf
 26 |     # These values are also used by other established implementations,
 27 |     # e.g. https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer
 28 |     # https://keras.io/optimizers/
 29 |     # Recently there are slightly different values used in the paper
 30 |     # "Attention is all you need"
 31 |     # https://arxiv.org/pdf/1706.03762.pdf, particularly the value beta2=0.98
 32 |     # was used there however, beta2=0.999 is still arguably the more
 33 |     # established value, so we use that here as well
 34 |     def __init__(self, method, lr, max_grad_norm,
 35 |                  lr_decay=1, start_decay_at=None,
 36 |                  beta1=0.9, beta2=0.999,
 37 |                  adagrad_accum=0.0,
 38 |                  decay_method=None,
 39 |                  warmup_steps=4000,
 40 |                  model_size=None):
 41 |         self.last_ppl = None
 42 |         self.lr = lr
 43 |         self.original_lr = lr
 44 |         self.max_grad_norm = max_grad_norm
 45 |         self.method = method
 46 |         self.lr_decay = lr_decay
 47 |         self.start_decay_at = start_decay_at
 48 |         self.start_decay = False
 49 |         self._step = 0
 50 |         self.betas = [beta1, beta2]
 51 |         self.adagrad_accum = adagrad_accum
 52 |         self.decay_method = decay_method
 53 |         self.warmup_steps = warmup_steps
 54 |         self.model_size = model_size
 55 | 
 56 |     def set_parameters(self, params):
 57 |         self.params = [p for p in params if p.requires_grad]
 58 |         if self.method == 'sgd':
 59 |             self.optimizer = optim.SGD(self.params, lr=self.lr)
 60 |         elif self.method == 'adagrad':
 61 |             self.optimizer = optim.Adagrad(self.params, lr=self.lr)
 62 |             for group in self.optimizer.param_groups:
 63 |                 for p in group['params']:
 64 |                     self.optimizer.state[p]['sum'] = self.optimizer\
 65 |                         .state[p]['sum'].fill_(self.adagrad_accum)
 66 |         elif self.method == 'adadelta':
 67 |             self.optimizer = optim.Adadelta(self.params, lr=self.lr)
 68 |         elif self.method == 'adam':
 69 |             self.optimizer = optim.Adam(self.params, lr=self.lr,
 70 |                                         betas=self.betas, eps=1e-9)
 71 |         else:
 72 |             raise RuntimeError("Invalid optim method: " + self.method)
 73 | 
 74 |     def _set_rate(self, lr):
 75 |         self.lr = lr
 76 |         self.optimizer.param_groups[0]['lr'] = self.lr
 77 | 
 78 |     def step(self):
 79 |         """Update the model parameters based on current gradients.
 80 | 
 81 |         Optionally, will employ gradient modification or update learning
 82 |         rate.
 83 |         """
 84 |         self._step += 1
 85 | 
 86 |         # Decay method used in tensor2tensor.
 87 |         if self.decay_method == "noam":
 88 |             self._set_rate(
 89 |                 self.original_lr *
 90 |                 (self.model_size ** (-0.5) *
 91 |                  min(self._step ** (-0.5),
 92 |                      self._step * self.warmup_steps**(-1.5))))
 93 | 
 94 |         if self.max_grad_norm:
 95 |             clip_grad_norm(self.params, self.max_grad_norm)
 96 |         self.optimizer.step()
 97 | 
 98 |     def update_learning_rate(self, ppl, epoch):
 99 |         """
100 |         Decay learning rate if val perf does not improve
101 |         or we hit the start_decay_at limit.
102 |         """
103 | 
104 |         if self.start_decay_at is not None and epoch >= self.start_decay_at:
105 |             self.start_decay = True
106 |         if self.last_ppl is not None and ppl > self.last_ppl:
107 |             self.start_decay = True
108 | 
109 |         if self.start_decay:
110 |             self.lr = self.lr * self.lr_decay
111 |             print("Decaying learning rate to %g" % self.lr)
112 | 
113 |         self.last_ppl = ppl
114 |         self.optimizer.param_groups[0]['lr'] = self.lr
115 | 


--------------------------------------------------------------------------------
/onmt/Utils.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import torch
 3 | 
# Paths to the external scoring tools: the multi-bleu.perl script and the
# meteor-1.5.jar archive, respectively.
# NOTE(review): BLEU_SCRIPT is a relative path, so it is resolved against the
# current working directory; METEOR_SCRIPT is an absolute, machine-specific
# path and has to be edited before this module can be used elsewhere.
BLEU_SCRIPT='%s/multi-bleu.perl' % "tools"
METEOR_SCRIPT='%s/meteor-1.5.jar' % "/misc/vlgscratch4/ChoGroup/icalixto/tools/meteor-1.5"

# Both checks run at import time, so importing this module fails with an
# AssertionError when either file is missing (unless asserts are disabled).
assert( os.path.isfile(BLEU_SCRIPT) ), 'ERROR: BLEU parl script not found!'
assert( os.path.isfile(METEOR_SCRIPT) ), 'ERROR: METEOR jar not found!'

# list with accepted model types
MODEL_TYPES = ["vi-model1"]
13 | 
def aeq(*args):
    """
    Check that every argument equals the first one.

    Raises an AssertionError naming all arguments as soon as one of
    them differs from the first.
    """
    remaining = iter(args)
    reference = next(remaining)
    for candidate in remaining:
        assert candidate == reference, \
            "Not all arguments have the same value: " + str(args)
22 | 
def sequence_mask(lengths, max_len=None):
    """
    Build a mask that is true at position (i, j) when j < lengths[i].

    Args:
        lengths: tensor holding one length per sequence.
        max_len: number of mask columns; when falsy, the largest value
            in `lengths` is used.

    Returns:
        A mask with one row per entry of `lengths` and `max_len` columns.
    """
    if not max_len:
        max_len = lengths.max()
    n_rows = lengths.numel()
    # One row of positions 0..max_len-1, in the same type as `lengths`.
    positions = torch.arange(0, max_len).type_as(lengths)
    position_grid = positions.repeat(n_rows, 1)
    return position_grid.lt(lengths.unsqueeze(1))
33 | 
def use_gpu(opt):
    """
    Tell whether the options object requests a GPU.

    True when `opt.gpuid` exists and is non-empty, or when `opt.gpu`
    exists and is greater than -1.
    """
    if hasattr(opt, 'gpuid') and len(opt.gpuid) > 0:
        return True
    return hasattr(opt, 'gpu') and opt.gpu > -1
37 | 


--------------------------------------------------------------------------------
/onmt/__init__.py:
--------------------------------------------------------------------------------
 1 | import onmt.io
 2 | import onmt.Loss
 3 | import onmt.VILoss
 4 | from onmt.Trainer import Trainer, Statistics
 5 | from onmt.TrainerMultimodal import TrainerMultimodal, VIStatistics
 6 | from onmt.Optim import Optim
 7 | import onmt.Models
 8 | import onmt.VI_Model1
 9 | import onmt.translate
10 | import onmt.EarlyStop 
11 | 
# Public API of the package (originally added for flake8 compatibility).
# `__all__` must list attribute *names* as strings: putting the module and
# class objects themselves in the list makes `from onmt import *` fail with
# a TypeError.
__all__ = ["Loss", "Models",
           "Trainer", "TrainerMultimodal",
           "Optim", "Statistics", "io", "translate"]

__all__ += ["VILoss", "VIStatistics",
            "VI_Model1", "EarlyStop"]
19 | 


--------------------------------------------------------------------------------
/onmt/__pycache__/EarlyStop.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/__pycache__/EarlyStop.cpython-36.pyc


--------------------------------------------------------------------------------
/onmt/__pycache__/Loss.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/__pycache__/Loss.cpython-36.pyc


--------------------------------------------------------------------------------
/onmt/__pycache__/ModelConstructor.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/__pycache__/ModelConstructor.cpython-36.pyc


--------------------------------------------------------------------------------
/onmt/__pycache__/Models.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/__pycache__/Models.cpython-36.pyc


--------------------------------------------------------------------------------
/onmt/__pycache__/Optim.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/__pycache__/Optim.cpython-36.pyc


--------------------------------------------------------------------------------
/onmt/__pycache__/Trainer.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/__pycache__/Trainer.cpython-36.pyc


--------------------------------------------------------------------------------
/onmt/__pycache__/TrainerMultimodal.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/__pycache__/TrainerMultimodal.cpython-36.pyc


--------------------------------------------------------------------------------
/onmt/__pycache__/Utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/__pycache__/Utils.cpython-36.pyc


--------------------------------------------------------------------------------
/onmt/__pycache__/VILoss.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/__pycache__/VILoss.cpython-36.pyc


--------------------------------------------------------------------------------
/onmt/__pycache__/VI_Model1.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/__pycache__/VI_Model1.cpython-36.pyc


--------------------------------------------------------------------------------
/onmt/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/onmt/io/DatasetBase.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | from itertools import chain
  4 | import torchtext
  5 | 
  6 | 
# Special vocabulary symbols shared by the dataset classes.
PAD_WORD = '<blank>'  # padding token
UNK_WORD = '<unk>'  # unknown-word token
UNK = 0  # presumably the vocabulary index of UNK_WORD -- TODO confirm
BOS_WORD = '<s>'  # beginning-of-sequence token
EOS_WORD = '</s>'  # end-of-sequence token
 12 | 
 13 | 
 14 | class ONMTDatasetBase(torchtext.data.Dataset):
 15 |     """
 16 |     A dataset basically supports iteration over all the examples
 17 |     it contains. We currently have 3 datasets inheriting this base
 18 |     for 3 types of corpus respectively: "text", "img", "audio".
 19 | 
 20 |     Internally it initializes an `torchtext.data.Dataset` object with
 21 |     the following attributes:
 22 | 
 23 |      `examples`: a sequence of `torchtext.data.Example` objects.
 24 |      `fields`: a dictionary associating str keys with `torchtext.data.Field`
 25 |         objects, and not necessarily having the same keys as the input fields.
 26 |     """
 27 |     def __getstate__(self):
 28 |         return self.__dict__
 29 | 
 30 |     def __setstate__(self, d):
 31 |         self.__dict__.update(d)
 32 | 
 33 |     def __reduce_ex__(self, proto):
 34 |         "This is a hack. Something is broken with torch pickle."
 35 |         return super(ONMTDatasetBase, self).__reduce_ex__()
 36 | 
 37 |     def load_fields(self, vocab_dict):
 38 |         """ Load fields from vocab.pt, and set the `fields` attribute.
 39 | 
 40 |         Args:
 41 |             vocab_dict (dict): a dict of loaded vocab from vocab.pt file.
 42 |         """
 43 |         from onmt.io.IO import load_fields_from_vocab
 44 | 
 45 |         fields = load_fields_from_vocab(vocab_dict.items(), self.data_type)
 46 |         self.fields = dict([(k, f) for (k, f) in fields.items()
 47 |                            if k in self.examples[0].__dict__])
 48 | 
 49 |     @staticmethod
 50 |     def extract_text_features(tokens):
 51 |         """
 52 |         Args:
 53 |             tokens: A list of tokens, where each token consists of a word,
 54 |                 optionally followed by u"￨"-delimited features.
 55 |         Returns:
 56 |             A sequence of words, a sequence of features, and num of features.
 57 |         """
 58 |         if not tokens:
 59 |             return [], [], -1
 60 | 
 61 |         split_tokens = [token.split(u"￨") for token in tokens]
 62 |         split_tokens = [token for token in split_tokens if token[0]]
 63 |         token_size = len(split_tokens[0])
 64 | 
 65 |         assert all(len(token) == token_size for token in split_tokens), \
 66 |             "all words must have the same number of features"
 67 |         words_and_features = list(zip(*split_tokens))
 68 |         words = words_and_features[0]
 69 |         features = words_and_features[1:]
 70 | 
 71 |         return words, features, token_size - 1
 72 | 
 73 |     # Below are helper functions for intra-class use only.
 74 | 
 75 |     def _join_dicts(self, *args):
 76 |         """
 77 |         Args:
 78 |             dictionaries with disjoint keys.
 79 | 
 80 |         Returns:
 81 |             a single dictionary that has the union of these keys.
 82 |         """
 83 |         return dict(chain(*[d.items() for d in args]))
 84 | 
 85 |     def _peek(self, seq):
 86 |         """
 87 |         Args:
 88 |             seq: an iterator.
 89 | 
 90 |         Returns:
 91 |             the first thing returned by calling next() on the iterator
 92 |             and an iterator created by re-chaining that value to the beginning
 93 |             of the iterator.
 94 |         """
 95 |         first = next(seq)
 96 |         return first, chain([first], seq)
 97 | 
 98 |     def _construct_example_fromlist(self, data, fields):
 99 |         """
100 |         Args:
101 |             data: the data to be set as the value of the attributes of
102 |                 the to-be-created `Example`, associating with respective
103 |                 `Field` objects with same key.
104 |             fields: a dict of `torchtext.data.Field` objects. The keys
105 |                 are attributes of the to-be-created `Example`.
106 | 
107 |         Returns:
108 |             the created `Example` object.
109 |         """
110 |         ex = torchtext.data.Example()
111 |         for (name, field), val in zip(fields, data):
112 |             if field is not None:
113 |                 setattr(ex, name, field.preprocess(val))
114 |             else:
115 |                 setattr(ex, name, val)
116 |         return ex
117 | 


--------------------------------------------------------------------------------
/onmt/io/ImageDataset.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | import codecs
  4 | import os
  5 | 
  6 | import torch
  7 | import torchtext
  8 | 
  9 | from onmt.io.DatasetBase import ONMTDatasetBase, PAD_WORD, BOS_WORD, EOS_WORD
 10 | 
 11 | 
 12 | class ImageDataset(ONMTDatasetBase):
 13 |     """ Dataset for data_type=='img'
 14 | 
 15 |         Build `Example` objects, `Field` objects, and filter_pred function
 16 |         from image corpus.
 17 | 
 18 |         Args:
 19 |             fields (dict): a dictionary of `torchtext.data.Field`.
 20 |             src_examples_iter (dict iter): preprocessed source example
 21 |                 dictionary iterator.
 22 |             tgt_examples_iter (dict iter): preprocessed target example
 23 |                 dictionary iterator.
 24 |             num_src_feats (int): number of source side features.
 25 |             num_tgt_feats (int): number of target side features.
 26 |             tgt_seq_length (int): maximum target sequence length.
 27 |             use_filter_pred (bool): use a custom filter predicate to filter
 28 |                 out examples?
 29 |     """
 30 |     def __init__(self, fields, src_examples_iter, tgt_examples_iter,
 31 |                  num_src_feats=0, num_tgt_feats=0,
 32 |                  tgt_seq_length=0, use_filter_pred=True):
 33 |         self.data_type = 'img'
 34 | 
 35 |         self.n_src_feats = num_src_feats
 36 |         self.n_tgt_feats = num_tgt_feats
 37 | 
 38 |         if tgt_examples_iter is not None:
 39 |             examples_iter = (self._join_dicts(src, tgt) for src, tgt in
 40 |                              zip(src_examples_iter, tgt_examples_iter))
 41 |         else:
 42 |             examples_iter = src_examples_iter
 43 | 
 44 |         # Peek at the first to see which fields are used.
 45 |         ex, examples_iter = self._peek(examples_iter)
 46 |         keys = ex.keys()
 47 | 
 48 |         out_fields = [(k, fields[k]) if k in fields else (k, None)
 49 |                       for k in keys]
 50 |         example_values = ([ex[k] for k in keys] for ex in examples_iter)
 51 |         out_examples = (self._construct_example_fromlist(
 52 |                             ex_values, out_fields)
 53 |                         for ex_values in example_values)
 54 |         # If out_examples is a generator, we need to save the filter_pred
 55 |         # function in serialization too, which would cause a problem when
 56 |         # `torch.save()`. Thus we materialize it as a list.
 57 |         out_examples = list(out_examples)
 58 | 
 59 |         def filter_pred(example):
 60 |             if tgt_examples_iter is not None:
 61 |                 return 0 < len(example.tgt) <= tgt_seq_length
 62 |             else:
 63 |                 return True
 64 | 
 65 |         filter_pred = filter_pred if use_filter_pred else lambda x: True
 66 | 
 67 |         super(ImageDataset, self).__init__(
 68 |             out_examples, out_fields, filter_pred
 69 |         )
 70 | 
 71 |     def sort_key(self, ex):
 72 |         """ Sort using the size of the image: (width, height)."""
 73 |         return (ex.src.size(2), ex.src.size(1))
 74 | 
 75 |     @staticmethod
 76 |     def make_image_examples_nfeats_tpl(path, img_dir):
 77 |         """
 78 |         Args:
 79 |             path (str): location of a src file containing image paths
 80 |             src_dir (str): location of source images
 81 | 
 82 |         Returns:
 83 |             (example_dict iterator, num_feats) tuple
 84 |         """
 85 |         examples_iter = ImageDataset.read_img_file(path, img_dir, 'src')
 86 |         num_feats = 0  # Source side(img) has no features.
 87 | 
 88 |         return (examples_iter, num_feats)
 89 | 
 90 |     @staticmethod
 91 |     def read_img_file(path, src_dir, side, truncate=None):
 92 |         """
 93 |         Args:
 94 |             path (str): location of a src file containing image paths
 95 |             src_dir (str): location of source images
 96 |             side (str): 'src' or 'tgt'
 97 |             truncate: maximum img size ((0,0) or None for unlimited)
 98 | 
 99 |         Yields:
100 |             a dictionary containing image data, path and index for each line.
101 |         """
102 |         assert (src_dir is not None) and os.path.exists(src_dir),\
103 |             'src_dir must be a valid directory if data_type is img'
104 | 
105 |         global Image, transforms
106 |         from PIL import Image
107 |         from torchvision import transforms
108 | 
109 |         with codecs.open(path, "r", "utf-8") as corpus_file:
110 |             index = 0
111 |             for line in corpus_file:
112 |                 img_path = os.path.join(src_dir, line.strip())
113 |                 if not os.path.exists(img_path):
114 |                     img_path = line
115 | 
116 |                 assert os.path.exists(img_path), \
117 |                     'img path %s not found' % (line.strip())
118 | 
119 |                 img = transforms.ToTensor()(Image.open(img_path))
120 |                 if truncate and truncate != (0, 0):
121 |                     if not (img.size(1) <= truncate[0]
122 |                             and img.size(2) <= truncate[1]):
123 |                         continue
124 | 
125 |                 example_dict = {side: img,
126 |                                 side+'_path': line.strip(),
127 |                                 'indices': index}
128 |                 index += 1
129 | 
130 |                 yield example_dict
131 | 
132 |     @staticmethod
133 |     def get_fields(n_src_features, n_tgt_features):
134 |         """
135 |         Args:
136 |             n_src_features: the number of source features to
137 |                 create `torchtext.data.Field` for.
138 |             n_tgt_features: the number of target features to
139 |                 create `torchtext.data.Field` for.
140 | 
141 |         Returns:
142 |             A dictionary whose keys are strings and whose values
143 |             are the corresponding Field objects.
144 |         """
145 |         fields = {}
146 | 
147 |         def make_img(data, vocab, is_train):
148 |             c = data[0].size(0)
149 |             h = max([t.size(1) for t in data])
150 |             w = max([t.size(2) for t in data])
151 |             imgs = torch.zeros(len(data), c, h, w)
152 |             for i, img in enumerate(data):
153 |                 imgs[i, :, 0:img.size(1), 0:img.size(2)] = img
154 |             return imgs
155 | 
156 |         fields["src"] = torchtext.data.Field(
157 |             use_vocab=False, tensor_type=torch.FloatTensor,
158 |             postprocessing=make_img, sequential=False)
159 | 
160 |         for j in range(n_src_features):
161 |             fields["src_feat_"+str(j)] = \
162 |                 torchtext.data.Field(pad_token=PAD_WORD)
163 | 
164 |         fields["tgt"] = torchtext.data.Field(
165 |             init_token=BOS_WORD, eos_token=EOS_WORD,
166 |             pad_token=PAD_WORD)
167 | 
168 |         for j in range(n_tgt_features):
169 |             fields["tgt_feat_"+str(j)] = \
170 |                 torchtext.data.Field(init_token=BOS_WORD, eos_token=EOS_WORD,
171 |                                      pad_token=PAD_WORD)
172 | 
173 |         def make_src(data, vocab, is_train):
174 |             src_size = max([t.size(0) for t in data])
175 |             src_vocab_size = max([t.max() for t in data]) + 1
176 |             alignment = torch.zeros(src_size, len(data), src_vocab_size)
177 |             for i, sent in enumerate(data):
178 |                 for j, t in enumerate(sent):
179 |                     alignment[j, i, t] = 1
180 |             return alignment
181 | 
182 |         fields["src_map"] = torchtext.data.Field(
183 |             use_vocab=False, tensor_type=torch.FloatTensor,
184 |             postprocessing=make_src, sequential=False)
185 | 
186 |         def make_tgt(data, vocab, is_train):
187 |             tgt_size = max([t.size(0) for t in data])
188 |             alignment = torch.zeros(tgt_size, len(data)).long()
189 |             for i, sent in enumerate(data):
190 |                 alignment[:sent.size(0), i] = sent
191 |             return alignment
192 | 
193 |         fields["alignment"] = torchtext.data.Field(
194 |             use_vocab=False, tensor_type=torch.LongTensor,
195 |             postprocessing=make_tgt, sequential=False)
196 | 
197 |         fields["indices"] = torchtext.data.Field(
198 |             use_vocab=False, tensor_type=torch.LongTensor,
199 |             sequential=False)
200 | 
201 |         return fields
202 | 
203 |     @staticmethod
204 |     def get_num_features(corpus_file, side):
205 |         """
206 |         For image corpus, source side is in form of image, thus
207 |         no feature; while target side is in form of text, thus
208 |         we can extract its text features.
209 | 
210 |         Args:
211 |             corpus_file (str): file path to get the features.
212 |             side (str): 'src' or 'tgt'.
213 | 
214 |         Returns:
215 |             number of features on `side`.
216 |         """
217 |         if side == 'src':
218 |             num_feats = 0
219 |         else:
220 |             with codecs.open(corpus_file, "r", "utf-8") as cf:
221 |                 f_line = cf.readline().strip().split()
222 |                 _, _, num_feats = ImageDataset.extract_text_features(f_line)
223 | 
224 |         return num_feats
225 | 


--------------------------------------------------------------------------------
/onmt/io/__init__.py:
--------------------------------------------------------------------------------
 1 | from onmt.io.IO import collect_feature_vocabs, make_features, \
 2 |                        collect_features, get_num_features, \
 3 |                        load_fields_from_vocab, get_fields, \
 4 |                        save_fields_to_vocab, build_dataset, \
 5 |                        build_vocab, merge_vocabs, OrderedIterator
 6 | from onmt.io.DatasetBase import ONMTDatasetBase, PAD_WORD, BOS_WORD, \
 7 |                                 EOS_WORD, UNK
 8 | from onmt.io.TextDataset import TextDataset, ShardedTextCorpusIterator
 9 | from onmt.io.ImageDataset import ImageDataset
10 | from onmt.io.AudioDataset import AudioDataset
11 | 
12 | 
# ``__all__`` must contain the *names* of the public objects as strings.
# Listing the objects themselves makes ``from onmt.io import *`` fail.
__all__ = ["PAD_WORD", "BOS_WORD", "EOS_WORD", "UNK", "ONMTDatasetBase",
           "collect_feature_vocabs", "make_features",
           "collect_features", "get_num_features",
           "load_fields_from_vocab", "get_fields",
           "save_fields_to_vocab", "build_dataset",
           "build_vocab", "merge_vocabs", "OrderedIterator",
           "TextDataset", "ImageDataset", "AudioDataset",
           "ShardedTextCorpusIterator"]
21 | 


--------------------------------------------------------------------------------
/onmt/io/__pycache__/AudioDataset.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/io/__pycache__/AudioDataset.cpython-36.pyc


--------------------------------------------------------------------------------
/onmt/io/__pycache__/DatasetBase.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/io/__pycache__/DatasetBase.cpython-36.pyc


--------------------------------------------------------------------------------
/onmt/io/__pycache__/IO.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/io/__pycache__/IO.cpython-36.pyc


--------------------------------------------------------------------------------
/onmt/io/__pycache__/ImageDataset.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/io/__pycache__/ImageDataset.cpython-36.pyc


--------------------------------------------------------------------------------
/onmt/io/__pycache__/TextDataset.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/io/__pycache__/TextDataset.cpython-36.pyc


--------------------------------------------------------------------------------
/onmt/io/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/io/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/onmt/modules/AudioEncoder.py:
--------------------------------------------------------------------------------
 1 | import math
 2 | import torch.nn as nn
 3 | import torch.nn.functional as F
 4 | 
 5 | 
 6 | class AudioEncoder(nn.Module):
 7 |     """
 8 |     A simple encoder convolutional -> recurrent neural network for
 9 |     audio input.
10 | 
11 |     Args:
12 |         num_layers (int): number of encoder layers.
13 |         bidirectional (bool): bidirectional encoder.
14 |         rnn_size (int): size of hidden states of the rnn.
15 |         dropout (float): dropout probablity.
16 |         sample_rate (float): input spec
17 |         window_size (int): input spec
18 | 
19 |     """
20 |     def __init__(self, num_layers, bidirectional, rnn_size, dropout,
21 |                  sample_rate, window_size):
22 |         super(AudioEncoder, self).__init__()
23 |         self.num_layers = num_layers
24 |         self.num_directions = 2 if bidirectional else 1
25 |         self.hidden_size = rnn_size
26 | 
27 |         self.layer1 = nn.Conv2d(1,   32, kernel_size=(41, 11),
28 |                                 padding=(0, 10), stride=(2, 2))
29 |         self.batch_norm1 = nn.BatchNorm2d(32)
30 |         self.layer2 = nn.Conv2d(32,  32, kernel_size=(21, 11),
31 |                                 padding=(0, 0), stride=(2, 1))
32 |         self.batch_norm2 = nn.BatchNorm2d(32)
33 | 
34 |         input_size = int(math.floor((sample_rate * window_size) / 2) + 1)
35 |         input_size = int(math.floor(input_size - 41) / 2 + 1)
36 |         input_size = int(math.floor(input_size - 21) / 2 + 1)
37 |         input_size *= 32
38 |         self.rnn = nn.LSTM(input_size, rnn_size,
39 |                            num_layers=num_layers,
40 |                            dropout=dropout,
41 |                            bidirectional=bidirectional)
42 | 
43 |     def load_pretrained_vectors(self, opt):
44 |         # Pass in needed options only when modify function definition.
45 |         pass
46 | 
47 |     def forward(self, input, lengths=None):
48 |         "See :obj:`onmt.modules.EncoderBase.forward()`"
49 |         # (batch_size, 1, nfft, t)
50 |         # layer 1
51 |         input = self.batch_norm1(self.layer1(input[:, :, :, :]))
52 | 
53 |         # (batch_size, 32, nfft/2, t/2)
54 |         input = F.hardtanh(input, 0, 20, inplace=True)
55 | 
56 |         # (batch_size, 32, nfft/2/2, t/2)
57 |         # layer 2
58 |         input = self.batch_norm2(self.layer2(input))
59 | 
60 |         # (batch_size, 32, nfft/2/2, t/2)
61 |         input = F.hardtanh(input, 0, 20, inplace=True)
62 | 
63 |         batch_size = input.size(0)
64 |         length = input.size(3)
65 |         input = input.view(batch_size, -1, length)
66 |         input = input.transpose(0, 2).transpose(1, 2)
67 | 
68 |         output, hidden = self.rnn(input)
69 | 
70 |         return hidden, output
71 | 


--------------------------------------------------------------------------------
/onmt/modules/Conv2Conv.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Implementation of "Convolutional Sequence to Sequence Learning"
  3 | """
  4 | import torch
  5 | import torch.nn as nn
  6 | import torch.nn.init as init
  7 | import torch.nn.functional as F
  8 | from torch.autograd import Variable
  9 | 
 10 | import onmt.modules
 11 | from onmt.modules.WeightNorm import WeightNormConv2d
 12 | from onmt.Models import EncoderBase
 13 | from onmt.Models import DecoderState
 14 | from onmt.Utils import aeq
 15 | 
 16 | 
# sqrt(0.5); the layers in this module multiply summed activations by it.
SCALE_WEIGHT = 0.5 ** 0.5
 18 | 
 19 | 
 20 | def shape_transform(x):
 21 |     """ Tranform the size of the tensors to fit for conv input. """
 22 |     return torch.unsqueeze(torch.transpose(x, 1, 2), 3)
 23 | 
 24 | 
 25 | class GatedConv(nn.Module):
 26 |     def __init__(self, input_size, width=3, dropout=0.2, nopad=False):
 27 |         super(GatedConv, self).__init__()
 28 |         self.conv = WeightNormConv2d(input_size, 2 * input_size,
 29 |                                      kernel_size=(width, 1), stride=(1, 1),
 30 |                                      padding=(width // 2 * (1 - nopad), 0))
 31 |         init.xavier_uniform(self.conv.weight, gain=(4 * (1 - dropout))**0.5)
 32 |         self.dropout = nn.Dropout(dropout)
 33 | 
 34 |     def forward(self, x_var, hidden=None):
 35 |         x_var = self.dropout(x_var)
 36 |         x_var = self.conv(x_var)
 37 |         out, gate = x_var.split(int(x_var.size(1) / 2), 1)
 38 |         out = out * F.sigmoid(gate)
 39 |         return out
 40 | 
 41 | 
 42 | class StackedCNN(nn.Module):
 43 |     def __init__(self, num_layers, input_size, cnn_kernel_width=3,
 44 |                  dropout=0.2):
 45 |         super(StackedCNN, self).__init__()
 46 |         self.dropout = dropout
 47 |         self.num_layers = num_layers
 48 |         self.layers = nn.ModuleList()
 49 |         for i in range(num_layers):
 50 |             self.layers.append(
 51 |                 GatedConv(input_size, cnn_kernel_width, dropout))
 52 | 
 53 |     def forward(self, x, hidden=None):
 54 |         for conv in self.layers:
 55 |             x = x + conv(x)
 56 |             x *= SCALE_WEIGHT
 57 |         return x
 58 | 
 59 | 
 60 | class CNNEncoder(EncoderBase):
 61 |     """
 62 |     Encoder built on CNN based on
 63 |     :cite:`DBLP:journals/corr/GehringAGYD17`.
 64 |     """
 65 |     def __init__(self, num_layers, hidden_size,
 66 |                  cnn_kernel_width, dropout, embeddings):
 67 |         super(CNNEncoder, self).__init__()
 68 | 
 69 |         self.embeddings = embeddings
 70 |         input_size = embeddings.embedding_size
 71 |         self.linear = nn.Linear(input_size, hidden_size)
 72 |         self.cnn = StackedCNN(num_layers, hidden_size,
 73 |                               cnn_kernel_width, dropout)
 74 | 
 75 |     def forward(self, input, lengths=None, hidden=None):
 76 |         """ See :obj:`onmt.modules.EncoderBase.forward()`"""
 77 |         self._check_args(input, lengths, hidden)
 78 | 
 79 |         emb = self.embeddings(input)
 80 |         s_len, batch, emb_dim = emb.size()
 81 | 
 82 |         emb = emb.transpose(0, 1).contiguous()
 83 |         emb_reshape = emb.view(emb.size(0) * emb.size(1), -1)
 84 |         emb_remap = self.linear(emb_reshape)
 85 |         emb_remap = emb_remap.view(emb.size(0), emb.size(1), -1)
 86 |         emb_remap = shape_transform(emb_remap)
 87 |         out = self.cnn(emb_remap)
 88 | 
 89 |         return emb_remap.squeeze(3).transpose(0, 1).contiguous(),\
 90 |             out.squeeze(3).transpose(0, 1).contiguous()
 91 | 
 92 | 
class CNNDecoder(nn.Module):
    """
    Decoder built on CNN, based on :cite:`DBLP:journals/corr/GehringAGYD17`.


    Consists of residual convolutional layers, with ConvMultiStepAttention.
    """
    def __init__(self, num_layers, hidden_size, attn_type,
                 copy_attn, cnn_kernel_width, dropout, embeddings):
        """
        Args:
            num_layers: number of conv layers (one attention layer each).
            hidden_size: channel size of the conv and attention layers.
            attn_type: forwarded to the copy GlobalAttention, if built.
            copy_attn: when truthy, a separate copy attention is created.
            cnn_kernel_width: kernel width of each GatedConv.
            dropout: dropout value handed to each GatedConv.
            embeddings: embedding module; must expose `embedding_size`.
        """
        super(CNNDecoder, self).__init__()

        # Basic attributes.
        self.decoder_type = 'cnn'
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.cnn_kernel_width = cnn_kernel_width
        self.embeddings = embeddings
        self.dropout = dropout

        # Build the CNN.
        input_size = self.embeddings.embedding_size
        self.linear = nn.Linear(input_size, self.hidden_size)
        self.conv_layers = nn.ModuleList()
        for i in range(self.num_layers):
            # nopad=True: the convs add no padding themselves; forward()
            # prepends explicit zero padding instead.
            self.conv_layers.append(
                GatedConv(self.hidden_size, self.cnn_kernel_width,
                          self.dropout, True))

        self.attn_layers = nn.ModuleList()
        for i in range(self.num_layers):
            self.attn_layers.append(
                onmt.modules.ConvMultiStepAttention(self.hidden_size))

        # CNNDecoder has its own attention mechanism.
        # Set up a separated copy attention layer, if needed.
        self._copy = False
        if copy_attn:
            self.copy_attn = onmt.modules.GlobalAttention(
                hidden_size, attn_type=attn_type)
            self._copy = True

    def forward(self, input, context, state, context_lengths=None):
        """ See :obj:`onmt.modules.RNNDecoderBase.forward()`"""
        # CHECKS
        assert isinstance(state, CNNDecoderState)
        input_len, input_batch, _ = input.size()
        contxt_len, contxt_batch, _ = context.size()
        aeq(input_batch, contxt_batch)
        # END CHECKS

        # Prepend the inputs remembered from earlier calls so the convs see
        # the whole prefix; the extra steps are sliced off the outputs below.
        if state.previous_input is not None:
            input = torch.cat([state.previous_input, input], 0)

        # Initialize return variables.
        outputs = []
        attns = {"std": []}
        # Copy attention is rejected here, so the `_copy` branches below are
        # never taken in practice.
        assert not self._copy, "Copy mechanism not yet tested in conv2conv"
        if self._copy:
            attns["copy"] = []

        emb = self.embeddings(input)
        assert emb.dim() == 3  # len x batch x embedding_dim

        tgt_emb = emb.transpose(0, 1).contiguous()
        # The output of CNNEncoder.
        src_context_t = context.transpose(0, 1).contiguous()
        # The combination of output of CNNEncoder and source embeddings.
        src_context_c = state.init_src.transpose(0, 1).contiguous()

        # Run the forward pass of the CNNDecoder.
        emb_reshape = tgt_emb.contiguous().view(
            tgt_emb.size(0) * tgt_emb.size(1), -1)
        linear_out = self.linear(emb_reshape)
        x = linear_out.view(tgt_emb.size(0), tgt_emb.size(1), -1)
        x = shape_transform(x)

        # Zero block of (cnn_kernel_width - 1) rows, concatenated in front
        # of `x` along dim 2 before every conv.
        pad = Variable(torch.zeros(x.size(0), x.size(1),
                                   self.cnn_kernel_width - 1, 1))
        pad = pad.type_as(x)
        base_target_emb = x

        # NOTE(review): `attn` is only bound inside this loop, so it holds
        # the last layer's attention; with zero layers it would be undefined.
        for conv, attention in zip(self.conv_layers, self.attn_layers):
            new_target_input = torch.cat([pad, x], 2)
            out = conv(new_target_input)
            c, attn = attention(base_target_emb, out,
                                src_context_t, src_context_c)
            x = (x + (c + out) * SCALE_WEIGHT) * SCALE_WEIGHT
        output = x.squeeze(3).transpose(1, 2)

        # Process the result and update the attentions.
        outputs = output.transpose(0, 1).contiguous()
        if state.previous_input is not None:
            # Drop the steps that belong to the previously seen inputs.
            outputs = outputs[state.previous_input.size(0):]
            attn = attn[:, state.previous_input.size(0):].squeeze()
            attn = torch.stack([attn])
        attns["std"] = attn
        if self._copy:
            attns["copy"] = attn

        # Update the state.
        # `input` here already includes any previously stored prefix.
        state.update_state(input)

        return outputs, state, attns

    def init_decoder_state(self, src, context, enc_hidden):
        """ Build the initial CNNDecoderState; `src` is not used. """
        return CNNDecoderState(context, enc_hidden)
199 | 
200 | 
class CNNDecoderState(DecoderState):
    """
    Decoder state of the CNN decoder: the combined source representation
    plus the target inputs consumed so far.
    """
    def __init__(self, context, enc_hidden):
        combined = context + enc_hidden
        self.init_src = combined * SCALE_WEIGHT
        self.previous_input = None

    @property
    def _all(self):
        """
        Contains attributes that need to be updated in self.beam_update().
        """
        return (self.previous_input,)

    def update_state(self, input):
        """ Called for every decoder forward pass. """
        self.previous_input = input

    def repeat_beam_size_times(self, beam_size):
        """ Repeat beam_size times along batch dimension. """
        tiled = self.init_src.data.repeat(1, beam_size, 1)
        self.init_src = Variable(tiled, volatile=True)
221 | 


--------------------------------------------------------------------------------
/onmt/modules/ConvMultiStepAttention.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | import torch.nn.functional as F
 4 | from onmt.Utils import aeq
 5 | 
 6 | 
# sqrt(0.5); applied to the sum of the target embedding and projected input.
SCALE_WEIGHT = 0.5 ** 0.5
 8 | 
 9 | 
def seq_linear(linear, x):
    """
    Apply `linear` independently at every position of a 4-d tensor.

    `x` is unpacked as (batch, hidden_size, length, 1); the result has the
    same layout, so `linear` must map hidden_size -> hidden_size.
    """
    batch, hidden_size, length, _ = x.size()
    flat = x.transpose(1, 2).contiguous().view(batch * length, hidden_size)
    projected = linear(flat).view(batch, length, hidden_size, 1)
    return projected.transpose(1, 2)
16 | 
17 | 
class ConvMultiStepAttention(nn.Module):
    """

    Conv attention takes a key matrix, a value matrix and a query vector.
    Attention weight is calculated by key matrix with the query vector
    and sum on the value matrix. And the same operation is applied
    in each decode conv layer.

    """

    def __init__(self, input_size):
        super(ConvMultiStepAttention, self).__init__()
        self.linear_in = nn.Linear(input_size, input_size)
        self.mask = None

    def apply_mask(self, mask):
        """ Store the mask applied to the attention scores in forward(). """
        self.mask = mask

    def forward(self, base_target_emb, input, encoder_out_top,
                encoder_out_combine):
        """
        Args:
            base_target_emb: target emb tensor
            input: output of decode conv
            encoder_out_top: the key matrix for calculation of attention
                weight, which is the top output of encode conv
            encoder_out_combine:
                the value matrix for the attention-weighted sum,
                which is the combination of base emb and top output of encode

        Returns:
            (context_output, attn): the attention-weighted sum of
            `encoder_out_combine` reshaped back to 4-d, and the attention
            weights normalised over dim 2.
        """
        # checks (the unpacking also enforces the expected ranks)
        batch, _, height, _ = base_target_emb.size()
        batch_, _, height_, _ = input.size()
        aeq(batch, batch_)
        aeq(height, height_)

        enc_batch, _, enc_height = encoder_out_top.size()
        enc_batch_, _, enc_height_ = encoder_out_combine.size()

        aeq(enc_batch, enc_batch_)
        aeq(enc_height, enc_height_)

        preatt = seq_linear(self.linear_in, input)
        target = (base_target_emb + preatt) * SCALE_WEIGHT
        target = torch.squeeze(target, 3)
        target = torch.transpose(target, 1, 2)
        pre_attn = torch.bmm(target, encoder_out_top)

        if self.mask is not None:
            pre_attn.data.masked_fill_(self.mask, -float('inf'))

        # Normalise over dim 2 (the `enc_height` axis of encoder_out_top).
        # The dim is given explicitly instead of transposing so that the
        # deprecated implicit-dim rule (dim 0 for 3-d input) picks it.
        attn = F.softmax(pre_attn, dim=2)
        context_output = torch.bmm(
            attn, torch.transpose(encoder_out_combine, 1, 2))
        context_output = torch.transpose(
            torch.unsqueeze(context_output, 3), 1, 2)
        return context_output, attn
78 | 


--------------------------------------------------------------------------------
/onmt/modules/CopyGenerator.py:
--------------------------------------------------------------------------------
  1 | import torch.nn as nn
  2 | import torch.nn.functional as F
  3 | import torch
  4 | import torch.cuda
  5 | 
  6 | import onmt
  7 | import onmt.io
  8 | from onmt.Utils import aeq
  9 | 
 10 | 
 11 | class CopyGenerator(nn.Module):
 12 |     """Generator module that additionally considers copying
 13 |     words directly from the source.
 14 | 
 15 |     The main idea is that we have an extended "dynamic dictionary".
 16 |     It contains `|tgt_dict|` words plus an arbitrary number of
 17 |     additional words introduced by the source sentence.
 18 |     For each source sentence we have a `src_map` that maps
 19 |     each source word to an index in `tgt_dict` if it known, or
 20 |     else to an extra word.
 21 | 
 22 |     The copy generator is an extended version of the standard
 23 |     generator that computse three values.
 24 | 
 25 |     * :math:`p_{softmax}` the standard softmax over `tgt_dict`
 26 |     * :math:`p(z)` the probability of instead copying a
 27 |       word from the source, computed using a bernoulli
 28 |     * :math:`p_{copy}` the probility of copying a word instead.
 29 |       taken from the attention distribution directly.
 30 | 
 31 |     The model returns a distribution over the extend dictionary,
 32 |     computed as
 33 | 
 34 |     :math:`p(w) = p(z=1)  p_{copy}(w)  +  p(z=0)  p_{softmax}(w)`
 35 | 
 36 | 
 37 |     .. mermaid::
 38 | 
 39 |        graph BT
 40 |           A[input]
 41 |           S[src_map]
 42 |           B[softmax]
 43 |           BB[switch]
 44 |           C[attn]
 45 |           D[copy]
 46 |           O[output]
 47 |           A --> B
 48 |           A --> BB
 49 |           S --> D
 50 |           C --> D
 51 |           D --> O
 52 |           B --> O
 53 |           BB --> O
 54 | 
 55 | 
 56 |     Args:
 57 |        input_size (int): size of input representation
 58 |        tgt_dict (Vocab): output target dictionary
 59 | 
 60 |     """
 61 |     def __init__(self, input_size, tgt_dict):
 62 |         super(CopyGenerator, self).__init__()
 63 |         self.linear = nn.Linear(input_size, len(tgt_dict))
 64 |         self.linear_copy = nn.Linear(input_size, 1)
 65 |         self.tgt_dict = tgt_dict
 66 | 
 67 |     def forward(self, hidden, attn, src_map):
 68 |         """
 69 |         Compute a distribution over the target dictionary
 70 |         extended by the dynamic dictionary implied by compying
 71 |         source words.
 72 | 
 73 |         Args:
 74 |            hidden (`FloatTensor`): hidden outputs `[batch*tlen, input_size]`
 75 |            attn (`FloatTensor`): attn for each `[batch*tlen, input_size]`
 76 |            src_map (`FloatTensor`):
 77 |              A sparse indicator matrix mapping each source word to
 78 |              its index in the "extended" vocab containing.
 79 |              `[src_len, batch, extra_words]`
 80 |         """
 81 |         # CHECKS
 82 |         batch_by_tlen, _ = hidden.size()
 83 |         batch_by_tlen_, slen = attn.size()
 84 |         slen_, batch, cvocab = src_map.size()
 85 |         aeq(batch_by_tlen, batch_by_tlen_)
 86 |         aeq(slen, slen_)
 87 | 
 88 |         # Original probabilities.
 89 |         logits = self.linear(hidden)
 90 |         logits[:, self.tgt_dict.stoi[onmt.io.PAD_WORD]] = -float('inf')
 91 |         prob = F.softmax(logits)
 92 | 
 93 |         # Probability of copying p(z=1) batch.
 94 |         copy = F.sigmoid(self.linear_copy(hidden))
 95 | 
 96 |         # Probibility of not copying: p_{word}(w) * (1 - p(z))
 97 |         out_prob = torch.mul(prob,  1 - copy.expand_as(prob))
 98 |         mul_attn = torch.mul(attn, copy.expand_as(attn))
 99 |         copy_prob = torch.bmm(mul_attn.view(-1, batch, slen)
100 |                               .transpose(0, 1),
101 |                               src_map.transpose(0, 1)).transpose(0, 1)
102 |         copy_prob = copy_prob.contiguous().view(-1, cvocab)
103 |         return torch.cat([out_prob, copy_prob], 1)
104 | 
105 | 
class CopyGeneratorCriterion(object):
    """
    Negative log-likelihood over the extended (vocab + copy) scores.

    Args:
        vocab_size: offset added to `align` to index the copy scores.
        force_copy: if truthy, the generation score only counts where
            `align` is 0.
        pad: target index whose positions are left out of the loss.
        eps: constant added before taking the log.
    """
    def __init__(self, vocab_size, force_copy, pad, eps=1e-20):
        self.force_copy = force_copy
        self.eps = eps
        self.offset = vocab_size
        self.pad = pad

    def __call__(self, scores, align, target):
        align = align.view(-1)
        has_copy = align.ne(0).float()
        no_copy = align.eq(0).float()

        # Copy prob.
        copy_scores = scores.gather(1, align.view(-1, 1) + self.offset)
        out = copy_scores.view(-1).mul(has_copy)
        gen_scores = scores.gather(1, target.view(-1, 1)).view(-1)

        if self.force_copy:
            # Forced copy.
            out = out + self.eps + gen_scores.mul(no_copy)
        else:
            # Regular prob (no unks and unks that can't be copied)
            out = out + self.eps + gen_scores.mul(target.ne(0).float()) + \
                gen_scores.mul(no_copy).mul(target.eq(0).float())

        # Drop padding.
        not_pad = target.ne(self.pad).float()
        return -out.log().mul(not_pad).sum()
132 | 
133 | 
class CopyGeneratorLossCompute(onmt.Loss.LossComputeBase):
    """
    Copy Generator Loss Computation.

    Args:
        generator: the copy generator producing extended-vocab scores.
        tgt_vocab: target vocabulary.
        force_copy: forwarded to CopyGeneratorCriterion.
        eps: constant added inside the criterion before the log.
    """
    def __init__(self, generator, tgt_vocab,
                 force_copy, eps=1e-20):
        super(CopyGeneratorLossCompute, self).__init__(
            generator, tgt_vocab)

        # We lazily load datasets when there are more than one, so postpone
        # the setting of cur_dataset.
        self.cur_dataset = None
        self.force_copy = force_copy
        # Forward `eps` so a caller-supplied value actually takes effect
        # (it used to be silently ignored in favour of the default).
        self.criterion = CopyGeneratorCriterion(len(tgt_vocab), force_copy,
                                                self.padding_idx, eps)

    def _make_shard_state(self, batch, output, range_, attns):
        """ See base class for args description. """
        if getattr(batch, "alignment", None) is None:
            raise AssertionError("using -copy_attn you need to pass in "
                                 "-dynamic_dict during preprocess stage.")

        return {
            "output": output,
            "target": batch.tgt[range_[0] + 1: range_[1]],
            "copy_attn": attns.get("copy"),
            "align": batch.alignment[range_[0] + 1: range_[1]]
        }

    def _compute_loss(self, batch, output, target, copy_attn, align):
        """
        Compute the loss. The args must match self._make_shard_state().
        Args:
            batch: the current batch.
            output: the predict output from the model.
            target: the validate target to compare output with.
            copy_attn: the copy attention value.
            align: the align info.

        Returns:
            (loss, stats) tuple; `self.cur_dataset` must have been set
            before this is called, since its `src_vocabs` are read here.
        """
        target = target.view(-1)
        align = align.view(-1)
        scores = self.generator(self._bottle(output),
                                self._bottle(copy_attn),
                                batch.src_map)

        loss = self.criterion(scores, align, target)

        # Work on a detached copy of the scores for the statistics.
        scores_data = scores.data.clone()
        scores_data = onmt.io.TextDataset.collapse_copy_scores(
                self._unbottle(scores_data, batch.batch_size),
                batch, self.tgt_vocab, self.cur_dataset.src_vocabs)
        scores_data = self._bottle(scores_data)

        # Correct target copy token instead of <unk>
        # tgt[i] = align[i] + len(tgt_vocab)
        # for i such that tgt[i] == 0 and align[i] != 0
        target_data = target.data.clone()
        correct_mask = target_data.eq(0) * align.data.ne(0)
        correct_copy = (align.data + len(self.tgt_vocab)) * correct_mask.long()
        target_data = target_data + correct_copy

        # Detached copy of the loss value, used only for the statistics.
        loss_data = loss.data.clone()

        stats = self._stats(loss_data, scores_data, target_data)

        return loss, stats
201 | 


--------------------------------------------------------------------------------
/onmt/modules/Dists.py:
--------------------------------------------------------------------------------
  1 | from torch import nn
  2 | from torch.distributions import Distribution
  3 | from torch.distributions import Normal as PyNormal
  4 | 
  5 | 
  6 | def convert_symmetric_dirichlet_to_logistic_normal(concentration, dim):
  7 |     return 0., (1. / concentration) * (1. - 2. / dim) + 1. / (concentration * dim)
  8 |     #return 0., 1.
  9 | 
 10 | 
 11 | class Normal(Distribution):
 12 |     def __init__(self, mean, std):
 13 |         self.normal = PyNormal(mean,std)
 14 | 
 15 |     def mean(self):
 16 |         return self.normal.mean
 17 | 
 18 |     def params(self):
 19 |         return [self.normal.mean,self.normal.std]
 20 | 
 21 |     def sample(self):
 22 |         """
 23 |         Generates a single sample or single batch of samples if the distribution
 24 |         parameters are batched.
 25 |         """
 26 |         return self.normal.sample()
 27 | 
 28 |     def sample_n(self, n):
 29 |         """
 30 |         Generates n samples or n batches of samples if the distribution parameters
 31 |         are batched.
 32 |         """
 33 |         return self.normal.sample_n(n)
 34 | 
 35 | 
 36 |     def log_prob(self, value):
 37 |         """
 38 |         Returns the log of the probability density/mass function evaluated at
 39 |         `value`.
 40 | 
 41 |         Args:
 42 |             value (Tensor or Variable):
 43 | 	"""
 44 |         return self.normal.log_prob(value)
 45 | 
 46 |     def kl(self, other):
 47 |         """ 
 48 |         KL-divergence between two Normals: KL[N(u_i, s_i) || N(u_j, s_j)] 
 49 |         where params_i = [u_i, s_i] and similarly for j.
 50 |         Returns a tensor with the dimensionality of the location variable.
 51 |         """
 52 |         if not isinstance(other, Normal):
 53 |             raise ValueError('Impossible')
 54 |         location_i, scale_i = self.params()  # [mean, std]
 55 |         location_j, scale_j = other.params()  # [mean, std]
 56 |         var_i = scale_i ** 2.
 57 |         var_j = scale_j ** 2.
 58 |         term1 = 1. / (2. * var_j) * ((location_i - location_j) ** 2. + var_i - var_j)
 59 |         term2 = torch.log(scale_j) - torch.log(scale_i)
 60 |         return term1 + term2
 61 | 
 62 | 
 63 | class LogisticNormal(Distribution):
 64 |     def __init__(self, loc, scale, n_samples=100):
 65 |         self.normal = Normal(loc,scale)
 66 |         self.n_samples = n_samples
 67 | 
 68 |     def mean(self):
 69 |         samples = self.sample_n(self.n_samples)
 70 |         #return self.normal.mean
 71 |         return samples.mean(0)
 72 | 
 73 |     def params(self):
 74 |         """ The distribution parameters (mean,std) """
 75 |         return self.normal.params()
 76 | 
 77 |     def sample(self):
 78 |         """
 79 |         Generates a single sample or single batch of samples if the distribution
 80 |         parameters are batched.
 81 |         """
 82 |         return nn.functional.softmax(self.normal.sample(),-1)
 83 | 
 84 |     def sample_n(self, n):
 85 |         """
 86 |         Generates n samples or n batches of samples if the distribution parameters
 87 |         are batched.
 88 |         """
 89 |         return nn.functional.softmax(self.normal.sample_n(n),-1)
 90 | 
 91 | 
 92 |     def log_prob(self, value):
 93 |         """
 94 |         Returns the log of the probability density/mass function evaluated at
 95 |         `value`.
 96 | 
 97 |         Args:
 98 |             value (Tensor or Variable):
 99 |         """
100 |         raise NotImplementedError
101 | 
102 |     def kl(self, other):
103 |         if isinstance(other, LogisticNormal):
104 |             return self.normal.kl(other.normal)
105 |         else:
106 |             raise ValueError('Impossible (LogisticNormal): self %s other %s' % (type(self), type(other)))
107 | 
108 | 
109 | 
class Delta(Distribution):
    r"""
    Creates a Delta distribution parameterized by a location `loc`.

    Every sample equals `loc`; `log_prob` is not defined.

    Example::

        >>> m = Delta(torch.Tensor([0.0]))
        >>> m.sample()  # mean==0
         0.
        [torch.FloatTensor of size 1]

    Args:
        loc (float or Tensor or Variable): location of the distribution
    """

    def __init__(self, loc):
        self.loc = loc

    def params(self):
        """ The distribution parameters: the one-element list ``[loc]``. """
        return [self.loc]

    def sample(self):
        """Return the location itself."""
        return self.loc

    def mean(self):
        """Return the location itself."""
        return self.loc

    def sample_n(self, n):
        """
        Return `loc` repeated along a new leading dimension of size `n`.

        A numeric `loc` yields a tensor of shape ``(n, 1)``; a tensor `loc`
        yields shape ``(n, *loc.size())``.
        """
        # Imported here because the enclosing module brings neither name
        # into scope at file level.
        from numbers import Number
        import torch

        loc = self.loc
        if isinstance(loc, Number):
            return torch.Tensor([loc]).expand(n, 1)
        return loc.expand(n, *loc.size())

    def log_prob(self, value):
        """Always raises: a Delta has no density."""
        raise Exception('Delta is degenerate.')

    def kl(self, other):
        """
        Return zeros shaped like `loc` when `other` is also a Delta.

        The locations of the two distributions are not compared.

        Raises:
            ValueError: if `other` is not a :class:`Delta`.
        """
        # Imported here because the enclosing module brings neither name
        # into scope at file level.
        from numbers import Number
        import torch

        if not isinstance(other, Delta):
            raise ValueError(
                'Impossible (Delta): self %s other %s'
                % (type(self), type(other)))
        if isinstance(self.loc, Number):
            # zeros_like() needs a tensor; a numeric location gets a
            # one-element result instead.
            return torch.zeros(1)
        return torch.zeros_like(self.loc)
155 | 


--------------------------------------------------------------------------------
/onmt/modules/Embeddings.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | from torch.autograd import Variable
  4 | 
  5 | from onmt.modules import BottleLinear, Elementwise
  6 | from onmt.Utils import aeq
  7 | 
  8 | 
  9 | class PositionalEncoding(nn.Module):
 10 |     """
 11 |     Implements the sinusoidal positional encoding for
 12 |     non-recurrent neural networks.
 13 | 
 14 |     Implementation based on "Attention Is All You Need"
 15 |     :cite:`DBLP:journals/corr/VaswaniSPUJGKP17`
 16 | 
 17 |     Args:
 18 |        dropout (float): dropout parameter
 19 |        dim (int): embedding size
 20 |     """
 21 | 
 22 |     def __init__(self, dropout, dim, max_len=5000):
 23 |         pe = torch.arange(0, max_len).unsqueeze(1).expand(max_len, dim)
 24 |         div_term = 1 / torch.pow(10000, torch.arange(0, dim * 2, 2) / dim)
 25 |         pe = pe * div_term.expand_as(pe)
 26 |         pe[:, 0::2] = torch.sin(pe[:, 0::2])
 27 |         pe[:, 1::2] = torch.cos(pe[:, 1::2])
 28 |         pe = pe.unsqueeze(1)
 29 |         super(PositionalEncoding, self).__init__()
 30 |         self.register_buffer('pe', pe)
 31 |         self.dropout = nn.Dropout(p=dropout)
 32 | 
 33 |     def forward(self, emb):
 34 |         # We must wrap the self.pe in Variable to compute, not the other
 35 |         # way - unwrap emb(i.e. emb.data). Otherwise the computation
 36 |         # wouldn't be watched to build the compute graph.
 37 |         emb = emb + Variable(self.pe[:emb.size(0), :1, :emb.size(2)]
 38 |                              .expand_as(emb), requires_grad=False)
 39 |         emb = self.dropout(emb)
 40 |         return emb
 41 | 
 42 | 
 43 | class Embeddings(nn.Module):
 44 |     """
 45 |     Words embeddings for encoder/decoder.
 46 | 
 47 |     Additionally includes ability to add sparse input features
 48 |     based on "Linguistic Input Features Improve Neural Machine Translation"
 49 |     :cite:`sennrich2016linguistic`.
 50 | 
 51 | 
 52 |     .. mermaid::
 53 | 
 54 |        graph LR
 55 |           A[Input]
 56 |           C[Feature 1 Lookup]
 57 |           A-->B[Word Lookup]
 58 |           A-->C
 59 |           A-->D[Feature N Lookup]
 60 |           B-->E[MLP/Concat]
 61 |           C-->E
 62 |           D-->E
 63 |           E-->F[Output]
 64 | 
 65 |     Args:
 66 |         word_vec_size (int): size of the dictionary of embeddings.
 67 |         word_padding_idx (int): padding index for words in the embeddings.
 68 |         feats_padding_idx (list of int): padding index for a list of features
 69 |                                    in the embeddings.
 70 |         word_vocab_size (int): size of dictionary of embeddings for words.
 71 |         feat_vocab_sizes ([int], optional): list of size of dictionary
 72 |                                     of embeddings for each feature.
 73 | 
 74 |         position_encoding (bool): see :obj:`onmt.modules.PositionalEncoding`
 75 | 
 76 |         feat_merge (string): merge action for the features embeddings:
 77 |                     concat, sum or mlp.
 78 |         feat_vec_exponent (float): when using `-feat_merge concat`, feature
 79 |                     embedding size is N^feat_dim_exponent, where N is the
 80 |                     number of values of feature takes.
 81 |         feat_vec_size (int): embedding dimension for features when using
 82 |                     `-feat_merge mlp`
 83 |         dropout (float): dropout probability.
 84 |     """
 85 |     def __init__(self, word_vec_size,
 86 |                  word_vocab_size,
 87 |                  word_padding_idx,
 88 |                  position_encoding=False,
 89 |                  feat_merge="concat",
 90 |                  feat_vec_exponent=0.7, feat_vec_size=-1,
 91 |                  feat_padding_idx=[],
 92 |                  feat_vocab_sizes=[],
 93 |                  dropout=0):
 94 | 
 95 |         self.word_padding_idx = word_padding_idx
 96 | 
 97 |         # Dimensions and padding for constructing the word embedding matrix
 98 |         vocab_sizes = [word_vocab_size]
 99 |         emb_dims = [word_vec_size]
100 |         pad_indices = [word_padding_idx]
101 | 
102 |         # Dimensions and padding for feature embedding matrices
103 |         # (these have no effect if feat_vocab_sizes is empty)
104 |         if feat_merge == 'sum':
105 |             feat_dims = [word_vec_size] * len(feat_vocab_sizes)
106 |         elif feat_vec_size > 0:
107 |             feat_dims = [feat_vec_size] * len(feat_vocab_sizes)
108 |         else:
109 |             feat_dims = [int(vocab ** feat_vec_exponent)
110 |                          for vocab in feat_vocab_sizes]
111 |         vocab_sizes.extend(feat_vocab_sizes)
112 |         emb_dims.extend(feat_dims)
113 |         pad_indices.extend(feat_padding_idx)
114 | 
115 |         # The embedding matrix look-up tables. The first look-up table
116 |         # is for words. Subsequent ones are for features, if any exist.
117 |         emb_params = zip(vocab_sizes, emb_dims, pad_indices)
118 |         embeddings = [nn.Embedding(vocab, dim, padding_idx=pad)
119 |                       for vocab, dim, pad in emb_params]
120 |         emb_luts = Elementwise(feat_merge, embeddings)
121 | 
122 |         # The final output size of word + feature vectors. This can vary
123 |         # from the word vector size if and only if features are defined.
124 |         # This is the attribute you should access if you need to know
125 |         # how big your embeddings are going to be.
126 |         self.embedding_size = (sum(emb_dims) if feat_merge == 'concat'
127 |                                else word_vec_size)
128 | 
129 |         # The sequence of operations that converts the input sequence
130 |         # into a sequence of embeddings. At minimum this consists of
131 |         # looking up the embeddings for each word and feature in the
132 |         # input. Model parameters may require the sequence to contain
133 |         # additional operations as well.
134 |         super(Embeddings, self).__init__()
135 |         self.make_embedding = nn.Sequential()
136 |         self.make_embedding.add_module('emb_luts', emb_luts)
137 | 
138 |         if feat_merge == 'mlp':
139 |             in_dim = sum(emb_dims)
140 |             out_dim = word_vec_size
141 |             mlp = nn.Sequential(BottleLinear(in_dim, out_dim), nn.ReLU())
142 |             self.make_embedding.add_module('mlp', mlp)
143 | 
144 |         if position_encoding:
145 |             pe = PositionalEncoding(dropout, self.embedding_size)
146 |             self.make_embedding.add_module('pe', pe)
147 | 
148 |     @property
149 |     def word_lut(self):
150 |         return self.make_embedding[0][0]
151 | 
152 |     @property
153 |     def emb_luts(self):
154 |         return self.make_embedding[0]
155 | 
156 |     def load_pretrained_vectors(self, emb_file, fixed):
157 |         """Load in pretrained embeddings.
158 | 
159 |         Args:
160 |           emb_file (str) : path to torch serialized embeddings
161 |           fixed (bool) : if true, embeddings are not updated
162 |         """
163 |         if emb_file:
164 |             pretrained = torch.load(emb_file)
165 |             self.word_lut.weight.data.copy_(pretrained)
166 |             if fixed:
167 |                 self.word_lut.weight.requires_grad = False
168 | 
169 |     def forward(self, input):
170 |         """
171 |         Computes the embeddings for words and features.
172 | 
173 |         Args:
174 |             input (`LongTensor`): index tensor `[len x batch x nfeat]`
175 |         Return:
176 |             `FloatTensor`: word embeddings `[len x batch x embedding_size]`
177 |         """
178 |         in_length, in_batch, nfeat = input.size()
179 |         aeq(nfeat, len(self.emb_luts))
180 | 
181 |         emb = self.make_embedding(input)
182 | 
183 |         out_length, out_batch, emb_size = emb.size()
184 |         aeq(in_length, out_length)
185 |         aeq(in_batch, out_batch)
186 |         aeq(emb_size, self.embedding_size)
187 | 
188 |         return emb
189 | 


--------------------------------------------------------------------------------
/onmt/modules/Gate.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | 
 4 | 
 5 | def context_gate_factory(type, embeddings_size, decoder_size,
 6 |                          attention_size, output_size):
 7 |     """Returns the correct ContextGate class"""
 8 | 
 9 |     gate_types = {'source': SourceContextGate,
10 |                   'target': TargetContextGate,
11 |                   'both': BothContextGate}
12 | 
13 |     assert type in gate_types, "Not valid ContextGate type: {0}".format(type)
14 |     return gate_types[type](embeddings_size, decoder_size, attention_size,
15 |                             output_size)
16 | 
17 | 
class ContextGate(nn.Module):
    """
    Context gate is a decoder module that takes as input the previous word
    embedding, the current decoder state and the attention state, and
    produces a gate.
    The gate can be used to select the input from the target side context
    (decoder state), from the source context (attention state) or both.

    `forward` returns the gate together with the projected source
    (attention state) and projected target (embedding + decoder state).
    """
    def __init__(self, embeddings_size, decoder_size,
                 attention_size, output_size):
        super(ContextGate, self).__init__()
        target_size = embeddings_size + decoder_size
        self.gate = nn.Linear(target_size + attention_size, output_size,
                              bias=True)
        self.sig = nn.Sigmoid()
        self.source_proj = nn.Linear(attention_size, output_size)
        self.target_proj = nn.Linear(target_size, output_size)

    def forward(self, prev_emb, dec_state, attn_state):
        # The target-side input is the prefix of the gate input, so it is
        # concatenated once and reused.
        target_input = torch.cat((prev_emb, dec_state), dim=1)
        gate_input = torch.cat((target_input, attn_state), dim=1)
        z = self.sig(self.gate(gate_input))
        proj_source = self.source_proj(attn_state)
        proj_target = self.target_proj(target_input)
        return z, proj_source, proj_target
43 | 
44 | 
class SourceContextGate(nn.Module):
    """Apply the context gate only to the source context:
    ``tanh(target + z * source)``."""

    def __init__(self, embeddings_size, decoder_size,
                 attention_size, output_size):
        super(SourceContextGate, self).__init__()
        self.context_gate = ContextGate(
            embeddings_size, decoder_size, attention_size, output_size)
        self.tanh = nn.Tanh()

    def forward(self, prev_emb, dec_state, attn_state):
        gate, proj_source, proj_target = self.context_gate(
            prev_emb, dec_state, attn_state)
        gated_source = gate * proj_source
        return self.tanh(proj_target + gated_source)
59 | 
60 | 
class TargetContextGate(nn.Module):
    """Apply the context gate only to the target context:
    ``tanh(z * target + source)``."""

    def __init__(self, embeddings_size, decoder_size,
                 attention_size, output_size):
        super(TargetContextGate, self).__init__()
        self.context_gate = ContextGate(
            embeddings_size, decoder_size, attention_size, output_size)
        self.tanh = nn.Tanh()

    def forward(self, prev_emb, dec_state, attn_state):
        gate, proj_source, proj_target = self.context_gate(
            prev_emb, dec_state, attn_state)
        gated_target = gate * proj_target
        return self.tanh(gated_target + proj_source)
74 | 
75 | 
class BothContextGate(nn.Module):
    """Apply the context gate to both contexts:
    ``tanh((1 - z) * target + z * source)``."""

    def __init__(self, embeddings_size, decoder_size,
                 attention_size, output_size):
        super(BothContextGate, self).__init__()
        self.context_gate = ContextGate(
            embeddings_size, decoder_size, attention_size, output_size)
        self.tanh = nn.Tanh()

    def forward(self, prev_emb, dec_state, attn_state):
        gate, proj_source, proj_target = self.context_gate(
            prev_emb, dec_state, attn_state)
        gated_target = (1. - gate) * proj_target
        gated_source = gate * proj_source
        return self.tanh(gated_target + gated_source)
89 | 


--------------------------------------------------------------------------------
/onmt/modules/GlobalAttention.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | 
  4 | from onmt.modules.UtilClass import BottleLinear
  5 | from onmt.Utils import aeq, sequence_mask
  6 | 
  7 | 
  8 | class GlobalAttention(nn.Module):
  9 |     """
 10 |     Global attention takes a matrix and a query vector. It
 11 |     then computes a parameterized convex combination of the matrix
 12 |     based on the input query.
 13 | 
 14 |     Constructs a unit mapping a query `q` of size `dim`
 15 |     and a source matrix `H` of size `n x dim`, to an output
 16 |     of size `dim`.
 17 | 
 18 | 
 19 |     .. mermaid::
 20 | 
 21 |        graph BT
 22 |           A[Query]
 23 |           subgraph RNN
 24 |             C[H 1]
 25 |             D[H 2]
 26 |             E[H N]
 27 |           end
 28 |           F[Attn]
 29 |           G[Output]
 30 |           A --> F
 31 |           C --> F
 32 |           D --> F
 33 |           E --> F
 34 |           C -.-> G
 35 |           D -.-> G
 36 |           E -.-> G
 37 |           F --> G
 38 | 
 39 |     All models compute the output as
 40 |     :math:`c = \sum_{j=1}^{SeqLength} a_j H_j` where
 41 |     :math:`a_j` is the softmax of a score function.
 42 |     Then then apply a projection layer to [q, c].
 43 | 
 44 |     However they
 45 |     differ on how they compute the attention score.
 46 | 
 47 |     * Luong Attention (dot, general):
 48 |        * dot: :math:`score(H_j,q) = H_j^T q`
 49 |        * general: :math:`score(H_j, q) = H_j^T W_a q`
 50 | 
 51 | 
 52 |     * Bahdanau Attention (mlp):
 53 |        * :math:`score(H_j, q) = v_a^T tanh(W_a q + U_a h_j)`
 54 | 
 55 | 
 56 |     Args:
 57 |        dim (int): dimensionality of query and key
 58 |        coverage (bool): use coverage term
 59 |        attn_type (str): type of attention to use, options [dot,general,mlp]
 60 | 
 61 |     """
 62 |     def __init__(self, dim, coverage=False, attn_type="dot"):
 63 |         super(GlobalAttention, self).__init__()
 64 | 
 65 |         self.dim = dim
 66 |         self.attn_type = attn_type
 67 |         assert (self.attn_type in ["dot", "general", "mlp"]), (
 68 |                 "Please select a valid attention type.")
 69 | 
 70 |         if self.attn_type == "general":
 71 |             self.linear_in = nn.Linear(dim, dim, bias=False)
 72 |         elif self.attn_type == "mlp":
 73 |             self.linear_context = BottleLinear(dim, dim, bias=False)
 74 |             self.linear_query = nn.Linear(dim, dim, bias=True)
 75 |             self.v = BottleLinear(dim, 1, bias=False)
 76 |         # mlp wants it with bias
 77 |         out_bias = self.attn_type == "mlp"
 78 |         self.linear_out = nn.Linear(dim*2, dim, bias=out_bias)
 79 | 
 80 |         self.sm = nn.Softmax()
 81 |         self.tanh = nn.Tanh()
 82 | 
 83 |         if coverage:
 84 |             self.linear_cover = nn.Linear(1, dim, bias=False)
 85 | 
 86 |     def score(self, h_t, h_s):
 87 |         """
 88 |         Args:
 89 |           h_t (`FloatTensor`): sequence of queries `[batch x tgt_len x dim]`
 90 |           h_s (`FloatTensor`): sequence of sources `[batch x src_len x dim]`
 91 | 
 92 |         Returns:
 93 |           :obj:`FloatTensor`:
 94 |            raw attention scores (unnormalized) for each src index
 95 |           `[batch x tgt_len x src_len]`
 96 | 
 97 |         """
 98 | 
 99 |         # Check input sizes
100 |         src_batch, src_len, src_dim = h_s.size()
101 |         tgt_batch, tgt_len, tgt_dim = h_t.size()
102 |         aeq(src_batch, tgt_batch)
103 |         aeq(src_dim, tgt_dim)
104 |         aeq(self.dim, src_dim)
105 | 
106 |         if self.attn_type in ["general", "dot"]:
107 |             if self.attn_type == "general":
108 |                 h_t_ = h_t.view(tgt_batch*tgt_len, tgt_dim)
109 |                 h_t_ = self.linear_in(h_t_)
110 |                 h_t = h_t_.view(tgt_batch, tgt_len, tgt_dim)
111 |             h_s_ = h_s.transpose(1, 2)
112 |             # (batch, t_len, d) x (batch, d, s_len) --> (batch, t_len, s_len)
113 |             return torch.bmm(h_t, h_s_)
114 |         else:
115 |             dim = self.dim
116 |             wq = self.linear_query(h_t.view(-1, dim))
117 |             wq = wq.view(tgt_batch, tgt_len, 1, dim)
118 |             wq = wq.expand(tgt_batch, tgt_len, src_len, dim)
119 | 
120 |             uh = self.linear_context(h_s.contiguous().view(-1, dim))
121 |             uh = uh.view(src_batch, 1, src_len, dim)
122 |             uh = uh.expand(src_batch, tgt_len, src_len, dim)
123 | 
124 |             # (batch, t_len, s_len, d)
125 |             wquh = self.tanh(wq + uh)
126 | 
127 |             return self.v(wquh.view(-1, dim)).view(tgt_batch, tgt_len, src_len)
128 | 
129 |     def forward(self, input, context, context_lengths=None, coverage=None):
130 |         """
131 | 
132 |         Args:
133 |           input (`FloatTensor`): query vectors `[batch x tgt_len x dim]`
134 |           context (`FloatTensor`): source vectors `[batch x src_len x dim]`
135 |           context_lengths (`LongTensor`): the source context lengths `[batch]`
136 |           coverage (`FloatTensor`): None (not supported yet)
137 | 
138 |         Returns:
139 |           (`FloatTensor`, `FloatTensor`):
140 | 
141 |           * Computed vector `[tgt_len x batch x dim]`
142 |           * Attention distribtutions for each query
143 |              `[tgt_len x batch x src_len]`
144 |         """
145 | 
146 |         # one step input
147 |         if input.dim() == 2:
148 |             one_step = True
149 |             input = input.unsqueeze(1)
150 |         else:
151 |             one_step = False
152 | 
153 |         batch, sourceL, dim = context.size()
154 |         batch_, targetL, dim_ = input.size()
155 |         aeq(batch, batch_)
156 |         aeq(dim, dim_)
157 |         aeq(self.dim, dim)
158 |         if coverage is not None:
159 |             batch_, sourceL_ = coverage.size()
160 |             aeq(batch, batch_)
161 |             aeq(sourceL, sourceL_)
162 | 
163 |         if coverage is not None:
164 |             cover = coverage.view(-1).unsqueeze(1)
165 |             context += self.linear_cover(cover).view_as(context)
166 |             context = self.tanh(context)
167 | 
168 |         # compute attention scores, as in Luong et al.
169 |         align = self.score(input, context)
170 | 
171 |         if context_lengths is not None:
172 |             # mask => [B, n]
173 |             mask = sequence_mask(context_lengths)
174 |             # mask => [B, 1, n]
175 |             mask = mask.unsqueeze(1)  # Make it broadcastable.
176 |             align.data.masked_fill_(1 - mask, -float('inf'))
177 | 
178 |         # Softmax to normalize attention weights
179 |         align_vectors = self.sm(align.view(batch*targetL, sourceL))
180 |         align_vectors = align_vectors.view(batch, targetL, sourceL)
181 | 
182 |         # each context vector c_t is the weighted average
183 |         # over all the source hidden states
184 |         c = torch.bmm(align_vectors, context)
185 | 
186 |         # concatenate
187 |         concat_c = torch.cat([c, input], 2).view(batch*targetL, dim*2)
188 |         attn_h = self.linear_out(concat_c).view(batch, targetL, dim)
189 |         if self.attn_type in ["general", "dot"]:
190 |             attn_h = self.tanh(attn_h)
191 | 
192 |         if one_step:
193 |             attn_h = attn_h.squeeze(1)
194 |             align_vectors = align_vectors.squeeze(1)
195 | 
196 |             # Check output sizes
197 |             batch_, dim_ = attn_h.size()
198 |             aeq(batch, batch_)
199 |             aeq(dim, dim_)
200 |             batch_, sourceL_ = align_vectors.size()
201 |             aeq(batch, batch_)
202 |             aeq(sourceL, sourceL_)
203 |         else:
204 |             attn_h = attn_h.transpose(0, 1).contiguous()
205 |             align_vectors = align_vectors.transpose(0, 1).contiguous()
206 | 
207 |             # Check output sizes
208 |             targetL_, batch_, dim_ = attn_h.size()
209 |             aeq(targetL, targetL_)
210 |             aeq(batch, batch_)
211 |             aeq(dim, dim_)
212 |             targetL_, batch_, sourceL_ = align_vectors.size()
213 |             aeq(targetL, targetL_)
214 |             aeq(batch, batch_)
215 |             aeq(sourceL, sourceL_)
216 | 
217 |         return attn_h, align_vectors
218 | 


--------------------------------------------------------------------------------
/onmt/modules/ImageEncoder.py:
--------------------------------------------------------------------------------
  1 | import torch.nn as nn
  2 | import torch.nn.functional as F
  3 | import torch
  4 | from torch.autograd import Variable
  5 | 
  6 | 
  7 | class ImageEncoder(nn.Module):
  8 |     """
  9 |     A simple encoder convolutional -> recurrent neural network for
 10 |     image input.
 11 | 
 12 |     Args:
 13 |         num_layers (int): number of encoder layers.
 14 |         bidirectional (bool): bidirectional encoder.
 15 |         rnn_size (int): size of hidden states of the rnn.
 16 |         dropout (float): dropout probablity.
 17 |     """
 18 |     def __init__(self, num_layers, bidirectional, rnn_size, dropout):
 19 |         super(ImageEncoder, self).__init__()
 20 |         self.num_layers = num_layers
 21 |         self.num_directions = 2 if bidirectional else 1
 22 |         self.hidden_size = rnn_size
 23 | 
 24 |         self.layer1 = nn.Conv2d(3,   64, kernel_size=(3, 3),
 25 |                                 padding=(1, 1), stride=(1, 1))
 26 |         self.layer2 = nn.Conv2d(64,  128, kernel_size=(3, 3),
 27 |                                 padding=(1, 1), stride=(1, 1))
 28 |         self.layer3 = nn.Conv2d(128, 256, kernel_size=(3, 3),
 29 |                                 padding=(1, 1), stride=(1, 1))
 30 |         self.layer4 = nn.Conv2d(256, 256, kernel_size=(3, 3),
 31 |                                 padding=(1, 1), stride=(1, 1))
 32 |         self.layer5 = nn.Conv2d(256, 512, kernel_size=(3, 3),
 33 |                                 padding=(1, 1), stride=(1, 1))
 34 |         self.layer6 = nn.Conv2d(512, 512, kernel_size=(3, 3),
 35 |                                 padding=(1, 1), stride=(1, 1))
 36 | 
 37 |         self.batch_norm1 = nn.BatchNorm2d(256)
 38 |         self.batch_norm2 = nn.BatchNorm2d(512)
 39 |         self.batch_norm3 = nn.BatchNorm2d(512)
 40 | 
 41 |         input_size = 512
 42 |         self.rnn = nn.LSTM(input_size, rnn_size,
 43 |                            num_layers=num_layers,
 44 |                            dropout=dropout,
 45 |                            bidirectional=bidirectional)
 46 |         self.pos_lut = nn.Embedding(1000, input_size)
 47 | 
 48 |     def load_pretrained_vectors(self, opt):
 49 |         # Pass in needed options only when modify function definition.
 50 |         pass
 51 | 
 52 |     def forward(self, input, lengths=None):
 53 |         "See :obj:`onmt.modules.EncoderBase.forward()`"
 54 | 
 55 |         batch_size = input.size(0)
 56 |         # (batch_size, 64, imgH, imgW)
 57 |         # layer 1
 58 |         input = F.relu(self.layer1(input[:, :, :, :]-0.5), True)
 59 | 
 60 |         # (batch_size, 64, imgH/2, imgW/2)
 61 |         input = F.max_pool2d(input, kernel_size=(2, 2), stride=(2, 2))
 62 | 
 63 |         # (batch_size, 128, imgH/2, imgW/2)
 64 |         # layer 2
 65 |         input = F.relu(self.layer2(input), True)
 66 | 
 67 |         # (batch_size, 128, imgH/2/2, imgW/2/2)
 68 |         input = F.max_pool2d(input, kernel_size=(2, 2), stride=(2, 2))
 69 | 
 70 |         #  (batch_size, 256, imgH/2/2, imgW/2/2)
 71 |         # layer 3
 72 |         # batch norm 1
 73 |         input = F.relu(self.batch_norm1(self.layer3(input)), True)
 74 | 
 75 |         # (batch_size, 256, imgH/2/2, imgW/2/2)
 76 |         # layer4
 77 |         input = F.relu(self.layer4(input), True)
 78 | 
 79 |         # (batch_size, 256, imgH/2/2/2, imgW/2/2)
 80 |         input = F.max_pool2d(input, kernel_size=(1, 2), stride=(1, 2))
 81 | 
 82 |         # (batch_size, 512, imgH/2/2/2, imgW/2/2)
 83 |         # layer 5
 84 |         # batch norm 2
 85 |         input = F.relu(self.batch_norm2(self.layer5(input)), True)
 86 | 
 87 |         # (batch_size, 512, imgH/2/2/2, imgW/2/2/2)
 88 |         input = F.max_pool2d(input, kernel_size=(2, 1), stride=(2, 1))
 89 | 
 90 |         # (batch_size, 512, imgH/2/2/2, imgW/2/2/2)
 91 |         input = F.relu(self.batch_norm3(self.layer6(input)), True)
 92 | 
 93 |         # # (batch_size, 512, H, W)
 94 |         all_outputs = []
 95 |         for row in range(input.size(2)):
 96 |             inp = input[:, :, row, :].transpose(0, 2)\
 97 |                                      .transpose(1, 2)
 98 |             row_vec = torch.Tensor(batch_size).type_as(inp.data)\
 99 |                                               .long().fill_(row)
100 |             pos_emb = self.pos_lut(Variable(row_vec))
101 |             with_pos = torch.cat(
102 |                 (pos_emb.view(1, pos_emb.size(0), pos_emb.size(1)), inp), 0)
103 |             outputs, hidden_t = self.rnn(with_pos)
104 |             all_outputs.append(outputs)
105 |         out = torch.cat(all_outputs, 0)
106 | 
107 |         return hidden_t, out
108 | 


--------------------------------------------------------------------------------
/onmt/modules/MultiHeadedAttn.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import torch
  3 | import torch.nn as nn
  4 | from torch.autograd import Variable
  5 | 
  6 | from onmt.Utils import aeq
  7 | from onmt.modules.UtilClass import BottleLinear, BottleSoftmax
  8 | 
  9 | 
 10 | class MultiHeadedAttention(nn.Module):
 11 |     """
 12 |     Multi-Head Attention module from
 13 |     "Attention is All You Need"
 14 |     :cite:`DBLP:journals/corr/VaswaniSPUJGKP17`.
 15 | 
 16 |     Similar to standard `dot` attention but uses
 17 |     multiple attention distributions simulataneously
 18 |     to select relevant items.
 19 | 
 20 |     .. mermaid::
 21 | 
 22 |        graph BT
 23 |           A[key]
 24 |           B[value]
 25 |           C[query]
 26 |           O[output]
 27 |           subgraph Attn
 28 |             D[Attn 1]
 29 |             E[Attn 2]
 30 |             F[Attn N]
 31 |           end
 32 |           A --> D
 33 |           C --> D
 34 |           A --> E
 35 |           C --> E
 36 |           A --> F
 37 |           C --> F
 38 |           D --> O
 39 |           E --> O
 40 |           F --> O
 41 |           B --> O
 42 | 
 43 |     Also includes several additional tricks.
 44 | 
 45 |     Args:
 46 |        head_count (int): number of parallel heads
 47 |        model_dim (int): the dimension of keys/values/queries,
 48 |            must be divisible by head_count
 49 |        dropout (float): dropout parameter
 50 |     """
 51 |     def __init__(self, head_count, model_dim, dropout=0.1):
 52 |         assert model_dim % head_count == 0
 53 |         self.dim_per_head = model_dim // head_count
 54 |         self.model_dim = model_dim
 55 | 
 56 |         super(MultiHeadedAttention, self).__init__()
 57 |         self.head_count = head_count
 58 | 
 59 |         self.linear_keys = BottleLinear(model_dim,
 60 |                                         head_count * self.dim_per_head,
 61 |                                         bias=False)
 62 |         self.linear_values = BottleLinear(model_dim,
 63 |                                           head_count * self.dim_per_head,
 64 |                                           bias=False)
 65 |         self.linear_query = BottleLinear(model_dim,
 66 |                                          head_count * self.dim_per_head,
 67 |                                          bias=False)
 68 |         self.sm = BottleSoftmax()
 69 |         self.activation = nn.ReLU()
 70 |         self.dropout = nn.Dropout(dropout)
 71 |         self.res_dropout = nn.Dropout(dropout)
 72 | 
 73 |     def forward(self, key, value, query, mask=None):
 74 |         """
 75 |         Compute the context vector and the attention vectors.
 76 | 
 77 |         Args:
 78 |            key (`FloatTensor`): set of `key_len`
 79 |                 key vectors `[batch, key_len, dim]`
 80 |            value (`FloatTensor`): set of `key_len`
 81 |                 value vectors `[batch, key_len, dim]`
 82 |            query (`FloatTensor`): set of `query_len`
 83 |                  query vectors  `[batch, query_len, dim]`
 84 |            mask: binary mask indicating which keys have
 85 |                  non-zero attention `[batch, query_len, key_len]`
 86 |         Returns:
 87 |            (`FloatTensor`, `FloatTensor`) :
 88 | 
 89 |            * output context vectors `[batch, query_len, dim]`
 90 |            * one of the attention vectors `[batch, query_len, key_len]`
 91 |         """
 92 | 
 93 |         # CHECKS
 94 |         batch, k_len, d = key.size()
 95 |         batch_, k_len_, d_ = value.size()
 96 |         aeq(batch, batch_)
 97 |         aeq(k_len, k_len_)
 98 |         aeq(d, d_)
 99 |         batch_, q_len, d_ = query.size()
100 |         aeq(batch, batch_)
101 |         aeq(d, d_)
102 |         aeq(self.model_dim % 8, 0)
103 |         if mask is not None:
104 |             batch_, q_len_, k_len_ = mask.size()
105 |             aeq(batch_, batch)
106 |             aeq(k_len_, k_len)
107 |             aeq(q_len_ == q_len)
108 |         # END CHECKS
109 | 
110 |         def shape_projection(x):
111 |             b, l, d = x.size()
112 |             return x.view(b, l, self.head_count, self.dim_per_head) \
113 |                 .transpose(1, 2).contiguous() \
114 |                 .view(b * self.head_count, l, self.dim_per_head)
115 | 
116 |         def unshape_projection(x, q):
117 |             b, l, d = q.size()
118 |             return x.view(b, self.head_count, l, self.dim_per_head) \
119 |                     .transpose(1, 2).contiguous() \
120 |                     .view(b, l, self.head_count * self.dim_per_head)
121 | 
122 |         residual = query
123 |         key_up = shape_projection(self.linear_keys(key))
124 |         value_up = shape_projection(self.linear_values(value))
125 |         query_up = shape_projection(self.linear_query(query))
126 | 
127 |         scaled = torch.bmm(query_up, key_up.transpose(1, 2))
128 |         scaled = scaled / math.sqrt(self.dim_per_head)
129 |         bh, l, dim_per_head = scaled.size()
130 |         b = bh // self.head_count
131 |         if mask is not None:
132 | 
133 |             scaled = scaled.view(b, self.head_count, l, dim_per_head)
134 |             mask = mask.unsqueeze(1).expand_as(scaled)
135 |             scaled = scaled.masked_fill(Variable(mask), -1e18) \
136 |                            .view(bh, l, dim_per_head)
137 |         attn = self.sm(scaled)
138 |         # Return one attn
139 |         top_attn = attn \
140 |             .view(b, self.head_count, l, dim_per_head)[:, 0, :, :] \
141 |             .contiguous()
142 | 
143 |         drop_attn = self.dropout(self.sm(scaled))
144 | 
145 |         # values : (batch * 8) x qlen x dim
146 |         out = unshape_projection(torch.bmm(drop_attn, value_up), residual)
147 | 
148 |         # Residual and layer norm
149 |         ret = self.res_dropout(out)
150 | 
151 |         # CHECK
152 |         batch_, q_len_, d_ = ret.size()
153 |         aeq(q_len, q_len_)
154 |         aeq(batch, batch_)
155 |         aeq(d, d_)
156 |         # END CHECK
157 |         return ret, top_attn
158 | 


--------------------------------------------------------------------------------
/onmt/modules/StackedRNN.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | 
 4 | 
 5 | class StackedLSTM(nn.Module):
 6 |     """
 7 |     Our own implementation of stacked LSTM.
 8 |     Needed for the decoder, because we do input feeding.
 9 |     """
10 |     def __init__(self, num_layers, input_size, rnn_size, dropout):
11 |         super(StackedLSTM, self).__init__()
12 |         self.dropout = nn.Dropout(dropout)
13 |         self.num_layers = num_layers
14 |         self.layers = nn.ModuleList()
15 | 
16 |         for i in range(num_layers):
17 |             self.layers.append(nn.LSTMCell(input_size, rnn_size))
18 |             input_size = rnn_size
19 | 
20 |     def forward(self, input, hidden):
21 |         h_0, c_0 = hidden
22 |         h_1, c_1 = [], []
23 |         for i, layer in enumerate(self.layers):
24 |             h_1_i, c_1_i = layer(input, (h_0[i], c_0[i]))
25 |             input = h_1_i
26 |             if i + 1 != self.num_layers:
27 |                 input = self.dropout(input)
28 |             h_1 += [h_1_i]
29 |             c_1 += [c_1_i]
30 | 
31 |         h_1 = torch.stack(h_1)
32 |         c_1 = torch.stack(c_1)
33 | 
34 |         return input, (h_1, c_1)
35 | 
36 | 
class StackedGRU(nn.Module):
    """
    Stack of ``nn.GRUCell`` layers advanced one time step per call.
    """

    def __init__(self, num_layers, input_size, rnn_size, dropout):
        super(StackedGRU, self).__init__()
        self.dropout = nn.Dropout(dropout)
        self.num_layers = num_layers

        # Only the bottom cell sees `input_size`; the rest consume the
        # `rnn_size` output of the cell below.
        cells = []
        for depth in range(num_layers):
            in_features = input_size if depth == 0 else rnn_size
            cells.append(nn.GRUCell(in_features, rnn_size))
        self.layers = nn.ModuleList(cells)

    def forward(self, input, hidden):
        """
        Run one time step through every layer.

        Args:
            input: input to the bottom cell.
            hidden: sequence whose first element holds the previous hidden
                states, indexable by layer.

        Returns:
            ``(output, (h_1,))`` where ``output`` is the top layer's hidden
            state and ``h_1`` stacks the new per-layer hidden states.
        """
        prev_h = hidden[0]
        next_h = []
        top = self.num_layers - 1
        layer_input = input
        for depth, cell in enumerate(self.layers):
            h_t = cell(layer_input, prev_h[depth])
            next_h.append(h_t)
            # Dropout is applied between layers but not after the top one.
            layer_input = h_t if depth == top else self.dropout(h_t)

        return layer_input, (torch.stack(next_h),)
60 | 


--------------------------------------------------------------------------------
/onmt/modules/StructuredAttention.py:
--------------------------------------------------------------------------------
 1 | import torch.nn as nn
 2 | import torch
 3 | import torch.cuda
 4 | from torch.autograd import Variable
 5 | 
 6 | 
 7 | class MatrixTree(nn.Module):
 8 |     """Implementation of the matrix-tree theorem for computing marginals
 9 |     of non-projective dependency parsing. This attention layer is used
10 |     in the paper "Learning Structured Text Representations."
11 | 
12 | 
13 |     :cite:`DBLP:journals/corr/LiuL17d`
14 |     """
15 |     def __init__(self, eps=1e-5):
16 |         self.eps = eps
17 |         super(MatrixTree, self).__init__()
18 | 
19 |     def forward(self, input):
20 |         laplacian = input.exp() + self.eps
21 |         output = input.clone()
22 |         for b in range(input.size(0)):
23 |             lap = laplacian[b].masked_fill(
24 |                 Variable(torch.eye(input.size(1)).cuda().ne(0)), 0)
25 |             lap = -lap + torch.diag(lap.sum(0))
26 |             # store roots on diagonal
27 |             lap[0] = input[b].diag().exp()
28 |             inv_laplacian = lap.inverse()
29 | 
30 |             factor = inv_laplacian.diag().unsqueeze(1)\
31 |                                          .expand_as(input[b]).transpose(0, 1)
32 |             term1 = input[b].exp().mul(factor).clone()
33 |             term2 = input[b].exp().mul(inv_laplacian.transpose(0, 1)).clone()
34 |             term1[:, 0] = 0
35 |             term2[0] = 0
36 |             output[b] = term1 - term2
37 |             roots_output = input[b].diag().exp().mul(
38 |                 inv_laplacian.transpose(0, 1)[0])
39 |             output[b] = output[b] + torch.diag(roots_output)
40 |         return output
41 | 
42 | 
if __name__ == "__main__":
    # Smoke test: print the marginals summed over dimension 1 for one
    # random 5x5 score matrix.
    dtree = MatrixTree()
    q = torch.rand(1, 5, 5)
    # Use the GPU only when one is actually present.
    if torch.cuda.is_available():
        q = q.cuda()
    marg = dtree(Variable(q))
    print(marg.sum(1))
48 | 


--------------------------------------------------------------------------------
/onmt/modules/UtilClass.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | 
 4 | 
 5 | class Bottle(nn.Module):
 6 |         def forward(self, input):
 7 |             if len(input.size()) <= 2:
 8 |                 return super(Bottle, self).forward(input)
 9 |             size = input.size()[:2]
10 |             out = super(Bottle, self).forward(input.view(size[0]*size[1], -1))
11 |             return out.contiguous().view(size[0], size[1], -1)
12 | 
13 | 
class Bottle2(nn.Module):
    """
    Mixin that lets a module written for 3-D input accept 4-D input.

    Intended to be combined with another ``nn.Module`` through multiple
    inheritance; the wrapped ``forward`` is reached via ``super()``.
    """

    def forward(self, input):
        # Inputs of rank <= 3 go straight to the wrapped module.
        if input.dim() <= 3:
            return super(Bottle2, self).forward(input)
        dims = input.size()
        lead, second, rows, cols = dims[0], dims[1], dims[2], dims[3]
        # Merge the two leading dimensions, apply the wrapped module, then
        # restore the original four sizes.
        merged = input.view(lead * second, rows, cols)
        out = super(Bottle2, self).forward(merged)
        return out.contiguous().view(lead, second, rows, cols)
22 | 
23 | 
class LayerNorm(nn.Module):
    ''' Normalizes along dimension 1, then applies a learned gain and bias '''

    def __init__(self, d_hid, eps=1e-3):
        super(LayerNorm, self).__init__()

        self.eps = eps
        # Learned gain (initialized to ones) and bias (initialized to zeros).
        self.a_2 = nn.Parameter(torch.ones(d_hid), requires_grad=True)
        self.b_2 = nn.Parameter(torch.zeros(d_hid), requires_grad=True)

    def forward(self, z):
        # Inputs with a single entry along dimension 1 are returned
        # untouched (presumably because their std is undefined).
        if z.size(1) == 1:
            return z

        mean = z.mean(1)
        spread = z.std(1)
        # HACK. PyTorch is changing behavior: newer versions drop the
        # reduced dimension, so put it back before expanding.
        if mean.dim() == 1:
            mean = mean.unsqueeze(1)
            spread = spread.unsqueeze(1)

        centered = z - mean.expand_as(z)
        normalized = centered / (spread.expand_as(z) + self.eps)
        gain = self.a_2.expand_as(normalized)
        scaled = normalized.mul(gain)
        return scaled + self.b_2.expand_as(normalized)
47 | 
48 | 
class BottleLinear(Bottle, nn.Linear):
    """``nn.Linear`` whose forward pass goes through :class:`Bottle`."""
51 | 
52 | 
class BottleLayerNorm(Bottle, LayerNorm):
    """:class:`LayerNorm` whose forward pass goes through :class:`Bottle`."""
55 | 
56 | 
class BottleSoftmax(Bottle, nn.Softmax):
    """``nn.Softmax`` whose forward pass goes through :class:`Bottle`."""
59 | 
60 | 
class Elementwise(nn.ModuleList):
    """
    A simple network container holding one module per input feature.

    The input is a 3d Variable whose last dimension has the same length as
    the list of modules. Slice ``i`` of that dimension is fed to module
    ``i``. The optional ``merge`` parameter selects how the per-module
    outputs are combined:

    * ``'first'``: only the first module's output is returned
    * ``'concat'`` / ``'mlp'``: outputs are concatenated along dimension 2
    * ``'sum'``: outputs are added together
    * ``None``: the list of outputs is returned as is
    """

    def __init__(self, merge=None, *args):
        assert merge in [None, 'first', 'concat', 'sum', 'mlp']
        super(Elementwise, self).__init__(*args)
        self.merge = merge

    def forward(self, input):
        # One slice per feature, with the feature dimension squeezed out.
        features = [chunk.squeeze(2) for chunk in input.split(1, dim=2)]
        assert len(self) == len(features)
        results = [module(feat) for module, feat in zip(self, features)]

        if self.merge == 'first':
            return results[0]
        if self.merge in ('concat', 'mlp'):
            return torch.cat(results, 2)
        if self.merge == 'sum':
            return sum(results)
        return results
89 | 


--------------------------------------------------------------------------------
/onmt/modules/WordDropout.py:
--------------------------------------------------------------------------------
 1 | from torch.autograd import Variable
 2 | import torch.nn as nn
 3 | import torch
 4 | 
 5 | 
 6 | class WordDropout(nn.Module):
 7 |     r"""During training, randomly zeroes some of the (entire) words of the input
 8 |     tensor with probability *p* using samples from a bernoulli distribution.
 9 |     The elements to zero are randomized on every forward call.
10 | 
11 |     Furthermore, the outputs are scaled by a factor of *1/(1-p)* during
12 |     training. This means that during evaluation the module simply computes an
13 |     identity function.
14 | 
15 |     Args:
16 |         p: probability of an element to be zeroed. Default: 0.1
17 |         inplace: If set to ``True``, will do this operation in-place. Default: ``False``
18 | 
19 |     Shape:
20 |         - Input: `Any`. Input can be of any shape
21 |         - Output: `Same`. Output is of the same shape as input
22 | 
23 |     Examples::
24 | 
25 |         >>> m = nn.Dropout(p=0.2)
26 |         >>> input = autograd.Variable(torch.randn(20, 16))
27 |         >>> output = m(input)
28 | 
29 |     .. _Improving neural networks by preventing co-adaptation of feature
30 |         detectors: https://arxiv.org/abs/1207.0580
31 |     """
32 | 
33 |     def __init__(self, p=0.0, inplace=False, dim=2):
34 |         super(WordDropout, self).__init__()
35 |         if p < 0 or p > 1:
36 |             raise ValueError("dropout probability has to be between 0 and 1, "
37 |                              "but got {}".format(p))
38 |         self.p = p
39 |         # dimension of the word dropout (sequence).
40 |         # e.g. in [time, batch, features], i.e. [T, B, D], word dropout is applied on either all D or none.
41 |         self.dim = dim
42 |         self.inplace = inplace
43 | 
44 |     def forward(self, input, training=False):
45 |         if self.p == 0 or not training:
46 |             return input
47 | 
48 |         keep_prob = 1 - self.p
49 |         noise = torch.zeros_like(input.data)
50 |         noise = Variable(torch.sum(noise, dim=self.dim))
51 |         noise.data.bernoulli_( self.p )
52 |         noise = noise.byte()
53 |         noise = noise.unsqueeze(self.dim)
54 | 
55 |         output = input.masked_fill_(noise, 0.)
56 |         output /= keep_prob
57 |         return torch.mul(output, input)
58 | 
59 |     def __repr__(self):
60 |         inplace_str = ', inplace' if self.inplace else ''
61 |         return self.__class__.__name__ + '(' \
62 |             + 'p=' + str(self.p) \
63 |             + inplace_str + ')'
64 | 


--------------------------------------------------------------------------------
/onmt/modules/__init__.py:
--------------------------------------------------------------------------------
 1 | from onmt.modules.UtilClass import LayerNorm, Bottle, BottleLinear, \
 2 |     BottleLayerNorm, BottleSoftmax, Elementwise
 3 | from onmt.modules.Gate import context_gate_factory, ContextGate
 4 | from onmt.modules.GlobalAttention import GlobalAttention
 5 | from onmt.modules.ConvMultiStepAttention import ConvMultiStepAttention
 6 | from onmt.modules.ImageEncoder import ImageEncoder
 7 | from onmt.modules.AudioEncoder import AudioEncoder
 8 | from onmt.modules.CopyGenerator import CopyGenerator, CopyGeneratorLossCompute
 9 | from onmt.modules.StructuredAttention import MatrixTree
10 | from onmt.modules.Transformer import \
11 |    TransformerEncoder, TransformerDecoder, PositionwiseFeedForward
12 | from onmt.modules.Conv2Conv import CNNEncoder, CNNDecoder
13 | from onmt.modules.MultiHeadedAttn import MultiHeadedAttention
14 | from onmt.modules.StackedRNN import StackedLSTM, StackedGRU
15 | from onmt.modules.Embeddings import Embeddings, PositionalEncoding
16 | from onmt.modules.WeightNorm import WeightNormConv2d
17 | from onmt.modules.NormalVariationalEncoder import GlobalInferenceNetwork, \
18 |                                                   GlobalFullInferenceNetwork, \
19 |                                                   ImageGlobalInferenceNetwork
20 | #                                                  ImageTopicInferenceNetwork, \
21 | from onmt.modules.Dists import Delta, Normal, LogisticNormal, convert_symmetric_dirichlet_to_logistic_normal
22 | from onmt.Models import EncoderBase, MeanEncoder, StdRNNDecoder, \
23 |     RNNDecoderBase, InputFeedRNNDecoder, RNNEncoder, NMTModel, \
24 |     RNNVIDecoderBase, NMTVIModel
25 | 
26 | from onmt.modules.SRU import check_sru_requirement
27 | can_use_sru = check_sru_requirement()
28 | if can_use_sru:
29 |     from onmt.modules.SRU import SRU
30 | 
31 | 
# Public API of this package (also keeps flake8 from flagging the imports
# above as unused). Entries must be strings naming the exported objects:
# listing the objects themselves makes ``from onmt.modules import *`` raise
# a TypeError.
__all__ = ["EncoderBase", "MeanEncoder", "RNNDecoderBase",
           "InputFeedRNNDecoder", "RNNEncoder", "NMTModel",
           "StdRNNDecoder", "ContextGate", "GlobalAttention", "ImageEncoder",
           "PositionwiseFeedForward", "PositionalEncoding",
           "CopyGenerator", "MultiHeadedAttention",
           "LayerNorm", "Bottle", "BottleLinear", "BottleLayerNorm",
           "BottleSoftmax",
           "TransformerEncoder", "TransformerDecoder", "Embeddings",
           "Elementwise",
           "MatrixTree", "WeightNormConv2d", "ConvMultiStepAttention",
           "CNNEncoder", "CNNDecoder", "StackedLSTM", "StackedGRU",
           "context_gate_factory", "CopyGeneratorLossCompute",
           "AudioEncoder"]

# Variational (VI) models and inference networks.
# ImageTopicInferenceNetwork is not exported: its import is disabled above.
__all__ += ["RNNVIDecoderBase", "NMTVIModel"]
__all__ += ["GlobalInferenceNetwork", "GlobalFullInferenceNetwork",
            "ImageGlobalInferenceNetwork"]
__all__ += ["Delta", "Normal", "LogisticNormal",
            "convert_symmetric_dirichlet_to_logistic_normal"]

# SRU is only imported when its requirements are met.
if can_use_sru:
    __all__.extend(["SRU", "check_sru_requirement"])
51 | 


--------------------------------------------------------------------------------
/onmt/modules/__pycache__/AudioEncoder.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/modules/__pycache__/AudioEncoder.cpython-36.pyc


--------------------------------------------------------------------------------
/onmt/modules/__pycache__/Conv2Conv.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/modules/__pycache__/Conv2Conv.cpython-36.pyc


--------------------------------------------------------------------------------
/onmt/modules/__pycache__/ConvMultiStepAttention.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/modules/__pycache__/ConvMultiStepAttention.cpython-36.pyc


--------------------------------------------------------------------------------
/onmt/modules/__pycache__/CopyGenerator.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/modules/__pycache__/CopyGenerator.cpython-36.pyc


--------------------------------------------------------------------------------
/onmt/modules/__pycache__/Dists.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/modules/__pycache__/Dists.cpython-36.pyc


--------------------------------------------------------------------------------
/onmt/modules/__pycache__/Embeddings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/modules/__pycache__/Embeddings.cpython-36.pyc


--------------------------------------------------------------------------------
/onmt/modules/__pycache__/Gate.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/modules/__pycache__/Gate.cpython-36.pyc


--------------------------------------------------------------------------------
/onmt/modules/__pycache__/GlobalAttention.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/modules/__pycache__/GlobalAttention.cpython-36.pyc


--------------------------------------------------------------------------------
/onmt/modules/__pycache__/ImageEncoder.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/modules/__pycache__/ImageEncoder.cpython-36.pyc


--------------------------------------------------------------------------------
/onmt/modules/__pycache__/MultiHeadedAttn.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/modules/__pycache__/MultiHeadedAttn.cpython-36.pyc


--------------------------------------------------------------------------------
/onmt/modules/__pycache__/NormalVariationalEncoder.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/modules/__pycache__/NormalVariationalEncoder.cpython-36.pyc


--------------------------------------------------------------------------------
/onmt/modules/__pycache__/SRU.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/modules/__pycache__/SRU.cpython-36.pyc


--------------------------------------------------------------------------------
/onmt/modules/__pycache__/StackedRNN.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/modules/__pycache__/StackedRNN.cpython-36.pyc


--------------------------------------------------------------------------------
/onmt/modules/__pycache__/StructuredAttention.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/modules/__pycache__/StructuredAttention.cpython-36.pyc


--------------------------------------------------------------------------------
/onmt/modules/__pycache__/Transformer.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/modules/__pycache__/Transformer.cpython-36.pyc


--------------------------------------------------------------------------------
/onmt/modules/__pycache__/UtilClass.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/modules/__pycache__/UtilClass.cpython-36.pyc


--------------------------------------------------------------------------------
/onmt/modules/__pycache__/WeightNorm.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/modules/__pycache__/WeightNorm.cpython-36.pyc


--------------------------------------------------------------------------------
/onmt/modules/__pycache__/WordDropout.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/modules/__pycache__/WordDropout.cpython-36.pyc


--------------------------------------------------------------------------------
/onmt/modules/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/modules/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/onmt/translate/Beam.py:
--------------------------------------------------------------------------------
  1 | from __future__ import division
  2 | import torch
  3 | 
  4 | 
  5 | class Beam(object):
  6 |     """
  7 |     Class for managing the internals of the beam search process.
  8 | 
  9 |     Takes care of beams, back pointers, and scores.
 10 | 
 11 |     Args:
 12 |        size (int): beam size
 13 |        pad, bos, eos (int): indices of padding, beginning, and ending.
 14 |        n_best (int): nbest size to use
 15 |        cuda (bool): use gpu
 16 |        global_scorer (:obj:`GlobalScorer`)
 17 |     """
 18 |     def __init__(self, size, pad, bos, eos,
 19 |                  n_best=1, cuda=False,
 20 |                  global_scorer=None,
 21 |                  min_length=0):
 22 | 
 23 |         self.size = size
 24 |         self.tt = torch.cuda if cuda else torch
 25 | 
 26 |         # The score for each translation on the beam.
 27 |         self.scores = self.tt.FloatTensor(size).zero_()
 28 |         self.all_scores = []
 29 | 
 30 |         # The backpointers at each time-step.
 31 |         self.prev_ks = []
 32 | 
 33 |         # The outputs at each time-step.
 34 |         self.next_ys = [self.tt.LongTensor(size)
 35 |                         .fill_(pad)]
 36 |         self.next_ys[0][0] = bos
 37 | 
 38 |         # Has EOS topped the beam yet.
 39 |         self._eos = eos
 40 |         self.eos_top = False
 41 | 
 42 |         # The attentions (matrix) for each time.
 43 |         self.attn = []
 44 | 
 45 |         # Time and k pair for finished.
 46 |         self.finished = []
 47 |         self.n_best = n_best
 48 | 
 49 |         # Information for global scoring.
 50 |         self.global_scorer = global_scorer
 51 |         self.global_state = {}
 52 | 
 53 |         # Minimum prediction length
 54 |         self.min_length = min_length
 55 | 
 56 |     def get_current_state(self):
 57 |         "Get the outputs for the current timestep."
 58 |         return self.next_ys[-1]
 59 | 
 60 |     def get_current_origin(self):
 61 |         "Get the backpointers for the current timestep."
 62 |         return self.prev_ks[-1]
 63 | 
 64 |     def advance(self, word_probs, attn_out):
 65 |         """
 66 |         Given prob over words for every last beam `wordLk` and attention
 67 |         `attn_out`: Compute and update the beam search.
 68 | 
 69 |         Parameters:
 70 | 
 71 |         * `word_probs`- probs of advancing from the last step (K x words)
 72 |         * `attn_out`- attention at the last step
 73 | 
 74 |         Returns: True if beam search is complete.
 75 |         """
 76 |         num_words = word_probs.size(1)
 77 | 
 78 |         # force the output to be longer than self.min_length
 79 |         cur_len = len(self.next_ys)
 80 |         if cur_len < self.min_length:
 81 |             for k in range(len(word_probs)):
 82 |                 word_probs[k][self._eos] = -1e20
 83 | 
 84 |         # Sum the previous scores.
 85 |         if len(self.prev_ks) > 0:
 86 |             beam_scores = word_probs + \
 87 |                 self.scores.unsqueeze(1).expand_as(word_probs)
 88 | 
 89 |             # Don't let EOS have children.
 90 |             for i in range(self.next_ys[-1].size(0)):
 91 |                 if self.next_ys[-1][i] == self._eos:
 92 |                     beam_scores[i] = -1e20
 93 |         else:
 94 |             beam_scores = word_probs[0]
 95 |         flat_beam_scores = beam_scores.view(-1)
 96 |         best_scores, best_scores_id = flat_beam_scores.topk(self.size, 0,
 97 |                                                             True, True)
 98 | 
 99 |         self.all_scores.append(self.scores)
100 |         self.scores = best_scores
101 | 
102 |         # best_scores_id is flattened beam x word array, so calculate which
103 |         # word and beam each score came from
104 |         prev_k = best_scores_id / num_words
105 |         self.prev_ks.append(prev_k)
106 |         self.next_ys.append((best_scores_id - prev_k * num_words))
107 |         self.attn.append(attn_out.index_select(0, prev_k))
108 | 
109 |         if self.global_scorer is not None:
110 |             self.global_scorer.update_global_state(self)
111 | 
112 |         for i in range(self.next_ys[-1].size(0)):
113 |             if self.next_ys[-1][i] == self._eos:
114 |                 s = self.scores[i]
115 |                 if self.global_scorer is not None:
116 |                     global_scores = self.global_scorer.score(self, self.scores)
117 |                     s = global_scores[i]
118 |                 self.finished.append((s, len(self.next_ys) - 1, i))
119 | 
120 |         # End condition is when top-of-beam is EOS and no global score.
121 |         if self.next_ys[-1][0] == self._eos:
122 |             # self.all_scores.append(self.scores)
123 |             self.eos_top = True
124 | 
125 |     def done(self):
126 |         return self.eos_top and len(self.finished) >= self.n_best
127 | 
128 |     def sort_finished(self, minimum=None):
129 |         if minimum is not None:
130 |             i = 0
131 |             # Add from beam until we have minimum outputs.
132 |             while len(self.finished) < minimum:
133 |                 s = self.scores[i]
134 |                 if self.global_scorer is not None:
135 |                     global_scores = self.global_scorer.score(self, self.scores)
136 |                     s = global_scores[i]
137 |                 self.finished.append((s, len(self.next_ys) - 1, i))
138 | 
139 |         self.finished.sort(key=lambda a: -a[0])
140 |         scores = [sc for sc, _, _ in self.finished]
141 |         ks = [(t, k) for _, t, k in self.finished]
142 |         return scores, ks
143 | 
144 |     def get_hyp(self, timestep, k):
145 |         """
146 |         Walk back to construct the full hypothesis.
147 |         """
148 |         hyp, attn = [], []
149 |         for j in range(len(self.prev_ks[:timestep]) - 1, -1, -1):
150 |             hyp.append(self.next_ys[j+1][k])
151 |             attn.append(self.attn[j][k])
152 |             k = self.prev_ks[j][k]
153 |         return hyp[::-1], torch.stack(attn[::-1])
154 | 
155 | 
class GNMTGlobalScorer(object):
    """
    Re-ranking score for NMT hypotheses, following
    "Google's Neural Machine Translation System" :cite:`wu2016google`

    Args:
       alpha (float): exponent of the length normalisation term
       beta (float):  weight of the coverage penalty
    """
    def __init__(self, alpha, beta):
        self.alpha = alpha
        self.beta = beta

    def score(self, beam, logprobs):
        "Length-normalise `logprobs` and add the coverage penalty."
        coverage = beam.global_state["coverage"]
        # Cap every coverage entry at 1.0 before taking the log.
        capped = torch.min(coverage, coverage.clone().fill_(1.0))
        coverage_penalty = self.beta * capped.log().sum(1)
        length_penalty = (((5 + len(beam.next_ys)) ** self.alpha) /
                          ((5 + 1) ** self.alpha))
        return (logprobs / length_penalty) + coverage_penalty

    def update_global_state(self, beam):
        "Maintain the coverage vector as the running sum of attentions."
        state = beam.global_state
        if len(beam.prev_ks) == 1:
            # First step: coverage is simply the first attention.
            state["coverage"] = beam.attn[-1]
            return
        # Reorder the running sum to follow the surviving beam slots,
        # then add the newest attention.
        reordered = state["coverage"].index_select(0, beam.prev_ks[-1])
        state["coverage"] = reordered.add(beam.attn[-1])
184 | 


--------------------------------------------------------------------------------
/onmt/translate/Translation.py:
--------------------------------------------------------------------------------
  1 | from __future__ import division, unicode_literals
  2 | 
  3 | import torch
  4 | import onmt.io
  5 | 
  6 | 
  7 | class TranslationBuilder(object):
  8 |     """
  9 |     Build a word-based translation from the batch output
 10 |     of translator and the underlying dictionaries.
 11 | 
 12 |     Replacement based on "Addressing the Rare Word
 13 |     Problem in Neural Machine Translation" :cite:`Luong2015b`
 14 | 
 15 |     Args:
 16 |        data (DataSet):
 17 |        fields (dict of Fields): data fields
 18 |        n_best (int): number of translations produced
 19 |        replace_unk (bool): replace unknown words using attention
 20 |        has_tgt (bool): will the batch have gold targets
 21 |     """
 22 |     def __init__(self, data, fields, n_best=1, replace_unk=False,
 23 |                  has_tgt=False):
 24 |         self.data = data
 25 |         self.fields = fields
 26 |         self.n_best = n_best
 27 |         self.replace_unk = replace_unk
 28 |         self.has_tgt = has_tgt
 29 | 
 30 |     def _build_target_tokens(self, src, src_vocab, src_raw, pred, attn):
 31 |         vocab = self.fields["tgt"].vocab
 32 |         tokens = []
 33 |         for tok in pred:
 34 |             if tok < len(vocab):
 35 |                 tokens.append(vocab.itos[tok])
 36 |             else:
 37 |                 tokens.append(src_vocab.itos[tok - len(vocab)])
 38 |             if tokens[-1] == onmt.io.EOS_WORD:
 39 |                 tokens = tokens[:-1]
 40 |                 break
 41 |         if self.replace_unk and (attn is not None) and (src is not None):
 42 |             for i in range(len(tokens)):
 43 |                 if tokens[i] == vocab.itos[onmt.io.UNK]:
 44 |                     _, maxIndex = attn[i].max(0)
 45 |                     tokens[i] = src_raw[maxIndex[0]]
 46 |         return tokens
 47 | 
 48 |     def from_batch(self, translation_batch):
 49 |         batch = translation_batch["batch"]
 50 |         assert(len(translation_batch["gold_score"]) ==
 51 |                len(translation_batch["predictions"]))
 52 |         batch_size = batch.batch_size
 53 | 
 54 |         preds, pred_score, attn, gold_score, indices = list(zip(
 55 |             *sorted(zip(translation_batch["predictions"],
 56 |                         translation_batch["scores"],
 57 |                         translation_batch["attention"],
 58 |                         translation_batch["gold_score"],
 59 |                         batch.indices.data),
 60 |                     key=lambda x: x[-1])))
 61 | 
 62 |         # Sorting
 63 |         inds, perm = torch.sort(batch.indices.data)
 64 |         data_type = self.data.data_type
 65 |         if data_type == 'text':
 66 |             src = batch.src[0].data.index_select(1, perm)
 67 |         else:
 68 |             src = None
 69 | 
 70 |         if self.has_tgt:
 71 |             tgt = batch.tgt.data.index_select(1, perm)
 72 |         else:
 73 |             tgt = None
 74 | 
 75 |         translations = []
 76 |         for b in range(batch_size):
 77 |             if data_type == 'text':
 78 |                 src_vocab = self.data.src_vocabs[inds[b]] \
 79 |                   if self.data.src_vocabs else None
 80 |                 src_raw = self.data.examples[inds[b]].src
 81 |             else:
 82 |                 src_vocab = None
 83 |                 src_raw = None
 84 |             pred_sents = [self._build_target_tokens(
 85 |                 src[:, b] if src is not None else None,
 86 |                 src_vocab, src_raw,
 87 |                 preds[b][n], attn[b][n])
 88 |                           for n in range(self.n_best)]
 89 |             gold_sent = None
 90 |             if tgt is not None:
 91 |                 gold_sent = self._build_target_tokens(
 92 |                     src[:, b] if src is not None else None,
 93 |                     src_vocab, src_raw,
 94 |                     tgt[1:, b] if tgt is not None else None, None)
 95 | 
 96 |             translation = Translation(src[:, b] if src is not None else None,
 97 |                                       src_raw, pred_sents,
 98 |                                       attn[b], pred_score[b], gold_sent,
 99 |                                       gold_score[b])
100 |             translations.append(translation)
101 | 
102 |         return translations
103 | 
104 | 
class Translation(object):
    """
    Container for a translated sentence.

    Attributes:
        src (`LongTensor`): src word ids
        src_raw ([str]): raw src words

        pred_sents ([[str]]): words from the n-best translations
        pred_scores ([[float]]): log-probs of n-best translations
        attns ([`FloatTensor`]) : attention dist for each translation
        gold_sent ([str]): words from gold translation
        gold_score ([float]): log-prob of gold translation

    """
    def __init__(self, src, src_raw, pred_sents,
                 attn, pred_scores, tgt_sent, gold_score):
        self.src = src
        self.src_raw = src_raw
        self.pred_sents = pred_sents
        self.attns = attn
        self.pred_scores = pred_scores
        self.gold_sent = tgt_sent
        self.gold_score = gold_score

    def log(self, sent_number):
        """
        Build a human-readable report for this translation.

        The whole report is returned as one string and nothing is written
        to stdout, so the caller decides where it goes and the score lines
        stay in order with the sentences they describe.

        Args:
            sent_number (int): sentence number shown in the report.

        Returns:
            str: the source, the best prediction and its score, the gold
            sentence and score when a gold sentence is present, and the
            full scored n-best list when more than one prediction exists.
        """
        best_pred = self.pred_sents[0]
        best_score = self.pred_scores[0]

        output = '\nSENT {}: {}\n'.format(sent_number, self.src_raw)
        output += 'PRED {}: {}\n'.format(sent_number, ' '.join(best_pred))
        output += "PRED SCORE: {:.4f}\n".format(best_score)

        if self.gold_sent is not None:
            tgt_sent = ' '.join(self.gold_sent)
            output += 'GOLD {}: {}\n'.format(sent_number, tgt_sent)
            # Terminate the line so the next section starts on its own line.
            output += "GOLD SCORE: {:.4f}\n".format(self.gold_score)

        if len(self.pred_sents) > 1:
            output += '\nBEST HYP:\n'
            for score, sent in zip(self.pred_scores, self.pred_sents):
                output += "[{:.4f}] {}\n".format(score, sent)

        return output
153 | 


--------------------------------------------------------------------------------
/onmt/translate/Translator.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | from torch.autograd import Variable
  3 | 
  4 | import onmt.translate.Beam
  5 | import onmt.io
  6 | 
  7 | 
class Translator(object):
    """
    Uses a model to translate a batch of sentences.


    Args:
       model (:obj:`onmt.modules.NMTModel`):
          NMT model to use for translation
       fields (dict of Fields): data fields
       beam_size (int): size of beam to use
       n_best (int): number of translations produced
       max_length (int): maximum length output to produce
       global_scorer (:obj:`GlobalScorer`):
         object to rescore final translations
       copy_attn (bool): use copy attention during translation
       cuda (bool): use cuda
       beam_trace (bool): trace beam search for debugging
       min_length (int): passed unchanged to every :obj:`Beam` created
         in `translate_batch`
    """
    def __init__(self, model, fields,
                 beam_size, n_best=1,
                 max_length=100,
                 global_scorer=None, copy_attn=False, cuda=False,
                 beam_trace=False, min_length=0):
        self.model = model
        self.fields = fields
        self.n_best = n_best
        self.max_length = max_length
        self.global_scorer = global_scorer
        self.copy_attn = copy_attn
        self.beam_size = beam_size
        self.cuda = cuda
        self.min_length = min_length

        # for debugging
        # NOTE(review): beam_accum is only initialised here; no other
        # method of this class reads or fills it.
        self.beam_accum = None
        if beam_trace:
            self.beam_accum = {
                "predicted_ids": [],
                "beam_parent_ids": [],
                "scores": [],
                "log_probs": []}

    def translate_batch(self, batch, data):
        """
        Translate a batch of sentences.

        Mostly a wrapper around :obj:`Beam`.

        Args:
           batch (:obj:`Batch`): a batch from a dataset object
           data (:obj:`Dataset`): the dataset object

        Returns:
           dict with keys "predictions", "scores" and "attention" (one
           entry per batch element, see `_from_beam`), "gold_score" and
           "batch" (the input batch itself).

        Todo:
           Shouldn't need the original dataset.
        """

        # (0) Prep each of the components of the search.
        # And helper method for reducing verbosity.
        beam_size = self.beam_size
        batch_size = batch.batch_size
        data_type = data.data_type
        vocab = self.fields["tgt"].vocab
        # One independent beam per sentence in the batch.
        beam = [onmt.translate.Beam(beam_size, n_best=self.n_best,
                                    cuda=self.cuda,
                                    global_scorer=self.global_scorer,
                                    pad=vocab.stoi[onmt.io.PAD_WORD],
                                    eos=vocab.stoi[onmt.io.EOS_WORD],
                                    bos=vocab.stoi[onmt.io.BOS_WORD],
                                    min_length=self.min_length)
                for __ in range(batch_size)]

        # Help functions for working with beams and batches
        # var: wrap a tensor in a volatile Variable.
        def var(a): return Variable(a, volatile=True)

        # rvar: tile the tensor beam_size times along dimension 1, then wrap.
        def rvar(a): return var(a.repeat(1, beam_size, 1))

        # bottle / unbottle: switch between a flat
        # (batch_size * beam_size, -1) view and a
        # (beam_size, batch_size, -1) view.
        def bottle(m):
            return m.view(batch_size * beam_size, -1)

        def unbottle(m):
            return m.view(beam_size, batch_size, -1)

        # (1) Run the encoder on the src.
        src = onmt.io.make_features(batch, 'src', data_type)
        src_lengths = None
        if data_type == 'text':
            _, src_lengths = batch.src

        enc_states, context = self.model.encoder(src, src_lengths)
        dec_states = self.model.decoder.init_decoder_state(
                                        src, context, enc_states)

        # Non-text inputs carry no lengths: use the full context length
        # (context.size(0)) for every example.
        if src_lengths is None:
            src_lengths = torch.Tensor(batch_size).type_as(context.data)\
                                                  .long()\
                                                  .fill_(context.size(0))

        # (2) Repeat src objects `beam_size` times.
        src_map = rvar(batch.src_map.data) \
            if data_type == 'text' and self.copy_attn else None
        context = rvar(context.data)
        context_lengths = src_lengths.repeat(beam_size)
        dec_states.repeat_beam_size_times(beam_size)

        # (3) run the decoder to generate sentences, using beam search.
        for i in range(self.max_length):
            # Stop early once every beam reports it is done.
            if all((b.done() for b in beam)):
                break

            # Construct batch x beam_size nxt words.
            # Get all the pending current beam words and arrange for forward.
            inp = var(torch.stack([b.get_current_state() for b in beam])
                      .t().contiguous().view(1, -1))

            # Turn any copied words to UNKs
            # 0 is unk
            if self.copy_attn:
                inp = inp.masked_fill(
                    inp.gt(len(self.fields["tgt"].vocab) - 1), 0)

            # Temporary kludge solution to handle changed dim expectation
            # in the decoder
            inp = inp.unsqueeze(2)

            # Run one step.
            dec_out, dec_states, attn = self.model.decoder(
                inp, context, dec_states, context_lengths=context_lengths)
            dec_out = dec_out.squeeze(0)
            # dec_out: beam x rnn_size

            # (b) Compute a vector of batch*beam word scores.
            if not self.copy_attn:
                out = self.model.generator.forward(dec_out).data
                out = unbottle(out)
                # beam x tgt_vocab
            else:
                out = self.model.generator.forward(dec_out,
                                                   attn["copy"].squeeze(0),
                                                   src_map)
                # beam x (tgt_vocab + extra_vocab)
                out = data.collapse_copy_scores(
                    unbottle(out.data),
                    batch, self.fields["tgt"].vocab, data.src_vocabs)
                # beam x tgt_vocab
                out = out.log()

            # (c) Advance each beam.
            for j, b in enumerate(beam):
                # Give beam j its own scores and its attention, truncated
                # to context_lengths[j] entries.
                b.advance(
                    out[:, j],
                    unbottle(attn["std"]).data[:, j, :context_lengths[j]])
                # Update the decoder state for example j using the origins
                # reported by the beam.
                dec_states.beam_update(j, b.get_current_origin(), beam_size)

        # (4) Extract sentences from beam.
        ret = self._from_beam(beam)
        # Default gold scores are zeros; replaced when the batch has a tgt.
        ret["gold_score"] = [0] * batch_size
        if "tgt" in batch.__dict__:
            ret["gold_score"] = self._run_target(batch, data)
        ret["batch"] = batch
        return ret

    def _from_beam(self, beam):
        """
        Collect the n-best hypotheses from every beam.

        Returns a dict with keys "predictions", "scores" and "attention";
        each value is a list with one entry per beam. "scores" holds the
        full sorted score list returned by `sort_finished`, while
        "predictions" and "attention" hold at most `n_best` items each.
        """
        ret = {"predictions": [],
               "scores": [],
               "attention": []}
        for b in beam:
            n_best = self.n_best
            # Request at least n_best finished hypotheses from the beam.
            scores, ks = b.sort_finished(minimum=n_best)
            hyps, attn = [], []
            for i, (times, k) in enumerate(ks[:n_best]):
                hyp, att = b.get_hyp(times, k)
                hyps.append(hyp)
                attn.append(att)
            ret["predictions"].append(hyps)
            ret["scores"].append(scores)
            ret["attention"].append(attn)
        return ret

    def _run_target(self, batch, data):
        """
        Score the gold target of `batch` under the model.

        Runs the encoder and decoder on the gold target input and sums,
        per example, the generator output at each gold token, with padding
        positions contributing 0. Returns a FloatTensor created with
        `batch.batch_size` elements.
        """
        data_type = data.data_type
        if data_type == 'text':
            _, src_lengths = batch.src
        else:
            src_lengths = None
        src = onmt.io.make_features(batch, 'src', data_type)
        # Decoder input is the target without its last row.
        tgt_in = onmt.io.make_features(batch, 'tgt')[:-1]

        #  (1) run the encoder on the src
        enc_states, context = self.model.encoder(src, src_lengths)
        dec_states = self.model.decoder.init_decoder_state(src,
                                                           context, enc_states)

        #  (2) if a target is specified, compute the 'goldScore'
        #  (i.e. log likelihood) of the target under the model
        tt = torch.cuda if self.cuda else torch
        gold_scores = tt.FloatTensor(batch.batch_size).fill_(0)
        dec_out, dec_states, attn = self.model.decoder(
            tgt_in, context, dec_states, context_lengths=src_lengths)

        tgt_pad = self.fields["tgt"].vocab.stoi[onmt.io.PAD_WORD]
        # Pair each decoder output with the target row it should predict
        # (the target shifted by one).
        for dec, tgt in zip(dec_out, batch.tgt[1:].data):
            # Log prob of each word.
            out = self.model.generator.forward(dec)
            tgt = tgt.unsqueeze(1)
            scores = out.data.gather(1, tgt)
            # Padding positions must not contribute to the score.
            scores.masked_fill_(tgt.eq(tgt_pad), 0)
            # NOTE(review): adds a gathered (n, 1) result into a 1-d
            # accumulator in place; confirm this is accepted by the torch
            # version in use.
            gold_scores += scores
        return gold_scores
217 | 


--------------------------------------------------------------------------------
/onmt/translate/__init__.py:
--------------------------------------------------------------------------------
1 | from onmt.translate.Translator import Translator
2 | from onmt.translate.TranslatorMultimodalVI import TranslatorMultimodalVI
3 | from onmt.translate.Translation import Translation, TranslationBuilder
4 | from onmt.translate.Beam import Beam, GNMTGlobalScorer
5 | 
# __all__ must hold the public names as strings: `from onmt.translate
# import *` looks every entry up by name and fails on non-string items.
__all__ = ["Translator",
           "Translation", "Beam", "GNMTGlobalScorer", "TranslationBuilder"]
__all__ += ["TranslatorMultimodalVI"]
9 | 


--------------------------------------------------------------------------------
/onmt/translate/__pycache__/Beam.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/translate/__pycache__/Beam.cpython-36.pyc


--------------------------------------------------------------------------------
/onmt/translate/__pycache__/Translation.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/translate/__pycache__/Translation.cpython-36.pyc


--------------------------------------------------------------------------------
/onmt/translate/__pycache__/Translator.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/translate/__pycache__/Translator.cpython-36.pyc


--------------------------------------------------------------------------------
/onmt/translate/__pycache__/TranslatorMultimodalVI.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/translate/__pycache__/TranslatorMultimodalVI.cpython-36.pyc


--------------------------------------------------------------------------------
/onmt/translate/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/translate/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/preprocess.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | import argparse
  5 | import os
  6 | import glob
  7 | import sys
  8 | import torch
  9 | import onmt.io
 10 | import opts
 11 | 
 12 | 
 13 | def check_existing_pt_files(opt):
 14 |     # We will use glob.glob() to find sharded {train|valid}.[0-9]*.pt
 15 |     # when training, so check to avoid tampering with existing pt files
 16 |     # or mixing them up.
 17 |     for t in ['train', 'valid', 'vocab']:
 18 |         pattern = opt.save_data + '.' + t + '*.pt'
 19 |         if glob.glob(pattern):
 20 |             sys.stderr.write("Please backup exisiting pt file: %s, "
 21 |                              "to avoid tampering!\n" % pattern)
 22 |             sys.exit(1)
 23 | 
 24 | 
 25 | def parse_args():
 26 |     parser = argparse.ArgumentParser(
 27 |         description='preprocess.py',
 28 |         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 29 | 
 30 |     opts.add_md_help_argument(parser)
 31 |     opts.preprocess_opts(parser)
 32 | 
 33 |     opt = parser.parse_args()
 34 |     torch.manual_seed(opt.seed)
 35 | 
 36 |     check_existing_pt_files(opt)
 37 | 
 38 |     return opt
 39 | 
 40 | 
 41 | def build_save_text_dataset_in_shards(src_corpus, tgt_corpus, fields,
 42 |                                       corpus_type, opt):
 43 |     '''
 44 |     Divide the big corpus into shards, and build dataset separately.
 45 |     This is currently only for data_type=='text'.
 46 | 
 47 |     The reason we do this is to avoid taking up too much memory due
 48 |     to sucking in a huge corpus file.
 49 | 
 50 |     To tackle this, we only read in part of the corpus file of size
 51 |     `max_shard_size`(actually it is multiples of 64 bytes that equals
 52 |     or is slightly larger than this size), and process it into dataset,
 53 |     then write it to disk along the way. By doing this, we only focus on
 54 |     part of the corpus at any moment, thus effectively reducing memory use.
 55 |     According to test, this method can reduce memory footprint by ~50%.
 56 | 
 57 |     Note! As we process along the shards, previous shards might still
 58 |     stay in memory, but since we are done with them, and no more
 59 |     reference to them, if there is memory tight situation, the OS could
 60 |     easily reclaim these memory.
 61 | 
 62 |     If `max_shard_size` is 0 or is larger than the corpus size, it is
 63 |     effectively preprocessed into one dataset, i.e. no sharding.
 64 | 
 65 |     NOTE! `max_shard_size` is measuring the input corpus size, not the
 66 |     output pt file size. So a shard pt file consists of examples of size
 67 |     2 * `max_shard_size`(source + target).
 68 |     '''
 69 | 
 70 |     corpus_size = os.path.getsize(src_corpus)
 71 |     if corpus_size > 10 * (1024**2) and opt.max_shard_size == 0:
 72 |         print("Warning. The corpus %s is larger than 10M bytes, you can "
 73 |               "set '-max_shard_size' to process it by small shards "
 74 |               "to use less memory." % src_corpus)
 75 | 
 76 |     if opt.max_shard_size != 0:
 77 |         print(' * divide corpus into shards and build dataset separately'
 78 |               '(shard_size = %d bytes).' % opt.max_shard_size)
 79 | 
 80 |     ret_list = []
 81 |     src_iter = onmt.io.ShardedTextCorpusIterator(
 82 |                 src_corpus, opt.src_seq_length_trunc,
 83 |                 "src", opt.max_shard_size)
 84 |     tgt_iter = onmt.io.ShardedTextCorpusIterator(
 85 |                 tgt_corpus, opt.tgt_seq_length_trunc,
 86 |                 "tgt", opt.max_shard_size,
 87 |                 assoc_iter=src_iter)
 88 | 
 89 |     index = 0
 90 |     while not src_iter.hit_end():
 91 |         index += 1
 92 |         dataset = onmt.io.TextDataset(
 93 |                 fields, src_iter, tgt_iter,
 94 |                 src_iter.num_feats, tgt_iter.num_feats,
 95 |                 src_seq_length=opt.src_seq_length,
 96 |                 tgt_seq_length=opt.tgt_seq_length,
 97 |                 dynamic_dict=opt.dynamic_dict)
 98 | 
 99 |         # We save fields in vocab.pt seperately, so make it empty.
100 |         dataset.fields = []
101 | 
102 |         pt_file = "{:s}.{:s}.{:d}.pt".format(
103 |                 opt.save_data, corpus_type, index)
104 |         print(" * saving %s data shard to %s." % (corpus_type, pt_file))
105 |         torch.save(dataset, pt_file)
106 | 
107 |         ret_list.append(pt_file)
108 | 
109 |     return ret_list
110 | 
111 | 
def build_save_dataset(corpus_type, fields, opt):
    """
    Build the 'train' or 'valid' dataset and save it as .pt file(s).

    Text corpora are handed to `build_save_text_dataset_in_shards`; any
    other data type is built as one monolithic dataset. Returns the list
    of files written.
    """
    assert corpus_type in ['train', 'valid']

    if corpus_type == 'train':
        src_corpus, tgt_corpus = opt.train_src, opt.train_tgt
    else:
        src_corpus, tgt_corpus = opt.valid_src, opt.valid_tgt

    # Preprocess sharding is currently only done for data_type == 'text'.
    if opt.data_type == 'text':
        return build_save_text_dataset_in_shards(
            src_corpus, tgt_corpus, fields,
            corpus_type, opt)

    # 'img' and 'audio' data are not sharded for now: a single monolithic
    # dataset is built. The interfaces are uniform, so sharding could be
    # added should users need it.
    build_kwargs = dict(
        src_dir=opt.src_dir,
        src_seq_length=opt.src_seq_length,
        tgt_seq_length=opt.tgt_seq_length,
        src_seq_length_trunc=opt.src_seq_length_trunc,
        tgt_seq_length_trunc=opt.tgt_seq_length_trunc,
        dynamic_dict=opt.dynamic_dict,
        sample_rate=opt.sample_rate,
        window_size=opt.window_size,
        window_stride=opt.window_stride,
        window=opt.window)
    dataset = onmt.io.build_dataset(
        fields, opt.data_type, src_corpus, tgt_corpus, **build_kwargs)

    # Fields are saved separately in vocab.pt, so drop them here.
    dataset.fields = []

    out_path = "{:s}.{:s}.pt".format(opt.save_data, corpus_type)
    print(" * saving %s dataset to %s." % (corpus_type, out_path))
    torch.save(dataset, out_path)

    return [out_path]
153 | 
154 | 
def build_save_vocab(train_dataset, fields, opt):
    """
    Build the vocabularies from the training data and save them to
    `<opt.save_data>.vocab.pt`.
    """
    vocab_fields = onmt.io.build_vocab(
        train_dataset, fields, opt.data_type,
        opt.share_vocab,
        opt.src_vocab,
        opt.src_vocab_size,
        opt.src_words_min_frequency,
        opt.tgt_vocab,
        opt.tgt_vocab_size,
        opt.tgt_words_min_frequency)

    # Fields cannot be saved directly; store the vocab form and let
    # training reconstruct the fields.
    saveable_vocab = onmt.io.save_fields_to_vocab(vocab_fields)
    torch.save(saveable_vocab, opt.save_data + '.vocab.pt')
168 | 
169 | 
def main():
    """
    Run preprocessing end to end: parse options, count features, build
    the fields, then save training data, vocabulary and validation data.
    """
    opt = parse_args()

    print("opt.train_src", opt.train_src)
    print("opt.train_tgt", opt.train_tgt)

    print("Extracting features...")
    n_src_feats = onmt.io.get_num_features(
        opt.data_type, opt.train_src, 'src')
    n_tgt_feats = onmt.io.get_num_features(
        opt.data_type, opt.train_tgt, 'tgt')
    print(" * number of source features: %d." % n_src_feats)
    print(" * number of target features: %d." % n_tgt_feats)

    print("Building `Fields` object...")
    fields = onmt.io.get_fields(opt.data_type, n_src_feats, n_tgt_feats)

    print("Building & saving training data...")
    train_files = build_save_dataset('train', fields, opt)

    # The vocabulary is built from the training files saved just above.
    print("Building & saving vocabulary...")
    build_save_vocab(train_files, fields, opt)

    print("Building & saving validation data...")
    build_save_dataset('valid', fields, opt)
193 | 
194 | 
# Run the preprocessing pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()
197 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | torch==0.3.1
2 | tables
3 | torchvision==0.2.0
4 | pretrainedmodels
5 | six
6 | tqdm
7 | torchtext==0.2.3
8 | future
9 | 


--------------------------------------------------------------------------------
/run_translated_m30k_only.sh:
--------------------------------------------------------------------------------
# This script assumes that at least 2 GPU cards are available on this machine.
# Please edit the script accordingly if fewer GPU cards are available.
  3 | 
  4 | DATA_PATH="/path/to/data/multi30k"
  5 | MODEL_PATH="/path/to/variational-multimodal-nmt-model-snapshots"
  6 | MODEL_FILE_NAME="MMT_VI_Model_TranslatedM30K"
  7 | 
  8 | 
  9 | # multi30k validation set
 10 | VAL_SRC="${DATA_PATH}/val.lc.norm.tok.bpe-en-de-30000.en"
 11 | VAL_TGT="${DATA_PATH}/val.lc.norm.tok.bpe-en-de-30000.de"
 12 | VAL_IMGS="${DATA_PATH}/flickr30k_valid_resnet50_cnn_features.hdf5"
 13 | 
 14 | # multi30k training set
 15 | TRAIN_SRC="${DATA_PATH}/train.lc.norm.tok.bpe-en-de-30000.en"
 16 | TRAIN_TGT="${DATA_PATH}/train.lc.norm.tok.bpe-en-de-30000.de"
 17 | TRAIN_IMGS="${DATA_PATH}/flickr30k_train_resnet50_cnn_features.hdf5"
 18 | 
 19 | # multi30k test set (2016)
 20 | TEST_2016_SRC="${DATA_PATH}/test_2016_flickr.lc.norm.tok.bpe-en-de-30000.en"
 21 | TEST_2016_TGT="${DATA_PATH}/test_2016_flickr.lc.norm.tok.bpe-en-de-30000.de"
 22 | TEST_2016_IMGS="${DATA_PATH}/flickr30k_test_resnet50_cnn_features.hdf5"
 23 | 
 24 | # multi30k test set (2017)
 25 | TEST_2017_SRC="${DATA_PATH}/test_2017_flickr.lc.norm.tok.bpe-en-de-30000.en"
 26 | TEST_2017_TGT="${DATA_PATH}/test_2017_flickr.lc.norm.tok.bpe-en-de-30000.de"
 27 | TEST_2017_IMGS="${DATA_PATH}/flickr30k_test_2017_flickr_resnet50_cnn_features.hdf5"
 28 | 
 29 | # ambiguous MSCOCO test set (2017)
 30 | TEST_2017_MSCOCO_SRC="${DATA_PATH}/test_2017_mscoco.lc.norm.tok.bpe-en-de-30000.en"
 31 | TEST_2017_MSCOCO_TGT="${DATA_PATH}/test_2017_mscoco.lc.norm.tok.bpe-en-de-30000.de"
 32 | TEST_2017_MSCOCO_IMGS="${DATA_PATH}/flickr30k_test_2017_mscoco_resnet50_cnn_features.hdf5"
 33 | 
 34 | EPOCHS=30
 35 | #EPOCHS=1
 36 | 
 37 | ##########
 38 | # train
 39 | ##########
 40 | 
 41 | # train the model on the translated Multi30k data set only (~29K src/tgt/img instances)
 42 | DATASET=${DATA_PATH}/m30k
 43 | 
 44 | # train one conditional prior and one fixed-prior model, each logging stdout+stderr to <save_model>.log
 45 | # one model on gpu 0, another one on gpu 1 (both spawn validation set translations on gpu 1)
 46 | python train_mm_vi_model1.py \
 47 |     -gpuid 0 -epochs ${EPOCHS} -batch_size 40 -valid_batch_size 40 -optim 'adam' -learning_rate 0.002 -rnn_type LSTM \
 48 |     -rnn_size 500 --z_latent_dim 500 \
 49 |     -early_stopping_criteria 'bleu' \
 50 |     -src ${VAL_SRC} \
 51 |     -tgt ${VAL_TGT} \
 52 |     -path_to_train_img_feats ${TRAIN_IMGS} \
 53 |     -path_to_valid_img_feats ${VAL_IMGS} \
 54 |     -data ${DATASET} \
 55 |     --multimodal_model_type  vi-model1 --use_global_image_features -dropout 0.5 -dropout_imgs 0.5 \
 56 |     -save_model ${MODEL_PATH}/${MODEL_FILE_NAME}.fixed-prior \
 57 |     -overwrite_model_file > "${MODEL_PATH}/${MODEL_FILE_NAME}.fixed-prior.log" 2>&1 &
 58 | 
 59 | python train_mm_vi_model1.py \
 60 |     -gpuid 1 -epochs ${EPOCHS} -batch_size 40 -valid_batch_size 40 -optim 'adam' -learning_rate 0.002 -rnn_type LSTM \
 61 |     -rnn_size 500 --z_latent_dim 500 \
 62 |     -early_stopping_criteria 'bleu' \
 63 |     -src ${VAL_SRC} \
 64 |     -tgt ${VAL_TGT} \
 65 |     -path_to_train_img_feats ${TRAIN_IMGS} \
 66 |     -path_to_valid_img_feats ${VAL_IMGS} \
 67 |     -data ${DATASET} \
 68 |     --multimodal_model_type  vi-model1 --use_global_image_features -dropout 0.5 -dropout_imgs 0.5 \
 69 |     -save_model ${MODEL_PATH}/${MODEL_FILE_NAME}.conditional-prior \
 70 |     -overwrite_model_file \
 71 |     --conditional > "${MODEL_PATH}/${MODEL_FILE_NAME}.conditional-prior.log" 2>&1 &
 72 | 
 73 | wait;
 74 | 
 75 | #############
 76 | # translate
 77 | #############
 78 | 
 79 | # translate the validation set
 80 | SPLIT="validation"
 81 | python translate_mm_vi.py \
 82 |     -model ${MODEL_PATH}/${MODEL_FILE_NAME}.fixed-prior_BestModelBleu.pt \
 83 |     -src ${VAL_SRC} \
 84 |     -path_to_test_img_feats ${VAL_IMGS} \
 85 |     -gpu 0 \
 86 |     -output ${MODEL_PATH}/${MODEL_FILE_NAME}.fixed-prior_BestModelBleu.pt.${SPLIT}-translations &
 87 | 
 88 | python translate_mm_vi.py \
 89 |     -model ${MODEL_PATH}/${MODEL_FILE_NAME}.conditional-prior_BestModelBleu.pt \
 90 |     -src ${VAL_SRC} \
 91 |     -path_to_test_img_feats ${VAL_IMGS} \
 92 |     -gpu 1 \
 93 |     -output ${MODEL_PATH}/${MODEL_FILE_NAME}.conditional-prior_BestModelBleu.pt.${SPLIT}-translations &
 94 | 
 95 | wait;
 96 | 
 97 | # translate the test set (2016)
 98 | SPLIT="test2016"
 99 | python translate_mm_vi.py \
100 |     -model ${MODEL_PATH}/${MODEL_FILE_NAME}.fixed-prior_BestModelBleu.pt \
101 |     -src ${TEST_2016_SRC} \
102 |     -path_to_test_img_feats ${TEST_2016_IMGS} \
103 |     -gpu 0 \
104 |     -output ${MODEL_PATH}/${MODEL_FILE_NAME}.fixed-prior_BestModelBleu.pt.${SPLIT}-translations &
105 | 
106 | python translate_mm_vi.py \
107 |     -model ${MODEL_PATH}/${MODEL_FILE_NAME}.conditional-prior_BestModelBleu.pt \
108 |     -src ${TEST_2016_SRC} \
109 |     -path_to_test_img_feats ${TEST_2016_IMGS} \
110 |     -gpu 1 \
111 |     -output ${MODEL_PATH}/${MODEL_FILE_NAME}.conditional-prior_BestModelBleu.pt.${SPLIT}-translations &
112 | 
113 | wait;
114 | 
115 | # translate the test set (2017)
116 | SPLIT="test2017"
117 | python translate_mm_vi.py \
118 |     -model ${MODEL_PATH}/${MODEL_FILE_NAME}.fixed-prior_BestModelBleu.pt \
119 |     -src ${TEST_2017_SRC} \
120 |     -path_to_test_img_feats ${TEST_2017_IMGS} \
121 |     -gpu 0 \
122 |     -output ${MODEL_PATH}/${MODEL_FILE_NAME}.fixed-prior_BestModelBleu.pt.${SPLIT}-translations &
123 | 
124 | python translate_mm_vi.py \
125 |     -model ${MODEL_PATH}/${MODEL_FILE_NAME}.conditional-prior_BestModelBleu.pt \
126 |     -src ${TEST_2017_SRC} \
127 |     -path_to_test_img_feats ${TEST_2017_IMGS} \
128 |     -gpu 1 \
129 |     -output ${MODEL_PATH}/${MODEL_FILE_NAME}.conditional-prior_BestModelBleu.pt.${SPLIT}-translations &
130 | 
131 | wait;
132 | 
133 | # translate the ambiguous MSCOCO test set (2017)
134 | SPLIT="test2017_mscoco"
135 | python translate_mm_vi.py \
136 |     -model ${MODEL_PATH}/${MODEL_FILE_NAME}.fixed-prior_BestModelBleu.pt \
137 |     -src ${TEST_2017_MSCOCO_SRC} \
138 |     -path_to_test_img_feats ${TEST_2017_MSCOCO_IMGS} \
139 |     -gpu 0 \
140 |     -output ${MODEL_PATH}/${MODEL_FILE_NAME}.fixed-prior_BestModelBleu.pt.${SPLIT}-translations &
141 | 
142 | python translate_mm_vi.py \
143 |     -model ${MODEL_PATH}/${MODEL_FILE_NAME}.conditional-prior_BestModelBleu.pt \
144 |     -src ${TEST_2017_MSCOCO_SRC} \
145 |     -path_to_test_img_feats ${TEST_2017_MSCOCO_IMGS} \
146 |     -gpu 1 \
147 |     -output ${MODEL_PATH}/${MODEL_FILE_NAME}.conditional-prior_BestModelBleu.pt.${SPLIT}-translations &
148 | 
149 | wait;
150 | 
151 | echo -ne "Finished. Translations of valid/test 2016/test 2017 (Flickr and ambiguous MSCOCO) can be found in:\n${MODEL_PATH}/${MODEL_FILE_NAME}.{fixed,conditional}-prior_BestModelBleu.pt.{validation,test2016,test2017,test2017_mscoco}-translations\n"
152 | 
153 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | from setuptools import setup
4 | 
5 | setup(name='OpenNMT-py',
6 |       description='A python implementation of OpenNMT',
7 |       version='0.1',
8 |       packages=['onmt', 'onmt.io', 'onmt.translate', 'onmt.modules'])
9 | 


--------------------------------------------------------------------------------
/tools/multi-bleu.perl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env perl
  2 | #
  3 | # This file is part of moses.  Its use is licensed under the GNU Lesser General
  4 | # Public License version 2.1 or, at your option, any later version.
  5 | 
  6 | # $Id$
  7 | use warnings;
  8 | use strict;
  9 | # Optional leading "-lc" flag: lowercase hypotheses and references before matching.
 10 | my $lowercase = 0;
 11 | if (defined $ARGV[0] && $ARGV[0] eq "-lc") {
 12 |   $lowercase = 1;
 13 |   shift;
 14 | }
 15 | # The next argument is the reference file name (or the stem of numbered reference files).
 16 | my $stem = $ARGV[0];
 17 | if (!defined $stem) {
 18 |   print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n";
 19 |   print STDERR "Reads the references from reference or reference0, reference1, ...\n";
 20 |   exit(1);
 21 | }
 22 | # Fall back to "<stem>.ref" when neither <stem> nor <stem>0 exists but <stem>.ref0 does.
 23 | $stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0";
 24 | # Load the numbered references <stem>0, <stem>1, ... and then <stem> itself if it exists.
 25 | my @REF;
 26 | my $ref=0;
 27 | while(-e "$stem$ref") {
 28 |     &add_to_ref("$stem$ref",\@REF);
 29 |     $ref++;
 30 | }
 31 | &add_to_ref($stem,\@REF) if -e $stem;
 32 | die("ERROR: could not find reference file $stem") unless scalar @REF;
 33 | 
 34 | # add additional references explicitly specified on the command line
 35 | shift;
 36 | foreach my $stem (@ARGV) {
 37 |     &add_to_ref($stem,\@REF) if -e $stem;
 38 | }
 39 | 
 40 | # add_to_ref($file, \@REF): append line i of $file (plain text or *.gz) to @{$REF->[i]},
 41 | # so that $REF->[i] ends up listing every reference translation of sentence i.
 42 | sub add_to_ref {
 43 |     my ($file,$REF) = @_;
 44 |     my $s=0;
 45 |     if ($file =~ /\.gz$/) {
 46 |         open(REF, "-|", "gzip", "-dc", $file) or die "Can't read $file";  # list form: no shell sees $file
 47 |     } else {
 48 |         open(REF, "<", $file) or die "Can't read $file";  # explicit read mode, name is never parsed as a mode
 49 |     }
 50 |     while(<REF>) {
 51 |         chomp;  # strip only the newline; chop would truncate an unterminated last line
 52 |         push @{$$REF[$s++]}, $_;
 53 |     }
 54 |     close(REF);
 55 | }
 56 | # Read the hypotheses from STDIN, one sentence per line, accumulating clipped n-gram matches and lengths.
 57 | my(@CORRECT,@TOTAL,$length_translation,$length_reference);
 58 | my $s=0;
 59 | while(<STDIN>) {
 60 |     chomp;  # not chop: an unterminated final line must keep its last character
 61 |     $_ = lc if $lowercase;
 62 |     my @WORD = split;
 63 |     my %REF_NGRAM = ();
 64 |     my $length_translation_this_sentence = scalar(@WORD);
 65 |     my ($closest_diff,$closest_length) = (9999,9999);  # 9999 = no reference examined yet
 66 |     foreach my $reference (@{$REF[$s]}) {
 67 |         # per reference: track the closest reference length and the max count of each n-gram
 68 |         $reference = lc($reference) if $lowercase;
 69 |         my @WORD = split(' ',$reference);
 70 |         my $length = scalar(@WORD);
 71 |         my $diff = abs($length_translation_this_sentence-$length);
 72 |         if ($diff < $closest_diff) {
 73 |             $closest_diff = $diff;
 74 |             $closest_length = $length;
 75 |             # print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n";
 76 |         } elsif ($diff == $closest_diff) {
 77 |             $closest_length = $length if $length < $closest_length;
 78 |             # from two references with the same closeness to me
 79 |             # take the *shorter* into account, not the "first" one.
 80 |         }
 81 |         for(my $n=1;$n<=4;$n++) {
 82 |             my %REF_NGRAM_N = ();
 83 |             for(my $start=0;$start<=$#WORD-($n-1);$start++) {
 84 |                 my $ngram = "$n";
 85 |                 for(my $w=0;$w<$n;$w++) {
 86 |                     $ngram .= " ".$WORD[$start+$w];
 87 |                 }
 88 |                 $REF_NGRAM_N{$ngram}++;
 89 |             }
 90 |             foreach my $ngram (keys %REF_NGRAM_N) {
 91 |                 if (!defined($REF_NGRAM{$ngram}) ||
 92 |                     $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) {
 93 |                     $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram};
 94 |                     # keeps the largest count of this n-gram found in any single reference
 95 |                 }
 96 |             }
 97 |         }
 98 |     }
 99 |     $length_translation += $length_translation_this_sentence;
100 |     $length_reference += $closest_length;
101 |     for(my $n=1;$n<=4;$n++) {
102 |         my %T_NGRAM = ();
103 |         for(my $start=0;$start<=$#WORD-($n-1);$start++) {
104 |             my $ngram = "$n";
105 |             for(my $w=0;$w<$n;$w++) {
106 |                 $ngram .= " ".$WORD[$start+$w];
107 |             }
108 |             $T_NGRAM{$ngram}++;
109 |         }
110 |         foreach my $ngram (keys %T_NGRAM) {
111 |             $ngram =~ /^(\d+) /;
112 |             my $n = $1;
113 |             # my $corr = 0;
114 |             # every hypothesis n-gram counts towards the total of its order
115 |             $TOTAL[$n] += $T_NGRAM{$ngram};
116 |             if (defined($REF_NGRAM{$ngram})) {
117 |                 if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) {
118 |                     $CORRECT[$n] += $T_NGRAM{$ngram};
119 |                     # $corr =  $T_NGRAM{$ngram};
120 |                     # the references contain it at least as often: every occurrence is correct
121 |                 }
122 |                 else {
123 |                     $CORRECT[$n] += $REF_NGRAM{$ngram};
124 |                     # $corr =  $REF_NGRAM{$ngram};
125 |                     # clipped: credit only as many occurrences as the references contain
126 |                 }
127 |             }
128 |             # $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram};
129 |             # print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n"
130 |         }
131 |     }
132 |     $s++;
133 | }
134 | my $brevity_penalty = 1;
135 | my $bleu = 0;
136 | 
137 | my @bleu=();
138 | # $bleu[$n] = clipped n-gram precision of order $n (0 when nothing of that order was seen or matched)
139 | for(my $n=1;$n<=4;$n++) {
140 |   if (defined ($TOTAL[$n])){
141 |     $bleu[$n]=($TOTAL[$n])?($CORRECT[$n] || 0)/$TOTAL[$n]:0;
142 |     # print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n";
143 |   }else{
144 |     $bleu[$n]=0;
145 |   }
146 | }
147 | # no reference words counted (e.g. nothing was read from STDIN): report a zero score and exit non-zero
148 | if (!$length_reference){
149 |   printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n";
150 |   exit(1);
151 | }
152 | # brevity penalty for hypotheses shorter than the references; an all-empty hypothesis gets 0 instead of dividing by zero
153 | if ($length_translation<$length_reference) {
154 |   $brevity_penalty = $length_translation ? exp(1-$length_reference/$length_translation) : 0;
155 | }
156 | $bleu = $brevity_penalty * exp((my_log( $bleu[1] ) +
157 | 				my_log( $bleu[2] ) +
158 | 				my_log( $bleu[3] ) +
159 | 				my_log( $bleu[4] ) ) / 4) ;
160 | printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n",
161 |     100*$bleu,
162 |     100*$bleu[1],
163 |     100*$bleu[2],
164 |     100*$bleu[3],
165 |     100*$bleu[4],
166 |     $brevity_penalty,
167 |     $length_translation / $length_reference,
168 |     $length_translation,
169 |     $length_reference;
170 | 
171 | # Natural log of the argument; a false argument (0, undef, "") yields -9999999999 instead.
172 | sub my_log {
173 |   return $_[0] ? log($_[0]) : -9999999999;
174 | }
175 | 


--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/README.txt:
--------------------------------------------------------------------------------
1 | The language suffix can be found here:
2 | 
3 | http://www.loc.gov/standards/iso639-2/php/code_list.php
4 | 
5 | This code includes data from Daniel Naber's Language Tools (Czech abbreviations).
6 | This code includes data from the Czech Wiktionary (also Czech abbreviations).
7 | 
8 | 
9 | 


--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.ca:
--------------------------------------------------------------------------------
 1 | Dr
 2 | Dra
 3 | pàg
 4 | p
 5 | c
 6 | av
 7 | Sr
 8 | Sra
 9 | adm
10 | esq
11 | Prof
12 | S.A
13 | S.L
14 | p.e
15 | ptes
16 | Sta
17 | St
18 | pl
19 | màx
20 | cast
21 | dir
22 | nre
23 | fra
24 | admdora
25 | Emm
26 | Excma
27 | espf
28 | dc
29 | admdor
30 | tel
31 | angl
32 | aprox
33 | ca
34 | dept
35 | dj
36 | dl
37 | dt
38 | ds
39 | dg
40 | dv
41 | ed
42 | entl
43 | al
44 | i.e
45 | maj
46 | smin
47 | n
48 | núm
49 | pta
50 | A
51 | B
52 | C
53 | D
54 | E
55 | F
56 | G
57 | H
58 | I
59 | J
60 | K
61 | L
62 | M
63 | N
64 | O
65 | P
66 | Q
67 | R
68 | S
69 | T
70 | U
71 | V
72 | W
73 | X
74 | Y
75 | Z
76 | 


--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.cs:
--------------------------------------------------------------------------------
  1 | Bc
  2 | BcA
  3 | Ing
  4 | Ing.arch
  5 | MUDr
  6 | MVDr
  7 | MgA
  8 | Mgr
  9 | JUDr
 10 | PhDr
 11 | RNDr
 12 | PharmDr
 13 | ThLic
 14 | ThDr
 15 | Ph.D
 16 | Th.D
 17 | prof
 18 | doc
 19 | CSc
 20 | DrSc
 21 | dr. h. c
 22 | PaedDr
 23 | Dr
 24 | PhMr
 25 | DiS
 26 | abt
 27 | ad
 28 | a.i
 29 | aj
 30 | angl
 31 | anon
 32 | apod
 33 | atd
 34 | atp
 35 | aut
 36 | bd
 37 | biogr
 38 | b.m
 39 | b.p
 40 | b.r
 41 | cca
 42 | cit
 43 | cizojaz
 44 | c.k
 45 | col
 46 | čes
 47 | čín
 48 | čj
 49 | ed
 50 | facs
 51 | fasc
 52 | fol
 53 | fot
 54 | franc
 55 | h.c
 56 | hist
 57 | hl
 58 | hrsg
 59 | ibid
 60 | il
 61 | ind
 62 | inv.č
 63 | jap
 64 | jhdt
 65 | jv
 66 | koed
 67 | kol
 68 | korej
 69 | kl
 70 | krit
 71 | lat
 72 | lit
 73 | m.a
 74 | maď
 75 | mj
 76 | mp
 77 | násl
 78 | např
 79 | nepubl
 80 | něm
 81 | no
 82 | nr
 83 | n.s
 84 | okr
 85 | odd
 86 | odp
 87 | obr
 88 | opr
 89 | orig
 90 | phil
 91 | pl
 92 | pokrač
 93 | pol
 94 | port
 95 | pozn
 96 | př.kr
 97 | př.n.l
 98 | přel
 99 | přeprac
100 | příl
101 | pseud
102 | pt
103 | red
104 | repr
105 | resp
106 | revid
107 | rkp
108 | roč
109 | roz
110 | rozš
111 | samost
112 | sect
113 | sest
114 | seš
115 | sign
116 | sl
117 | srv
118 | stol
119 | sv
120 | šk
121 | šk.ro
122 | špan
123 | tab
124 | t.č
125 | tis
126 | tj
127 | tř
128 | tzv
129 | univ
130 | uspoř
131 | vol
132 | vl.jm
133 | vs
134 | vyd
135 | vyobr
136 | zal
137 | zejm
138 | zkr
139 | zprac
140 | zvl
141 | n.p
142 | např
143 | než
144 | MUDr
145 | abl
146 | absol
147 | adj
148 | adv
149 | ak
150 | ak. sl
151 | akt
152 | alch
153 | amer
154 | anat
155 | angl
156 | anglosas
157 | arab
158 | arch
159 | archit
160 | arg
161 | astr
162 | astrol
163 | att
164 | bás
165 | belg
166 | bibl
167 | biol
168 | boh
169 | bot
170 | bulh
171 | círk
172 | csl
173 | č
174 | čas
175 | čes
176 | dat
177 | děj
178 | dep
179 | dět
180 | dial
181 | dór
182 | dopr
183 | dosl
184 | ekon
185 | epic
186 | etnonym
187 | eufem
188 | f
189 | fam
190 | fem
191 | fil
192 | film
193 | form
194 | fot
195 | fr
196 | fut
197 | fyz
198 | gen
199 | geogr
200 | geol
201 | geom
202 | germ
203 | gram
204 | hebr
205 | herald
206 | hist
207 | hl
208 | hovor
209 | hud
210 | hut
211 | chcsl
212 | chem
213 | ie
214 | imp
215 | impf
216 | ind
217 | indoevr
218 | inf
219 | instr
220 | interj
221 | ión
222 | iron
223 | it
224 | kanad
225 | katalán
226 | klas
227 | kniž
228 | komp
229 | konj
230 |  
231 | konkr
232 | kř
233 | kuch
234 | lat
235 | lék
236 | les
237 | lid
238 | lit
239 | liturg
240 | lok
241 | log
242 | m
243 | mat
244 | meteor
245 | metr
246 | mod
247 | ms
248 | mysl
249 | n
250 | náb
251 | námoř
252 | neklas
253 | něm
254 | nesklon
255 | nom
256 | ob
257 | obch
258 | obyč
259 | ojed
260 | opt
261 | part
262 | pas
263 | pejor
264 | pers
265 | pf
266 | pl
267 | plpf
268 |  
269 | práv
270 | prep
271 | předl
272 | přivl
273 | r
274 | rcsl
275 | refl
276 | reg
277 | rkp
278 | ř
279 | řec
280 | s
281 | samohl
282 | sg
283 | sl
284 | souhl
285 | spec
286 | srov
287 | stfr
288 | střv
289 | stsl
290 | subj
291 | subst
292 | superl
293 | sv
294 | sz
295 | táz
296 | tech
297 | telev
298 | teol
299 | trans
300 | typogr
301 | var
302 | vedl
303 | verb
304 | vl. jm
305 | voj
306 | vok
307 | vůb
308 | vulg
309 | výtv
310 | vztaž
311 | zahr
312 | zájm
313 | zast
314 | zejm
315 |  
316 | zeměd
317 | zkr
318 | zř
319 | mj
320 | dl
321 | atp
322 | sport
323 | Mgr
324 | horn
325 | MVDr
326 | JUDr
327 | RSDr
328 | Bc
329 | PhDr
330 | ThDr
331 | Ing
332 | aj
333 | apod
334 | PharmDr
335 | pomn
336 | ev
337 | slang
338 | nprap
339 | odp
340 | dop
341 | pol
342 | st
343 | stol
344 | p. n. l
345 | před n. l
346 | n. l
347 | př. Kr
348 | po Kr
349 | př. n. l
350 | odd
351 | RNDr
352 | tzv
353 | atd
354 | tzn
355 | resp
356 | tj
357 | p
358 | br
359 | č. j
360 | čj
361 | č. p
362 | čp
363 | a. s
364 | s. r. o
365 | spol. s r. o
366 | p. o
367 | s. p
368 | v. o. s
369 | k. s
370 | o. p. s
371 | o. s
372 | v. r
373 | v z
374 | ml
375 | vč
376 | kr
377 | mld
378 | hod
379 | popř
380 | ap
381 | event
382 | rus
383 | slov
384 | rum
385 | švýc
386 | P. T
387 | zvl
388 | hor
389 | dol
390 | S.O.S


--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.de:
--------------------------------------------------------------------------------
  1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
  2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
  3 | 
  4 | #any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
  5 | #usually upper case letters are initials in a name
  6 | #no german words end in single lower-case letters, so we throw those in too.
  7 | A
  8 | B
  9 | C
 10 | D
 11 | E
 12 | F
 13 | G
 14 | H
 15 | I
 16 | J
 17 | K
 18 | L
 19 | M
 20 | N
 21 | O
 22 | P
 23 | Q
 24 | R
 25 | S
 26 | T
 27 | U
 28 | V
 29 | W
 30 | X
 31 | Y
 32 | Z
 33 | a
 34 | b
 35 | c
 36 | d
 37 | e
 38 | f
 39 | g
 40 | h
 41 | i
 42 | j
 43 | k
 44 | l
 45 | m
 46 | n
 47 | o
 48 | p
 49 | q
 50 | r
 51 | s
 52 | t
 53 | u
 54 | v
 55 | w
 56 | x
 57 | y
 58 | z
 59 | 
 60 | 
 61 | #Roman Numerals. A dot after one of these is not a sentence break in German.
 62 | I
 63 | II
 64 | III
 65 | IV
 66 | V
 67 | VI
 68 | VII
 69 | VIII
 70 | IX
 71 | X
 72 | XI
 73 | XII
 74 | XIII
 75 | XIV
 76 | XV
 77 | XVI
 78 | XVII
 79 | XVIII
 80 | XIX
 81 | XX
 82 | i
 83 | ii
 84 | iii
 85 | iv
 86 | v
 87 | vi
 88 | vii
 89 | viii
 90 | ix
 91 | x
 92 | xi
 93 | xii
 94 | xiii
 95 | xiv
 96 | xv
 97 | xvi
 98 | xvii
 99 | xviii
100 | xix
101 | xx
102 | 
103 | #Titles and Honorifics
104 | Adj
105 | Adm
106 | Adv
107 | Asst
108 | Bart
109 | Bldg
110 | Brig
111 | Bros
112 | Capt
113 | Cmdr
114 | Col
115 | Comdr
116 | Con
117 | Corp
118 | Cpl
119 | DR
120 | Dr
121 | Ens
122 | Gen
123 | Gov
124 | Hon
125 | Hosp
126 | Insp
127 | Lt
128 | MM
129 | MR
130 | MRS
131 | MS
132 | Maj
133 | Messrs
134 | Mlle
135 | Mme
136 | Mr
137 | Mrs
138 | Ms
139 | Msgr
140 | Op
141 | Ord
142 | Pfc
143 | Ph
144 | Prof
145 | Pvt
146 | Rep
147 | Reps
148 | Res
149 | Rev
150 | Rt
151 | Sen
152 | Sens
153 | Sfc
154 | Sgt
155 | Sr
156 | St
157 | Supt
158 | Surg
159 | 
160 | #Misc symbols
161 | Mio
162 | Mrd
163 | bzw
164 | v
165 | vs
166 | usw
167 | d.h
168 | z.B
169 | u.a
170 | etc
171 | Mrd
172 | MwSt
173 | ggf
174 | d.J
175 | D.h
176 | m.E
177 | vgl
178 | I.F
179 | z.T
180 | sogen
181 | ff
182 | u.E
183 | g.U
184 | g.g.A
185 | c.-à-d
186 | Buchst
187 | u.s.w
188 | sog
189 | u.ä
190 | Std
191 | evtl
192 | Zt
193 | Chr
194 | u.U
195 | o.ä
196 | Ltd
197 | b.A
198 | z.Zt
199 | spp
200 | sen
201 | SA
202 | k.o
203 | jun
204 | i.H.v
205 | dgl
206 | dergl
207 | Co
208 | zzt
209 | usf
210 | s.p.a
211 | Dkr
212 | Corp
213 | bzgl
214 | BSE
215 | 
216 | #Number indicators
217 | # add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it
218 | No
219 | Nos
220 | Art
221 | Nr
222 | pp
223 | ca
224 | Ca
225 | 
226 | #Ordinals are done with . in German - "1." = "1st" in English
227 | 1
228 | 2
229 | 3
230 | 4
231 | 5
232 | 6
233 | 7
234 | 8
235 | 9
236 | 10
237 | 11
238 | 12
239 | 13
240 | 14
241 | 15
242 | 16
243 | 17
244 | 18
245 | 19
246 | 20
247 | 21
248 | 22
249 | 23
250 | 24
251 | 25
252 | 26
253 | 27
254 | 28
255 | 29
256 | 30
257 | 31
258 | 32
259 | 33
260 | 34
261 | 35
262 | 36
263 | 37
264 | 38
265 | 39
266 | 40
267 | 41
268 | 42
269 | 43
270 | 44
271 | 45
272 | 46
273 | 47
274 | 48
275 | 49
276 | 50
277 | 51
278 | 52
279 | 53
280 | 54
281 | 55
282 | 56
283 | 57
284 | 58
285 | 59
286 | 60
287 | 61
288 | 62
289 | 63
290 | 64
291 | 65
292 | 66
293 | 67
294 | 68
295 | 69
296 | 70
297 | 71
298 | 72
299 | 73
300 | 74
301 | 75
302 | 76
303 | 77
304 | 78
305 | 79
306 | 80
307 | 81
308 | 82
309 | 83
310 | 84
311 | 85
312 | 86
313 | 87
314 | 88
315 | 89
316 | 90
317 | 91
318 | 92
319 | 93
320 | 94
321 | 95
322 | 96
323 | 97
324 | 98
325 | 99
326 | 


--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.en:
--------------------------------------------------------------------------------
  1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
  2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
  3 | 
  4 | #any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
  5 | #usually upper case letters are initials in a name
  6 | A
  7 | B
  8 | C
  9 | D
 10 | E
 11 | F
 12 | G
 13 | H
 14 | I
 15 | J
 16 | K
 17 | L
 18 | M
 19 | N
 20 | O
 21 | P
 22 | Q
 23 | R
 24 | S
 25 | T
 26 | U
 27 | V
 28 | W
 29 | X
 30 | Y
 31 | Z
 32 | 
 33 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
 34 | Adj
 35 | Adm
 36 | Adv
 37 | Asst
 38 | Bart
 39 | Bldg
 40 | Brig
 41 | Bros
 42 | Capt
 43 | Cmdr
 44 | Col
 45 | Comdr
 46 | Con
 47 | Corp
 48 | Cpl
 49 | DR
 50 | Dr
 51 | Drs
 52 | Ens
 53 | Gen
 54 | Gov
 55 | Hon
 56 | Hr
 57 | Hosp
 58 | Insp
 59 | Lt
 60 | MM
 61 | MR
 62 | MRS
 63 | MS
 64 | Maj
 65 | Messrs
 66 | Mlle
 67 | Mme
 68 | Mr
 69 | Mrs
 70 | Ms
 71 | Msgr
 72 | Op
 73 | Ord
 74 | Pfc
 75 | Ph
 76 | Prof
 77 | Pvt
 78 | Rep
 79 | Reps
 80 | Res
 81 | Rev
 82 | Rt
 83 | Sen
 84 | Sens
 85 | Sfc
 86 | Sgt
 87 | Sr
 88 | St
 89 | Supt
 90 | Surg
 91 | 
 92 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
 93 | v
 94 | vs
 95 | i.e
 96 | rev
 97 | e.g
 98 | 
 99 | #Numbers only. These prefixes are non-breaking only when followed by a numeric sequence
100 | # add #NUMERIC_ONLY# after the word for this function
101 | #This case is mostly for the English "No." which can either be a sentence of its own, or
102 | #if followed by a number, a non-breaking prefix
103 | No #NUMERIC_ONLY# 
104 | Nos
105 | Art #NUMERIC_ONLY#
106 | Nr
107 | pp #NUMERIC_ONLY#
108 | 
109 | #month abbreviations
110 | Jan
111 | Feb
112 | Mar
113 | Apr
114 | #May is a full word
115 | Jun
116 | Jul
117 | Aug
118 | Sep
119 | Oct
120 | Nov
121 | Dec
122 | 


--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.es:
--------------------------------------------------------------------------------
  1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
  2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
  3 | 
  4 | #any single upper case letter  followed by a period is not a sentence ender
  5 | #usually upper case letters are initials in a name
  6 | A
  7 | B
  8 | C
  9 | D
 10 | E
 11 | F
 12 | G
 13 | H
 14 | I
 15 | J
 16 | K
 17 | L
 18 | M
 19 | N
 20 | O
 21 | P
 22 | Q
 23 | R
 24 | S
 25 | T
 26 | U
 27 | V
 28 | W
 29 | X
 30 | Y
 31 | Z
 32 | 
 33 | # Period-final abbreviation list from http://www.ctspanish.com/words/abbreviations.htm
 34 | 
 35 | A.C
 36 | Apdo
 37 | Av
 38 | Bco
 39 | CC.AA
 40 | Da
 41 | Dep
 42 | Dn
 43 | Dr
 44 | Dra
 45 | EE.UU
 46 | Excmo
 47 | FF.CC
 48 | Fil 
 49 | Gral
 50 | J.C
 51 | Let
 52 | Lic
 53 | N.B
 54 | P.D
 55 | P.V.P
 56 | Prof
 57 | Pts
 58 | Rte
 59 | S.A
 60 | S.A.R
 61 | S.E
 62 | S.L
 63 | S.R.C
 64 | Sr
 65 | Sra
 66 | Srta
 67 | Sta
 68 | Sto
 69 | T.V.E
 70 | Tel
 71 | Ud
 72 | Uds
 73 | V.B
 74 | V.E
 75 | Vd
 76 | Vds
 77 | a/c
 78 | adj
 79 | admón
 80 | afmo
 81 | apdo
 82 | av
 83 | c
 84 | c.f
 85 | c.g
 86 | cap
 87 | cm
 88 | cta
 89 | dcha
 90 | doc
 91 | ej
 92 | entlo
 93 | esq
 94 | etc
 95 | f.c
 96 | gr 
 97 | grs
 98 | izq
 99 | kg
100 | km
101 | mg
102 | mm
103 | nÃºm
104 | núm
105 | p
106 | p.a
107 | p.ej
108 | ptas
109 | pÃ¡g 
110 | pÃ¡gs
111 | pág
112 | págs
113 | q.e.g.e
114 | q.e.s.m
115 | s
116 | s.s.s
117 | vid
118 | vol
119 | 


--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.fi:
--------------------------------------------------------------------------------
  1 | #Anything in this file, followed by a period (and an upper-case word), does NOT
  2 | #indicate an end-of-sentence marker.  Special cases are included for prefixes
  3 | #that ONLY appear before 0-9 numbers.
  4 | 
  5 | #This list is compiled from omorfi <http://code.google.com/p/omorfi> database
  6 | #by Tommi A Pirinen.
  7 | 
  8 | 
  9 | #any single upper case letter  followed by a period is not a sentence ender
 10 | A
 11 | B
 12 | C
 13 | D
 14 | E
 15 | F
 16 | G
 17 | H
 18 | I
 19 | J
 20 | K
 21 | L
 22 | M
 23 | N
 24 | O
 25 | P
 26 | Q
 27 | R
 28 | S
 29 | T
 30 | U
 31 | V
 32 | W
 33 | X
 34 | Y
 35 | Z
 36 | Å
 37 | Ä
 38 | Ö
 39 | 
 40 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
 41 | alik
 42 | alil
 43 | amir
 44 | apul
 45 | apul.prof
 46 | arkkit
 47 | ass
 48 | assist
 49 | dipl
 50 | dipl.arkkit
 51 | dipl.ekon
 52 | dipl.ins
 53 | dipl.kielenk
 54 | dipl.kirjeenv
 55 | dipl.kosm
 56 | dipl.urk
 57 | dos
 58 | erikoiseläinl
 59 | erikoishammasl
 60 | erikoisl
 61 | erikoist
 62 | ev.luutn
 63 | evp
 64 | fil
 65 | ft
 66 | hallinton
 67 | hallintot
 68 | hammaslääket
 69 | jatk
 70 | jääk
 71 | kansaned
 72 | kapt
 73 | kapt.luutn
 74 | kenr
 75 | kenr.luutn
 76 | kenr.maj
 77 | kers
 78 | kirjeenv
 79 | kom
 80 | kom.kapt
 81 | komm
 82 | konst
 83 | korpr
 84 | luutn
 85 | maist
 86 | maj
 87 | Mr
 88 | Mrs
 89 | Ms
 90 | M.Sc
 91 | neuv
 92 | nimim
 93 | Ph.D
 94 | prof
 95 | puh.joht
 96 | pääll
 97 | res
 98 | san
 99 | siht
100 | suom
101 | sähköp
102 | säv
103 | toht
104 | toim
105 | toim.apul
106 | toim.joht
107 | toim.siht
108 | tuom
109 | ups
110 | vänr
111 | vääp
112 | ye.ups
113 | ylik
114 | ylil
115 | ylim
116 | ylimatr
117 | yliop
118 | yliopp
119 | ylip
120 | yliv
121 | 
122 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall
123 | #into this category - it sometimes ends a sentence)
124 | e.g
125 | ent
126 | esim
127 | huom
128 | i.e
129 | ilm
130 | l
131 | mm
132 | myöh
133 | nk
134 | nyk
135 | par
136 | po
137 | t
138 | v
139 | 


--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.fr:
--------------------------------------------------------------------------------
  1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
  2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
  3 | #
  4 | #any single upper case letter  followed by a period is not a sentence ender
  5 | #usually upper case letters are initials in a name
  6 | #no French words end in single lower-case letters, so we throw those in too?
  7 | A
  8 | B
  9 | C
 10 | D
 11 | E
 12 | F
 13 | G
 14 | H
 15 | I
 16 | J
 17 | K
 18 | L
 19 | M
 20 | N
 21 | O
 22 | P
 23 | Q
 24 | R
 25 | S
 26 | T
 27 | U
 28 | V
 29 | W
 30 | X
 31 | Y
 32 | Z
 33 | #a
 34 | b
 35 | c
 36 | d
 37 | e
 38 | f
 39 | g
 40 | h
 41 | i
 42 | j
 43 | k
 44 | l
 45 | m
 46 | n
 47 | o
 48 | p
 49 | q
 50 | r
 51 | s
 52 | t
 53 | u
 54 | v
 55 | w
 56 | x
 57 | y
 58 | z
 59 | 
 60 | # Period-final abbreviation list for French
 61 | A.C.N
 62 | A.M
 63 | art
 64 | ann
 65 | apr
 66 | av
 67 | auj
 68 | lib
 69 | B.P
 70 | boul
 71 | ca
 72 | c.-à-d
 73 | cf
 74 | ch.-l
 75 | chap
 76 | contr
 77 | C.P.I
 78 | C.Q.F.D
 79 | C.N
 80 | C.N.S
 81 | C.S
 82 | dir
 83 | éd
 84 | e.g
 85 | env
 86 | al
 87 | etc
 88 | E.V
 89 | ex
 90 | fasc
 91 | fém
 92 | fig
 93 | fr
 94 | hab
 95 | ibid
 96 | id
 97 | i.e
 98 | inf
 99 | LL.AA
100 | LL.AA.II
101 | LL.AA.RR
102 | LL.AA.SS
103 | L.D
104 | LL.EE
105 | LL.MM
106 | LL.MM.II.RR
107 | loc.cit
108 | masc
109 | MM
110 | ms
111 | N.B
112 | N.D.A
113 | N.D.L.R
114 | N.D.T
115 | n/réf
116 | NN.SS
117 | N.S
118 | N.D
119 | N.P.A.I
120 | p.c.c
121 | pl
122 | pp
123 | p.ex
124 | p.j
125 | P.S
126 | R.A.S
127 | R.-V
128 | R.P
129 | R.I.P
130 | SS
131 | S.S
132 | S.A
133 | S.A.I
134 | S.A.R
135 | S.A.S
136 | S.E
137 | sec
138 | sect
139 | sing
140 | S.M
141 | S.M.I.R
142 | sq
143 | sqq
144 | suiv
145 | sup
146 | suppl
147 | tél
148 | T.S.V.P
149 | vb
150 | vol
151 | vs
152 | X.O
153 | Z.I
154 | 


--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.ga:
--------------------------------------------------------------------------------
 1 | 
 2 | A
 3 | B
 4 | C
 5 | D
 6 | E
 7 | F
 8 | G
 9 | H
10 | I
11 | J
12 | K
13 | L
14 | M
15 | N
16 | O
17 | P
18 | Q
19 | R
20 | S
21 | T
22 | U
23 | V
24 | W
25 | X
26 | Y
27 | Z
28 | Á
29 | É
30 | Í
31 | Ó
32 | Ú
33 | 
34 | Uacht
35 | Dr
36 | B.Arch
37 | 
38 | m.sh
39 | .i
40 | Co
41 | Cf
42 | cf
43 | i.e
44 | r
45 | Chr
46 | lch #NUMERIC_ONLY#
47 | lgh #NUMERIC_ONLY#
48 | uimh #NUMERIC_ONLY#
49 | 


--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.hu:
--------------------------------------------------------------------------------
  1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
  2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
  3 | 
  4 | #any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
  5 | #usually upper case letters are initials in a name
  6 | A
  7 | B
  8 | C
  9 | D
 10 | E
 11 | F
 12 | G
 13 | H
 14 | I
 15 | J
 16 | K
 17 | L
 18 | M
 19 | N
 20 | O
 21 | P
 22 | Q
 23 | R
 24 | S
 25 | T
 26 | U
 27 | V
 28 | W
 29 | X
 30 | Y
 31 | Z
 32 | Á
 33 | É
 34 | Í
 35 | Ó
 36 | Ö
 37 | Ő
 38 | Ú
 39 | Ü
 40 | Ű
 41 | 
 42 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
 43 | Dr
 44 | dr
 45 | kb
 46 | Kb
 47 | vö
 48 | Vö
 49 | pl
 50 | Pl
 51 | ca
 52 | Ca
 53 | min
 54 | Min
 55 | max
 56 | Max
 57 | ún
 58 | Ún
 59 | prof
 60 | Prof
 61 | de
 62 | De
 63 | du
 64 | Du
 65 | Szt
 66 | St
 67 | 
 68 | #Numbers only. These prefixes are non-breaking only when followed by a numeric sequence
 69 | # add #NUMERIC_ONLY# after the word for this function
 70 | #This case is mostly for the English "No." which can either be a sentence of its own, or
 71 | #if followed by a number, a non-breaking prefix
 72 | 
 73 | # Month name abbreviations
 74 | jan #NUMERIC_ONLY#
 75 | Jan #NUMERIC_ONLY#
 76 | Feb #NUMERIC_ONLY#
 77 | feb #NUMERIC_ONLY#
 78 | márc #NUMERIC_ONLY#
 79 | Márc #NUMERIC_ONLY#
 80 | ápr #NUMERIC_ONLY#
 81 | Ápr #NUMERIC_ONLY#
 82 | máj #NUMERIC_ONLY#
 83 | Máj #NUMERIC_ONLY#
 84 | jún #NUMERIC_ONLY#
 85 | Jún #NUMERIC_ONLY#
 86 | Júl #NUMERIC_ONLY#
 87 | júl #NUMERIC_ONLY#
 88 | aug #NUMERIC_ONLY#
 89 | Aug #NUMERIC_ONLY#
 90 | Szept #NUMERIC_ONLY#
 91 | szept #NUMERIC_ONLY#
 92 | okt #NUMERIC_ONLY#
 93 | Okt #NUMERIC_ONLY#
 94 | nov #NUMERIC_ONLY#
 95 | Nov #NUMERIC_ONLY#
 96 | dec #NUMERIC_ONLY#
 97 | Dec #NUMERIC_ONLY#
 98 | 
 99 | # Other abbreviations
100 | tel #NUMERIC_ONLY#
101 | Tel #NUMERIC_ONLY#
102 | Fax #NUMERIC_ONLY#
103 | fax #NUMERIC_ONLY#
104 | 


--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.is:
--------------------------------------------------------------------------------
  1 | no #NUMERIC_ONLY#
  2 | No #NUMERIC_ONLY#
  3 | nr #NUMERIC_ONLY#
  4 | Nr #NUMERIC_ONLY#
  5 | nR #NUMERIC_ONLY#
  6 | NR #NUMERIC_ONLY#
  7 | a
  8 | b
  9 | c
 10 | d
 11 | e
 12 | f
 13 | g
 14 | h
 15 | i
 16 | j
 17 | k
 18 | l
 19 | m
 20 | n
 21 | o
 22 | p
 23 | q
 24 | r
 25 | s
 26 | t
 27 | u
 28 | v
 29 | w
 30 | x
 31 | y
 32 | z
 33 | ^
 34 | í
 35 | á
 36 | ó
 37 | æ
 38 | A
 39 | B
 40 | C
 41 | D
 42 | E
 43 | F
 44 | G
 45 | H
 46 | I
 47 | J
 48 | K
 49 | L
 50 | M
 51 | N
 52 | O
 53 | P
 54 | Q
 55 | R
 56 | S
 57 | T
 58 | U
 59 | V
 60 | W
 61 | X
 62 | Y
 63 | Z
 64 | ab.fn
 65 | a.fn
 66 | afs
 67 | al
 68 | alm
 69 | alg
 70 | andh
 71 | ath
 72 | aths
 73 | atr
 74 | ao
 75 | au
 76 | aukaf
 77 | áfn
 78 | áhrl.s
 79 | áhrs
 80 | ákv.gr
 81 | ákv
 82 | bh
 83 | bls
 84 | dr
 85 | e.Kr
 86 | et
 87 | ef
 88 | efn
 89 | ennfr
 90 | eink
 91 | end
 92 | e.st
 93 | erl
 94 | fél
 95 | fskj
 96 | fh
 97 | f.hl
 98 | físl
 99 | fl
100 | fn
101 | fo
102 | forl
103 | frb
104 | frl
105 | frh
106 | frt
107 | fsl
108 | fsh
109 | fs
110 | fsk
111 | fst
112 | f.Kr
113 | ft
114 | fv
115 | fyrrn
116 | fyrrv
117 | germ
118 | gm
119 | gr
120 | hdl
121 | hdr
122 | hf
123 | hl
124 | hlsk
125 | hljsk
126 | hljv
127 | hljóðv
128 | hr
129 | hv
130 | hvk
131 | holl
132 | Hos
133 | höf
134 | hk
135 | hrl
136 | ísl
137 | kaf
138 | kap
139 | Khöfn
140 | kk
141 | kg
142 | kk
143 | km
144 | kl
145 | klst
146 | kr
147 | kt
148 | kgúrsk
149 | kvk
150 | leturbr
151 | lh
152 | lh.nt
153 | lh.þt
154 | lo
155 | ltr
156 | mlja
157 | mljó
158 | millj
159 | mm
160 | mms
161 | m.fl
162 | miðm
163 | mgr
164 | mst
165 | mín
166 | nf
167 | nh
168 | nhm
169 | nl
170 | nk
171 | nmgr
172 | no
173 | núv
174 | nt
175 | o.áfr
176 | o.m.fl
177 | ohf
178 | o.fl
179 | o.s.frv
180 | ófn
181 | ób
182 | óákv.gr
183 | óákv
184 | pfn
185 | PR
186 | pr
187 | Ritstj
188 | Rvík
189 | Rvk
190 | samb
191 | samhlj
192 | samn
193 | samn
194 | sbr
195 | sek
196 | sérn
197 | sf
198 | sfn
199 | sh
200 | sfn
201 | sh
202 | s.hl
203 | sk
204 | skv
205 | sl
206 | sn
207 | so
208 | ss.us
209 | s.st
210 | samþ
211 | sbr
212 | shlj
213 | sign
214 | skál
215 | st
216 | st.s
217 | stk
218 | sþ
219 | teg
220 | tbl
221 | tfn
222 | tl
223 | tvíhlj
224 | tvt
225 | till
226 | to
227 | umr
228 | uh
229 | us
230 | uppl
231 | útg
232 | vb
233 | Vf
234 | vh
235 | vkf
236 | Vl
237 | vl
238 | vlf
239 | vmf
240 | 8vo
241 | vsk
242 | vth
243 | þt
244 | þf
245 | þjs
246 | þgf
247 | þlt
248 | þolm
249 | þm
250 | þml
251 | þýð
252 | 


--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.it:
--------------------------------------------------------------------------------
  1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
  2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
  3 | 
  4 | #any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
  5 | #usually upper case letters are initials in a name
  6 | A
  7 | B
  8 | C
  9 | D
 10 | E
 11 | F
 12 | G
 13 | H
 14 | I
 15 | J
 16 | K
 17 | L
 18 | M
 19 | N
 20 | O
 21 | P
 22 | Q
 23 | R
 24 | S
 25 | T
 26 | U
 27 | V
 28 | W
 29 | X
 30 | Y
 31 | Z
 32 | 
 33 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
 34 | Adj
 35 | Adm
 36 | Adv
 37 | Amn 
 38 | Arch 
 39 | Asst
 40 | Avv
 41 | Bart
 42 | Bcc
 43 | Bldg
 44 | Brig
 45 | Bros
 46 | C.A.P
 47 | C.P
 48 | Capt
 49 | Cc
 50 | Cmdr
 51 | Co
 52 | Col
 53 | Comdr
 54 | Con
 55 | Corp
 56 | Cpl
 57 | DR
 58 | Dott
 59 | Dr
 60 | Drs
 61 | Egr
 62 | Ens
 63 | Gen
 64 | Geom
 65 | Gov
 66 | Hon
 67 | Hosp
 68 | Hr
 69 | Id
 70 | Ing
 71 | Insp
 72 | Lt
 73 | MM
 74 | MR
 75 | MRS
 76 | MS
 77 | Maj
 78 | Messrs
 79 | Mlle
 80 | Mme
 81 | Mo
 82 | Mons
 83 | Mr
 84 | Mrs
 85 | Ms
 86 | Msgr
 87 | N.B
 88 | Op
 89 | Ord
 90 | P.S
 91 | P.T
 92 | Pfc
 93 | Ph
 94 | Prof
 95 | Pvt
 96 | RP
 97 | RSVP
 98 | Rag
 99 | Rep
100 | Reps
101 | Res
102 | Rev
103 | Rif
104 | Rt
105 | S.A
106 | S.B.F
107 | S.P.M
108 | S.p.A
109 | S.r.l
110 | Sen
111 | Sens
112 | Sfc
113 | Sgt
114 | Sig
115 | Sigg
116 | Soc
117 | Spett
118 | Sr
119 | St
120 | Supt
121 | Surg
122 | V.P
123 | 
124 | # other
125 | a.c 
126 | acc
127 | all 
128 | banc
129 | c.a
130 | c.c.p
131 | c.m
132 | c.p
133 | c.s
134 | c.v
135 | corr
136 | dott
137 | e.p.c
138 | ecc
139 | es 
140 | fatt
141 | gg
142 | int
143 | lett
144 | ogg
145 | on
146 | p.c
147 | p.c.c
148 | p.es
149 | p.f
150 | p.r
151 | p.v
152 | post
153 | pp
154 | racc
155 | ric
156 | s.n.c
157 | seg
158 | sgg
159 | ss
160 | tel
161 | u.s
162 | v.r
163 | v.s
164 | 
165 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
166 | v
167 | vs
168 | i.e
169 | rev
170 | e.g
171 | 
172 | #Numbers only. These are treated as non-breaking prefixes only when followed by a numeric sequence
173 | # add NUMERIC_ONLY after the word for this function
174 | #This case is mostly for the english "No." which can either be a sentence of its own, or
175 | #if followed by a number, a non-breaking prefix
176 | No #NUMERIC_ONLY# 
177 | Nos
178 | Art #NUMERIC_ONLY#
179 | Nr
180 | pp #NUMERIC_ONLY#
181 | 


--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.lt:
--------------------------------------------------------------------------------
  1 | # Anything in this file, followed by a period (and an upper-case word),
  2 | # does NOT indicate an end-of-sentence marker.
  3 | # Special cases are included for prefixes that ONLY appear before 0-9 numbers.
  4 | 
  5 | # Any single upper case letter  followed by a period is not a sentence ender
  6 | # (excluding I occasionally, but we leave it in)
  7 | # usually upper case letters are initials in a name
  8 | A
  9 | Ā
 10 | B
 11 | C
 12 | Č
 13 | D
 14 | E
 15 | Ē
 16 | F
 17 | G
 18 | Ģ
 19 | H
 20 | I
 21 | Ī
 22 | J
 23 | K
 24 | Ķ
 25 | L
 26 | Ļ
 27 | M
 28 | N
 29 | Ņ
 30 | O
 31 | P
 32 | Q
 33 | R
 34 | S
 35 | Š
 36 | T
 37 | U
 38 | Ū
 39 | V
 40 | W
 41 | X
 42 | Y
 43 | Z
 44 | Ž
 45 | 
 46 | # Initials -- Džonas
 47 | Dz
 48 | Dž
 49 | Just
 50 | 
 51 | # Day and month abbreviations
 52 | # m. menesis d. diena  g. gimes
 53 | m
 54 | mėn
 55 | d
 56 | g
 57 | gim
 58 | # Pirmadienis Penktadienis
 59 | Pr
 60 | Pn
 61 | Pirm
 62 | Antr
 63 | Treč
 64 | Ketv
 65 | Penkt
 66 | Šešt
 67 | Sekm
 68 | Saus
 69 | Vas
 70 | Kov
 71 | Bal
 72 | Geg
 73 | Birž
 74 | Liep
 75 | Rugpj
 76 | Rugs
 77 | Spal
 78 | Lapkr
 79 | Gruod
 80 | 
 81 | # Business, governmental, geographical terms
 82 | a
 83 | # aikštė
 84 | adv
 85 | # advokatas
 86 | akad
 87 | # akademikas
 88 | aklg
 89 | # akligatvis
 90 | akt
 91 | # aktorius
 92 | al
 93 | # alėja
 94 | A.V
 95 | # antspaudo vieta
 96 | aps
 97 | apskr
 98 | # apskritis
 99 | apyg
100 | # apygarda
101 | aps
102 | apskr
103 | # apskritis
104 | asist
105 | # asistentas
106 | asmv
107 | avd
108 | # asmenvardis
109 | a.k
110 | asm
111 | asm.k
112 | # asmens kodas
113 | atsak
114 | # atsakingasis
115 | atsisk
116 | sąsk
117 | # atsiskaitomoji sąskaita
118 | aut
119 | # autorius
120 | b
121 | k
122 | b.k
123 | # banko kodas
124 | bkl
125 | # bakalauras
126 | bt
127 | # butas
128 | buv
129 | # buvęs, -usi
130 | dail
131 | # dailininkas
132 | dek
133 | # dekanas
134 | dėst
135 | # dėstytojas
136 | dir
137 | # direktorius
138 | dirig
139 | # dirigentas
140 | doc
141 | # docentas
142 | drp
143 | # durpynas
144 | dš
145 | # dešinysis
146 | egz
147 | # egzempliorius
148 | eil
149 | # eilutė
150 | ekon
151 | # ekonomika
152 | el
153 | # elektroninis
154 | etc
155 | ež
156 | # ežeras
157 | faks
158 | # faksas
159 | fak
160 | # fakultetas
161 | gen
162 | # generolas
163 | gyd
164 | # gydytojas
165 | gv
166 | # gyvenvietė
167 | įl
168 | # įlanka
169 | Įn
170 | # įnagininkas
171 | insp
172 | # inspektorius
173 | pan
174 | # ir panašiai
175 | t.t
176 | # ir taip toliau
177 | k.a
178 | # kaip antai
179 | kand
180 | # kandidatas
181 | kat
182 | # katedra
183 | kyš
184 | # kyšulys
185 | kl
186 | # klasė
187 | kln
188 | # kalnas
189 | kn
190 | # knyga
191 | koresp
192 | # korespondentas
193 | kpt
194 | # kapitonas
195 | kr
196 | # kairysis
197 | kt
198 | # kitas
199 | kun
200 | # kunigas
201 | l
202 | e
203 | p
204 | l.e.p
205 | # laikinai einantis pareigas
206 | ltn
207 | # leitenantas
208 | m
209 | mst
210 | # miestas
211 | m.e
212 | # mūsų eros
213 | m.m
214 | # mokslo metai
215 | mot
216 | # moteris
217 | mstl
218 | # miestelis
219 | mgr
220 | # magistras
221 | mgnt
222 | # magistrantas
223 | mjr
224 | # majoras
225 | mln
226 | # milijonas
227 | mlrd
228 | # milijardas
229 | mok
230 | # mokinys
231 | mokyt
232 | # mokytojas
233 | moksl
234 | # mokslinis
235 | nkt
236 | # nekaitomas
237 | ntk
238 | # neteiktinas
239 | Nr
240 | nr
241 | # numeris
242 | p
243 | # ponas
244 | p.d
245 | a.d
246 | # pašto dėžutė, abonentinė dėžutė
247 | p.m.e
248 | # prieš mūsų erą
249 | pan
250 | # ir panašiai
251 | pav
252 | # paveikslas
253 | pavad
254 | # pavaduotojas
255 | pirm
256 | # pirmininkas
257 | pl
258 | # plentas
259 | plg
260 | # palygink
261 | plk
262 | # pulkininkas; pelkė
263 | pr
264 | # prospektas
265 | Kr
266 | pr.Kr
267 | # prieš Kristų
268 | prok
269 | # prokuroras
270 | prot
271 | # protokolas
272 | pss
273 | # pusiasalis
274 | pšt
275 | # paštas
276 | pvz
277 | # pavyzdžiui
278 | r
279 | # rajonas
280 | red
281 | # redaktorius
282 | rš
283 | # raštų kalbos
284 | sąs
285 | # sąsiuvinis
286 | saviv
287 | sav
288 | # savivaldybė
289 | sekr
290 | # sekretorius
291 | sen
292 | # seniūnija, seniūnas
293 | sk
294 | # skaityk; skyrius
295 | skg
296 | # skersgatvis
297 | skyr
298 | sk
299 | # skyrius
300 | skv
301 | # skveras
302 | sp
303 | # spauda; spaustuvė
304 | spec
305 | # specialistas
306 | sr
307 | # sritis
308 | st
309 | # stotis
310 | str
311 | # straipsnis
312 | stud
313 | # studentas
314 | š
315 | š.m
316 | # šių metų
317 | šnek
318 | # šnekamosios
319 | tir
320 | # tiražas
321 | tūkst
322 | # tūkstantis
323 | up
324 | # upė
325 | upl
326 | # upelis
327 | vad
328 | # vadinamasis, -oji
329 | vlsč
330 | # valsčius
331 | ved
332 | # vedėjas
333 | vet
334 | # veterinarija
335 | virš
336 | # viršininkas, viršaitis
337 | vyr
338 | # vyriausiasis, -ioji; vyras
339 | vyresn
340 | # vyresnysis
341 | vlsč
342 | # valsčius
343 | vs
344 | # viensėdis
345 | Vt
346 | vt
347 | # vietininkas
348 | vtv
349 | vv
350 | # vietovardis
351 | žml
352 | # žemėlapis
353 | 
354 | # Technical terms, abbreviations used in guidebooks, advertisements, etc.
355 | # Generally lower-case.
356 | air
357 | # airiškai
358 | amer
359 | # amerikanizmas
360 | anat
361 | # anatomija
362 | angl
363 | # angl. angliskai
364 | arab
365 | # arabų
366 | archeol
367 | archit
368 | asm
369 | # asmuo
370 | astr
371 | # astronomija
372 | austral
373 | # australiškai
374 | aut
375 | # automobilis
376 | av
377 | # aviacija
378 | bažn
379 | bdv
380 | # būdvardis
381 | bibl
382 | # Biblija
383 | biol
384 | # biologija
385 | bot
386 | # botanika
387 | brt
388 | # burtai, burtažodis.
389 | brus
390 | # baltarusių
391 | buh
392 | # buhalterija
393 | chem
394 | # chemija
395 | col
396 | # collectivum
397 | con
398 | conj
399 | # conjunctivus, jungtukas
400 | dab
401 | # dab. dabartine
402 | dgs
403 | # daugiskaita
404 | dial
405 | # dialektizmas
406 | dipl
407 | dktv
408 | # daiktavardis
409 | džn
410 | # dažnai
411 | ekon
412 | el
413 | # elektra
414 | esam
415 | # esamasis laikas
416 | euf
417 | # eufemizmas
418 | fam
419 | # familiariai
420 | farm
421 | # farmacija
422 | filol
423 | # filologija
424 | filos
425 | # filosofija
426 | fin
427 | # finansai
428 | fiz
429 | # fizika
430 | fiziol
431 | # fiziologija
432 | flk
433 | # folkloras
434 | fon
435 | # fonetika
436 | fot
437 | # fotografija
438 | geod
439 | # geodezija
440 | geogr
441 | geol
442 | # geologija
443 | geom
444 | # geometrija
445 | glžk
446 | gr
447 | # graikų
448 | gram
449 | her
450 | # heraldika
451 | hidr
452 | # hidrotechnika
453 | ind
454 | # Indų
455 | iron
456 | # ironiškai
457 | isp
458 | # ispanų
459 | ist
460 | istor
461 | # istorija
462 | it
463 | # italų
464 | įv
465 | reikšm
466 | įv.reikšm
467 | # įvairiomis reikšmėmis
468 | jap
469 | # japonų
470 | juok
471 | # juokaujamai
472 | jūr
473 | # jūrininkystė
474 | kalb
475 | # kalbotyra
476 | kar
477 | # karyba
478 | kas
479 | # kasyba
480 | kin
481 | # kinematografija
482 | klaus
483 | # klausiamasis
484 | knyg
485 | # knyginis
486 | kom
487 | # komercija
488 | komp
489 | # kompiuteris
490 | kosm
491 | # kosmonautika
492 | kt
493 | # kitas
494 | kul
495 | # kulinarija
496 | kuop
497 | # kuopine
498 | l
499 | # laikas
500 | lit
501 | # literatūrinis
502 | lingv
503 | # lingvistika
504 | log
505 | # logika
506 | lot
507 | # lotynų
508 | mat
509 | # matematika
510 | maž
511 | # mažybinis
512 | med
513 | # medicina
514 | medž
515 | # medžioklė
516 | men
517 | # menas
518 | menk
519 | # menkinamai
520 | metal
521 | # metalurgija
522 | meteor
523 | min
524 | # mineralogija
525 | mit
526 | # mitologija
527 | mok
528 | # mokyklinis
529 | ms
530 | # mįslė
531 | muz
532 | # muzikinis
533 | n
534 | # naujasis
535 | neig
536 | # neigiamasis
537 | neol
538 | # neologizmas
539 | niek
540 | # niekinamai
541 | ofic
542 | # oficialus
543 | opt
544 | # optika
545 | orig
546 | # original
547 | p
548 | # pietūs
549 | pan
550 | # panašiai
551 | parl
552 | # parlamentas
553 | pat
554 | # patarlė
555 | paž
556 | # pažodžiui
557 | plg
558 | # palygink
559 | poet
560 | # poetizmas
561 | poez
562 | #  poezija
563 | poligr
564 | # poligrafija
565 | polit
566 | # politika
567 | ppr
568 | # paprastai
569 | pranc
570 | pr
571 | # prancūzų, prūsų
572 | priet
573 | # prietaras
574 | prek
575 | # prekyba
576 | prk
577 | # perkeltine
578 | prs
579 | # persona, asmuo
580 | psn
581 | # pasenęs žodis
582 | psich
583 | # psichologija
584 | pvz
585 | # pavyzdžiui
586 | r
587 | # rytai
588 | rad
589 | # radiotechnika
590 | rel
591 | # religija
592 | ret
593 | # retai
594 | rus
595 | # rusų
596 | sen
597 | # senasis
598 | sl
599 | # slengas, slavų
600 | sov
601 | # sovietinis
602 | spec
603 | # specialus
604 | sport
605 | stat
606 | # statyba
607 | sudurt
608 | # sudurtinis
609 | sutr
610 | # sutrumpintas
611 | suv
612 | # suvalkiečių
613 | š
614 | # šiaurė
615 | šach
616 | # šachmatai
617 | šiaur
618 | škot
619 | # škotiškai
620 | šnek
621 | # šnekamoji
622 | teatr
623 | tech
624 | techn
625 | # technika
626 | teig
627 | # teigiamas
628 | teis
629 | # teisė
630 | tekst
631 | # tekstilė
632 | tel
633 | # telefonas
634 | teol
635 | # teologija
636 | v
637 | # tik vyriškosios, vakarai
638 | t.p
639 | t
640 | p
641 | # ir taip pat
642 | t.t
643 | # ir taip toliau
644 | t.y
645 | # tai yra
646 | vaik
647 | # vaikų
648 | vart
649 | # vartojama
650 | vet
651 | # veterinarija
652 | vid
653 | # vidurinis
654 | vksm
655 | # veiksmažodis
656 | vns
657 | # vienaskaita
658 | vok
659 | # vokiečių
660 | vulg
661 | # vulgariai
662 | zool
663 | # zoologija
664 | žr
665 | # žiūrėk
666 | ž.ū
667 | ž
668 | ū
669 | # žemės ūkis
670 | 
671 | # List of titles. These are often followed by upper-case names, but do
672 | # not indicate sentence breaks
673 | #
674 | # Jo Eminencija
675 | Em.
676 | # Gerbiamasis
677 | Gerb
678 | gerb
679 | #  malonus
680 | malon
681 | # profesorius
682 | Prof
683 | prof
684 | # daktaras (mokslų)
685 | Dr
686 | dr
687 | habil
688 | med
689 | # inž inžinierius
690 | inž
691 | Inž
692 | 
693 | 
694 | #Numbers only. These are treated as non-breaking prefixes only when followed by a numeric sequence
695 | # add NUMERIC_ONLY after the word for this function
696 | #This case is mostly for the english "No." which can either be a sentence of its own, or
697 | #if followed by a number, a non-breaking prefix
698 | No #NUMERIC_ONLY#
699 | 


--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.lv:
--------------------------------------------------------------------------------
  1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
  2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
  3 | 
  4 | #any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
  5 | #usually upper case letters are initials in a name
  6 | A
  7 | Ā
  8 | B
  9 | C
 10 | Č
 11 | D
 12 | E
 13 | Ē
 14 | F
 15 | G
 16 | Ģ
 17 | H
 18 | I
 19 | Ī
 20 | J
 21 | K
 22 | Ķ
 23 | L
 24 | Ļ
 25 | M
 26 | N
 27 | Ņ
 28 | O
 29 | P
 30 | Q
 31 | R
 32 | S
 33 | Š
 34 | T
 35 | U
 36 | Ū
 37 | V
 38 | W
 39 | X
 40 | Y
 41 | Z
 42 | Ž
 43 | 
 44 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
 45 | dr
 46 | Dr
 47 | med
 48 | prof
 49 | Prof
 50 | inž
 51 | Inž
 52 | ist.loc
 53 | Ist.loc
 54 | kor.loc
 55 | Kor.loc
 56 | v.i
 57 | vietn
 58 | Vietn
 59 | 
 60 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
 61 | a.l
 62 | t.p
 63 | pārb
 64 | Pārb
 65 | vec
 66 | Vec
 67 | inv
 68 | Inv
 69 | sk
 70 | Sk
 71 | spec
 72 | Spec
 73 | vienk
 74 | Vienk
 75 | virz
 76 | Virz
 77 | māksl
 78 | Māksl
 79 | mūz
 80 | Mūz
 81 | akad
 82 | Akad
 83 | soc
 84 | Soc
 85 | galv
 86 | Galv
 87 | vad
 88 | Vad
 89 | sertif
 90 | Sertif
 91 | folkl
 92 | Folkl
 93 | hum
 94 | Hum
 95 | 
 96 | #Numbers only. These are treated as non-breaking prefixes only when followed by a numeric sequence
 97 | # add NUMERIC_ONLY after the word for this function
 98 | #This case is mostly for the english "No." which can either be a sentence of its own, or
 99 | #if followed by a number, a non-breaking prefix
100 | Nr #NUMERIC_ONLY# 
101 | 


--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.nl:
--------------------------------------------------------------------------------
  1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
  2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
  3 | #Sources: http://nl.wikipedia.org/wiki/Lijst_van_afkortingen 
  4 | #         http://nl.wikipedia.org/wiki/Aanspreekvorm
  5 | #         http://nl.wikipedia.org/wiki/Titulatuur_in_het_Nederlands_hoger_onderwijs
  6 | #any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
  7 | #usually upper case letters are initials in a name
  8 | A
  9 | B
 10 | C
 11 | D
 12 | E
 13 | F
 14 | G
 15 | H
 16 | I
 17 | J
 18 | K
 19 | L
 20 | M
 21 | N
 22 | O
 23 | P
 24 | Q
 25 | R
 26 | S
 27 | T
 28 | U
 29 | V
 30 | W
 31 | X
 32 | Y
 33 | Z
 34 | 
 35 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
 36 | bacc
 37 | bc
 38 | bgen
 39 | c.i
 40 | dhr
 41 | dr
 42 | dr.h.c
 43 | drs
 44 | drs
 45 | ds
 46 | eint
 47 | fa
 48 | Fa
 49 | fam
 50 | gen
 51 | genm
 52 | ing
 53 | ir
 54 | jhr
 55 | jkvr
 56 | jr
 57 | kand
 58 | kol
 59 | lgen
 60 | lkol
 61 | Lt
 62 | maj
 63 | Mej
 64 | mevr
 65 | Mme
 66 | mr
 67 | mr
 68 | Mw
 69 | o.b.s
 70 | plv
 71 | prof
 72 | ritm
 73 | tint
 74 | Vz
 75 | Z.D
 76 | Z.D.H
 77 | Z.E
 78 | Z.Em
 79 | Z.H
 80 | Z.K.H
 81 | Z.K.M
 82 | Z.M
 83 | z.v
 84 | 
 85 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
 86 | #we seem to have a lot of these in Dutch, e.g. i.p.v - in plaats van (instead of) - which never ends a sentence
 87 | a.g.v
 88 | bijv
 89 | bijz
 90 | bv
 91 | d.w.z
 92 | e.c
 93 | e.g
 94 | e.k
 95 | ev
 96 | i.p.v
 97 | i.s.m
 98 | i.t.t
 99 | i.v.m
100 | m.a.w
101 | m.b.t
102 | m.b.v
103 | m.h.o
104 | m.i
105 | m.i.v
106 | v.w.t
107 | 
108 | #Numbers only. These are treated as non-breaking prefixes only when followed by a numeric sequence
109 | # add NUMERIC_ONLY after the word for this function
110 | #This case is mostly for the english "No." which can either be a sentence of its own, or
111 | #if followed by a number, a non-breaking prefix
112 | Nr #NUMERIC_ONLY# 
113 | Nrs 
114 | nrs
115 | nr #NUMERIC_ONLY#
116 | 


--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.pl:
--------------------------------------------------------------------------------
  1 | adw
  2 | afr
  3 | akad
  4 | al
  5 | Al
  6 | am
  7 | amer
  8 | arch
  9 | art
 10 | Art
 11 | artyst
 12 | astr
 13 | austr
 14 | bałt
 15 | bdb
 16 | bł
 17 | bm
 18 | br
 19 | bryg
 20 | bryt
 21 | centr
 22 | ces
 23 | chem
 24 | chiń
 25 | chir
 26 | c.k
 27 | c.o
 28 | cyg
 29 | cyw
 30 | cyt
 31 | czes
 32 | czw
 33 | cd
 34 | Cd
 35 | czyt
 36 | ćw
 37 | ćwicz
 38 | daw
 39 | dcn
 40 | dekl
 41 | demokr
 42 | det
 43 | diec
 44 | dł
 45 | dn
 46 | dot
 47 | dol
 48 | dop
 49 | dost
 50 | dosł
 51 | h.c
 52 | ds
 53 | dst
 54 | duszp
 55 | dypl
 56 | egz
 57 | ekol
 58 | ekon
 59 | elektr
 60 | em
 61 | ew
 62 | fab
 63 | farm
 64 | fot
 65 | fr
 66 | gat
 67 | gastr
 68 | geogr
 69 | geol
 70 | gimn
 71 | głęb
 72 | gm
 73 | godz
 74 | górn
 75 | gosp
 76 | gr
 77 | gram
 78 | hist
 79 | hiszp
 80 | hr
 81 | Hr
 82 | hot
 83 | id
 84 | in
 85 | im
 86 | iron
 87 | jn
 88 | kard
 89 | kat
 90 | katol
 91 | k.k
 92 | kk
 93 | kol
 94 | kl
 95 | k.p.a
 96 | kpc
 97 | k.p.c
 98 | kpt
 99 | kr
100 | k.r
101 | krak
102 | k.r.o
103 | kryt
104 | kult
105 | laic
106 | łac
107 | niem
108 | woj
109 | nb
110 | np
111 | Nb
112 | Np
113 | pol
114 | pow
115 | m.in
116 | pt
117 | ps
118 | Pt
119 | Ps
120 | cdn
121 | jw
122 | ryc
123 | rys
124 | Ryc
125 | Rys
126 | tj
127 | tzw
128 | Tzw
129 | tzn
130 | zob
131 | ang
132 | ub
133 | ul
134 | pw
135 | pn
136 | pl
137 | al
138 | k
139 | n
140 | nr #NUMERIC_ONLY#
141 | Nr #NUMERIC_ONLY#
142 | ww
143 | wł
144 | ur
145 | zm
146 | żyd
147 | żarg
148 | żyw
149 | wył
150 | bp
151 | bp
152 | wyst
153 | tow
154 | Tow
155 | o
156 | sp
157 | Sp
158 | st
159 | spółdz
160 | Spółdz
161 | społ
162 | spółgł
163 | stoł
164 | stow
165 | Stoł
166 | Stow
167 | zn
168 | zew
169 | zewn
170 | zdr
171 | zazw
172 | zast
173 | zaw
174 | zał
175 | zal
176 | zam
177 | zak
178 | zakł
179 | zagr
180 | zach
181 | adw
182 | Adw
183 | lek
184 | Lek
185 | med
186 | mec
187 | Mec
188 | doc
189 | Doc
190 | dyw
191 | dyr
192 | Dyw
193 | Dyr
194 | inż
195 | Inż
196 | mgr
197 | Mgr
198 | dh
199 | dr
200 | Dh
201 | Dr
202 | p
203 | P
204 | red
205 | Red
206 | prof
207 | prok
208 | Prof
209 | Prok
210 | hab
211 | płk
212 | Płk
213 | nadkom
214 | Nadkom
215 | podkom
216 | Podkom
217 | ks
218 | Ks
219 | gen
220 | Gen
221 | por
222 | Por
223 | reż
224 | Reż
225 | przyp
226 | Przyp
227 | śp
228 | św
229 | śW
230 | Śp
231 | Św
232 | ŚW
233 | szer
234 | Szer
235 | pkt #NUMERIC_ONLY#
236 | str #NUMERIC_ONLY#
237 | tab #NUMERIC_ONLY#
238 | Tab #NUMERIC_ONLY#
239 | tel
240 | ust #NUMERIC_ONLY#
241 | par #NUMERIC_ONLY#
242 | poz
243 | pok
244 | oo
245 | oO
246 | Oo
247 | OO
248 | r #NUMERIC_ONLY#
249 | l #NUMERIC_ONLY#
250 | s #NUMERIC_ONLY#
251 | najśw
252 | Najśw
253 | A
254 | B
255 | C
256 | D
257 | E
258 | F
259 | G
260 | H
261 | I
262 | J
263 | K
264 | L
265 | M
266 | N
267 | O
268 | P
269 | Q
270 | R
271 | S
272 | T
273 | U
274 | V
275 | W
276 | X
277 | Y
278 | Z
279 | Ś
280 | Ć
281 | Ż
282 | Ź
283 | Dz
284 | 


--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.ro:
--------------------------------------------------------------------------------
 1 | A
 2 | B
 3 | C
 4 | D
 5 | E
 6 | F
 7 | G
 8 | H
 9 | I
10 | J
11 | K
12 | L
13 | M
14 | N
15 | O
16 | P
17 | Q
18 | R
19 | S
20 | T
21 | U
22 | V
23 | W
24 | X
25 | Y
26 | Z
27 | dpdv
28 | etc
29 | șamd
30 | M.Ap.N
31 | dl
32 | Dl
33 | d-na
34 | D-na
35 | dvs
36 | Dvs
37 | pt
38 | Pt
39 | 


--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.ru:
--------------------------------------------------------------------------------
  1 | # added Cyrillic uppercase letters [А-Я]
  2 | # removed 000D carriage return (this is not removed by chomp in tokenizer.perl, and prevents recognition of the prefixes)
  3 | # edited by Kate Young (nspaceanalysis@earthlink.net) 21 May 2013
  4 | А
  5 | Б
  6 | В
  7 | Г
  8 | Д
  9 | Е
 10 | Ж
 11 | З
 12 | И
 13 | Й
 14 | К
 15 | Л
 16 | М
 17 | Н
 18 | О
 19 | П
 20 | Р
 21 | С
 22 | Т
 23 | У
 24 | Ф
 25 | Х
 26 | Ц
 27 | Ч
 28 | Ш
 29 | Щ
 30 | Ъ
 31 | Ы
 32 | Ь
 33 | Э
 34 | Ю
 35 | Я
 36 | A
 37 | B
 38 | C
 39 | D
 40 | E
 41 | F
 42 | G
 43 | H
 44 | I
 45 | J
 46 | K
 47 | L
 48 | M
 49 | N
 50 | O
 51 | P
 52 | Q
 53 | R
 54 | S
 55 | T
 56 | U
 57 | V
 58 | W
 59 | X
 60 | Y
 61 | Z
 62 | 0гг
 63 | 1гг
 64 | 2гг
 65 | 3гг
 66 | 4гг
 67 | 5гг
 68 | 6гг
 69 | 7гг
 70 | 8гг
 71 | 9гг
 72 | 0г
 73 | 1г
 74 | 2г
 75 | 3г
 76 | 4г
 77 | 5г
 78 | 6г
 79 | 7г
 80 | 8г
 81 | 9г
 82 | Xвв
 83 | Vвв
 84 | Iвв
 85 | Lвв
 86 | Mвв
 87 | Cвв
 88 | Xв
 89 | Vв
 90 | Iв
 91 | Lв
 92 | Mв
 93 | Cв
 94 | 0м
 95 | 1м
 96 | 2м
 97 | 3м
 98 | 4м
 99 | 5м
100 | 6м
101 | 7м
102 | 8м
103 | 9м
104 | 0мм
105 | 1мм
106 | 2мм
107 | 3мм
108 | 4мм
109 | 5мм
110 | 6мм
111 | 7мм
112 | 8мм
113 | 9мм
114 | 0см
115 | 1см
116 | 2см
117 | 3см
118 | 4см
119 | 5см
120 | 6см
121 | 7см
122 | 8см
123 | 9см
124 | 0дм
125 | 1дм
126 | 2дм
127 | 3дм
128 | 4дм
129 | 5дм
130 | 6дм
131 | 7дм
132 | 8дм
133 | 9дм
134 | 0л
135 | 1л
136 | 2л
137 | 3л
138 | 4л
139 | 5л
140 | 6л
141 | 7л
142 | 8л
143 | 9л
144 | 0км
145 | 1км
146 | 2км
147 | 3км
148 | 4км
149 | 5км
150 | 6км
151 | 7км
152 | 8км
153 | 9км
154 | 0га
155 | 1га
156 | 2га
157 | 3га
158 | 4га
159 | 5га
160 | 6га
161 | 7га
162 | 8га
163 | 9га
164 | 0кг
165 | 1кг
166 | 2кг
167 | 3кг
168 | 4кг
169 | 5кг
170 | 6кг
171 | 7кг
172 | 8кг
173 | 9кг
174 | 0т
175 | 1т
176 | 2т
177 | 3т
178 | 4т
179 | 5т
180 | 6т
181 | 7т
182 | 8т
183 | 9т
184 | 0г
185 | 1г
186 | 2г
187 | 3г
188 | 4г
189 | 5г
190 | 6г
191 | 7г
192 | 8г
193 | 9г
194 | 0мг
195 | 1мг
196 | 2мг
197 | 3мг
198 | 4мг
199 | 5мг
200 | 6мг
201 | 7мг
202 | 8мг
203 | 9мг
204 | бульв
205 | в
206 | вв
207 | г
208 | га
209 | гг
210 | гл
211 | гос
212 | д
213 | дм
214 | доп
215 | др
216 | е
217 | ед
218 | ед
219 | зам
220 | и
221 | инд
222 | исп
223 | Исп
224 | к
225 | кап
226 | кг
227 | кв
228 | кл
229 | км
230 | кол
231 | комн
232 | коп
233 | куб
234 | л
235 | лиц
236 | лл
237 | м
238 | макс
239 | мг
240 | мин
241 | мл
242 | млн
243 | млрд
244 | мм
245 | н
246 | наб
247 | нач
248 | неуд
249 | ном
250 | о
251 | обл
252 | обр
253 | общ
254 | ок
255 | ост
256 | отл
257 | п
258 | пер
259 | перераб
260 | пл
261 | пос
262 | пр
263 | просп
264 | проф
265 | р
266 | ред
267 | руб
268 | с
269 | сб
270 | св
271 | см
272 | соч
273 | ср
274 | ст
275 | стр
276 | т
277 | тел
278 | Тел
279 | тех
280 | тт
281 | туп
282 | тыс
283 | уд
284 | ул
285 | уч
286 | физ
287 | х
288 | хор
289 | ч
290 | чел
291 | шт
292 | экз
293 | э
294 | 


--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.sk:
--------------------------------------------------------------------------------
  1 | Bc
  2 | Mgr
  3 | RNDr
  4 | PharmDr
  5 | PhDr
  6 | JUDr
  7 | PaedDr
  8 | ThDr
  9 | Ing
 10 | MUDr
 11 | MDDr
 12 | MVDr
 13 | Dr
 14 | ThLic
 15 | PhD
 16 | ArtD
 17 | ThDr
 18 | Dr
 19 | DrSc
 20 | CSs
 21 | prof
 22 | obr
 23 | Obr
 24 | Č
 25 | č
 26 | absol
 27 | adj
 28 | admin
 29 | adr
 30 | Adr
 31 | adv
 32 | advok
 33 | afr
 34 | ak
 35 | akad
 36 | akc
 37 | akuz
 38 | et
 39 | al
 40 | alch
 41 | amer
 42 | anat
 43 | angl
 44 | Angl
 45 | anglosas
 46 | anorg
 47 | ap
 48 | apod
 49 | arch
 50 | archeol
 51 | archit
 52 | arg
 53 | art
 54 | astr
 55 | astrol
 56 | astron
 57 | atp
 58 | atď
 59 | austr
 60 | Austr
 61 | aut
 62 | belg
 63 | Belg
 64 | bibl
 65 | Bibl
 66 | biol
 67 | bot
 68 | bud
 69 | bás
 70 | býv
 71 | cest
 72 | chem
 73 | cirk
 74 | csl
 75 | čs
 76 | Čs
 77 | dat
 78 | dep
 79 | det
 80 | dial
 81 | diaľ
 82 | dipl
 83 | distrib
 84 | dokl
 85 | dosl
 86 | dopr
 87 | dram
 88 | duš
 89 | dv
 90 | dvojčl
 91 | dór
 92 | ekol
 93 | ekon
 94 | el
 95 | elektr
 96 | elektrotech
 97 | energet
 98 | epic
 99 | est
100 | etc
101 | etonym
102 | eufem
103 | európ
104 | Európ
105 | ev
106 | evid
107 | expr
108 | fa
109 | fam
110 | farm
111 | fem
112 | feud
113 | fil
114 | filat
115 | filoz
116 | fi
117 | fon
118 | form
119 | fot
120 | fr
121 | Fr
122 | franc
123 | Franc
124 | fraz
125 | fut
126 | fyz
127 | fyziol
128 | garb
129 | gen
130 | genet
131 | genpor
132 | geod
133 | geogr
134 | geol
135 | geom
136 | germ
137 | gr
138 | Gr
139 | gréc
140 | Gréc
141 | gréckokat
142 | hebr
143 | herald
144 | hist
145 | hlav
146 | hosp
147 | hromad
148 | hud
149 | hypok
150 | ident
151 | i.e
152 | ident
153 | imp
154 | impf
155 | indoeur
156 | inf
157 | inform
158 | instr
159 | int
160 | interj
161 | inšt
162 | inštr
163 | iron
164 | jap
165 | Jap
166 | jaz
167 | jedn
168 | juhoamer
169 | juhových
170 | juhozáp
171 | juž
172 | kanad
173 | Kanad
174 | kanc
175 | kapit
176 | kpt
177 | kart
178 | katastr
179 | knih
180 | kniž
181 | komp
182 | konj
183 | konkr
184 | kozmet
185 | krajč
186 | kresť
187 | kt
188 | kuch
189 | lat
190 | latinskoamer
191 | lek
192 | lex
193 | lingv
194 | lit
195 | litur
196 | log
197 | lok
198 | max
199 | Max
200 | maď
201 | Maď
202 | medzinár
203 | mest
204 | metr
205 | mil
206 | Mil
207 | min
208 | Min
209 | miner
210 | ml
211 | mld
212 | mn
213 | mod
214 | mytol
215 | napr
216 | nar
217 | Nar
218 | nasl
219 | nedok
220 | neg
221 | negat
222 | neklas
223 | nem
224 | Nem
225 | neodb
226 | neos
227 | neskl
228 | nesklon
229 | nespis
230 | nespráv
231 | neved
232 | než
233 | niekt
234 | niž
235 | nom
236 | náb
237 | nákl
238 | námor
239 | nár
240 | obch
241 | obj
242 | obv
243 | obyč
244 | obč
245 | občian
246 | odb
247 | odd
248 | ods
249 | ojed
250 | okr
251 | Okr
252 | opt
253 | opyt
254 | org
255 | os
256 | osob
257 | ot
258 | ovoc
259 | par
260 | part
261 | pejor
262 | pers
263 | pf
264 | Pf 
265 | P.f
266 | p.f
267 | pl
268 | Plk
269 | pod
270 | podst
271 | pokl
272 | polit
273 | politol
274 | polygr
275 | pomn
276 | popl
277 | por
278 | porad
279 | porov
280 | posch
281 | potrav
282 | použ
283 | poz
284 | pozit
285 | poľ
286 | poľno
287 | poľnohosp
288 | poľov
289 | pošt
290 | pož
291 | prac
292 | predl
293 | pren
294 | prep
295 | preuk
296 | priezv
297 | Priezv
298 | privl
299 | prof
300 | práv
301 | príd
302 | príj
303 | prík
304 | príp
305 | prír
306 | prísl
307 | príslov
308 | príč
309 | psych
310 | publ
311 | pís
312 | písm
313 | pôv
314 | refl
315 | reg
316 | rep
317 | resp
318 | rozk
319 | rozlič
320 | rozpráv
321 | roč
322 | Roč
323 | ryb
324 | rádiotech
325 | rím
326 | samohl
327 | semest
328 | sev
329 | severoamer
330 | severových
331 | severozáp
332 | sg
333 | skr
334 | skup
335 | sl
336 | Sloven
337 | soc
338 | soch
339 | sociol
340 | sp
341 | spol
342 | Spol
343 | spoloč
344 | spoluhl
345 | správ
346 | spôs
347 | st
348 | star
349 | starogréc
350 | starorím
351 | s.r.o
352 | stol
353 | stor
354 | str
355 | stredoamer
356 | stredoškol
357 | subj
358 | subst
359 | superl
360 | sv
361 | sz
362 | súkr
363 | súp
364 | súvzť
365 | tal
366 | Tal
367 | tech
368 | tel
369 | Tel
370 | telef
371 | teles
372 | telev
373 | teol
374 | trans
375 | turist
376 | tuzem
377 | typogr
378 | tzn
379 | tzv
380 | ukaz
381 | ul
382 | Ul
383 | umel
384 | univ
385 | ust
386 | ved
387 | vedľ
388 | verb
389 | veter
390 | vin
391 | viď
392 | vl
393 | vod
394 | vodohosp
395 | pnl
396 | vulg
397 | vyj
398 | vys
399 | vysokoškol
400 | vzťaž
401 | vôb
402 | vých
403 | výd
404 | výrob
405 | výsk
406 | výsl
407 | výtv
408 | výtvar
409 | význ
410 | včel
411 | vš
412 | všeob
413 | zahr
414 | zar
415 | zariad
416 | zast
417 | zastar
418 | zastaráv
419 | zb
420 | zdravot
421 | združ
422 | zjemn
423 | zlat
424 | zn
425 | Zn
426 | zool
427 | zr
428 | zried
429 | zv
430 | záhr
431 | zák
432 | zákl
433 | zám
434 | záp
435 | západoeur
436 | zázn
437 | územ
438 | účt
439 | čast
440 | čes
441 | Čes
442 | čl
443 | čísl
444 | živ
445 | pr
446 | fak
447 | Kr
448 | p.n.l
449 | A
450 | B
451 | C
452 | D
453 | E
454 | F
455 | G
456 | H
457 | I
458 | J
459 | K
460 | L
461 | M
462 | N
463 | O
464 | P
465 | Q
466 | R
467 | S
468 | T
469 | U
470 | V
471 | W
472 | X
473 | Y
474 | Z
475 | 


--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.sl:
--------------------------------------------------------------------------------
 1 | dr
 2 | Dr
 3 | itd
 4 | itn
 5 | št #NUMERIC_ONLY#
 6 | Št #NUMERIC_ONLY#
 7 | d
 8 | jan
 9 | Jan
10 | feb
11 | Feb
12 | mar
13 | Mar
14 | apr
15 | Apr
16 | jun
17 | Jun
18 | jul
19 | Jul
20 | avg
21 | Avg
22 | sept
23 | Sept
24 | sep
25 | Sep
26 | okt
27 | Okt
28 | nov
29 | Nov
30 | dec
31 | Dec
32 | tj
33 | Tj
34 | npr
35 | Npr
36 | sl
37 | Sl
38 | op
39 | Op
40 | gl
41 | Gl
42 | oz
43 | Oz
44 | prev
45 | dipl
46 | ing
47 | prim
48 | Prim
49 | cf
50 | Cf
51 | gl
52 | Gl
53 | A
54 | B
55 | C
56 | D
57 | E
58 | F
59 | G
60 | H
61 | I
62 | J
63 | K
64 | L
65 | M
66 | N
67 | O
68 | P
69 | Q
70 | R
71 | S
72 | T
73 | U
74 | V
75 | W
76 | X
77 | Y
78 | Z
79 | 


--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.sv:
--------------------------------------------------------------------------------
 1 | #single upper case letters are usually initials
 2 | A
 3 | B
 4 | C
 5 | D
 6 | E
 7 | F
 8 | G
 9 | H
10 | I
11 | J
12 | K
13 | L
14 | M
15 | N
16 | O
17 | P
18 | Q
19 | R
20 | S
21 | T
22 | U
23 | V
24 | W
25 | X
26 | Y
27 | Z
28 | #misc abbreviations
29 | AB
30 | G
31 | VG
32 | dvs
33 | etc
34 | from
35 | iaf
36 | jfr
37 | kl
38 | kr
39 | mao
40 | mfl
41 | mm
42 | osv
43 | pga
44 | tex
45 | tom
46 | vs
47 | 


--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.ta:
--------------------------------------------------------------------------------
  1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
  2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
  3 | 
  4 | #any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
  5 | #usually upper case letters are initials in a name
  6 | அ
  7 | ஆ
  8 | இ
  9 | ஈ
 10 | உ
 11 | ஊ
 12 | எ
 13 | ஏ
 14 | ஐ
 15 | ஒ
 16 | ஓ
 17 | ஔ
 18 | ஃ
 19 | க
 20 | கா
 21 | கி
 22 | கீ
 23 | கு
 24 | கூ
 25 | கெ
 26 | கே
 27 | கை
 28 | கொ
 29 | கோ
 30 | கௌ
 31 | க்
 32 | ச
 33 | சா
 34 | சி
 35 | சீ
 36 | சு
 37 | சூ
 38 | செ
 39 | சே
 40 | சை
 41 | சொ
 42 | சோ
 43 | சௌ
 44 | ச்
 45 | ட
 46 | டா
 47 | டி
 48 | டீ
 49 | டு
 50 | டூ
 51 | டெ
 52 | டே
 53 | டை
 54 | டொ
 55 | டோ
 56 | டௌ
 57 | ட்
 58 | த
 59 | தா
 60 | தி
 61 | தீ
 62 | து
 63 | தூ
 64 | தெ
 65 | தே
 66 | தை
 67 | தொ
 68 | தோ
 69 | தௌ
 70 | த்
 71 | ப
 72 | பா
 73 | பி
 74 | பீ
 75 | பு
 76 | பூ
 77 | பெ
 78 | பே
 79 | பை
 80 | பொ
 81 | போ
 82 | பௌ
 83 | ப்
 84 | ற
 85 | றா
 86 | றி
 87 | றீ
 88 | று
 89 | றூ
 90 | றெ
 91 | றே
 92 | றை
 93 | றொ
 94 | றோ
 95 | றௌ
 96 | ற்
 97 | ய
 98 | யா
 99 | யி
100 | யீ
101 | யு
102 | யூ
103 | யெ
104 | யே
105 | யை
106 | யொ
107 | யோ
108 | யௌ
109 | ய்
110 | ர
111 | ரா
112 | ரி
113 | ரீ
114 | ரு
115 | ரூ
116 | ரெ
117 | ரே
118 | ரை
119 | ரொ
120 | ரோ
121 | ரௌ
122 | ர்
123 | ல
124 | லா
125 | லி
126 | லீ
127 | லு
128 | லூ
129 | லெ
130 | லே
131 | லை
132 | லொ
133 | லோ
134 | லௌ
135 | ல்
136 | வ
137 | வா
138 | வி
139 | வீ
140 | வு
141 | வூ
142 | வெ
143 | வே
144 | வை
145 | வொ
146 | வோ
147 | வௌ
148 | வ்
149 | ள
150 | ளா
151 | ளி
152 | ளீ
153 | ளு
154 | ளூ
155 | ளெ
156 | ளே
157 | ளை
158 | ளொ
159 | ளோ
160 | ளௌ
161 | ள்
162 | ழ
163 | ழா
164 | ழி
165 | ழீ
166 | ழு
167 | ழூ
168 | ழெ
169 | ழே
170 | ழை
171 | ழொ
172 | ழோ
173 | ழௌ
174 | ழ்
175 | ங
176 | ஙா
177 | ஙி
178 | ஙீ
179 | ஙு
180 | ஙூ
181 | ஙெ
182 | ஙே
183 | ஙை
184 | ஙொ
185 | ஙோ
186 | ஙௌ
187 | ங்  
188 | ஞ
189 | ஞா
190 | ஞி
191 | ஞீ
192 | ஞு
193 | ஞூ
194 | ஞெ
195 | ஞே
196 | ஞை
197 | ஞொ
198 | ஞோ
199 | ஞௌ
200 | ஞ் 
201 | ண
202 | ணா
203 | ணி
204 | ணீ
205 | ணு
206 | ணூ
207 | ணெ
208 | ணே
209 | ணை
210 | ணொ
211 | ணோ
212 | ணௌ
213 | ண்
214 | ந
215 | நா
216 | நி
217 | நீ
218 | நு
219 | நூ
220 | நெ
221 | நே
222 | நை
223 | நொ
224 | நோ
225 | நௌ
226 | ந் 	
227 | ம
228 | மா
229 | மி
230 | மீ
231 | மு
232 | மூ
233 | மெ
234 | மே
235 | மை
236 | மொ
237 | மோ
238 | மௌ
239 | ம் 	
240 | ன
241 | னா
242 | னி
243 | னீ
244 | னு
245 | னூ
246 | னெ
247 | னே
248 | னை
249 | னொ
250 | னோ
251 | னௌ
252 | ன்
253 | 
254 | 
255 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
256 | திரு
257 | திருமதி
258 | வண
259 | கௌரவ
260 | 
261 | 
262 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
263 | உ.ம்
264 | #கா.ம்
265 | #எ.ம்
266 | 
267 | 
268 | #Numbers only. These should only induce breaks when followed by a numeric sequence
269 | # add NUMERIC_ONLY after the word for this function
270 | #This case is mostly for the english "No." which can either be a sentence of its own, or
271 | #if followed by a number, a non-breaking prefix
272 | No #NUMERIC_ONLY# 
273 | Nos
274 | Art #NUMERIC_ONLY#
275 | Nr
276 | pp #NUMERIC_ONLY#
277 | 


--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.yue:
--------------------------------------------------------------------------------
 1 | #
 2 | # Cantonese (Chinese)
 3 | #
 4 | # Anything in this file, followed by a period, 
 5 | # does NOT indicate an end-of-sentence marker.
 6 | #
 7 | # English/Euro-language given-name initials (appearing in
 8 | # news, periodicals, etc.)
 9 | A
10 | Ā
11 | B
12 | C
13 | Č
14 | D
15 | E
16 | Ē
17 | F
18 | G
19 | Ģ
20 | H
21 | I
22 | Ī
23 | J
24 | K
25 | Ķ
26 | L
27 | Ļ
28 | M
29 | N
30 | Ņ
31 | O
32 | P
33 | Q
34 | R
35 | S
36 | Š
37 | T
38 | U
39 | Ū
40 | V
41 | W
42 | X
43 | Y
44 | Z
45 | Ž
46 | 
47 | # Numbers only. These should only induce breaks when followed by
48 | # a numeric sequence.
49 | # Add NUMERIC_ONLY after the word for this function. This case is
50 | # mostly for the english "No." which can either be a sentence of its
51 | # own, or if followed by a number, a non-breaking prefix.
52 | No #NUMERIC_ONLY#
53 | Nr #NUMERIC_ONLY#
54 | 


--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.zh:
--------------------------------------------------------------------------------
 1 | #
 2 | # Mandarin (Chinese)
 3 | #
 4 | # Anything in this file, followed by a period, 
 5 | # does NOT indicate an end-of-sentence marker.
 6 | #
 7 | # English/Euro-language given-name initials (appearing in
 8 | # news, periodicals, etc.)
 9 | A
10 | Ā
11 | B
12 | C
13 | Č
14 | D
15 | E
16 | Ē
17 | F
18 | G
19 | Ģ
20 | H
21 | I
22 | Ī
23 | J
24 | K
25 | Ķ
26 | L
27 | Ļ
28 | M
29 | N
30 | Ņ
31 | O
32 | P
33 | Q
34 | R
35 | S
36 | Š
37 | T
38 | U
39 | Ū
40 | V
41 | W
42 | X
43 | Y
44 | Z
45 | Ž
46 | 
47 | # Numbers only. These should only induce breaks when followed by
48 | # a numeric sequence.
49 | # Add NUMERIC_ONLY after the word for this function. This case is
50 | # mostly for the english "No." which can either be a sentence of its
51 | # own, or if followed by a number, a non-breaking prefix.
52 | No #NUMERIC_ONLY#
53 | Nr #NUMERIC_ONLY#
54 | 


--------------------------------------------------------------------------------
/translate.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | from __future__ import division, unicode_literals, print_function
  4 | import os
  5 | import argparse
  6 | import math
  7 | import codecs
  8 | import torch
  9 | import time
 10 | 
 11 | from itertools import count
 12 | 
 13 | import onmt.io
 14 | import onmt.translate
 15 | import onmt
 16 | import onmt.ModelConstructor
 17 | import onmt.modules
 18 | import opts
 19 | 
 20 | parser = argparse.ArgumentParser(
 21 |     description='translate.py',
 22 |     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 23 | opts.add_md_help_argument(parser)
 24 | opts.translate_opts(parser)
 25 | 
 26 | opt = parser.parse_args()
 27 | 
 28 | 
 29 | def _report_score(name, score_total, words_total):
 30 |     print("%s AVG SCORE: %.4f, %s PPL: %.4f" % (
 31 |         name, score_total / words_total,
 32 |         name, math.exp(-score_total / words_total)))
 33 | 
 34 | 
 35 | def _report_bleu():
 36 |     import subprocess
 37 |     print()
 38 |     res = subprocess.check_output(
 39 |         "perl tools/multi-bleu.perl %s < %s" % (opt.tgt, opt.output),
 40 |         shell=True).decode("utf-8")
 41 |     print(">> " + res.strip())
 42 | 
 43 | 
 44 | def _report_rouge():
 45 |     import subprocess
 46 |     res = subprocess.check_output(
 47 |         "python tools/test_rouge.py -r %s -c %s" % (opt.tgt, opt.output),
 48 |         shell=True).decode("utf-8")
 49 |     print(res.strip())
 50 | 
 51 | 
 52 | def main():
 53 |     dummy_parser = argparse.ArgumentParser(description='train.py')
 54 |     opts.model_opts(dummy_parser)
 55 |     dummy_opt = dummy_parser.parse_known_args([])[0]
 56 | 
 57 |     opt.cuda = opt.gpu > -1
 58 |     if opt.cuda:
 59 |         torch.cuda.set_device(opt.gpu)
 60 | 
 61 |     # Load the model.
 62 |     fields, model, model_opt = \
 63 |         onmt.ModelConstructor.load_test_model(opt, dummy_opt.__dict__)
 64 | 
 65 |     # File to write sentences to.
 66 |     out_file = codecs.open(opt.output, 'w', 'utf-8')
 67 | 
 68 |     # Test data
 69 |     data = onmt.io.build_dataset(fields, opt.data_type,
 70 |                                  opt.src, opt.tgt,
 71 |                                  src_dir=opt.src_dir,
 72 |                                  sample_rate=opt.sample_rate,
 73 |                                  window_size=opt.window_size,
 74 |                                  window_stride=opt.window_stride,
 75 |                                  window=opt.window,
 76 |                                  use_filter_pred=False)
 77 | 
 78 |     # Sort batch by decreasing lengths of sentence required by pytorch.
 79 |     # sort=False means "Use dataset's sortkey instead of iterator's".
 80 |     data_iter = onmt.io.OrderedIterator(
 81 |         dataset=data, device=opt.gpu,
 82 |         batch_size=opt.batch_size, train=False, sort=False,
 83 |         sort_within_batch=True, shuffle=False)
 84 | 
 85 |     # Translator
 86 |     scorer = onmt.translate.GNMTGlobalScorer(opt.alpha, opt.beta)
 87 |     translator = onmt.translate.Translator(model, fields,
 88 |                                            beam_size=opt.beam_size,
 89 |                                            n_best=opt.n_best,
 90 |                                            global_scorer=scorer,
 91 |                                            max_length=opt.max_length,
 92 |                                            copy_attn=model_opt.copy_attn,
 93 |                                            cuda=opt.cuda,
 94 |                                            beam_trace=opt.dump_beam != "",
 95 |                                            min_length=opt.min_length)
 96 |     builder = onmt.translate.TranslationBuilder(
 97 |         data, translator.fields,
 98 |         opt.n_best, opt.replace_unk, opt.tgt)
 99 | 
100 |     # Statistics
101 |     counter = count(1)
102 |     pred_score_total, pred_words_total = 0, 0
103 |     gold_score_total, gold_words_total = 0, 0
104 | 
105 |     start_time = time.time()
106 |     n_processed = 0
107 |     print("Processed ", end="")
108 |     for batch in data_iter:
109 |         batch_data = translator.translate_batch(batch, data)
110 |         translations = builder.from_batch(batch_data)
111 | 
112 |         for trans in translations:
113 |             pred_score_total += trans.pred_scores[0]
114 |             pred_words_total += len(trans.pred_sents[0])
115 |             if opt.tgt:
116 |                 gold_score_total += trans.gold_score
117 |                 gold_words_total += len(trans.gold_sent)
118 | 
119 |             n_best_preds = [" ".join(pred)
120 |                             for pred in trans.pred_sents[:opt.n_best]]
121 |             out_file.write('\n'.join(n_best_preds))
122 |             out_file.write('\n')
123 |             out_file.flush()
124 | 
125 |             if opt.verbose:
126 |                 sent_number = next(counter)
127 |                 output = trans.log(sent_number)
128 |                 os.write(1, output.encode('utf-8'))
129 | 
130 |         n_processed+=len(batch_data["batch"])
131 |         if n_processed % 100 == 0:
132 |             if n_processed == 100:
133 |                 print("%d"%n_processed, end=" ", flush=True)
134 |             else:
135 |                 print(", %d"%n_processed, end=" ", flush=True)
136 |     print("", flush=True)
137 | 
138 |     elapsed_time = time.time() - start_time
139 | 
140 |     _report_score('PRED', pred_score_total, pred_words_total)
141 |     if opt.tgt:
142 |         _report_score('GOLD', gold_score_total, gold_words_total)
143 |         if opt.report_bleu:
144 |             _report_bleu()
145 |         if opt.report_rouge:
146 |             _report_rouge()
147 | 
148 |     if opt.dump_beam:
149 |         import json
150 |         json.dump(translator.beam_accum,
151 |                   codecs.open(opt.dump_beam, 'w', 'utf-8'))
152 | 
153 |     print("Translations computed in %d seconds."%elapsed_time)
154 | 
155 | 
156 | if __name__ == "__main__":
157 |     main()
158 | 


--------------------------------------------------------------------------------
/translate_mm_vi.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | from __future__ import division, unicode_literals
  4 | import os
  5 | import argparse
  6 | import math
  7 | import codecs
  8 | import torch
  9 | 
 10 | from itertools import count
 11 | 
 12 | import onmt.io
 13 | import onmt.translate
 14 | import onmt
 15 | import onmt.ModelConstructor
 16 | import onmt.modules
 17 | from onmt.Utils import MODEL_TYPES
 18 | import opts
 19 | import tables
 20 | 
 21 | parser = argparse.ArgumentParser(
 22 |     description='translate_mm_vi.py',
 23 |     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 24 | opts.add_md_help_argument(parser)
 25 | opts.translate_opts(parser)
 26 | opts.translate_mm_vi_opts(parser)
 27 | 
 28 | opt = parser.parse_args()
 29 | 
 30 | 
 31 | def _report_score(name, score_total, words_total):
 32 |     print("%s AVG SCORE: %.4f, %s PPL: %.4f" % (
 33 |         name, score_total / words_total,
 34 |         name, math.exp(-score_total / words_total)))
 35 | 
 36 | 
 37 | def _report_bleu():
 38 |     import subprocess
 39 |     print()
 40 |     res = subprocess.check_output(
 41 |         "perl tools/multi-bleu.perl %s < %s" % (opt.tgt, opt.output),
 42 |         shell=True).decode("utf-8")
 43 |     print(">> " + res.strip())
 44 | 
 45 | 
 46 | def _report_rouge():
 47 |     import subprocess
 48 |     res = subprocess.check_output(
 49 |         "python tools/test_rouge.py -r %s -c %s" % (opt.tgt, opt.output),
 50 |         shell=True).decode("utf-8")
 51 |     print(res.strip())
 52 | 
 53 | 
 54 | def main():
 55 |     dummy_parser = argparse.ArgumentParser(description='train_mm_vi.py')
 56 |     opts.model_opts(dummy_parser)
 57 |     dummy_opt = dummy_parser.parse_known_args([])[0]
 58 | 
 59 |     opt.cuda = opt.gpu > -1
 60 |     if opt.cuda:
 61 |         torch.cuda.set_device(opt.gpu)
 62 |         print("Using GPU")
 63 |         torch.set_default_tensor_type("torch.cuda.FloatTensor")
 64 |     else:
 65 |         print("Using CPU")
 66 |         torch.set_default_tensor_type("torch.FloatTensor")
 67 |     
 68 |     # loading checkpoint just to find multimodal model type
 69 |     checkpoint = torch.load(opt.model,
 70 |                             map_location=lambda storage, loc: storage)
 71 |     opt.multimodal_model_type = checkpoint['opt'].multimodal_model_type
 72 |     opt.use_global_image_features = checkpoint['opt'].use_global_image_features
 73 |     opt.use_posterior_image_features = checkpoint['opt'].use_posterior_image_features
 74 |     # work-around to get fix issue
 75 |     assert(opt.multimodal_model_type in MODEL_TYPES), \
 76 |             'Variational multimodal model type not implemented: %s'%str(opt.multimodal_model_type)
 77 |     print("Translating with multimodal_model_type: %s"%str(opt.multimodal_model_type))
 78 |     del checkpoint
 79 | 
 80 |     if opt.batch_size > 1:
 81 |         print( "Batch size > 1 not implemented! Falling back to batch_size = 1 ..." )
 82 |         opt.batch_size = 1
 83 | 
 84 |     # load test image features
 85 |     test_file = tables.open_file(opt.path_to_test_img_feats, mode='r')
 86 |     if opt.multimodal_model_type in MODEL_TYPES:
 87 |         if opt.use_global_image_features:
 88 |             # load only the global image features
 89 |             test_img_feats = test_file.root.global_feats[:]
 90 |             print('Using global image features...')
 91 |         else: # opt.use_posterior_image_features
 92 |             # load only the global image features
 93 |             test_img_feats = test_file.root.logits[:]
 94 |             print('Using image posterior class probabilities...')
 95 |     else:
 96 |         raise Exception("Model type not implemented: %s"%opt.multimodal_model_type)
 97 |     test_file.close()
 98 | 
 99 |     # Load the model.
100 |     fields, model, model_opt = \
101 |         onmt.ModelConstructor.load_test_model(opt, dummy_opt.__dict__)
102 | 
103 |     # File to write sentences to.
104 |     out_file = codecs.open(opt.output, 'w', 'utf-8')
105 | 
106 |     # Test data
107 |     data = onmt.io.build_dataset(fields, opt.data_type,
108 |                                  opt.src, opt.tgt,
109 |                                  src_dir=opt.src_dir,
110 |                                  sample_rate=opt.sample_rate,
111 |                                  window_size=opt.window_size,
112 |                                  window_stride=opt.window_stride,
113 |                                  window=opt.window,
114 |                                  use_filter_pred=False)
115 | 
116 |     # Sort batch by decreasing lengths of sentence required by pytorch.
117 |     # sort=False means "Use dataset's sortkey instead of iterator's".
118 |     print("opt.gpu: %s"%str(opt.gpu))
119 |     data_iter = onmt.io.OrderedIterator(
120 |         dataset=data, device=opt.gpu,
121 |         batch_size=opt.batch_size, train=False, sort=False,
122 |         sort_within_batch=True, shuffle=False)
123 | 
124 |     # Translator
125 |     scorer = onmt.translate.GNMTGlobalScorer(opt.alpha, opt.beta)
126 |     translator = onmt.translate.TranslatorMultimodalVI(model, fields,
127 |                                            beam_size=opt.beam_size,
128 |                                            n_best=opt.n_best,
129 |                                            global_scorer=scorer,
130 |                                            max_length=opt.max_length,
131 |                                            copy_attn=model_opt.copy_attn,
132 |                                            cuda=opt.cuda,
133 |                                            beam_trace=opt.dump_beam != "",
134 |                                            min_length=opt.min_length,
135 |                                            test_img_feats=test_img_feats,
136 |                                            multimodal_model_type=opt.multimodal_model_type)
137 |     builder = onmt.translate.TranslationBuilder(
138 |         data, translator.fields,
139 |         opt.n_best, opt.replace_unk, opt.tgt)
140 | 
141 |     # Statistics
142 |     counter = count(1)
143 |     pred_score_total, pred_words_total = 0, 0
144 |     gold_score_total, gold_words_total = 0, 0
145 | 
146 |     for sent_idx, batch in enumerate(data_iter):
147 |         batch_data = translator.translate_batch(batch, data, sent_idx)
148 |         translations = builder.from_batch(batch_data)
149 | 
150 |         for trans in translations:
151 |             pred_score_total += trans.pred_scores[0]
152 |             pred_words_total += len(trans.pred_sents[0])
153 |             if opt.tgt:
154 |                 gold_score_total += trans.gold_score
155 |                 gold_words_total += len(trans.gold_sent)
156 | 
157 |             n_best_preds = [" ".join(pred)
158 |                             for pred in trans.pred_sents[:opt.n_best]]
159 |             out_file.write('\n'.join(n_best_preds))
160 |             out_file.write('\n')
161 |             out_file.flush()
162 | 
163 |             if opt.verbose:
164 |                 sent_number = next(counter)
165 |                 output = trans.log(sent_number)
166 |                 os.write(1, output.encode('utf-8'))
167 | 
168 |     _report_score('PRED', pred_score_total, pred_words_total)
169 |     if opt.tgt:
170 |         _report_score('GOLD', gold_score_total, gold_words_total)
171 |         if opt.report_bleu:
172 |             _report_bleu()
173 |         if opt.report_rouge:
174 |             _report_rouge()
175 | 
176 |     if opt.dump_beam:
177 |         import json
178 |         json.dump(translator.beam_accum,
179 |                   codecs.open(opt.dump_beam, 'w', 'utf-8'))
180 | 
181 | 
182 | if __name__ == "__main__":
183 |     main()
184 | 


--------------------------------------------------------------------------------