├── README.md ├── onmt ├── EarlyStop.py ├── Loss.py ├── ModelConstructor.py ├── Models.py ├── Optim.py ├── Trainer.py ├── TrainerMultimodal.py ├── Utils.py ├── VILoss.py ├── VI_Model1.py ├── __init__.py ├── __pycache__ │ ├── EarlyStop.cpython-36.pyc │ ├── Loss.cpython-36.pyc │ ├── ModelConstructor.cpython-36.pyc │ ├── Models.cpython-36.pyc │ ├── Optim.cpython-36.pyc │ ├── Trainer.cpython-36.pyc │ ├── TrainerMultimodal.cpython-36.pyc │ ├── Utils.cpython-36.pyc │ ├── VILoss.cpython-36.pyc │ ├── VI_Model1.cpython-36.pyc │ └── __init__.cpython-36.pyc ├── io │ ├── AudioDataset.py │ ├── DatasetBase.py │ ├── IO.py │ ├── ImageDataset.py │ ├── TextDataset.py │ ├── __init__.py │ └── __pycache__ │ │ ├── AudioDataset.cpython-36.pyc │ │ ├── DatasetBase.cpython-36.pyc │ │ ├── IO.cpython-36.pyc │ │ ├── ImageDataset.cpython-36.pyc │ │ ├── TextDataset.cpython-36.pyc │ │ └── __init__.cpython-36.pyc ├── modules │ ├── AudioEncoder.py │ ├── Conv2Conv.py │ ├── ConvMultiStepAttention.py │ ├── CopyGenerator.py │ ├── Dists.py │ ├── Embeddings.py │ ├── Gate.py │ ├── GlobalAttention.py │ ├── ImageEncoder.py │ ├── MultiHeadedAttn.py │ ├── NormalVariationalEncoder.py │ ├── SRU.py │ ├── StackedRNN.py │ ├── StructuredAttention.py │ ├── Transformer.py │ ├── UtilClass.py │ ├── WeightNorm.py │ ├── WordDropout.py │ ├── __init__.py │ └── __pycache__ │ │ ├── AudioEncoder.cpython-36.pyc │ │ ├── Conv2Conv.cpython-36.pyc │ │ ├── ConvMultiStepAttention.cpython-36.pyc │ │ ├── CopyGenerator.cpython-36.pyc │ │ ├── Dists.cpython-36.pyc │ │ ├── Embeddings.cpython-36.pyc │ │ ├── Gate.cpython-36.pyc │ │ ├── GlobalAttention.cpython-36.pyc │ │ ├── ImageEncoder.cpython-36.pyc │ │ ├── MultiHeadedAttn.cpython-36.pyc │ │ ├── NormalVariationalEncoder.cpython-36.pyc │ │ ├── SRU.cpython-36.pyc │ │ ├── StackedRNN.cpython-36.pyc │ │ ├── StructuredAttention.cpython-36.pyc │ │ ├── Transformer.cpython-36.pyc │ │ ├── UtilClass.cpython-36.pyc │ │ ├── WeightNorm.cpython-36.pyc │ │ ├── WordDropout.cpython-36.pyc │ │ └── __init__.cpython-36.pyc └── translate │ ├── Beam.py │ ├── Translation.py │ ├── Translator.py │ ├── TranslatorMultimodalVI.py │ ├── __init__.py │ └── __pycache__ │ ├── Beam.cpython-36.pyc │ ├── Translation.cpython-36.pyc │ ├── Translator.cpython-36.pyc │ ├── TranslatorMultimodalVI.cpython-36.pyc │ └── __init__.cpython-36.pyc ├── opts.py ├── preprocess.py ├── requirements.txt ├── run_additional_data.sh ├── run_translated_m30k_only.sh ├── setup.py ├── tools ├── multi-bleu.perl └── nonbreaking_prefixes │ ├── README.txt │ ├── nonbreaking_prefix.ca │ ├── nonbreaking_prefix.cs │ ├── nonbreaking_prefix.de │ ├── nonbreaking_prefix.el │ ├── nonbreaking_prefix.en │ ├── nonbreaking_prefix.es │ ├── nonbreaking_prefix.fi │ ├── nonbreaking_prefix.fr │ ├── nonbreaking_prefix.ga │ ├── nonbreaking_prefix.hu │ ├── nonbreaking_prefix.is │ ├── nonbreaking_prefix.it │ ├── nonbreaking_prefix.lt │ ├── nonbreaking_prefix.lv │ ├── nonbreaking_prefix.nl │ ├── nonbreaking_prefix.pl │ ├── nonbreaking_prefix.ro │ ├── nonbreaking_prefix.ru │ ├── nonbreaking_prefix.sk │ ├── nonbreaking_prefix.sl │ ├── nonbreaking_prefix.sv │ ├── nonbreaking_prefix.ta │ ├── nonbreaking_prefix.yue │ └── nonbreaking_prefix.zh ├── train.py ├── train_mm_vi_model1.py ├── translate.py └── translate_mm_vi.py /README.md: -------------------------------------------------------------------------------- 1 | # variational\_mmt 2 | 3 | ## TL-DR 4 | 5 | This is the code base one should use to reproduce results reported in the ACL 2019 paper [Latent variable model for multi-modal translation](https://www.aclweb.org/anthology/papers/P/P19/P19-1642/). 6 | We propose a conditional variational auto-encoder model for multi-modal translation, 7 | i.e. to model the interaction between visual and textual features for multi-modal neural machine translation (MMT) through a latent variable model. 8 | This latent variable can be seen as a multi-modal stochastic embedding of an image and its description in a foreign language. 9 | It is used in a target-language decoder and also to predict image features. 10 | Importantly, our model formulation utilises visual and textual inputs during training but does not require that images be available at test time. 11 | Please refer to the paper for more details. 12 | 13 | ## Before you start 14 | 15 | Before you start, please ensure that: 16 | 17 | - You have installed the right version of PyTorch and all the dependencies according to `requirements.txt`; 18 | - If you want to use your own version of the Multi30k data set, that you changed the respective variable names in the `run_*.sh` files as required. 19 | 20 | If you want to use the exact version of the Multi30k data set used in the paper: 21 | 22 | - download a tarball containing all files (PyTorch binaries and image features) for the translated Multi30k data set experiments [here](https://surfdrive.surf.nl/files/index.php/s/VmqtrhTipDv2djx). The tarball includes: 23 | - `flickr30k_train_resnet50_cnn_features.hdf5`: training set image features, 29K examples. 24 | - `flickr30k_valid_resnet50_cnn_features.hdf5`: validation set image features, 1,014 examples. 25 | - `flickr30k_test_resnet50_cnn_features.hdf5`: 2016 test set image features, 1K examples. 26 | - `flickr30k_test_2017_flickr_resnet50_cnn_features.hdf5`: 2017 test set image features, 1K examples. 27 | - `flickr30k_test_2017_mscoco_resnet50_cnn_features.hdf5`: ambiguous MSCOCO test set image features, 461 examples. 28 | - `m30k.{train,valid}.1.pt`, `m30k.vocab.pt`: PyTorch binaries containing sentences in training/validation sets and vocabulary. 29 | - `{train,val,test_2016_flickr,test_2017_flickr,test_2017_mscoco}.lc.norm.tok.bpe-en-de-30000.{en,de}`: text files containing train/validation/test sets. 30 | - download a tarball containing all files (PyTorch binaries and image features) for the backtranslated comparable + translated Multi30k data set experiments [here](https://surfdrive.surf.nl/files/index.php/s/opHKSCmeJsGtL9Q). The tarball includes: 31 | - `flickr30k_train_translated-5x-comparable-1x_resnet50_cnn_features.shuffled.hdf5`: this file contains features for 290,000 images, i.e. 29K translated Multi30k images five times each (145K) and 29K comparable Multi30k images also five times each (145K). We upsample images for the translated Multi30k to keep them about half of the images used when training the model in this setting. 32 | - `concat-multi30k-translational-5times-comparable-1time-shuffled_correct.{train,valid}.1.pt`, `concat-multi30k-translational-5times-comparable-1time-shuffled_correct.vocab.pt`: PyTorch binaries containing sentences in training/validation sets and vocabulary. 33 | - ensure that variable names are correct in the corresponding `run_translated_m30k_only.sh` and `run_additional_data.sh` files. Image features were extracted as described in the paper, i.e. using a pretrained ResNet-50 convolutional neural network. 34 | 35 | To train a model using only the translated Multi30k, you will use the shell script `run_translated_m30k.sh`; to train a model using the back-translated comparable + translated Multi30k, you will use `run_additional_data.sh`. However, before you run these scripts: 36 | - change `DATA_PATH` and `MODEL_PATH` variables (in both `run_translated_m30k.sh` and `run_additional_data.sh`), pointing them to the directory where to find the training data (decompressed from the tarball abovementioned) and to the directory where you wish to store model checkpoints, respectively. 37 | 38 | ## Training 39 | 40 | To see how to call the `train_mm_vi_model1.py` script, please refer to the `run_*.sh` scripts or run `train_mm_vi_model1.py --help`. 41 | 42 | ### Training a model on the translated Multi30k 43 | 44 | To train a model using the Translated Multi30k data set only (~29K source/target/image triplets), run: 45 | ```bash 46 | run_translated_m30k_only.sh 47 | ``` 48 | 49 | This bash script assumes you have a GPU available with at least 12GBs, e.g. TitanX, 1080Ti, etc., and sets all the hyperparameters to reproduce the results in the paper. 50 | 51 | ### Training a model on the back-translated comparable and translated Multi30k 52 | 53 | To train a model using the back-translated comparable Multi30k in addition to the translated Multi30k data set (total of ~145K source/target/image triplets), simply run: 54 | ```bash 55 | run_additional_data.sh 56 | ``` 57 | 58 | This bash script also assumes you have a GPU available with at least 12GBs (e.g. TitanX, 1080Ti, etc.) and sets all the hyperparameters to reproduce the results in the paper. 59 | 60 | ## Decoding a translation 61 | 62 | By calling the bash scripts above, you will not only train, but after finishing training will also decode the Multi30k's validation, test 2016, test 2017, and the ambiguous MSCOCO 2017 test set. 63 | By default, the model used to translate is the one selected according to best BLEU4 scores on the validation set. 64 | 65 | To see how to use the `translate_mm_vi.py` script directly, please refer to the `run_*.sh` scripts or call `translate_mm_vi.py --help`. 66 | 67 | ## Citation 68 | 69 | If you use this code base, please consider citing our paper. 70 | 71 | @inproceedings{calixto-etal-2019-latent, 72 | title = "Latent Variable Model for Multi-modal Translation", 73 | author = "Calixto, Iacer and Rios, Miguel and Aziz, Wilker", 74 | booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics", 75 | month = jul, 76 | year = "2019", 77 | address = "Florence, Italy", 78 | publisher = "Association for Computational Linguistics", 79 | url = "https://www.aclweb.org/anthology/P19-1642", 80 | pages = "6392--6405", 81 | } 82 | 83 | -------------------------------------------------------------------------------- /onmt/Optim.py: -------------------------------------------------------------------------------- 1 | import torch.optim as optim 2 | from torch.nn.utils import clip_grad_norm 3 | 4 | 5 | class Optim(object): 6 | """ 7 | Controller class for optimization. Mostly a thin 8 | wrapper for `optim`, but also useful for implementing 9 | rate scheduling beyond what is currently available. 10 | Also implements necessary methods for training RNNs such 11 | as grad manipulations. 12 | 13 | Args: 14 | method (:obj:`str`): one of [sgd, adagrad, adadelta, adam] 15 | lr (float): learning rate 16 | lr_decay (float, optional): learning rate decay multiplier 17 | start_decay_at (int, optional): epoch to start learning rate decay 18 | beta1, beta2 (float, optional): parameters for adam 19 | adagrad_accum (float, optional): initialization parameter for adagrad 20 | decay_method (str, option): custom decay options 21 | warmup_steps (int, option): parameter for `noam` decay 22 | model_size (int, option): parameter for `noam` decay 23 | """ 24 | # We use the default parameters for Adam that are suggested by 25 | # the original paper https://arxiv.org/pdf/1412.6980.pdf 26 | # These values are also used by other established implementations, 27 | # e.g. https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer 28 | # https://keras.io/optimizers/ 29 | # Recently there are slightly different values used in the paper 30 | # "Attention is all you need" 31 | # https://arxiv.org/pdf/1706.03762.pdf, particularly the value beta2=0.98 32 | # was used there however, beta2=0.999 is still arguably the more 33 | # established value, so we use that here as well 34 | def __init__(self, method, lr, max_grad_norm, 35 | lr_decay=1, start_decay_at=None, 36 | beta1=0.9, beta2=0.999, 37 | adagrad_accum=0.0, 38 | decay_method=None, 39 | warmup_steps=4000, 40 | model_size=None): 41 | self.last_ppl = None 42 | self.lr = lr 43 | self.original_lr = lr 44 | self.max_grad_norm = max_grad_norm 45 | self.method = method 46 | self.lr_decay = lr_decay 47 | self.start_decay_at = start_decay_at 48 | self.start_decay = False 49 | self._step = 0 50 | self.betas = [beta1, beta2] 51 | self.adagrad_accum = adagrad_accum 52 | self.decay_method = decay_method 53 | self.warmup_steps = warmup_steps 54 | self.model_size = model_size 55 | 56 | def set_parameters(self, params): 57 | self.params = [p for p in params if p.requires_grad] 58 | if self.method == 'sgd': 59 | self.optimizer = optim.SGD(self.params, lr=self.lr) 60 | elif self.method == 'adagrad': 61 | self.optimizer = optim.Adagrad(self.params, lr=self.lr) 62 | for group in self.optimizer.param_groups: 63 | for p in group['params']: 64 | self.optimizer.state[p]['sum'] = self.optimizer\ 65 | .state[p]['sum'].fill_(self.adagrad_accum) 66 | elif self.method == 'adadelta': 67 | self.optimizer = optim.Adadelta(self.params, lr=self.lr) 68 | elif self.method == 'adam': 69 | self.optimizer = optim.Adam(self.params, lr=self.lr, 70 | betas=self.betas, eps=1e-9) 71 | else: 72 | raise RuntimeError("Invalid optim method: " + self.method) 73 | 74 | def _set_rate(self, lr): 75 | self.lr = lr 76 | self.optimizer.param_groups[0]['lr'] = self.lr 77 | 78 | def step(self): 79 | """Update the model parameters based on current gradients. 80 | 81 | Optionally, will employ gradient modification or update learning 82 | rate. 83 | """ 84 | self._step += 1 85 | 86 | # Decay method used in tensor2tensor. 87 | if self.decay_method == "noam": 88 | self._set_rate( 89 | self.original_lr * 90 | (self.model_size ** (-0.5) * 91 | min(self._step ** (-0.5), 92 | self._step * self.warmup_steps**(-1.5)))) 93 | 94 | if self.max_grad_norm: 95 | clip_grad_norm(self.params, self.max_grad_norm) 96 | self.optimizer.step() 97 | 98 | def update_learning_rate(self, ppl, epoch): 99 | """ 100 | Decay learning rate if val perf does not improve 101 | or we hit the start_decay_at limit. 102 | """ 103 | 104 | if self.start_decay_at is not None and epoch >= self.start_decay_at: 105 | self.start_decay = True 106 | if self.last_ppl is not None and ppl > self.last_ppl: 107 | self.start_decay = True 108 | 109 | if self.start_decay: 110 | self.lr = self.lr * self.lr_decay 111 | print("Decaying learning rate to %g" % self.lr) 112 | 113 | self.last_ppl = ppl 114 | self.optimizer.param_groups[0]['lr'] = self.lr 115 | -------------------------------------------------------------------------------- /onmt/Utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | 4 | # these variables must point to the directory containing multi-bleu.perl and meteor-1.5.jar, respectively 5 | BLEU_SCRIPT='%s/multi-bleu.perl' % "tools" 6 | METEOR_SCRIPT='%s/meteor-1.5.jar' % "/misc/vlgscratch4/ChoGroup/icalixto/tools/meteor-1.5" 7 | 8 | assert( os.path.isfile(BLEU_SCRIPT) ), 'ERROR: BLEU parl script not found!' 9 | assert( os.path.isfile(METEOR_SCRIPT) ), 'ERROR: METEOR jar not found!' 10 | 11 | # list with accepted model types 12 | MODEL_TYPES = ["vi-model1"] 13 | 14 | def aeq(*args): 15 | """ 16 | Assert all arguments have the same value 17 | """ 18 | arguments = (arg for arg in args) 19 | first = next(arguments) 20 | assert all(arg == first for arg in arguments), \ 21 | "Not all arguments have the same value: " + str(args) 22 | 23 | def sequence_mask(lengths, max_len=None): 24 | """ 25 | Creates a boolean mask from sequence lengths. 26 | """ 27 | batch_size = lengths.numel() 28 | max_len = max_len or lengths.max() 29 | return (torch.arange(0, max_len) 30 | .type_as(lengths) 31 | .repeat(batch_size, 1) 32 | .lt(lengths.unsqueeze(1))) 33 | 34 | def use_gpu(opt): 35 | return (hasattr(opt, 'gpuid') and len(opt.gpuid) > 0) or \ 36 | (hasattr(opt, 'gpu') and opt.gpu > -1) 37 | -------------------------------------------------------------------------------- /onmt/__init__.py: -------------------------------------------------------------------------------- 1 | import onmt.io 2 | import onmt.Loss 3 | import onmt.VILoss 4 | from onmt.Trainer import Trainer, Statistics 5 | from onmt.TrainerMultimodal import TrainerMultimodal, VIStatistics 6 | from onmt.Optim import Optim 7 | import onmt.Models 8 | import onmt.VI_Model1 9 | import onmt.translate 10 | import onmt.EarlyStop 11 | 12 | # For flake8 compatibility 13 | __all__ = [onmt.Loss, onmt.Models, 14 | Trainer, TrainerMultimodal, 15 | Optim, Statistics, onmt.io, onmt.translate] 16 | 17 | __all__ += [onmt.VILoss, VIStatistics, 18 | onmt.VI_Model1, onmt.EarlyStop] 19 | -------------------------------------------------------------------------------- /onmt/__pycache__/EarlyStop.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/__pycache__/EarlyStop.cpython-36.pyc -------------------------------------------------------------------------------- /onmt/__pycache__/Loss.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/__pycache__/Loss.cpython-36.pyc -------------------------------------------------------------------------------- /onmt/__pycache__/ModelConstructor.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/__pycache__/ModelConstructor.cpython-36.pyc -------------------------------------------------------------------------------- /onmt/__pycache__/Models.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/__pycache__/Models.cpython-36.pyc -------------------------------------------------------------------------------- /onmt/__pycache__/Optim.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/__pycache__/Optim.cpython-36.pyc -------------------------------------------------------------------------------- /onmt/__pycache__/Trainer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/__pycache__/Trainer.cpython-36.pyc -------------------------------------------------------------------------------- /onmt/__pycache__/TrainerMultimodal.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/__pycache__/TrainerMultimodal.cpython-36.pyc -------------------------------------------------------------------------------- /onmt/__pycache__/Utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/__pycache__/Utils.cpython-36.pyc -------------------------------------------------------------------------------- /onmt/__pycache__/VILoss.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/__pycache__/VILoss.cpython-36.pyc -------------------------------------------------------------------------------- /onmt/__pycache__/VI_Model1.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/__pycache__/VI_Model1.cpython-36.pyc -------------------------------------------------------------------------------- /onmt/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /onmt/io/DatasetBase.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from itertools import chain 4 | import torchtext 5 | 6 | 7 | PAD_WORD = '' 8 | UNK_WORD = '' 9 | UNK = 0 10 | BOS_WORD = '' 11 | EOS_WORD = '' 12 | 13 | 14 | class ONMTDatasetBase(torchtext.data.Dataset): 15 | """ 16 | A dataset basically supports iteration over all the examples 17 | it contains. We currently have 3 datasets inheriting this base 18 | for 3 types of corpus respectively: "text", "img", "audio". 19 | 20 | Internally it initializes an `torchtext.data.Dataset` object with 21 | the following attributes: 22 | 23 | `examples`: a sequence of `torchtext.data.Example` objects. 24 | `fields`: a dictionary associating str keys with `torchtext.data.Field` 25 | objects, and not necessarily having the same keys as the input fields. 26 | """ 27 | def __getstate__(self): 28 | return self.__dict__ 29 | 30 | def __setstate__(self, d): 31 | self.__dict__.update(d) 32 | 33 | def __reduce_ex__(self, proto): 34 | "This is a hack. Something is broken with torch pickle." 35 | return super(ONMTDatasetBase, self).__reduce_ex__() 36 | 37 | def load_fields(self, vocab_dict): 38 | """ Load fields from vocab.pt, and set the `fields` attribute. 39 | 40 | Args: 41 | vocab_dict (dict): a dict of loaded vocab from vocab.pt file. 42 | """ 43 | from onmt.io.IO import load_fields_from_vocab 44 | 45 | fields = load_fields_from_vocab(vocab_dict.items(), self.data_type) 46 | self.fields = dict([(k, f) for (k, f) in fields.items() 47 | if k in self.examples[0].__dict__]) 48 | 49 | @staticmethod 50 | def extract_text_features(tokens): 51 | """ 52 | Args: 53 | tokens: A list of tokens, where each token consists of a word, 54 | optionally followed by u"│"-delimited features. 55 | Returns: 56 | A sequence of words, a sequence of features, and num of features. 57 | """ 58 | if not tokens: 59 | return [], [], -1 60 | 61 | split_tokens = [token.split(u"│") for token in tokens] 62 | split_tokens = [token for token in split_tokens if token[0]] 63 | token_size = len(split_tokens[0]) 64 | 65 | assert all(len(token) == token_size for token in split_tokens), \ 66 | "all words must have the same number of features" 67 | words_and_features = list(zip(*split_tokens)) 68 | words = words_and_features[0] 69 | features = words_and_features[1:] 70 | 71 | return words, features, token_size - 1 72 | 73 | # Below are helper functions for intra-class use only. 74 | 75 | def _join_dicts(self, *args): 76 | """ 77 | Args: 78 | dictionaries with disjoint keys. 79 | 80 | Returns: 81 | a single dictionary that has the union of these keys. 82 | """ 83 | return dict(chain(*[d.items() for d in args])) 84 | 85 | def _peek(self, seq): 86 | """ 87 | Args: 88 | seq: an iterator. 89 | 90 | Returns: 91 | the first thing returned by calling next() on the iterator 92 | and an iterator created by re-chaining that value to the beginning 93 | of the iterator. 94 | """ 95 | first = next(seq) 96 | return first, chain([first], seq) 97 | 98 | def _construct_example_fromlist(self, data, fields): 99 | """ 100 | Args: 101 | data: the data to be set as the value of the attributes of 102 | the to-be-created `Example`, associating with respective 103 | `Field` objects with same key. 104 | fields: a dict of `torchtext.data.Field` objects. The keys 105 | are attributes of the to-be-created `Example`. 106 | 107 | Returns: 108 | the created `Example` object. 109 | """ 110 | ex = torchtext.data.Example() 111 | for (name, field), val in zip(fields, data): 112 | if field is not None: 113 | setattr(ex, name, field.preprocess(val)) 114 | else: 115 | setattr(ex, name, val) 116 | return ex 117 | -------------------------------------------------------------------------------- /onmt/io/ImageDataset.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import codecs 4 | import os 5 | 6 | import torch 7 | import torchtext 8 | 9 | from onmt.io.DatasetBase import ONMTDatasetBase, PAD_WORD, BOS_WORD, EOS_WORD 10 | 11 | 12 | class ImageDataset(ONMTDatasetBase): 13 | """ Dataset for data_type=='img' 14 | 15 | Build `Example` objects, `Field` objects, and filter_pred function 16 | from image corpus. 17 | 18 | Args: 19 | fields (dict): a dictionary of `torchtext.data.Field`. 20 | src_examples_iter (dict iter): preprocessed source example 21 | dictionary iterator. 22 | tgt_examples_iter (dict iter): preprocessed target example 23 | dictionary iterator. 24 | num_src_feats (int): number of source side features. 25 | num_tgt_feats (int): number of target side features. 26 | tgt_seq_length (int): maximum target sequence length. 27 | use_filter_pred (bool): use a custom filter predicate to filter 28 | out examples? 29 | """ 30 | def __init__(self, fields, src_examples_iter, tgt_examples_iter, 31 | num_src_feats=0, num_tgt_feats=0, 32 | tgt_seq_length=0, use_filter_pred=True): 33 | self.data_type = 'img' 34 | 35 | self.n_src_feats = num_src_feats 36 | self.n_tgt_feats = num_tgt_feats 37 | 38 | if tgt_examples_iter is not None: 39 | examples_iter = (self._join_dicts(src, tgt) for src, tgt in 40 | zip(src_examples_iter, tgt_examples_iter)) 41 | else: 42 | examples_iter = src_examples_iter 43 | 44 | # Peek at the first to see which fields are used. 45 | ex, examples_iter = self._peek(examples_iter) 46 | keys = ex.keys() 47 | 48 | out_fields = [(k, fields[k]) if k in fields else (k, None) 49 | for k in keys] 50 | example_values = ([ex[k] for k in keys] for ex in examples_iter) 51 | out_examples = (self._construct_example_fromlist( 52 | ex_values, out_fields) 53 | for ex_values in example_values) 54 | # If out_examples is a generator, we need to save the filter_pred 55 | # function in serialization too, which would cause a problem when 56 | # `torch.save()`. Thus we materialize it as a list. 57 | out_examples = list(out_examples) 58 | 59 | def filter_pred(example): 60 | if tgt_examples_iter is not None: 61 | return 0 < len(example.tgt) <= tgt_seq_length 62 | else: 63 | return True 64 | 65 | filter_pred = filter_pred if use_filter_pred else lambda x: True 66 | 67 | super(ImageDataset, self).__init__( 68 | out_examples, out_fields, filter_pred 69 | ) 70 | 71 | def sort_key(self, ex): 72 | """ Sort using the size of the image: (width, height).""" 73 | return (ex.src.size(2), ex.src.size(1)) 74 | 75 | @staticmethod 76 | def make_image_examples_nfeats_tpl(path, img_dir): 77 | """ 78 | Args: 79 | path (str): location of a src file containing image paths 80 | src_dir (str): location of source images 81 | 82 | Returns: 83 | (example_dict iterator, num_feats) tuple 84 | """ 85 | examples_iter = ImageDataset.read_img_file(path, img_dir, 'src') 86 | num_feats = 0 # Source side(img) has no features. 87 | 88 | return (examples_iter, num_feats) 89 | 90 | @staticmethod 91 | def read_img_file(path, src_dir, side, truncate=None): 92 | """ 93 | Args: 94 | path (str): location of a src file containing image paths 95 | src_dir (str): location of source images 96 | side (str): 'src' or 'tgt' 97 | truncate: maximum img size ((0,0) or None for unlimited) 98 | 99 | Yields: 100 | a dictionary containing image data, path and index for each line. 101 | """ 102 | assert (src_dir is not None) and os.path.exists(src_dir),\ 103 | 'src_dir must be a valid directory if data_type is img' 104 | 105 | global Image, transforms 106 | from PIL import Image 107 | from torchvision import transforms 108 | 109 | with codecs.open(path, "r", "utf-8") as corpus_file: 110 | index = 0 111 | for line in corpus_file: 112 | img_path = os.path.join(src_dir, line.strip()) 113 | if not os.path.exists(img_path): 114 | img_path = line 115 | 116 | assert os.path.exists(img_path), \ 117 | 'img path %s not found' % (line.strip()) 118 | 119 | img = transforms.ToTensor()(Image.open(img_path)) 120 | if truncate and truncate != (0, 0): 121 | if not (img.size(1) <= truncate[0] 122 | and img.size(2) <= truncate[1]): 123 | continue 124 | 125 | example_dict = {side: img, 126 | side+'_path': line.strip(), 127 | 'indices': index} 128 | index += 1 129 | 130 | yield example_dict 131 | 132 | @staticmethod 133 | def get_fields(n_src_features, n_tgt_features): 134 | """ 135 | Args: 136 | n_src_features: the number of source features to 137 | create `torchtext.data.Field` for. 138 | n_tgt_features: the number of target features to 139 | create `torchtext.data.Field` for. 140 | 141 | Returns: 142 | A dictionary whose keys are strings and whose values 143 | are the corresponding Field objects. 144 | """ 145 | fields = {} 146 | 147 | def make_img(data, vocab, is_train): 148 | c = data[0].size(0) 149 | h = max([t.size(1) for t in data]) 150 | w = max([t.size(2) for t in data]) 151 | imgs = torch.zeros(len(data), c, h, w) 152 | for i, img in enumerate(data): 153 | imgs[i, :, 0:img.size(1), 0:img.size(2)] = img 154 | return imgs 155 | 156 | fields["src"] = torchtext.data.Field( 157 | use_vocab=False, tensor_type=torch.FloatTensor, 158 | postprocessing=make_img, sequential=False) 159 | 160 | for j in range(n_src_features): 161 | fields["src_feat_"+str(j)] = \ 162 | torchtext.data.Field(pad_token=PAD_WORD) 163 | 164 | fields["tgt"] = torchtext.data.Field( 165 | init_token=BOS_WORD, eos_token=EOS_WORD, 166 | pad_token=PAD_WORD) 167 | 168 | for j in range(n_tgt_features): 169 | fields["tgt_feat_"+str(j)] = \ 170 | torchtext.data.Field(init_token=BOS_WORD, eos_token=EOS_WORD, 171 | pad_token=PAD_WORD) 172 | 173 | def make_src(data, vocab, is_train): 174 | src_size = max([t.size(0) for t in data]) 175 | src_vocab_size = max([t.max() for t in data]) + 1 176 | alignment = torch.zeros(src_size, len(data), src_vocab_size) 177 | for i, sent in enumerate(data): 178 | for j, t in enumerate(sent): 179 | alignment[j, i, t] = 1 180 | return alignment 181 | 182 | fields["src_map"] = torchtext.data.Field( 183 | use_vocab=False, tensor_type=torch.FloatTensor, 184 | postprocessing=make_src, sequential=False) 185 | 186 | def make_tgt(data, vocab, is_train): 187 | tgt_size = max([t.size(0) for t in data]) 188 | alignment = torch.zeros(tgt_size, len(data)).long() 189 | for i, sent in enumerate(data): 190 | alignment[:sent.size(0), i] = sent 191 | return alignment 192 | 193 | fields["alignment"] = torchtext.data.Field( 194 | use_vocab=False, tensor_type=torch.LongTensor, 195 | postprocessing=make_tgt, sequential=False) 196 | 197 | fields["indices"] = torchtext.data.Field( 198 | use_vocab=False, tensor_type=torch.LongTensor, 199 | sequential=False) 200 | 201 | return fields 202 | 203 | @staticmethod 204 | def get_num_features(corpus_file, side): 205 | """ 206 | For image corpus, source side is in form of image, thus 207 | no feature; while target side is in form of text, thus 208 | we can extract its text features. 209 | 210 | Args: 211 | corpus_file (str): file path to get the features. 212 | side (str): 'src' or 'tgt'. 213 | 214 | Returns: 215 | number of features on `side`. 216 | """ 217 | if side == 'src': 218 | num_feats = 0 219 | else: 220 | with codecs.open(corpus_file, "r", "utf-8") as cf: 221 | f_line = cf.readline().strip().split() 222 | _, _, num_feats = ImageDataset.extract_text_features(f_line) 223 | 224 | return num_feats 225 | -------------------------------------------------------------------------------- /onmt/io/__init__.py: -------------------------------------------------------------------------------- 1 | from onmt.io.IO import collect_feature_vocabs, make_features, \ 2 | collect_features, get_num_features, \ 3 | load_fields_from_vocab, get_fields, \ 4 | save_fields_to_vocab, build_dataset, \ 5 | build_vocab, merge_vocabs, OrderedIterator 6 | from onmt.io.DatasetBase import ONMTDatasetBase, PAD_WORD, BOS_WORD, \ 7 | EOS_WORD, UNK 8 | from onmt.io.TextDataset import TextDataset, ShardedTextCorpusIterator 9 | from onmt.io.ImageDataset import ImageDataset 10 | from onmt.io.AudioDataset import AudioDataset 11 | 12 | 13 | __all__ = [PAD_WORD, BOS_WORD, EOS_WORD, UNK, ONMTDatasetBase, 14 | collect_feature_vocabs, make_features, 15 | collect_features, get_num_features, 16 | load_fields_from_vocab, get_fields, 17 | save_fields_to_vocab, build_dataset, 18 | build_vocab, merge_vocabs, OrderedIterator, 19 | TextDataset, ImageDataset, AudioDataset, 20 | ShardedTextCorpusIterator] 21 | -------------------------------------------------------------------------------- /onmt/io/__pycache__/AudioDataset.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/io/__pycache__/AudioDataset.cpython-36.pyc -------------------------------------------------------------------------------- /onmt/io/__pycache__/DatasetBase.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/io/__pycache__/DatasetBase.cpython-36.pyc -------------------------------------------------------------------------------- /onmt/io/__pycache__/IO.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/io/__pycache__/IO.cpython-36.pyc -------------------------------------------------------------------------------- /onmt/io/__pycache__/ImageDataset.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/io/__pycache__/ImageDataset.cpython-36.pyc -------------------------------------------------------------------------------- /onmt/io/__pycache__/TextDataset.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/io/__pycache__/TextDataset.cpython-36.pyc -------------------------------------------------------------------------------- /onmt/io/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/io/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /onmt/modules/AudioEncoder.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class AudioEncoder(nn.Module): 7 | """ 8 | A simple encoder convolutional -> recurrent neural network for 9 | audio input. 10 | 11 | Args: 12 | num_layers (int): number of encoder layers. 13 | bidirectional (bool): bidirectional encoder. 14 | rnn_size (int): size of hidden states of the rnn. 15 | dropout (float): dropout probablity. 16 | sample_rate (float): input spec 17 | window_size (int): input spec 18 | 19 | """ 20 | def __init__(self, num_layers, bidirectional, rnn_size, dropout, 21 | sample_rate, window_size): 22 | super(AudioEncoder, self).__init__() 23 | self.num_layers = num_layers 24 | self.num_directions = 2 if bidirectional else 1 25 | self.hidden_size = rnn_size 26 | 27 | self.layer1 = nn.Conv2d(1, 32, kernel_size=(41, 11), 28 | padding=(0, 10), stride=(2, 2)) 29 | self.batch_norm1 = nn.BatchNorm2d(32) 30 | self.layer2 = nn.Conv2d(32, 32, kernel_size=(21, 11), 31 | padding=(0, 0), stride=(2, 1)) 32 | self.batch_norm2 = nn.BatchNorm2d(32) 33 | 34 | input_size = int(math.floor((sample_rate * window_size) / 2) + 1) 35 | input_size = int(math.floor(input_size - 41) / 2 + 1) 36 | input_size = int(math.floor(input_size - 21) / 2 + 1) 37 | input_size *= 32 38 | self.rnn = nn.LSTM(input_size, rnn_size, 39 | num_layers=num_layers, 40 | dropout=dropout, 41 | bidirectional=bidirectional) 42 | 43 | def load_pretrained_vectors(self, opt): 44 | # Pass in needed options only when modify function definition. 45 | pass 46 | 47 | def forward(self, input, lengths=None): 48 | "See :obj:`onmt.modules.EncoderBase.forward()`" 49 | # (batch_size, 1, nfft, t) 50 | # layer 1 51 | input = self.batch_norm1(self.layer1(input[:, :, :, :])) 52 | 53 | # (batch_size, 32, nfft/2, t/2) 54 | input = F.hardtanh(input, 0, 20, inplace=True) 55 | 56 | # (batch_size, 32, nfft/2/2, t/2) 57 | # layer 2 58 | input = self.batch_norm2(self.layer2(input)) 59 | 60 | # (batch_size, 32, nfft/2/2, t/2) 61 | input = F.hardtanh(input, 0, 20, inplace=True) 62 | 63 | batch_size = input.size(0) 64 | length = input.size(3) 65 | input = input.view(batch_size, -1, length) 66 | input = input.transpose(0, 2).transpose(1, 2) 67 | 68 | output, hidden = self.rnn(input) 69 | 70 | return hidden, output 71 | -------------------------------------------------------------------------------- /onmt/modules/Conv2Conv.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementation of "Convolutional Sequence to Sequence Learning" 3 | """ 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.init as init 7 | import torch.nn.functional as F 8 | from torch.autograd import Variable 9 | 10 | import onmt.modules 11 | from onmt.modules.WeightNorm import WeightNormConv2d 12 | from onmt.Models import EncoderBase 13 | from onmt.Models import DecoderState 14 | from onmt.Utils import aeq 15 | 16 | 17 | SCALE_WEIGHT = 0.5 ** 0.5 18 | 19 | 20 | def shape_transform(x): 21 | """ Tranform the size of the tensors to fit for conv input. """ 22 | return torch.unsqueeze(torch.transpose(x, 1, 2), 3) 23 | 24 | 25 | class GatedConv(nn.Module): 26 | def __init__(self, input_size, width=3, dropout=0.2, nopad=False): 27 | super(GatedConv, self).__init__() 28 | self.conv = WeightNormConv2d(input_size, 2 * input_size, 29 | kernel_size=(width, 1), stride=(1, 1), 30 | padding=(width // 2 * (1 - nopad), 0)) 31 | init.xavier_uniform(self.conv.weight, gain=(4 * (1 - dropout))**0.5) 32 | self.dropout = nn.Dropout(dropout) 33 | 34 | def forward(self, x_var, hidden=None): 35 | x_var = self.dropout(x_var) 36 | x_var = self.conv(x_var) 37 | out, gate = x_var.split(int(x_var.size(1) / 2), 1) 38 | out = out * F.sigmoid(gate) 39 | return out 40 | 41 | 42 | class StackedCNN(nn.Module): 43 | def __init__(self, num_layers, input_size, cnn_kernel_width=3, 44 | dropout=0.2): 45 | super(StackedCNN, self).__init__() 46 | self.dropout = dropout 47 | self.num_layers = num_layers 48 | self.layers = nn.ModuleList() 49 | for i in range(num_layers): 50 | self.layers.append( 51 | GatedConv(input_size, cnn_kernel_width, dropout)) 52 | 53 | def forward(self, x, hidden=None): 54 | for conv in self.layers: 55 | x = x + conv(x) 56 | x *= SCALE_WEIGHT 57 | return x 58 | 59 | 60 | class CNNEncoder(EncoderBase): 61 | """ 62 | Encoder built on CNN based on 63 | :cite:`DBLP:journals/corr/GehringAGYD17`. 64 | """ 65 | def __init__(self, num_layers, hidden_size, 66 | cnn_kernel_width, dropout, embeddings): 67 | super(CNNEncoder, self).__init__() 68 | 69 | self.embeddings = embeddings 70 | input_size = embeddings.embedding_size 71 | self.linear = nn.Linear(input_size, hidden_size) 72 | self.cnn = StackedCNN(num_layers, hidden_size, 73 | cnn_kernel_width, dropout) 74 | 75 | def forward(self, input, lengths=None, hidden=None): 76 | """ See :obj:`onmt.modules.EncoderBase.forward()`""" 77 | self._check_args(input, lengths, hidden) 78 | 79 | emb = self.embeddings(input) 80 | s_len, batch, emb_dim = emb.size() 81 | 82 | emb = emb.transpose(0, 1).contiguous() 83 | emb_reshape = emb.view(emb.size(0) * emb.size(1), -1) 84 | emb_remap = self.linear(emb_reshape) 85 | emb_remap = emb_remap.view(emb.size(0), emb.size(1), -1) 86 | emb_remap = shape_transform(emb_remap) 87 | out = self.cnn(emb_remap) 88 | 89 | return emb_remap.squeeze(3).transpose(0, 1).contiguous(),\ 90 | out.squeeze(3).transpose(0, 1).contiguous() 91 | 92 | 93 | class CNNDecoder(nn.Module): 94 | """ 95 | Decoder built on CNN, based on :cite:`DBLP:journals/corr/GehringAGYD17`. 96 | 97 | 98 | Consists of residual convolutional layers, with ConvMultiStepAttention. 99 | """ 100 | def __init__(self, num_layers, hidden_size, attn_type, 101 | copy_attn, cnn_kernel_width, dropout, embeddings): 102 | super(CNNDecoder, self).__init__() 103 | 104 | # Basic attributes. 105 | self.decoder_type = 'cnn' 106 | self.num_layers = num_layers 107 | self.hidden_size = hidden_size 108 | self.cnn_kernel_width = cnn_kernel_width 109 | self.embeddings = embeddings 110 | self.dropout = dropout 111 | 112 | # Build the CNN. 113 | input_size = self.embeddings.embedding_size 114 | self.linear = nn.Linear(input_size, self.hidden_size) 115 | self.conv_layers = nn.ModuleList() 116 | for i in range(self.num_layers): 117 | self.conv_layers.append( 118 | GatedConv(self.hidden_size, self.cnn_kernel_width, 119 | self.dropout, True)) 120 | 121 | self.attn_layers = nn.ModuleList() 122 | for i in range(self.num_layers): 123 | self.attn_layers.append( 124 | onmt.modules.ConvMultiStepAttention(self.hidden_size)) 125 | 126 | # CNNDecoder has its own attention mechanism. 127 | # Set up a separated copy attention layer, if needed. 128 | self._copy = False 129 | if copy_attn: 130 | self.copy_attn = onmt.modules.GlobalAttention( 131 | hidden_size, attn_type=attn_type) 132 | self._copy = True 133 | 134 | def forward(self, input, context, state, context_lengths=None): 135 | """ See :obj:`onmt.modules.RNNDecoderBase.forward()`""" 136 | # CHECKS 137 | assert isinstance(state, CNNDecoderState) 138 | input_len, input_batch, _ = input.size() 139 | contxt_len, contxt_batch, _ = context.size() 140 | aeq(input_batch, contxt_batch) 141 | # END CHECKS 142 | 143 | if state.previous_input is not None: 144 | input = torch.cat([state.previous_input, input], 0) 145 | 146 | # Initialize return variables. 147 | outputs = [] 148 | attns = {"std": []} 149 | assert not self._copy, "Copy mechanism not yet tested in conv2conv" 150 | if self._copy: 151 | attns["copy"] = [] 152 | 153 | emb = self.embeddings(input) 154 | assert emb.dim() == 3 # len x batch x embedding_dim 155 | 156 | tgt_emb = emb.transpose(0, 1).contiguous() 157 | # The output of CNNEncoder. 158 | src_context_t = context.transpose(0, 1).contiguous() 159 | # The combination of output of CNNEncoder and source embeddings. 160 | src_context_c = state.init_src.transpose(0, 1).contiguous() 161 | 162 | # Run the forward pass of the CNNDecoder. 163 | emb_reshape = tgt_emb.contiguous().view( 164 | tgt_emb.size(0) * tgt_emb.size(1), -1) 165 | linear_out = self.linear(emb_reshape) 166 | x = linear_out.view(tgt_emb.size(0), tgt_emb.size(1), -1) 167 | x = shape_transform(x) 168 | 169 | pad = Variable(torch.zeros(x.size(0), x.size(1), 170 | self.cnn_kernel_width - 1, 1)) 171 | pad = pad.type_as(x) 172 | base_target_emb = x 173 | 174 | for conv, attention in zip(self.conv_layers, self.attn_layers): 175 | new_target_input = torch.cat([pad, x], 2) 176 | out = conv(new_target_input) 177 | c, attn = attention(base_target_emb, out, 178 | src_context_t, src_context_c) 179 | x = (x + (c + out) * SCALE_WEIGHT) * SCALE_WEIGHT 180 | output = x.squeeze(3).transpose(1, 2) 181 | 182 | # Process the result and update the attentions. 183 | outputs = output.transpose(0, 1).contiguous() 184 | if state.previous_input is not None: 185 | outputs = outputs[state.previous_input.size(0):] 186 | attn = attn[:, state.previous_input.size(0):].squeeze() 187 | attn = torch.stack([attn]) 188 | attns["std"] = attn 189 | if self._copy: 190 | attns["copy"] = attn 191 | 192 | # Update the state. 193 | state.update_state(input) 194 | 195 | return outputs, state, attns 196 | 197 | def init_decoder_state(self, src, context, enc_hidden): 198 | return CNNDecoderState(context, enc_hidden) 199 | 200 | 201 | class CNNDecoderState(DecoderState): 202 | def __init__(self, context, enc_hidden): 203 | self.init_src = (context + enc_hidden) * SCALE_WEIGHT 204 | self.previous_input = None 205 | 206 | @property 207 | def _all(self): 208 | """ 209 | Contains attributes that need to be updated in self.beam_update(). 210 | """ 211 | return (self.previous_input,) 212 | 213 | def update_state(self, input): 214 | """ Called for every decoder forward pass. """ 215 | self.previous_input = input 216 | 217 | def repeat_beam_size_times(self, beam_size): 218 | """ Repeat beam_size times along batch dimension. """ 219 | self.init_src = Variable( 220 | self.init_src.data.repeat(1, beam_size, 1), volatile=True) 221 | -------------------------------------------------------------------------------- /onmt/modules/ConvMultiStepAttention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from onmt.Utils import aeq 5 | 6 | 7 | SCALE_WEIGHT = 0.5 ** 0.5 8 | 9 | 10 | def seq_linear(linear, x): 11 | # linear transform for 3-d tensor 12 | batch, hidden_size, length, _ = x.size() 13 | h = linear(torch.transpose(x, 1, 2).contiguous().view( 14 | batch * length, hidden_size)) 15 | return torch.transpose(h.view(batch, length, hidden_size, 1), 1, 2) 16 | 17 | 18 | class ConvMultiStepAttention(nn.Module): 19 | """ 20 | 21 | Conv attention takes a key matrix, a value matrix and a query vector. 22 | Attention weight is calculated by key matrix with the query vector 23 | and sum on the value matrix. And the same operation is applied 24 | in each decode conv layer. 25 | 26 | """ 27 | 28 | def __init__(self, input_size): 29 | super(ConvMultiStepAttention, self).__init__() 30 | self.linear_in = nn.Linear(input_size, input_size) 31 | self.mask = None 32 | 33 | def apply_mask(self, mask): 34 | self.mask = mask 35 | 36 | def forward(self, base_target_emb, input, encoder_out_top, 37 | encoder_out_combine): 38 | """ 39 | Args: 40 | base_target_emb: target emb tensor 41 | input: output of decode conv 42 | encoder_out_t: the key matrix for calculation of attetion weight, 43 | which is the top output of encode conv 44 | encoder_out_combine: 45 | the value matrix for the attention-weighted sum, 46 | which is the combination of base emb and top output of encode 47 | 48 | """ 49 | # checks 50 | batch, channel, height, width = base_target_emb.size() 51 | batch_, channel_, height_, width_ = input.size() 52 | aeq(batch, batch_) 53 | aeq(height, height_) 54 | 55 | enc_batch, enc_channel, enc_height = encoder_out_top.size() 56 | enc_batch_, enc_channel_, enc_height_ = encoder_out_combine.size() 57 | 58 | aeq(enc_batch, enc_batch_) 59 | aeq(enc_height, enc_height_) 60 | 61 | preatt = seq_linear(self.linear_in, input) 62 | target = (base_target_emb + preatt) * SCALE_WEIGHT 63 | target = torch.squeeze(target, 3) 64 | target = torch.transpose(target, 1, 2) 65 | pre_attn = torch.bmm(target, encoder_out_top) 66 | 67 | if self.mask is not None: 68 | pre_attn.data.masked_fill_(self.mask, -float('inf')) 69 | 70 | pre_attn = pre_attn.transpose(0, 2) 71 | attn = F.softmax(pre_attn) 72 | attn = attn.transpose(0, 2).contiguous() 73 | context_output = torch.bmm( 74 | attn, torch.transpose(encoder_out_combine, 1, 2)) 75 | context_output = torch.transpose( 76 | torch.unsqueeze(context_output, 3), 1, 2) 77 | return context_output, attn 78 | -------------------------------------------------------------------------------- /onmt/modules/CopyGenerator.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | import torch 4 | import torch.cuda 5 | 6 | import onmt 7 | import onmt.io 8 | from onmt.Utils import aeq 9 | 10 | 11 | class CopyGenerator(nn.Module): 12 | """Generator module that additionally considers copying 13 | words directly from the source. 14 | 15 | The main idea is that we have an extended "dynamic dictionary". 16 | It contains `|tgt_dict|` words plus an arbitrary number of 17 | additional words introduced by the source sentence. 18 | For each source sentence we have a `src_map` that maps 19 | each source word to an index in `tgt_dict` if it known, or 20 | else to an extra word. 21 | 22 | The copy generator is an extended version of the standard 23 | generator that computse three values. 24 | 25 | * :math:`p_{softmax}` the standard softmax over `tgt_dict` 26 | * :math:`p(z)` the probability of instead copying a 27 | word from the source, computed using a bernoulli 28 | * :math:`p_{copy}` the probility of copying a word instead. 29 | taken from the attention distribution directly. 30 | 31 | The model returns a distribution over the extend dictionary, 32 | computed as 33 | 34 | :math:`p(w) = p(z=1) p_{copy}(w) + p(z=0) p_{softmax}(w)` 35 | 36 | 37 | .. mermaid:: 38 | 39 | graph BT 40 | A[input] 41 | S[src_map] 42 | B[softmax] 43 | BB[switch] 44 | C[attn] 45 | D[copy] 46 | O[output] 47 | A --> B 48 | A --> BB 49 | S --> D 50 | C --> D 51 | D --> O 52 | B --> O 53 | BB --> O 54 | 55 | 56 | Args: 57 | input_size (int): size of input representation 58 | tgt_dict (Vocab): output target dictionary 59 | 60 | """ 61 | def __init__(self, input_size, tgt_dict): 62 | super(CopyGenerator, self).__init__() 63 | self.linear = nn.Linear(input_size, len(tgt_dict)) 64 | self.linear_copy = nn.Linear(input_size, 1) 65 | self.tgt_dict = tgt_dict 66 | 67 | def forward(self, hidden, attn, src_map): 68 | """ 69 | Compute a distribution over the target dictionary 70 | extended by the dynamic dictionary implied by compying 71 | source words. 72 | 73 | Args: 74 | hidden (`FloatTensor`): hidden outputs `[batch*tlen, input_size]` 75 | attn (`FloatTensor`): attn for each `[batch*tlen, input_size]` 76 | src_map (`FloatTensor`): 77 | A sparse indicator matrix mapping each source word to 78 | its index in the "extended" vocab containing. 79 | `[src_len, batch, extra_words]` 80 | """ 81 | # CHECKS 82 | batch_by_tlen, _ = hidden.size() 83 | batch_by_tlen_, slen = attn.size() 84 | slen_, batch, cvocab = src_map.size() 85 | aeq(batch_by_tlen, batch_by_tlen_) 86 | aeq(slen, slen_) 87 | 88 | # Original probabilities. 89 | logits = self.linear(hidden) 90 | logits[:, self.tgt_dict.stoi[onmt.io.PAD_WORD]] = -float('inf') 91 | prob = F.softmax(logits) 92 | 93 | # Probability of copying p(z=1) batch. 94 | copy = F.sigmoid(self.linear_copy(hidden)) 95 | 96 | # Probibility of not copying: p_{word}(w) * (1 - p(z)) 97 | out_prob = torch.mul(prob, 1 - copy.expand_as(prob)) 98 | mul_attn = torch.mul(attn, copy.expand_as(attn)) 99 | copy_prob = torch.bmm(mul_attn.view(-1, batch, slen) 100 | .transpose(0, 1), 101 | src_map.transpose(0, 1)).transpose(0, 1) 102 | copy_prob = copy_prob.contiguous().view(-1, cvocab) 103 | return torch.cat([out_prob, copy_prob], 1) 104 | 105 | 106 | class CopyGeneratorCriterion(object): 107 | def __init__(self, vocab_size, force_copy, pad, eps=1e-20): 108 | self.force_copy = force_copy 109 | self.eps = eps 110 | self.offset = vocab_size 111 | self.pad = pad 112 | 113 | def __call__(self, scores, align, target): 114 | align = align.view(-1) 115 | 116 | # Copy prob. 117 | out = scores.gather(1, align.view(-1, 1) + self.offset) \ 118 | .view(-1).mul(align.ne(0).float()) 119 | tmp = scores.gather(1, target.view(-1, 1)).view(-1) 120 | 121 | # Regular prob (no unks and unks that can't be copied) 122 | if not self.force_copy: 123 | out = out + self.eps + tmp.mul(target.ne(0).float()) + \ 124 | tmp.mul(align.eq(0).float()).mul(target.eq(0).float()) 125 | else: 126 | # Forced copy. 127 | out = out + self.eps + tmp.mul(align.eq(0).float()) 128 | 129 | # Drop padding. 130 | loss = -out.log().mul(target.ne(self.pad).float()).sum() 131 | return loss 132 | 133 | 134 | class CopyGeneratorLossCompute(onmt.Loss.LossComputeBase): 135 | """ 136 | Copy Generator Loss Computation. 137 | """ 138 | def __init__(self, generator, tgt_vocab, 139 | force_copy, eps=1e-20): 140 | super(CopyGeneratorLossCompute, self).__init__( 141 | generator, tgt_vocab) 142 | 143 | # We lazily load datasets when there are more than one, so postpone 144 | # the setting of cur_dataset. 145 | self.cur_dataset = None 146 | self.force_copy = force_copy 147 | self.criterion = CopyGeneratorCriterion(len(tgt_vocab), force_copy, 148 | self.padding_idx) 149 | 150 | def _make_shard_state(self, batch, output, range_, attns): 151 | """ See base class for args description. """ 152 | if getattr(batch, "alignment", None) is None: 153 | raise AssertionError("using -copy_attn you need to pass in " 154 | "-dynamic_dict during preprocess stage.") 155 | 156 | return { 157 | "output": output, 158 | "target": batch.tgt[range_[0] + 1: range_[1]], 159 | "copy_attn": attns.get("copy"), 160 | "align": batch.alignment[range_[0] + 1: range_[1]] 161 | } 162 | 163 | def _compute_loss(self, batch, output, target, copy_attn, align): 164 | """ 165 | Compute the loss. The args must match self._make_shard_state(). 166 | Args: 167 | batch: the current batch. 168 | output: the predict output from the model. 169 | target: the validate target to compare output with. 170 | copy_attn: the copy attention value. 171 | align: the align info. 172 | """ 173 | target = target.view(-1) 174 | align = align.view(-1) 175 | scores = self.generator(self._bottle(output), 176 | self._bottle(copy_attn), 177 | batch.src_map) 178 | 179 | loss = self.criterion(scores, align, target) 180 | 181 | scores_data = scores.data.clone() 182 | scores_data = onmt.io.TextDataset.collapse_copy_scores( 183 | self._unbottle(scores_data, batch.batch_size), 184 | batch, self.tgt_vocab, self.cur_dataset.src_vocabs) 185 | scores_data = self._bottle(scores_data) 186 | 187 | # Correct target copy token instead of 188 | # tgt[i] = align[i] + len(tgt_vocab) 189 | # for i such that tgt[i] == 0 and align[i] != 0 190 | target_data = target.data.clone() 191 | correct_mask = target_data.eq(0) * align.data.ne(0) 192 | correct_copy = (align.data + len(self.tgt_vocab)) * correct_mask.long() 193 | target_data = target_data + correct_copy 194 | 195 | # Coverage loss term. 196 | loss_data = loss.data.clone() 197 | 198 | stats = self._stats(loss_data, scores_data, target_data) 199 | 200 | return loss, stats 201 | -------------------------------------------------------------------------------- /onmt/modules/Dists.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from torch.distributions import Distribution 3 | from torch.distributions import Normal as PyNormal 4 | 5 | 6 | def convert_symmetric_dirichlet_to_logistic_normal(concentration, dim): 7 | return 0., (1. / concentration) * (1. - 2. / dim) + 1. / (concentration * dim) 8 | #return 0., 1. 9 | 10 | 11 | class Normal(Distribution): 12 | def __init__(self, mean, std): 13 | self.normal = PyNormal(mean,std) 14 | 15 | def mean(self): 16 | return self.normal.mean 17 | 18 | def params(self): 19 | return [self.normal.mean,self.normal.std] 20 | 21 | def sample(self): 22 | """ 23 | Generates a single sample or single batch of samples if the distribution 24 | parameters are batched. 25 | """ 26 | return self.normal.sample() 27 | 28 | def sample_n(self, n): 29 | """ 30 | Generates n samples or n batches of samples if the distribution parameters 31 | are batched. 32 | """ 33 | return self.normal.sample_n(n) 34 | 35 | 36 | def log_prob(self, value): 37 | """ 38 | Returns the log of the probability density/mass function evaluated at 39 | `value`. 40 | 41 | Args: 42 | value (Tensor or Variable): 43 | """ 44 | return self.normal.log_prob(value) 45 | 46 | def kl(self, other): 47 | """ 48 | KL-divergence between two Normals: KL[N(u_i, s_i) || N(u_j, s_j)] 49 | where params_i = [u_i, s_i] and similarly for j. 50 | Returns a tensor with the dimensionality of the location variable. 51 | """ 52 | if not isinstance(other, Normal): 53 | raise ValueError('Impossible') 54 | location_i, scale_i = self.params() # [mean, std] 55 | location_j, scale_j = other.params() # [mean, std] 56 | var_i = scale_i ** 2. 57 | var_j = scale_j ** 2. 58 | term1 = 1. / (2. * var_j) * ((location_i - location_j) ** 2. + var_i - var_j) 59 | term2 = torch.log(scale_j) - torch.log(scale_i) 60 | return term1 + term2 61 | 62 | 63 | class LogisticNormal(Distribution): 64 | def __init__(self, loc, scale, n_samples=100): 65 | self.normal = Normal(loc,scale) 66 | self.n_samples = n_samples 67 | 68 | def mean(self): 69 | samples = self.sample_n(self.n_samples) 70 | #return self.normal.mean 71 | return samples.mean(0) 72 | 73 | def params(self): 74 | """ The distribution parameters (mean,std) """ 75 | return self.normal.params() 76 | 77 | def sample(self): 78 | """ 79 | Generates a single sample or single batch of samples if the distribution 80 | parameters are batched. 81 | """ 82 | return nn.functional.softmax(self.normal.sample(),-1) 83 | 84 | def sample_n(self, n): 85 | """ 86 | Generates n samples or n batches of samples if the distribution parameters 87 | are batched. 88 | """ 89 | return nn.functional.softmax(self.normal.sample_n(n),-1) 90 | 91 | 92 | def log_prob(self, value): 93 | """ 94 | Returns the log of the probability density/mass function evaluated at 95 | `value`. 96 | 97 | Args: 98 | value (Tensor or Variable): 99 | """ 100 | raise NotImplementedError 101 | 102 | def kl(self, other): 103 | if isinstance(other, LogisticNormal): 104 | return self.normal.kl(other.normal) 105 | else: 106 | raise ValueError('Impossible (LogisticNormal): self %s other %s' % (type(self), type(other))) 107 | 108 | 109 | 110 | class Delta(Distribution): 111 | r""" 112 | Creates a Delta distribution parameterized by a location `loc`. 113 | 114 | Example:: 115 | 116 | >>> m = Delta(torch.Tensor([0.0])) 117 | >>> m.sample() # mean==0 118 | 0. 119 | [torch.FloatTensor of size 1] 120 | 121 | Args: 122 | loc (float or Tensor or Variable): location of the distribution 123 | """ 124 | 125 | def __init__(self, loc): 126 | self.loc = loc 127 | 128 | def params(self): 129 | """ The distribution parameters (mean,std) """ 130 | return [self.loc] 131 | 132 | def sample(self): 133 | return self.loc 134 | 135 | def mean(self): 136 | return self.loc 137 | 138 | def sample_n(self, n): 139 | # cleanly expand float or Tensor or Variable parameters 140 | def expand(v): 141 | if isinstance(v, Number): 142 | return torch.Tensor([v]).expand(n, 1) 143 | else: 144 | return v.expand(n, *v.size()) 145 | return expand(self.loc) 146 | 147 | def log_prob(self, value): 148 | raise Exception('Delta is degenerate.') 149 | 150 | def kl(self, other): 151 | if isinstance(other, Delta): 152 | return torch.zeros_like(self.loc) 153 | else: 154 | raise ValueError('Impossible (Delta): self %s other %s' % (type(self), type(other))) 155 | -------------------------------------------------------------------------------- /onmt/modules/Embeddings.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | 5 | from onmt.modules import BottleLinear, Elementwise 6 | from onmt.Utils import aeq 7 | 8 | 9 | class PositionalEncoding(nn.Module): 10 | """ 11 | Implements the sinusoidal positional encoding for 12 | non-recurrent neural networks. 13 | 14 | Implementation based on "Attention Is All You Need" 15 | :cite:`DBLP:journals/corr/VaswaniSPUJGKP17` 16 | 17 | Args: 18 | dropout (float): dropout parameter 19 | dim (int): embedding size 20 | """ 21 | 22 | def __init__(self, dropout, dim, max_len=5000): 23 | pe = torch.arange(0, max_len).unsqueeze(1).expand(max_len, dim) 24 | div_term = 1 / torch.pow(10000, torch.arange(0, dim * 2, 2) / dim) 25 | pe = pe * div_term.expand_as(pe) 26 | pe[:, 0::2] = torch.sin(pe[:, 0::2]) 27 | pe[:, 1::2] = torch.cos(pe[:, 1::2]) 28 | pe = pe.unsqueeze(1) 29 | super(PositionalEncoding, self).__init__() 30 | self.register_buffer('pe', pe) 31 | self.dropout = nn.Dropout(p=dropout) 32 | 33 | def forward(self, emb): 34 | # We must wrap the self.pe in Variable to compute, not the other 35 | # way - unwrap emb(i.e. emb.data). Otherwise the computation 36 | # wouldn't be watched to build the compute graph. 37 | emb = emb + Variable(self.pe[:emb.size(0), :1, :emb.size(2)] 38 | .expand_as(emb), requires_grad=False) 39 | emb = self.dropout(emb) 40 | return emb 41 | 42 | 43 | class Embeddings(nn.Module): 44 | """ 45 | Words embeddings for encoder/decoder. 46 | 47 | Additionally includes ability to add sparse input features 48 | based on "Linguistic Input Features Improve Neural Machine Translation" 49 | :cite:`sennrich2016linguistic`. 50 | 51 | 52 | .. mermaid:: 53 | 54 | graph LR 55 | A[Input] 56 | C[Feature 1 Lookup] 57 | A-->B[Word Lookup] 58 | A-->C 59 | A-->D[Feature N Lookup] 60 | B-->E[MLP/Concat] 61 | C-->E 62 | D-->E 63 | E-->F[Output] 64 | 65 | Args: 66 | word_vec_size (int): size of the dictionary of embeddings. 67 | word_padding_idx (int): padding index for words in the embeddings. 68 | feats_padding_idx (list of int): padding index for a list of features 69 | in the embeddings. 70 | word_vocab_size (int): size of dictionary of embeddings for words. 71 | feat_vocab_sizes ([int], optional): list of size of dictionary 72 | of embeddings for each feature. 73 | 74 | position_encoding (bool): see :obj:`onmt.modules.PositionalEncoding` 75 | 76 | feat_merge (string): merge action for the features embeddings: 77 | concat, sum or mlp. 78 | feat_vec_exponent (float): when using `-feat_merge concat`, feature 79 | embedding size is N^feat_dim_exponent, where N is the 80 | number of values of feature takes. 81 | feat_vec_size (int): embedding dimension for features when using 82 | `-feat_merge mlp` 83 | dropout (float): dropout probability. 84 | """ 85 | def __init__(self, word_vec_size, 86 | word_vocab_size, 87 | word_padding_idx, 88 | position_encoding=False, 89 | feat_merge="concat", 90 | feat_vec_exponent=0.7, feat_vec_size=-1, 91 | feat_padding_idx=[], 92 | feat_vocab_sizes=[], 93 | dropout=0): 94 | 95 | self.word_padding_idx = word_padding_idx 96 | 97 | # Dimensions and padding for constructing the word embedding matrix 98 | vocab_sizes = [word_vocab_size] 99 | emb_dims = [word_vec_size] 100 | pad_indices = [word_padding_idx] 101 | 102 | # Dimensions and padding for feature embedding matrices 103 | # (these have no effect if feat_vocab_sizes is empty) 104 | if feat_merge == 'sum': 105 | feat_dims = [word_vec_size] * len(feat_vocab_sizes) 106 | elif feat_vec_size > 0: 107 | feat_dims = [feat_vec_size] * len(feat_vocab_sizes) 108 | else: 109 | feat_dims = [int(vocab ** feat_vec_exponent) 110 | for vocab in feat_vocab_sizes] 111 | vocab_sizes.extend(feat_vocab_sizes) 112 | emb_dims.extend(feat_dims) 113 | pad_indices.extend(feat_padding_idx) 114 | 115 | # The embedding matrix look-up tables. The first look-up table 116 | # is for words. Subsequent ones are for features, if any exist. 117 | emb_params = zip(vocab_sizes, emb_dims, pad_indices) 118 | embeddings = [nn.Embedding(vocab, dim, padding_idx=pad) 119 | for vocab, dim, pad in emb_params] 120 | emb_luts = Elementwise(feat_merge, embeddings) 121 | 122 | # The final output size of word + feature vectors. This can vary 123 | # from the word vector size if and only if features are defined. 124 | # This is the attribute you should access if you need to know 125 | # how big your embeddings are going to be. 126 | self.embedding_size = (sum(emb_dims) if feat_merge == 'concat' 127 | else word_vec_size) 128 | 129 | # The sequence of operations that converts the input sequence 130 | # into a sequence of embeddings. At minimum this consists of 131 | # looking up the embeddings for each word and feature in the 132 | # input. Model parameters may require the sequence to contain 133 | # additional operations as well. 134 | super(Embeddings, self).__init__() 135 | self.make_embedding = nn.Sequential() 136 | self.make_embedding.add_module('emb_luts', emb_luts) 137 | 138 | if feat_merge == 'mlp': 139 | in_dim = sum(emb_dims) 140 | out_dim = word_vec_size 141 | mlp = nn.Sequential(BottleLinear(in_dim, out_dim), nn.ReLU()) 142 | self.make_embedding.add_module('mlp', mlp) 143 | 144 | if position_encoding: 145 | pe = PositionalEncoding(dropout, self.embedding_size) 146 | self.make_embedding.add_module('pe', pe) 147 | 148 | @property 149 | def word_lut(self): 150 | return self.make_embedding[0][0] 151 | 152 | @property 153 | def emb_luts(self): 154 | return self.make_embedding[0] 155 | 156 | def load_pretrained_vectors(self, emb_file, fixed): 157 | """Load in pretrained embeddings. 158 | 159 | Args: 160 | emb_file (str) : path to torch serialized embeddings 161 | fixed (bool) : if true, embeddings are not updated 162 | """ 163 | if emb_file: 164 | pretrained = torch.load(emb_file) 165 | self.word_lut.weight.data.copy_(pretrained) 166 | if fixed: 167 | self.word_lut.weight.requires_grad = False 168 | 169 | def forward(self, input): 170 | """ 171 | Computes the embeddings for words and features. 172 | 173 | Args: 174 | input (`LongTensor`): index tensor `[len x batch x nfeat]` 175 | Return: 176 | `FloatTensor`: word embeddings `[len x batch x embedding_size]` 177 | """ 178 | in_length, in_batch, nfeat = input.size() 179 | aeq(nfeat, len(self.emb_luts)) 180 | 181 | emb = self.make_embedding(input) 182 | 183 | out_length, out_batch, emb_size = emb.size() 184 | aeq(in_length, out_length) 185 | aeq(in_batch, out_batch) 186 | aeq(emb_size, self.embedding_size) 187 | 188 | return emb 189 | -------------------------------------------------------------------------------- /onmt/modules/Gate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | def context_gate_factory(type, embeddings_size, decoder_size, 6 | attention_size, output_size): 7 | """Returns the correct ContextGate class""" 8 | 9 | gate_types = {'source': SourceContextGate, 10 | 'target': TargetContextGate, 11 | 'both': BothContextGate} 12 | 13 | assert type in gate_types, "Not valid ContextGate type: {0}".format(type) 14 | return gate_types[type](embeddings_size, decoder_size, attention_size, 15 | output_size) 16 | 17 | 18 | class ContextGate(nn.Module): 19 | """ 20 | Context gate is a decoder module that takes as input the previous word 21 | embedding, the current decoder state and the attention state, and 22 | produces a gate. 23 | The gate can be used to select the input from the target side context 24 | (decoder state), from the source context (attention state) or both. 25 | """ 26 | def __init__(self, embeddings_size, decoder_size, 27 | attention_size, output_size): 28 | super(ContextGate, self).__init__() 29 | input_size = embeddings_size + decoder_size + attention_size 30 | self.gate = nn.Linear(input_size, output_size, bias=True) 31 | self.sig = nn.Sigmoid() 32 | self.source_proj = nn.Linear(attention_size, output_size) 33 | self.target_proj = nn.Linear(embeddings_size + decoder_size, 34 | output_size) 35 | 36 | def forward(self, prev_emb, dec_state, attn_state): 37 | input_tensor = torch.cat((prev_emb, dec_state, attn_state), dim=1) 38 | z = self.sig(self.gate(input_tensor)) 39 | proj_source = self.source_proj(attn_state) 40 | proj_target = self.target_proj( 41 | torch.cat((prev_emb, dec_state), dim=1)) 42 | return z, proj_source, proj_target 43 | 44 | 45 | class SourceContextGate(nn.Module): 46 | """Apply the context gate only to the source context""" 47 | 48 | def __init__(self, embeddings_size, decoder_size, 49 | attention_size, output_size): 50 | super(SourceContextGate, self).__init__() 51 | self.context_gate = ContextGate(embeddings_size, decoder_size, 52 | attention_size, output_size) 53 | self.tanh = nn.Tanh() 54 | 55 | def forward(self, prev_emb, dec_state, attn_state): 56 | z, source, target = self.context_gate( 57 | prev_emb, dec_state, attn_state) 58 | return self.tanh(target + z * source) 59 | 60 | 61 | class TargetContextGate(nn.Module): 62 | """Apply the context gate only to the target context""" 63 | 64 | def __init__(self, embeddings_size, decoder_size, 65 | attention_size, output_size): 66 | super(TargetContextGate, self).__init__() 67 | self.context_gate = ContextGate(embeddings_size, decoder_size, 68 | attention_size, output_size) 69 | self.tanh = nn.Tanh() 70 | 71 | def forward(self, prev_emb, dec_state, attn_state): 72 | z, source, target = self.context_gate(prev_emb, dec_state, attn_state) 73 | return self.tanh(z * target + source) 74 | 75 | 76 | class BothContextGate(nn.Module): 77 | """Apply the context gate to both contexts""" 78 | 79 | def __init__(self, embeddings_size, decoder_size, 80 | attention_size, output_size): 81 | super(BothContextGate, self).__init__() 82 | self.context_gate = ContextGate(embeddings_size, decoder_size, 83 | attention_size, output_size) 84 | self.tanh = nn.Tanh() 85 | 86 | def forward(self, prev_emb, dec_state, attn_state): 87 | z, source, target = self.context_gate(prev_emb, dec_state, attn_state) 88 | return self.tanh((1. - z) * target + z * source) 89 | -------------------------------------------------------------------------------- /onmt/modules/GlobalAttention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from onmt.modules.UtilClass import BottleLinear 5 | from onmt.Utils import aeq, sequence_mask 6 | 7 | 8 | class GlobalAttention(nn.Module): 9 | """ 10 | Global attention takes a matrix and a query vector. It 11 | then computes a parameterized convex combination of the matrix 12 | based on the input query. 13 | 14 | Constructs a unit mapping a query `q` of size `dim` 15 | and a source matrix `H` of size `n x dim`, to an output 16 | of size `dim`. 17 | 18 | 19 | .. mermaid:: 20 | 21 | graph BT 22 | A[Query] 23 | subgraph RNN 24 | C[H 1] 25 | D[H 2] 26 | E[H N] 27 | end 28 | F[Attn] 29 | G[Output] 30 | A --> F 31 | C --> F 32 | D --> F 33 | E --> F 34 | C -.-> G 35 | D -.-> G 36 | E -.-> G 37 | F --> G 38 | 39 | All models compute the output as 40 | :math:`c = \sum_{j=1}^{SeqLength} a_j H_j` where 41 | :math:`a_j` is the softmax of a score function. 42 | Then then apply a projection layer to [q, c]. 43 | 44 | However they 45 | differ on how they compute the attention score. 46 | 47 | * Luong Attention (dot, general): 48 | * dot: :math:`score(H_j,q) = H_j^T q` 49 | * general: :math:`score(H_j, q) = H_j^T W_a q` 50 | 51 | 52 | * Bahdanau Attention (mlp): 53 | * :math:`score(H_j, q) = v_a^T tanh(W_a q + U_a h_j)` 54 | 55 | 56 | Args: 57 | dim (int): dimensionality of query and key 58 | coverage (bool): use coverage term 59 | attn_type (str): type of attention to use, options [dot,general,mlp] 60 | 61 | """ 62 | def __init__(self, dim, coverage=False, attn_type="dot"): 63 | super(GlobalAttention, self).__init__() 64 | 65 | self.dim = dim 66 | self.attn_type = attn_type 67 | assert (self.attn_type in ["dot", "general", "mlp"]), ( 68 | "Please select a valid attention type.") 69 | 70 | if self.attn_type == "general": 71 | self.linear_in = nn.Linear(dim, dim, bias=False) 72 | elif self.attn_type == "mlp": 73 | self.linear_context = BottleLinear(dim, dim, bias=False) 74 | self.linear_query = nn.Linear(dim, dim, bias=True) 75 | self.v = BottleLinear(dim, 1, bias=False) 76 | # mlp wants it with bias 77 | out_bias = self.attn_type == "mlp" 78 | self.linear_out = nn.Linear(dim*2, dim, bias=out_bias) 79 | 80 | self.sm = nn.Softmax() 81 | self.tanh = nn.Tanh() 82 | 83 | if coverage: 84 | self.linear_cover = nn.Linear(1, dim, bias=False) 85 | 86 | def score(self, h_t, h_s): 87 | """ 88 | Args: 89 | h_t (`FloatTensor`): sequence of queries `[batch x tgt_len x dim]` 90 | h_s (`FloatTensor`): sequence of sources `[batch x src_len x dim]` 91 | 92 | Returns: 93 | :obj:`FloatTensor`: 94 | raw attention scores (unnormalized) for each src index 95 | `[batch x tgt_len x src_len]` 96 | 97 | """ 98 | 99 | # Check input sizes 100 | src_batch, src_len, src_dim = h_s.size() 101 | tgt_batch, tgt_len, tgt_dim = h_t.size() 102 | aeq(src_batch, tgt_batch) 103 | aeq(src_dim, tgt_dim) 104 | aeq(self.dim, src_dim) 105 | 106 | if self.attn_type in ["general", "dot"]: 107 | if self.attn_type == "general": 108 | h_t_ = h_t.view(tgt_batch*tgt_len, tgt_dim) 109 | h_t_ = self.linear_in(h_t_) 110 | h_t = h_t_.view(tgt_batch, tgt_len, tgt_dim) 111 | h_s_ = h_s.transpose(1, 2) 112 | # (batch, t_len, d) x (batch, d, s_len) --> (batch, t_len, s_len) 113 | return torch.bmm(h_t, h_s_) 114 | else: 115 | dim = self.dim 116 | wq = self.linear_query(h_t.view(-1, dim)) 117 | wq = wq.view(tgt_batch, tgt_len, 1, dim) 118 | wq = wq.expand(tgt_batch, tgt_len, src_len, dim) 119 | 120 | uh = self.linear_context(h_s.contiguous().view(-1, dim)) 121 | uh = uh.view(src_batch, 1, src_len, dim) 122 | uh = uh.expand(src_batch, tgt_len, src_len, dim) 123 | 124 | # (batch, t_len, s_len, d) 125 | wquh = self.tanh(wq + uh) 126 | 127 | return self.v(wquh.view(-1, dim)).view(tgt_batch, tgt_len, src_len) 128 | 129 | def forward(self, input, context, context_lengths=None, coverage=None): 130 | """ 131 | 132 | Args: 133 | input (`FloatTensor`): query vectors `[batch x tgt_len x dim]` 134 | context (`FloatTensor`): source vectors `[batch x src_len x dim]` 135 | context_lengths (`LongTensor`): the source context lengths `[batch]` 136 | coverage (`FloatTensor`): None (not supported yet) 137 | 138 | Returns: 139 | (`FloatTensor`, `FloatTensor`): 140 | 141 | * Computed vector `[tgt_len x batch x dim]` 142 | * Attention distribtutions for each query 143 | `[tgt_len x batch x src_len]` 144 | """ 145 | 146 | # one step input 147 | if input.dim() == 2: 148 | one_step = True 149 | input = input.unsqueeze(1) 150 | else: 151 | one_step = False 152 | 153 | batch, sourceL, dim = context.size() 154 | batch_, targetL, dim_ = input.size() 155 | aeq(batch, batch_) 156 | aeq(dim, dim_) 157 | aeq(self.dim, dim) 158 | if coverage is not None: 159 | batch_, sourceL_ = coverage.size() 160 | aeq(batch, batch_) 161 | aeq(sourceL, sourceL_) 162 | 163 | if coverage is not None: 164 | cover = coverage.view(-1).unsqueeze(1) 165 | context += self.linear_cover(cover).view_as(context) 166 | context = self.tanh(context) 167 | 168 | # compute attention scores, as in Luong et al. 169 | align = self.score(input, context) 170 | 171 | if context_lengths is not None: 172 | # mask => [B, n] 173 | mask = sequence_mask(context_lengths) 174 | # mask => [B, 1, n] 175 | mask = mask.unsqueeze(1) # Make it broadcastable. 176 | align.data.masked_fill_(1 - mask, -float('inf')) 177 | 178 | # Softmax to normalize attention weights 179 | align_vectors = self.sm(align.view(batch*targetL, sourceL)) 180 | align_vectors = align_vectors.view(batch, targetL, sourceL) 181 | 182 | # each context vector c_t is the weighted average 183 | # over all the source hidden states 184 | c = torch.bmm(align_vectors, context) 185 | 186 | # concatenate 187 | concat_c = torch.cat([c, input], 2).view(batch*targetL, dim*2) 188 | attn_h = self.linear_out(concat_c).view(batch, targetL, dim) 189 | if self.attn_type in ["general", "dot"]: 190 | attn_h = self.tanh(attn_h) 191 | 192 | if one_step: 193 | attn_h = attn_h.squeeze(1) 194 | align_vectors = align_vectors.squeeze(1) 195 | 196 | # Check output sizes 197 | batch_, dim_ = attn_h.size() 198 | aeq(batch, batch_) 199 | aeq(dim, dim_) 200 | batch_, sourceL_ = align_vectors.size() 201 | aeq(batch, batch_) 202 | aeq(sourceL, sourceL_) 203 | else: 204 | attn_h = attn_h.transpose(0, 1).contiguous() 205 | align_vectors = align_vectors.transpose(0, 1).contiguous() 206 | 207 | # Check output sizes 208 | targetL_, batch_, dim_ = attn_h.size() 209 | aeq(targetL, targetL_) 210 | aeq(batch, batch_) 211 | aeq(dim, dim_) 212 | targetL_, batch_, sourceL_ = align_vectors.size() 213 | aeq(targetL, targetL_) 214 | aeq(batch, batch_) 215 | aeq(sourceL, sourceL_) 216 | 217 | return attn_h, align_vectors 218 | -------------------------------------------------------------------------------- /onmt/modules/ImageEncoder.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | import torch 4 | from torch.autograd import Variable 5 | 6 | 7 | class ImageEncoder(nn.Module): 8 | """ 9 | A simple encoder convolutional -> recurrent neural network for 10 | image input. 11 | 12 | Args: 13 | num_layers (int): number of encoder layers. 14 | bidirectional (bool): bidirectional encoder. 15 | rnn_size (int): size of hidden states of the rnn. 16 | dropout (float): dropout probablity. 17 | """ 18 | def __init__(self, num_layers, bidirectional, rnn_size, dropout): 19 | super(ImageEncoder, self).__init__() 20 | self.num_layers = num_layers 21 | self.num_directions = 2 if bidirectional else 1 22 | self.hidden_size = rnn_size 23 | 24 | self.layer1 = nn.Conv2d(3, 64, kernel_size=(3, 3), 25 | padding=(1, 1), stride=(1, 1)) 26 | self.layer2 = nn.Conv2d(64, 128, kernel_size=(3, 3), 27 | padding=(1, 1), stride=(1, 1)) 28 | self.layer3 = nn.Conv2d(128, 256, kernel_size=(3, 3), 29 | padding=(1, 1), stride=(1, 1)) 30 | self.layer4 = nn.Conv2d(256, 256, kernel_size=(3, 3), 31 | padding=(1, 1), stride=(1, 1)) 32 | self.layer5 = nn.Conv2d(256, 512, kernel_size=(3, 3), 33 | padding=(1, 1), stride=(1, 1)) 34 | self.layer6 = nn.Conv2d(512, 512, kernel_size=(3, 3), 35 | padding=(1, 1), stride=(1, 1)) 36 | 37 | self.batch_norm1 = nn.BatchNorm2d(256) 38 | self.batch_norm2 = nn.BatchNorm2d(512) 39 | self.batch_norm3 = nn.BatchNorm2d(512) 40 | 41 | input_size = 512 42 | self.rnn = nn.LSTM(input_size, rnn_size, 43 | num_layers=num_layers, 44 | dropout=dropout, 45 | bidirectional=bidirectional) 46 | self.pos_lut = nn.Embedding(1000, input_size) 47 | 48 | def load_pretrained_vectors(self, opt): 49 | # Pass in needed options only when modify function definition. 50 | pass 51 | 52 | def forward(self, input, lengths=None): 53 | "See :obj:`onmt.modules.EncoderBase.forward()`" 54 | 55 | batch_size = input.size(0) 56 | # (batch_size, 64, imgH, imgW) 57 | # layer 1 58 | input = F.relu(self.layer1(input[:, :, :, :]-0.5), True) 59 | 60 | # (batch_size, 64, imgH/2, imgW/2) 61 | input = F.max_pool2d(input, kernel_size=(2, 2), stride=(2, 2)) 62 | 63 | # (batch_size, 128, imgH/2, imgW/2) 64 | # layer 2 65 | input = F.relu(self.layer2(input), True) 66 | 67 | # (batch_size, 128, imgH/2/2, imgW/2/2) 68 | input = F.max_pool2d(input, kernel_size=(2, 2), stride=(2, 2)) 69 | 70 | # (batch_size, 256, imgH/2/2, imgW/2/2) 71 | # layer 3 72 | # batch norm 1 73 | input = F.relu(self.batch_norm1(self.layer3(input)), True) 74 | 75 | # (batch_size, 256, imgH/2/2, imgW/2/2) 76 | # layer4 77 | input = F.relu(self.layer4(input), True) 78 | 79 | # (batch_size, 256, imgH/2/2/2, imgW/2/2) 80 | input = F.max_pool2d(input, kernel_size=(1, 2), stride=(1, 2)) 81 | 82 | # (batch_size, 512, imgH/2/2/2, imgW/2/2) 83 | # layer 5 84 | # batch norm 2 85 | input = F.relu(self.batch_norm2(self.layer5(input)), True) 86 | 87 | # (batch_size, 512, imgH/2/2/2, imgW/2/2/2) 88 | input = F.max_pool2d(input, kernel_size=(2, 1), stride=(2, 1)) 89 | 90 | # (batch_size, 512, imgH/2/2/2, imgW/2/2/2) 91 | input = F.relu(self.batch_norm3(self.layer6(input)), True) 92 | 93 | # # (batch_size, 512, H, W) 94 | all_outputs = [] 95 | for row in range(input.size(2)): 96 | inp = input[:, :, row, :].transpose(0, 2)\ 97 | .transpose(1, 2) 98 | row_vec = torch.Tensor(batch_size).type_as(inp.data)\ 99 | .long().fill_(row) 100 | pos_emb = self.pos_lut(Variable(row_vec)) 101 | with_pos = torch.cat( 102 | (pos_emb.view(1, pos_emb.size(0), pos_emb.size(1)), inp), 0) 103 | outputs, hidden_t = self.rnn(with_pos) 104 | all_outputs.append(outputs) 105 | out = torch.cat(all_outputs, 0) 106 | 107 | return hidden_t, out 108 | -------------------------------------------------------------------------------- /onmt/modules/MultiHeadedAttn.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | from torch.autograd import Variable 5 | 6 | from onmt.Utils import aeq 7 | from onmt.modules.UtilClass import BottleLinear, BottleSoftmax 8 | 9 | 10 | class MultiHeadedAttention(nn.Module): 11 | """ 12 | Multi-Head Attention module from 13 | "Attention is All You Need" 14 | :cite:`DBLP:journals/corr/VaswaniSPUJGKP17`. 15 | 16 | Similar to standard `dot` attention but uses 17 | multiple attention distributions simulataneously 18 | to select relevant items. 19 | 20 | .. mermaid:: 21 | 22 | graph BT 23 | A[key] 24 | B[value] 25 | C[query] 26 | O[output] 27 | subgraph Attn 28 | D[Attn 1] 29 | E[Attn 2] 30 | F[Attn N] 31 | end 32 | A --> D 33 | C --> D 34 | A --> E 35 | C --> E 36 | A --> F 37 | C --> F 38 | D --> O 39 | E --> O 40 | F --> O 41 | B --> O 42 | 43 | Also includes several additional tricks. 44 | 45 | Args: 46 | head_count (int): number of parallel heads 47 | model_dim (int): the dimension of keys/values/queries, 48 | must be divisible by head_count 49 | dropout (float): dropout parameter 50 | """ 51 | def __init__(self, head_count, model_dim, dropout=0.1): 52 | assert model_dim % head_count == 0 53 | self.dim_per_head = model_dim // head_count 54 | self.model_dim = model_dim 55 | 56 | super(MultiHeadedAttention, self).__init__() 57 | self.head_count = head_count 58 | 59 | self.linear_keys = BottleLinear(model_dim, 60 | head_count * self.dim_per_head, 61 | bias=False) 62 | self.linear_values = BottleLinear(model_dim, 63 | head_count * self.dim_per_head, 64 | bias=False) 65 | self.linear_query = BottleLinear(model_dim, 66 | head_count * self.dim_per_head, 67 | bias=False) 68 | self.sm = BottleSoftmax() 69 | self.activation = nn.ReLU() 70 | self.dropout = nn.Dropout(dropout) 71 | self.res_dropout = nn.Dropout(dropout) 72 | 73 | def forward(self, key, value, query, mask=None): 74 | """ 75 | Compute the context vector and the attention vectors. 76 | 77 | Args: 78 | key (`FloatTensor`): set of `key_len` 79 | key vectors `[batch, key_len, dim]` 80 | value (`FloatTensor`): set of `key_len` 81 | value vectors `[batch, key_len, dim]` 82 | query (`FloatTensor`): set of `query_len` 83 | query vectors `[batch, query_len, dim]` 84 | mask: binary mask indicating which keys have 85 | non-zero attention `[batch, query_len, key_len]` 86 | Returns: 87 | (`FloatTensor`, `FloatTensor`) : 88 | 89 | * output context vectors `[batch, query_len, dim]` 90 | * one of the attention vectors `[batch, query_len, key_len]` 91 | """ 92 | 93 | # CHECKS 94 | batch, k_len, d = key.size() 95 | batch_, k_len_, d_ = value.size() 96 | aeq(batch, batch_) 97 | aeq(k_len, k_len_) 98 | aeq(d, d_) 99 | batch_, q_len, d_ = query.size() 100 | aeq(batch, batch_) 101 | aeq(d, d_) 102 | aeq(self.model_dim % 8, 0) 103 | if mask is not None: 104 | batch_, q_len_, k_len_ = mask.size() 105 | aeq(batch_, batch) 106 | aeq(k_len_, k_len) 107 | aeq(q_len_ == q_len) 108 | # END CHECKS 109 | 110 | def shape_projection(x): 111 | b, l, d = x.size() 112 | return x.view(b, l, self.head_count, self.dim_per_head) \ 113 | .transpose(1, 2).contiguous() \ 114 | .view(b * self.head_count, l, self.dim_per_head) 115 | 116 | def unshape_projection(x, q): 117 | b, l, d = q.size() 118 | return x.view(b, self.head_count, l, self.dim_per_head) \ 119 | .transpose(1, 2).contiguous() \ 120 | .view(b, l, self.head_count * self.dim_per_head) 121 | 122 | residual = query 123 | key_up = shape_projection(self.linear_keys(key)) 124 | value_up = shape_projection(self.linear_values(value)) 125 | query_up = shape_projection(self.linear_query(query)) 126 | 127 | scaled = torch.bmm(query_up, key_up.transpose(1, 2)) 128 | scaled = scaled / math.sqrt(self.dim_per_head) 129 | bh, l, dim_per_head = scaled.size() 130 | b = bh // self.head_count 131 | if mask is not None: 132 | 133 | scaled = scaled.view(b, self.head_count, l, dim_per_head) 134 | mask = mask.unsqueeze(1).expand_as(scaled) 135 | scaled = scaled.masked_fill(Variable(mask), -1e18) \ 136 | .view(bh, l, dim_per_head) 137 | attn = self.sm(scaled) 138 | # Return one attn 139 | top_attn = attn \ 140 | .view(b, self.head_count, l, dim_per_head)[:, 0, :, :] \ 141 | .contiguous() 142 | 143 | drop_attn = self.dropout(self.sm(scaled)) 144 | 145 | # values : (batch * 8) x qlen x dim 146 | out = unshape_projection(torch.bmm(drop_attn, value_up), residual) 147 | 148 | # Residual and layer norm 149 | ret = self.res_dropout(out) 150 | 151 | # CHECK 152 | batch_, q_len_, d_ = ret.size() 153 | aeq(q_len, q_len_) 154 | aeq(batch, batch_) 155 | aeq(d, d_) 156 | # END CHECK 157 | return ret, top_attn 158 | -------------------------------------------------------------------------------- /onmt/modules/StackedRNN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class StackedLSTM(nn.Module): 6 | """ 7 | Our own implementation of stacked LSTM. 8 | Needed for the decoder, because we do input feeding. 9 | """ 10 | def __init__(self, num_layers, input_size, rnn_size, dropout): 11 | super(StackedLSTM, self).__init__() 12 | self.dropout = nn.Dropout(dropout) 13 | self.num_layers = num_layers 14 | self.layers = nn.ModuleList() 15 | 16 | for i in range(num_layers): 17 | self.layers.append(nn.LSTMCell(input_size, rnn_size)) 18 | input_size = rnn_size 19 | 20 | def forward(self, input, hidden): 21 | h_0, c_0 = hidden 22 | h_1, c_1 = [], [] 23 | for i, layer in enumerate(self.layers): 24 | h_1_i, c_1_i = layer(input, (h_0[i], c_0[i])) 25 | input = h_1_i 26 | if i + 1 != self.num_layers: 27 | input = self.dropout(input) 28 | h_1 += [h_1_i] 29 | c_1 += [c_1_i] 30 | 31 | h_1 = torch.stack(h_1) 32 | c_1 = torch.stack(c_1) 33 | 34 | return input, (h_1, c_1) 35 | 36 | 37 | class StackedGRU(nn.Module): 38 | 39 | def __init__(self, num_layers, input_size, rnn_size, dropout): 40 | super(StackedGRU, self).__init__() 41 | self.dropout = nn.Dropout(dropout) 42 | self.num_layers = num_layers 43 | self.layers = nn.ModuleList() 44 | 45 | for i in range(num_layers): 46 | self.layers.append(nn.GRUCell(input_size, rnn_size)) 47 | input_size = rnn_size 48 | 49 | def forward(self, input, hidden): 50 | h_1 = [] 51 | for i, layer in enumerate(self.layers): 52 | h_1_i = layer(input, hidden[0][i]) 53 | input = h_1_i 54 | if i + 1 != self.num_layers: 55 | input = self.dropout(input) 56 | h_1 += [h_1_i] 57 | 58 | h_1 = torch.stack(h_1) 59 | return input, (h_1,) 60 | -------------------------------------------------------------------------------- /onmt/modules/StructuredAttention.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | import torch.cuda 4 | from torch.autograd import Variable 5 | 6 | 7 | class MatrixTree(nn.Module): 8 | """Implementation of the matrix-tree theorem for computing marginals 9 | of non-projective dependency parsing. This attention layer is used 10 | in the paper "Learning Structured Text Representations." 11 | 12 | 13 | :cite:`DBLP:journals/corr/LiuL17d` 14 | """ 15 | def __init__(self, eps=1e-5): 16 | self.eps = eps 17 | super(MatrixTree, self).__init__() 18 | 19 | def forward(self, input): 20 | laplacian = input.exp() + self.eps 21 | output = input.clone() 22 | for b in range(input.size(0)): 23 | lap = laplacian[b].masked_fill( 24 | Variable(torch.eye(input.size(1)).cuda().ne(0)), 0) 25 | lap = -lap + torch.diag(lap.sum(0)) 26 | # store roots on diagonal 27 | lap[0] = input[b].diag().exp() 28 | inv_laplacian = lap.inverse() 29 | 30 | factor = inv_laplacian.diag().unsqueeze(1)\ 31 | .expand_as(input[b]).transpose(0, 1) 32 | term1 = input[b].exp().mul(factor).clone() 33 | term2 = input[b].exp().mul(inv_laplacian.transpose(0, 1)).clone() 34 | term1[:, 0] = 0 35 | term2[0] = 0 36 | output[b] = term1 - term2 37 | roots_output = input[b].diag().exp().mul( 38 | inv_laplacian.transpose(0, 1)[0]) 39 | output[b] = output[b] + torch.diag(roots_output) 40 | return output 41 | 42 | 43 | if __name__ == "__main__": 44 | dtree = MatrixTree() 45 | q = torch.rand(1, 5, 5).cuda() 46 | marg = dtree.forward(Variable(q)) 47 | print(marg.sum(1)) 48 | -------------------------------------------------------------------------------- /onmt/modules/UtilClass.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class Bottle(nn.Module): 6 | def forward(self, input): 7 | if len(input.size()) <= 2: 8 | return super(Bottle, self).forward(input) 9 | size = input.size()[:2] 10 | out = super(Bottle, self).forward(input.view(size[0]*size[1], -1)) 11 | return out.contiguous().view(size[0], size[1], -1) 12 | 13 | 14 | class Bottle2(nn.Module): 15 | def forward(self, input): 16 | if len(input.size()) <= 3: 17 | return super(Bottle2, self).forward(input) 18 | size = input.size() 19 | out = super(Bottle2, self).forward(input.view(size[0]*size[1], 20 | size[2], size[3])) 21 | return out.contiguous().view(size[0], size[1], size[2], size[3]) 22 | 23 | 24 | class LayerNorm(nn.Module): 25 | ''' Layer normalization module ''' 26 | 27 | def __init__(self, d_hid, eps=1e-3): 28 | super(LayerNorm, self).__init__() 29 | 30 | self.eps = eps 31 | self.a_2 = nn.Parameter(torch.ones(d_hid), requires_grad=True) 32 | self.b_2 = nn.Parameter(torch.zeros(d_hid), requires_grad=True) 33 | 34 | def forward(self, z): 35 | if z.size(1) == 1: 36 | return z 37 | mu = torch.mean(z, dim=1) 38 | sigma = torch.std(z, dim=1) 39 | # HACK. PyTorch is changing behavior 40 | if mu.dim() == 1: 41 | mu = mu.unsqueeze(1) 42 | sigma = sigma.unsqueeze(1) 43 | ln_out = (z - mu.expand_as(z)) / (sigma.expand_as(z) + self.eps) 44 | ln_out = ln_out.mul(self.a_2.expand_as(ln_out)) \ 45 | + self.b_2.expand_as(ln_out) 46 | return ln_out 47 | 48 | 49 | class BottleLinear(Bottle, nn.Linear): 50 | pass 51 | 52 | 53 | class BottleLayerNorm(Bottle, LayerNorm): 54 | pass 55 | 56 | 57 | class BottleSoftmax(Bottle, nn.Softmax): 58 | pass 59 | 60 | 61 | class Elementwise(nn.ModuleList): 62 | """ 63 | A simple network container. 64 | Parameters are a list of modules. 65 | Inputs are a 3d Variable whose last dimension is the same length 66 | as the list. 67 | Outputs are the result of applying modules to inputs elementwise. 68 | An optional merge parameter allows the outputs to be reduced to a 69 | single Variable. 70 | """ 71 | 72 | def __init__(self, merge=None, *args): 73 | assert merge in [None, 'first', 'concat', 'sum', 'mlp'] 74 | self.merge = merge 75 | super(Elementwise, self).__init__(*args) 76 | 77 | def forward(self, input): 78 | inputs = [feat.squeeze(2) for feat in input.split(1, dim=2)] 79 | assert len(self) == len(inputs) 80 | outputs = [f(x) for f, x in zip(self, inputs)] 81 | if self.merge == 'first': 82 | return outputs[0] 83 | elif self.merge == 'concat' or self.merge == 'mlp': 84 | return torch.cat(outputs, 2) 85 | elif self.merge == 'sum': 86 | return sum(outputs) 87 | else: 88 | return outputs 89 | -------------------------------------------------------------------------------- /onmt/modules/WordDropout.py: -------------------------------------------------------------------------------- 1 | from torch.autograd import Variable 2 | import torch.nn as nn 3 | import torch 4 | 5 | 6 | class WordDropout(nn.Module): 7 | r"""During training, randomly zeroes some of the (entire) words of the input 8 | tensor with probability *p* using samples from a bernoulli distribution. 9 | The elements to zero are randomized on every forward call. 10 | 11 | Furthermore, the outputs are scaled by a factor of *1/(1-p)* during 12 | training. This means that during evaluation the module simply computes an 13 | identity function. 14 | 15 | Args: 16 | p: probability of an element to be zeroed. Default: 0.1 17 | inplace: If set to ``True``, will do this operation in-place. Default: ``False`` 18 | 19 | Shape: 20 | - Input: `Any`. Input can be of any shape 21 | - Output: `Same`. Output is of the same shape as input 22 | 23 | Examples:: 24 | 25 | >>> m = nn.Dropout(p=0.2) 26 | >>> input = autograd.Variable(torch.randn(20, 16)) 27 | >>> output = m(input) 28 | 29 | .. _Improving neural networks by preventing co-adaptation of feature 30 | detectors: https://arxiv.org/abs/1207.0580 31 | """ 32 | 33 | def __init__(self, p=0.0, inplace=False, dim=2): 34 | super(WordDropout, self).__init__() 35 | if p < 0 or p > 1: 36 | raise ValueError("dropout probability has to be between 0 and 1, " 37 | "but got {}".format(p)) 38 | self.p = p 39 | # dimension of the word dropout (sequence). 40 | # e.g. in [time, batch, features], i.e. [T, B, D], word dropout is applied on either all D or none. 41 | self.dim = dim 42 | self.inplace = inplace 43 | 44 | def forward(self, input, training=False): 45 | if self.p == 0 or not training: 46 | return input 47 | 48 | keep_prob = 1 - self.p 49 | noise = torch.zeros_like(input.data) 50 | noise = Variable(torch.sum(noise, dim=self.dim)) 51 | noise.data.bernoulli_( self.p ) 52 | noise = noise.byte() 53 | noise = noise.unsqueeze(self.dim) 54 | 55 | output = input.masked_fill_(noise, 0.) 56 | output /= keep_prob 57 | return torch.mul(output, input) 58 | 59 | def __repr__(self): 60 | inplace_str = ', inplace' if self.inplace else '' 61 | return self.__class__.__name__ + '(' \ 62 | + 'p=' + str(self.p) \ 63 | + inplace_str + ')' 64 | -------------------------------------------------------------------------------- /onmt/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from onmt.modules.UtilClass import LayerNorm, Bottle, BottleLinear, \ 2 | BottleLayerNorm, BottleSoftmax, Elementwise 3 | from onmt.modules.Gate import context_gate_factory, ContextGate 4 | from onmt.modules.GlobalAttention import GlobalAttention 5 | from onmt.modules.ConvMultiStepAttention import ConvMultiStepAttention 6 | from onmt.modules.ImageEncoder import ImageEncoder 7 | from onmt.modules.AudioEncoder import AudioEncoder 8 | from onmt.modules.CopyGenerator import CopyGenerator, CopyGeneratorLossCompute 9 | from onmt.modules.StructuredAttention import MatrixTree 10 | from onmt.modules.Transformer import \ 11 | TransformerEncoder, TransformerDecoder, PositionwiseFeedForward 12 | from onmt.modules.Conv2Conv import CNNEncoder, CNNDecoder 13 | from onmt.modules.MultiHeadedAttn import MultiHeadedAttention 14 | from onmt.modules.StackedRNN import StackedLSTM, StackedGRU 15 | from onmt.modules.Embeddings import Embeddings, PositionalEncoding 16 | from onmt.modules.WeightNorm import WeightNormConv2d 17 | from onmt.modules.NormalVariationalEncoder import GlobalInferenceNetwork, \ 18 | GlobalFullInferenceNetwork, \ 19 | ImageGlobalInferenceNetwork 20 | # ImageTopicInferenceNetwork, \ 21 | from onmt.modules.Dists import Delta, Normal, LogisticNormal, convert_symmetric_dirichlet_to_logistic_normal 22 | from onmt.Models import EncoderBase, MeanEncoder, StdRNNDecoder, \ 23 | RNNDecoderBase, InputFeedRNNDecoder, RNNEncoder, NMTModel, \ 24 | RNNVIDecoderBase, NMTVIModel 25 | 26 | from onmt.modules.SRU import check_sru_requirement 27 | can_use_sru = check_sru_requirement() 28 | if can_use_sru: 29 | from onmt.modules.SRU import SRU 30 | 31 | 32 | # For flake8 compatibility. 33 | __all__ = [EncoderBase, MeanEncoder, RNNDecoderBase, InputFeedRNNDecoder, 34 | RNNEncoder, NMTModel, 35 | StdRNNDecoder, ContextGate, GlobalAttention, ImageEncoder, 36 | PositionwiseFeedForward, PositionalEncoding, 37 | CopyGenerator, MultiHeadedAttention, 38 | LayerNorm, Bottle, BottleLinear, BottleLayerNorm, BottleSoftmax, 39 | TransformerEncoder, TransformerDecoder, Embeddings, Elementwise, 40 | MatrixTree, WeightNormConv2d, ConvMultiStepAttention, 41 | CNNEncoder, CNNDecoder, StackedLSTM, StackedGRU, 42 | context_gate_factory, CopyGeneratorLossCompute, AudioEncoder] 43 | 44 | __all__ += [RNNVIDecoderBase, NMTVIModel] 45 | __all__ += [GlobalInferenceNetwork, GlobalFullInferenceNetwork, ImageGlobalInferenceNetwork] 46 | # ImageGlobalInferenceNetwork, ImageTopicInferenceNetwork] 47 | __all__ += [Delta,Normal,LogisticNormal,convert_symmetric_dirichlet_to_logistic_normal] 48 | 49 | if can_use_sru: 50 | __all__.extend([SRU, check_sru_requirement]) 51 | -------------------------------------------------------------------------------- /onmt/modules/__pycache__/AudioEncoder.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/modules/__pycache__/AudioEncoder.cpython-36.pyc -------------------------------------------------------------------------------- /onmt/modules/__pycache__/Conv2Conv.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/modules/__pycache__/Conv2Conv.cpython-36.pyc -------------------------------------------------------------------------------- /onmt/modules/__pycache__/ConvMultiStepAttention.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/modules/__pycache__/ConvMultiStepAttention.cpython-36.pyc -------------------------------------------------------------------------------- /onmt/modules/__pycache__/CopyGenerator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/modules/__pycache__/CopyGenerator.cpython-36.pyc -------------------------------------------------------------------------------- /onmt/modules/__pycache__/Dists.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/modules/__pycache__/Dists.cpython-36.pyc -------------------------------------------------------------------------------- /onmt/modules/__pycache__/Embeddings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/modules/__pycache__/Embeddings.cpython-36.pyc -------------------------------------------------------------------------------- /onmt/modules/__pycache__/Gate.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/modules/__pycache__/Gate.cpython-36.pyc -------------------------------------------------------------------------------- /onmt/modules/__pycache__/GlobalAttention.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/modules/__pycache__/GlobalAttention.cpython-36.pyc -------------------------------------------------------------------------------- /onmt/modules/__pycache__/ImageEncoder.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/modules/__pycache__/ImageEncoder.cpython-36.pyc -------------------------------------------------------------------------------- /onmt/modules/__pycache__/MultiHeadedAttn.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/modules/__pycache__/MultiHeadedAttn.cpython-36.pyc -------------------------------------------------------------------------------- /onmt/modules/__pycache__/NormalVariationalEncoder.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/modules/__pycache__/NormalVariationalEncoder.cpython-36.pyc -------------------------------------------------------------------------------- /onmt/modules/__pycache__/SRU.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/modules/__pycache__/SRU.cpython-36.pyc -------------------------------------------------------------------------------- /onmt/modules/__pycache__/StackedRNN.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/modules/__pycache__/StackedRNN.cpython-36.pyc -------------------------------------------------------------------------------- /onmt/modules/__pycache__/StructuredAttention.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/modules/__pycache__/StructuredAttention.cpython-36.pyc -------------------------------------------------------------------------------- /onmt/modules/__pycache__/Transformer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/modules/__pycache__/Transformer.cpython-36.pyc -------------------------------------------------------------------------------- /onmt/modules/__pycache__/UtilClass.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/modules/__pycache__/UtilClass.cpython-36.pyc -------------------------------------------------------------------------------- /onmt/modules/__pycache__/WeightNorm.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/modules/__pycache__/WeightNorm.cpython-36.pyc -------------------------------------------------------------------------------- /onmt/modules/__pycache__/WordDropout.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/modules/__pycache__/WordDropout.cpython-36.pyc -------------------------------------------------------------------------------- /onmt/modules/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/modules/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /onmt/translate/Beam.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import torch 3 | 4 | 5 | class Beam(object): 6 | """ 7 | Class for managing the internals of the beam search process. 8 | 9 | Takes care of beams, back pointers, and scores. 10 | 11 | Args: 12 | size (int): beam size 13 | pad, bos, eos (int): indices of padding, beginning, and ending. 14 | n_best (int): nbest size to use 15 | cuda (bool): use gpu 16 | global_scorer (:obj:`GlobalScorer`) 17 | """ 18 | def __init__(self, size, pad, bos, eos, 19 | n_best=1, cuda=False, 20 | global_scorer=None, 21 | min_length=0): 22 | 23 | self.size = size 24 | self.tt = torch.cuda if cuda else torch 25 | 26 | # The score for each translation on the beam. 27 | self.scores = self.tt.FloatTensor(size).zero_() 28 | self.all_scores = [] 29 | 30 | # The backpointers at each time-step. 31 | self.prev_ks = [] 32 | 33 | # The outputs at each time-step. 34 | self.next_ys = [self.tt.LongTensor(size) 35 | .fill_(pad)] 36 | self.next_ys[0][0] = bos 37 | 38 | # Has EOS topped the beam yet. 39 | self._eos = eos 40 | self.eos_top = False 41 | 42 | # The attentions (matrix) for each time. 43 | self.attn = [] 44 | 45 | # Time and k pair for finished. 46 | self.finished = [] 47 | self.n_best = n_best 48 | 49 | # Information for global scoring. 50 | self.global_scorer = global_scorer 51 | self.global_state = {} 52 | 53 | # Minimum prediction length 54 | self.min_length = min_length 55 | 56 | def get_current_state(self): 57 | "Get the outputs for the current timestep." 58 | return self.next_ys[-1] 59 | 60 | def get_current_origin(self): 61 | "Get the backpointers for the current timestep." 62 | return self.prev_ks[-1] 63 | 64 | def advance(self, word_probs, attn_out): 65 | """ 66 | Given prob over words for every last beam `wordLk` and attention 67 | `attn_out`: Compute and update the beam search. 68 | 69 | Parameters: 70 | 71 | * `word_probs`- probs of advancing from the last step (K x words) 72 | * `attn_out`- attention at the last step 73 | 74 | Returns: True if beam search is complete. 75 | """ 76 | num_words = word_probs.size(1) 77 | 78 | # force the output to be longer than self.min_length 79 | cur_len = len(self.next_ys) 80 | if cur_len < self.min_length: 81 | for k in range(len(word_probs)): 82 | word_probs[k][self._eos] = -1e20 83 | 84 | # Sum the previous scores. 85 | if len(self.prev_ks) > 0: 86 | beam_scores = word_probs + \ 87 | self.scores.unsqueeze(1).expand_as(word_probs) 88 | 89 | # Don't let EOS have children. 90 | for i in range(self.next_ys[-1].size(0)): 91 | if self.next_ys[-1][i] == self._eos: 92 | beam_scores[i] = -1e20 93 | else: 94 | beam_scores = word_probs[0] 95 | flat_beam_scores = beam_scores.view(-1) 96 | best_scores, best_scores_id = flat_beam_scores.topk(self.size, 0, 97 | True, True) 98 | 99 | self.all_scores.append(self.scores) 100 | self.scores = best_scores 101 | 102 | # best_scores_id is flattened beam x word array, so calculate which 103 | # word and beam each score came from 104 | prev_k = best_scores_id / num_words 105 | self.prev_ks.append(prev_k) 106 | self.next_ys.append((best_scores_id - prev_k * num_words)) 107 | self.attn.append(attn_out.index_select(0, prev_k)) 108 | 109 | if self.global_scorer is not None: 110 | self.global_scorer.update_global_state(self) 111 | 112 | for i in range(self.next_ys[-1].size(0)): 113 | if self.next_ys[-1][i] == self._eos: 114 | s = self.scores[i] 115 | if self.global_scorer is not None: 116 | global_scores = self.global_scorer.score(self, self.scores) 117 | s = global_scores[i] 118 | self.finished.append((s, len(self.next_ys) - 1, i)) 119 | 120 | # End condition is when top-of-beam is EOS and no global score. 121 | if self.next_ys[-1][0] == self._eos: 122 | # self.all_scores.append(self.scores) 123 | self.eos_top = True 124 | 125 | def done(self): 126 | return self.eos_top and len(self.finished) >= self.n_best 127 | 128 | def sort_finished(self, minimum=None): 129 | if minimum is not None: 130 | i = 0 131 | # Add from beam until we have minimum outputs. 132 | while len(self.finished) < minimum: 133 | s = self.scores[i] 134 | if self.global_scorer is not None: 135 | global_scores = self.global_scorer.score(self, self.scores) 136 | s = global_scores[i] 137 | self.finished.append((s, len(self.next_ys) - 1, i)) 138 | 139 | self.finished.sort(key=lambda a: -a[0]) 140 | scores = [sc for sc, _, _ in self.finished] 141 | ks = [(t, k) for _, t, k in self.finished] 142 | return scores, ks 143 | 144 | def get_hyp(self, timestep, k): 145 | """ 146 | Walk back to construct the full hypothesis. 147 | """ 148 | hyp, attn = [], [] 149 | for j in range(len(self.prev_ks[:timestep]) - 1, -1, -1): 150 | hyp.append(self.next_ys[j+1][k]) 151 | attn.append(self.attn[j][k]) 152 | k = self.prev_ks[j][k] 153 | return hyp[::-1], torch.stack(attn[::-1]) 154 | 155 | 156 | class GNMTGlobalScorer(object): 157 | """ 158 | NMT re-ranking score from 159 | "Google's Neural Machine Translation System" :cite:`wu2016google` 160 | 161 | Args: 162 | alpha (float): length parameter 163 | beta (float): coverage parameter 164 | """ 165 | def __init__(self, alpha, beta): 166 | self.alpha = alpha 167 | self.beta = beta 168 | 169 | def score(self, beam, logprobs): 170 | "Additional term add to log probability" 171 | cov = beam.global_state["coverage"] 172 | pen = self.beta * torch.min(cov, cov.clone().fill_(1.0)).log().sum(1) 173 | l_term = (((5 + len(beam.next_ys)) ** self.alpha) / 174 | ((5 + 1) ** self.alpha)) 175 | return (logprobs / l_term) + pen 176 | 177 | def update_global_state(self, beam): 178 | "Keeps the coverage vector as sum of attens" 179 | if len(beam.prev_ks) == 1: 180 | beam.global_state["coverage"] = beam.attn[-1] 181 | else: 182 | beam.global_state["coverage"] = beam.global_state["coverage"] \ 183 | .index_select(0, beam.prev_ks[-1]).add(beam.attn[-1]) 184 | -------------------------------------------------------------------------------- /onmt/translate/Translation.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, unicode_literals 2 | 3 | import torch 4 | import onmt.io 5 | 6 | 7 | class TranslationBuilder(object): 8 | """ 9 | Build a word-based translation from the batch output 10 | of translator and the underlying dictionaries. 11 | 12 | Replacement based on "Addressing the Rare Word 13 | Problem in Neural Machine Translation" :cite:`Luong2015b` 14 | 15 | Args: 16 | data (DataSet): 17 | fields (dict of Fields): data fields 18 | n_best (int): number of translations produced 19 | replace_unk (bool): replace unknown words using attention 20 | has_tgt (bool): will the batch have gold targets 21 | """ 22 | def __init__(self, data, fields, n_best=1, replace_unk=False, 23 | has_tgt=False): 24 | self.data = data 25 | self.fields = fields 26 | self.n_best = n_best 27 | self.replace_unk = replace_unk 28 | self.has_tgt = has_tgt 29 | 30 | def _build_target_tokens(self, src, src_vocab, src_raw, pred, attn): 31 | vocab = self.fields["tgt"].vocab 32 | tokens = [] 33 | for tok in pred: 34 | if tok < len(vocab): 35 | tokens.append(vocab.itos[tok]) 36 | else: 37 | tokens.append(src_vocab.itos[tok - len(vocab)]) 38 | if tokens[-1] == onmt.io.EOS_WORD: 39 | tokens = tokens[:-1] 40 | break 41 | if self.replace_unk and (attn is not None) and (src is not None): 42 | for i in range(len(tokens)): 43 | if tokens[i] == vocab.itos[onmt.io.UNK]: 44 | _, maxIndex = attn[i].max(0) 45 | tokens[i] = src_raw[maxIndex[0]] 46 | return tokens 47 | 48 | def from_batch(self, translation_batch): 49 | batch = translation_batch["batch"] 50 | assert(len(translation_batch["gold_score"]) == 51 | len(translation_batch["predictions"])) 52 | batch_size = batch.batch_size 53 | 54 | preds, pred_score, attn, gold_score, indices = list(zip( 55 | *sorted(zip(translation_batch["predictions"], 56 | translation_batch["scores"], 57 | translation_batch["attention"], 58 | translation_batch["gold_score"], 59 | batch.indices.data), 60 | key=lambda x: x[-1]))) 61 | 62 | # Sorting 63 | inds, perm = torch.sort(batch.indices.data) 64 | data_type = self.data.data_type 65 | if data_type == 'text': 66 | src = batch.src[0].data.index_select(1, perm) 67 | else: 68 | src = None 69 | 70 | if self.has_tgt: 71 | tgt = batch.tgt.data.index_select(1, perm) 72 | else: 73 | tgt = None 74 | 75 | translations = [] 76 | for b in range(batch_size): 77 | if data_type == 'text': 78 | src_vocab = self.data.src_vocabs[inds[b]] \ 79 | if self.data.src_vocabs else None 80 | src_raw = self.data.examples[inds[b]].src 81 | else: 82 | src_vocab = None 83 | src_raw = None 84 | pred_sents = [self._build_target_tokens( 85 | src[:, b] if src is not None else None, 86 | src_vocab, src_raw, 87 | preds[b][n], attn[b][n]) 88 | for n in range(self.n_best)] 89 | gold_sent = None 90 | if tgt is not None: 91 | gold_sent = self._build_target_tokens( 92 | src[:, b] if src is not None else None, 93 | src_vocab, src_raw, 94 | tgt[1:, b] if tgt is not None else None, None) 95 | 96 | translation = Translation(src[:, b] if src is not None else None, 97 | src_raw, pred_sents, 98 | attn[b], pred_score[b], gold_sent, 99 | gold_score[b]) 100 | translations.append(translation) 101 | 102 | return translations 103 | 104 | 105 | class Translation(object): 106 | """ 107 | Container for a translated sentence. 108 | 109 | Attributes: 110 | src (`LongTensor`): src word ids 111 | src_raw ([str]): raw src words 112 | 113 | pred_sents ([[str]]): words from the n-best translations 114 | pred_scores ([[float]]): log-probs of n-best translations 115 | attns ([`FloatTensor`]) : attention dist for each translation 116 | gold_sent ([str]): words from gold translation 117 | gold_score ([float]): log-prob of gold translation 118 | 119 | """ 120 | def __init__(self, src, src_raw, pred_sents, 121 | attn, pred_scores, tgt_sent, gold_score): 122 | self.src = src 123 | self.src_raw = src_raw 124 | self.pred_sents = pred_sents 125 | self.attns = attn 126 | self.pred_scores = pred_scores 127 | self.gold_sent = tgt_sent 128 | self.gold_score = gold_score 129 | 130 | def log(self, sent_number): 131 | """ 132 | Log translation to stdout. 133 | """ 134 | output = '\nSENT {}: {}\n'.format(sent_number, self.src_raw) 135 | 136 | best_pred = self.pred_sents[0] 137 | best_score = self.pred_scores[0] 138 | pred_sent = ' '.join(best_pred) 139 | output += 'PRED {}: {}\n'.format(sent_number, pred_sent) 140 | print("PRED SCORE: {:.4f}".format(best_score)) 141 | 142 | if self.gold_sent is not None: 143 | tgt_sent = ' '.join(self.gold_sent) 144 | output += 'GOLD {}: {}\n'.format(sent_number, tgt_sent) 145 | output += ("GOLD SCORE: {:.4f}".format(self.gold_score)) 146 | 147 | if len(self.pred_sents) > 1: 148 | print('\nBEST HYP:') 149 | for score, sent in zip(self.pred_scores, self.pred_sents): 150 | output += "[{:.4f}] {}\n".format(score, sent) 151 | 152 | return output 153 | -------------------------------------------------------------------------------- /onmt/translate/Translator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | 4 | import onmt.translate.Beam 5 | import onmt.io 6 | 7 | 8 | class Translator(object): 9 | """ 10 | Uses a model to translate a batch of sentences. 11 | 12 | 13 | Args: 14 | model (:obj:`onmt.modules.NMTModel`): 15 | NMT model to use for translation 16 | fields (dict of Fields): data fields 17 | beam_size (int): size of beam to use 18 | n_best (int): number of translations produced 19 | max_length (int): maximum length output to produce 20 | global_scores (:obj:`GlobalScorer`): 21 | object to rescore final translations 22 | copy_attn (bool): use copy attention during translation 23 | cuda (bool): use cuda 24 | beam_trace (bool): trace beam search for debugging 25 | """ 26 | def __init__(self, model, fields, 27 | beam_size, n_best=1, 28 | max_length=100, 29 | global_scorer=None, copy_attn=False, cuda=False, 30 | beam_trace=False, min_length=0): 31 | self.model = model 32 | self.fields = fields 33 | self.n_best = n_best 34 | self.max_length = max_length 35 | self.global_scorer = global_scorer 36 | self.copy_attn = copy_attn 37 | self.beam_size = beam_size 38 | self.cuda = cuda 39 | self.min_length = min_length 40 | 41 | # for debugging 42 | self.beam_accum = None 43 | if beam_trace: 44 | self.beam_accum = { 45 | "predicted_ids": [], 46 | "beam_parent_ids": [], 47 | "scores": [], 48 | "log_probs": []} 49 | 50 | def translate_batch(self, batch, data): 51 | """ 52 | Translate a batch of sentences. 53 | 54 | Mostly a wrapper around :obj:`Beam`. 55 | 56 | Args: 57 | batch (:obj:`Batch`): a batch from a dataset object 58 | data (:obj:`Dataset`): the dataset object 59 | 60 | 61 | Todo: 62 | Shouldn't need the original dataset. 63 | """ 64 | 65 | # (0) Prep each of the components of the search. 66 | # And helper method for reducing verbosity. 67 | beam_size = self.beam_size 68 | batch_size = batch.batch_size 69 | data_type = data.data_type 70 | vocab = self.fields["tgt"].vocab 71 | beam = [onmt.translate.Beam(beam_size, n_best=self.n_best, 72 | cuda=self.cuda, 73 | global_scorer=self.global_scorer, 74 | pad=vocab.stoi[onmt.io.PAD_WORD], 75 | eos=vocab.stoi[onmt.io.EOS_WORD], 76 | bos=vocab.stoi[onmt.io.BOS_WORD], 77 | min_length=self.min_length) 78 | for __ in range(batch_size)] 79 | 80 | # Help functions for working with beams and batches 81 | def var(a): return Variable(a, volatile=True) 82 | 83 | def rvar(a): return var(a.repeat(1, beam_size, 1)) 84 | 85 | def bottle(m): 86 | return m.view(batch_size * beam_size, -1) 87 | 88 | def unbottle(m): 89 | return m.view(beam_size, batch_size, -1) 90 | 91 | # (1) Run the encoder on the src. 92 | src = onmt.io.make_features(batch, 'src', data_type) 93 | src_lengths = None 94 | if data_type == 'text': 95 | _, src_lengths = batch.src 96 | 97 | enc_states, context = self.model.encoder(src, src_lengths) 98 | dec_states = self.model.decoder.init_decoder_state( 99 | src, context, enc_states) 100 | 101 | if src_lengths is None: 102 | src_lengths = torch.Tensor(batch_size).type_as(context.data)\ 103 | .long()\ 104 | .fill_(context.size(0)) 105 | 106 | # (2) Repeat src objects `beam_size` times. 107 | src_map = rvar(batch.src_map.data) \ 108 | if data_type == 'text' and self.copy_attn else None 109 | context = rvar(context.data) 110 | context_lengths = src_lengths.repeat(beam_size) 111 | dec_states.repeat_beam_size_times(beam_size) 112 | 113 | # (3) run the decoder to generate sentences, using beam search. 114 | for i in range(self.max_length): 115 | if all((b.done() for b in beam)): 116 | break 117 | 118 | # Construct batch x beam_size nxt words. 119 | # Get all the pending current beam words and arrange for forward. 120 | inp = var(torch.stack([b.get_current_state() for b in beam]) 121 | .t().contiguous().view(1, -1)) 122 | 123 | # Turn any copied words to UNKs 124 | # 0 is unk 125 | if self.copy_attn: 126 | inp = inp.masked_fill( 127 | inp.gt(len(self.fields["tgt"].vocab) - 1), 0) 128 | 129 | # Temporary kludge solution to handle changed dim expectation 130 | # in the decoder 131 | inp = inp.unsqueeze(2) 132 | 133 | # Run one step. 134 | dec_out, dec_states, attn = self.model.decoder( 135 | inp, context, dec_states, context_lengths=context_lengths) 136 | dec_out = dec_out.squeeze(0) 137 | # dec_out: beam x rnn_size 138 | 139 | # (b) Compute a vector of batch*beam word scores. 140 | if not self.copy_attn: 141 | out = self.model.generator.forward(dec_out).data 142 | out = unbottle(out) 143 | # beam x tgt_vocab 144 | else: 145 | out = self.model.generator.forward(dec_out, 146 | attn["copy"].squeeze(0), 147 | src_map) 148 | # beam x (tgt_vocab + extra_vocab) 149 | out = data.collapse_copy_scores( 150 | unbottle(out.data), 151 | batch, self.fields["tgt"].vocab, data.src_vocabs) 152 | # beam x tgt_vocab 153 | out = out.log() 154 | 155 | # (c) Advance each beam. 156 | for j, b in enumerate(beam): 157 | b.advance( 158 | out[:, j], 159 | unbottle(attn["std"]).data[:, j, :context_lengths[j]]) 160 | dec_states.beam_update(j, b.get_current_origin(), beam_size) 161 | 162 | # (4) Extract sentences from beam. 163 | ret = self._from_beam(beam) 164 | ret["gold_score"] = [0] * batch_size 165 | if "tgt" in batch.__dict__: 166 | ret["gold_score"] = self._run_target(batch, data) 167 | ret["batch"] = batch 168 | return ret 169 | 170 | def _from_beam(self, beam): 171 | ret = {"predictions": [], 172 | "scores": [], 173 | "attention": []} 174 | for b in beam: 175 | n_best = self.n_best 176 | scores, ks = b.sort_finished(minimum=n_best) 177 | hyps, attn = [], [] 178 | for i, (times, k) in enumerate(ks[:n_best]): 179 | hyp, att = b.get_hyp(times, k) 180 | hyps.append(hyp) 181 | attn.append(att) 182 | ret["predictions"].append(hyps) 183 | ret["scores"].append(scores) 184 | ret["attention"].append(attn) 185 | return ret 186 | 187 | def _run_target(self, batch, data): 188 | data_type = data.data_type 189 | if data_type == 'text': 190 | _, src_lengths = batch.src 191 | else: 192 | src_lengths = None 193 | src = onmt.io.make_features(batch, 'src', data_type) 194 | tgt_in = onmt.io.make_features(batch, 'tgt')[:-1] 195 | 196 | # (1) run the encoder on the src 197 | enc_states, context = self.model.encoder(src, src_lengths) 198 | dec_states = self.model.decoder.init_decoder_state(src, 199 | context, enc_states) 200 | 201 | # (2) if a target is specified, compute the 'goldScore' 202 | # (i.e. log likelihood) of the target under the model 203 | tt = torch.cuda if self.cuda else torch 204 | gold_scores = tt.FloatTensor(batch.batch_size).fill_(0) 205 | dec_out, dec_states, attn = self.model.decoder( 206 | tgt_in, context, dec_states, context_lengths=src_lengths) 207 | 208 | tgt_pad = self.fields["tgt"].vocab.stoi[onmt.io.PAD_WORD] 209 | for dec, tgt in zip(dec_out, batch.tgt[1:].data): 210 | # Log prob of each word. 211 | out = self.model.generator.forward(dec) 212 | tgt = tgt.unsqueeze(1) 213 | scores = out.data.gather(1, tgt) 214 | scores.masked_fill_(tgt.eq(tgt_pad), 0) 215 | gold_scores += scores 216 | return gold_scores 217 | -------------------------------------------------------------------------------- /onmt/translate/__init__.py: -------------------------------------------------------------------------------- 1 | from onmt.translate.Translator import Translator 2 | from onmt.translate.TranslatorMultimodalVI import TranslatorMultimodalVI 3 | from onmt.translate.Translation import Translation, TranslationBuilder 4 | from onmt.translate.Beam import Beam, GNMTGlobalScorer 5 | 6 | __all__ = [Translator, 7 | Translation, Beam, GNMTGlobalScorer, TranslationBuilder] 8 | __all__ += [TranslatorMultimodalVI] 9 | -------------------------------------------------------------------------------- /onmt/translate/__pycache__/Beam.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/translate/__pycache__/Beam.cpython-36.pyc -------------------------------------------------------------------------------- /onmt/translate/__pycache__/Translation.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/translate/__pycache__/Translation.cpython-36.pyc -------------------------------------------------------------------------------- /onmt/translate/__pycache__/Translator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/translate/__pycache__/Translator.cpython-36.pyc -------------------------------------------------------------------------------- /onmt/translate/__pycache__/TranslatorMultimodalVI.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/translate/__pycache__/TranslatorMultimodalVI.cpython-36.pyc -------------------------------------------------------------------------------- /onmt/translate/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iacercalixto/variational_mmt/59e000d793fd2805d5de34d8ac2046dcbee0c90f/onmt/translate/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import os 6 | import glob 7 | import sys 8 | import torch 9 | import onmt.io 10 | import opts 11 | 12 | 13 | def check_existing_pt_files(opt): 14 | # We will use glob.glob() to find sharded {train|valid}.[0-9]*.pt 15 | # when training, so check to avoid tampering with existing pt files 16 | # or mixing them up. 17 | for t in ['train', 'valid', 'vocab']: 18 | pattern = opt.save_data + '.' + t + '*.pt' 19 | if glob.glob(pattern): 20 | sys.stderr.write("Please backup exisiting pt file: %s, " 21 | "to avoid tampering!\n" % pattern) 22 | sys.exit(1) 23 | 24 | 25 | def parse_args(): 26 | parser = argparse.ArgumentParser( 27 | description='preprocess.py', 28 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 29 | 30 | opts.add_md_help_argument(parser) 31 | opts.preprocess_opts(parser) 32 | 33 | opt = parser.parse_args() 34 | torch.manual_seed(opt.seed) 35 | 36 | check_existing_pt_files(opt) 37 | 38 | return opt 39 | 40 | 41 | def build_save_text_dataset_in_shards(src_corpus, tgt_corpus, fields, 42 | corpus_type, opt): 43 | ''' 44 | Divide the big corpus into shards, and build dataset separately. 45 | This is currently only for data_type=='text'. 46 | 47 | The reason we do this is to avoid taking up too much memory due 48 | to sucking in a huge corpus file. 49 | 50 | To tackle this, we only read in part of the corpus file of size 51 | `max_shard_size`(actually it is multiples of 64 bytes that equals 52 | or is slightly larger than this size), and process it into dataset, 53 | then write it to disk along the way. By doing this, we only focus on 54 | part of the corpus at any moment, thus effectively reducing memory use. 55 | According to test, this method can reduce memory footprint by ~50%. 56 | 57 | Note! As we process along the shards, previous shards might still 58 | stay in memory, but since we are done with them, and no more 59 | reference to them, if there is memory tight situation, the OS could 60 | easily reclaim these memory. 61 | 62 | If `max_shard_size` is 0 or is larger than the corpus size, it is 63 | effectively preprocessed into one dataset, i.e. no sharding. 64 | 65 | NOTE! `max_shard_size` is measuring the input corpus size, not the 66 | output pt file size. So a shard pt file consists of examples of size 67 | 2 * `max_shard_size`(source + target). 68 | ''' 69 | 70 | corpus_size = os.path.getsize(src_corpus) 71 | if corpus_size > 10 * (1024**2) and opt.max_shard_size == 0: 72 | print("Warning. The corpus %s is larger than 10M bytes, you can " 73 | "set '-max_shard_size' to process it by small shards " 74 | "to use less memory." % src_corpus) 75 | 76 | if opt.max_shard_size != 0: 77 | print(' * divide corpus into shards and build dataset separately' 78 | '(shard_size = %d bytes).' % opt.max_shard_size) 79 | 80 | ret_list = [] 81 | src_iter = onmt.io.ShardedTextCorpusIterator( 82 | src_corpus, opt.src_seq_length_trunc, 83 | "src", opt.max_shard_size) 84 | tgt_iter = onmt.io.ShardedTextCorpusIterator( 85 | tgt_corpus, opt.tgt_seq_length_trunc, 86 | "tgt", opt.max_shard_size, 87 | assoc_iter=src_iter) 88 | 89 | index = 0 90 | while not src_iter.hit_end(): 91 | index += 1 92 | dataset = onmt.io.TextDataset( 93 | fields, src_iter, tgt_iter, 94 | src_iter.num_feats, tgt_iter.num_feats, 95 | src_seq_length=opt.src_seq_length, 96 | tgt_seq_length=opt.tgt_seq_length, 97 | dynamic_dict=opt.dynamic_dict) 98 | 99 | # We save fields in vocab.pt seperately, so make it empty. 100 | dataset.fields = [] 101 | 102 | pt_file = "{:s}.{:s}.{:d}.pt".format( 103 | opt.save_data, corpus_type, index) 104 | print(" * saving %s data shard to %s." % (corpus_type, pt_file)) 105 | torch.save(dataset, pt_file) 106 | 107 | ret_list.append(pt_file) 108 | 109 | return ret_list 110 | 111 | 112 | def build_save_dataset(corpus_type, fields, opt): 113 | assert corpus_type in ['train', 'valid'] 114 | 115 | if corpus_type == 'train': 116 | src_corpus = opt.train_src 117 | tgt_corpus = opt.train_tgt 118 | else: 119 | src_corpus = opt.valid_src 120 | tgt_corpus = opt.valid_tgt 121 | 122 | # Currently we only do preprocess sharding for corpus: data_type=='text'. 123 | if opt.data_type == 'text': 124 | return build_save_text_dataset_in_shards( 125 | src_corpus, tgt_corpus, fields, 126 | corpus_type, opt) 127 | 128 | # For data_type == 'img' or 'audio', currently we don't do 129 | # preprocess sharding. We only build a monolithic dataset. 130 | # But since the interfaces are uniform, it would be not hard 131 | # to do this should users need this feature. 132 | dataset = onmt.io.build_dataset( 133 | fields, opt.data_type, src_corpus, tgt_corpus, 134 | src_dir=opt.src_dir, 135 | src_seq_length=opt.src_seq_length, 136 | tgt_seq_length=opt.tgt_seq_length, 137 | src_seq_length_trunc=opt.src_seq_length_trunc, 138 | tgt_seq_length_trunc=opt.tgt_seq_length_trunc, 139 | dynamic_dict=opt.dynamic_dict, 140 | sample_rate=opt.sample_rate, 141 | window_size=opt.window_size, 142 | window_stride=opt.window_stride, 143 | window=opt.window) 144 | 145 | # We save fields in vocab.pt seperately, so make it empty. 146 | dataset.fields = [] 147 | 148 | pt_file = "{:s}.{:s}.pt".format(opt.save_data, corpus_type) 149 | print(" * saving %s dataset to %s." % (corpus_type, pt_file)) 150 | torch.save(dataset, pt_file) 151 | 152 | return [pt_file] 153 | 154 | 155 | def build_save_vocab(train_dataset, fields, opt): 156 | fields = onmt.io.build_vocab(train_dataset, fields, opt.data_type, 157 | opt.share_vocab, 158 | opt.src_vocab, 159 | opt.src_vocab_size, 160 | opt.src_words_min_frequency, 161 | opt.tgt_vocab, 162 | opt.tgt_vocab_size, 163 | opt.tgt_words_min_frequency) 164 | 165 | # Can't save fields, so remove/reconstruct at training time. 166 | vocab_file = opt.save_data + '.vocab.pt' 167 | torch.save(onmt.io.save_fields_to_vocab(fields), vocab_file) 168 | 169 | 170 | def main(): 171 | opt = parse_args() 172 | 173 | print("opt.train_src", opt.train_src) 174 | print("opt.train_tgt", opt.train_tgt) 175 | 176 | print("Extracting features...") 177 | src_nfeats = onmt.io.get_num_features(opt.data_type, opt.train_src, 'src') 178 | tgt_nfeats = onmt.io.get_num_features(opt.data_type, opt.train_tgt, 'tgt') 179 | print(" * number of source features: %d." % src_nfeats) 180 | print(" * number of target features: %d." % tgt_nfeats) 181 | 182 | print("Building `Fields` object...") 183 | fields = onmt.io.get_fields(opt.data_type, src_nfeats, tgt_nfeats) 184 | 185 | print("Building & saving training data...") 186 | train_dataset_files = build_save_dataset('train', fields, opt) 187 | 188 | print("Building & saving vocabulary...") 189 | build_save_vocab(train_dataset_files, fields, opt) 190 | 191 | print("Building & saving validation data...") 192 | build_save_dataset('valid', fields, opt) 193 | 194 | 195 | if __name__ == "__main__": 196 | main() 197 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch==0.3.1 2 | tables 3 | torchvision==0.2.0 4 | pretrainedmodels 5 | six 6 | tqdm 7 | torchtext==0.2.3 8 | future 9 | -------------------------------------------------------------------------------- /run_translated_m30k_only.sh: -------------------------------------------------------------------------------- 1 | # this script assumes there are 2 GPU cards available in this machine (at least) 2 | # please edit the script accordingly in case there are less GPU cards 3 | 4 | DATA_PATH="/path/to/data/multi30k" 5 | MODEL_PATH="/path/to/variational-multimodal-nmt-model-snapshots" 6 | MODEL_FILE_NAME="MMT_VI_Model_TranslatedM30K" 7 | 8 | 9 | # multi30k validation set 10 | VAL_SRC="${DATA_PATH}/val.lc.norm.tok.bpe-en-de-30000.en" 11 | VAL_TGT="${DATA_PATH}/val.lc.norm.tok.bpe-en-de-30000.de" 12 | VAL_IMGS="${DATA_PATH}/flickr30k_valid_resnet50_cnn_features.hdf5" 13 | 14 | # multi30k training set 15 | TRAIN_SRC="${DATA_PATH}/train.lc.norm.tok.bpe-en-de-30000.en" 16 | TRAIN_TGT="${DATA_PATH}/train.lc.norm.tok.bpe-en-de-30000.de" 17 | TRAIN_IMGS="${DATA_PATH}/flickr30k_train_resnet50_cnn_features.hdf5" 18 | 19 | # multi30k test set (2016) 20 | TEST_2016_SRC="${DATA_PATH}/test_2016_flickr.lc.norm.tok.bpe-en-de-30000.en" 21 | TEST_2016_TGT="${DATA_PATH}/test_2016_flickr.lc.norm.tok.bpe-en-de-30000.de" 22 | TEST_2016_IMGS="${DATA_PATH}/flickr30k_test_resnet50_cnn_features.hdf5" 23 | 24 | # multi30k test set (2017) 25 | TEST_2017_SRC="${DATA_PATH}/test_2017_flickr.lc.norm.tok.bpe-en-de-30000.en" 26 | TEST_2017_TGT="${DATA_PATH}/test_2017_flickr.lc.norm.tok.bpe-en-de-30000.de" 27 | TEST_2017_IMGS="${DATA_PATH}/flickr30k_test_2017_flickr_resnet50_cnn_features.hdf5" 28 | 29 | # ambiguous MSCOCO test set (2017) 30 | TEST_2017_MSCOCO_SRC="${DATA_PATH}/test_2017_mscoco.lc.norm.tok.bpe-en-de-30000.en" 31 | TEST_2017_MSCOCO_TGT="${DATA_PATH}/test_2017_mscoco.lc.norm.tok.bpe-en-de-30000.de" 32 | TEST_2017_MSCOCO_IMGS="${DATA_PATH}/flickr30k_test_2017_mscoco_resnet50_cnn_features.hdf5" 33 | 34 | EPOCHS=30 35 | #EPOCHS=1 36 | 37 | ########## 38 | # train 39 | ########## 40 | 41 | # train the model on the translated Multi30k data set only (~29K src/tgt/img instances) 42 | DATASET=${DATA_PATH}/m30k 43 | 44 | # train one conditional prior and one fixed-prior model 45 | # one model on gpu 0, another one on gpu 1 (both spawn validation set translations on gpu 1) 46 | python train_mm_vi_model1.py \ 47 | -gpuid 0 -epochs ${EPOCHS} -batch_size 40 -valid_batch_size 40 -optim 'adam' -learning_rate 0.002 -rnn_type LSTM \ 48 | -rnn_size 500 --z_latent_dim 500 \ 49 | -early_stopping_criteria 'bleu' \ 50 | -src ${VAL_SRC} \ 51 | -tgt ${VAL_TGT} \ 52 | -path_to_train_img_feats ${TRAIN_IMGS} \ 53 | -path_to_valid_img_feats ${VAL_IMGS} \ 54 | -data ${DATASET} \ 55 | --multimodal_model_type vi-model1 --use_global_image_features -dropout 0.5 -dropout_imgs 0.5 \ 56 | -save_model ${MODEL_PATH}/${MODEL_FILE_NAME}.fixed-prior \ 57 | -overwrite_model_file 2>&1 ${MODEL_PATH}/${MODEL_FILE_NAME}.fixed-prior.log & 58 | 59 | python train_mm_vi_model1.py \ 60 | -gpuid 1 -epochs ${EPOCHS} -batch_size 40 -valid_batch_size 40 -optim 'adam' -learning_rate 0.002 -rnn_type LSTM \ 61 | -rnn_size 500 --z_latent_dim 500 \ 62 | -early_stopping_criteria 'bleu' \ 63 | -src ${VAL_SRC} \ 64 | -tgt ${VAL_TGT} \ 65 | -path_to_train_img_feats ${TRAIN_IMGS} \ 66 | -path_to_valid_img_feats ${VAL_IMGS} \ 67 | -data ${DATASET} \ 68 | --multimodal_model_type vi-model1 --use_global_image_features -dropout 0.5 -dropout_imgs 0.5 \ 69 | -save_model ${MODEL_PATH}/${MODEL_FILE_NAME}.conditional-prior \ 70 | -overwrite_model_file \ 71 | --conditional 2>&1 ${MODEL_PATH}/${MODEL_FILE_NAME}.conditional-prior.log & 72 | 73 | wait; 74 | 75 | ############# 76 | # translate 77 | ############# 78 | 79 | # translate the validation set 80 | SPLIT="validation" 81 | python translate_mm_vi.py \ 82 | -model ${MODEL_PATH}/${MODEL_FILE_NAME}.fixed-prior_BestModelBleu.pt \ 83 | -src ${VAL_SRC} \ 84 | -path_to_test_img_feats ${VAL_IMGS} \ 85 | -gpu 0 \ 86 | -output ${MODEL_PATH}/${MODEL_FILE_NAME}.fixed-prior_BestModelBleu.pt.${SPLIT}-translations & 87 | 88 | python translate_mm_vi.py \ 89 | -model ${MODEL_PATH}/${MODEL_FILE_NAME}.conditional-prior_BestModelBleu.pt \ 90 | -src ${VAL_SRC} \ 91 | -path_to_test_img_feats ${VAL_IMGS} \ 92 | -gpu 1 \ 93 | -output ${MODEL_PATH}/${MODEL_FILE_NAME}.conditional-prior_BestModelBleu.pt.${SPLIT}-translations & 94 | 95 | wait; 96 | 97 | # translate the test set (2016) 98 | SPLIT="test2016" 99 | python translate_mm_vi.py \ 100 | -model ${MODEL_PATH}/${MODEL_FILE_NAME}.fixed-prior_BestModelBleu.pt \ 101 | -src ${TEST_2016_SRC} \ 102 | -path_to_test_img_feats ${TEST_2016_IMGS} \ 103 | -gpu 0 \ 104 | -output ${MODEL_PATH}/${MODEL_FILE_NAME}.fixed-prior_BestModelBleu.pt.${SPLIT}-translations & 105 | 106 | python translate_mm_vi.py \ 107 | -model ${MODEL_PATH}/${MODEL_FILE_NAME}.conditional-prior_BestModelBleu.pt \ 108 | -src ${TEST_2016_SRC} \ 109 | -path_to_test_img_feats ${TEST_2016_IMGS} \ 110 | -gpu 1 \ 111 | -output ${MODEL_PATH}/${MODEL_FILE_NAME}.conditional-prior_BestModelBleu.pt.${SPLIT}-translations & 112 | 113 | wait; 114 | 115 | # translate the test set (2017) 116 | SPLIT="test2017" 117 | python translate_mm_vi.py \ 118 | -model ${MODEL_PATH}/${MODEL_FILE_NAME}.fixed-prior_BestModelBleu.pt \ 119 | -src ${TEST_2017_SRC} \ 120 | -path_to_test_img_feats ${TEST_2017_IMGS} \ 121 | -gpu 0 \ 122 | -output ${MODEL_PATH}/${MODEL_FILE_NAME}.fixed-prior_BestModelBleu.pt.${SPLIT}-translations & 123 | 124 | python translate_mm_vi.py \ 125 | -model ${MODEL_PATH}/${MODEL_FILE_NAME}.conditional-prior_BestModelBleu.pt \ 126 | -src ${TEST_2017_SRC} \ 127 | -path_to_test_img_feats ${TEST_2017_IMGS} \ 128 | -gpu 1 \ 129 | -output ${MODEL_PATH}/${MODEL_FILE_NAME}.conditional-prior_BestModelBleu.pt.${SPLIT}-translations & 130 | 131 | wait; 132 | 133 | # translate the ambiguous MSCOCO test set (2017) 134 | SPLIT="test2017_mscoco" 135 | python translate_mm_vi.py \ 136 | -model ${MODEL_PATH}/${MODEL_FILE_NAME}.fixed-prior_BestModelBleu.pt \ 137 | -src ${TEST_2017_MSCOCO_SRC} \ 138 | -path_to_test_img_feats ${TEST_2017_MSCOCO_IMGS} \ 139 | -gpu 0 \ 140 | -output ${MODEL_PATH}/${MODEL_FILE_NAME}.fixed-prior_BestModelBleu.pt.${SPLIT}-translations & 141 | 142 | python translate_mm_vi.py \ 143 | -model ${MODEL_PATH}/${MODEL_FILE_NAME}.conditional-prior_BestModelBleu.pt \ 144 | -src ${TEST_2017_MSCOCO_SRC} \ 145 | -path_to_test_img_feats ${TEST_2017_MSCOCO_IMGS} \ 146 | -gpu 1 \ 147 | -output ${MODEL_PATH}/${MODEL_FILE_NAME}.conditional-prior_BestModelBleu.pt.${SPLIT}-translations & 148 | 149 | wait; 150 | 151 | echo -ne "Finished. Translations of valid/test 2016/test 2017 (Flickr and ambiguous MSCOCO) can be found in:\n${MODEL_PATH}/${MODEL_FILE_NAME}.{fixed,conditional}-prior_BestModelBleu.pt.{validation,test2016,test2017,test2017_mscoco}-translations\n" 152 | 153 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from setuptools import setup 4 | 5 | setup(name='OpenNMT-py', 6 | description='A python implementation of OpenNMT', 7 | version='0.1', 8 | packages=['onmt', 'onmt.io', 'onmt.translate', 'onmt.modules']) 9 | -------------------------------------------------------------------------------- /tools/multi-bleu.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 5 | 6 | # $Id$ 7 | use warnings; 8 | use strict; 9 | 10 | my $lowercase = 0; 11 | if ($ARGV[0] eq "-lc") { 12 | $lowercase = 1; 13 | shift; 14 | } 15 | 16 | my $stem = $ARGV[0]; 17 | if (!defined $stem) { 18 | print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n"; 19 | print STDERR "Reads the references from reference or reference0, reference1, ...\n"; 20 | exit(1); 21 | } 22 | 23 | $stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0"; 24 | 25 | my @REF; 26 | my $ref=0; 27 | while(-e "$stem$ref") { 28 | &add_to_ref("$stem$ref",\@REF); 29 | $ref++; 30 | } 31 | &add_to_ref($stem,\@REF) if -e $stem; 32 | die("ERROR: could not find reference file $stem") unless scalar @REF; 33 | 34 | # add additional references explicitly specified on the command line 35 | shift; 36 | foreach my $stem (@ARGV) { 37 | &add_to_ref($stem,\@REF) if -e $stem; 38 | } 39 | 40 | 41 | 42 | sub add_to_ref { 43 | my ($file,$REF) = @_; 44 | my $s=0; 45 | if ($file =~ /.gz$/) { 46 | open(REF,"gzip -dc $file|") or die "Can't read $file"; 47 | } else { 48 | open(REF,$file) or die "Can't read $file"; 49 | } 50 | while() { 51 | chop; 52 | push @{$$REF[$s++]}, $_; 53 | } 54 | close(REF); 55 | } 56 | 57 | my(@CORRECT,@TOTAL,$length_translation,$length_reference); 58 | my $s=0; 59 | while() { 60 | chop; 61 | $_ = lc if $lowercase; 62 | my @WORD = split; 63 | my %REF_NGRAM = (); 64 | my $length_translation_this_sentence = scalar(@WORD); 65 | my ($closest_diff,$closest_length) = (9999,9999); 66 | foreach my $reference (@{$REF[$s]}) { 67 | # print "$s $_ <=> $reference\n"; 68 | $reference = lc($reference) if $lowercase; 69 | my @WORD = split(' ',$reference); 70 | my $length = scalar(@WORD); 71 | my $diff = abs($length_translation_this_sentence-$length); 72 | if ($diff < $closest_diff) { 73 | $closest_diff = $diff; 74 | $closest_length = $length; 75 | # print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n"; 76 | } elsif ($diff == $closest_diff) { 77 | $closest_length = $length if $length < $closest_length; 78 | # from two references with the same closeness to me 79 | # take the *shorter* into account, not the "first" one. 80 | } 81 | for(my $n=1;$n<=4;$n++) { 82 | my %REF_NGRAM_N = (); 83 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 84 | my $ngram = "$n"; 85 | for(my $w=0;$w<$n;$w++) { 86 | $ngram .= " ".$WORD[$start+$w]; 87 | } 88 | $REF_NGRAM_N{$ngram}++; 89 | } 90 | foreach my $ngram (keys %REF_NGRAM_N) { 91 | if (!defined($REF_NGRAM{$ngram}) || 92 | $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) { 93 | $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram}; 94 | # print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}
\n"; 95 | } 96 | } 97 | } 98 | } 99 | $length_translation += $length_translation_this_sentence; 100 | $length_reference += $closest_length; 101 | for(my $n=1;$n<=4;$n++) { 102 | my %T_NGRAM = (); 103 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 104 | my $ngram = "$n"; 105 | for(my $w=0;$w<$n;$w++) { 106 | $ngram .= " ".$WORD[$start+$w]; 107 | } 108 | $T_NGRAM{$ngram}++; 109 | } 110 | foreach my $ngram (keys %T_NGRAM) { 111 | $ngram =~ /^(\d+) /; 112 | my $n = $1; 113 | # my $corr = 0; 114 | # print "$i e $ngram $T_NGRAM{$ngram}
\n"; 115 | $TOTAL[$n] += $T_NGRAM{$ngram}; 116 | if (defined($REF_NGRAM{$ngram})) { 117 | if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) { 118 | $CORRECT[$n] += $T_NGRAM{$ngram}; 119 | # $corr = $T_NGRAM{$ngram}; 120 | # print "$i e correct1 $T_NGRAM{$ngram}
\n"; 121 | } 122 | else { 123 | $CORRECT[$n] += $REF_NGRAM{$ngram}; 124 | # $corr = $REF_NGRAM{$ngram}; 125 | # print "$i e correct2 $REF_NGRAM{$ngram}
\n"; 126 | } 127 | } 128 | # $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram}; 129 | # print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n" 130 | } 131 | } 132 | $s++; 133 | } 134 | my $brevity_penalty = 1; 135 | my $bleu = 0; 136 | 137 | my @bleu=(); 138 | 139 | for(my $n=1;$n<=4;$n++) { 140 | if (defined ($TOTAL[$n])){ 141 | $bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0; 142 | # print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n"; 143 | }else{ 144 | $bleu[$n]=0; 145 | } 146 | } 147 | 148 | if ($length_reference==0){ 149 | printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n"; 150 | exit(1); 151 | } 152 | 153 | if ($length_translation<$length_reference) { 154 | $brevity_penalty = exp(1-$length_reference/$length_translation); 155 | } 156 | $bleu = $brevity_penalty * exp((my_log( $bleu[1] ) + 157 | my_log( $bleu[2] ) + 158 | my_log( $bleu[3] ) + 159 | my_log( $bleu[4] ) ) / 4) ; 160 | printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n", 161 | 100*$bleu, 162 | 100*$bleu[1], 163 | 100*$bleu[2], 164 | 100*$bleu[3], 165 | 100*$bleu[4], 166 | $brevity_penalty, 167 | $length_translation / $length_reference, 168 | $length_translation, 169 | $length_reference; 170 | 171 | sub my_log { 172 | return -9999999999 unless $_[0]; 173 | return log($_[0]); 174 | } 175 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/README.txt: -------------------------------------------------------------------------------- 1 | The language suffix can be found here: 2 | 3 | http://www.loc.gov/standards/iso639-2/php/code_list.php 4 | 5 | This code includes data from Daniel Naber's Language Tools (czech abbreviations). 6 | This code includes data from czech wiktionary (also czech abbreviations). 7 | 8 | 9 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.ca: -------------------------------------------------------------------------------- 1 | Dr 2 | Dra 3 | pàg 4 | p 5 | c 6 | av 7 | Sr 8 | Sra 9 | adm 10 | esq 11 | Prof 12 | S.A 13 | S.L 14 | p.e 15 | ptes 16 | Sta 17 | St 18 | pl 19 | màx 20 | cast 21 | dir 22 | nre 23 | fra 24 | admdora 25 | Emm 26 | Excma 27 | espf 28 | dc 29 | admdor 30 | tel 31 | angl 32 | aprox 33 | ca 34 | dept 35 | dj 36 | dl 37 | dt 38 | ds 39 | dg 40 | dv 41 | ed 42 | entl 43 | al 44 | i.e 45 | maj 46 | smin 47 | n 48 | núm 49 | pta 50 | A 51 | B 52 | C 53 | D 54 | E 55 | F 56 | G 57 | H 58 | I 59 | J 60 | K 61 | L 62 | M 63 | N 64 | O 65 | P 66 | Q 67 | R 68 | S 69 | T 70 | U 71 | V 72 | W 73 | X 74 | Y 75 | Z 76 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.cs: -------------------------------------------------------------------------------- 1 | Bc 2 | BcA 3 | Ing 4 | Ing.arch 5 | MUDr 6 | MVDr 7 | MgA 8 | Mgr 9 | JUDr 10 | PhDr 11 | RNDr 12 | PharmDr 13 | ThLic 14 | ThDr 15 | Ph.D 16 | Th.D 17 | prof 18 | doc 19 | CSc 20 | DrSc 21 | dr. h. c 22 | PaedDr 23 | Dr 24 | PhMr 25 | DiS 26 | abt 27 | ad 28 | a.i 29 | aj 30 | angl 31 | anon 32 | apod 33 | atd 34 | atp 35 | aut 36 | bd 37 | biogr 38 | b.m 39 | b.p 40 | b.r 41 | cca 42 | cit 43 | cizojaz 44 | c.k 45 | col 46 | čes 47 | čín 48 | čj 49 | ed 50 | facs 51 | fasc 52 | fol 53 | fot 54 | franc 55 | h.c 56 | hist 57 | hl 58 | hrsg 59 | ibid 60 | il 61 | ind 62 | inv.č 63 | jap 64 | jhdt 65 | jv 66 | koed 67 | kol 68 | korej 69 | kl 70 | krit 71 | lat 72 | lit 73 | m.a 74 | maď 75 | mj 76 | mp 77 | násl 78 | např 79 | nepubl 80 | něm 81 | no 82 | nr 83 | n.s 84 | okr 85 | odd 86 | odp 87 | obr 88 | opr 89 | orig 90 | phil 91 | pl 92 | pokrač 93 | pol 94 | port 95 | pozn 96 | př.kr 97 | př.n.l 98 | přel 99 | přeprac 100 | příl 101 | pseud 102 | pt 103 | red 104 | repr 105 | resp 106 | revid 107 | rkp 108 | roč 109 | roz 110 | rozš 111 | samost 112 | sect 113 | sest 114 | seš 115 | sign 116 | sl 117 | srv 118 | stol 119 | sv 120 | šk 121 | šk.ro 122 | špan 123 | tab 124 | t.č 125 | tis 126 | tj 127 | tř 128 | tzv 129 | univ 130 | uspoř 131 | vol 132 | vl.jm 133 | vs 134 | vyd 135 | vyobr 136 | zal 137 | zejm 138 | zkr 139 | zprac 140 | zvl 141 | n.p 142 | např 143 | než 144 | MUDr 145 | abl 146 | absol 147 | adj 148 | adv 149 | ak 150 | ak. sl 151 | akt 152 | alch 153 | amer 154 | anat 155 | angl 156 | anglosas 157 | arab 158 | arch 159 | archit 160 | arg 161 | astr 162 | astrol 163 | att 164 | bás 165 | belg 166 | bibl 167 | biol 168 | boh 169 | bot 170 | bulh 171 | círk 172 | csl 173 | č 174 | čas 175 | čes 176 | dat 177 | děj 178 | dep 179 | dět 180 | dial 181 | dór 182 | dopr 183 | dosl 184 | ekon 185 | epic 186 | etnonym 187 | eufem 188 | f 189 | fam 190 | fem 191 | fil 192 | film 193 | form 194 | fot 195 | fr 196 | fut 197 | fyz 198 | gen 199 | geogr 200 | geol 201 | geom 202 | germ 203 | gram 204 | hebr 205 | herald 206 | hist 207 | hl 208 | hovor 209 | hud 210 | hut 211 | chcsl 212 | chem 213 | ie 214 | imp 215 | impf 216 | ind 217 | indoevr 218 | inf 219 | instr 220 | interj 221 | ión 222 | iron 223 | it 224 | kanad 225 | katalán 226 | klas 227 | kniž 228 | komp 229 | konj 230 | 231 | konkr 232 | kř 233 | kuch 234 | lat 235 | lék 236 | les 237 | lid 238 | lit 239 | liturg 240 | lok 241 | log 242 | m 243 | mat 244 | meteor 245 | metr 246 | mod 247 | ms 248 | mysl 249 | n 250 | náb 251 | námoř 252 | neklas 253 | něm 254 | nesklon 255 | nom 256 | ob 257 | obch 258 | obyč 259 | ojed 260 | opt 261 | part 262 | pas 263 | pejor 264 | pers 265 | pf 266 | pl 267 | plpf 268 | 269 | práv 270 | prep 271 | předl 272 | přivl 273 | r 274 | rcsl 275 | refl 276 | reg 277 | rkp 278 | ř 279 | řec 280 | s 281 | samohl 282 | sg 283 | sl 284 | souhl 285 | spec 286 | srov 287 | stfr 288 | střv 289 | stsl 290 | subj 291 | subst 292 | superl 293 | sv 294 | sz 295 | táz 296 | tech 297 | telev 298 | teol 299 | trans 300 | typogr 301 | var 302 | vedl 303 | verb 304 | vl. jm 305 | voj 306 | vok 307 | vůb 308 | vulg 309 | výtv 310 | vztaž 311 | zahr 312 | zájm 313 | zast 314 | zejm 315 | 316 | zeměd 317 | zkr 318 | zř 319 | mj 320 | dl 321 | atp 322 | sport 323 | Mgr 324 | horn 325 | MVDr 326 | JUDr 327 | RSDr 328 | Bc 329 | PhDr 330 | ThDr 331 | Ing 332 | aj 333 | apod 334 | PharmDr 335 | pomn 336 | ev 337 | slang 338 | nprap 339 | odp 340 | dop 341 | pol 342 | st 343 | stol 344 | p. n. l 345 | před n. l 346 | n. l 347 | př. Kr 348 | po Kr 349 | př. n. l 350 | odd 351 | RNDr 352 | tzv 353 | atd 354 | tzn 355 | resp 356 | tj 357 | p 358 | br 359 | č. j 360 | čj 361 | č. p 362 | čp 363 | a. s 364 | s. r. o 365 | spol. s r. o 366 | p. o 367 | s. p 368 | v. o. s 369 | k. s 370 | o. p. s 371 | o. s 372 | v. r 373 | v z 374 | ml 375 | vč 376 | kr 377 | mld 378 | hod 379 | popř 380 | ap 381 | event 382 | rus 383 | slov 384 | rum 385 | švýc 386 | P. T 387 | zvl 388 | hor 389 | dol 390 | S.O.S -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.de: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | #no german words end in single lower-case letters, so we throw those in too. 7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | 61 | #Roman Numerals. A dot after one of these is not a sentence break in German. 62 | I 63 | II 64 | III 65 | IV 66 | V 67 | VI 68 | VII 69 | VIII 70 | IX 71 | X 72 | XI 73 | XII 74 | XIII 75 | XIV 76 | XV 77 | XVI 78 | XVII 79 | XVIII 80 | XIX 81 | XX 82 | i 83 | ii 84 | iii 85 | iv 86 | v 87 | vi 88 | vii 89 | viii 90 | ix 91 | x 92 | xi 93 | xii 94 | xiii 95 | xiv 96 | xv 97 | xvi 98 | xvii 99 | xviii 100 | xix 101 | xx 102 | 103 | #Titles and Honorifics 104 | Adj 105 | Adm 106 | Adv 107 | Asst 108 | Bart 109 | Bldg 110 | Brig 111 | Bros 112 | Capt 113 | Cmdr 114 | Col 115 | Comdr 116 | Con 117 | Corp 118 | Cpl 119 | DR 120 | Dr 121 | Ens 122 | Gen 123 | Gov 124 | Hon 125 | Hosp 126 | Insp 127 | Lt 128 | MM 129 | MR 130 | MRS 131 | MS 132 | Maj 133 | Messrs 134 | Mlle 135 | Mme 136 | Mr 137 | Mrs 138 | Ms 139 | Msgr 140 | Op 141 | Ord 142 | Pfc 143 | Ph 144 | Prof 145 | Pvt 146 | Rep 147 | Reps 148 | Res 149 | Rev 150 | Rt 151 | Sen 152 | Sens 153 | Sfc 154 | Sgt 155 | Sr 156 | St 157 | Supt 158 | Surg 159 | 160 | #Misc symbols 161 | Mio 162 | Mrd 163 | bzw 164 | v 165 | vs 166 | usw 167 | d.h 168 | z.B 169 | u.a 170 | etc 171 | Mrd 172 | MwSt 173 | ggf 174 | d.J 175 | D.h 176 | m.E 177 | vgl 178 | I.F 179 | z.T 180 | sogen 181 | ff 182 | u.E 183 | g.U 184 | g.g.A 185 | c.-à-d 186 | Buchst 187 | u.s.w 188 | sog 189 | u.ä 190 | Std 191 | evtl 192 | Zt 193 | Chr 194 | u.U 195 | o.ä 196 | Ltd 197 | b.A 198 | z.Zt 199 | spp 200 | sen 201 | SA 202 | k.o 203 | jun 204 | i.H.v 205 | dgl 206 | dergl 207 | Co 208 | zzt 209 | usf 210 | s.p.a 211 | Dkr 212 | Corp 213 | bzgl 214 | BSE 215 | 216 | #Number indicators 217 | # add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it 218 | No 219 | Nos 220 | Art 221 | Nr 222 | pp 223 | ca 224 | Ca 225 | 226 | #Ordinals are done with . in German - "1." = "1st" in English 227 | 1 228 | 2 229 | 3 230 | 4 231 | 5 232 | 6 233 | 7 234 | 8 235 | 9 236 | 10 237 | 11 238 | 12 239 | 13 240 | 14 241 | 15 242 | 16 243 | 17 244 | 18 245 | 19 246 | 20 247 | 21 248 | 22 249 | 23 250 | 24 251 | 25 252 | 26 253 | 27 254 | 28 255 | 29 256 | 30 257 | 31 258 | 32 259 | 33 260 | 34 261 | 35 262 | 36 263 | 37 264 | 38 265 | 39 266 | 40 267 | 41 268 | 42 269 | 43 270 | 44 271 | 45 272 | 46 273 | 47 274 | 48 275 | 49 276 | 50 277 | 51 278 | 52 279 | 53 280 | 54 281 | 55 282 | 56 283 | 57 284 | 58 285 | 59 286 | 60 287 | 61 288 | 62 289 | 63 290 | 64 291 | 65 292 | 66 293 | 67 294 | 68 295 | 69 296 | 70 297 | 71 298 | 72 299 | 73 300 | 74 301 | 75 302 | 76 303 | 77 304 | 78 305 | 79 306 | 80 307 | 81 308 | 82 309 | 83 310 | 84 311 | 85 312 | 86 313 | 87 314 | 88 315 | 89 316 | 90 317 | 91 318 | 92 319 | 93 320 | 94 321 | 95 322 | 96 323 | 97 324 | 98 325 | 99 326 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.en: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 34 | Adj 35 | Adm 36 | Adv 37 | Asst 38 | Bart 39 | Bldg 40 | Brig 41 | Bros 42 | Capt 43 | Cmdr 44 | Col 45 | Comdr 46 | Con 47 | Corp 48 | Cpl 49 | DR 50 | Dr 51 | Drs 52 | Ens 53 | Gen 54 | Gov 55 | Hon 56 | Hr 57 | Hosp 58 | Insp 59 | Lt 60 | MM 61 | MR 62 | MRS 63 | MS 64 | Maj 65 | Messrs 66 | Mlle 67 | Mme 68 | Mr 69 | Mrs 70 | Ms 71 | Msgr 72 | Op 73 | Ord 74 | Pfc 75 | Ph 76 | Prof 77 | Pvt 78 | Rep 79 | Reps 80 | Res 81 | Rev 82 | Rt 83 | Sen 84 | Sens 85 | Sfc 86 | Sgt 87 | Sr 88 | St 89 | Supt 90 | Surg 91 | 92 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 93 | v 94 | vs 95 | i.e 96 | rev 97 | e.g 98 | 99 | #Numbers only. These should only induce breaks when followed by a numeric sequence 100 | # add NUMERIC_ONLY after the word for this function 101 | #This case is mostly for the english "No." which can either be a sentence of its own, or 102 | #if followed by a number, a non-breaking prefix 103 | No #NUMERIC_ONLY# 104 | Nos 105 | Art #NUMERIC_ONLY# 106 | Nr 107 | pp #NUMERIC_ONLY# 108 | 109 | #month abbreviations 110 | Jan 111 | Feb 112 | Mar 113 | Apr 114 | #May is a full word 115 | Jun 116 | Jul 117 | Aug 118 | Sep 119 | Oct 120 | Nov 121 | Dec 122 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.es: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | # Period-final abbreviation list from http://www.ctspanish.com/words/abbreviations.htm 34 | 35 | A.C 36 | Apdo 37 | Av 38 | Bco 39 | CC.AA 40 | Da 41 | Dep 42 | Dn 43 | Dr 44 | Dra 45 | EE.UU 46 | Excmo 47 | FF.CC 48 | Fil 49 | Gral 50 | J.C 51 | Let 52 | Lic 53 | N.B 54 | P.D 55 | P.V.P 56 | Prof 57 | Pts 58 | Rte 59 | S.A 60 | S.A.R 61 | S.E 62 | S.L 63 | S.R.C 64 | Sr 65 | Sra 66 | Srta 67 | Sta 68 | Sto 69 | T.V.E 70 | Tel 71 | Ud 72 | Uds 73 | V.B 74 | V.E 75 | Vd 76 | Vds 77 | a/c 78 | adj 79 | admón 80 | afmo 81 | apdo 82 | av 83 | c 84 | c.f 85 | c.g 86 | cap 87 | cm 88 | cta 89 | dcha 90 | doc 91 | ej 92 | entlo 93 | esq 94 | etc 95 | f.c 96 | gr 97 | grs 98 | izq 99 | kg 100 | km 101 | mg 102 | mm 103 | núm 104 | núm 105 | p 106 | p.a 107 | p.ej 108 | ptas 109 | pág 110 | págs 111 | pág 112 | págs 113 | q.e.g.e 114 | q.e.s.m 115 | s 116 | s.s.s 117 | vid 118 | vol 119 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.fi: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT 2 | #indicate an end-of-sentence marker. Special cases are included for prefixes 3 | #that ONLY appear before 0-9 numbers. 4 | 5 | #This list is compiled from omorfi database 6 | #by Tommi A Pirinen. 7 | 8 | 9 | #any single upper case letter followed by a period is not a sentence ender 10 | A 11 | B 12 | C 13 | D 14 | E 15 | F 16 | G 17 | H 18 | I 19 | J 20 | K 21 | L 22 | M 23 | N 24 | O 25 | P 26 | Q 27 | R 28 | S 29 | T 30 | U 31 | V 32 | W 33 | X 34 | Y 35 | Z 36 | Å 37 | Ä 38 | Ö 39 | 40 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 41 | alik 42 | alil 43 | amir 44 | apul 45 | apul.prof 46 | arkkit 47 | ass 48 | assist 49 | dipl 50 | dipl.arkkit 51 | dipl.ekon 52 | dipl.ins 53 | dipl.kielenk 54 | dipl.kirjeenv 55 | dipl.kosm 56 | dipl.urk 57 | dos 58 | erikoiseläinl 59 | erikoishammasl 60 | erikoisl 61 | erikoist 62 | ev.luutn 63 | evp 64 | fil 65 | ft 66 | hallinton 67 | hallintot 68 | hammaslääket 69 | jatk 70 | jääk 71 | kansaned 72 | kapt 73 | kapt.luutn 74 | kenr 75 | kenr.luutn 76 | kenr.maj 77 | kers 78 | kirjeenv 79 | kom 80 | kom.kapt 81 | komm 82 | konst 83 | korpr 84 | luutn 85 | maist 86 | maj 87 | Mr 88 | Mrs 89 | Ms 90 | M.Sc 91 | neuv 92 | nimim 93 | Ph.D 94 | prof 95 | puh.joht 96 | pääll 97 | res 98 | san 99 | siht 100 | suom 101 | sähköp 102 | säv 103 | toht 104 | toim 105 | toim.apul 106 | toim.joht 107 | toim.siht 108 | tuom 109 | ups 110 | vänr 111 | vääp 112 | ye.ups 113 | ylik 114 | ylil 115 | ylim 116 | ylimatr 117 | yliop 118 | yliopp 119 | ylip 120 | yliv 121 | 122 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall 123 | #into this category - it sometimes ends a sentence) 124 | e.g 125 | ent 126 | esim 127 | huom 128 | i.e 129 | ilm 130 | l 131 | mm 132 | myöh 133 | nk 134 | nyk 135 | par 136 | po 137 | t 138 | v 139 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.fr: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | # 4 | #any single upper case letter followed by a period is not a sentence ender 5 | #usually upper case letters are initials in a name 6 | #no French words end in single lower-case letters, so we throw those in too? 7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | #a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | # Period-final abbreviation list for French 61 | A.C.N 62 | A.M 63 | art 64 | ann 65 | apr 66 | av 67 | auj 68 | lib 69 | B.P 70 | boul 71 | ca 72 | c.-à-d 73 | cf 74 | ch.-l 75 | chap 76 | contr 77 | C.P.I 78 | C.Q.F.D 79 | C.N 80 | C.N.S 81 | C.S 82 | dir 83 | éd 84 | e.g 85 | env 86 | al 87 | etc 88 | E.V 89 | ex 90 | fasc 91 | fém 92 | fig 93 | fr 94 | hab 95 | ibid 96 | id 97 | i.e 98 | inf 99 | LL.AA 100 | LL.AA.II 101 | LL.AA.RR 102 | LL.AA.SS 103 | L.D 104 | LL.EE 105 | LL.MM 106 | LL.MM.II.RR 107 | loc.cit 108 | masc 109 | MM 110 | ms 111 | N.B 112 | N.D.A 113 | N.D.L.R 114 | N.D.T 115 | n/réf 116 | NN.SS 117 | N.S 118 | N.D 119 | N.P.A.I 120 | p.c.c 121 | pl 122 | pp 123 | p.ex 124 | p.j 125 | P.S 126 | R.A.S 127 | R.-V 128 | R.P 129 | R.I.P 130 | SS 131 | S.S 132 | S.A 133 | S.A.I 134 | S.A.R 135 | S.A.S 136 | S.E 137 | sec 138 | sect 139 | sing 140 | S.M 141 | S.M.I.R 142 | sq 143 | sqq 144 | suiv 145 | sup 146 | suppl 147 | tél 148 | T.S.V.P 149 | vb 150 | vol 151 | vs 152 | X.O 153 | Z.I 154 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.ga: -------------------------------------------------------------------------------- 1 | 2 | A 3 | B 4 | C 5 | D 6 | E 7 | F 8 | G 9 | H 10 | I 11 | J 12 | K 13 | L 14 | M 15 | N 16 | O 17 | P 18 | Q 19 | R 20 | S 21 | T 22 | U 23 | V 24 | W 25 | X 26 | Y 27 | Z 28 | Á 29 | É 30 | Í 31 | Ó 32 | Ú 33 | 34 | Uacht 35 | Dr 36 | B.Arch 37 | 38 | m.sh 39 | .i 40 | Co 41 | Cf 42 | cf 43 | i.e 44 | r 45 | Chr 46 | lch #NUMERIC_ONLY# 47 | lgh #NUMERIC_ONLY# 48 | uimh #NUMERIC_ONLY# 49 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.hu: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | Á 33 | É 34 | Í 35 | Ó 36 | Ö 37 | Ő 38 | Ú 39 | Ü 40 | Ű 41 | 42 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 43 | Dr 44 | dr 45 | kb 46 | Kb 47 | vö 48 | Vö 49 | pl 50 | Pl 51 | ca 52 | Ca 53 | min 54 | Min 55 | max 56 | Max 57 | ún 58 | Ún 59 | prof 60 | Prof 61 | de 62 | De 63 | du 64 | Du 65 | Szt 66 | St 67 | 68 | #Numbers only. These should only induce breaks when followed by a numeric sequence 69 | # add NUMERIC_ONLY after the word for this function 70 | #This case is mostly for the english "No." which can either be a sentence of its own, or 71 | #if followed by a number, a non-breaking prefix 72 | 73 | # Month name abbreviations 74 | jan #NUMERIC_ONLY# 75 | Jan #NUMERIC_ONLY# 76 | Feb #NUMERIC_ONLY# 77 | feb #NUMERIC_ONLY# 78 | márc #NUMERIC_ONLY# 79 | Márc #NUMERIC_ONLY# 80 | ápr #NUMERIC_ONLY# 81 | Ápr #NUMERIC_ONLY# 82 | máj #NUMERIC_ONLY# 83 | Máj #NUMERIC_ONLY# 84 | jún #NUMERIC_ONLY# 85 | Jún #NUMERIC_ONLY# 86 | Júl #NUMERIC_ONLY# 87 | júl #NUMERIC_ONLY# 88 | aug #NUMERIC_ONLY# 89 | Aug #NUMERIC_ONLY# 90 | Szept #NUMERIC_ONLY# 91 | szept #NUMERIC_ONLY# 92 | okt #NUMERIC_ONLY# 93 | Okt #NUMERIC_ONLY# 94 | nov #NUMERIC_ONLY# 95 | Nov #NUMERIC_ONLY# 96 | dec #NUMERIC_ONLY# 97 | Dec #NUMERIC_ONLY# 98 | 99 | # Other abbreviations 100 | tel #NUMERIC_ONLY# 101 | Tel #NUMERIC_ONLY# 102 | Fax #NUMERIC_ONLY# 103 | fax #NUMERIC_ONLY# 104 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.is: -------------------------------------------------------------------------------- 1 | no #NUMERIC_ONLY# 2 | No #NUMERIC_ONLY# 3 | nr #NUMERIC_ONLY# 4 | Nr #NUMERIC_ONLY# 5 | nR #NUMERIC_ONLY# 6 | NR #NUMERIC_ONLY# 7 | a 8 | b 9 | c 10 | d 11 | e 12 | f 13 | g 14 | h 15 | i 16 | j 17 | k 18 | l 19 | m 20 | n 21 | o 22 | p 23 | q 24 | r 25 | s 26 | t 27 | u 28 | v 29 | w 30 | x 31 | y 32 | z 33 | ^ 34 | í 35 | á 36 | ó 37 | æ 38 | A 39 | B 40 | C 41 | D 42 | E 43 | F 44 | G 45 | H 46 | I 47 | J 48 | K 49 | L 50 | M 51 | N 52 | O 53 | P 54 | Q 55 | R 56 | S 57 | T 58 | U 59 | V 60 | W 61 | X 62 | Y 63 | Z 64 | ab.fn 65 | a.fn 66 | afs 67 | al 68 | alm 69 | alg 70 | andh 71 | ath 72 | aths 73 | atr 74 | ao 75 | au 76 | aukaf 77 | áfn 78 | áhrl.s 79 | áhrs 80 | ákv.gr 81 | ákv 82 | bh 83 | bls 84 | dr 85 | e.Kr 86 | et 87 | ef 88 | efn 89 | ennfr 90 | eink 91 | end 92 | e.st 93 | erl 94 | fél 95 | fskj 96 | fh 97 | f.hl 98 | físl 99 | fl 100 | fn 101 | fo 102 | forl 103 | frb 104 | frl 105 | frh 106 | frt 107 | fsl 108 | fsh 109 | fs 110 | fsk 111 | fst 112 | f.Kr 113 | ft 114 | fv 115 | fyrrn 116 | fyrrv 117 | germ 118 | gm 119 | gr 120 | hdl 121 | hdr 122 | hf 123 | hl 124 | hlsk 125 | hljsk 126 | hljv 127 | hljóðv 128 | hr 129 | hv 130 | hvk 131 | holl 132 | Hos 133 | höf 134 | hk 135 | hrl 136 | ísl 137 | kaf 138 | kap 139 | Khöfn 140 | kk 141 | kg 142 | kk 143 | km 144 | kl 145 | klst 146 | kr 147 | kt 148 | kgúrsk 149 | kvk 150 | leturbr 151 | lh 152 | lh.nt 153 | lh.þt 154 | lo 155 | ltr 156 | mlja 157 | mljó 158 | millj 159 | mm 160 | mms 161 | m.fl 162 | miðm 163 | mgr 164 | mst 165 | mín 166 | nf 167 | nh 168 | nhm 169 | nl 170 | nk 171 | nmgr 172 | no 173 | núv 174 | nt 175 | o.áfr 176 | o.m.fl 177 | ohf 178 | o.fl 179 | o.s.frv 180 | ófn 181 | ób 182 | óákv.gr 183 | óákv 184 | pfn 185 | PR 186 | pr 187 | Ritstj 188 | Rvík 189 | Rvk 190 | samb 191 | samhlj 192 | samn 193 | samn 194 | sbr 195 | sek 196 | sérn 197 | sf 198 | sfn 199 | sh 200 | sfn 201 | sh 202 | s.hl 203 | sk 204 | skv 205 | sl 206 | sn 207 | so 208 | ss.us 209 | s.st 210 | samþ 211 | sbr 212 | shlj 213 | sign 214 | skál 215 | st 216 | st.s 217 | stk 218 | sþ 219 | teg 220 | tbl 221 | tfn 222 | tl 223 | tvíhlj 224 | tvt 225 | till 226 | to 227 | umr 228 | uh 229 | us 230 | uppl 231 | útg 232 | vb 233 | Vf 234 | vh 235 | vkf 236 | Vl 237 | vl 238 | vlf 239 | vmf 240 | 8vo 241 | vsk 242 | vth 243 | þt 244 | þf 245 | þjs 246 | þgf 247 | þlt 248 | þolm 249 | þm 250 | þml 251 | þýð 252 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.it: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 34 | Adj 35 | Adm 36 | Adv 37 | Amn 38 | Arch 39 | Asst 40 | Avv 41 | Bart 42 | Bcc 43 | Bldg 44 | Brig 45 | Bros 46 | C.A.P 47 | C.P 48 | Capt 49 | Cc 50 | Cmdr 51 | Co 52 | Col 53 | Comdr 54 | Con 55 | Corp 56 | Cpl 57 | DR 58 | Dott 59 | Dr 60 | Drs 61 | Egr 62 | Ens 63 | Gen 64 | Geom 65 | Gov 66 | Hon 67 | Hosp 68 | Hr 69 | Id 70 | Ing 71 | Insp 72 | Lt 73 | MM 74 | MR 75 | MRS 76 | MS 77 | Maj 78 | Messrs 79 | Mlle 80 | Mme 81 | Mo 82 | Mons 83 | Mr 84 | Mrs 85 | Ms 86 | Msgr 87 | N.B 88 | Op 89 | Ord 90 | P.S 91 | P.T 92 | Pfc 93 | Ph 94 | Prof 95 | Pvt 96 | RP 97 | RSVP 98 | Rag 99 | Rep 100 | Reps 101 | Res 102 | Rev 103 | Rif 104 | Rt 105 | S.A 106 | S.B.F 107 | S.P.M 108 | S.p.A 109 | S.r.l 110 | Sen 111 | Sens 112 | Sfc 113 | Sgt 114 | Sig 115 | Sigg 116 | Soc 117 | Spett 118 | Sr 119 | St 120 | Supt 121 | Surg 122 | V.P 123 | 124 | # other 125 | a.c 126 | acc 127 | all 128 | banc 129 | c.a 130 | c.c.p 131 | c.m 132 | c.p 133 | c.s 134 | c.v 135 | corr 136 | dott 137 | e.p.c 138 | ecc 139 | es 140 | fatt 141 | gg 142 | int 143 | lett 144 | ogg 145 | on 146 | p.c 147 | p.c.c 148 | p.es 149 | p.f 150 | p.r 151 | p.v 152 | post 153 | pp 154 | racc 155 | ric 156 | s.n.c 157 | seg 158 | sgg 159 | ss 160 | tel 161 | u.s 162 | v.r 163 | v.s 164 | 165 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 166 | v 167 | vs 168 | i.e 169 | rev 170 | e.g 171 | 172 | #Numbers only. These should only induce breaks when followed by a numeric sequence 173 | # add NUMERIC_ONLY after the word for this function 174 | #This case is mostly for the english "No." which can either be a sentence of its own, or 175 | #if followed by a number, a non-breaking prefix 176 | No #NUMERIC_ONLY# 177 | Nos 178 | Art #NUMERIC_ONLY# 179 | Nr 180 | pp #NUMERIC_ONLY# 181 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.lt: -------------------------------------------------------------------------------- 1 | # Anything in this file, followed by a period (and an upper-case word), 2 | # does NOT indicate an end-of-sentence marker. 3 | # Special cases are included for prefixes that ONLY appear before 0-9 numbers. 4 | 5 | # Any single upper case letter followed by a period is not a sentence ender 6 | # (excluding I occasionally, but we leave it in) 7 | # usually upper case letters are initials in a name 8 | A 9 | Ā 10 | B 11 | C 12 | Č 13 | D 14 | E 15 | Ē 16 | F 17 | G 18 | Ģ 19 | H 20 | I 21 | Ī 22 | J 23 | K 24 | Ķ 25 | L 26 | Ļ 27 | M 28 | N 29 | Ņ 30 | O 31 | P 32 | Q 33 | R 34 | S 35 | Š 36 | T 37 | U 38 | Ū 39 | V 40 | W 41 | X 42 | Y 43 | Z 44 | Ž 45 | 46 | # Initialis -- Džonas 47 | Dz 48 | Dž 49 | Just 50 | 51 | # Day and month abbreviations 52 | # m. menesis d. diena g. gimes 53 | m 54 | mėn 55 | d 56 | g 57 | gim 58 | # Pirmadienis Penktadienis 59 | Pr 60 | Pn 61 | Pirm 62 | Antr 63 | Treč 64 | Ketv 65 | Penkt 66 | Šešt 67 | Sekm 68 | Saus 69 | Vas 70 | Kov 71 | Bal 72 | Geg 73 | Birž 74 | Liep 75 | Rugpj 76 | Rugs 77 | Spal 78 | Lapkr 79 | Gruod 80 | 81 | # Business, governmental, geographical terms 82 | a 83 | # aikštė 84 | adv 85 | # advokatas 86 | akad 87 | # akademikas 88 | aklg 89 | # akligatvis 90 | akt 91 | # aktorius 92 | al 93 | # alėja 94 | A.V 95 | # antspaudo vieta 96 | aps 97 | apskr 98 | # apskritis 99 | apyg 100 | # apygarda 101 | aps 102 | apskr 103 | # apskritis 104 | asist 105 | # asistentas 106 | asmv 107 | avd 108 | # asmenvardis 109 | a.k 110 | asm 111 | asm.k 112 | # asmens kodas 113 | atsak 114 | # atsakingasis 115 | atsisk 116 | sąsk 117 | # atsiskaitomoji sąskaita 118 | aut 119 | # autorius 120 | b 121 | k 122 | b.k 123 | # banko kodas 124 | bkl 125 | # bakalauras 126 | bt 127 | # butas 128 | buv 129 | # buvęs, -usi 130 | dail 131 | # dailininkas 132 | dek 133 | # dekanas 134 | dėst 135 | # dėstytojas 136 | dir 137 | # direktorius 138 | dirig 139 | # dirigentas 140 | doc 141 | # docentas 142 | drp 143 | # durpynas 144 | dš 145 | # dešinysis 146 | egz 147 | # egzempliorius 148 | eil 149 | # eilutė 150 | ekon 151 | # ekonomika 152 | el 153 | # elektroninis 154 | etc 155 | ež 156 | # ežeras 157 | faks 158 | # faksas 159 | fak 160 | # fakultetas 161 | gen 162 | # generolas 163 | gyd 164 | # gydytojas 165 | gv 166 | # gyvenvietė 167 | įl 168 | # įlanka 169 | Įn 170 | # įnagininkas 171 | insp 172 | # inspektorius 173 | pan 174 | # ir panašiai 175 | t.t 176 | # ir taip toliau 177 | k.a 178 | # kaip antai 179 | kand 180 | # kandidatas 181 | kat 182 | # katedra 183 | kyš 184 | # kyšulys 185 | kl 186 | # klasė 187 | kln 188 | # kalnas 189 | kn 190 | # knyga 191 | koresp 192 | # korespondentas 193 | kpt 194 | # kapitonas 195 | kr 196 | # kairysis 197 | kt 198 | # kitas 199 | kun 200 | # kunigas 201 | l 202 | e 203 | p 204 | l.e.p 205 | # laikinai einantis pareigas 206 | ltn 207 | # leitenantas 208 | m 209 | mst 210 | # miestas 211 | m.e 212 | # mūsų eros 213 | m.m 214 | # mokslo metai 215 | mot 216 | # moteris 217 | mstl 218 | # miestelis 219 | mgr 220 | # magistras 221 | mgnt 222 | # magistrantas 223 | mjr 224 | # majoras 225 | mln 226 | # milijonas 227 | mlrd 228 | # milijardas 229 | mok 230 | # mokinys 231 | mokyt 232 | # mokytojas 233 | moksl 234 | # mokslinis 235 | nkt 236 | # nekaitomas 237 | ntk 238 | # neteiktinas 239 | Nr 240 | nr 241 | # numeris 242 | p 243 | # ponas 244 | p.d 245 | a.d 246 | # pašto dėžutė, abonentinė dėžutė 247 | p.m.e 248 | # prieš mūsų erą 249 | pan 250 | # ir panašiai 251 | pav 252 | # paveikslas 253 | pavad 254 | # pavaduotojas 255 | pirm 256 | # pirmininkas 257 | pl 258 | # plentas 259 | plg 260 | # palygink 261 | plk 262 | # pulkininkas; pelkė 263 | pr 264 | # prospektas 265 | Kr 266 | pr.Kr 267 | # prieš Kristų 268 | prok 269 | # prokuroras 270 | prot 271 | # protokolas 272 | pss 273 | # pusiasalis 274 | pšt 275 | # paštas 276 | pvz 277 | # pavyzdžiui 278 | r 279 | # rajonas 280 | red 281 | # redaktorius 282 | rš 283 | # raštų kalbos 284 | sąs 285 | # sąsiuvinis 286 | saviv 287 | sav 288 | # savivaldybė 289 | sekr 290 | # sekretorius 291 | sen 292 | # seniūnija, seniūnas 293 | sk 294 | # skaityk; skyrius 295 | skg 296 | # skersgatvis 297 | skyr 298 | sk 299 | # skyrius 300 | skv 301 | # skveras 302 | sp 303 | # spauda; spaustuvė 304 | spec 305 | # specialistas 306 | sr 307 | # sritis 308 | st 309 | # stotis 310 | str 311 | # straipsnis 312 | stud 313 | # studentas 314 | š 315 | š.m 316 | # šių metų 317 | šnek 318 | # šnekamosios 319 | tir 320 | # tiražas 321 | tūkst 322 | # tūkstantis 323 | up 324 | # upė 325 | upl 326 | # upelis 327 | vad 328 | # vadinamasis, -oji 329 | vlsč 330 | # valsčius 331 | ved 332 | # vedėjas 333 | vet 334 | # veterinarija 335 | virš 336 | # viršininkas, viršaitis 337 | vyr 338 | # vyriausiasis, -ioji; vyras 339 | vyresn 340 | # vyresnysis 341 | vlsč 342 | # valsčius 343 | vs 344 | # viensėdis 345 | Vt 346 | vt 347 | # vietininkas 348 | vtv 349 | vv 350 | # vietovardis 351 | žml 352 | # žemėlapis 353 | 354 | # Technical terms, abbreviations used in guidebooks, advertisments, etc. 355 | # Generally lower-case. 356 | air 357 | # airiškai 358 | amer 359 | # amerikanizmas 360 | anat 361 | # anatomija 362 | angl 363 | # angl. angliskai 364 | arab 365 | # arabų 366 | archeol 367 | archit 368 | asm 369 | # asmuo 370 | astr 371 | # astronomija 372 | austral 373 | # australiškai 374 | aut 375 | # automobilis 376 | av 377 | # aviacija 378 | bažn 379 | bdv 380 | # būdvardis 381 | bibl 382 | # Biblija 383 | biol 384 | # biologija 385 | bot 386 | # botanika 387 | brt 388 | # burtai, burtažodis. 389 | brus 390 | # baltarusių 391 | buh 392 | # buhalterija 393 | chem 394 | # chemija 395 | col 396 | # collectivum 397 | con 398 | conj 399 | # conjunctivus, jungtukas 400 | dab 401 | # dab. dabartine 402 | dgs 403 | # daugiskaita 404 | dial 405 | # dialektizmas 406 | dipl 407 | dktv 408 | # daiktavardis 409 | džn 410 | # dažnai 411 | ekon 412 | el 413 | # elektra 414 | esam 415 | # esamasis laikas 416 | euf 417 | # eufemizmas 418 | fam 419 | # familiariai 420 | farm 421 | # farmacija 422 | filol 423 | # filologija 424 | filos 425 | # filosofija 426 | fin 427 | # finansai 428 | fiz 429 | # fizika 430 | fiziol 431 | # fiziologija 432 | flk 433 | # folkloras 434 | fon 435 | # fonetika 436 | fot 437 | # fotografija 438 | geod 439 | # geodezija 440 | geogr 441 | geol 442 | # geologija 443 | geom 444 | # geometrija 445 | glžk 446 | gr 447 | # graikų 448 | gram 449 | her 450 | # heraldika 451 | hidr 452 | # hidrotechnika 453 | ind 454 | # Indų 455 | iron 456 | # ironiškai 457 | isp 458 | # ispanų 459 | ist 460 | istor 461 | # istorija 462 | it 463 | # italų 464 | įv 465 | reikšm 466 | įv.reikšm 467 | # įvairiomis reikšmėmis 468 | jap 469 | # japonų 470 | juok 471 | # juokaujamai 472 | jūr 473 | # jūrininkystė 474 | kalb 475 | # kalbotyra 476 | kar 477 | # karyba 478 | kas 479 | # kasyba 480 | kin 481 | # kinematografija 482 | klaus 483 | # klausiamasis 484 | knyg 485 | # knyginis 486 | kom 487 | # komercija 488 | komp 489 | # kompiuteris 490 | kosm 491 | # kosmonautika 492 | kt 493 | # kitas 494 | kul 495 | # kulinarija 496 | kuop 497 | # kuopine 498 | l 499 | # laikas 500 | lit 501 | # literatūrinis 502 | lingv 503 | # lingvistika 504 | log 505 | # logika 506 | lot 507 | # lotynų 508 | mat 509 | # matematika 510 | maž 511 | # mažybinis 512 | med 513 | # medicina 514 | medž 515 | # medžioklė 516 | men 517 | # menas 518 | menk 519 | # menkinamai 520 | metal 521 | # metalurgija 522 | meteor 523 | min 524 | # mineralogija 525 | mit 526 | # mitologija 527 | mok 528 | # mokyklinis 529 | ms 530 | # mįslė 531 | muz 532 | # muzikinis 533 | n 534 | # naujasis 535 | neig 536 | # neigiamasis 537 | neol 538 | # neologizmas 539 | niek 540 | # niekinamai 541 | ofic 542 | # oficialus 543 | opt 544 | # optika 545 | orig 546 | # original 547 | p 548 | # pietūs 549 | pan 550 | # panašiai 551 | parl 552 | # parlamentas 553 | pat 554 | # patarlė 555 | paž 556 | # pažodžiui 557 | plg 558 | # palygink 559 | poet 560 | # poetizmas 561 | poez 562 | # poezija 563 | poligr 564 | # poligrafija 565 | polit 566 | # politika 567 | ppr 568 | # paprastai 569 | pranc 570 | pr 571 | # prancūzų, prūsų 572 | priet 573 | # prietaras 574 | prek 575 | # prekyba 576 | prk 577 | # perkeltine 578 | prs 579 | # persona, asmuo 580 | psn 581 | # pasenęs žodis 582 | psich 583 | # psichologija 584 | pvz 585 | # pavyzdžiui 586 | r 587 | # rytai 588 | rad 589 | # radiotechnika 590 | rel 591 | # religija 592 | ret 593 | # retai 594 | rus 595 | # rusų 596 | sen 597 | # senasis 598 | sl 599 | # slengas, slavų 600 | sov 601 | # sovietinis 602 | spec 603 | # specialus 604 | sport 605 | stat 606 | # statyba 607 | sudurt 608 | # sudurtinis 609 | sutr 610 | # sutrumpintas 611 | suv 612 | # suvalkiečių 613 | š 614 | # šiaurė 615 | šach 616 | # šachmatai 617 | šiaur 618 | škot 619 | # škotiškai 620 | šnek 621 | # šnekamoji 622 | teatr 623 | tech 624 | techn 625 | # technika 626 | teig 627 | # teigiamas 628 | teis 629 | # teisė 630 | tekst 631 | # tekstilė 632 | tel 633 | # telefonas 634 | teol 635 | # teologija 636 | v 637 | # tik vyriškosios, vakarai 638 | t.p 639 | t 640 | p 641 | # ir taip pat 642 | t.t 643 | # ir taip toliau 644 | t.y 645 | # tai yra 646 | vaik 647 | # vaikų 648 | vart 649 | # vartojama 650 | vet 651 | # veterinarija 652 | vid 653 | # vidurinis 654 | vksm 655 | # veiksmažodis 656 | vns 657 | # vienaskaita 658 | vok 659 | # vokiečių 660 | vulg 661 | # vulgariai 662 | zool 663 | # zoologija 664 | žr 665 | # žiūrėk 666 | ž.ū 667 | ž 668 | ū 669 | # žemės ūkis 670 | 671 | # List of titles. These are often followed by upper-case names, but do 672 | # not indicate sentence breaks 673 | # 674 | # Jo Eminencija 675 | Em. 676 | # Gerbiamasis 677 | Gerb 678 | gerb 679 | # malonus 680 | malon 681 | # profesorius 682 | Prof 683 | prof 684 | # daktaras (mokslų) 685 | Dr 686 | dr 687 | habil 688 | med 689 | # inž inžinierius 690 | inž 691 | Inž 692 | 693 | 694 | #Numbers only. These should only induce breaks when followed by a numeric sequence 695 | # add NUMERIC_ONLY after the word for this function 696 | #This case is mostly for the english "No." which can either be a sentence of its own, or 697 | #if followed by a number, a non-breaking prefix 698 | No #NUMERIC_ONLY# 699 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.lv: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | Ā 8 | B 9 | C 10 | Č 11 | D 12 | E 13 | Ē 14 | F 15 | G 16 | Ģ 17 | H 18 | I 19 | Ī 20 | J 21 | K 22 | Ķ 23 | L 24 | Ļ 25 | M 26 | N 27 | Ņ 28 | O 29 | P 30 | Q 31 | R 32 | S 33 | Š 34 | T 35 | U 36 | Ū 37 | V 38 | W 39 | X 40 | Y 41 | Z 42 | Ž 43 | 44 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 45 | dr 46 | Dr 47 | med 48 | prof 49 | Prof 50 | inž 51 | Inž 52 | ist.loc 53 | Ist.loc 54 | kor.loc 55 | Kor.loc 56 | v.i 57 | vietn 58 | Vietn 59 | 60 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 61 | a.l 62 | t.p 63 | pārb 64 | Pārb 65 | vec 66 | Vec 67 | inv 68 | Inv 69 | sk 70 | Sk 71 | spec 72 | Spec 73 | vienk 74 | Vienk 75 | virz 76 | Virz 77 | māksl 78 | Māksl 79 | mūz 80 | Mūz 81 | akad 82 | Akad 83 | soc 84 | Soc 85 | galv 86 | Galv 87 | vad 88 | Vad 89 | sertif 90 | Sertif 91 | folkl 92 | Folkl 93 | hum 94 | Hum 95 | 96 | #Numbers only. These should only induce breaks when followed by a numeric sequence 97 | # add NUMERIC_ONLY after the word for this function 98 | #This case is mostly for the english "No." which can either be a sentence of its own, or 99 | #if followed by a number, a non-breaking prefix 100 | Nr #NUMERIC_ONLY# 101 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.nl: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | #Sources: http://nl.wikipedia.org/wiki/Lijst_van_afkortingen 4 | # http://nl.wikipedia.org/wiki/Aanspreekvorm 5 | # http://nl.wikipedia.org/wiki/Titulatuur_in_het_Nederlands_hoger_onderwijs 6 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 7 | #usually upper case letters are initials in a name 8 | A 9 | B 10 | C 11 | D 12 | E 13 | F 14 | G 15 | H 16 | I 17 | J 18 | K 19 | L 20 | M 21 | N 22 | O 23 | P 24 | Q 25 | R 26 | S 27 | T 28 | U 29 | V 30 | W 31 | X 32 | Y 33 | Z 34 | 35 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 36 | bacc 37 | bc 38 | bgen 39 | c.i 40 | dhr 41 | dr 42 | dr.h.c 43 | drs 44 | drs 45 | ds 46 | eint 47 | fa 48 | Fa 49 | fam 50 | gen 51 | genm 52 | ing 53 | ir 54 | jhr 55 | jkvr 56 | jr 57 | kand 58 | kol 59 | lgen 60 | lkol 61 | Lt 62 | maj 63 | Mej 64 | mevr 65 | Mme 66 | mr 67 | mr 68 | Mw 69 | o.b.s 70 | plv 71 | prof 72 | ritm 73 | tint 74 | Vz 75 | Z.D 76 | Z.D.H 77 | Z.E 78 | Z.Em 79 | Z.H 80 | Z.K.H 81 | Z.K.M 82 | Z.M 83 | z.v 84 | 85 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 86 | #we seem to have a lot of these in dutch i.e.: i.p.v - in plaats van (in stead of) never ends a sentence 87 | a.g.v 88 | bijv 89 | bijz 90 | bv 91 | d.w.z 92 | e.c 93 | e.g 94 | e.k 95 | ev 96 | i.p.v 97 | i.s.m 98 | i.t.t 99 | i.v.m 100 | m.a.w 101 | m.b.t 102 | m.b.v 103 | m.h.o 104 | m.i 105 | m.i.v 106 | v.w.t 107 | 108 | #Numbers only. These should only induce breaks when followed by a numeric sequence 109 | # add NUMERIC_ONLY after the word for this function 110 | #This case is mostly for the english "No." which can either be a sentence of its own, or 111 | #if followed by a number, a non-breaking prefix 112 | Nr #NUMERIC_ONLY# 113 | Nrs 114 | nrs 115 | nr #NUMERIC_ONLY# 116 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.pl: -------------------------------------------------------------------------------- 1 | adw 2 | afr 3 | akad 4 | al 5 | Al 6 | am 7 | amer 8 | arch 9 | art 10 | Art 11 | artyst 12 | astr 13 | austr 14 | bałt 15 | bdb 16 | bł 17 | bm 18 | br 19 | bryg 20 | bryt 21 | centr 22 | ces 23 | chem 24 | chiń 25 | chir 26 | c.k 27 | c.o 28 | cyg 29 | cyw 30 | cyt 31 | czes 32 | czw 33 | cd 34 | Cd 35 | czyt 36 | ćw 37 | ćwicz 38 | daw 39 | dcn 40 | dekl 41 | demokr 42 | det 43 | diec 44 | dł 45 | dn 46 | dot 47 | dol 48 | dop 49 | dost 50 | dosł 51 | h.c 52 | ds 53 | dst 54 | duszp 55 | dypl 56 | egz 57 | ekol 58 | ekon 59 | elektr 60 | em 61 | ew 62 | fab 63 | farm 64 | fot 65 | fr 66 | gat 67 | gastr 68 | geogr 69 | geol 70 | gimn 71 | głęb 72 | gm 73 | godz 74 | górn 75 | gosp 76 | gr 77 | gram 78 | hist 79 | hiszp 80 | hr 81 | Hr 82 | hot 83 | id 84 | in 85 | im 86 | iron 87 | jn 88 | kard 89 | kat 90 | katol 91 | k.k 92 | kk 93 | kol 94 | kl 95 | k.p.a 96 | kpc 97 | k.p.c 98 | kpt 99 | kr 100 | k.r 101 | krak 102 | k.r.o 103 | kryt 104 | kult 105 | laic 106 | łac 107 | niem 108 | woj 109 | nb 110 | np 111 | Nb 112 | Np 113 | pol 114 | pow 115 | m.in 116 | pt 117 | ps 118 | Pt 119 | Ps 120 | cdn 121 | jw 122 | ryc 123 | rys 124 | Ryc 125 | Rys 126 | tj 127 | tzw 128 | Tzw 129 | tzn 130 | zob 131 | ang 132 | ub 133 | ul 134 | pw 135 | pn 136 | pl 137 | al 138 | k 139 | n 140 | nr #NUMERIC_ONLY# 141 | Nr #NUMERIC_ONLY# 142 | ww 143 | wł 144 | ur 145 | zm 146 | żyd 147 | żarg 148 | żyw 149 | wył 150 | bp 151 | bp 152 | wyst 153 | tow 154 | Tow 155 | o 156 | sp 157 | Sp 158 | st 159 | spółdz 160 | Spółdz 161 | społ 162 | spółgł 163 | stoł 164 | stow 165 | Stoł 166 | Stow 167 | zn 168 | zew 169 | zewn 170 | zdr 171 | zazw 172 | zast 173 | zaw 174 | zał 175 | zal 176 | zam 177 | zak 178 | zakł 179 | zagr 180 | zach 181 | adw 182 | Adw 183 | lek 184 | Lek 185 | med 186 | mec 187 | Mec 188 | doc 189 | Doc 190 | dyw 191 | dyr 192 | Dyw 193 | Dyr 194 | inż 195 | Inż 196 | mgr 197 | Mgr 198 | dh 199 | dr 200 | Dh 201 | Dr 202 | p 203 | P 204 | red 205 | Red 206 | prof 207 | prok 208 | Prof 209 | Prok 210 | hab 211 | płk 212 | Płk 213 | nadkom 214 | Nadkom 215 | podkom 216 | Podkom 217 | ks 218 | Ks 219 | gen 220 | Gen 221 | por 222 | Por 223 | reż 224 | Reż 225 | przyp 226 | Przyp 227 | śp 228 | św 229 | śW 230 | Śp 231 | Św 232 | ŚW 233 | szer 234 | Szer 235 | pkt #NUMERIC_ONLY# 236 | str #NUMERIC_ONLY# 237 | tab #NUMERIC_ONLY# 238 | Tab #NUMERIC_ONLY# 239 | tel 240 | ust #NUMERIC_ONLY# 241 | par #NUMERIC_ONLY# 242 | poz 243 | pok 244 | oo 245 | oO 246 | Oo 247 | OO 248 | r #NUMERIC_ONLY# 249 | l #NUMERIC_ONLY# 250 | s #NUMERIC_ONLY# 251 | najśw 252 | Najśw 253 | A 254 | B 255 | C 256 | D 257 | E 258 | F 259 | G 260 | H 261 | I 262 | J 263 | K 264 | L 265 | M 266 | N 267 | O 268 | P 269 | Q 270 | R 271 | S 272 | T 273 | U 274 | V 275 | W 276 | X 277 | Y 278 | Z 279 | Ś 280 | Ć 281 | Ż 282 | Ź 283 | Dz 284 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.ro: -------------------------------------------------------------------------------- 1 | A 2 | B 3 | C 4 | D 5 | E 6 | F 7 | G 8 | H 9 | I 10 | J 11 | K 12 | L 13 | M 14 | N 15 | O 16 | P 17 | Q 18 | R 19 | S 20 | T 21 | U 22 | V 23 | W 24 | X 25 | Y 26 | Z 27 | dpdv 28 | etc 29 | șamd 30 | M.Ap.N 31 | dl 32 | Dl 33 | d-na 34 | D-na 35 | dvs 36 | Dvs 37 | pt 38 | Pt 39 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.ru: -------------------------------------------------------------------------------- 1 | # added Cyrillic uppercase letters [А-Я] 2 | # removed 000D carriage return (this is not removed by chomp in tokenizer.perl, and prevents recognition of the prefixes) 3 | # edited by Kate Young (nspaceanalysis@earthlink.net) 21 May 2013 4 | А 5 | Б 6 | В 7 | Г 8 | Д 9 | Е 10 | Ж 11 | З 12 | И 13 | Й 14 | К 15 | Л 16 | М 17 | Н 18 | О 19 | П 20 | Р 21 | С 22 | Т 23 | У 24 | Ф 25 | Х 26 | Ц 27 | Ч 28 | Ш 29 | Щ 30 | Ъ 31 | Ы 32 | Ь 33 | Э 34 | Ю 35 | Я 36 | A 37 | B 38 | C 39 | D 40 | E 41 | F 42 | G 43 | H 44 | I 45 | J 46 | K 47 | L 48 | M 49 | N 50 | O 51 | P 52 | Q 53 | R 54 | S 55 | T 56 | U 57 | V 58 | W 59 | X 60 | Y 61 | Z 62 | 0гг 63 | 1гг 64 | 2гг 65 | 3гг 66 | 4гг 67 | 5гг 68 | 6гг 69 | 7гг 70 | 8гг 71 | 9гг 72 | 0г 73 | 1г 74 | 2г 75 | 3г 76 | 4г 77 | 5г 78 | 6г 79 | 7г 80 | 8г 81 | 9г 82 | Xвв 83 | Vвв 84 | Iвв 85 | Lвв 86 | Mвв 87 | Cвв 88 | Xв 89 | Vв 90 | Iв 91 | Lв 92 | Mв 93 | Cв 94 | 0м 95 | 1м 96 | 2м 97 | 3м 98 | 4м 99 | 5м 100 | 6м 101 | 7м 102 | 8м 103 | 9м 104 | 0мм 105 | 1мм 106 | 2мм 107 | 3мм 108 | 4мм 109 | 5мм 110 | 6мм 111 | 7мм 112 | 8мм 113 | 9мм 114 | 0см 115 | 1см 116 | 2см 117 | 3см 118 | 4см 119 | 5см 120 | 6см 121 | 7см 122 | 8см 123 | 9см 124 | 0дм 125 | 1дм 126 | 2дм 127 | 3дм 128 | 4дм 129 | 5дм 130 | 6дм 131 | 7дм 132 | 8дм 133 | 9дм 134 | 0л 135 | 1л 136 | 2л 137 | 3л 138 | 4л 139 | 5л 140 | 6л 141 | 7л 142 | 8л 143 | 9л 144 | 0км 145 | 1км 146 | 2км 147 | 3км 148 | 4км 149 | 5км 150 | 6км 151 | 7км 152 | 8км 153 | 9км 154 | 0га 155 | 1га 156 | 2га 157 | 3га 158 | 4га 159 | 5га 160 | 6га 161 | 7га 162 | 8га 163 | 9га 164 | 0кг 165 | 1кг 166 | 2кг 167 | 3кг 168 | 4кг 169 | 5кг 170 | 6кг 171 | 7кг 172 | 8кг 173 | 9кг 174 | 0т 175 | 1т 176 | 2т 177 | 3т 178 | 4т 179 | 5т 180 | 6т 181 | 7т 182 | 8т 183 | 9т 184 | 0г 185 | 1г 186 | 2г 187 | 3г 188 | 4г 189 | 5г 190 | 6г 191 | 7г 192 | 8г 193 | 9г 194 | 0мг 195 | 1мг 196 | 2мг 197 | 3мг 198 | 4мг 199 | 5мг 200 | 6мг 201 | 7мг 202 | 8мг 203 | 9мг 204 | бульв 205 | в 206 | вв 207 | г 208 | га 209 | гг 210 | гл 211 | гос 212 | д 213 | дм 214 | доп 215 | др 216 | е 217 | ед 218 | ед 219 | зам 220 | и 221 | инд 222 | исп 223 | Исп 224 | к 225 | кап 226 | кг 227 | кв 228 | кл 229 | км 230 | кол 231 | комн 232 | коп 233 | куб 234 | л 235 | лиц 236 | лл 237 | м 238 | макс 239 | мг 240 | мин 241 | мл 242 | млн 243 | млрд 244 | мм 245 | н 246 | наб 247 | нач 248 | неуд 249 | ном 250 | о 251 | обл 252 | обр 253 | общ 254 | ок 255 | ост 256 | отл 257 | п 258 | пер 259 | перераб 260 | пл 261 | пос 262 | пр 263 | просп 264 | проф 265 | р 266 | ред 267 | руб 268 | с 269 | сб 270 | св 271 | см 272 | соч 273 | ср 274 | ст 275 | стр 276 | т 277 | тел 278 | Тел 279 | тех 280 | тт 281 | туп 282 | тыс 283 | уд 284 | ул 285 | уч 286 | физ 287 | х 288 | хор 289 | ч 290 | чел 291 | шт 292 | экз 293 | э 294 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.sk: -------------------------------------------------------------------------------- 1 | Bc 2 | Mgr 3 | RNDr 4 | PharmDr 5 | PhDr 6 | JUDr 7 | PaedDr 8 | ThDr 9 | Ing 10 | MUDr 11 | MDDr 12 | MVDr 13 | Dr 14 | ThLic 15 | PhD 16 | ArtD 17 | ThDr 18 | Dr 19 | DrSc 20 | CSs 21 | prof 22 | obr 23 | Obr 24 | Č 25 | č 26 | absol 27 | adj 28 | admin 29 | adr 30 | Adr 31 | adv 32 | advok 33 | afr 34 | ak 35 | akad 36 | akc 37 | akuz 38 | et 39 | al 40 | alch 41 | amer 42 | anat 43 | angl 44 | Angl 45 | anglosas 46 | anorg 47 | ap 48 | apod 49 | arch 50 | archeol 51 | archit 52 | arg 53 | art 54 | astr 55 | astrol 56 | astron 57 | atp 58 | atď 59 | austr 60 | Austr 61 | aut 62 | belg 63 | Belg 64 | bibl 65 | Bibl 66 | biol 67 | bot 68 | bud 69 | bás 70 | býv 71 | cest 72 | chem 73 | cirk 74 | csl 75 | čs 76 | Čs 77 | dat 78 | dep 79 | det 80 | dial 81 | diaľ 82 | dipl 83 | distrib 84 | dokl 85 | dosl 86 | dopr 87 | dram 88 | duš 89 | dv 90 | dvojčl 91 | dór 92 | ekol 93 | ekon 94 | el 95 | elektr 96 | elektrotech 97 | energet 98 | epic 99 | est 100 | etc 101 | etonym 102 | eufem 103 | európ 104 | Európ 105 | ev 106 | evid 107 | expr 108 | fa 109 | fam 110 | farm 111 | fem 112 | feud 113 | fil 114 | filat 115 | filoz 116 | fi 117 | fon 118 | form 119 | fot 120 | fr 121 | Fr 122 | franc 123 | Franc 124 | fraz 125 | fut 126 | fyz 127 | fyziol 128 | garb 129 | gen 130 | genet 131 | genpor 132 | geod 133 | geogr 134 | geol 135 | geom 136 | germ 137 | gr 138 | Gr 139 | gréc 140 | Gréc 141 | gréckokat 142 | hebr 143 | herald 144 | hist 145 | hlav 146 | hosp 147 | hromad 148 | hud 149 | hypok 150 | ident 151 | i.e 152 | ident 153 | imp 154 | impf 155 | indoeur 156 | inf 157 | inform 158 | instr 159 | int 160 | interj 161 | inšt 162 | inštr 163 | iron 164 | jap 165 | Jap 166 | jaz 167 | jedn 168 | juhoamer 169 | juhových 170 | juhozáp 171 | juž 172 | kanad 173 | Kanad 174 | kanc 175 | kapit 176 | kpt 177 | kart 178 | katastr 179 | knih 180 | kniž 181 | komp 182 | konj 183 | konkr 184 | kozmet 185 | krajč 186 | kresť 187 | kt 188 | kuch 189 | lat 190 | latinskoamer 191 | lek 192 | lex 193 | lingv 194 | lit 195 | litur 196 | log 197 | lok 198 | max 199 | Max 200 | maď 201 | Maď 202 | medzinár 203 | mest 204 | metr 205 | mil 206 | Mil 207 | min 208 | Min 209 | miner 210 | ml 211 | mld 212 | mn 213 | mod 214 | mytol 215 | napr 216 | nar 217 | Nar 218 | nasl 219 | nedok 220 | neg 221 | negat 222 | neklas 223 | nem 224 | Nem 225 | neodb 226 | neos 227 | neskl 228 | nesklon 229 | nespis 230 | nespráv 231 | neved 232 | než 233 | niekt 234 | niž 235 | nom 236 | náb 237 | nákl 238 | námor 239 | nár 240 | obch 241 | obj 242 | obv 243 | obyč 244 | obč 245 | občian 246 | odb 247 | odd 248 | ods 249 | ojed 250 | okr 251 | Okr 252 | opt 253 | opyt 254 | org 255 | os 256 | osob 257 | ot 258 | ovoc 259 | par 260 | part 261 | pejor 262 | pers 263 | pf 264 | Pf 265 | P.f 266 | p.f 267 | pl 268 | Plk 269 | pod 270 | podst 271 | pokl 272 | polit 273 | politol 274 | polygr 275 | pomn 276 | popl 277 | por 278 | porad 279 | porov 280 | posch 281 | potrav 282 | použ 283 | poz 284 | pozit 285 | poľ 286 | poľno 287 | poľnohosp 288 | poľov 289 | pošt 290 | pož 291 | prac 292 | predl 293 | pren 294 | prep 295 | preuk 296 | priezv 297 | Priezv 298 | privl 299 | prof 300 | práv 301 | príd 302 | príj 303 | prík 304 | príp 305 | prír 306 | prísl 307 | príslov 308 | príč 309 | psych 310 | publ 311 | pís 312 | písm 313 | pôv 314 | refl 315 | reg 316 | rep 317 | resp 318 | rozk 319 | rozlič 320 | rozpráv 321 | roč 322 | Roč 323 | ryb 324 | rádiotech 325 | rím 326 | samohl 327 | semest 328 | sev 329 | severoamer 330 | severových 331 | severozáp 332 | sg 333 | skr 334 | skup 335 | sl 336 | Sloven 337 | soc 338 | soch 339 | sociol 340 | sp 341 | spol 342 | Spol 343 | spoloč 344 | spoluhl 345 | správ 346 | spôs 347 | st 348 | star 349 | starogréc 350 | starorím 351 | s.r.o 352 | stol 353 | stor 354 | str 355 | stredoamer 356 | stredoškol 357 | subj 358 | subst 359 | superl 360 | sv 361 | sz 362 | súkr 363 | súp 364 | súvzť 365 | tal 366 | Tal 367 | tech 368 | tel 369 | Tel 370 | telef 371 | teles 372 | telev 373 | teol 374 | trans 375 | turist 376 | tuzem 377 | typogr 378 | tzn 379 | tzv 380 | ukaz 381 | ul 382 | Ul 383 | umel 384 | univ 385 | ust 386 | ved 387 | vedľ 388 | verb 389 | veter 390 | vin 391 | viď 392 | vl 393 | vod 394 | vodohosp 395 | pnl 396 | vulg 397 | vyj 398 | vys 399 | vysokoškol 400 | vzťaž 401 | vôb 402 | vých 403 | výd 404 | výrob 405 | výsk 406 | výsl 407 | výtv 408 | výtvar 409 | význ 410 | včel 411 | vš 412 | všeob 413 | zahr 414 | zar 415 | zariad 416 | zast 417 | zastar 418 | zastaráv 419 | zb 420 | zdravot 421 | združ 422 | zjemn 423 | zlat 424 | zn 425 | Zn 426 | zool 427 | zr 428 | zried 429 | zv 430 | záhr 431 | zák 432 | zákl 433 | zám 434 | záp 435 | západoeur 436 | zázn 437 | územ 438 | účt 439 | čast 440 | čes 441 | Čes 442 | čl 443 | čísl 444 | živ 445 | pr 446 | fak 447 | Kr 448 | p.n.l 449 | A 450 | B 451 | C 452 | D 453 | E 454 | F 455 | G 456 | H 457 | I 458 | J 459 | K 460 | L 461 | M 462 | N 463 | O 464 | P 465 | Q 466 | R 467 | S 468 | T 469 | U 470 | V 471 | W 472 | X 473 | Y 474 | Z 475 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.sl: -------------------------------------------------------------------------------- 1 | dr 2 | Dr 3 | itd 4 | itn 5 | št #NUMERIC_ONLY# 6 | Št #NUMERIC_ONLY# 7 | d 8 | jan 9 | Jan 10 | feb 11 | Feb 12 | mar 13 | Mar 14 | apr 15 | Apr 16 | jun 17 | Jun 18 | jul 19 | Jul 20 | avg 21 | Avg 22 | sept 23 | Sept 24 | sep 25 | Sep 26 | okt 27 | Okt 28 | nov 29 | Nov 30 | dec 31 | Dec 32 | tj 33 | Tj 34 | npr 35 | Npr 36 | sl 37 | Sl 38 | op 39 | Op 40 | gl 41 | Gl 42 | oz 43 | Oz 44 | prev 45 | dipl 46 | ing 47 | prim 48 | Prim 49 | cf 50 | Cf 51 | gl 52 | Gl 53 | A 54 | B 55 | C 56 | D 57 | E 58 | F 59 | G 60 | H 61 | I 62 | J 63 | K 64 | L 65 | M 66 | N 67 | O 68 | P 69 | Q 70 | R 71 | S 72 | T 73 | U 74 | V 75 | W 76 | X 77 | Y 78 | Z 79 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.sv: -------------------------------------------------------------------------------- 1 | #single upper case letter are usually initials 2 | A 3 | B 4 | C 5 | D 6 | E 7 | F 8 | G 9 | H 10 | I 11 | J 12 | K 13 | L 14 | M 15 | N 16 | O 17 | P 18 | Q 19 | R 20 | S 21 | T 22 | U 23 | V 24 | W 25 | X 26 | Y 27 | Z 28 | #misc abbreviations 29 | AB 30 | G 31 | VG 32 | dvs 33 | etc 34 | from 35 | iaf 36 | jfr 37 | kl 38 | kr 39 | mao 40 | mfl 41 | mm 42 | osv 43 | pga 44 | tex 45 | tom 46 | vs 47 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.ta: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | அ 7 | ஆ 8 | இ 9 | ஈ 10 | உ 11 | ஊ 12 | எ 13 | ஏ 14 | ஐ 15 | ஒ 16 | ஓ 17 | ஔ 18 | ஃ 19 | க 20 | கா 21 | கி 22 | கீ 23 | கு 24 | கூ 25 | கெ 26 | கே 27 | கை 28 | கொ 29 | கோ 30 | கௌ 31 | க் 32 | ச 33 | சா 34 | சி 35 | சீ 36 | சு 37 | சூ 38 | செ 39 | சே 40 | சை 41 | சொ 42 | சோ 43 | சௌ 44 | ச் 45 | ட 46 | டா 47 | டி 48 | டீ 49 | டு 50 | டூ 51 | டெ 52 | டே 53 | டை 54 | டொ 55 | டோ 56 | டௌ 57 | ட் 58 | த 59 | தா 60 | தி 61 | தீ 62 | து 63 | தூ 64 | தெ 65 | தே 66 | தை 67 | தொ 68 | தோ 69 | தௌ 70 | த் 71 | ப 72 | பா 73 | பி 74 | பீ 75 | பு 76 | பூ 77 | பெ 78 | பே 79 | பை 80 | பொ 81 | போ 82 | பௌ 83 | ப் 84 | ற 85 | றா 86 | றி 87 | றீ 88 | று 89 | றூ 90 | றெ 91 | றே 92 | றை 93 | றொ 94 | றோ 95 | றௌ 96 | ற் 97 | ய 98 | யா 99 | யி 100 | யீ 101 | யு 102 | யூ 103 | யெ 104 | யே 105 | யை 106 | யொ 107 | யோ 108 | யௌ 109 | ய் 110 | ர 111 | ரா 112 | ரி 113 | ரீ 114 | ரு 115 | ரூ 116 | ரெ 117 | ரே 118 | ரை 119 | ரொ 120 | ரோ 121 | ரௌ 122 | ர் 123 | ல 124 | லா 125 | லி 126 | லீ 127 | லு 128 | லூ 129 | லெ 130 | லே 131 | லை 132 | லொ 133 | லோ 134 | லௌ 135 | ல் 136 | வ 137 | வா 138 | வி 139 | வீ 140 | வு 141 | வூ 142 | வெ 143 | வே 144 | வை 145 | வொ 146 | வோ 147 | வௌ 148 | வ் 149 | ள 150 | ளா 151 | ளி 152 | ளீ 153 | ளு 154 | ளூ 155 | ளெ 156 | ளே 157 | ளை 158 | ளொ 159 | ளோ 160 | ளௌ 161 | ள் 162 | ழ 163 | ழா 164 | ழி 165 | ழீ 166 | ழு 167 | ழூ 168 | ழெ 169 | ழே 170 | ழை 171 | ழொ 172 | ழோ 173 | ழௌ 174 | ழ் 175 | ங 176 | ஙா 177 | ஙி 178 | ஙீ 179 | ஙு 180 | ஙூ 181 | ஙெ 182 | ஙே 183 | ஙை 184 | ஙொ 185 | ஙோ 186 | ஙௌ 187 | ங் 188 | ஞ 189 | ஞா 190 | ஞி 191 | ஞீ 192 | ஞு 193 | ஞூ 194 | ஞெ 195 | ஞே 196 | ஞை 197 | ஞொ 198 | ஞோ 199 | ஞௌ 200 | ஞ் 201 | ண 202 | ணா 203 | ணி 204 | ணீ 205 | ணு 206 | ணூ 207 | ணெ 208 | ணே 209 | ணை 210 | ணொ 211 | ணோ 212 | ணௌ 213 | ண் 214 | ந 215 | நா 216 | நி 217 | நீ 218 | நு 219 | நூ 220 | நெ 221 | நே 222 | நை 223 | நொ 224 | நோ 225 | நௌ 226 | ந் 227 | ம 228 | மா 229 | மி 230 | மீ 231 | மு 232 | மூ 233 | மெ 234 | மே 235 | மை 236 | மொ 237 | மோ 238 | மௌ 239 | ம் 240 | ன 241 | னா 242 | னி 243 | னீ 244 | னு 245 | னூ 246 | னெ 247 | னே 248 | னை 249 | னொ 250 | னோ 251 | னௌ 252 | ன் 253 | 254 | 255 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 256 | திரு 257 | திருமதி 258 | வண 259 | கௌரவ 260 | 261 | 262 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 263 | உ.ம் 264 | #கா.ம் 265 | #எ.ம் 266 | 267 | 268 | #Numbers only. These should only induce breaks when followed by a numeric sequence 269 | # add NUMERIC_ONLY after the word for this function 270 | #This case is mostly for the english "No." which can either be a sentence of its own, or 271 | #if followed by a number, a non-breaking prefix 272 | No #NUMERIC_ONLY# 273 | Nos 274 | Art #NUMERIC_ONLY# 275 | Nr 276 | pp #NUMERIC_ONLY# 277 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.yue: -------------------------------------------------------------------------------- 1 | # 2 | # Cantonese (Chinese) 3 | # 4 | # Anything in this file, followed by a period, 5 | # does NOT indicate an end-of-sentence marker. 6 | # 7 | # English/Euro-language given-name initials (appearing in 8 | # news, periodicals, etc.) 9 | A 10 | Ā 11 | B 12 | C 13 | Č 14 | D 15 | E 16 | Ē 17 | F 18 | G 19 | Ģ 20 | H 21 | I 22 | Ī 23 | J 24 | K 25 | Ķ 26 | L 27 | Ļ 28 | M 29 | N 30 | Ņ 31 | O 32 | P 33 | Q 34 | R 35 | S 36 | Š 37 | T 38 | U 39 | Ū 40 | V 41 | W 42 | X 43 | Y 44 | Z 45 | Ž 46 | 47 | # Numbers only. These should only induce breaks when followed by 48 | # a numeric sequence. 49 | # Add NUMERIC_ONLY after the word for this function. This case is 50 | # mostly for the english "No." which can either be a sentence of its 51 | # own, or if followed by a number, a non-breaking prefix. 52 | No #NUMERIC_ONLY# 53 | Nr #NUMERIC_ONLY# 54 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.zh: -------------------------------------------------------------------------------- 1 | # 2 | # Mandarin (Chinese) 3 | # 4 | # Anything in this file, followed by a period, 5 | # does NOT indicate an end-of-sentence marker. 6 | # 7 | # English/Euro-language given-name initials (appearing in 8 | # news, periodicals, etc.) 9 | A 10 | Ā 11 | B 12 | C 13 | Č 14 | D 15 | E 16 | Ē 17 | F 18 | G 19 | Ģ 20 | H 21 | I 22 | Ī 23 | J 24 | K 25 | Ķ 26 | L 27 | Ļ 28 | M 29 | N 30 | Ņ 31 | O 32 | P 33 | Q 34 | R 35 | S 36 | Š 37 | T 38 | U 39 | Ū 40 | V 41 | W 42 | X 43 | Y 44 | Z 45 | Ž 46 | 47 | # Numbers only. These should only induce breaks when followed by 48 | # a numeric sequence. 49 | # Add NUMERIC_ONLY after the word for this function. This case is 50 | # mostly for the english "No." which can either be a sentence of its 51 | # own, or if followed by a number, a non-breaking prefix. 52 | No #NUMERIC_ONLY# 53 | Nr #NUMERIC_ONLY# 54 | -------------------------------------------------------------------------------- /translate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import division, unicode_literals, print_function 4 | import os 5 | import argparse 6 | import math 7 | import codecs 8 | import torch 9 | import time 10 | 11 | from itertools import count 12 | 13 | import onmt.io 14 | import onmt.translate 15 | import onmt 16 | import onmt.ModelConstructor 17 | import onmt.modules 18 | import opts 19 | 20 | parser = argparse.ArgumentParser( 21 | description='translate.py', 22 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 23 | opts.add_md_help_argument(parser) 24 | opts.translate_opts(parser) 25 | 26 | opt = parser.parse_args() 27 | 28 | 29 | def _report_score(name, score_total, words_total): 30 | print("%s AVG SCORE: %.4f, %s PPL: %.4f" % ( 31 | name, score_total / words_total, 32 | name, math.exp(-score_total / words_total))) 33 | 34 | 35 | def _report_bleu(): 36 | import subprocess 37 | print() 38 | res = subprocess.check_output( 39 | "perl tools/multi-bleu.perl %s < %s" % (opt.tgt, opt.output), 40 | shell=True).decode("utf-8") 41 | print(">> " + res.strip()) 42 | 43 | 44 | def _report_rouge(): 45 | import subprocess 46 | res = subprocess.check_output( 47 | "python tools/test_rouge.py -r %s -c %s" % (opt.tgt, opt.output), 48 | shell=True).decode("utf-8") 49 | print(res.strip()) 50 | 51 | 52 | def main(): 53 | dummy_parser = argparse.ArgumentParser(description='train.py') 54 | opts.model_opts(dummy_parser) 55 | dummy_opt = dummy_parser.parse_known_args([])[0] 56 | 57 | opt.cuda = opt.gpu > -1 58 | if opt.cuda: 59 | torch.cuda.set_device(opt.gpu) 60 | 61 | # Load the model. 62 | fields, model, model_opt = \ 63 | onmt.ModelConstructor.load_test_model(opt, dummy_opt.__dict__) 64 | 65 | # File to write sentences to. 66 | out_file = codecs.open(opt.output, 'w', 'utf-8') 67 | 68 | # Test data 69 | data = onmt.io.build_dataset(fields, opt.data_type, 70 | opt.src, opt.tgt, 71 | src_dir=opt.src_dir, 72 | sample_rate=opt.sample_rate, 73 | window_size=opt.window_size, 74 | window_stride=opt.window_stride, 75 | window=opt.window, 76 | use_filter_pred=False) 77 | 78 | # Sort batch by decreasing lengths of sentence required by pytorch. 79 | # sort=False means "Use dataset's sortkey instead of iterator's". 80 | data_iter = onmt.io.OrderedIterator( 81 | dataset=data, device=opt.gpu, 82 | batch_size=opt.batch_size, train=False, sort=False, 83 | sort_within_batch=True, shuffle=False) 84 | 85 | # Translator 86 | scorer = onmt.translate.GNMTGlobalScorer(opt.alpha, opt.beta) 87 | translator = onmt.translate.Translator(model, fields, 88 | beam_size=opt.beam_size, 89 | n_best=opt.n_best, 90 | global_scorer=scorer, 91 | max_length=opt.max_length, 92 | copy_attn=model_opt.copy_attn, 93 | cuda=opt.cuda, 94 | beam_trace=opt.dump_beam != "", 95 | min_length=opt.min_length) 96 | builder = onmt.translate.TranslationBuilder( 97 | data, translator.fields, 98 | opt.n_best, opt.replace_unk, opt.tgt) 99 | 100 | # Statistics 101 | counter = count(1) 102 | pred_score_total, pred_words_total = 0, 0 103 | gold_score_total, gold_words_total = 0, 0 104 | 105 | start_time = time.time() 106 | n_processed = 0 107 | print("Processed ", end="") 108 | for batch in data_iter: 109 | batch_data = translator.translate_batch(batch, data) 110 | translations = builder.from_batch(batch_data) 111 | 112 | for trans in translations: 113 | pred_score_total += trans.pred_scores[0] 114 | pred_words_total += len(trans.pred_sents[0]) 115 | if opt.tgt: 116 | gold_score_total += trans.gold_score 117 | gold_words_total += len(trans.gold_sent) 118 | 119 | n_best_preds = [" ".join(pred) 120 | for pred in trans.pred_sents[:opt.n_best]] 121 | out_file.write('\n'.join(n_best_preds)) 122 | out_file.write('\n') 123 | out_file.flush() 124 | 125 | if opt.verbose: 126 | sent_number = next(counter) 127 | output = trans.log(sent_number) 128 | os.write(1, output.encode('utf-8')) 129 | 130 | n_processed+=len(batch_data["batch"]) 131 | if n_processed % 100 == 0: 132 | if n_processed == 100: 133 | print("%d"%n_processed, end=" ", flush=True) 134 | else: 135 | print(", %d"%n_processed, end=" ", flush=True) 136 | print("", flush=True) 137 | 138 | elapsed_time = time.time() - start_time 139 | 140 | _report_score('PRED', pred_score_total, pred_words_total) 141 | if opt.tgt: 142 | _report_score('GOLD', gold_score_total, gold_words_total) 143 | if opt.report_bleu: 144 | _report_bleu() 145 | if opt.report_rouge: 146 | _report_rouge() 147 | 148 | if opt.dump_beam: 149 | import json 150 | json.dump(translator.beam_accum, 151 | codecs.open(opt.dump_beam, 'w', 'utf-8')) 152 | 153 | print("Translations computed in %d seconds."%elapsed_time) 154 | 155 | 156 | if __name__ == "__main__": 157 | main() 158 | -------------------------------------------------------------------------------- /translate_mm_vi.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import division, unicode_literals 4 | import os 5 | import argparse 6 | import math 7 | import codecs 8 | import torch 9 | 10 | from itertools import count 11 | 12 | import onmt.io 13 | import onmt.translate 14 | import onmt 15 | import onmt.ModelConstructor 16 | import onmt.modules 17 | from onmt.Utils import MODEL_TYPES 18 | import opts 19 | import tables 20 | 21 | parser = argparse.ArgumentParser( 22 | description='translate_mm_vi.py', 23 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 24 | opts.add_md_help_argument(parser) 25 | opts.translate_opts(parser) 26 | opts.translate_mm_vi_opts(parser) 27 | 28 | opt = parser.parse_args() 29 | 30 | 31 | def _report_score(name, score_total, words_total): 32 | print("%s AVG SCORE: %.4f, %s PPL: %.4f" % ( 33 | name, score_total / words_total, 34 | name, math.exp(-score_total / words_total))) 35 | 36 | 37 | def _report_bleu(): 38 | import subprocess 39 | print() 40 | res = subprocess.check_output( 41 | "perl tools/multi-bleu.perl %s < %s" % (opt.tgt, opt.output), 42 | shell=True).decode("utf-8") 43 | print(">> " + res.strip()) 44 | 45 | 46 | def _report_rouge(): 47 | import subprocess 48 | res = subprocess.check_output( 49 | "python tools/test_rouge.py -r %s -c %s" % (opt.tgt, opt.output), 50 | shell=True).decode("utf-8") 51 | print(res.strip()) 52 | 53 | 54 | def main(): 55 | dummy_parser = argparse.ArgumentParser(description='train_mm_vi.py') 56 | opts.model_opts(dummy_parser) 57 | dummy_opt = dummy_parser.parse_known_args([])[0] 58 | 59 | opt.cuda = opt.gpu > -1 60 | if opt.cuda: 61 | torch.cuda.set_device(opt.gpu) 62 | print("Using GPU") 63 | torch.set_default_tensor_type("torch.cuda.FloatTensor") 64 | else: 65 | print("Using CPU") 66 | torch.set_default_tensor_type("torch.FloatTensor") 67 | 68 | # loading checkpoint just to find multimodal model type 69 | checkpoint = torch.load(opt.model, 70 | map_location=lambda storage, loc: storage) 71 | opt.multimodal_model_type = checkpoint['opt'].multimodal_model_type 72 | opt.use_global_image_features = checkpoint['opt'].use_global_image_features 73 | opt.use_posterior_image_features = checkpoint['opt'].use_posterior_image_features 74 | # work-around to get fix issue 75 | assert(opt.multimodal_model_type in MODEL_TYPES), \ 76 | 'Variational multimodal model type not implemented: %s'%str(opt.multimodal_model_type) 77 | print("Translating with multimodal_model_type: %s"%str(opt.multimodal_model_type)) 78 | del checkpoint 79 | 80 | if opt.batch_size > 1: 81 | print( "Batch size > 1 not implemented! Falling back to batch_size = 1 ..." ) 82 | opt.batch_size = 1 83 | 84 | # load test image features 85 | test_file = tables.open_file(opt.path_to_test_img_feats, mode='r') 86 | if opt.multimodal_model_type in MODEL_TYPES: 87 | if opt.use_global_image_features: 88 | # load only the global image features 89 | test_img_feats = test_file.root.global_feats[:] 90 | print('Using global image features...') 91 | else: # opt.use_posterior_image_features 92 | # load only the global image features 93 | test_img_feats = test_file.root.logits[:] 94 | print('Using image posterior class probabilities...') 95 | else: 96 | raise Exception("Model type not implemented: %s"%opt.multimodal_model_type) 97 | test_file.close() 98 | 99 | # Load the model. 100 | fields, model, model_opt = \ 101 | onmt.ModelConstructor.load_test_model(opt, dummy_opt.__dict__) 102 | 103 | # File to write sentences to. 104 | out_file = codecs.open(opt.output, 'w', 'utf-8') 105 | 106 | # Test data 107 | data = onmt.io.build_dataset(fields, opt.data_type, 108 | opt.src, opt.tgt, 109 | src_dir=opt.src_dir, 110 | sample_rate=opt.sample_rate, 111 | window_size=opt.window_size, 112 | window_stride=opt.window_stride, 113 | window=opt.window, 114 | use_filter_pred=False) 115 | 116 | # Sort batch by decreasing lengths of sentence required by pytorch. 117 | # sort=False means "Use dataset's sortkey instead of iterator's". 118 | print("opt.gpu: %s"%str(opt.gpu)) 119 | data_iter = onmt.io.OrderedIterator( 120 | dataset=data, device=opt.gpu, 121 | batch_size=opt.batch_size, train=False, sort=False, 122 | sort_within_batch=True, shuffle=False) 123 | 124 | # Translator 125 | scorer = onmt.translate.GNMTGlobalScorer(opt.alpha, opt.beta) 126 | translator = onmt.translate.TranslatorMultimodalVI(model, fields, 127 | beam_size=opt.beam_size, 128 | n_best=opt.n_best, 129 | global_scorer=scorer, 130 | max_length=opt.max_length, 131 | copy_attn=model_opt.copy_attn, 132 | cuda=opt.cuda, 133 | beam_trace=opt.dump_beam != "", 134 | min_length=opt.min_length, 135 | test_img_feats=test_img_feats, 136 | multimodal_model_type=opt.multimodal_model_type) 137 | builder = onmt.translate.TranslationBuilder( 138 | data, translator.fields, 139 | opt.n_best, opt.replace_unk, opt.tgt) 140 | 141 | # Statistics 142 | counter = count(1) 143 | pred_score_total, pred_words_total = 0, 0 144 | gold_score_total, gold_words_total = 0, 0 145 | 146 | for sent_idx, batch in enumerate(data_iter): 147 | batch_data = translator.translate_batch(batch, data, sent_idx) 148 | translations = builder.from_batch(batch_data) 149 | 150 | for trans in translations: 151 | pred_score_total += trans.pred_scores[0] 152 | pred_words_total += len(trans.pred_sents[0]) 153 | if opt.tgt: 154 | gold_score_total += trans.gold_score 155 | gold_words_total += len(trans.gold_sent) 156 | 157 | n_best_preds = [" ".join(pred) 158 | for pred in trans.pred_sents[:opt.n_best]] 159 | out_file.write('\n'.join(n_best_preds)) 160 | out_file.write('\n') 161 | out_file.flush() 162 | 163 | if opt.verbose: 164 | sent_number = next(counter) 165 | output = trans.log(sent_number) 166 | os.write(1, output.encode('utf-8')) 167 | 168 | _report_score('PRED', pred_score_total, pred_words_total) 169 | if opt.tgt: 170 | _report_score('GOLD', gold_score_total, gold_words_total) 171 | if opt.report_bleu: 172 | _report_bleu() 173 | if opt.report_rouge: 174 | _report_rouge() 175 | 176 | if opt.dump_beam: 177 | import json 178 | json.dump(translator.beam_accum, 179 | codecs.open(opt.dump_beam, 'w', 'utf-8')) 180 | 181 | 182 | if __name__ == "__main__": 183 | main() 184 | --------------------------------------------------------------------------------