├── .gitignore ├── .travis.yml ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE.md ├── ONMT_README.md ├── README.md ├── available_models └── example.conf.json ├── config ├── config-rnn-summarization.yml ├── config-transformer-base-1GPU.yml └── config-transformer-base-4GPU.yml ├── data ├── README.md ├── morph │ ├── src.train │ ├── src.valid │ ├── tgt.train │ └── tgt.valid ├── src-test.txt ├── src-train.txt ├── src-val.txt ├── test_model2.src ├── test_model2.tgt ├── tgt-train.txt └── tgt-val.txt ├── docs ├── Makefile ├── requirements.txt └── source │ ├── CONTRIBUTING.md │ ├── FAQ.md │ ├── Library.ipynb │ ├── Library.md │ ├── Summarization.md │ ├── _static │ └── theme_overrides.css │ ├── conf.py │ ├── examples.rst │ ├── extended.md │ ├── im2text.md │ ├── index.md │ ├── index.rst │ ├── main.md │ ├── modules.rst │ ├── onmt.inputters.rst │ ├── onmt.modules.rst │ ├── onmt.rst │ ├── onmt.translate.translation_server.rst │ ├── onmt.translation.rst │ ├── options │ ├── preprocess.rst │ ├── server.rst │ ├── train.rst │ └── translate.rst │ ├── quickstart.md │ ├── ref.rst │ ├── refs.bib │ ├── speech2text.md │ └── vid2text.rst ├── floyd.yml ├── floyd_requirements.txt ├── github_deploy_key_opennmt_opennmt_py.enc ├── onmt ├── __init__.py ├── bin │ ├── __init__.py │ ├── preprocess.py │ ├── server.py │ ├── train.py │ └── translate.py ├── decoders │ ├── __init__.py │ ├── cnn_decoder.py │ ├── decoder.py │ ├── ensemble.py │ └── transformer.py ├── encoders │ ├── __init__.py │ ├── audio_encoder.py │ ├── cnn_encoder.py │ ├── encoder.py │ ├── image_encoder.py │ ├── mean_encoder.py │ ├── rnn_encoder.py │ └── transformer.py ├── inputters │ ├── __init__.py │ ├── audio_dataset.py │ ├── datareader_base.py │ ├── dataset_base.py │ ├── image_dataset.py │ ├── inputter.py │ ├── text_dataset.py │ └── vec_dataset.py ├── model_builder.py ├── models │ ├── __init__.py │ ├── model.py │ ├── model_saver.py │ ├── sru.py │ └── stacked_rnn.py ├── modules │ ├── __init__.py │ ├── average_attn.py │ ├── 
conv_multi_step_attention.py │ ├── copy_generator.py │ ├── embeddings.py │ ├── gate.py │ ├── global_attention.py │ ├── multi_headed_attn.py │ ├── position_ffn.py │ ├── sparse_activations.py │ ├── sparse_losses.py │ ├── structured_attention.py │ ├── util_class.py │ └── weight_norm.py ├── opts.py ├── tests │ ├── __init__.py │ ├── output_hyp.txt │ ├── pull_request_chk.sh │ ├── rebuild_test_models.sh │ ├── sample_glove.txt │ ├── test_attention.py │ ├── test_audio_dataset.py │ ├── test_beam.py │ ├── test_beam_search.py │ ├── test_copy_generator.py │ ├── test_embeddings.py │ ├── test_image_dataset.py │ ├── test_model.pt │ ├── test_model2.pt │ ├── test_models.py │ ├── test_models.sh │ ├── test_preprocess.py │ ├── test_random_sampling.py │ ├── test_simple.py │ ├── test_structured_attention.py │ ├── test_text_dataset.py │ ├── test_translation_server.py │ └── utils_for_tests.py ├── train_single.py ├── trainer.py ├── translate │ ├── __init__.py │ ├── beam.py │ ├── beam_search.py │ ├── decode_strategy.py │ ├── penalties.py │ ├── process_zh.py │ ├── random_sampling.py │ ├── translation.py │ ├── translation_server.py │ └── translator.py └── utils │ ├── __init__.py │ ├── cnn_factory.py │ ├── distributed.py │ ├── earlystopping.py │ ├── logging.py │ ├── loss.py │ ├── misc.py │ ├── optimizers.py │ ├── parse.py │ ├── report_manager.py │ ├── rnn_factory.py │ └── statistics.py ├── preprocess.py ├── requirement.txt ├── requirements.opt.txt ├── scripts ├── eval.py ├── inference_daily.sh ├── inference_ost.sh ├── preprocess.sh ├── train_daily.sh └── train_ost.sh ├── server.py ├── setup.py ├── tools ├── README.md ├── apply_bpe.py ├── average_models.py ├── bpe_pipeline.sh ├── create_vocabulary.py ├── detokenize.perl ├── embeddings_to_torch.py ├── extract_embeddings.py ├── learn_bpe.py ├── multi-bleu-detok.perl ├── multi-bleu.perl ├── nonbreaking_prefixes │ ├── README.txt │ ├── nonbreaking_prefix.ca │ ├── nonbreaking_prefix.cs │ ├── nonbreaking_prefix.de │ ├── nonbreaking_prefix.el │ ├── 
nonbreaking_prefix.en │ ├── nonbreaking_prefix.es │ ├── nonbreaking_prefix.fi │ ├── nonbreaking_prefix.fr │ ├── nonbreaking_prefix.ga │ ├── nonbreaking_prefix.hu │ ├── nonbreaking_prefix.is │ ├── nonbreaking_prefix.it │ ├── nonbreaking_prefix.lt │ ├── nonbreaking_prefix.lv │ ├── nonbreaking_prefix.nl │ ├── nonbreaking_prefix.pl │ ├── nonbreaking_prefix.ro │ ├── nonbreaking_prefix.ru │ ├── nonbreaking_prefix.sk │ ├── nonbreaking_prefix.sl │ ├── nonbreaking_prefix.sv │ ├── nonbreaking_prefix.ta │ ├── nonbreaking_prefix.yue │ └── nonbreaking_prefix.zh ├── release_model.py ├── test_rouge.py ├── tokenizer.perl └── vid_feature_extractor.py ├── train.py └── translate.py /.gitignore: -------------------------------------------------------------------------------- 1 | # repo-specific stuff 2 | pred.txt 3 | multi-bleu.perl 4 | *.pt 5 | \#*# 6 | .idea 7 | *.sublime-* 8 | .DS_Store 9 | data/ 10 | 11 | # Byte-compiled / optimized / DLL files 12 | __pycache__/ 13 | *.py[cod] 14 | *$py.class 15 | 16 | # C extensions 17 | *.so 18 | 19 | # Distribution / packaging 20 | .Python 21 | build/ 22 | develop-eggs/ 23 | dist/ 24 | downloads/ 25 | eggs/ 26 | .eggs/ 27 | lib/ 28 | lib64/ 29 | parts/ 30 | sdist/ 31 | var/ 32 | wheels/ 33 | *.egg-info/ 34 | .installed.cfg 35 | *.egg 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
40 | *.manifest 41 | *.spec 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | .hypothesis/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # pyenv 83 | .python-version 84 | 85 | # celery beat schedule file 86 | celerybeat-schedule 87 | 88 | # SageMath parsed files 89 | *.sage.py 90 | 91 | # Environments 92 | .env 93 | .venv 94 | env/ 95 | venv/ 96 | ENV/ 97 | 98 | # Spyder project settings 99 | .spyderproject 100 | .spyproject 101 | 102 | # Rope project settings 103 | .ropeproject 104 | 105 | # mkdocs documentation 106 | /site 107 | 108 | # mypy 109 | .mypy_cache/ 110 | 111 | # Tensorboard 112 | runs/ 113 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributors 2 | 3 | OpenNMT-py is a community-developed project and we love developer contributions. 4 | 5 | ## Guidelines 6 | Before sending a PR, please go through this checklist first: 7 | 8 | - Please run `onmt/tests/pull_request_chk.sh` and fix any errors. When adding new functionality, also add tests to this script. Included checks: 9 | 1. flake8 check for coding style; 10 | 2. unittest; 11 | 3. continuous integration tests listed in `.travis.yml`. 12 | - When adding or modifying a class constructor, please name the arguments in the same style as its PyTorch superclass.
13 | - If your change is based on a paper, please include a clear comment and reference in the code (more on that below). 14 | 15 | ### Docstrings 16 | Above all, try to follow the Google docstring format 17 | ([Napoleon example](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html), 18 | [Google styleguide](http://google.github.io/styleguide/pyguide.html)). 19 | This makes it easy to include your contributions in the Sphinx documentation. And, do feel free 20 | to autodoc your contributions in the API ``.rst`` files in the `docs/source` folder! If you do, check that 21 | your additions look right. 22 | 23 | ```bash 24 | cd docs 25 | # install some dependencies if necessary: 26 | # recommonmark, sphinx_rtd_theme, sphinxcontrib-bibtex 27 | make html 28 | firefox build/html/main.html # or your browser of choice 29 | ``` 30 | 31 | Some particular advice: 32 | - Try to follow Python 3 [``typing`` module](https://docs.python.org/3/library/typing.html) conventions when documenting types. 33 | - Exception: use "or" instead of unions for better readability. 34 | - For external types, use the full "import name". Common abbreviations (e.g. ``np``) are acceptable. 35 | For ``torch.Tensor`` types, the ``torch.`` is optional. 36 | - Please don't use ticks like `` (`str`) `` or rst directives like `` (:obj:`str`) ``. Napoleon handles types 37 | very well without additional help, so avoid the clutter. 38 | - [Google docstrings don't support multiple returns](https://stackoverflow.com/questions/29221551/can-sphinx-napoleon-document-function-returning-multiple-arguments). 39 | For multiple returns, the following works well with Sphinx and is still very readable. 40 | ```python 41 | def foo(a, b): 42 | """This is my docstring. 43 | 44 | Args: 45 | a (object): Something. 46 | b (class): Another thing. 47 | 48 | Returns: 49 | (object, class): 50 | 51 | * a: Something or rather with a long 52 | description that spills over. 53 | * b: And another thing.
54 | """ 55 | 56 | return a, b 57 | ``` 58 | - When citing a paper, avoid directly linking in the docstring! Add a BibTeX entry to `docs/source/refs.bib`. 59 | E.g., to cite "Attention Is All You Need", visit [arXiv](https://arxiv.org/abs/1706.03762), choose the 60 | [bibtex](https://dblp.uni-trier.de/rec/bibtex/journals/corr/VaswaniSPUJGKP17) link, search `docs/source/refs.bib` 61 | using `CTRL-F` for `DBLP:journals/corr/VaswaniSPUJGKP17`, and if you do not find it then copy-paste the 62 | citation into `refs.bib`. Then, in your docstring, use ``:cite:`DBLP:journals/corr/VaswaniSPUJGKP17` ``. 63 | - However, a link is better than nothing. 64 | - Please document tensor shapes. Prefer the format 65 | ``` ``(a, b, c)`` ```. This style is easy to read, allows using ``x`` for multiplication, and is common 66 | (PyTorch uses a few variations on the parentheses format, AllenNLP uses exactly this format, Fairseq uses 67 | the parentheses format with single ticks). 68 | - Again, a different style is better than no shape documentation. 69 | - Please avoid unnecessary space characters, try to capitalize, and try to punctuate. 70 | 71 | For multi-line docstrings, add a blank line after the closing ``"""``. 72 | Don't use a blank line before the closing quotes. 73 | 74 | ``""" not this """`` ``"""This."""`` 75 | 76 | ```python 77 | """ 78 | Not this. 79 | """ 80 | ``` 81 | ```python 82 | """This.""" 83 | ``` 84 | 85 | This note is the least important. Focus on content first, but remember that consistent docs look good. 86 | - Be sensible about the first line. Generally, one stand-alone summary line (per the Google guidelines) is good. 87 | Sometimes, it's better to cut directly to the args or an extended description. It's always acceptable to have a 88 | "trailing" citation.
89 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017-Present OpenNMT 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AdaLabel 2 | 3 | Code/data for ACL'21 paper "Diversifying Dialog Generation via Adaptive Label Smoothing". 4 | 5 | We implemented an Adaptive Label Smoothing (AdaLabel) approach that can adaptively estimate a target label distribution at each time step for different contexts. 6 | Our method is an extension of the traditional MLE loss. 7 | The current implementation is designed for the task of dialogue generation. 
8 | However, our approach can be readily extended to other text generation tasks such as summarization. 9 | Please refer to our paper for more details. 10 | 11 | Our implementation is based on the [OpenNMT-py](https://github.com/OpenNMT/OpenNMT-py) project, 12 | therefore most behaviors of our code follow the default settings in OpenNMT-py. 13 | Specifically, we forked from [this commit](https://github.com/OpenNMT/OpenNMT-py/tree/1bbf410a00e1d15c87fc5393b9124d531e134445) of OpenNMT-py, 14 | and implemented our code on top of it. 15 | This repo preserves all previous commits of OpenNMT-py and ignores all the follow-up commits. 16 | Our changes can be viewed by comparing the [commits](https://github.com/lemon234071/AdaLabel/commit/4b9531943a4e00f1ee8a7f4b8bf3554e2b1e0f41). 17 | 18 | Our code is tested on Ubuntu 16.04 using Python 3.7.4 and PyTorch 1.7.1. 19 | 20 | ## How to use 21 | 22 | ### Step 1: Setup 23 | 24 | Install dependencies: 25 | 26 | ```bash 27 | conda create -n adalabel python=3.7.4 28 | conda activate adalabel 29 | conda install pytorch==1.7.1 cudatoolkit=10.1 -c PyTorch -n adalabel 30 | pip install -r requirement.txt 31 | ``` 32 | 33 | Make folders to store training and testing files: 34 | 35 | ```bash 36 | mkdir checkpoint # Model checkpoints will be saved here 37 | mkdir log_dir # The training log will be placed here 38 | mkdir result # The inferred results will be saved here 39 | ``` 40 | 41 | ### Step 2: Preprocess the data 42 | 43 | The data can be downloaded from this [link](https://drive.google.com/file/d/1U4M0h9tLNeCyu9JBfSgR3r5EE6IIqyNZ/view?usp=sharing). 44 | After downloading and unzipping, the DailyDialog and OpenSubtitle datasets used in our paper can be found in the `data_daily` and `data_ost` folders, respectively. 45 | We provide a script `scripts/preprocess.sh` to preprocess the data.
46 | 47 | ```bash 48 | bash scripts/preprocess.sh 49 | ``` 50 | 51 | Note: 52 | 53 | - Before running `scripts/preprocess.sh`, remember to modify its first line (i.e., the value of `DATA_DIR`) to specify the correct data folder. 54 | - The default choice of our tokenizer is [bert-base-uncased](https://huggingface.co/bert-base-uncased) 55 | 56 | ### Step 3: Train the model 57 | 58 | The training of our model can be performed using the following script: 59 | 60 | ```bash 61 | bash scripts/train_daily.sh # Train models on the DailyDialog dataset 62 | ``` 63 | 64 | or 65 | 66 | ```bash 67 | bash scripts/train_ost.sh # Train models on the OpenSubtitle dataset 68 | ``` 69 | 70 | Note: 71 | 72 | - The resulting checkpoints will be written to the `checkpoint` folder. 73 | - By default, our script uses the first available GPU. 74 | - Once the training is completed, the training script will log out the best performing model on the validation set. 75 | - Experiments in our paper are performed using TITAN XP with 12GB memory. 76 | 77 | ### Step 4: Inference 78 | 79 | The inference of our model can be performed using the following script: 80 | 81 | ```bash 82 | bash scripts/inference_daily.sh {which GPU to use} {path to your model checkpoint} # Infer models on the DailyDialog dataset 83 | ``` 84 | 85 | or 86 | 87 | ```bash 88 | bash scripts/inference_ost.sh {which GPU to use} {path to your model checkpoint} # Infer models on the OpenSubtitle dataset 89 | ``` 90 | 91 | 92 | Note: 93 | 94 | - Inferred outputs will be saved to the `result` folder. 
95 | 96 | ### Step 5: Evaluation 97 | 98 | The following script can be used to evaluate our model based on the inferred outputs obtained in Step 4: 99 | 100 | ```bash 101 | python scripts/eval.py {path to the data folder} {path to the inferred output file} 102 | ``` 103 | 104 | ## Citation 105 | 106 | Please cite our paper if you find this repo useful :) 107 | 108 | ```BibTeX 109 | @inproceedings{wang2021adalabel, 110 | title={Diversifying Dialog Generation via Adaptive Label Smoothing}, 111 | author={Wang, Yida and Zheng, Yinhe and Jiang, Yong and Huang, Minlie}, 112 | booktitle={Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics}, 113 | year={2021} 114 | } 115 | ``` 116 | 117 | ---- 118 | 119 | Issues and pull requests are welcome. 120 | -------------------------------------------------------------------------------- /available_models/example.conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "models_root": "./available_models", 3 | "models": [ 4 | { 5 | "id": 100, 6 | "model": "model_0.pt", 7 | "timeout": 600, 8 | "on_timeout": "to_cpu", 9 | "load": true, 10 | "opt": { 11 | "gpu": 0, 12 | "beam_size": 5 13 | }, 14 | "tokenizer": { 15 | "type": "sentencepiece", 16 | "model": "wmtenfr.model" 17 | } 18 | },{ 19 | "model": "model_0.light.pt", 20 | "timeout": -1, 21 | "on_timeout": "unload", 22 | "model_root": "../other_models", 23 | "opt": { 24 | "batch_size": 1, 25 | "beam_size": 10 26 | } 27 | } 28 | ] 29 | } 30 | -------------------------------------------------------------------------------- /config/config-rnn-summarization.yml: -------------------------------------------------------------------------------- 1 | data: data/cnndm/CNNDM 2 | save_model: models/cnndm 3 | save_checkpoint_steps: 10000 4 | keep_checkpoint: 10 5 | seed: 3435 6 | train_steps: 100000 7 | valid_steps: 10000 8 | report_every: 100 9 | 10 | encoder_type: brnn 11 | word_vec_size: 128 12 | rnn_size: 512 13 
| layers: 1 14 | 15 | optim: adagrad 16 | learning_rate: 0.15 17 | adagrad_accumulator_init: 0.1 18 | max_grad_norm: 2 19 | 20 | batch_size: 16 21 | dropout: 0.0 22 | 23 | copy_attn: 'true' 24 | global_attention: mlp 25 | reuse_copy_attn: 'true' 26 | bridge: 'true' 27 | 28 | world_size: 2 29 | gpu_ranks: 30 | - 0 31 | - 1 32 | -------------------------------------------------------------------------------- /config/config-transformer-base-1GPU.yml: -------------------------------------------------------------------------------- 1 | data: exp/dataset.de-en 2 | save_model: exp/model.de-en 3 | save_checkpoint_steps: 10000 4 | keep_checkpoint: 10 5 | seed: 3435 6 | train_steps: 500000 7 | valid_steps: 10000 8 | warmup_steps: 8000 9 | report_every: 100 10 | 11 | decoder_type: transformer 12 | encoder_type: transformer 13 | word_vec_size: 512 14 | rnn_size: 512 15 | layers: 6 16 | transformer_ff: 2048 17 | heads: 8 18 | 19 | accum_count: 8 20 | optim: adam 21 | adam_beta1: 0.9 22 | adam_beta2: 0.998 23 | decay_method: noam 24 | learning_rate: 2.0 25 | max_grad_norm: 0.0 26 | 27 | batch_size: 4096 28 | batch_type: tokens 29 | normalization: tokens 30 | dropout: 0.1 31 | label_smoothing: 0.1 32 | 33 | max_generator_batches: 2 34 | 35 | param_init: 0.0 36 | param_init_glorot: 'true' 37 | position_encoding: 'true' 38 | 39 | world_size: 1 40 | gpu_ranks: 41 | - 0 42 | 43 | -------------------------------------------------------------------------------- /config/config-transformer-base-4GPU.yml: -------------------------------------------------------------------------------- 1 | data: exp/dataset.de-en 2 | save_model: exp/model.de-en 3 | save_checkpoint_steps: 10000 4 | keep_checkpoint: 10 5 | seed: 3435 6 | train_steps: 200000 7 | valid_steps: 10000 8 | warmup_steps: 8000 9 | report_every: 100 10 | 11 | decoder_type: transformer 12 | encoder_type: transformer 13 | word_vec_size: 512 14 | rnn_size: 512 15 | layers: 6 16 | transformer_ff: 2048 17 | heads: 8 18 | 19 | accum_count: 
2 20 | optim: adam 21 | adam_beta1: 0.9 22 | adam_beta2: 0.998 23 | decay_method: noam 24 | learning_rate: 2.0 25 | max_grad_norm: 0.0 26 | 27 | batch_size: 4096 28 | batch_type: tokens 29 | normalization: tokens 30 | dropout: 0.1 31 | label_smoothing: 0.1 32 | 33 | max_generator_batches: 2 34 | 35 | param_init: 0.0 36 | param_init_glorot: 'true' 37 | position_encoding: 'true' 38 | 39 | world_size: 4 40 | gpu_ranks: 41 | - 0 42 | - 1 43 | - 2 44 | - 3 45 | 46 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | > python preprocess.py -train_src data/src-train.txt -train_tgt data/tgt-train.txt -valid_src data/src-val.txt -valid_tgt data/tgt-val.txt -save_data data/data -src_vocab_size 1000 -tgt_vocab_size 1000 6 | 7 | > python train.py -data data/data -save_model /n/rush_lab/data/tmp_ -world_size 1 -gpu_ranks 0 -rnn_size 100 -word_vec_size 50 -layers 1 -train_steps 100 -optim adam -learning_rate 0.001 8 | -------------------------------------------------------------------------------- /data/morph/src.valid: -------------------------------------------------------------------------------- 1 | o p r u g a 2 | б е с т и ј а 3 | f a l s e t 4 | b e z i m e n o 5 | п р с и т и 6 | d e c e m b a r s k i 7 | м е ђ а 8 | š t a n d a r a c 9 | к р п о 10 | t a j n i k 11 | к е с и т и 12 | с р ч а н 13 | s c e n o g r a f 14 | б р а т и ћ 15 | з а у з е т и 16 | v e z a 17 | н е п а л а ц 18 | p r i l a g o d l j i v o s t 19 | g o l f e r i c a 20 | г р а ђ е в и н а ц 21 | с е к л у з и ј а 22 | р а с к р с н и ц а 23 | n e o p r a v d a n o 24 | k o m a d 25 | b e s t i j a l n o s t 26 | t u p o g l a v o 27 | с т р а н 28 | к р и в и ч а р 29 | п р о ј а х а т и 30 | о р г а н с к и 31 | r a z m i r i c a 32 | ц е с а р 33 | l j u t n j a 34 | н е в е ш т 35 | н о б е л о в а ц 36 | o s v a j a č 37 | х е д о н и с т 38 | р о 
ж н а т 39 | s r e b r e n 40 | p r i m e r a n 41 | и с к р и ч а в о 42 | s a g i n j a t i 43 | с и т а н 44 | v e š m a š i n a 45 | b l a m a ž a 46 | н е г и р а т и 47 | n a z d r a v i t i 48 | б о ж и 49 | r e š o 50 | к о ш у т е 51 | и м е н д а н 52 | e l e g a n t n o 53 | s k v r č i t i 54 | b e z v o l j a 55 | н а г и б 56 | k a p i t a l a n 57 | r e n o m e 58 | ф а с ц и н и р а т и 59 | с а к у п љ а т и 60 | п ш е н и ч н и 61 | u s m j e r a v a t i 62 | s k r i v a t i 63 | t i c a t i 64 | b i t n o 65 | j u r i š 66 | ч а с о п и с 67 | b e s k o n a č n o s t 68 | с а ф и р н и 69 | f a l s i f i k a t 70 | o n t o g e n e z a 71 | p r i m j e n a 72 | к р е к е т 73 | а е р о д р о м 74 | o s l o b a đ a t i 75 | с т у д и ј 76 | к р а т к о в и д 77 | л у п е ш т в о 78 | ž i v a c 79 | ј а ј а ш ц е 80 | r a z u v e r i t i 81 | z a v i s t 82 | k o n d u r a 83 | а м ф и т е а т а р 84 | а л г о н к и ј с к и 85 | у м и в а т и 86 | д а ж д е в њ а к 87 | д е ф е т и с т 88 | p r e s k a k a t i 89 | п е р у а н с к и 90 | s u p r o t s t a v l j a t i 91 | s p l j o š t e n 92 | с а ф и р 93 | п о ш т е ђ и в а т и 94 | к р а н и о л о г и ј с к и 95 | s v o j t a 96 | у п а д а т и 97 | r a ž a n j 98 | д о л и к о в а т и 99 | с а м о у в е р е н о с т 100 | х у м о р и с т и ч к и 101 | ч е ш к и 102 | s r a t i 103 | б л е б н у т и 104 | о д с е ц а т и 105 | n o v o r o đ e n 106 | b r e s k v a 107 | о б р е д 108 | k o n a č a n 109 | г р и с т и 110 | i k a v a c 111 | e p i l e p t i č a r k a 112 | d e z o r g a n i z a c i j a 113 | о т е ћ и 114 | z n a m e n j e 115 | м у љ а в 116 | i t a l i j a n s k i 117 | p r e t v o r b a 118 | g e r i j a t r i j a 119 | a l b a n i z a c i j a 120 | о ц ј е н а 121 | o d b i t i 122 | r e t o r i č n o 123 | м о љ а ц 124 | š k a f e t i n 125 | s p a n a ć 126 | у р а н 127 | n e p r i s t u p a č a n 128 | č e l n i k 129 | о п л о ђ и в а т и 130 | к и с т 131 | с а с е ћ и 
132 | п о з д р а в 133 | ч а ђ а в о с т 134 | х а н 135 | п р и п о в е д а ч 136 | k i n o l o g i j a 137 | a s t r o n o m i j s k i 138 | n e i z l j e č i v o s t 139 | u s l o v a n 140 | s r p s k i 141 | e v o l u c i o n i z a m 142 | а н о т и р а т и 143 | d o s t a v i t i 144 | с а д а 145 | д о д а т а к 146 | p r o p i s i v a t i 147 | u s t v r đ i v a t i 148 | m i j e š a l i c a 149 | е г з и с т е н ц и ј а л а н 150 | и в и ц а 151 | a u t o k e f a l a n 152 | ж и в а х н о с т 153 | и з о р а в а т и 154 | х л а ч е 155 | к о н с о н а н т 156 | у з м а ћ и 157 | и м и г р и р а т и 158 | п л и т а к 159 | т е о з о ф и ј а 160 | к р а т к о в р а т 161 | s a t a n i z a m 162 | х а р а ч 163 | o t r c a n 164 | g i b a k 165 | k o s t r u š i t i 166 | o d g o v o r i t i 167 | b a l a v i t i 168 | f l a š i r a t i 169 | с л а ч и ц а 170 | t e n d e n c i j a 171 | g d a 172 | m n o ž i n a 173 | т е л е о л о г и ј а 174 | k r i z a n t e m a 175 | l j e t o 176 | к о н т р а д и к ц и ј а 177 | и н д и р е к т а н 178 | s l a m k a 179 | š t r a j h a t i 180 | п а п и р н и ч а р 181 | k u r t o a z n o 182 | o č e k i v a n j e 183 | n a m a ć i 184 | z a m a ć i 185 | b i f t e k 186 | с т о г а 187 | х о м о г е н 188 | з а м а к н у т и 189 | o k r i v i t i 190 | у ч и н и т и 191 | k i n u t i 192 | o b r t a t i 193 | t r a n s p l a n t a c i j a 194 | п р е в е л и к 195 | u k i d a t i 196 | к а м е н и 197 | -------------------------------------------------------------------------------- /data/morph/tgt.valid: -------------------------------------------------------------------------------- 1 | o p r u ɡ a 2 | b e s t i j a 3 | f a l s e t 4 | b e z i m e n o 5 | p r s i t i 6 | d e t s e m b a r s k i 7 | m e d z a 8 | ʃ t a n d a r a t s 9 | k r p o 10 | t a j n i k 11 | k e s i t i 12 | s r t ʃ a n 13 | s t s e n o ɡ r a f 14 | b r a t i t x 15 | z a u z e t i 16 | ʋ e z a 17 | n e p a l a t s 18 | p r i l a ɡ o d ʎ i ʋ 
o s t 19 | ɡ o l f e r i t s a 20 | ɡ r a d z e ʋ i n a t s 21 | s e k l u z i j a 22 | r a s k r s n i t s a 23 | n e o p r a ʋ d a n o 24 | k o m a d 25 | b e s t i j a l n o s t 26 | t u p o ɡ l a ʋ o 27 | s t r a n 28 | k r i ʋ i t ʃ a r 29 | p r o j a x a t i 30 | o r ɡ a n s k i 31 | r a z m i r i t s a 32 | t s e s a r 33 | ʎ u t ɲ a 34 | n e ʋ e ʃ t 35 | n o b e l o ʋ a t s 36 | o s ʋ a j a t ʃ 37 | x e d o n i s t 38 | r o ʒ n a t 39 | s r e b r e n 40 | p r i m e r a n 41 | i s k r i t ʃ a ʋ o 42 | s a ɡ i ɲ a t i 43 | s i t a n 44 | ʋ e ʃ m a ʃ i n a 45 | b l a m a ʒ a 46 | n e ɡ i r a t i 47 | n a z d r a ʋ i t i 48 | b o ʒ i 49 | r e ʃ o 50 | k o ʃ u t e 51 | i m e n d a n 52 | e l e ɡ a n t n o 53 | s k ʋ r t ʃ i t i 54 | b e z ʋ o ʎ a 55 | n a ɡ i b 56 | k a p i t a l a n 57 | r e n o m e 58 | f a s t s i n i r a t i 59 | s a k u p ʎ a t i 60 | p ʃ e n i t ʃ n i 61 | u s m j e r a ʋ a t i 62 | s k r i ʋ a t i 63 | t i t s a t i 64 | b i t n o 65 | j u r i ʃ 66 | t ʃ a s o p i s 67 | b e s k o n a t ʃ n o s t 68 | s a f i r n i 69 | f a l s i f i k a t 70 | o n t o ɡ e n e z a 71 | p r i m j e n a 72 | k r e k e t 73 | a e r o d r o m 74 | o s l o b a d z a t i 75 | s t u d i j 76 | k r a t k o ʋ i d 77 | l u p e ʃ t ʋ o 78 | ʒ i ʋ a t s 79 | j a j a ʃ t s e 80 | r a z u ʋ e r i t i 81 | z a ʋ i s t 82 | k o n d u r a 83 | a m f i t e a t a r 84 | a l ɡ o n k i j s k i 85 | u m i ʋ a t i 86 | d a ʒ d e ʋ ɲ a k 87 | d e f e t i s t 88 | p r e s k a k a t i 89 | p e r u a n s k i 90 | s u p r o t s t a ʋ ʎ a t i 91 | s p ʎ o ʃ t e n 92 | s a f i r 93 | p o ʃ t e d z i ʋ a t i 94 | k r a n i o l o ɡ i j s k i 95 | s ʋ o j t a 96 | u p a d a t i 97 | r a ʒ a ɲ 98 | d o l i k o ʋ a t i 99 | s a m o u ʋ e r e n o s t 100 | x u m o r i s t i t ʃ k i 101 | t ʃ e ʃ k i 102 | s r a t i 103 | b l e b n u t i 104 | o d s e t s a t i 105 | n o ʋ o r o d z e n 106 | b r e s k ʋ a 107 | o b r e d 108 | k o n a t ʃ a n 109 | ɡ r i s t i 110 | i k a ʋ a t s 111 | e p 
i l e p t i t ʃ a r k a 112 | d e z o r ɡ a n i z a t s i j a 113 | o t e t x i 114 | z n a m e ɲ e 115 | m u ʎ a ʋ 116 | i t a l i j a n s k i 117 | p r e t ʋ o r b a 118 | ɡ e r i j a t r i j a 119 | a l b a n i z a t s i j a 120 | o t s j e n a 121 | o d b i t i 122 | r e t o r i t ʃ n o 123 | m o ʎ a t s 124 | ʃ k a f e t i n 125 | s p a n a t x 126 | u r a n 127 | n e p r i s t u p a t ʃ a n 128 | t ʃ e l n i k 129 | o p l o d z i ʋ a t i 130 | k i s t 131 | s a s e t x i 132 | p o z d r a ʋ 133 | t ʃ a d z a ʋ o s t 134 | x a n 135 | p r i p o ʋ e d a t ʃ 136 | k i n o l o ɡ i j a 137 | a s t r o n o m i j s k i 138 | n e i z ʎ e t ʃ i ʋ o s t 139 | u s l o ʋ a n 140 | s r p s k i 141 | e ʋ o l u t s i o n i z a m 142 | a n o t i r a t i 143 | d o s t a ʋ i t i 144 | s a d a 145 | d o d a t a k 146 | p r o p i s i ʋ a t i 147 | u s t ʋ r d z i ʋ a t i 148 | m i j e ʃ a l i t s a 149 | e ɡ z i s t e n t s i j a l a n 150 | i ʋ i t s a 151 | a u t o k e f a l a n 152 | ʒ i ʋ a x n o s t 153 | i z o r a ʋ a t i 154 | x l a t ʃ e 155 | k o n s o n a n t 156 | u z m a t x i 157 | i m i ɡ r i r a t i 158 | p l i t a k 159 | t e o z o f i j a 160 | k r a t k o ʋ r a t 161 | s a t a n i z a m 162 | x a r a t ʃ 163 | o t r t s a n 164 | ɡ i b a k 165 | k o s t r u ʃ i t i 166 | o d ɡ o ʋ o r i t i 167 | b a l a ʋ i t i 168 | f l a ʃ i r a t i 169 | s l a t ʃ i t s a 170 | t e n d e n t s i j a 171 | ɡ d a 172 | m n o ʒ i n a 173 | t e l e o l o ɡ i j a 174 | k r i z a n t e m a 175 | ʎ e t o 176 | k o n t r a d i k t s i j a 177 | i n d i r e k t a n 178 | s l a m k a 179 | ʃ t r a j x a t i 180 | p a p i r n i t ʃ a r 181 | k u r t o a z n o 182 | o t ʃ e k i ʋ a ɲ e 183 | n a m a t x i 184 | z a m a t x i 185 | b i f t e k 186 | s t o ɡ a 187 | x o m o ɡ e n 188 | z a m a k n u t i 189 | o k r i ʋ i t i 190 | u t ʃ i n i t i 191 | k i n u t i 192 | o b r t a t i 193 | t r a n s p l a n t a t s i j a 194 | p r e ʋ e l i k 195 | u k i d a t i 196 | k a m e n i 197 | 
-------------------------------------------------------------------------------- /data/test_model2.src: -------------------------------------------------------------------------------- 1 | а з и ј а т с к и 2 | а к р о б а т с к и 3 | а л к о х о л и ч а р 4 | а р м а т у р а 5 | а у т о н о м а ш т в о 6 | б а р о к н и 7 | б е з б р о ј а н 8 | б о р о в и н а 9 | б о с а н а ц 10 | б р а у з е р 11 | в о ј н и ш т в о 12 | г д е г д е 13 | г р а б љ е 14 | г р д о с и ј а 15 | д а л е к о в и д н о 16 | д е з и н ф о р м а ц и ј а 17 | д е р и ш т е 18 | д р о б њ а к 19 | е м а н ц и п а ц и ј а 20 | ж а н д а р м е р и ј а 21 | з а в и д љ и в а ц 22 | з а к о ч и т и 23 | з а н е м а р и т и 24 | з в о н о 25 | з л о д ј е л о 26 | и г р о к а з 27 | ј е д и н и т и 28 | ј е д н о с т а в н о 29 | к о з л и н а ц 30 | к о н с т р у к т и в а н 31 | к р е к е т н у т и 32 | к у ш а ч и ц а 33 | л е г и т и м н о 34 | л и з а л и ц а 35 | л о м љ а в а 36 | м а м у р л у к 37 | м е д а љ а 38 | м о р а л н о 39 | н е г о с т о љ у б и в о с т 40 | н е д ј е љ н и 41 | н е к о ј и 42 | н е с т а н а к 43 | н е с т р у ч њ а к 44 | о д м о р и т и 45 | о п о р и ц а т и 46 | о п р и ч а т и 47 | о с а о 48 | п а њ 49 | п е р ф е к т 50 | п о н а в љ а т и 51 | п о п р и м и т и 52 | п р а с л о в е н с к и 53 | п р и г о д а н 54 | п р и п р е м а т и 55 | п с и х о п а т о л о г и ј а 56 | п с о в а ч к и 57 | п у н ч 58 | р а з а п и њ а т и 59 | с а б и т и 60 | с а г и б љ и в о с т 61 | с а к р и т и 62 | с а к р о с а н к т а н 63 | с а л а т н и 64 | с и р н и 65 | с к у п о ц ј е н 66 | с л а т к о р ј е ч и в о 67 | с н о ш љ и в о 68 | с о ч н о 69 | с т и д љ и в 70 | т а ј 71 | т а н а ц 72 | т е с т е н и н а 73 | т р а н з и т 74 | т р ч а т и 75 | ћ у м у р џ и ј а 76 | у ж и в а л а ц 77 | у к о р е н и т и 78 | у п о м о ћ 79 | у р о т н и к 80 | у с м ј е р а в а т и 81 | у с п у т 82 | у с т а ј а т и 83 | у ц е н и т и 84 | ф а н а т и ч н о с 
т 85 | ф о т к а 86 | х и љ а д у 87 | х и п и 88 | х у м а н и з а м 89 | ц р н е т и 90 | ш а м п и о н с к и 91 | ш и ф к а р т а 92 | ш л а ј е р 93 | ш л а ј ф а т и 94 | ш л у к 95 | ш п а ј з 96 | ш п а ј с к а р т а 97 | ш п а н с к и 98 | ш т о к а в а ц 99 | ш т р а ј х е р 100 | ш у п ч и ћ 101 | a g r e s o r s k i 102 | a k t e r 103 | a m b r o z i j a 104 | a p s t i n e n c i j a 105 | a s i m e t r i j a 106 | a v i o k o m p a n i j a 107 | d a ž d e v n j a k 108 | d e l o m 109 | d i j a l o g 110 | d o h v a t i t i 111 | d o k t o r a n d 112 | d o k t o r s k i 113 | d o v i t l j i v 114 | d o z v o l a 115 | d v o s m i s l e n 116 | e r i t r e j a 117 | e s t e t i k a 118 | e v r o p s k i 119 | f i z i o t e r a p i j s k i 120 | g a j 121 | g m a z 122 | h a j d e m o 123 | i n j e 124 | i n t o n a c i j a 125 | k e s t e n j a s t 126 | k o l e v k a 127 | k o z j i 128 | k r a l j e š a k 129 | k r a t k o 130 | k r č e v i n a 131 | k r e a t i v a n 132 | k r e š e n d o 133 | k u ć a n i c a 134 | k u ć n i 135 | l i č a n 136 | l j u l j a t i 137 | m e s n i 138 | m r l j a 139 | m u š k o s t 140 | n a b r u s i v a t i 141 | n a d o b u d a n 142 | n a k a l e m i t i 143 | n a r e č j e 144 | n a s l e đ i v a t i 145 | n e ć a k 146 | n e d e l j i v 147 | n e o s e t l j i v o 148 | n e s a v e s t a n 149 | n e s u v i s a o 150 | n e u r a s t e n i č a n 151 | n e u s t a v a n 152 | n i š a n d ž i j a 153 | o b r a ć a t i 154 | o k r e t a t i 155 | o n e s p o s o b l j i v a t i 156 | o s a k a ć i v a t i 157 | o s v e t l j i v 158 | p a l e o z o i k 159 | p e s n i k 160 | p l a t n e n 161 | p l e m e n i t 162 | p r a v o v a l j a n o s t 163 | p r a ž a n i n 164 | p r i p r a v l j a t i 165 | p r o l e t o s 166 | r a z g o v a r a t i 167 | r a z n o l i č a n 168 | r a z r e d 169 | r a z v r a t n o 170 | r i g i d i t e t 171 | r u b e š k i 172 | s e r i o z a n 173 | s i n o ć 174 | s i r u t k a 
175 | s l e d i t i 176 | s m j e š t a j 177 | š n i r a t i 178 | s o k a k 179 | š p e k 180 | s r a m i t i 181 | š t i t o n o š a 182 | s t o t i n a 183 | t i n j a t i 184 | t o k s i k o l o g i j a 185 | t o l i k 186 | t r a n s a t l a n t s k i 187 | u n u t a r 188 | u z g r e d i c e 189 | v a k c i n a 190 | v a š m a š i n a 191 | v e ć e 192 | v l a d a v i n a 193 | v o k a c i j a 194 | z a b u š a n t s k i 195 | z a k l i n j a t i 196 | z a k o č i t i 197 | z n a t a n 198 | z o r a n 199 | -------------------------------------------------------------------------------- /data/test_model2.tgt: -------------------------------------------------------------------------------- 1 | a z i j a t s k i 2 | a k r o b a t s k i 3 | a l k o x o l i t ʃ a r 4 | a r m a t u r a 5 | a u t o n o m a ʃ t ʋ o 6 | b a r o k n i 7 | b e z b r o j a n 8 | b o r o ʋ i n a 9 | b o s a n a t s 10 | b r a u z e r 11 | ʋ o j n i ʃ t ʋ o 12 | ɡ d e ɡ d e 13 | ɡ r a b ʎ e 14 | ɡ r d o s i j a 15 | d a l e k o ʋ i d n o 16 | d e z i n f o r m a t s i j a 17 | d e r i ʃ t e 18 | d r o b ɲ a k 19 | e m a n t s i p a t s i j a 20 | ʒ a n d a r m e r i j a 21 | z a ʋ i d ʎ i ʋ a t s 22 | z a k o t ʃ i t i 23 | z a n e m a r i t i 24 | z ʋ o n o 25 | z l o d j e l o 26 | i ɡ r o k a z 27 | j e d i n i t i 28 | j e d n o s t a ʋ n o 29 | k o z l i n a t s 30 | k o n s t r u k t i ʋ a n 31 | k r e k e t n u t i 32 | k u ʃ a t ʃ i t s a 33 | l e ɡ i t i m n o 34 | l i z a l i t s a 35 | l o m ʎ a ʋ a 36 | m a m u r l u k 37 | m e d a ʎ a 38 | m o r a l n o 39 | n e ɡ o s t o ʎ u b i ʋ o s t 40 | n e d j e ʎ n i 41 | n e k o j i 42 | n e s t a n a k 43 | n e s t r u t ʃ ɲ a k 44 | o d m o r i t i 45 | o p o r i t s a t i 46 | o p r i t ʃ a t i 47 | o s a o 48 | p a ɲ 49 | p e r f e k t 50 | p o n a ʋ ʎ a t i 51 | p o p r i m i t i 52 | p r a s l o ʋ e n s k i 53 | p r i ɡ o d a n 54 | p r i p r e m a t i 55 | p s i x o p a t o l o ɡ i j a 56 | p s o ʋ a t ʃ k i 57 | p u n t ʃ 58 
| r a z a p i ɲ a t i 59 | s a b i t i 60 | s a ɡ i b ʎ i ʋ o s t 61 | s a k r i t i 62 | s a k r o s a n k t a n 63 | s a l a t n i 64 | s i r n i 65 | s k u p o t s j e n 66 | s l a t k o r j e t ʃ i ʋ o 67 | s n o ʃ ʎ i ʋ o 68 | s o t ʃ n o 69 | s t i d ʎ i ʋ 70 | t a j 71 | t a n a t s 72 | t e s t e n i n a 73 | t r a n z i t 74 | t r t ʃ a t i 75 | t x u m u r d ʒ i j a 76 | u ʒ i ʋ a l a t s 77 | u k o r e n i t i 78 | u p o m o t x 79 | u r o t n i k 80 | u s m j e r a ʋ a t i 81 | u s p u t 82 | u s t a j a t i 83 | u t s e n i t i 84 | f a n a t i t ʃ n o s t 85 | f o t k a 86 | x i ʎ a d u 87 | x i p i 88 | x u m a n i z a m 89 | t s r n e t i 90 | ʃ a m p i o n s k i 91 | ʃ i f k a r t a 92 | ʃ l a j e r 93 | ʃ l a j f a t i 94 | ʃ l u k 95 | ʃ p a j z 96 | ʃ p a j s k a r t a 97 | ʃ p a n s k i 98 | ʃ t o k a ʋ a t s 99 | ʃ t r a j x e r 100 | ʃ u p t ʃ i t x 101 | a ɡ r e s o r s k i 102 | a k t e r 103 | a m b r o z i j a 104 | a p s t i n e n t s i j a 105 | a s i m e t r i j a 106 | a ʋ i o k o m p a n i j a 107 | d a ʒ d e ʋ ɲ a k 108 | d e l o m 109 | d i j a l o ɡ 110 | d o x ʋ a t i t i 111 | d o k t o r a n d 112 | d o k t o r s k i 113 | d o ʋ i t ʎ i ʋ 114 | d o z ʋ o l a 115 | d ʋ o s m i s l e n 116 | e r i t r e j a 117 | e s t e t i k a 118 | e ʋ r o p s k i 119 | f i z i o t e r a p i j s k i 120 | ɡ a j 121 | ɡ m a z 122 | x a j d e m o 123 | i ɲ e 124 | i n t o n a t s i j a 125 | k e s t e ɲ a s t 126 | k o l e ʋ k a 127 | k o z j i 128 | k r a ʎ e ʃ a k 129 | k r a t k o 130 | k r t ʃ e ʋ i n a 131 | k r e a t i ʋ a n 132 | k r e ʃ e n d o 133 | k u t x a n i t s a 134 | k u t x n i 135 | l i t ʃ a n 136 | ʎ u ʎ a t i 137 | m e s n i 138 | m r ʎ a 139 | m u ʃ k o s t 140 | n a b r u s i ʋ a t i 141 | n a d o b u d a n 142 | n a k a l e m i t i 143 | n a r e t ʃ j e 144 | n a s l e d z i ʋ a t i 145 | n e t x a k 146 | n e d e ʎ i ʋ 147 | n e o s e t ʎ i ʋ o 148 | n e s a ʋ e s t a n 149 | n e s u ʋ i s a o 150 | n e u r a s t e n i 
t ʃ a n 151 | n e u s t a ʋ a n 152 | n i ʃ a n d ʒ i j a 153 | o b r a t x a t i 154 | o k r e t a t i 155 | o n e s p o s o b ʎ i ʋ a t i 156 | o s a k a t x i ʋ a t i 157 | o s ʋ e t ʎ i ʋ 158 | p a l e o z o i k 159 | p e s n i k 160 | p l a t n e n 161 | p l e m e n i t 162 | p r a ʋ o ʋ a ʎ a n o s t 163 | p r a ʒ a n i n 164 | p r i p r a ʋ ʎ a t i 165 | p r o l e t o s 166 | r a z ɡ o ʋ a r a t i 167 | r a z n o l i t ʃ a n 168 | r a z r e d 169 | r a z ʋ r a t n o 170 | r i ɡ i d i t e t 171 | r u b e ʃ k i 172 | s e r i o z a n 173 | s i n o t x 174 | s i r u t k a 175 | s l e d i t i 176 | s m j e ʃ t a j 177 | ʃ n i r a t i 178 | s o k a k 179 | ʃ p e k 180 | s r a m i t i 181 | ʃ t i t o n o ʃ a 182 | s t o t i n a 183 | t i ɲ a t i 184 | t o k s i k o l o ɡ i j a 185 | t o l i k 186 | t r a n s a t l a n t s k i 187 | u n u t a r 188 | u z ɡ r e d i t s e 189 | ʋ a k t s i n a 190 | ʋ a ʃ m a ʃ i n a 191 | ʋ e t x e 192 | ʋ l a d a ʋ i n a 193 | ʋ o k a t s i j a 194 | z a b u ʃ a n t s k i 195 | z a k l i ɲ a t i 196 | z a k o t ʃ i t i 197 | z n a t a n 198 | z o r a n 199 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = python3 -msphinx 7 | SPHINXPROJ = OpenNMT-py 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx 2 | sphinxcontrib.bibtex 3 | sphinxcontrib.mermaid 4 | sphinx-rtd-theme 5 | recommonmark 6 | sphinx-argparse 7 | sphinx_markdown_tables 8 | -------------------------------------------------------------------------------- /docs/source/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributors 2 | 3 | OpenNMT-py is a community developed project and we love developer contributions. 4 | 5 | ## Guidelines 6 | Before sending a PR, please do this checklist first: 7 | 8 | - Please run `tools/pull_request_chk.sh` and fix any errors. When adding new functionality, also add tests to this script. Included checks: 9 | 1. flake8 check for coding style; 10 | 2. unittest; 11 | 3. continuous integration tests listed in `.travis.yml`. 12 | - When adding/modifying class constructor, please make the arguments as same naming style as its superclass in PyTorch. 13 | - If your change is based on a paper, please include a clear comment and reference in the code (more on that below). 14 | 15 | ### Docstrings 16 | Above all, try to follow the Google docstring format 17 | ([Napoleon example](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html), 18 | [Google styleguide](http://google.github.io/styleguide/pyguide.html)). 19 | This makes it easy to include your contributions in the Sphinx documentation. And, do feel free 20 | to autodoc your contributions in the API ``.rst`` files in the `docs/source` folder! If you do, check that 21 | your additions look right. 
22 | 23 | ```bash 24 | cd docs 25 | # install some dependencies if necessary: 26 | # recommonmark, sphinx_rtd_theme, sphinxcontrib-bibtex 27 | make html 28 | firefox build/html/main.html # or your browser of choice 29 | ``` 30 | 31 | Some particular advice: 32 | - Try to follow Python 3 [``typing`` module](https://docs.python.org/3/library/typing.html) conventions when documenting types. 33 | - Exception: use "or" instead of unions for more readability 34 | - For external types, use the full "import name". Common abbreviations (e.g. ``np``) are acceptable. 35 | For ``torch.Tensor`` types, the ``torch.`` is optional. 36 | - Please don't use tics like `` (`str`) `` or rst directives like `` (:obj:`str`) ``. Napoleon handles types 37 | very well without additional help, so avoid the clutter. 38 | - [Google docstrings don't support multiple returns](https://stackoverflow.com/questions/29221551/can-sphinx-napoleon-document-function-returning-multiple-arguments). 39 | For multiple returns, the following works well with Sphinx and is still very readable. 40 | ```python 41 | def foo(a, b): 42 | """This is my docstring. 43 | 44 | Args: 45 | a (object): Something. 46 | b (class): Another thing. 47 | 48 | Returns: 49 | (object, class): 50 | 51 | * a: Something or rather with a long 52 | description that spills over. 53 | * b: And another thing. 54 | """ 55 | 56 | return a, b 57 | ``` 58 | - When citing a paper, avoid directly linking in the docstring! Add a Bibtex entry to `docs/source/refs.bib`. 59 | E.g., to cite "Attention Is All You Need", visit [arXiv](https://arxiv.org/abs/1706.03762), choose the 60 | [bibtext](https://dblp.uni-trier.de/rec/bibtex/journals/corr/VaswaniSPUJGKP17) link, search `docs/source/refs.bib` 61 | using `CTRL-F` for `DBLP:journals/corr/VaswaniSPUJGKP17`, and if you do not find it then copy-paste the 62 | citation into `refs.bib`. Then, in your docstring, use ``:cite:`DBLP:journals/corr/VaswaniSPUJGKP17` ``. 
63 | - However, a link is better than nothing. 64 | - Please document tensor shapes. Prefer the format 65 | ``` ``(a, b, c)`` ```. This style is easy to read, allows using ``x`` for multiplication, and is common 66 | (PyTorch uses a few variations on the parentheses format, AllenNLP uses exactly this format, Fairseq uses 67 | the parentheses format with single ticks). 68 | - Again, a different style is better than no shape documentation. 69 | - Please avoid unnecessary space characters, try to capitalize, and try to punctuate. 70 | 71 | For multi-line docstrings, add a blank line after the closing ``"""``. 72 | Don't use a blank line before the closing quotes. 73 | 74 | ``""" not this """`` ``"""This."""`` 75 | 76 | ```python 77 | """ 78 | Not this. 79 | """ 80 | ``` 81 | ```python 82 | """This.""" 83 | ``` 84 | 85 | This note is the least important. Focus on content first, but remember that consistent docs look good. 86 | - Be sensible about the first line. Generally, one stand-alone summary line (per the Google guidelines) is good. 87 | Sometimes, it's better to cut directly to the args or an extended description. It's always acceptable to have a 88 | "trailing" citation. -------------------------------------------------------------------------------- /docs/source/_static/theme_overrides.css: -------------------------------------------------------------------------------- 1 | /* override table width restrictions */ 2 | @media screen and (min-width: 767px) { 3 | 4 | .wy-table-responsive table td { 5 | /* !important prevents the common CSS stylesheets from overriding 6 | this as on RTD they are loaded after this stylesheet */ 7 | white-space: normal !important; 8 | } 9 | 10 | .wy-table-responsive { 11 | overflow: visible !important; 12 | } 13 | } -------------------------------------------------------------------------------- /docs/source/examples.rst: -------------------------------------------------------------------------------- 1 | == Examples == 2 | 3 | 4 | .. 
include:: quickstart.md 5 | .. include:: extended.md 6 | -------------------------------------------------------------------------------- /docs/source/extended.md: -------------------------------------------------------------------------------- 1 | 2 | # Translation 3 | 4 | The example below uses the Moses tokenizer (http://www.statmt.org/moses/) to prepare the data and the moses BLEU script for evaluation. This example is for training for the WMT'16 Multimodal Translation task (http://www.statmt.org/wmt16/multimodal-task.html). 5 | 6 | Step 0. Download the data. 7 | 8 | ```bash 9 | mkdir -p data/multi30k 10 | wget http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/training.tar.gz && tar -xf training.tar.gz -C data/multi30k && rm training.tar.gz 11 | wget http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/validation.tar.gz && tar -xf validation.tar.gz -C data/multi30k && rm validation.tar.gz 12 | wget http://www.quest.dcs.shef.ac.uk/wmt17_files_mmt/mmt_task1_test2016.tar.gz && tar -xf mmt_task1_test2016.tar.gz -C data/multi30k && rm mmt_task1_test2016.tar.gz 13 | ``` 14 | 15 | Step 1. Preprocess the data. 16 | 17 | ```bash 18 | for l in en de; do for f in data/multi30k/*.$l; do if [[ "$f" != *"test"* ]]; then sed -i "$ d" $f; fi; done; done 19 | for l in en de; do for f in data/multi30k/*.$l; do perl tools/tokenizer.perl -a -no-escape -l $l -q < $f > $f.atok; done; done 20 | onmt_preprocess -train_src data/multi30k/train.en.atok -train_tgt data/multi30k/train.de.atok -valid_src data/multi30k/val.en.atok -valid_tgt data/multi30k/val.de.atok -save_data data/multi30k.atok.low -lower 21 | ``` 22 | 23 | Step 2. Train the model. 24 | 25 | ```bash 26 | onmt_train -data data/multi30k.atok.low -save_model multi30k_model -gpu_ranks 0 27 | ``` 28 | 29 | Step 3. Translate sentences. 
30 | 31 | ```bash 32 | onmt_translate -gpu 0 -model multi30k_model_*_e13.pt -src data/multi30k/test2016.en.atok -tgt data/multi30k/test2016.de.atok -replace_unk -verbose -output multi30k.test.pred.atok 33 | ``` 34 | 35 | And evaluate 36 | 37 | ```bash 38 | perl tools/multi-bleu.perl data/multi30k/test2016.de.atok < multi30k.test.pred.atok 39 | ``` 40 | -------------------------------------------------------------------------------- /docs/source/im2text.md: -------------------------------------------------------------------------------- 1 | # Image to Text 2 | 3 | A deep learning-based approach to learning the image-to-text conversion, built on top of the OpenNMT system. It is completely data-driven, hence can be used for a variety of image-to-text problems, such as image captioning, optical character recognition and LaTeX decompilation. 4 | 5 | Take LaTeX decompilation as an example, given a formula image: 6 | 7 |

8 | 9 | The goal is to infer the LaTeX source that can be compiled to such an image: 10 | 11 | ``` 12 | d s _ { 1 1 } ^ { 2 } = d x ^ { + } d x ^ { - } + l _ { p } ^ { 9 } \frac { p _ { - } } { r ^ { 7 } } \delta ( x ^ { - } ) d x ^ { - } d x ^ { - } + d x _ { 1 } ^ { 2 } + \; \cdots \; + d x _ { 9 } ^ { 2 } 13 | ``` 14 | 15 | The paper [[What You Get Is What You See: A Visual Markup Decompiler]](https://arxiv.org/pdf/1609.04938.pdf) provides more technical details of this model. 16 | 17 | ### Dependencies 18 | 19 | * `torchvision`: `conda install torchvision` 20 | * `Pillow`: `pip install Pillow` 21 | 22 | ### Quick Start 23 | 24 | To get started, we provide a toy Math-to-LaTex example. We assume that the working directory is `OpenNMT-py` throughout this document. 25 | 26 | Im2Text consists of four commands: 27 | 28 | 0) Download the data. 29 | 30 | ```bash 31 | wget -O data/im2text.tgz http://lstm.seas.harvard.edu/latex/im2text_small.tgz; tar zxf data/im2text.tgz -C data/ 32 | ``` 33 | 34 | 1) Preprocess the data. 35 | 36 | ```bash 37 | onmt_preprocess -data_type img \ 38 | -src_dir data/im2text/images/ \ 39 | -train_src data/im2text/src-train.txt \ 40 | -train_tgt data/im2text/tgt-train.txt -valid_src data/im2text/src-val.txt \ 41 | -valid_tgt data/im2text/tgt-val.txt -save_data data/im2text/demo \ 42 | -tgt_seq_length 150 \ 43 | -tgt_words_min_frequency 2 \ 44 | -shard_size 500 \ 45 | -image_channel_size 1 46 | ``` 47 | 48 | 2) Train the model. 49 | 50 | ```bash 51 | onmt_train -model_type img \ 52 | -data data/im2text/demo \ 53 | -save_model demo-model \ 54 | -gpu_ranks 0 \ 55 | -batch_size 20 \ 56 | -max_grad_norm 20 \ 57 | -learning_rate 0.1 \ 58 | -word_vec_size 80 \ 59 | -encoder_type brnn \ 60 | -image_channel_size 1 61 | ``` 62 | 63 | 3) Translate the images. 
64 | 65 | ```bash 66 | onmt_translate -data_type img \ 67 | -model demo-model_acc_x_ppl_x_e13.pt \ 68 | -src_dir data/im2text/images \ 69 | -src data/im2text/src-test.txt \ 70 | -output pred.txt \ 71 | -max_length 150 \ 72 | -beam_size 5 \ 73 | -gpu 0 \ 74 | -verbose 75 | ``` 76 | 77 | The above dataset is sampled from the [im2latex-100k-dataset](http://lstm.seas.harvard.edu/latex/im2text.tgz). We provide a trained model [[link]](http://lstm.seas.harvard.edu/latex/py-model.pt) on this dataset. 78 | 79 | ### Options 80 | 81 | * `-src_dir`: The directory containing the images. 82 | 83 | * `-train_tgt`: The file storing the tokenized labels, one label per line. It shall look like: 84 | ``` 85 | ... 86 | ... 87 | ... 88 | ... 89 | ``` 90 | 91 | * `-train_src`: The file storing the paths of the images (relative to `src_dir`). 92 | ``` 93 | 94 | 95 | 96 | ... 97 | ``` 98 | -------------------------------------------------------------------------------- /docs/source/index.md: -------------------------------------------------------------------------------- 1 | 2 | .. toctree:: 3 | :maxdepth: 2 4 | 5 | index.md 6 | quickstart.md 7 | extended.md 8 | 9 | 10 | This portal provides detailed documentation of the OpenNMT toolkit. It describes how to use the PyTorch project and how it works. 11 | 12 | 13 | 14 | ## Installation 15 | 16 | 1\. [Install PyTorch](http://pytorch.org/) 17 | 18 | 2\. Clone the OpenNMT-py repository: 19 | 20 | ```bash 21 | git clone https://github.com/OpenNMT/OpenNMT-py 22 | cd OpenNMT-py 23 | ``` 24 | 25 | 3\. Install required libraries 26 | 27 | ```bash 28 | pip install -r requirements.txt 29 | ``` 30 | 31 | And you are ready to go! Take a look at the [quickstart](quickstart.md) to familiarize yourself with the main training workflow. 32 | 33 | Alternatively you can use Docker to install with `nvidia-docker`. The main Dockerfile is included 34 | in the root directory. 
35 | 36 | ## Citation 37 | 38 | When using OpenNMT for research please cite our 39 | [OpenNMT technical report](https://doi.org/10.18653/v1/P17-4012) 40 | 41 | ``` 42 | @inproceedings{opennmt, 43 | author = {Guillaume Klein and 44 | Yoon Kim and 45 | Yuntian Deng and 46 | Jean Senellart and 47 | Alexander M. Rush}, 48 | title = {OpenNMT: Open-Source Toolkit for Neural Machine Translation}, 49 | booktitle = {Proc. ACL}, 50 | year = {2017}, 51 | url = {https://doi.org/10.18653/v1/P17-4012}, 52 | doi = {10.18653/v1/P17-4012} 53 | } 54 | ``` 55 | 56 | ## Additional resources 57 | 58 | You can find additional help or tutorials in the following resources: 59 | 60 | * [Gitter channel](https://gitter.im/OpenNMT/openmt-py) 61 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | Contents 2 | -------- 3 | 4 | .. toctree:: 5 | :caption: Getting Started 6 | :maxdepth: 2 7 | 8 | main.md 9 | quickstart.md 10 | FAQ.md 11 | CONTRIBUTING.md 12 | ref.rst 13 | 14 | 15 | .. toctree:: 16 | :caption: Examples 17 | :maxdepth: 2 18 | 19 | Library.md 20 | extended.md 21 | Summarization.md 22 | im2text.md 23 | speech2text.md 24 | vid2text.rst 25 | 26 | 27 | .. toctree:: 28 | :caption: Scripts 29 | :maxdepth: 2 30 | 31 | options/preprocess.rst 32 | options/train.rst 33 | options/translate.rst 34 | options/server.rst 35 | 36 | 37 | .. toctree:: 38 | :caption: API 39 | :maxdepth: 2 40 | 41 | onmt.rst 42 | onmt.modules.rst 43 | onmt.translation.rst 44 | onmt.translate.translation_server.rst 45 | onmt.inputters.rst -------------------------------------------------------------------------------- /docs/source/main.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | 4 | This portal provides a detailed documentation of the OpenNMT toolkit. It describes how to use the PyTorch project and how it works. 
5 | 6 | 7 | 8 | ## Installation 9 | Install from `pip`: 10 | Install `OpenNMT-py` from `pip`: 11 | ```bash 12 | pip install OpenNMT-py 13 | ``` 14 | 15 | or from the sources: 16 | ```bash 17 | git clone https://github.com/OpenNMT/OpenNMT-py.git 18 | cd OpenNMT-py 19 | python setup.py install 20 | ``` 21 | 22 | *(Optional)* Some advanced features (e.g. working with audio, image or pretrained models) require extra packages; you can install them with: 23 | ```bash 24 | pip install -r requirements.opt.txt 25 | ``` 26 | 27 | And you are ready to go! Take a look at the [quickstart](quickstart) to familiarize yourself with the main training workflow. 28 | 29 | Alternatively you can use Docker to install with `nvidia-docker`. The main Dockerfile is included 30 | in the root directory. 31 | 32 | ## Citation 33 | 34 | When using OpenNMT for research please cite our 35 | [OpenNMT technical report](https://doi.org/10.18653/v1/P17-4012) 36 | 37 | ``` 38 | @inproceedings{opennmt, 39 | author = {Guillaume Klein and 40 | Yoon Kim and 41 | Yuntian Deng and 42 | Jean Senellart and 43 | Alexander M. Rush}, 44 | title = {OpenNMT: Open-Source Toolkit for Neural Machine Translation}, 45 | booktitle = {Proc. ACL}, 46 | year = {2017}, 47 | url = {https://doi.org/10.18653/v1/P17-4012}, 48 | doi = {10.18653/v1/P17-4012} 49 | } 50 | ``` 51 | 52 | ## Additional resources 53 | 54 | You can find additional help or tutorials in the following resources: 55 | 56 | * [Gitter channel](https://gitter.im/OpenNMT/openmt-py) 57 | 58 | * [Forum](http://forum.opennmt.net/) 59 | -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | onmt 2 | ==== 3 | 4 | .. 
toctree:: 5 | :maxdepth: 4 6 | 7 | onmt 8 | -------------------------------------------------------------------------------- /docs/source/onmt.inputters.rst: -------------------------------------------------------------------------------- 1 | Data Loaders 2 | ================= 3 | 4 | Data Readers 5 | ------------- 6 | 7 | .. autoexception:: onmt.inputters.datareader_base.MissingDependencyException 8 | 9 | .. autoclass:: onmt.inputters.DataReaderBase 10 | :members: 11 | 12 | .. autoclass:: onmt.inputters.TextDataReader 13 | :members: 14 | 15 | .. autoclass:: onmt.inputters.ImageDataReader 16 | :members: 17 | 18 | .. autoclass:: onmt.inputters.AudioDataReader 19 | :members: 20 | 21 | 22 | Dataset 23 | -------- 24 | 25 | .. autoclass:: onmt.inputters.Dataset 26 | :members: 27 | -------------------------------------------------------------------------------- /docs/source/onmt.modules.rst: -------------------------------------------------------------------------------- 1 | Modules 2 | ============= 3 | 4 | Core Modules 5 | ------------ 6 | 7 | .. autoclass:: onmt.modules.Embeddings 8 | :members: 9 | 10 | 11 | Encoders 12 | --------- 13 | 14 | .. autoclass:: onmt.encoders.EncoderBase 15 | :members: 16 | 17 | .. autoclass:: onmt.encoders.MeanEncoder 18 | :members: 19 | 20 | .. autoclass:: onmt.encoders.RNNEncoder 21 | :members: 22 | 23 | 24 | Decoders 25 | --------- 26 | 27 | 28 | .. autoclass:: onmt.decoders.DecoderBase 29 | :members: 30 | 31 | .. autoclass:: onmt.decoders.decoder.RNNDecoderBase 32 | :members: 33 | 34 | .. autoclass:: onmt.decoders.StdRNNDecoder 35 | :members: 36 | 37 | .. autoclass:: onmt.decoders.InputFeedRNNDecoder 38 | :members: 39 | 40 | Attention 41 | ---------- 42 | 43 | .. autoclass:: onmt.modules.AverageAttention 44 | :members: 45 | 46 | .. autoclass:: onmt.modules.GlobalAttention 47 | :members: 48 | 49 | 50 | 51 | Architecture: Transformer 52 | ---------------------------- 53 | 54 | .. 
autoclass:: onmt.modules.PositionalEncoding 55 | :members: 56 | 57 | .. autoclass:: onmt.modules.position_ffn.PositionwiseFeedForward 58 | :members: 59 | 60 | .. autoclass:: onmt.encoders.TransformerEncoder 61 | :members: 62 | 63 | .. autoclass:: onmt.decoders.TransformerDecoder 64 | :members: 65 | 66 | .. autoclass:: onmt.modules.MultiHeadedAttention 67 | :members: 68 | :undoc-members: 69 | 70 | 71 | Architecture: Conv2Conv 72 | ---------------------------- 73 | 74 | (These methods are from a user contribution 75 | and have not been thoroughly tested.) 76 | 77 | 78 | .. autoclass:: onmt.encoders.CNNEncoder 79 | :members: 80 | 81 | 82 | .. autoclass:: onmt.decoders.CNNDecoder 83 | :members: 84 | 85 | .. autoclass:: onmt.modules.ConvMultiStepAttention 86 | :members: 87 | 88 | .. autoclass:: onmt.modules.WeightNormConv2d 89 | :members: 90 | 91 | Architecture: SRU 92 | ---------------------------- 93 | 94 | .. autoclass:: onmt.models.sru.SRU 95 | :members: 96 | 97 | 98 | Alternative Encoders 99 | -------------------- 100 | 101 | onmt\.modules\.AudioEncoder 102 | 103 | .. autoclass:: onmt.encoders.AudioEncoder 104 | :members: 105 | 106 | 107 | onmt\.modules\.ImageEncoder 108 | 109 | .. autoclass:: onmt.encoders.ImageEncoder 110 | :members: 111 | 112 | 113 | Copy Attention 114 | -------------- 115 | 116 | .. autoclass:: onmt.modules.CopyGenerator 117 | :members: 118 | 119 | 120 | Structured Attention 121 | ------------------------------------------- 122 | 123 | .. autoclass:: onmt.modules.structured_attention.MatrixTree 124 | :members: 125 | -------------------------------------------------------------------------------- /docs/source/onmt.rst: -------------------------------------------------------------------------------- 1 | Framework 2 | ================= 3 | 4 | Model 5 | ----- 6 | 7 | .. autoclass:: onmt.models.NMTModel 8 | :members: 9 | 10 | Trainer 11 | ------- 12 | 13 | .. autoclass:: onmt.Trainer 14 | :members: 15 | 16 | 17 | .. 
autoclass:: onmt.utils.Statistics 18 | :members: 19 | 20 | Loss 21 | ---- 22 | 23 | 24 | .. autoclass:: onmt.utils.loss.LossComputeBase 25 | :members: 26 | 27 | 28 | Optimizer 29 | ----- 30 | 31 | .. autoclass:: onmt.utils.Optimizer 32 | :members: 33 | -------------------------------------------------------------------------------- /docs/source/onmt.translate.translation_server.rst: -------------------------------------------------------------------------------- 1 | Server 2 | ====== 3 | 4 | 5 | Models 6 | ------------- 7 | 8 | .. autoclass:: onmt.translate.translation_server.ServerModel 9 | :members: 10 | 11 | 12 | Core Server 13 | ------------ 14 | 15 | .. autoexception:: onmt.translate.translation_server.ServerModelError 16 | 17 | .. autoclass:: onmt.translate.translation_server.Timer 18 | :members: 19 | 20 | .. autoclass:: onmt.translate.translation_server.TranslationServer 21 | :members: 22 | -------------------------------------------------------------------------------- /docs/source/onmt.translation.rst: -------------------------------------------------------------------------------- 1 | Translation 2 | ================== 3 | 4 | Translations 5 | ------------- 6 | 7 | .. autoclass:: onmt.translate.Translation 8 | :members: 9 | 10 | Translator Class 11 | ----------------- 12 | 13 | .. autoclass:: onmt.translate.Translator 14 | :members: 15 | 16 | .. autoclass:: onmt.translate.TranslationBuilder 17 | :members: 18 | 19 | 20 | Decoding Strategies 21 | -------------------- 22 | .. autoclass:: onmt.translate.DecodeStrategy 23 | :members: 24 | 25 | .. autoclass:: onmt.translate.BeamSearch 26 | :members: 27 | 28 | .. autofunction:: onmt.translate.random_sampling.sample_with_temperature 29 | 30 | .. autoclass:: onmt.translate.RandomSampling 31 | :members: 32 | 33 | Scoring 34 | -------- 35 | .. autoclass:: onmt.translate.penalties.PenaltyBuilder 36 | :members: 37 | 38 | .. 
autoclass:: onmt.translate.GNMTGlobalScorer 39 | :members: 40 | -------------------------------------------------------------------------------- /docs/source/options/preprocess.rst: -------------------------------------------------------------------------------- 1 | Preprocess 2 | ========== 3 | 4 | .. argparse:: 5 | :filename: ../onmt/bin/preprocess.py 6 | :func: _get_parser 7 | :prog: preprocess.py -------------------------------------------------------------------------------- /docs/source/options/server.rst: -------------------------------------------------------------------------------- 1 | Server 2 | ========= 3 | 4 | .. argparse:: 5 | :filename: ../onmt/bin/server.py 6 | :func: _get_parser 7 | :prog: server.py -------------------------------------------------------------------------------- /docs/source/options/train.rst: -------------------------------------------------------------------------------- 1 | Train 2 | ===== 3 | 4 | .. argparse:: 5 | :filename: ../onmt/bin/train.py 6 | :func: _get_parser 7 | :prog: train.py -------------------------------------------------------------------------------- /docs/source/options/translate.rst: -------------------------------------------------------------------------------- 1 | Translate 2 | ========= 3 | 4 | .. argparse:: 5 | :filename: ../onmt/bin/translate.py 6 | :func: _get_parser 7 | :prog: translate.py -------------------------------------------------------------------------------- /docs/source/quickstart.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Quickstart 4 | 5 | 6 | ### Step 1: Preprocess the data 7 | 8 | ```bash 9 | onmt_preprocess -train_src data/src-train.txt -train_tgt data/tgt-train.txt -valid_src data/src-val.txt -valid_tgt data/tgt-val.txt -save_data data/demo 10 | ``` 11 | 12 | We will be working with some example data in `data/` folder. 
13 | 14 | The data consists of parallel source (`src`) and target (`tgt`) data containing one sentence per line with tokens separated by a space: 15 | 16 | * `src-train.txt` 17 | * `tgt-train.txt` 18 | * `src-val.txt` 19 | * `tgt-val.txt` 20 | 21 | Validation files are required and used to evaluate the convergence of the training. It usually contains no more than 5000 sentences. 22 | 23 | ```text 24 | $ head -n 3 data/src-train.txt 25 | It is not acceptable that , with the help of the national bureaucracies , Parliament 's legislative prerogative should be made null and void by means of implementing provisions whose content , purpose and extent are not laid down in advance . 26 | Federal Master Trainer and Senior Instructor of the Italian Federation of Aerobic Fitness , Group Fitness , Postural Gym , Stretching and Pilates; from 2004 , he has been collaborating with Antiche Terme as personal Trainer and Instructor of Stretching , Pilates and Postural Gym . 27 | " Two soldiers came up to me and told me that if I refuse to sleep with them , they will kill me . They beat me and ripped my clothes . 28 | ``` 29 | 30 | ### Step 2: Train the model 31 | 32 | ```bash 33 | onmt_train -data data/demo -save_model demo-model 34 | ``` 35 | 36 | The main train command is quite simple. Minimally it takes a data file 37 | and a save file. This will run the default model, which consists of a 38 | 2-layer LSTM with 500 hidden units on both the encoder/decoder. 39 | If you want to train on GPU, you need to set, as an example: 40 | CUDA_VISIBLE_DEVICES=1,3 41 | `-world_size 2 -gpu_ranks 0 1` to use (say) GPU 1 and 3 on this node only. 42 | To know more about distributed training on single or multi nodes, read the FAQ section. 43 | 44 | ### Step 3: Translate 45 | 46 | ```bash 47 | onmt_translate -model demo-model_XYZ.pt -src data/src-test.txt -output pred.txt -replace_unk -verbose 48 | ``` 49 | 50 | Now you have a model which you can use to predict on new data. 
We do this by running beam search. This will output predictions into `pred.txt`. 51 | 52 | Note: 53 | 54 | The predictions are going to be quite terrible, as the demo dataset is small. Try running on some larger datasets! For example you can download millions of parallel sentences for [translation](http://www.statmt.org/wmt16/translation-task.html) or [summarization](https://github.com/harvardnlp/sent-summary). 55 | -------------------------------------------------------------------------------- /docs/source/ref.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | References 3 | ========== 4 | 5 | 6 | 7 | References 8 | 9 | .. bibliography:: refs.bib 10 | 11 | -------------------------------------------------------------------------------- /docs/source/speech2text.md: -------------------------------------------------------------------------------- 1 | # Speech to Text 2 | 3 | A deep learning-based approach to learning the speech-to-text conversion, built on top of the OpenNMT system. 4 | 5 | Given raw audio, we first apply short-time Fourier transform (STFT), then apply Convolutional Neural Networks to get the source features. Based on this source representation, we use an LSTM decoder with attention to produce the text character by character. 6 | 7 | ### Dependencies 8 | 9 | * `torchaudio`: `sudo apt-get install -y sox libsox-dev libsox-fmt-all; pip install git+https://github.com/pytorch/audio` 10 | * `librosa`: `pip install librosa` 11 | 12 | ### Quick Start 13 | 14 | To get started, we provide a toy speech-to-text example. We assume that the working directory is `OpenNMT-py` throughout this document. 15 | 16 | 0) Download the data. 17 | 18 | ``` 19 | wget -O data/speech.tgz http://lstm.seas.harvard.edu/latex/speech.tgz; tar zxf data/speech.tgz -C data/ 20 | ``` 21 | 22 | 23 | 1) Preprocess the data. 
24 | 25 | ``` 26 | onmt_preprocess -data_type audio -src_dir data/speech/an4_dataset -train_src data/speech/src-train.txt -train_tgt data/speech/tgt-train.txt -valid_src data/speech/src-val.txt -valid_tgt data/speech/tgt-val.txt -shard_size 300 -save_data data/speech/demo 27 | ``` 28 | 29 | 2) Train the model. 30 | 31 | ``` 32 | onmt_train -model_type audio -enc_rnn_size 512 -dec_rnn_size 512 -audio_enc_pooling 1,1,2,2 -dropout 0 -enc_layers 4 -dec_layers 1 -rnn_type LSTM -data data/speech/demo -save_model demo-model -global_attention mlp -gpu_ranks 0 -batch_size 8 -optim adam -max_grad_norm 100 -learning_rate 0.0003 -learning_rate_decay 0.8 -train_steps 100000 33 | ``` 34 | 35 | 3) Translate the speech. 36 | 37 | ``` 38 | onmt_translate -data_type audio -model demo-model_acc_x_ppl_x_e13.pt -src_dir data/speech/an4_dataset -src data/speech/src-val.txt -output pred.txt -gpu 0 -verbose 39 | ``` 40 | 41 | 42 | ### Options 43 | 44 | * `-src_dir`: The directory containing the audio files. 45 | 46 | * `-train_tgt`: The file storing the tokenized labels, one label per line. It should look like: 47 | ``` 48 | ... 49 | ... 50 | ... 51 | ... 52 | ``` 53 | 54 | * `-train_src`: The file storing the paths of the audio files (relative to `src_dir`). 55 | ``` 56 | 57 | 58 | 59 | ... 60 | ``` 61 | 62 | * `sample_rate`: Sample rate. Default: 16000. 63 | * `window_size`: Window size for spectrogram in seconds. Default: 0.02. 64 | * `window_stride`: Window stride for spectrogram in seconds. Default: 0.01. 65 | * `window`: Window type for spectrogram generation. Default: hamming. 66 | 67 | ### Acknowledgement 68 | 69 | Our preprocessing and CNN encoder are adapted from [deepspeech.pytorch](https://github.com/SeanNaren/deepspeech.pytorch). 
70 | -------------------------------------------------------------------------------- /floyd.yml: -------------------------------------------------------------------------------- 1 | env: pytorch-0.4 2 | machine: cpu 3 | -------------------------------------------------------------------------------- /floyd_requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/pytorch/text 2 | -------------------------------------------------------------------------------- /github_deploy_key_opennmt_opennmt_py.enc: -------------------------------------------------------------------------------- 1 | gAAAAABaPWC5LTHR5xMoviRbhWsMCxo0FPMTXwcm4DBbG2jYaTxuqdjT78PXu1XxcEfbRuZ-xX8723WjgJMaOVFRuB6k1Oow7Qw8YlO6CV5fyjU8jJFy0D4fSEE40P6A0GbvtMwj2uVKyhrCK341_8roVVegN96S40muebu0oi3cY0sDwLybAOBQYdf_J6gQgWIxf289hPMzmV4iy332V9gRN-cNbmpUYaVxINrxv0Ce6pw3NV99mNNK5izq-g4hlpErnF7LG60Jar7Vh7bw52C0PpEVJmUXWIJOtDGy6d_SuvR4SIj64J4IEDO78s7PyI8jAyP5Nu5emcH_eOV8z7C2nszkNbx6RwtDPh5qK0HILCgGmF4nzOTVK8mE9_8gD-tlWpS7jj7y_IJwNPJB3Gnqt383sg5NIQpgQqJMzmKtacXPF-sDvczsyf4t6GEURhPYobNociBQa3ZZBtJU0O_moUtwdSRsjkk1RdUbIgG3tcX73T_SYJqMLGtMKywmyDzv1CVqFCdCAhlVAcSnLLvP2xlJ1uJSKa46dtSDoUXleWCGMR9SoLz2UpPvtnJ1zZ8YKW7UD9iQfAsznBMSG4wKGEdZdFymvCuLnZQYWmJK9UFSyoYnrW1Jy1pmOJ8a25kfyI6_LiK52iC1zr9DZcn5MP2FGgrJnz0RfuvPtcgKFtvs731LzVycUT-u1I4WftPh_6b6fPxYuSnRPdJ39m7OnaGb5VOobleElaZMkh8niXM4K654i1dQA_ItuYeWjU3HPhwN86aOif6GeZSlq_Xjp3Z2DACSmYqyxKccVBWYBdZO8WSdSt07TEeWUboDDQTu_xCPEh-E8Z-Bb-xjTjVM99jkvZSrbqJn6TeY__nH2thfl9cVMj73o7wIp0EJgSpUuKEnJqPwenPwm-VEj_ODB8qbNYC3y4QkfHBL6nbdUt8Qx6P59i8C54st2v5OdZ31bF6bqbJxE5UElJRyuASmE92vu8QqQqPGjZqLhIE9Tl6EC4JFdwJMZI53gztzfKTYMAQLkbV0zYtSoBYavbBCTwQTlG49qDeZk6r5K4DPwZh9xM-M9j32Yr3NYE6QvS4sPaikPkAGoLqTAWVrfdLDc7IdIgAmZNt1D5E3Wm2n7wlQflrdLu6VgiGT1rZgsax_C1bTvsi7InkjuQuNphzXEn3_9FlWmnatDK0Nb0MFqGtEAd0S5SDGI2cf7drLVOtJzvNw9GUgdMoqn-hutvJNS1vpIZK2KektVoFMB-gBJj4oPp4gx8WDbvmkd88Jbitk3xuQp8JmoxPcVkZhPJYYMouMHnO982N9HiJ7AsvFmML_AEe72_qQCh5
jcGpsbMq_U5Cu8S2L6MpaMmcn1Piup9ClCricSNEtJD-QS9EEyn-mCHnXnQ1_z6AQ-An5wwm2eNrsEN1F1DjqLcyO3ziE5pHKNXh5W1H3Ec1_ETpInJRBoZ7DEvPpI1KFyxSnwCCrONAIZwrZzMDHPsXgJbXZZfX8_bah36380_eecZOmeCVE1UsimA2MLE3K-ziv0YhXiyHkdzROSmXruXSmzr1NW1bn26Fwy3M3L3GDmHI4Wd62eiYlPAdiOOGO2rA1H37q47X-65BBdh9XXz0k_5YRPLtQeDUavLKzd9MIHc8Ef4g2PkHJTRp9jdkertDy1NkKg3rV-QZ12fCCce97ftcMJ4BSXLgEx_jvxISTo4mB8R0fAAWYJAYCd0vFc7Q4PRFHhyJsm_5BtrwEC5JFQF6sNQllkIRbixJ-kGaieAwRZ-JKzR7gzQ3MJVjArZKcZJV6N8YYRQvKcR8sEcgLv_lr_1hQNLjmGyFeZ1RYxagaddVLAxwp8W5_vofhnKCc5JpnVcAm4W-h_l7uZd42raso-7HeRYIacW9tuFhmUi7iZBHzsNz9G0XFdsdeD2FKJb2yt30Ze4VA1crIOWwVkHsfXid2tjV4wEkR1GGQXYJ2HSHeiH5W4_9vxyYlpum8swrEWY_vLywnv92Bqerk2pfBi6kJqE1ZyZR-8NQuZMxQO_l8pTurirI-nCeHY5Im-jhs4MmA4-zwthY6RKQqbijYCbEd3HeHHMS0k8c84NlMiVAlEd7cAQZYSvlrAxNsaUWmBazE6HAGhXlB0X5pYDYV0LDalIU4guqpVLx-B4iwvnQ7nA3EzsXSBSJDsYbtVQaOHabG_jTL-SDKpkMEdb1Fh0UAeflB02fSenwj1DmsZysiJDD16IxKq22XjGslQZKNvZqk2XivzbL7JfVkCDU6N8XgyOpImZmh28Cq5iyN0GfgzYBvUscrspXQd7QJmiatoGLA-nkCZae4XRfeEh9l0qj_jiLnDzDXxF8pz9A-2GMTUUiUFwehSw2haTZJ4Ndqj3ekItvVJZxwVPYs_Voim3orgFUKmT1SUWXy5lKWPuqpWpbhBs0W5EJ2gt5EzV_ejsnnMqyDoxS-R03-ZATHRaFtvf96Zz0qo7xP__UONT1c5l8FX4Tf_kBF5JlTFe3FbSk9fa38QJGqH3RiF1mx91VXOwXR4fw-vGy5CuZoCND3QVzrdwmYE3jqxClBo7AnAjLTXD-lUCf7gqFqHFU-on1zypAZaXhwMVmfuKeolQhPsuybzUWTlRQW5OT2rxnwI-xO_6s78sRIyBwtbQba6lcOUnNH5PF9TbGj4Z2ErzA7eBS6ZBlnEE_fx8QrHoF32x2KLbyX6ELgEG4pt6aWfroWTWWC2T1CjUrswmMEfF5F0aA0uvr-vikxFl62Ob2yIuyF39ytmr8mb_o4JBpd3Etj4m_T-5HwmrsNnAf8bUqf0hTHuQlS9ek5jJK-_pNWWL1Q3yQ7x-4eiJkppero7UYyOKXGLRqgWchry26edqEETCybJMvgjmN2kHqcrg3XBM4ItjOPw0s4XklG7YZzEVmq8O3hgp-fVozpX_RAaaFSGmDzuZcQl2R_-Yo13KzjLj8wu3KjBCfVhJoAjc4T2VZMGVL3T4AOZOEN_GXEKjT5rbrEo1E7eQUoKE_PKKxmyDeNZN3W3hULAS_FMKAURyCLT_nfQ-cKU7pg113AyV6juAS_DFnBPZkcwM-PJBKz69QsrN_D3s3M53rART78zbUAab-La7Q803g-eaSgxpGJgCZKqHHafE4OpMnhKJl1eXaO_YekbtNR-JNXxdMS5wMEA_BOpqu_ixwuw_vJx-tZxKJ1p_o75OVFK9YH9ZFT5_--ngM8G-kHZrV6u5XKc5Jymrq9m6nZaH__HdAMvQmRfMWbOsSXl3HrlyEoPK5nyBcKtlHLwANc_1WeMJp3HjpHi5HelTnqNDxi5I5Z0RWP1mU0f8mUMkTvGb5U1wW0pL0Aq_5
vSfn5LQhH0QAt2JcHrFasMe_7dABIzMLb8_ph0yQQ57IAIfXUYleOwyD1ZpAFgysnh9V9duxPmg3yswRlJ9MZK9tYkwWcj_nOjq2407qR42aThqWYL4702HVycoQgErx6K4XSkF5mmJdfsZ515IIpqHJt-7Q5n_gzIPQa4Wq5ANgS5-2y97uN61NkoE9eIiLHZMY6OvuORvSdMeL6_84MuLBsKS_3OgXrOQFOgdK5mCn9Iv53UZiMkR0rLGHOLnb2hnTZGq4ao3yiNsauBqf0O4r6ecarYxGty4yWZBxB8aHLFcK-FAlFuoEL8PlRLChOEUqvUoaFs3jzyQY_iRZRyCMszPi0xPrvdiILk4VDaa0NR0XtCC-kA3tdcb_Xbdfv_Djw-wVLf7Dx6iBlPNwtjE4OzweqBaAkNkk5Ij35vk-6QQryHhAgiAHdXDGZoegdHZdKUeC_GSCMud0wpXloEPxDREskWu1VN310OXaa6VvpG0VB1B2CrUlFNvwzmal3PYCrb7XPAT1Lu5C4oSH3bTr6Hk9wtIEv0sAgt4B9RPhZ0Kq-lP85raW748Pkc0PDK1C4g4SzAxl_x7JTSTYUk_fjMnc7yEN0iBRJCMfmUq-ILtj2zOI7f3dazGCp9dXBOTVTYMVNRpcka7vWjlGHMMuVvid3Oz6GgBZl_I3csNzGXTZEvJurp3qXSaXL_THxHmDBDn7T_uY58uPaTC-qjdvkKNDUzg2kRtzejmO7TPEGIRAQghEkVK-ruZU5llxjMg1NOTeXfhXZlRK2Ri8F9QPs6FSFuiqLgOzgbl_rlecf3E6iJ9fgTsdE8OGgekAwmF5hi7Tp5DsGNlKXpWvc4TftLO7len-b9Tqa7XYPU5NKv1hVIIobSRjYuFuW1yDSWtXY0zzzqPsdhtrv97JoM71QL8fZ3tUDBDWhvlBmpXSSfjf4qYQ0PmP7pQWLjb_DuVBDO5EDV0xblgz_stLcNvxRIYChm0ytxN8B2jCaH1n_CLEWTvFloWBP72ovnRWcd1gqbZ4bD4KrI_Tb7VcepWqUg1CO-yTRHR4zQUSBBfM= -------------------------------------------------------------------------------- /onmt/__init__.py: -------------------------------------------------------------------------------- 1 | """ Main entry point of the ONMT library """ 2 | from __future__ import division, print_function 3 | 4 | import onmt.inputters 5 | import onmt.encoders 6 | import onmt.decoders 7 | import onmt.models 8 | import onmt.utils 9 | import onmt.modules 10 | from onmt.trainer import Trainer 11 | import sys 12 | import onmt.utils.optimizers 13 | onmt.utils.optimizers.Optim = onmt.utils.optimizers.Optimizer 14 | sys.modules["onmt.Optim"] = onmt.utils.optimizers 15 | 16 | # For Flake 17 | __all__ = [onmt.inputters, onmt.encoders, onmt.decoders, onmt.models, 18 | onmt.utils, onmt.modules, "Trainer"] 19 | 20 | __version__ = "1.0.0.rc2" 21 | -------------------------------------------------------------------------------- /onmt/bin/__init__.py: 
def start(config_file,
          url_root="./translator",
          host="0.0.0.0",
          port=5000,
          debug=True):
    """Build the Flask REST app around a ``TranslationServer`` and serve it.

    The call blocks inside ``app.run`` until the server stops.

    Args:
        config_file (str): path handed to ``TranslationServer.start``.
        url_root (str): prefix prepended to every route below.
        host (str): interface passed to ``app.run``.
        port (int): TCP port passed to ``app.run``.
        debug (bool): forwarded to ``app.run`` as Flask's debug flag.
    """
    # NOTE(review): the CLI default for ``url_root`` is "/translator"; the
    # "./translator" default here presumably is not a valid Flask rule
    # prefix (rules are expected to start with "/") -- confirm.

    def prefix_route(route_function, prefix='', mask='{0}{1}'):
        """Wrap ``app.route`` so each rule is registered under ``prefix``."""
        def newroute(route, *args, **kwargs):
            return route_function(mask.format(prefix, route), *args, **kwargs)
        return newroute

    app = Flask(__name__)
    app.route = prefix_route(app.route, url_root)
    translation_server = TranslationServer()
    translation_server.start(config_file)

    @app.route('/models', methods=['GET'])
    def get_models():
        out = translation_server.list_models()
        return jsonify(out)

    @app.route('/health', methods=['GET'])
    def health():
        out = {}
        out['status'] = STATUS_OK
        return jsonify(out)

    # The model-specific views take ``model_id`` as an argument, so the rule
    # has to declare it as a URL variable; without it Flask calls the view
    # with no arguments and the request fails.
    @app.route('/clone_model/<int:model_id>', methods=['POST'])
    def clone_model(model_id):
        out = {}
        data = request.get_json(force=True)
        # ``timeout`` is consumed here and removed from the payload.
        timeout = data.pop('timeout', -1)

        opt = data.get('opt', None)
        try:
            model_id, load_time = translation_server.clone_model(
                model_id, opt, timeout)
        except ServerModelError as e:
            out['status'] = STATUS_ERROR
            out['error'] = str(e)
        else:
            out['status'] = STATUS_OK
            out['model_id'] = model_id
            out['load_time'] = load_time

        return jsonify(out)

    @app.route('/unload_model/<int:model_id>', methods=['GET'])
    def unload_model(model_id):
        out = {"model_id": model_id}

        try:
            translation_server.unload_model(model_id)
            out['status'] = STATUS_OK
        except Exception as e:
            out['status'] = STATUS_ERROR
            out['error'] = str(e)

        return jsonify(out)

    @app.route('/translate', methods=['POST'])
    def translate():
        inputs = request.get_json(force=True)
        out = {}
        try:
            translation, scores, n_best, times = translation_server.run(inputs)
            assert len(translation) == len(inputs)
            assert len(scores) == len(inputs)

            # On success the payload is a list holding one list of
            # per-input result dicts; on failure it is an error dict.
            out = [[{"src": inputs[i]['src'], "tgt": translation[i],
                     "n_best": n_best,
                     "pred_score": scores[i]}
                    for i in range(len(translation))]]
        except ServerModelError as e:
            out['error'] = str(e)
            out['status'] = STATUS_ERROR

        return jsonify(out)

    @app.route('/to_cpu/<int:model_id>', methods=['GET'])
    def to_cpu(model_id):
        out = {'model_id': model_id}
        translation_server.models[model_id].to_cpu()

        out['status'] = STATUS_OK
        return jsonify(out)

    @app.route('/to_gpu/<int:model_id>', methods=['GET'])
    def to_gpu(model_id):
        out = {'model_id': model_id}
        translation_server.models[model_id].to_gpu()

        out['status'] = STATUS_OK
        return jsonify(out)

    # NOTE(review): ``debug`` defaults to True and ``host`` to 0.0.0.0.
    # Flask's debug mode enables the interactive debugger, which must not
    # be reachable from untrusted networks -- confirm no caller relies on
    # these defaults in production.
    app.run(debug=debug, host=host, port=port, use_reloader=False,
            threaded=True)
def translate(opt):
    """Translate ``opt.src`` shard by shard.

    Validates the options, builds a translator from them and feeds it one
    (source, target) shard pair at a time.

    Args:
        opt: parsed translate options; the attributes read here are
            ``log_file``, ``src``, ``tgt``, ``shard_size``, ``src_dir``,
            ``batch_size``, ``batch_type`` and ``attn_debug``.
    """
    ArgumentParser.validate_translate_opts(opt)
    logger = init_logger(opt.log_file)
    translator = build_translator(opt, report_score=True)

    source_shards = split_corpus(opt.src, opt.shard_size)
    if opt.tgt is None:
        # No reference file: pair every source shard with ``None``.
        target_shards = repeat(None)
    else:
        target_shards = split_corpus(opt.tgt, opt.shard_size)

    paired = zip(source_shards, target_shards)
    for shard_idx, (src_shard, tgt_shard) in enumerate(paired):
        logger.info("Translating shard %d." % shard_idx)
        translator.translate(
            src=src_shard,
            tgt=tgt_shard,
            src_dir=opt.src_dir,
            batch_size=opt.batch_size,
            batch_type=opt.batch_type,
            attn_debug=opt.attn_debug
        )
class CNNEncoder(EncoderBase):
    """Convolutional encoder following "Convolutional Sequence to Sequence
    Learning" :cite:`DBLP:journals/corr/GehringAGYD17`.

    Embeddings are projected to ``hidden_size`` by a linear layer and the
    projection is then run through a :class:`StackedCNN`.
    """

    def __init__(self, num_layers, hidden_size,
                 cnn_kernel_width, dropout, embeddings):
        super(CNNEncoder, self).__init__()

        self.embeddings = embeddings
        self.linear = nn.Linear(embeddings.embedding_size, hidden_size)
        self.cnn = StackedCNN(
            num_layers, hidden_size, cnn_kernel_width, dropout)

    @classmethod
    def from_opt(cls, opt, embeddings):
        """Build the encoder from parsed options and an embedding module."""
        n_layers = opt.enc_layers
        hidden = opt.enc_rnn_size
        kernel_width = opt.cnn_kernel_width
        dropout = opt.dropout
        if type(dropout) is list:
            # Only the first entry of a dropout list is used here.
            dropout = dropout[0]
        return cls(n_layers, hidden, kernel_width, dropout, embeddings)

    def forward(self, input, lengths=None, hidden=None):
        """See :class:`onmt.modules.EncoderBase.forward()`"""
        self._check_args(input, lengths, hidden)

        # Swap the first two axes of the embeddings before projecting.
        emb = self.embeddings(input).transpose(0, 1).contiguous()
        dim0, dim1 = emb.size(0), emb.size(1)

        # Flatten to 2-D for the linear layer, then restore the 3-D layout.
        projected = self.linear(emb.view(dim0 * dim1, -1))
        emb_remap = shape_transform(projected.view(dim0, dim1, -1))
        out = self.cnn(emb_remap)

        def restore_layout(tensor):
            # Drop axis 3 and swap the first two axes back.
            return tensor.squeeze(3).transpose(0, 1).contiguous()

        return restore_layout(emb_remap), restore_layout(out), lengths

    def update_dropout(self, dropout):
        """Set the dropout probability of the stacked CNN."""
        self.cnn.dropout.p = dropout
class RNNEncoder(EncoderBase):
    """Recurrent encoder over embedded source tokens.

    Args:
        rnn_type (str):
            style of recurrent unit to use, one of [RNN, LSTM, GRU, SRU]
        bidirectional (bool) : use a bidirectional RNN
        num_layers (int) : number of stacked layers
        hidden_size (int) : hidden size of each layer
        dropout (float) : dropout value for :class:`torch.nn.Dropout`
        embeddings (onmt.modules.Embeddings): embedding module to use
        use_bridge (bool) : pass the final state through a bridge layer
    """

    def __init__(self, rnn_type, bidirectional, num_layers,
                 hidden_size, dropout=0.0, embeddings=None,
                 use_bridge=False):
        super(RNNEncoder, self).__init__()
        assert embeddings is not None

        # ``hidden_size`` is split evenly across the directions.
        if bidirectional:
            num_directions = 2
        else:
            num_directions = 1
        assert hidden_size % num_directions == 0
        hidden_size = hidden_size // num_directions
        self.embeddings = embeddings

        rnn, no_pack = rnn_factory(
            rnn_type,
            input_size=embeddings.embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
            bidirectional=bidirectional)
        self.rnn = rnn
        self.no_pack_padded_seq = no_pack

        # Optional bridge between encoder final state and decoder.
        self.use_bridge = use_bridge
        if use_bridge:
            self._initialize_bridge(rnn_type, hidden_size, num_layers)

    @classmethod
    def from_opt(cls, opt, embeddings):
        """Build the encoder from parsed options and an embedding module."""
        rnn_type = opt.rnn_type
        bidirectional = opt.brnn
        n_layers = opt.enc_layers
        hidden = opt.enc_rnn_size
        dropout = opt.dropout
        if type(dropout) is list:
            # Only the first entry of a dropout list is used here.
            dropout = dropout[0]
        return cls(rnn_type, bidirectional, n_layers, hidden,
                   dropout, embeddings, opt.bridge)

    def forward(self, src, lengths=None):
        """See :func:`EncoderBase.forward()`"""
        self._check_args(src, lengths)

        emb = self.embeddings(src)

        # Packing needs lengths and an RNN flavour that supports it.
        use_packing = lengths is not None and not self.no_pack_padded_seq
        rnn_input = emb
        if use_packing:
            # Lengths arrive wrapped in a Tensor; ``pack`` gets a list.
            rnn_input = pack(emb, lengths.view(-1).tolist())

        memory_bank, encoder_final = self.rnn(rnn_input)

        if use_packing:
            memory_bank = unpack(memory_bank)[0]

        if self.use_bridge:
            encoder_final = self._bridge(encoder_final)
        return encoder_final, memory_bank, lengths

    def _initialize_bridge(self, rnn_type,
                           hidden_size,
                           num_layers):
        """Create one square linear layer per recurrent state."""
        # LSTM carries hidden and cell state, the others only one.
        if rnn_type == "LSTM":
            n_states = 2
        else:
            n_states = 1
        self.total_hidden_dim = hidden_size * num_layers

        layers = nn.ModuleList()
        for _ in range(n_states):
            layers.append(nn.Linear(self.total_hidden_dim,
                                    self.total_hidden_dim,
                                    bias=True))
        self.bridge = layers

    def _bridge(self, hidden):
        """Forward hidden state through bridge."""
        def project(linear, states):
            # Flatten to 2-D, apply linear + ReLU, restore original size.
            shape = states.size()
            flat = states.view(-1, self.total_hidden_dim)
            return F.relu(linear(flat)).view(shape)

        if not isinstance(hidden, tuple):
            return project(self.bridge[0], hidden)
        # LSTM: one bridge layer per state in the tuple.
        return tuple(project(layer, hidden[ix])
                     for ix, layer in enumerate(self.bridge))

    def update_dropout(self, dropout):
        """Set the dropout value on the wrapped RNN."""
        self.rnn.dropout = dropout
class MissingDependencyException(Exception):
    """Raised when packages required to create a data reader are missing."""
    pass
class ImageDataReader(DataReaderBase):
    """Read image data from disk.

    Args:
        truncate (tuple[int] or NoneType): maximum img size. Use
            ``(0,0)`` or ``None`` for unlimited.
        channel_size (int): Number of channels per image.

    Raises:
        onmt.inputters.datareader_base.MissingDependencyException: If
            importing any of ``PIL``, ``torchvision``, or ``cv2`` fail.
    """

    def __init__(self, truncate=None, channel_size=3):
        self._check_deps()
        self.truncate = truncate
        self.channel_size = channel_size

    @classmethod
    def from_opt(cls, opt):
        """Alternative constructor reading ``opt.image_channel_size``."""
        return cls(channel_size=opt.image_channel_size)

    @classmethod
    def _check_deps(cls):
        """Raise the missing-dependency error if an optional import failed."""
        if any(dep is None for dep in (Image, transforms, cv2)):
            cls._raise_missing_dep(
                "PIL", "torchvision", "cv2")

    def read(self, images, side, img_dir=None):
        """Read data into dicts.

        Args:
            images (str or Iterable): Sequence of image paths (``bytes``
                or ``str``) or path to a file containing image paths.
                In either case, the filenames may be relative to
                ``img_dir`` (default behavior) or absolute.
            side (str): Prefix used in return dict. Usually
                ``"src"`` or ``"tgt"``.
            img_dir (str or NoneType): Location of source image files.
                When ``None`` the filenames are used as given.

        Yields:
            a dictionary containing image data, path and index for each line.
            Images larger than ``truncate`` are skipped.
        """
        if isinstance(images, str):
            images = DataReaderBase._read_file(images)

        for i, filename in enumerate(images):
            # Lines read from a file are bytes; an in-memory sequence may
            # already hold text.
            if isinstance(filename, bytes):
                filename = filename.decode("utf-8")
            filename = filename.strip()

            # Prefer the path relative to ``img_dir``; otherwise fall back
            # to the filename itself.
            img_path = filename
            if img_dir is not None:
                candidate = os.path.join(img_dir, filename)
                if os.path.exists(candidate):
                    img_path = candidate

            assert os.path.exists(img_path), \
                'img path %s not found' % filename

            if self.channel_size == 1:
                img = transforms.ToTensor()(
                    Image.fromarray(cv2.imread(img_path, 0)))
            else:
                # Close the underlying file as soon as the tensor is built.
                with Image.open(img_path) as pil_img:
                    img = transforms.ToTensor()(pil_img)
            if self.truncate and self.truncate != (0, 0):
                if not (img.size(1) <= self.truncate[0]
                        and img.size(2) <= self.truncate[1]):
                    continue
            yield {side: img, side + '_path': filename, 'indices': i}
class NMTModel(nn.Module):
    """
    Core trainable object in OpenNMT. Implements a trainable interface
    for a simple, generic encoder + decoder model.

    Args:
        encoder (onmt.encoders.EncoderBase): an encoder object
        decoder (onmt.decoders.DecoderBase): a decoder object
        bidecoder (onmt.decoders.DecoderBase or NoneType): optional second
            decoder run on the same inputs as ``decoder``. ``None``
            disables it.
    """

    def __init__(self, encoder, decoder, bidecoder=None):
        super(NMTModel, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.bidecoder = bidecoder

    def forward(self, src, tgt, lengths, bptt=False):
        """Forward propagate a `src` and `tgt` pair for training.
        Possible initialized with a beginning decoder state.

        Args:
            src (Tensor): A source sequence passed to encoder.
                typically for inputs this will be a padded `LongTensor`
                of size ``(len, batch, features)``. However, may be an
                image or other generic input depending on encoder.
            tgt (LongTensor): A target sequence of size ``(tgt_len, batch)``.
            lengths(LongTensor): The src lengths, pre-padding ``(batch,)``.
            bptt (Boolean): A flag indicating if truncated bptt is set.
                The decoder state is (re)initialised only when this is
                exactly ``False``.

        Returns:
            (FloatTensor, dict[str, FloatTensor], FloatTensor or None,
            dict[str, FloatTensor] or None):

            * decoder output ``(tgt_len, batch, hidden)``
            * dictionary attention dists of ``(tgt_len, batch, src_len)``
            * bidecoder output, or ``None`` when no bidecoder is set
            * bidecoder attention dists, or ``None`` when no bidecoder
              is set
        """
        tgt = tgt[:-1]  # exclude last target from inputs

        enc_state, memory_bank, lengths = self.encoder(src, lengths)

        if bptt is False:
            self.decoder.init_state(src, memory_bank, enc_state)
        dec_out, attns = self.decoder(tgt, memory_bank,
                                      memory_lengths=lengths)

        # The optional bidecoder consumes the same encoder outputs.
        bidec_out, bidec_attns = None, None
        if self.bidecoder is not None:
            if bptt is False:
                self.bidecoder.init_state(src, memory_bank, enc_state)
            bidec_out, bidec_attns = self.bidecoder(
                tgt, memory_bank, memory_lengths=lengths)

        return dec_out, attns, bidec_out, bidec_attns

    def update_dropout(self, dropout):
        """Propagate a new dropout value to every sub-network.

        The bidecoder is optional (``None`` by default), so it is only
        updated when present.
        """
        self.encoder.update_dropout(dropout)
        self.decoder.update_dropout(dropout)
        if self.bidecoder is not None:
            self.bidecoder.update_dropout(dropout)
private methods: 24 | * `_save` 25 | * `_rm_checkpoint 26 | """ 27 | 28 | def __init__(self, base_path, model, model_opt, fields, optim, 29 | keep_checkpoint=-1): 30 | self.base_path = base_path 31 | self.model = model 32 | self.model_opt = model_opt 33 | self.fields = fields 34 | self.optim = optim 35 | self.last_saved_step = None 36 | self.keep_checkpoint = keep_checkpoint 37 | if keep_checkpoint > 0: 38 | self.checkpoint_queue = deque([], maxlen=keep_checkpoint) 39 | 40 | def save(self, step, moving_average=None): 41 | """Main entry point for model saver 42 | 43 | It wraps the `_save` method with checks and apply `keep_checkpoint` 44 | related logic 45 | """ 46 | 47 | if self.keep_checkpoint == 0 or step == self.last_saved_step: 48 | return 49 | 50 | if moving_average: 51 | save_model = deepcopy(self.model) 52 | for avg, param in zip(moving_average, save_model.parameters()): 53 | param.data.copy_(avg.data) 54 | else: 55 | save_model = self.model 56 | 57 | chkpt, chkpt_name = self._save(step, save_model) 58 | self.last_saved_step = step 59 | 60 | if moving_average: 61 | del save_model 62 | 63 | if self.keep_checkpoint > 0: 64 | if len(self.checkpoint_queue) == self.checkpoint_queue.maxlen: 65 | todel = self.checkpoint_queue.popleft() 66 | self._rm_checkpoint(todel) 67 | self.checkpoint_queue.append(chkpt_name) 68 | 69 | def _save(self, step): 70 | """Save a resumable checkpoint. 
class ModelSaver(ModelSaverBase):
    """Simple model saver to filesystem"""

    def _save(self, step, model):
        """Serialise ``model`` and the training state to ``<base>_step_N.pt``.

        Args:
            step (int): step number, used in the checkpoint file name
            model: the model to save; must expose ``state_dict()`` and a
                ``generator`` sub-module. A ``bidecoder_generator``
                attribute is saved too when present and not ``None``.

        Returns:
            (dict, str):

            * checkpoint: the saved object
            * checkpoint_path: path of the saved checkpoint
        """
        # Generator weights are stored under their own checkpoint key, so
        # every parameter whose name mentions 'generator' is dropped from
        # the main state dict.
        model_state_dict = {k: v for k, v in model.state_dict().items()
                            if 'generator' not in k}
        generator_state_dict = model.generator.state_dict()

        # NOTE: We need to trim the vocab to remove any unk tokens that
        # were not originally here.
        vocab = deepcopy(self.fields)
        for side in ["src", "tgt"]:
            if not hasattr(vocab[side], "fields"):
                continue
            side_vocab = vocab[side].fields[0][1].vocab
            unk_token = side_vocab.itos[0]
            # Collect first, pop afterwards: the mapping must not change
            # size while it is being iterated.
            keys_to_pop = [key for key, value in side_vocab.stoi.items()
                           if value == 0 and key != unk_token]
            for key in keys_to_pop:
                side_vocab.stoi.pop(key, None)

        checkpoint = {
            'model': model_state_dict,
            'generator': generator_state_dict,
            'vocab': vocab,
            'opt': self.model_opt,
            'optim': self.optim.state_dict(),
        }
        # Models built without a bidecoder may not define this attribute
        # at all; treat a missing attribute like an explicit ``None``.
        bidecoder_generator = getattr(model, 'bidecoder_generator', None)
        if bidecoder_generator is not None:
            checkpoint['bidecoder_generator'] = \
                bidecoder_generator.state_dict()

        checkpoint_path = '%s_step_%d.pt' % (self.base_path, step)
        logger.info("Saving checkpoint %s" % checkpoint_path)
        torch.save(checkpoint, checkpoint_path)
        return checkpoint, checkpoint_path

    def _rm_checkpoint(self, name):
        """Delete the checkpoint file at path ``name``."""
        os.remove(name)
class StackedLSTM(nn.Module):
    """
    Stack of ``nn.LSTMCell`` layers applied one after another for a
    single time step. Needed for the decoder, because we do input feeding.
    """

    def __init__(self, num_layers, input_size, rnn_size, dropout):
        super(StackedLSTM, self).__init__()
        self.dropout = nn.Dropout(dropout)
        self.num_layers = num_layers
        # The first cell consumes the external input; every later cell
        # consumes the hidden state of the cell below it.
        in_sizes = [input_size if depth == 0 else rnn_size
                    for depth in range(num_layers)]
        self.layers = nn.ModuleList(
            [nn.LSTMCell(in_size, rnn_size) for in_size in in_sizes])

    def forward(self, input_feed, hidden):
        """Run one step through all layers.

        Args:
            input_feed: input to the bottom layer
            hidden: pair ``(h_0, c_0)``, each indexable by layer

        Returns:
            the top layer's hidden state and the stacked per-layer
            ``(h_1, c_1)`` states
        """
        prev_h, prev_c = hidden
        next_h = []
        next_c = []
        top = self.num_layers - 1
        for depth, cell in enumerate(self.layers):
            layer_h, layer_c = cell(input_feed,
                                    (prev_h[depth], prev_c[depth]))
            next_h.append(layer_h)
            next_c.append(layer_c)
            # dropout is applied between layers, never on the top output
            if depth == top:
                input_feed = layer_h
            else:
                input_feed = self.dropout(layer_h)

        return input_feed, (torch.stack(next_h), torch.stack(next_c))
43 | """ 44 | 45 | def __init__(self, num_layers, input_size, rnn_size, dropout): 46 | super(StackedGRU, self).__init__() 47 | self.dropout = nn.Dropout(dropout) 48 | self.num_layers = num_layers 49 | self.layers = nn.ModuleList() 50 | 51 | for _ in range(num_layers): 52 | self.layers.append(nn.GRUCell(input_size, rnn_size)) 53 | input_size = rnn_size 54 | 55 | def forward(self, input_feed, hidden): 56 | h_1 = [] 57 | for i, layer in enumerate(self.layers): 58 | h_1_i = layer(input_feed, hidden[0][i]) 59 | input_feed = h_1_i 60 | if i + 1 != self.num_layers: 61 | input_feed = self.dropout(input_feed) 62 | h_1 += [h_1_i] 63 | 64 | h_1 = torch.stack(h_1) 65 | return input_feed, (h_1,) 66 | -------------------------------------------------------------------------------- /onmt/modules/__init__.py: -------------------------------------------------------------------------------- 1 | """ Attention and normalization modules """ 2 | from onmt.modules.util_class import Elementwise 3 | from onmt.modules.gate import context_gate_factory, ContextGate 4 | from onmt.modules.global_attention import GlobalAttention 5 | from onmt.modules.conv_multi_step_attention import ConvMultiStepAttention 6 | from onmt.modules.copy_generator import CopyGenerator, CopyGeneratorLoss, \ 7 | CopyGeneratorLossCompute 8 | from onmt.modules.multi_headed_attn import MultiHeadedAttention 9 | from onmt.modules.embeddings import Embeddings, PositionalEncoding, \ 10 | VecEmbedding 11 | from onmt.modules.weight_norm import WeightNormConv2d 12 | from onmt.modules.average_attn import AverageAttention 13 | 14 | __all__ = ["Elementwise", "context_gate_factory", "ContextGate", 15 | "GlobalAttention", "ConvMultiStepAttention", "CopyGenerator", 16 | "CopyGeneratorLoss", "CopyGeneratorLossCompute", 17 | "MultiHeadedAttention", "Embeddings", "PositionalEncoding", 18 | "WeightNormConv2d", "AverageAttention", "VecEmbedding"] 19 | -------------------------------------------------------------------------------- 
class AverageAttention(nn.Module):
    """
    Average Attention module from
    "Accelerating Neural Transformer via an Average Attention Network"
    :cite:`DBLP:journals/corr/abs-1805-00631`.

    Args:
       model_dim (int): the dimension of the inputs and outputs
       dropout (float): dropout parameter, only used by the optional
           feed-forward layer
       aan_useffn (bool): if set, pass the cumulative average through a
           position-wise feed-forward layer before gating
    """

    def __init__(self, model_dim, dropout=0.1, aan_useffn=False):
        super(AverageAttention, self).__init__()
        self.model_dim = model_dim
        self.aan_useffn = aan_useffn
        if aan_useffn:
            self.average_layer = PositionwiseFeedForward(model_dim, model_dim,
                                                         dropout)
        self.gating_layer = nn.Linear(model_dim * 2, model_dim * 2)

    def cumulative_average_mask(self, batch_size, inputs_len, device):
        """
        Builds the mask to compute the cumulative average as described in
        :cite:`DBLP:journals/corr/abs-1805-00631` -- Figure 3

        Args:
            batch_size (int): batch size
            inputs_len (int): length of the inputs
            device: device on which the mask is created

        Returns:
            (FloatTensor):

            * A Tensor of shape ``(batch_size, input_len, input_len)``
        """
        size = (inputs_len, inputs_len)
        lower = torch.tril(torch.ones(*size, dtype=torch.float,
                                      device=device))
        counts = torch.arange(1, inputs_len + 1, dtype=torch.float,
                              device=device)
        # Row i averages positions 0..i, so it is scaled by 1 / (i + 1).
        row_scale = (torch.ones(1, inputs_len, dtype=torch.float,
                                device=device) / counts).transpose(0, 1)
        mask = lower * row_scale
        return mask.unsqueeze(0).expand(batch_size, *size)

    def cumulative_average(self, inputs, mask_or_step,
                           layer_cache=None, step=None):
        """
        Computes the cumulative average as described in
        :cite:`DBLP:journals/corr/abs-1805-00631` -- Equations (1) (5) (6)

        Args:
            inputs (FloatTensor): sequence to average
                ``(batch_size, input_len, dimension)``
            mask_or_step: if cache is set, this is assumed
                to be the current step of the
                dynamic decoding. Otherwise, it is the mask matrix
                used to compute the cumulative average.
            layer_cache: a dictionary containing the cumulative average
                of the previous step. Its ``"prev_g"`` entry is
                overwritten with the new average.
            step: ignored; the step is taken from ``mask_or_step``.

        Returns:
            a tensor of the same shape and type as ``inputs``.
        """
        if layer_cache is None:
            # Full-sequence path: one matmul with the averaging mask.
            return torch.matmul(mask_or_step.to(inputs.dtype), inputs)

        # Incremental path: fold the new input into the running average.
        step = mask_or_step
        running = (inputs + step *
                   layer_cache["prev_g"]) / (step + 1)
        layer_cache["prev_g"] = running
        return running

    def forward(self, inputs, mask=None, layer_cache=None, step=None):
        """
        Args:
            inputs (FloatTensor): ``(batch_size, input_len, model_dim)``
            mask: unused
            layer_cache: optional cache dict for step-wise decoding
            step: current decoding step, used when ``layer_cache`` is set

        Returns:
            (FloatTensor, FloatTensor):

            * gating_outputs ``(batch_size, input_len, model_dim)``
            * average_outputs average attention
                ``(batch_size, input_len, model_dim)``
        """
        if layer_cache is None:
            mask_or_step = self.cumulative_average_mask(
                inputs.size(0), inputs.size(1), inputs.device)
        else:
            mask_or_step = step
        average_outputs = self.cumulative_average(
            inputs, mask_or_step, layer_cache=layer_cache)
        if self.aan_useffn:
            average_outputs = self.average_layer(average_outputs)

        gates = self.gating_layer(torch.cat((inputs, average_outputs), -1))
        input_gate, forget_gate = torch.chunk(gates, 2, dim=2)
        gating_outputs = torch.sigmoid(input_gate) * inputs + \
            torch.sigmoid(forget_gate) * average_outputs

        return gating_outputs, average_outputs
def seq_linear(linear, x):
    """Apply ``linear`` along dimension 1 of a 4-d tensor.

    ``x`` is unpacked as ``(batch, hidden_size, length, _)``; dimension 1
    is moved last, the tensor is flattened to
    ``(batch * length, hidden_size)`` for ``linear``, and the result is
    returned as ``(batch, hidden_size, length, 1)``.
    """
    n_batch, n_hidden, n_steps, _ = x.size()
    flat = x.transpose(1, 2).contiguous().view(n_batch * n_steps, n_hidden)
    projected = linear(flat).view(n_batch, n_steps, n_hidden, 1)
    return projected.transpose(1, 2)
class ContextGate(nn.Module):
    """
    Context gate is a decoder module that takes as input the previous word
    embedding, the current decoder state and the attention state, and
    produces a gate.
    The gate can be used to select the input from the target side context
    (decoder state), from the source context (attention state) or both.
    """

    def __init__(self, embeddings_size, decoder_size,
                 attention_size, output_size):
        super(ContextGate, self).__init__()
        target_size = embeddings_size + decoder_size
        self.gate = nn.Linear(target_size + attention_size, output_size,
                              bias=True)
        self.sig = nn.Sigmoid()
        self.source_proj = nn.Linear(attention_size, output_size)
        self.target_proj = nn.Linear(target_size, output_size)

    def forward(self, prev_emb, dec_state, attn_state):
        """Return the gate and both projected contexts.

        Inputs are concatenated along dimension 1.

        Returns:
            (z, proj_source, proj_target): the sigmoid gate computed from
            all three inputs, the projected attention state, and the
            projected ``[prev_emb; dec_state]``.
        """
        target_side = torch.cat((prev_emb, dec_state), dim=1)
        gate_input = torch.cat((target_side, attn_state), dim=1)
        z = self.sig(self.gate(gate_input))
        return z, self.source_proj(attn_state), self.target_proj(target_side)
class BothContextGate(nn.Module):
    """Apply the context gate to both contexts"""

    def __init__(self, embeddings_size, decoder_size,
                 attention_size, output_size):
        super(BothContextGate, self).__init__()
        self.context_gate = ContextGate(embeddings_size, decoder_size,
                                        attention_size, output_size)
        self.tanh = nn.Tanh()

    def forward(self, prev_emb, dec_state, attn_state):
        """Blend target and source projections with complementary gates.

        The target projection is weighted by ``1 - z`` and the source
        projection by ``z``; the sum is passed through ``tanh``.
        """
        z, source, target = self.context_gate(prev_emb, dec_state, attn_state)
        blended = (1. - z) * target + z * source
        return self.tanh(blended)
def _make_ix_like(input, dim=0):
    """Return ``1..d`` shaped to broadcast against ``input`` along ``dim``.

    ``d`` is ``input.size(dim)``; the result has size ``d`` on ``dim`` and
    size 1 on every other dimension, with ``input``'s dtype and device.
    """
    d = input.size(dim)
    rho = torch.arange(1, d + 1, device=input.device, dtype=input.dtype)
    view = [1] * input.dim()
    view[0] = -1
    return rho.view(view).transpose(0, dim)


def _threshold_and_support(input, dim=0):
    """Sparsemax building block: compute the threshold

    Args:
        input: any dimension
        dim: dimension along which to apply the sparsemax

    Returns:
        (tau, support_size): the threshold value and the number of
        entries in the support, both keeping ``dim`` as a size-1 axis
    """
    input_srt, _ = torch.sort(input, descending=True, dim=dim)
    input_cumsum = input_srt.cumsum(dim) - 1
    rhos = _make_ix_like(input, dim)
    support = rhos * input_srt > input_cumsum

    support_size = support.sum(dim=dim).unsqueeze(dim)
    tau = input_cumsum.gather(dim, support_size - 1)
    tau /= support_size.to(input.dtype)
    return tau, support_size


class SparsemaxFunction(Function):
    """Autograd function implementing sparsemax along one dimension."""

    @staticmethod
    def forward(ctx, input, dim=0):
        """sparsemax: normalizing sparse transform (a la softmax)

        Parameters:
            input (Tensor): any shape; left unmodified
            dim: dimension along which to apply sparsemax

        Returns:
            output (Tensor): same shape as input
        """
        ctx.dim = dim
        max_val, _ = input.max(dim=dim, keepdim=True)
        # Same numerical stability trick as for softmax. Done out of place:
        # an in-place ``-=`` would silently overwrite the caller's tensor.
        input = input - max_val
        tau, supp_size = _threshold_and_support(input, dim=dim)
        output = torch.clamp(input - tau, min=0)
        ctx.save_for_backward(supp_size, output)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        """Back-propagate through sparsemax.

        Gradients outside the support are zero; inside the support the
        mean (over the support) of the incoming gradient is subtracted.
        """
        supp_size, output = ctx.saved_tensors
        dim = ctx.dim
        grad_input = grad_output.clone()
        grad_input[output == 0] = 0

        # Squeeze only ``dim``: a bare ``squeeze()`` would also drop any
        # other singleton dimension and break the broadcast below.
        v_hat = grad_input.sum(dim=dim) \
            / supp_size.to(output.dtype).squeeze(dim)
        v_hat = v_hat.unsqueeze(dim)
        grad_input = torch.where(output != 0, grad_input - v_hat, grad_input)
        return grad_input, None


sparsemax = SparsemaxFunction.apply


class Sparsemax(nn.Module):
    """Module wrapper applying ``sparsemax`` along ``dim``."""

    def __init__(self, dim=0):
        super(Sparsemax, self).__init__()
        self.dim = dim

    def forward(self, input):
        return sparsemax(input, self.dim)


class LogSparsemax(nn.Module):
    """Module wrapper returning the log of ``sparsemax`` along ``dim``.

    Entries outside the support are exactly zero, so their log is
    ``-inf``.
    """

    def __init__(self, dim=0):
        super(LogSparsemax, self).__init__()
        self.dim = dim

    def forward(self, input):
        return torch.log(sparsemax(input, self.dim))
class SparsemaxLoss(nn.Module):
    """
    An implementation of sparsemax loss, first proposed in
    :cite:`DBLP:journals/corr/MartinsA16`. If using
    a sparse output layer, it is not possible to use negative log likelihood
    because the loss is infinite in the case the target is assigned zero
    probability. Inputs to SparsemaxLoss are arbitrary dense real-valued
    vectors (like in nn.CrossEntropyLoss), not probability vectors (like in
    nn.NLLLoss).

    ``weight`` is stored but not used by ``forward``. ``ignore_index`` is
    only honoured when it is non-negative.
    """

    def __init__(self, weight=None, ignore_index=-100,
                 reduction='elementwise_mean'):
        assert reduction in ['elementwise_mean', 'sum', 'none']
        super(SparsemaxLoss, self).__init__()
        self.reduction = reduction
        self.weight = weight
        self.ignore_index = ignore_index

    def forward(self, input, target):
        """Compute the (optionally reduced) sparsemax loss.

        Positions whose target equals a non-negative ``ignore_index`` are
        zeroed and excluded from the mean's denominator.
        """
        loss = sparsemax_loss(input, target)
        n_targets = target.size(0)
        if self.ignore_index < 0:
            size = float(n_targets)
        else:
            ignored_positions = target == self.ignore_index
            size = float((n_targets - ignored_positions.sum()).item())
            loss.masked_fill_(ignored_positions, 0.0)

        if self.reduction == 'sum':
            return loss.sum()
        if self.reduction == 'elementwise_mean':
            return loss.sum() / size
        return loss
class Elementwise(nn.ModuleList):
    """
    A simple network container.
    Parameters are a list of modules.
    Inputs are a 3d Tensor whose last dimension is the same length
    as the list.
    Outputs are the result of applying modules to inputs elementwise.
    An optional merge parameter allows the outputs to be reduced to a
    single Tensor.
    """

    def __init__(self, merge=None, *args):
        assert merge in [None, 'first', 'concat', 'sum', 'mlp']
        super(Elementwise, self).__init__(*args)
        self.merge = merge

    def forward(self, inputs):
        """Apply the i-th module to the i-th slice of dimension 2.

        Returns a single tensor for merge modes ``'first'``, ``'concat'``,
        ``'mlp'`` and ``'sum'``; otherwise the list of per-module outputs.
        """
        slices = [piece.squeeze(2) for piece in inputs.split(1, dim=2)]
        assert len(self) == len(slices)
        outputs = [module(piece) for module, piece in zip(self, slices)]

        mode = self.merge
        if mode == 'first':
            return outputs[0]
        if mode in ('concat', 'mlp'):
            return torch.cat(outputs, 2)
        if mode == 'sum':
            return sum(outputs)
        return outputs
3 | my_python=python 4 | 5 | ############### TEST regular RNN choose either -rnn_type LSTM / GRU / SRU and set input_feed 0 for SRU 6 | if true; then 7 | rm data/*.pt 8 | $my_python preprocess.py -train_src data/src-train.txt -train_tgt data/tgt-train.txt -valid_src data/src-val.txt -valid_tgt data/tgt-val.txt -save_data data/data -src_vocab_size 1000 -tgt_vocab_size 1000 9 | 10 | $my_python train.py -data data/data -save_model tmp -world_size 1 -gpu_ranks 0 -rnn_size 256 -word_vec_size 256 -layers 1 -train_steps 10000 -optim adam -learning_rate 0.001 -rnn_type LSTM -input_feed 0 11 | #-truncated_decoder 5 12 | #-label_smoothing 0.1 13 | 14 | mv tmp*e10.pt onmt/tests/test_model.pt 15 | rm tmp*.pt 16 | fi 17 | # 18 | # 19 | ############### TEST CNN 20 | if false; then 21 | rm data/*.pt 22 | $my_python preprocess.py -train_src data/src-train.txt -train_tgt data/tgt-train.txt -valid_src data/src-val.txt -valid_tgt data/tgt-val.txt -save_data data/data -src_vocab_size 1000 -tgt_vocab_size 1000 23 | 24 | $my_python train.py -data data/data -save_model /tmp/tmp -world_size 1 -gpu_ranks 0 -rnn_size 256 -word_vec_size 256 -layers 2 -train_steps 10000 -optim adam -learning_rate 0.001 -encoder_type cnn -decoder_type cnn 25 | 26 | 27 | mv /tmp/tmp*e10.pt onmt/tests/test_model.pt 28 | 29 | rm /tmp/tmp*.pt 30 | fi 31 | # 32 | ################# MORPH DATA 33 | if true; then 34 | rm data/morph/*.pt 35 | $my_python preprocess.py -train_src data/morph/src.train -train_tgt data/morph/tgt.train -valid_src data/morph/src.valid -valid_tgt data/morph/tgt.valid -save_data data/morph/data 36 | 37 | $my_python train.py -data data/morph/data -save_model tmp -world_size 1 -gpu_ranks 0 -rnn_size 400 -word_vec_size 100 -layers 1 -train_steps 8000 -optim adam -learning_rate 0.001 38 | 39 | 40 | mv tmp*e8.pt onmt/tests/test_model2.pt 41 | 42 | rm tmp*.pt 43 | fi 44 | ############### TEST TRANSFORMER 45 | if false; then 46 | rm data/*.pt 47 | $my_python preprocess.py -train_src data/src-train.txt 
-train_tgt data/tgt-train.txt -valid_src data/src-val.txt -valid_tgt data/tgt-val.txt -save_data data/data -src_vocab_size 1000 -tgt_vocab_size 1000 -share_vocab 48 | 49 | 50 | $my_python train.py -data data/data -save_model /tmp/tmp -batch_type tokens -batch_size 1024 -accum_count 4 \ 51 | -layers 4 -rnn_size 256 -word_vec_size 256 -encoder_type transformer -decoder_type transformer -share_embedding \ 52 | -train_steps 10000 -world_size 1 -gpu_ranks 0 -max_generator_batches 4 -dropout 0.1 -normalization tokens \ 53 | -max_grad_norm 0 -optim adam -decay_method noam -learning_rate 2 -label_smoothing 0.1 \ 54 | -position_encoding -param_init 0 -warmup_steps 100 -param_init_glorot -adam_beta2 0.998 55 | # 56 | mv /tmp/tmp*e10.pt onmt/tests/test_model.pt 57 | rm /tmp/tmp*.pt 58 | fi 59 | # 60 | if false; then 61 | $my_python translate.py -gpu 0 -model onmt/tests/test_model.pt \ 62 | -src data/src-val.txt -output onmt/tests/output_hyp.txt -beam 5 -batch_size 16 63 | 64 | fi 65 | 66 | 67 | -------------------------------------------------------------------------------- /onmt/tests/test_attention.py: -------------------------------------------------------------------------------- 1 | """ 2 | Here come the tests for attention types and their compatibility 3 | """ 4 | import unittest 5 | import torch 6 | from torch.autograd import Variable 7 | 8 | import onmt 9 | 10 | 11 | class TestAttention(unittest.TestCase): 12 | 13 | def test_masked_global_attention(self): 14 | 15 | source_lengths = torch.IntTensor([7, 3, 5, 2]) 16 | # illegal_weights_mask = torch.ByteTensor([ 17 | # [0, 0, 0, 0, 0, 0, 0], 18 | # [0, 0, 0, 1, 1, 1, 1], 19 | # [0, 0, 0, 0, 0, 1, 1], 20 | # [0, 0, 1, 1, 1, 1, 1]]) 21 | 22 | batch_size = source_lengths.size(0) 23 | dim = 20 24 | 25 | memory_bank = Variable(torch.randn(batch_size, 26 | source_lengths.max(), dim)) 27 | hidden = Variable(torch.randn(batch_size, dim)) 28 | 29 | attn = onmt.modules.GlobalAttention(dim) 30 | 31 | _, alignments = attn(hidden, 
class TestImageDataReader(unittest.TestCase):
    """Write random PNG fixtures to disk and read them back.

    The fixtures are created in ``setUpClass`` and removed again in
    ``tearDownClass``; because the file system is involved this is closer
    to an integration test than to a unit test.
    """

    _THIS_DIR = os.path.dirname(os.path.abspath(__file__))
    _IMG_DATA_DIRNAME = "test_image_data"
    _IMG_DATA_DIR = os.path.join(_THIS_DIR, _IMG_DATA_DIRNAME)
    _IMG_DATA_FMT = "test_img_{:d}.png"
    _IMG_DATA_PATH_FMT = os.path.join(_IMG_DATA_DIR, _IMG_DATA_FMT)

    _IMG_LIST_DIR = "test_image_filenames"
    # Two list files are produced, one with a full image path per line and
    # one with a bare file name per line.
    # NOTE(review): setUpClass writes the bare names into
    # _IMG_LIST_PATHS_PATH and the full paths into _IMG_LIST_FNAMES_PATH,
    # i.e. the reverse of what the constant names suggest. The expectations
    # in the test methods are written against that layout.
    _IMG_LIST_PATHS_FNAME = "test_files.txt"
    _IMG_LIST_PATHS_PATH = os.path.join(
        _IMG_LIST_DIR, _IMG_LIST_PATHS_FNAME)
    _IMG_LIST_FNAMES_FNAME = "test_fnames.txt"
    _IMG_LIST_FNAMES_PATH = os.path.join(
        _IMG_LIST_DIR, _IMG_LIST_FNAMES_FNAME)

    # a non-image file placed next to the images in the data directory
    _JUNK_FILE = os.path.join(
        _IMG_DATA_DIR, "this_is_junk.txt")

    _N_EXAMPLES = 20
    _N_CHANNELS = 3

    @classmethod
    def setUpClass(cls):
        """Create the fixture directories, the junk file and the images."""
        for directory in (cls._IMG_DATA_DIR, cls._IMG_LIST_DIR):
            if not os.path.exists(directory):
                os.makedirs(directory)

        with open(cls._JUNK_FILE, "w") as junk:
            junk.write("this is some garbage\nShould have no impact.")

        # The handle names describe what is actually written to each file.
        with open(cls._IMG_LIST_PATHS_PATH, "w") as short_names_out, \
                open(cls._IMG_LIST_FNAMES_PATH, "w") as full_paths_out:
            cls.n_rows = torch.randint(30, 314, (cls._N_EXAMPLES,))
            cls.n_cols = torch.randint(30, 314, (cls._N_EXAMPLES,))
            for idx in range(cls._N_EXAMPLES):
                shape = (cls.n_rows[idx], cls.n_cols[idx], cls._N_CHANNELS)
                pixels = np.random.randint(0, 255, shape)
                full_path = cls._IMG_DATA_PATH_FMT.format(idx)
                cv2.imwrite(full_path, pixels)
                short_name = cls._IMG_DATA_FMT.format(idx)
                short_names_out.write(short_name + "\n")
                full_paths_out.write(full_path + "\n")

    @classmethod
    def tearDownClass(cls):
        """Remove everything created by ``setUpClass``."""
        for directory in (cls._IMG_DATA_DIR, cls._IMG_LIST_DIR):
            shutil.rmtree(directory)

    def _check_all_examples(self, list_path, expected_path_fmt):
        """Read every image named in ``list_path`` and verify shape/path."""
        reader = ImageDataReader(channel_size=self._N_CHANNELS)
        idx = 0  # stays 0 when the reader yields nothing
        examples = reader.read(list_path, "src", self._IMG_DATA_DIR)
        for idx, example in enumerate(examples):
            expected_shape = (
                self._N_CHANNELS, self.n_rows[idx], self.n_cols[idx])
            self.assertEqual(example["src"].shape, expected_shape)
            self.assertEqual(example["src_path"],
                             expected_path_fmt.format(idx))
        self.assertGreater(idx, 0, "No image data was read.")

    def test_read_from_dir_and_data_file_containing_filenames(self):
        self._check_all_examples(
            self._IMG_LIST_FNAMES_PATH, self._IMG_DATA_PATH_FMT)

    def test_read_from_dir_and_data_file_containing_paths(self):
        self._check_all_examples(
            self._IMG_LIST_PATHS_PATH, self._IMG_DATA_FMT)


class TestImageDataReader1Channel(TestImageDataReader):
    """Same checks as the parent class, with single-channel images."""

    _N_CHANNELS = 1
class PenaltyBuilder(object):
    """Select the coverage and length penalty functions for beam search.

    Args:
        cov_pen (str or None): option name of the coverage penalty:
            ``"wu"``, ``"summary"``, ``"none"`` or ``None``.
        length_pen (str or None): option name of the length penalty:
            ``"wu"``, ``"avg"``, ``"none"`` or ``None``.

    Attributes:
        has_cov_pen (bool): ``True`` when a coverage penalty other than
            ``"none"``/``None`` was requested. A ``True`` value does not
            by itself mean the penalty has an effect, since the result is
            still scaled by the ``beta`` passed at call time.
        has_len_pen (bool): ``True`` when a length penalty other than
            ``"none"``/``None`` was requested. With ``"wu"`` an ``alpha``
            of 0 gives a factor of 1.
        coverage_penalty (callable[[FloatTensor, float], FloatTensor]):
            Calculates the coverage penalty.
        length_penalty (callable[[int, float], float]): Calculates
            the length penalty.

    Raises:
        NotImplementedError: if either option name is not supported.
    """

    def __init__(self, cov_pen, length_pen):
        self.has_cov_pen = not self._pen_is_none(cov_pen)
        self.coverage_penalty = self._coverage_penalty(cov_pen)
        self.has_len_pen = not self._pen_is_none(length_pen)
        self.length_penalty = self._length_penalty(length_pen)

    @staticmethod
    def _pen_is_none(pen):
        """Return True when ``pen`` means "no penalty"."""
        return pen == "none" or pen is None

    def _coverage_penalty(self, cov_pen):
        """Map a coverage penalty option name to the bound method."""
        if cov_pen == "wu":
            return self.coverage_wu
        elif cov_pen == "summary":
            return self.coverage_summary
        elif self._pen_is_none(cov_pen):
            return self.coverage_none
        else:
            # '{}' rather than '{:s}': the 's' format code rejects
            # non-string values, which would mask this error with a
            # ValueError.
            raise NotImplementedError("No '{}' coverage penalty.".format(
                cov_pen))

    def _length_penalty(self, length_pen):
        """Map a length penalty option name to the bound method."""
        if length_pen == "wu":
            return self.length_wu
        elif length_pen == "avg":
            return self.length_average
        elif self._pen_is_none(length_pen):
            return self.length_none
        else:
            # See _coverage_penalty for why '{}' is used here.
            raise NotImplementedError("No '{}' length penalty.".format(
                length_pen))

    # Below are all the different penalty terms implemented so far.
    # Subtract coverage penalty from topk log probs.
    # Divide topk log probs by length penalty.

    def coverage_wu(self, cov, beta=0.):
        """GNMT coverage re-ranking score.

        See "Google's Neural Machine Translation System" :cite:`wu2016google`.
        ``cov`` is expected to be sized ``(*, seq_len)``, where ``*`` is
        probably ``batch_size x beam_size`` but could be several
        dimensions like ``(batch_size, beam_size)``. If ``cov`` is attention,
        then the ``seq_len`` axis probably sums to (almost) 1.

        Returns:
            ``beta`` times the negated sum over the last axis of
            ``log(min(cov, 1))``.
        """
        penalty = -torch.min(cov, torch.ones_like(cov)).log().sum(-1)
        return beta * penalty

    def coverage_summary(self, cov, beta=0.):
        """Our summary penalty.

        Sums ``max(cov, 1)`` over the last axis, subtracts the length of
        that axis and scales the result by ``beta``.
        """
        penalty = torch.max(cov, torch.ones_like(cov)).sum(-1)
        penalty -= cov.size(-1)
        return beta * penalty

    def coverage_none(self, cov, beta=0.):
        """Returns zero as penalty.

        The result is a float tensor on ``cov``'s device, shaped ``(1,)``
        or ``(1, 1)`` when ``cov`` has three dimensions.
        """
        none = torch.zeros((1,), device=cov.device,
                           dtype=torch.float)
        if cov.dim() == 3:
            none = none.unsqueeze(0)
        return none

    def length_wu(self, cur_len, alpha=0.):
        """GNMT length re-ranking score.

        See "Google's Neural Machine Translation System" :cite:`wu2016google`.
        Computes ``((5 + cur_len) / 6) ** alpha``.
        """
        return ((5 + cur_len) / 6.0) ** alpha

    def length_average(self, cur_len, alpha=0.):
        """Returns the current sequence length (``alpha`` is ignored)."""
        return cur_len

    def length_none(self, cur_len, alpha=0.):
        """Returns the neutral factor 1.0, leaving scores unmodified."""
        return 1.0
SCALE_WEIGHT = 0.5 ** 0.5


def shape_transform(x):
    """Swap dimensions 1 and 2 of ``x`` and append a trailing axis.

    This is the layout fed to the 2d convolutions below.
    """
    return x.transpose(1, 2).unsqueeze(3)


class GatedConv(nn.Module):
    """Gated convolution: a weight-normalised conv producing twice the
    input channels, one half of which gates the other through a sigmoid."""

    def __init__(self, input_size, width=3, dropout=0.2, nopad=False):
        super(GatedConv, self).__init__()
        # no padding at all when ``nopad`` is set, else half the width
        pad_rows = width // 2 * (1 - nopad)
        self.conv = onmt.modules.WeightNormConv2d(
            input_size, 2 * input_size,
            kernel_size=(width, 1), stride=(1, 1), padding=(pad_rows, 0))
        gain = (4 * (1 - dropout)) ** 0.5
        init.xavier_uniform_(self.conv.weight, gain=gain)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x_var):
        """Dropout, convolve, then gate one channel half with the other."""
        hidden = self.conv(self.dropout(x_var))
        half = hidden.size(1) // 2
        out, gate = hidden.split(half, 1)
        return out * torch.sigmoid(gate)


class StackedCNN(nn.Module):
    """Stack of ``GatedConv`` layers with scaled residual connections."""

    def __init__(self, num_layers, input_size, cnn_kernel_width=3,
                 dropout=0.2):
        super(StackedCNN, self).__init__()
        self.dropout = dropout
        self.num_layers = num_layers
        self.layers = nn.ModuleList([
            GatedConv(input_size, cnn_kernel_width, dropout)
            for _ in range(num_layers)
        ])

    def forward(self, x):
        """Apply every layer as ``x <- (x + layer(x)) * SCALE_WEIGHT``."""
        for layer in self.layers:
            x = (x + layer(x)) * SCALE_WEIGHT
        return x
def all_reduce_and_rescale_tensors(tensors, rescale_denom,
                                   buffer_size=10485760):
    """All-reduce and rescale tensors in chunks of the specified size.

    Small tensors are packed into one flat staging buffer so that a single
    ``all_reduce`` call covers many of them; a tensor larger than the
    buffer is reduced on its own. Every tensor is updated in place.

    Args:
        tensors: list of Tensors to all-reduce
        rescale_denom: denominator for rescaling summed Tensors
        buffer_size: all-reduce chunk size in bytes
    """
    # buffer size in bytes, determine equiv. # of elements based on data type
    buffer_t = tensors[0].new(
        math.ceil(buffer_size / tensors[0].element_size())).zero_()
    buffer = []

    def all_reduce_buffer():
        # copy tensors into buffer_t
        offset = 0
        for t in buffer:
            numel = t.numel()
            buffer_t[offset:offset+numel].copy_(t.view(-1))
            offset += numel

        # all-reduce and rescale
        torch.distributed.all_reduce(buffer_t[:offset])
        buffer_t.div_(rescale_denom)

        # copy all-reduced buffer back into tensors
        offset = 0
        for t in buffer:
            numel = t.numel()
            t.view(-1).copy_(buffer_t[offset:offset+numel])
            offset += numel

    filled = 0
    for t in tensors:
        sz = t.numel() * t.element_size()
        if sz > buffer_size:
            # tensor is bigger than buffer, all-reduce and rescale directly
            torch.distributed.all_reduce(t)
            t.div_(rescale_denom)
        elif filled + sz > buffer_size:
            # buffer is full, all-reduce and replace buffer with grad
            all_reduce_buffer()
            buffer = [t]
            filled = sz
        else:
            # add tensor to buffer
            buffer.append(t)
            filled += sz

    if len(buffer) > 0:
        all_reduce_buffer()


def all_gather_list(data, max_size=4096):
    """Gathers arbitrary data from all nodes into a list.

    ``data`` is pickled into a fixed-size CUDA byte buffer whose first two
    bytes hold the payload length (base 255), the buffers of all ranks are
    exchanged with ``all_gather`` and each payload is unpickled again.

    Args:
        data: any picklable object.
        max_size: size in bytes of the exchange buffer; the pickled
            payload plus its two-byte header must fit into it.

    Returns:
        list with one unpickled object per rank.

    Raises:
        ValueError: if the pickled payload does not fit into ``max_size``.
    """
    world_size = torch.distributed.get_world_size()
    # The buffers are cached on the function object and only rebuilt when
    # the requested size changes. ``numel()`` yields an int; comparing
    # ``max_size`` with ``size()`` (a torch.Size) was always unequal and
    # reallocated the buffers on every call.
    if not hasattr(all_gather_list, '_in_buffer') or \
            max_size != all_gather_list._in_buffer.numel():
        all_gather_list._in_buffer = torch.cuda.ByteTensor(max_size)
        all_gather_list._out_buffers = [
            torch.cuda.ByteTensor(max_size)
            for i in range(world_size)
        ]
    in_buffer = all_gather_list._in_buffer
    out_buffers = all_gather_list._out_buffers

    enc = pickle.dumps(data)
    enc_size = len(enc)
    if enc_size + 2 > max_size:
        raise ValueError(
            'encoded data exceeds max_size: {}'.format(enc_size + 2))
    assert max_size < 255*256
    in_buffer[0] = enc_size // 255  # this encoding works for max_size < 65k
    in_buffer[1] = enc_size % 255
    in_buffer[2:enc_size+2] = torch.ByteTensor(list(enc))

    torch.distributed.all_gather(out_buffers, in_buffer.cuda())

    results = []
    for i in range(world_size):
        out_buffer = out_buffers[i]
        size = (255 * out_buffer[0].item()) + out_buffer[1].item()

        bytes_list = bytes(out_buffer[2:size+2].tolist())
        # NOTE(review): pickle.loads executes whatever the peers sent, so
        # this must only be used between trusted ranks.
        result = pickle.loads(bytes_list)
        results.append(result)
    return results
def split_corpus(path, shard_size):
    """Yield the lines of ``path`` in shards of ``shard_size`` lines.

    The file is opened in binary mode, so every line is a ``bytes``
    object. A ``shard_size`` of zero or less yields the whole file as one
    shard.
    """
    with open(path, "rb") as f:
        if shard_size <= 0:
            yield f.readlines()
        else:
            while True:
                shard = list(islice(f, shard_size))
                if not shard:
                    break
                yield shard


def aeq(*args):
    """
    Assert all arguments have the same value.

    Calling it without arguments is a no-op (previously the ``next()``
    call on the empty argument iterator leaked a ``StopIteration``).
    """
    if not args:
        return
    first = args[0]
    assert all(arg == first for arg in args[1:]), \
        "Not all arguments have the same value: " + str(args)


def sequence_mask(lengths, max_len=None):
    """
    Creates a boolean mask from sequence lengths.

    Row ``i`` of the ``(lengths.numel(), max_len)`` result is true at the
    positions smaller than ``lengths[i]``. ``max_len`` falls back to
    ``lengths.max()`` when it is not given (or is falsy).
    """
    batch_size = lengths.numel()
    max_len = max_len or lengths.max()
    return (torch.arange(0, max_len, device=lengths.device)
            .type_as(lengths)
            .repeat(batch_size, 1)
            .lt(lengths.unsqueeze(1)))


def tile(x, count, dim=0):
    """
    Tiles x on dimension dim count times.

    Every slice along ``dim`` is repeated ``count`` times in a row, so
    rows ``a, b`` become ``a, a, b, b`` for ``count=2``.
    """
    perm = list(range(len(x.size())))
    if dim != 0:
        # bring the target dimension to the front, tile, and swap back
        perm[0], perm[dim] = perm[dim], perm[0]
        x = x.permute(perm).contiguous()
    out_size = list(x.size())
    out_size[0] *= count
    batch = x.size(0)
    x = x.view(batch, -1) \
         .transpose(0, 1) \
         .repeat(count, 1) \
         .transpose(0, 1) \
         .contiguous() \
         .view(*out_size)
    if dim != 0:
        x = x.permute(perm).contiguous()
    return x


def use_gpu(opt):
    """
    Tell whether the options request a GPU: either a non-empty
    ``opt.gpu_ranks`` or an ``opt.gpu`` greater than -1.
    """
    return (hasattr(opt, 'gpu_ranks') and len(opt.gpu_ranks) > 0) or \
        (hasattr(opt, 'gpu') and opt.gpu > -1)


def set_random_seed(seed, is_cuda):
    """Sets the random seed; a seed of zero or less leaves it untouched."""
    if seed > 0:
        torch.manual_seed(seed)
        # this one is needed for torchtext random call (shuffled iterator)
        # in multi gpu it ensures datasets are read in the same order
        random.seed(seed)
        # some cudnn methods can be random even after fixing the seed
        # unless you tell it to be deterministic
        torch.backends.cudnn.deterministic = True

    if is_cuda and seed > 0:
        # These ensure same initialization in multi gpu mode
        torch.cuda.manual_seed(seed)


def generate_relative_positions_matrix(length, max_relative_positions,
                                       cache=False):
    """Generate the clipped relative positions matrix
    for a given length and maximum relative positions.

    Entry ``[i, j]`` is ``j - i`` clipped to
    ``[-max_relative_positions, max_relative_positions]`` and shifted to
    be non-negative. With ``cache=True`` only one row is produced, holding
    the offsets ``-length + 1 .. 0``.
    """
    if cache:
        distance_mat = torch.arange(-length+1, 1, 1).unsqueeze(0)
    else:
        range_vec = torch.arange(length)
        range_mat = range_vec.unsqueeze(-1).expand(-1, length).transpose(0, 1)
        distance_mat = range_mat - range_mat.transpose(0, 1)
    distance_mat_clipped = torch.clamp(distance_mat,
                                       min=-max_relative_positions,
                                       max=max_relative_positions)
    # Shift values to be >= 0
    final_mat = distance_mat_clipped + max_relative_positions
    return final_mat


def relative_matmul(x, z, transpose):
    """Helper function for relative positions attention.

    ``x`` is indexed as ``(batch, heads, length, dim)``. Each position
    slice of ``x`` is multiplied with the matching slice of ``z`` (with
    the last two axes of ``z`` swapped first when ``transpose`` is set).
    The result is laid out as ``(batch, heads, length, -1)``.
    """
    batch_size = x.shape[0]
    heads = x.shape[1]
    length = x.shape[2]
    x_t = x.permute(2, 0, 1, 3)
    x_t_r = x_t.reshape(length, heads * batch_size, -1)
    if transpose:
        z_t = z.transpose(1, 2)
        x_tz_matmul = torch.matmul(x_t_r, z_t)
    else:
        x_tz_matmul = torch.matmul(x_t_r, z)
    x_tz_matmul_r = x_tz_matmul.reshape(length, batch_size, heads, -1)
    x_tz_matmul_r_t = x_tz_matmul_r.permute(1, 2, 0, 3)
    return x_tz_matmul_r_t


def fn_args(fun):
    """Returns the list of function arguments name."""
    return inspect.getfullargspec(fun).args
def setup_seed(seed):
    """Seed every random number generator used by the preprocessing script.

    Covers Python's ``random``, NumPy and PyTorch (CPU and all CUDA
    devices) and switches cuDNN to deterministic mode.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
-------------------------------------------------------------------------------- /scripts/inference_ost.sh: -------------------------------------------------------------------------------- 1 | # DATA_DIR="data_daily" 2 | DATA_DIR="data_ost" 3 | DATASET="bert" 4 | 5 | python3 translate.py -gpu "$1" -model "$2" \ 6 | -output result/"$DATASET"_adalab_"$DATA_DIR".txt -beam 1 -batch_size 128 \ 7 | -src "$DATA_DIR"/src-test.txt -max_length 30 -tokenizer bert 8 | -------------------------------------------------------------------------------- /scripts/preprocess.sh: -------------------------------------------------------------------------------- 1 | DATA_DIR="data_daily" 2 | #DATA_DIR="data_ost" 3 | DATASET="bert" 4 | VOCAB="vocab.txt" 5 | 6 | 7 | python3 preprocess.py -train_src "$DATA_DIR"/src-train.txt -train_tgt "$DATA_DIR"/tgt-train.txt \ 8 | -valid_src "$DATA_DIR"/src-valid.txt -valid_tgt "$DATA_DIR"/tgt-valid.txt \ 9 | -save_data "$DATA_DIR"/"$DATASET" -share_vocab \ 10 | -src_vocab_size 300000 -tgt_vocab_size 300000 \ 11 | -src_vocab "$DATA_DIR"/"$VOCAB" -tgt_vocab "$DATA_DIR"/"$VOCAB" \ 12 | -src_seq_length 512 -tgt_seq_length 512 \ 13 | -tokenizer bert -------------------------------------------------------------------------------- /scripts/train_daily.sh: -------------------------------------------------------------------------------- 1 | DATA_DIR="data_daily" 2 | #DATA_DIR="data_ost" 3 | DATASET="bert" 4 | EMB=512 5 | STEPS=1000000 6 | BS=64 7 | ACCUM=2 8 | SAVESTEPS=1000 9 | 10 | 11 | python3 train.py -adalab -bidecoder -ada_temp 1.5 \ 12 | -world_size 1 -gpu_ranks 0 \ 13 | -log_file ./log_dir/"$DATASET"_transformer_adalab_"$DATA_DIR".log -data "$DATA_DIR"/"$DATASET" \ 14 | -save_model checkpoint/"$DATASET"_trainsformer_adalab_"$DATA_DIR" \ 15 | -train_steps "$STEPS" -save_checkpoint_steps "$SAVESTEPS" -valid_steps "$SAVESTEPS" -report_every 100 \ 16 | -max_generator_batches 0 -dropout 0.1 -max_grad_norm 1 \ 17 | -encoder_type transformer -decoder_type 
transformer -position_encoding \ 18 | -param_init 0 -param_init_glorot -transformer_ff 512 -heads 8 \ 19 | -batch_size "$BS" -accum_count "$ACCUM" -layers 6 -rnn_size "$EMB" -word_vec_size "$EMB" \ 20 | -optim adam -learning_rate 1e-4 -start_decay_steps 100000000 -early_stopping 10 -------------------------------------------------------------------------------- /scripts/train_ost.sh: -------------------------------------------------------------------------------- 1 | #DATA_DIR="data_daily" 2 | DATA_DIR="data_ost" 3 | DATASET="bert" 4 | EMB=512 5 | STEPS=1000000 6 | BS=64 7 | ACCUM=2 8 | SAVESTEPS=1000 9 | 10 | 11 | python3 train.py -adalab -bidecoder -ada_temp 1 \ 12 | -world_size 1 -gpu_ranks 0 \ 13 | -log_file ./log_dir/"$DATASET"_transformer_adalab_"$DATA_DIR".log -data "$DATA_DIR"/"$DATASET" \ 14 | -save_model checkpoint/"$DATASET"_trainsformer_adalab_"$DATA_DIR" \ 15 | -train_steps "$STEPS" -save_checkpoint_steps "$SAVESTEPS" -valid_steps "$SAVESTEPS" -report_every 100 \ 16 | -max_generator_batches 0 -dropout 0.1 -max_grad_norm 1 \ 17 | -encoder_type transformer -decoder_type transformer -position_encoding \ 18 | -param_init 0 -param_init_glorot -transformer_ff 512 -heads 8 \ 19 | -batch_size "$BS" -accum_count "$ACCUM" -layers 6 -rnn_size "$EMB" -word_vec_size "$EMB" \ 20 | -optim adam -learning_rate 1e-4 -start_decay_steps 100000000 -early_stopping 30 -------------------------------------------------------------------------------- /server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from onmt.bin.server import main 3 | 4 | 5 | if __name__ == "__main__": 6 | main() 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from setuptools import setup, find_packages 3 | from os import path 4 | 5 | this_directory = 
path.abspath(path.dirname(__file__)) 6 | with open(path.join(this_directory, 'ONMT_README.md'), encoding='utf-8') as f: 7 | long_description = f.read() 8 | 9 | setup( 10 | name='OpenNMT-py', 11 | description='A python implementation of OpenNMT', 12 | long_description=long_description, 13 | long_description_content_type='text/markdown', 14 | version='1.0.0.rc2', 15 | packages=find_packages(), 16 | project_urls={ 17 | "Documentation": "http://opennmt.net/OpenNMT-py/", 18 | "Forum": "http://forum.opennmt.net/", 19 | "Gitter": "https://gitter.im/OpenNMT/OpenNMT-py", 20 | "Source": "https://github.com/OpenNMT/OpenNMT-py/" 21 | }, 22 | install_requires=[ 23 | "six", 24 | "tqdm~=4.30.0", 25 | "torch>=1.2", 26 | "torchtext==0.4.0", 27 | "future", 28 | "configargparse", 29 | "tensorboard>=1.14", 30 | "flask", 31 | "pyonmttok==1.*;platform_system=='Linux'", 32 | ], 33 | entry_points={ 34 | "console_scripts": [ 35 | "onmt_server=onmt.bin.server:main", 36 | "onmt_train=onmt.bin.train:main", 37 | "onmt_translate=onmt.bin.translate:main", 38 | "onmt_preprocess=onmt.bin.preprocess:main", 39 | ], 40 | } 41 | ) 42 | -------------------------------------------------------------------------------- /tools/README.md: -------------------------------------------------------------------------------- 1 | This directory contains scripts and tools adopted from other open source projects such as Apache Joshua and Moses Decoder.
def average_models(model_files):
    """Average the parameters of several checkpoints into one checkpoint.

    Every file is loaded onto the CPU with ``torch.load`` and is expected to
    be a dict with ``'model'``, ``'generator'``, ``'vocab'`` and ``'opt'``
    entries, where ``'model'`` and ``'generator'`` map parameter names to
    tensors. The average is kept as a running mean that is updated in place
    on the tensors of the first checkpoint, so only two checkpoints are held
    in memory at a time.

    Args:
        model_files: iterable of checkpoint paths; must yield at least one.

    Returns:
        A dict with keys ``'vocab'`` and ``'opt'`` (taken from the first
        checkpoint), ``'optim'`` (always ``None``), ``'generator'`` and
        ``'model'`` (the averaged parameter dicts).

    Raises:
        ValueError: if ``model_files`` yields no path. Without this check the
            result would be a checkpoint whose weights are ``None``.
        KeyError: if a later checkpoint lacks a parameter that the first
            checkpoint has.
    """

    def _fold_in(running, incoming, seen):
        # Running mean over ``seen + 1`` checkpoints:
        #   new_mean = (old_mean * seen + incoming) / (seen + 1)
        for name, tensor in running.items():
            tensor.mul_(seen).add_(incoming[name]).div_(seen + 1)

    vocab = None
    opt = None
    avg_model = None
    avg_generator = None

    for i, model_file in enumerate(model_files):
        checkpoint = torch.load(model_file, map_location='cpu')
        model_weights = checkpoint['model']
        generator_weights = checkpoint['generator']

        if i == 0:
            # The first checkpoint supplies the metadata and the storage
            # that accumulates the average.
            vocab, opt = checkpoint['vocab'], checkpoint['opt']
            avg_model = model_weights
            avg_generator = generator_weights
        else:
            _fold_in(avg_model, model_weights, i)
            _fold_in(avg_generator, generator_weights, i)

    if avg_model is None:
        raise ValueError("average_models needs at least one model file")

    return {"vocab": vocab, "opt": opt, "optim": None,
            "generator": avg_generator, "model": avg_model}
def main():
    """Command-line entry point: write a vocabulary file, one token per line.

    Two modes, selected with ``-file_type``:

    * ``text`` (default): the ``-file`` corpora are read through
      ``read_files_batch``, tokens are counted, and the tokens are written
      by decreasing frequency (ties keep first-seen order).
    * ``field``: ``-file`` must be a single torch file produced by
      preprocessing; the ``itos`` list of the ``-side`` vocabulary is
      written in its stored order.

    The output file is always UTF-8 encoded.

    Raises:
        ValueError: in ``field`` mode, if ``-side`` is missing or more than
            one ``-file`` is given.
    """
    parser = argparse.ArgumentParser()
    # No ``required=True`` here: it would make the documented default of
    # 'text' unreachable.
    parser.add_argument('-file_type', default='text',
                        choices=['text', 'field'],
                        help="""Options for vocabulary creation.
                               The default is 'text' where the user passes
                               a corpus or a list of corpora files for which
                               they want to create a vocabulary from.
                               If choosing the option 'field', we assume
                               the file passed is a torch file created during
                               the preprocessing stage of an already
                               preprocessed corpus. The vocabulary file created
                               will just be the vocabulary inside the field
                               corresponding to the argument 'side'.""")
    parser.add_argument("-file", type=str, nargs="+", required=True)
    parser.add_argument("-out_file", type=str, required=True)
    parser.add_argument("-side", choices=['src', 'tgt'], help="""Specifies
                        'src' or 'tgt' side for 'field' file_type.""")

    opt = parser.parse_args()

    if opt.file_type == 'text':
        from collections import Counter

        print("Reading input file...")
        vocabulary = Counter()
        for batch in read_files_batch(opt.file):
            for sentence in batch:
                vocabulary.update(sentence)

        print("Writing vocabulary file...")
        # Explicit UTF-8 so the output does not depend on the locale and
        # matches what the 'field' branch writes.
        with open(opt.out_file, "w", encoding="utf-8") as f:
            # most_common() sorts by count, descending, and is stable for
            # ties, i.e. the same order as sorting the items by count.
            for w, _count in vocabulary.most_common():
                f.write("{0}\n".format(w))
    else:
        # Validate the cheap things before the heavy imports below.
        if opt.side not in ['src', 'tgt']:
            raise ValueError("If using -file_type='field', specifies "
                             "'src' or 'tgt' argument for -side.")
        if not len(opt.file) == 1:
            raise ValueError("If using -file_type='field', only pass one "
                             "argument for -file.")

        import torch
        try:
            from onmt.inputters.inputter import _old_style_vocab
        except ImportError:
            # Fall back to the repository root when the script is run from
            # tools/ without the package being installed.
            sys.path.insert(1, os.path.join(sys.path[0], '..'))
            from onmt.inputters.inputter import _old_style_vocab

        print("Reading input file...")
        # NOTE: torch.load unpickles the file; only use it on trusted files.
        vocabs = torch.load(opt.file[0])
        voc = dict(vocabs)[opt.side]
        if _old_style_vocab(voc):
            word_list = voc.itos
        else:
            try:
                word_list = voc[0][1].base_field.vocab.itos
            except AttributeError:
                word_list = voc[0][1].vocab.itos

        print("Writing vocabulary file...")
        with open(opt.out_file, "wb") as f:
            for w in word_list:
                f.write(u"{0}\n".format(w).encode("utf-8"))
def write_embeddings(filename, dict, embeddings):
    """Write embeddings as UTF-8 text, one ``token v1 v2 ...`` line per token.

    Only the first ``min(len(embeddings), len(dict.itos))`` rows are written,
    so a size mismatch between the vocabulary and the embedding table
    truncates the output instead of raising. Every value is formatted with
    ``"%5f"`` and preceded by a single space.

    Args:
        filename: path of the output file; it is overwritten if it exists.
        dict: vocabulary object exposing ``itos``, an index-to-token
            sequence of strings. The name shadows the builtin but is kept
            because it is part of the signature.
        embeddings: sequence of rows of numbers, where row ``i`` belongs to
            ``dict.itos[i]``. The width of the first row is used for all
            rows.
    """
    count = min(len(embeddings), len(dict.itos))
    with open(filename, 'wb') as out:
        if count == 0:
            # Nothing to write; the file is still created (empty).
            return
        width = len(embeddings[0])
        for i in range(count):
            row = embeddings[i]
            # Build the line once with join rather than growing a bytes
            # object value by value.
            values = "".join(" %5f" % row[j] for j in range(width))
            out.write((dict.itos[i] + values + "\n").encode("utf-8"))
42 | checkpoint = torch.load(opt.model, 43 | map_location=lambda storage, loc: storage) 44 | model_opt = checkpoint['opt'] 45 | 46 | vocab = checkpoint['vocab'] 47 | if inputters.old_style_vocab(vocab): 48 | fields = onmt.inputters.load_old_vocab(vocab) 49 | else: 50 | fields = vocab 51 | src_dict = fields['src'].base_field.vocab # assumes src is text 52 | tgt_dict = fields['tgt'].base_field.vocab 53 | 54 | model_opt = checkpoint['opt'] 55 | for arg in dummy_opt.__dict__: 56 | if arg not in model_opt: 57 | model_opt.__dict__[arg] = dummy_opt.__dict__[arg] 58 | 59 | model = onmt.model_builder.build_base_model( 60 | model_opt, fields, use_gpu(opt), checkpoint) 61 | encoder = model.encoder 62 | decoder = model.decoder 63 | 64 | encoder_embeddings = encoder.embeddings.word_lut.weight.data.tolist() 65 | decoder_embeddings = decoder.embeddings.word_lut.weight.data.tolist() 66 | 67 | logger.info("Writing source embeddings") 68 | write_embeddings(opt.output_dir + "/src_embeddings.txt", src_dict, 69 | encoder_embeddings) 70 | 71 | logger.info("Writing target embeddings") 72 | write_embeddings(opt.output_dir + "/tgt_embeddings.txt", tgt_dict, 73 | decoder_embeddings) 74 | 75 | logger.info('... done.') 76 | logger.info('Converting model...') 77 | 78 | 79 | if __name__ == "__main__": 80 | init_logger('extract_embeddings.log') 81 | main() 82 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/README.txt: -------------------------------------------------------------------------------- 1 | The language suffix can be found here: 2 | 3 | http://www.loc.gov/standards/iso639-2/php/code_list.php 4 | 5 | This code includes data from Daniel Naber's Language Tools (czech abbreviations). 6 | This code includes data from czech wiktionary (also czech abbreviations). 
7 | 8 | 9 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.ca: -------------------------------------------------------------------------------- 1 | Dr 2 | Dra 3 | pàg 4 | p 5 | c 6 | av 7 | Sr 8 | Sra 9 | adm 10 | esq 11 | Prof 12 | S.A 13 | S.L 14 | p.e 15 | ptes 16 | Sta 17 | St 18 | pl 19 | màx 20 | cast 21 | dir 22 | nre 23 | fra 24 | admdora 25 | Emm 26 | Excma 27 | espf 28 | dc 29 | admdor 30 | tel 31 | angl 32 | aprox 33 | ca 34 | dept 35 | dj 36 | dl 37 | dt 38 | ds 39 | dg 40 | dv 41 | ed 42 | entl 43 | al 44 | i.e 45 | maj 46 | smin 47 | n 48 | núm 49 | pta 50 | A 51 | B 52 | C 53 | D 54 | E 55 | F 56 | G 57 | H 58 | I 59 | J 60 | K 61 | L 62 | M 63 | N 64 | O 65 | P 66 | Q 67 | R 68 | S 69 | T 70 | U 71 | V 72 | W 73 | X 74 | Y 75 | Z 76 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.cs: -------------------------------------------------------------------------------- 1 | Bc 2 | BcA 3 | Ing 4 | Ing.arch 5 | MUDr 6 | MVDr 7 | MgA 8 | Mgr 9 | JUDr 10 | PhDr 11 | RNDr 12 | PharmDr 13 | ThLic 14 | ThDr 15 | Ph.D 16 | Th.D 17 | prof 18 | doc 19 | CSc 20 | DrSc 21 | dr. h. 
c 22 | PaedDr 23 | Dr 24 | PhMr 25 | DiS 26 | abt 27 | ad 28 | a.i 29 | aj 30 | angl 31 | anon 32 | apod 33 | atd 34 | atp 35 | aut 36 | bd 37 | biogr 38 | b.m 39 | b.p 40 | b.r 41 | cca 42 | cit 43 | cizojaz 44 | c.k 45 | col 46 | čes 47 | čín 48 | čj 49 | ed 50 | facs 51 | fasc 52 | fol 53 | fot 54 | franc 55 | h.c 56 | hist 57 | hl 58 | hrsg 59 | ibid 60 | il 61 | ind 62 | inv.č 63 | jap 64 | jhdt 65 | jv 66 | koed 67 | kol 68 | korej 69 | kl 70 | krit 71 | lat 72 | lit 73 | m.a 74 | maď 75 | mj 76 | mp 77 | násl 78 | např 79 | nepubl 80 | něm 81 | no 82 | nr 83 | n.s 84 | okr 85 | odd 86 | odp 87 | obr 88 | opr 89 | orig 90 | phil 91 | pl 92 | pokrač 93 | pol 94 | port 95 | pozn 96 | př.kr 97 | př.n.l 98 | přel 99 | přeprac 100 | příl 101 | pseud 102 | pt 103 | red 104 | repr 105 | resp 106 | revid 107 | rkp 108 | roč 109 | roz 110 | rozš 111 | samost 112 | sect 113 | sest 114 | seš 115 | sign 116 | sl 117 | srv 118 | stol 119 | sv 120 | šk 121 | šk.ro 122 | špan 123 | tab 124 | t.č 125 | tis 126 | tj 127 | tř 128 | tzv 129 | univ 130 | uspoř 131 | vol 132 | vl.jm 133 | vs 134 | vyd 135 | vyobr 136 | zal 137 | zejm 138 | zkr 139 | zprac 140 | zvl 141 | n.p 142 | např 143 | než 144 | MUDr 145 | abl 146 | absol 147 | adj 148 | adv 149 | ak 150 | ak. 
sl 151 | akt 152 | alch 153 | amer 154 | anat 155 | angl 156 | anglosas 157 | arab 158 | arch 159 | archit 160 | arg 161 | astr 162 | astrol 163 | att 164 | bás 165 | belg 166 | bibl 167 | biol 168 | boh 169 | bot 170 | bulh 171 | círk 172 | csl 173 | č 174 | čas 175 | čes 176 | dat 177 | děj 178 | dep 179 | dět 180 | dial 181 | dór 182 | dopr 183 | dosl 184 | ekon 185 | epic 186 | etnonym 187 | eufem 188 | f 189 | fam 190 | fem 191 | fil 192 | film 193 | form 194 | fot 195 | fr 196 | fut 197 | fyz 198 | gen 199 | geogr 200 | geol 201 | geom 202 | germ 203 | gram 204 | hebr 205 | herald 206 | hist 207 | hl 208 | hovor 209 | hud 210 | hut 211 | chcsl 212 | chem 213 | ie 214 | imp 215 | impf 216 | ind 217 | indoevr 218 | inf 219 | instr 220 | interj 221 | ión 222 | iron 223 | it 224 | kanad 225 | katalán 226 | klas 227 | kniž 228 | komp 229 | konj 230 | 231 | konkr 232 | kř 233 | kuch 234 | lat 235 | lék 236 | les 237 | lid 238 | lit 239 | liturg 240 | lok 241 | log 242 | m 243 | mat 244 | meteor 245 | metr 246 | mod 247 | ms 248 | mysl 249 | n 250 | náb 251 | námoř 252 | neklas 253 | něm 254 | nesklon 255 | nom 256 | ob 257 | obch 258 | obyč 259 | ojed 260 | opt 261 | part 262 | pas 263 | pejor 264 | pers 265 | pf 266 | pl 267 | plpf 268 | 269 | práv 270 | prep 271 | předl 272 | přivl 273 | r 274 | rcsl 275 | refl 276 | reg 277 | rkp 278 | ř 279 | řec 280 | s 281 | samohl 282 | sg 283 | sl 284 | souhl 285 | spec 286 | srov 287 | stfr 288 | střv 289 | stsl 290 | subj 291 | subst 292 | superl 293 | sv 294 | sz 295 | táz 296 | tech 297 | telev 298 | teol 299 | trans 300 | typogr 301 | var 302 | vedl 303 | verb 304 | vl. 
jm 305 | voj 306 | vok 307 | vůb 308 | vulg 309 | výtv 310 | vztaž 311 | zahr 312 | zájm 313 | zast 314 | zejm 315 | 316 | zeměd 317 | zkr 318 | zř 319 | mj 320 | dl 321 | atp 322 | sport 323 | Mgr 324 | horn 325 | MVDr 326 | JUDr 327 | RSDr 328 | Bc 329 | PhDr 330 | ThDr 331 | Ing 332 | aj 333 | apod 334 | PharmDr 335 | pomn 336 | ev 337 | slang 338 | nprap 339 | odp 340 | dop 341 | pol 342 | st 343 | stol 344 | p. n. l 345 | před n. l 346 | n. l 347 | př. Kr 348 | po Kr 349 | př. n. l 350 | odd 351 | RNDr 352 | tzv 353 | atd 354 | tzn 355 | resp 356 | tj 357 | p 358 | br 359 | č. j 360 | čj 361 | č. p 362 | čp 363 | a. s 364 | s. r. o 365 | spol. s r. o 366 | p. o 367 | s. p 368 | v. o. s 369 | k. s 370 | o. p. s 371 | o. s 372 | v. r 373 | v z 374 | ml 375 | vč 376 | kr 377 | mld 378 | hod 379 | popř 380 | ap 381 | event 382 | rus 383 | slov 384 | rum 385 | švýc 386 | P. T 387 | zvl 388 | hor 389 | dol 390 | S.O.S -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.de: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | #no german words end in single lower-case letters, so we throw those in too. 
7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | 61 | #Roman Numerals. A dot after one of these is not a sentence break in German. 62 | I 63 | II 64 | III 65 | IV 66 | V 67 | VI 68 | VII 69 | VIII 70 | IX 71 | X 72 | XI 73 | XII 74 | XIII 75 | XIV 76 | XV 77 | XVI 78 | XVII 79 | XVIII 80 | XIX 81 | XX 82 | i 83 | ii 84 | iii 85 | iv 86 | v 87 | vi 88 | vii 89 | viii 90 | ix 91 | x 92 | xi 93 | xii 94 | xiii 95 | xiv 96 | xv 97 | xvi 98 | xvii 99 | xviii 100 | xix 101 | xx 102 | 103 | #Titles and Honorifics 104 | Adj 105 | Adm 106 | Adv 107 | Asst 108 | Bart 109 | Bldg 110 | Brig 111 | Bros 112 | Capt 113 | Cmdr 114 | Col 115 | Comdr 116 | Con 117 | Corp 118 | Cpl 119 | DR 120 | Dr 121 | Ens 122 | Gen 123 | Gov 124 | Hon 125 | Hosp 126 | Insp 127 | Lt 128 | MM 129 | MR 130 | MRS 131 | MS 132 | Maj 133 | Messrs 134 | Mlle 135 | Mme 136 | Mr 137 | Mrs 138 | Ms 139 | Msgr 140 | Op 141 | Ord 142 | Pfc 143 | Ph 144 | Prof 145 | Pvt 146 | Rep 147 | Reps 148 | Res 149 | Rev 150 | Rt 151 | Sen 152 | Sens 153 | Sfc 154 | Sgt 155 | Sr 156 | St 157 | Supt 158 | Surg 159 | 160 | #Misc symbols 161 | Mio 162 | Mrd 163 | bzw 164 | v 165 | vs 166 | usw 167 | d.h 168 | z.B 169 | u.a 170 | etc 171 | Mrd 172 | MwSt 173 | ggf 174 | d.J 175 | D.h 176 | m.E 177 | vgl 178 | I.F 179 | z.T 180 | sogen 181 | ff 182 | u.E 183 | g.U 184 | g.g.A 185 | c.-à-d 186 | Buchst 187 | u.s.w 188 | sog 189 | u.ä 190 | Std 191 | evtl 192 | Zt 193 | Chr 194 | u.U 195 | o.ä 196 | Ltd 197 | b.A 198 | z.Zt 199 | spp 200 | sen 201 | SA 202 | k.o 203 | jun 204 | i.H.v 205 | dgl 206 | dergl 207 | Co 208 | zzt 209 | usf 210 | s.p.a 211 | Dkr 212 | Corp 213 | bzgl 214 | BSE 215 | 216 | #Number 
indicators 217 | # add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it 218 | No 219 | Nos 220 | Art 221 | Nr 222 | pp 223 | ca 224 | Ca 225 | 226 | #Ordinals are done with . in German - "1." = "1st" in English 227 | 1 228 | 2 229 | 3 230 | 4 231 | 5 232 | 6 233 | 7 234 | 8 235 | 9 236 | 10 237 | 11 238 | 12 239 | 13 240 | 14 241 | 15 242 | 16 243 | 17 244 | 18 245 | 19 246 | 20 247 | 21 248 | 22 249 | 23 250 | 24 251 | 25 252 | 26 253 | 27 254 | 28 255 | 29 256 | 30 257 | 31 258 | 32 259 | 33 260 | 34 261 | 35 262 | 36 263 | 37 264 | 38 265 | 39 266 | 40 267 | 41 268 | 42 269 | 43 270 | 44 271 | 45 272 | 46 273 | 47 274 | 48 275 | 49 276 | 50 277 | 51 278 | 52 279 | 53 280 | 54 281 | 55 282 | 56 283 | 57 284 | 58 285 | 59 286 | 60 287 | 61 288 | 62 289 | 63 290 | 64 291 | 65 292 | 66 293 | 67 294 | 68 295 | 69 296 | 70 297 | 71 298 | 72 299 | 73 300 | 74 301 | 75 302 | 76 303 | 77 304 | 78 305 | 79 306 | 80 307 | 81 308 | 82 309 | 83 310 | 84 311 | 85 312 | 86 313 | 87 314 | 88 315 | 89 316 | 90 317 | 91 318 | 92 319 | 93 320 | 94 321 | 95 322 | 96 323 | 97 324 | 98 325 | 99 326 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.en: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | #List of titles. 
These are often followed by upper-case names, but do not indicate sentence breaks 34 | Adj 35 | Adm 36 | Adv 37 | Asst 38 | Bart 39 | Bldg 40 | Brig 41 | Bros 42 | Capt 43 | Cmdr 44 | Col 45 | Comdr 46 | Con 47 | Corp 48 | Cpl 49 | DR 50 | Dr 51 | Drs 52 | Ens 53 | Gen 54 | Gov 55 | Hon 56 | Hr 57 | Hosp 58 | Insp 59 | Lt 60 | MM 61 | MR 62 | MRS 63 | MS 64 | Maj 65 | Messrs 66 | Mlle 67 | Mme 68 | Mr 69 | Mrs 70 | Ms 71 | Msgr 72 | Op 73 | Ord 74 | Pfc 75 | Ph 76 | Prof 77 | Pvt 78 | Rep 79 | Reps 80 | Res 81 | Rev 82 | Rt 83 | Sen 84 | Sens 85 | Sfc 86 | Sgt 87 | Sr 88 | St 89 | Supt 90 | Surg 91 | 92 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 93 | v 94 | vs 95 | i.e 96 | rev 97 | e.g 98 | 99 | #Numbers only. These should only induce breaks when followed by a numeric sequence 100 | # add NUMERIC_ONLY after the word for this function 101 | #This case is mostly for the english "No." which can either be a sentence of its own, or 102 | #if followed by a number, a non-breaking prefix 103 | No #NUMERIC_ONLY# 104 | Nos 105 | Art #NUMERIC_ONLY# 106 | Nr 107 | pp #NUMERIC_ONLY# 108 | 109 | #month abbreviations 110 | Jan 111 | Feb 112 | Mar 113 | Apr 114 | #May is a full word 115 | Jun 116 | Jul 117 | Aug 118 | Sep 119 | Oct 120 | Nov 121 | Dec 122 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.es: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 
3 | 4 | #any single upper case letter followed by a period is not a sentence ender 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | # Period-final abbreviation list from http://www.ctspanish.com/words/abbreviations.htm 34 | 35 | A.C 36 | Apdo 37 | Av 38 | Bco 39 | CC.AA 40 | Da 41 | Dep 42 | Dn 43 | Dr 44 | Dra 45 | EE.UU 46 | Excmo 47 | FF.CC 48 | Fil 49 | Gral 50 | J.C 51 | Let 52 | Lic 53 | N.B 54 | P.D 55 | P.V.P 56 | Prof 57 | Pts 58 | Rte 59 | S.A 60 | S.A.R 61 | S.E 62 | S.L 63 | S.R.C 64 | Sr 65 | Sra 66 | Srta 67 | Sta 68 | Sto 69 | T.V.E 70 | Tel 71 | Ud 72 | Uds 73 | V.B 74 | V.E 75 | Vd 76 | Vds 77 | a/c 78 | adj 79 | admón 80 | afmo 81 | apdo 82 | av 83 | c 84 | c.f 85 | c.g 86 | cap 87 | cm 88 | cta 89 | dcha 90 | doc 91 | ej 92 | entlo 93 | esq 94 | etc 95 | f.c 96 | gr 97 | grs 98 | izq 99 | kg 100 | km 101 | mg 102 | mm 103 | núm 104 | núm 105 | p 106 | p.a 107 | p.ej 108 | ptas 109 | pág 110 | págs 111 | pág 112 | págs 113 | q.e.g.e 114 | q.e.s.m 115 | s 116 | s.s.s 117 | vid 118 | vol 119 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.fi: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT 2 | #indicate an end-of-sentence marker. Special cases are included for prefixes 3 | #that ONLY appear before 0-9 numbers. 4 | 5 | #This list is compiled from omorfi database 6 | #by Tommi A Pirinen. 
7 | 8 | 9 | #any single upper case letter followed by a period is not a sentence ender 10 | A 11 | B 12 | C 13 | D 14 | E 15 | F 16 | G 17 | H 18 | I 19 | J 20 | K 21 | L 22 | M 23 | N 24 | O 25 | P 26 | Q 27 | R 28 | S 29 | T 30 | U 31 | V 32 | W 33 | X 34 | Y 35 | Z 36 | Å 37 | Ä 38 | Ö 39 | 40 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 41 | alik 42 | alil 43 | amir 44 | apul 45 | apul.prof 46 | arkkit 47 | ass 48 | assist 49 | dipl 50 | dipl.arkkit 51 | dipl.ekon 52 | dipl.ins 53 | dipl.kielenk 54 | dipl.kirjeenv 55 | dipl.kosm 56 | dipl.urk 57 | dos 58 | erikoiseläinl 59 | erikoishammasl 60 | erikoisl 61 | erikoist 62 | ev.luutn 63 | evp 64 | fil 65 | ft 66 | hallinton 67 | hallintot 68 | hammaslääket 69 | jatk 70 | jääk 71 | kansaned 72 | kapt 73 | kapt.luutn 74 | kenr 75 | kenr.luutn 76 | kenr.maj 77 | kers 78 | kirjeenv 79 | kom 80 | kom.kapt 81 | komm 82 | konst 83 | korpr 84 | luutn 85 | maist 86 | maj 87 | Mr 88 | Mrs 89 | Ms 90 | M.Sc 91 | neuv 92 | nimim 93 | Ph.D 94 | prof 95 | puh.joht 96 | pääll 97 | res 98 | san 99 | siht 100 | suom 101 | sähköp 102 | säv 103 | toht 104 | toim 105 | toim.apul 106 | toim.joht 107 | toim.siht 108 | tuom 109 | ups 110 | vänr 111 | vääp 112 | ye.ups 113 | ylik 114 | ylil 115 | ylim 116 | ylimatr 117 | yliop 118 | yliopp 119 | ylip 120 | yliv 121 | 122 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall 123 | #into this category - it sometimes ends a sentence) 124 | e.g 125 | ent 126 | esim 127 | huom 128 | i.e 129 | ilm 130 | l 131 | mm 132 | myöh 133 | nk 134 | nyk 135 | par 136 | po 137 | t 138 | v 139 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.fr: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 
2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | # 4 | #any single upper case letter followed by a period is not a sentence ender 5 | #usually upper case letters are initials in a name 6 | #no French words end in single lower-case letters, so we throw those in too? 7 | A 8 | B 9 | C 10 | D 11 | E 12 | F 13 | G 14 | H 15 | I 16 | J 17 | K 18 | L 19 | M 20 | N 21 | O 22 | P 23 | Q 24 | R 25 | S 26 | T 27 | U 28 | V 29 | W 30 | X 31 | Y 32 | Z 33 | #a 34 | b 35 | c 36 | d 37 | e 38 | f 39 | g 40 | h 41 | i 42 | j 43 | k 44 | l 45 | m 46 | n 47 | o 48 | p 49 | q 50 | r 51 | s 52 | t 53 | u 54 | v 55 | w 56 | x 57 | y 58 | z 59 | 60 | # Period-final abbreviation list for French 61 | A.C.N 62 | A.M 63 | art 64 | ann 65 | apr 66 | av 67 | auj 68 | lib 69 | B.P 70 | boul 71 | ca 72 | c.-à-d 73 | cf 74 | ch.-l 75 | chap 76 | contr 77 | C.P.I 78 | C.Q.F.D 79 | C.N 80 | C.N.S 81 | C.S 82 | dir 83 | éd 84 | e.g 85 | env 86 | al 87 | etc 88 | E.V 89 | ex 90 | fasc 91 | fém 92 | fig 93 | fr 94 | hab 95 | ibid 96 | id 97 | i.e 98 | inf 99 | LL.AA 100 | LL.AA.II 101 | LL.AA.RR 102 | LL.AA.SS 103 | L.D 104 | LL.EE 105 | LL.MM 106 | LL.MM.II.RR 107 | loc.cit 108 | masc 109 | MM 110 | ms 111 | N.B 112 | N.D.A 113 | N.D.L.R 114 | N.D.T 115 | n/réf 116 | NN.SS 117 | N.S 118 | N.D 119 | N.P.A.I 120 | p.c.c 121 | pl 122 | pp 123 | p.ex 124 | p.j 125 | P.S 126 | R.A.S 127 | R.-V 128 | R.P 129 | R.I.P 130 | SS 131 | S.S 132 | S.A 133 | S.A.I 134 | S.A.R 135 | S.A.S 136 | S.E 137 | sec 138 | sect 139 | sing 140 | S.M 141 | S.M.I.R 142 | sq 143 | sqq 144 | suiv 145 | sup 146 | suppl 147 | tél 148 | T.S.V.P 149 | vb 150 | vol 151 | vs 152 | X.O 153 | Z.I 154 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.ga: -------------------------------------------------------------------------------- 1 | 2 | A 3 | B 4 | C 5 | D 6 | E 7 | F 8 | G 9 | H 10 | I 11 | J 12 | K 13 | L 14 | M 
15 | N 16 | O 17 | P 18 | Q 19 | R 20 | S 21 | T 22 | U 23 | V 24 | W 25 | X 26 | Y 27 | Z 28 | Á 29 | É 30 | Í 31 | Ó 32 | Ú 33 | 34 | Uacht 35 | Dr 36 | B.Arch 37 | 38 | m.sh 39 | .i 40 | Co 41 | Cf 42 | cf 43 | i.e 44 | r 45 | Chr 46 | lch #NUMERIC_ONLY# 47 | lgh #NUMERIC_ONLY# 48 | uimh #NUMERIC_ONLY# 49 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.hu: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | Á 33 | É 34 | Í 35 | Ó 36 | Ö 37 | Ő 38 | Ú 39 | Ü 40 | Ű 41 | 42 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 43 | Dr 44 | dr 45 | kb 46 | Kb 47 | vö 48 | Vö 49 | pl 50 | Pl 51 | ca 52 | Ca 53 | min 54 | Min 55 | max 56 | Max 57 | ún 58 | Ún 59 | prof 60 | Prof 61 | de 62 | De 63 | du 64 | Du 65 | Szt 66 | St 67 | 68 | #Numbers only. These should only induce breaks when followed by a numeric sequence 69 | # add NUMERIC_ONLY after the word for this function 70 | #This case is mostly for the english "No." 
which can either be a sentence of its own, or 71 | #if followed by a number, a non-breaking prefix 72 | 73 | # Month name abbreviations 74 | jan #NUMERIC_ONLY# 75 | Jan #NUMERIC_ONLY# 76 | Feb #NUMERIC_ONLY# 77 | feb #NUMERIC_ONLY# 78 | márc #NUMERIC_ONLY# 79 | Márc #NUMERIC_ONLY# 80 | ápr #NUMERIC_ONLY# 81 | Ápr #NUMERIC_ONLY# 82 | máj #NUMERIC_ONLY# 83 | Máj #NUMERIC_ONLY# 84 | jún #NUMERIC_ONLY# 85 | Jún #NUMERIC_ONLY# 86 | Júl #NUMERIC_ONLY# 87 | júl #NUMERIC_ONLY# 88 | aug #NUMERIC_ONLY# 89 | Aug #NUMERIC_ONLY# 90 | Szept #NUMERIC_ONLY# 91 | szept #NUMERIC_ONLY# 92 | okt #NUMERIC_ONLY# 93 | Okt #NUMERIC_ONLY# 94 | nov #NUMERIC_ONLY# 95 | Nov #NUMERIC_ONLY# 96 | dec #NUMERIC_ONLY# 97 | Dec #NUMERIC_ONLY# 98 | 99 | # Other abbreviations 100 | tel #NUMERIC_ONLY# 101 | Tel #NUMERIC_ONLY# 102 | Fax #NUMERIC_ONLY# 103 | fax #NUMERIC_ONLY# 104 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.is: -------------------------------------------------------------------------------- 1 | no #NUMERIC_ONLY# 2 | No #NUMERIC_ONLY# 3 | nr #NUMERIC_ONLY# 4 | Nr #NUMERIC_ONLY# 5 | nR #NUMERIC_ONLY# 6 | NR #NUMERIC_ONLY# 7 | a 8 | b 9 | c 10 | d 11 | e 12 | f 13 | g 14 | h 15 | i 16 | j 17 | k 18 | l 19 | m 20 | n 21 | o 22 | p 23 | q 24 | r 25 | s 26 | t 27 | u 28 | v 29 | w 30 | x 31 | y 32 | z 33 | ^ 34 | í 35 | á 36 | ó 37 | æ 38 | A 39 | B 40 | C 41 | D 42 | E 43 | F 44 | G 45 | H 46 | I 47 | J 48 | K 49 | L 50 | M 51 | N 52 | O 53 | P 54 | Q 55 | R 56 | S 57 | T 58 | U 59 | V 60 | W 61 | X 62 | Y 63 | Z 64 | ab.fn 65 | a.fn 66 | afs 67 | al 68 | alm 69 | alg 70 | andh 71 | ath 72 | aths 73 | atr 74 | ao 75 | au 76 | aukaf 77 | áfn 78 | áhrl.s 79 | áhrs 80 | ákv.gr 81 | ákv 82 | bh 83 | bls 84 | dr 85 | e.Kr 86 | et 87 | ef 88 | efn 89 | ennfr 90 | eink 91 | end 92 | e.st 93 | erl 94 | fél 95 | fskj 96 | fh 97 | f.hl 98 | físl 99 | fl 100 | fn 101 | fo 102 | forl 103 | frb 104 | frl 105 | frh 106 | 
frt 107 | fsl 108 | fsh 109 | fs 110 | fsk 111 | fst 112 | f.Kr 113 | ft 114 | fv 115 | fyrrn 116 | fyrrv 117 | germ 118 | gm 119 | gr 120 | hdl 121 | hdr 122 | hf 123 | hl 124 | hlsk 125 | hljsk 126 | hljv 127 | hljóðv 128 | hr 129 | hv 130 | hvk 131 | holl 132 | Hos 133 | höf 134 | hk 135 | hrl 136 | ísl 137 | kaf 138 | kap 139 | Khöfn 140 | kk 141 | kg 142 | kk 143 | km 144 | kl 145 | klst 146 | kr 147 | kt 148 | kgúrsk 149 | kvk 150 | leturbr 151 | lh 152 | lh.nt 153 | lh.þt 154 | lo 155 | ltr 156 | mlja 157 | mljó 158 | millj 159 | mm 160 | mms 161 | m.fl 162 | miðm 163 | mgr 164 | mst 165 | mín 166 | nf 167 | nh 168 | nhm 169 | nl 170 | nk 171 | nmgr 172 | no 173 | núv 174 | nt 175 | o.áfr 176 | o.m.fl 177 | ohf 178 | o.fl 179 | o.s.frv 180 | ófn 181 | ób 182 | óákv.gr 183 | óákv 184 | pfn 185 | PR 186 | pr 187 | Ritstj 188 | Rvík 189 | Rvk 190 | samb 191 | samhlj 192 | samn 193 | samn 194 | sbr 195 | sek 196 | sérn 197 | sf 198 | sfn 199 | sh 200 | sfn 201 | sh 202 | s.hl 203 | sk 204 | skv 205 | sl 206 | sn 207 | so 208 | ss.us 209 | s.st 210 | samþ 211 | sbr 212 | shlj 213 | sign 214 | skál 215 | st 216 | st.s 217 | stk 218 | sþ 219 | teg 220 | tbl 221 | tfn 222 | tl 223 | tvíhlj 224 | tvt 225 | till 226 | to 227 | umr 228 | uh 229 | us 230 | uppl 231 | útg 232 | vb 233 | Vf 234 | vh 235 | vkf 236 | Vl 237 | vl 238 | vlf 239 | vmf 240 | 8vo 241 | vsk 242 | vth 243 | þt 244 | þf 245 | þjs 246 | þgf 247 | þlt 248 | þolm 249 | þm 250 | þml 251 | þýð 252 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.it: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 
3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | B 8 | C 9 | D 10 | E 11 | F 12 | G 13 | H 14 | I 15 | J 16 | K 17 | L 18 | M 19 | N 20 | O 21 | P 22 | Q 23 | R 24 | S 25 | T 26 | U 27 | V 28 | W 29 | X 30 | Y 31 | Z 32 | 33 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 34 | Adj 35 | Adm 36 | Adv 37 | Amn 38 | Arch 39 | Asst 40 | Avv 41 | Bart 42 | Bcc 43 | Bldg 44 | Brig 45 | Bros 46 | C.A.P 47 | C.P 48 | Capt 49 | Cc 50 | Cmdr 51 | Co 52 | Col 53 | Comdr 54 | Con 55 | Corp 56 | Cpl 57 | DR 58 | Dott 59 | Dr 60 | Drs 61 | Egr 62 | Ens 63 | Gen 64 | Geom 65 | Gov 66 | Hon 67 | Hosp 68 | Hr 69 | Id 70 | Ing 71 | Insp 72 | Lt 73 | MM 74 | MR 75 | MRS 76 | MS 77 | Maj 78 | Messrs 79 | Mlle 80 | Mme 81 | Mo 82 | Mons 83 | Mr 84 | Mrs 85 | Ms 86 | Msgr 87 | N.B 88 | Op 89 | Ord 90 | P.S 91 | P.T 92 | Pfc 93 | Ph 94 | Prof 95 | Pvt 96 | RP 97 | RSVP 98 | Rag 99 | Rep 100 | Reps 101 | Res 102 | Rev 103 | Rif 104 | Rt 105 | S.A 106 | S.B.F 107 | S.P.M 108 | S.p.A 109 | S.r.l 110 | Sen 111 | Sens 112 | Sfc 113 | Sgt 114 | Sig 115 | Sigg 116 | Soc 117 | Spett 118 | Sr 119 | St 120 | Supt 121 | Surg 122 | V.P 123 | 124 | # other 125 | a.c 126 | acc 127 | all 128 | banc 129 | c.a 130 | c.c.p 131 | c.m 132 | c.p 133 | c.s 134 | c.v 135 | corr 136 | dott 137 | e.p.c 138 | ecc 139 | es 140 | fatt 141 | gg 142 | int 143 | lett 144 | ogg 145 | on 146 | p.c 147 | p.c.c 148 | p.es 149 | p.f 150 | p.r 151 | p.v 152 | post 153 | pp 154 | racc 155 | ric 156 | s.n.c 157 | seg 158 | sgg 159 | ss 160 | tel 161 | u.s 162 | v.r 163 | v.s 164 | 165 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 166 | v 167 | vs 168 | i.e 169 | rev 170 | e.g 171 | 172 | #Numbers only. 
These should only induce breaks when followed by a numeric sequence 173 | # add NUMERIC_ONLY after the word for this function 174 | #This case is mostly for the english "No." which can either be a sentence of its own, or 175 | #if followed by a number, a non-breaking prefix 176 | No #NUMERIC_ONLY# 177 | Nos 178 | Art #NUMERIC_ONLY# 179 | Nr 180 | pp #NUMERIC_ONLY# 181 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.lv: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | A 7 | Ā 8 | B 9 | C 10 | Č 11 | D 12 | E 13 | Ē 14 | F 15 | G 16 | Ģ 17 | H 18 | I 19 | Ī 20 | J 21 | K 22 | Ķ 23 | L 24 | Ļ 25 | M 26 | N 27 | Ņ 28 | O 29 | P 30 | Q 31 | R 32 | S 33 | Š 34 | T 35 | U 36 | Ū 37 | V 38 | W 39 | X 40 | Y 41 | Z 42 | Ž 43 | 44 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 45 | dr 46 | Dr 47 | med 48 | prof 49 | Prof 50 | inž 51 | Inž 52 | ist.loc 53 | Ist.loc 54 | kor.loc 55 | Kor.loc 56 | v.i 57 | vietn 58 | Vietn 59 | 60 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 61 | a.l 62 | t.p 63 | pārb 64 | Pārb 65 | vec 66 | Vec 67 | inv 68 | Inv 69 | sk 70 | Sk 71 | spec 72 | Spec 73 | vienk 74 | Vienk 75 | virz 76 | Virz 77 | māksl 78 | Māksl 79 | mūz 80 | Mūz 81 | akad 82 | Akad 83 | soc 84 | Soc 85 | galv 86 | Galv 87 | vad 88 | Vad 89 | sertif 90 | Sertif 91 | folkl 92 | Folkl 93 | hum 94 | Hum 95 | 96 | #Numbers only. 
These should only induce breaks when followed by a numeric sequence 97 | # add NUMERIC_ONLY after the word for this function 98 | #This case is mostly for the english "No." which can either be a sentence of its own, or 99 | #if followed by a number, a non-breaking prefix 100 | Nr #NUMERIC_ONLY# 101 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.nl: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | #Sources: http://nl.wikipedia.org/wiki/Lijst_van_afkortingen 4 | # http://nl.wikipedia.org/wiki/Aanspreekvorm 5 | # http://nl.wikipedia.org/wiki/Titulatuur_in_het_Nederlands_hoger_onderwijs 6 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 7 | #usually upper case letters are initials in a name 8 | A 9 | B 10 | C 11 | D 12 | E 13 | F 14 | G 15 | H 16 | I 17 | J 18 | K 19 | L 20 | M 21 | N 22 | O 23 | P 24 | Q 25 | R 26 | S 27 | T 28 | U 29 | V 30 | W 31 | X 32 | Y 33 | Z 34 | 35 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 36 | bacc 37 | bc 38 | bgen 39 | c.i 40 | dhr 41 | dr 42 | dr.h.c 43 | drs 44 | drs 45 | ds 46 | eint 47 | fa 48 | Fa 49 | fam 50 | gen 51 | genm 52 | ing 53 | ir 54 | jhr 55 | jkvr 56 | jr 57 | kand 58 | kol 59 | lgen 60 | lkol 61 | Lt 62 | maj 63 | Mej 64 | mevr 65 | Mme 66 | mr 67 | mr 68 | Mw 69 | o.b.s 70 | plv 71 | prof 72 | ritm 73 | tint 74 | Vz 75 | Z.D 76 | Z.D.H 77 | Z.E 78 | Z.Em 79 | Z.H 80 | Z.K.H 81 | Z.K.M 82 | Z.M 83 | z.v 84 | 85 | #misc - odd period-ending items that NEVER indicate breaks (p.m. 
does NOT fall into this category - it sometimes ends a sentence) 86 | #we seem to have a lot of these in dutch i.e.: i.p.v - in plaats van (in stead of) never ends a sentence 87 | a.g.v 88 | bijv 89 | bijz 90 | bv 91 | d.w.z 92 | e.c 93 | e.g 94 | e.k 95 | ev 96 | i.p.v 97 | i.s.m 98 | i.t.t 99 | i.v.m 100 | m.a.w 101 | m.b.t 102 | m.b.v 103 | m.h.o 104 | m.i 105 | m.i.v 106 | v.w.t 107 | 108 | #Numbers only. These should only induce breaks when followed by a numeric sequence 109 | # add NUMERIC_ONLY after the word for this function 110 | #This case is mostly for the english "No." which can either be a sentence of its own, or 111 | #if followed by a number, a non-breaking prefix 112 | Nr #NUMERIC_ONLY# 113 | Nrs 114 | nrs 115 | nr #NUMERIC_ONLY# 116 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.pl: -------------------------------------------------------------------------------- 1 | adw 2 | afr 3 | akad 4 | al 5 | Al 6 | am 7 | amer 8 | arch 9 | art 10 | Art 11 | artyst 12 | astr 13 | austr 14 | bałt 15 | bdb 16 | bł 17 | bm 18 | br 19 | bryg 20 | bryt 21 | centr 22 | ces 23 | chem 24 | chiń 25 | chir 26 | c.k 27 | c.o 28 | cyg 29 | cyw 30 | cyt 31 | czes 32 | czw 33 | cd 34 | Cd 35 | czyt 36 | ćw 37 | ćwicz 38 | daw 39 | dcn 40 | dekl 41 | demokr 42 | det 43 | diec 44 | dł 45 | dn 46 | dot 47 | dol 48 | dop 49 | dost 50 | dosł 51 | h.c 52 | ds 53 | dst 54 | duszp 55 | dypl 56 | egz 57 | ekol 58 | ekon 59 | elektr 60 | em 61 | ew 62 | fab 63 | farm 64 | fot 65 | fr 66 | gat 67 | gastr 68 | geogr 69 | geol 70 | gimn 71 | głęb 72 | gm 73 | godz 74 | górn 75 | gosp 76 | gr 77 | gram 78 | hist 79 | hiszp 80 | hr 81 | Hr 82 | hot 83 | id 84 | in 85 | im 86 | iron 87 | jn 88 | kard 89 | kat 90 | katol 91 | k.k 92 | kk 93 | kol 94 | kl 95 | k.p.a 96 | kpc 97 | k.p.c 98 | kpt 99 | kr 100 | k.r 101 | krak 102 | k.r.o 103 | kryt 104 | kult 105 | laic 106 | łac 107 | niem 108 | woj 109 | nb 110 
| np 111 | Nb 112 | Np 113 | pol 114 | pow 115 | m.in 116 | pt 117 | ps 118 | Pt 119 | Ps 120 | cdn 121 | jw 122 | ryc 123 | rys 124 | Ryc 125 | Rys 126 | tj 127 | tzw 128 | Tzw 129 | tzn 130 | zob 131 | ang 132 | ub 133 | ul 134 | pw 135 | pn 136 | pl 137 | al 138 | k 139 | n 140 | nr #NUMERIC_ONLY# 141 | Nr #NUMERIC_ONLY# 142 | ww 143 | wł 144 | ur 145 | zm 146 | żyd 147 | żarg 148 | żyw 149 | wył 150 | bp 151 | bp 152 | wyst 153 | tow 154 | Tow 155 | o 156 | sp 157 | Sp 158 | st 159 | spółdz 160 | Spółdz 161 | społ 162 | spółgł 163 | stoł 164 | stow 165 | Stoł 166 | Stow 167 | zn 168 | zew 169 | zewn 170 | zdr 171 | zazw 172 | zast 173 | zaw 174 | zał 175 | zal 176 | zam 177 | zak 178 | zakł 179 | zagr 180 | zach 181 | adw 182 | Adw 183 | lek 184 | Lek 185 | med 186 | mec 187 | Mec 188 | doc 189 | Doc 190 | dyw 191 | dyr 192 | Dyw 193 | Dyr 194 | inż 195 | Inż 196 | mgr 197 | Mgr 198 | dh 199 | dr 200 | Dh 201 | Dr 202 | p 203 | P 204 | red 205 | Red 206 | prof 207 | prok 208 | Prof 209 | Prok 210 | hab 211 | płk 212 | Płk 213 | nadkom 214 | Nadkom 215 | podkom 216 | Podkom 217 | ks 218 | Ks 219 | gen 220 | Gen 221 | por 222 | Por 223 | reż 224 | Reż 225 | przyp 226 | Przyp 227 | śp 228 | św 229 | śW 230 | Śp 231 | Św 232 | ŚW 233 | szer 234 | Szer 235 | pkt #NUMERIC_ONLY# 236 | str #NUMERIC_ONLY# 237 | tab #NUMERIC_ONLY# 238 | Tab #NUMERIC_ONLY# 239 | tel 240 | ust #NUMERIC_ONLY# 241 | par #NUMERIC_ONLY# 242 | poz 243 | pok 244 | oo 245 | oO 246 | Oo 247 | OO 248 | r #NUMERIC_ONLY# 249 | l #NUMERIC_ONLY# 250 | s #NUMERIC_ONLY# 251 | najśw 252 | Najśw 253 | A 254 | B 255 | C 256 | D 257 | E 258 | F 259 | G 260 | H 261 | I 262 | J 263 | K 264 | L 265 | M 266 | N 267 | O 268 | P 269 | Q 270 | R 271 | S 272 | T 273 | U 274 | V 275 | W 276 | X 277 | Y 278 | Z 279 | Ś 280 | Ć 281 | Ż 282 | Ź 283 | Dz 284 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.ro: 
-------------------------------------------------------------------------------- 1 | A 2 | B 3 | C 4 | D 5 | E 6 | F 7 | G 8 | H 9 | I 10 | J 11 | K 12 | L 13 | M 14 | N 15 | O 16 | P 17 | Q 18 | R 19 | S 20 | T 21 | U 22 | V 23 | W 24 | X 25 | Y 26 | Z 27 | dpdv 28 | etc 29 | șamd 30 | M.Ap.N 31 | dl 32 | Dl 33 | d-na 34 | D-na 35 | dvs 36 | Dvs 37 | pt 38 | Pt 39 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.ru: -------------------------------------------------------------------------------- 1 | # added Cyrillic uppercase letters [А-Я] 2 | # removed 000D carriage return (this is not removed by chomp in tokenizer.perl, and prevents recognition of the prefixes) 3 | # edited by Kate Young (nspaceanalysis@earthlink.net) 21 May 2013 4 | А 5 | Б 6 | В 7 | Г 8 | Д 9 | Е 10 | Ж 11 | З 12 | И 13 | Й 14 | К 15 | Л 16 | М 17 | Н 18 | О 19 | П 20 | Р 21 | С 22 | Т 23 | У 24 | Ф 25 | Х 26 | Ц 27 | Ч 28 | Ш 29 | Щ 30 | Ъ 31 | Ы 32 | Ь 33 | Э 34 | Ю 35 | Я 36 | A 37 | B 38 | C 39 | D 40 | E 41 | F 42 | G 43 | H 44 | I 45 | J 46 | K 47 | L 48 | M 49 | N 50 | O 51 | P 52 | Q 53 | R 54 | S 55 | T 56 | U 57 | V 58 | W 59 | X 60 | Y 61 | Z 62 | 0гг 63 | 1гг 64 | 2гг 65 | 3гг 66 | 4гг 67 | 5гг 68 | 6гг 69 | 7гг 70 | 8гг 71 | 9гг 72 | 0г 73 | 1г 74 | 2г 75 | 3г 76 | 4г 77 | 5г 78 | 6г 79 | 7г 80 | 8г 81 | 9г 82 | Xвв 83 | Vвв 84 | Iвв 85 | Lвв 86 | Mвв 87 | Cвв 88 | Xв 89 | Vв 90 | Iв 91 | Lв 92 | Mв 93 | Cв 94 | 0м 95 | 1м 96 | 2м 97 | 3м 98 | 4м 99 | 5м 100 | 6м 101 | 7м 102 | 8м 103 | 9м 104 | 0мм 105 | 1мм 106 | 2мм 107 | 3мм 108 | 4мм 109 | 5мм 110 | 6мм 111 | 7мм 112 | 8мм 113 | 9мм 114 | 0см 115 | 1см 116 | 2см 117 | 3см 118 | 4см 119 | 5см 120 | 6см 121 | 7см 122 | 8см 123 | 9см 124 | 0дм 125 | 1дм 126 | 2дм 127 | 3дм 128 | 4дм 129 | 5дм 130 | 6дм 131 | 7дм 132 | 8дм 133 | 9дм 134 | 0л 135 | 1л 136 | 2л 137 | 3л 138 | 4л 139 | 5л 140 | 6л 141 | 7л 142 | 8л 143 | 9л 144 | 0км 145 | 1км 146 | 2км 
147 | 3км 148 | 4км 149 | 5км 150 | 6км 151 | 7км 152 | 8км 153 | 9км 154 | 0га 155 | 1га 156 | 2га 157 | 3га 158 | 4га 159 | 5га 160 | 6га 161 | 7га 162 | 8га 163 | 9га 164 | 0кг 165 | 1кг 166 | 2кг 167 | 3кг 168 | 4кг 169 | 5кг 170 | 6кг 171 | 7кг 172 | 8кг 173 | 9кг 174 | 0т 175 | 1т 176 | 2т 177 | 3т 178 | 4т 179 | 5т 180 | 6т 181 | 7т 182 | 8т 183 | 9т 184 | 0г 185 | 1г 186 | 2г 187 | 3г 188 | 4г 189 | 5г 190 | 6г 191 | 7г 192 | 8г 193 | 9г 194 | 0мг 195 | 1мг 196 | 2мг 197 | 3мг 198 | 4мг 199 | 5мг 200 | 6мг 201 | 7мг 202 | 8мг 203 | 9мг 204 | бульв 205 | в 206 | вв 207 | г 208 | га 209 | гг 210 | гл 211 | гос 212 | д 213 | дм 214 | доп 215 | др 216 | е 217 | ед 218 | ед 219 | зам 220 | и 221 | инд 222 | исп 223 | Исп 224 | к 225 | кап 226 | кг 227 | кв 228 | кл 229 | км 230 | кол 231 | комн 232 | коп 233 | куб 234 | л 235 | лиц 236 | лл 237 | м 238 | макс 239 | мг 240 | мин 241 | мл 242 | млн 243 | млрд 244 | мм 245 | н 246 | наб 247 | нач 248 | неуд 249 | ном 250 | о 251 | обл 252 | обр 253 | общ 254 | ок 255 | ост 256 | отл 257 | п 258 | пер 259 | перераб 260 | пл 261 | пос 262 | пр 263 | просп 264 | проф 265 | р 266 | ред 267 | руб 268 | с 269 | сб 270 | св 271 | см 272 | соч 273 | ср 274 | ст 275 | стр 276 | т 277 | тел 278 | Тел 279 | тех 280 | тт 281 | туп 282 | тыс 283 | уд 284 | ул 285 | уч 286 | физ 287 | х 288 | хор 289 | ч 290 | чел 291 | шт 292 | экз 293 | э 294 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.sk: -------------------------------------------------------------------------------- 1 | Bc 2 | Mgr 3 | RNDr 4 | PharmDr 5 | PhDr 6 | JUDr 7 | PaedDr 8 | ThDr 9 | Ing 10 | MUDr 11 | MDDr 12 | MVDr 13 | Dr 14 | ThLic 15 | PhD 16 | ArtD 17 | ThDr 18 | Dr 19 | DrSc 20 | CSs 21 | prof 22 | obr 23 | Obr 24 | Č 25 | č 26 | absol 27 | adj 28 | admin 29 | adr 30 | Adr 31 | adv 32 | advok 33 | afr 34 | ak 35 | akad 36 | akc 37 | akuz 38 | et 39 | al 40 | alch 41 | amer 
42 | anat 43 | angl 44 | Angl 45 | anglosas 46 | anorg 47 | ap 48 | apod 49 | arch 50 | archeol 51 | archit 52 | arg 53 | art 54 | astr 55 | astrol 56 | astron 57 | atp 58 | atď 59 | austr 60 | Austr 61 | aut 62 | belg 63 | Belg 64 | bibl 65 | Bibl 66 | biol 67 | bot 68 | bud 69 | bás 70 | býv 71 | cest 72 | chem 73 | cirk 74 | csl 75 | čs 76 | Čs 77 | dat 78 | dep 79 | det 80 | dial 81 | diaľ 82 | dipl 83 | distrib 84 | dokl 85 | dosl 86 | dopr 87 | dram 88 | duš 89 | dv 90 | dvojčl 91 | dór 92 | ekol 93 | ekon 94 | el 95 | elektr 96 | elektrotech 97 | energet 98 | epic 99 | est 100 | etc 101 | etonym 102 | eufem 103 | európ 104 | Európ 105 | ev 106 | evid 107 | expr 108 | fa 109 | fam 110 | farm 111 | fem 112 | feud 113 | fil 114 | filat 115 | filoz 116 | fi 117 | fon 118 | form 119 | fot 120 | fr 121 | Fr 122 | franc 123 | Franc 124 | fraz 125 | fut 126 | fyz 127 | fyziol 128 | garb 129 | gen 130 | genet 131 | genpor 132 | geod 133 | geogr 134 | geol 135 | geom 136 | germ 137 | gr 138 | Gr 139 | gréc 140 | Gréc 141 | gréckokat 142 | hebr 143 | herald 144 | hist 145 | hlav 146 | hosp 147 | hromad 148 | hud 149 | hypok 150 | ident 151 | i.e 152 | ident 153 | imp 154 | impf 155 | indoeur 156 | inf 157 | inform 158 | instr 159 | int 160 | interj 161 | inšt 162 | inštr 163 | iron 164 | jap 165 | Jap 166 | jaz 167 | jedn 168 | juhoamer 169 | juhových 170 | juhozáp 171 | juž 172 | kanad 173 | Kanad 174 | kanc 175 | kapit 176 | kpt 177 | kart 178 | katastr 179 | knih 180 | kniž 181 | komp 182 | konj 183 | konkr 184 | kozmet 185 | krajč 186 | kresť 187 | kt 188 | kuch 189 | lat 190 | latinskoamer 191 | lek 192 | lex 193 | lingv 194 | lit 195 | litur 196 | log 197 | lok 198 | max 199 | Max 200 | maď 201 | Maď 202 | medzinár 203 | mest 204 | metr 205 | mil 206 | Mil 207 | min 208 | Min 209 | miner 210 | ml 211 | mld 212 | mn 213 | mod 214 | mytol 215 | napr 216 | nar 217 | Nar 218 | nasl 219 | nedok 220 | neg 221 | negat 222 | neklas 223 | nem 224 | Nem 225 | neodb 226 | 
neos 227 | neskl 228 | nesklon 229 | nespis 230 | nespráv 231 | neved 232 | než 233 | niekt 234 | niž 235 | nom 236 | náb 237 | nákl 238 | námor 239 | nár 240 | obch 241 | obj 242 | obv 243 | obyč 244 | obč 245 | občian 246 | odb 247 | odd 248 | ods 249 | ojed 250 | okr 251 | Okr 252 | opt 253 | opyt 254 | org 255 | os 256 | osob 257 | ot 258 | ovoc 259 | par 260 | part 261 | pejor 262 | pers 263 | pf 264 | Pf 265 | P.f 266 | p.f 267 | pl 268 | Plk 269 | pod 270 | podst 271 | pokl 272 | polit 273 | politol 274 | polygr 275 | pomn 276 | popl 277 | por 278 | porad 279 | porov 280 | posch 281 | potrav 282 | použ 283 | poz 284 | pozit 285 | poľ 286 | poľno 287 | poľnohosp 288 | poľov 289 | pošt 290 | pož 291 | prac 292 | predl 293 | pren 294 | prep 295 | preuk 296 | priezv 297 | Priezv 298 | privl 299 | prof 300 | práv 301 | príd 302 | príj 303 | prík 304 | príp 305 | prír 306 | prísl 307 | príslov 308 | príč 309 | psych 310 | publ 311 | pís 312 | písm 313 | pôv 314 | refl 315 | reg 316 | rep 317 | resp 318 | rozk 319 | rozlič 320 | rozpráv 321 | roč 322 | Roč 323 | ryb 324 | rádiotech 325 | rím 326 | samohl 327 | semest 328 | sev 329 | severoamer 330 | severových 331 | severozáp 332 | sg 333 | skr 334 | skup 335 | sl 336 | Sloven 337 | soc 338 | soch 339 | sociol 340 | sp 341 | spol 342 | Spol 343 | spoloč 344 | spoluhl 345 | správ 346 | spôs 347 | st 348 | star 349 | starogréc 350 | starorím 351 | s.r.o 352 | stol 353 | stor 354 | str 355 | stredoamer 356 | stredoškol 357 | subj 358 | subst 359 | superl 360 | sv 361 | sz 362 | súkr 363 | súp 364 | súvzť 365 | tal 366 | Tal 367 | tech 368 | tel 369 | Tel 370 | telef 371 | teles 372 | telev 373 | teol 374 | trans 375 | turist 376 | tuzem 377 | typogr 378 | tzn 379 | tzv 380 | ukaz 381 | ul 382 | Ul 383 | umel 384 | univ 385 | ust 386 | ved 387 | vedľ 388 | verb 389 | veter 390 | vin 391 | viď 392 | vl 393 | vod 394 | vodohosp 395 | pnl 396 | vulg 397 | vyj 398 | vys 399 | vysokoškol 400 | vzťaž 401 | vôb 402 | vých 403 
| výd 404 | výrob 405 | výsk 406 | výsl 407 | výtv 408 | výtvar 409 | význ 410 | včel 411 | vš 412 | všeob 413 | zahr 414 | zar 415 | zariad 416 | zast 417 | zastar 418 | zastaráv 419 | zb 420 | zdravot 421 | združ 422 | zjemn 423 | zlat 424 | zn 425 | Zn 426 | zool 427 | zr 428 | zried 429 | zv 430 | záhr 431 | zák 432 | zákl 433 | zám 434 | záp 435 | západoeur 436 | zázn 437 | územ 438 | účt 439 | čast 440 | čes 441 | Čes 442 | čl 443 | čísl 444 | živ 445 | pr 446 | fak 447 | Kr 448 | p.n.l 449 | A 450 | B 451 | C 452 | D 453 | E 454 | F 455 | G 456 | H 457 | I 458 | J 459 | K 460 | L 461 | M 462 | N 463 | O 464 | P 465 | Q 466 | R 467 | S 468 | T 469 | U 470 | V 471 | W 472 | X 473 | Y 474 | Z 475 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.sl: -------------------------------------------------------------------------------- 1 | dr 2 | Dr 3 | itd 4 | itn 5 | št #NUMERIC_ONLY# 6 | Št #NUMERIC_ONLY# 7 | d 8 | jan 9 | Jan 10 | feb 11 | Feb 12 | mar 13 | Mar 14 | apr 15 | Apr 16 | jun 17 | Jun 18 | jul 19 | Jul 20 | avg 21 | Avg 22 | sept 23 | Sept 24 | sep 25 | Sep 26 | okt 27 | Okt 28 | nov 29 | Nov 30 | dec 31 | Dec 32 | tj 33 | Tj 34 | npr 35 | Npr 36 | sl 37 | Sl 38 | op 39 | Op 40 | gl 41 | Gl 42 | oz 43 | Oz 44 | prev 45 | dipl 46 | ing 47 | prim 48 | Prim 49 | cf 50 | Cf 51 | gl 52 | Gl 53 | A 54 | B 55 | C 56 | D 57 | E 58 | F 59 | G 60 | H 61 | I 62 | J 63 | K 64 | L 65 | M 66 | N 67 | O 68 | P 69 | Q 70 | R 71 | S 72 | T 73 | U 74 | V 75 | W 76 | X 77 | Y 78 | Z 79 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.sv: -------------------------------------------------------------------------------- 1 | #single upper case letter are usually initials 2 | A 3 | B 4 | C 5 | D 6 | E 7 | F 8 | G 9 | H 10 | I 11 | J 12 | K 13 | L 14 | M 15 | N 16 | O 17 | P 18 | Q 19 | R 20 | S 21 | T 22 | U 23 | V 24 
| W 25 | X 26 | Y 27 | Z 28 | #misc abbreviations 29 | AB 30 | G 31 | VG 32 | dvs 33 | etc 34 | from 35 | iaf 36 | jfr 37 | kl 38 | kr 39 | mao 40 | mfl 41 | mm 42 | osv 43 | pga 44 | tex 45 | tom 46 | vs 47 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.ta: -------------------------------------------------------------------------------- 1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers. 3 | 4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) 5 | #usually upper case letters are initials in a name 6 | அ 7 | ஆ 8 | இ 9 | ஈ 10 | உ 11 | ஊ 12 | எ 13 | ஏ 14 | ஐ 15 | ஒ 16 | ஓ 17 | ஔ 18 | ஃ 19 | க 20 | கா 21 | கி 22 | கீ 23 | கு 24 | கூ 25 | கெ 26 | கே 27 | கை 28 | கொ 29 | கோ 30 | கௌ 31 | க் 32 | ச 33 | சா 34 | சி 35 | சீ 36 | சு 37 | சூ 38 | செ 39 | சே 40 | சை 41 | சொ 42 | சோ 43 | சௌ 44 | ச் 45 | ட 46 | டா 47 | டி 48 | டீ 49 | டு 50 | டூ 51 | டெ 52 | டே 53 | டை 54 | டொ 55 | டோ 56 | டௌ 57 | ட் 58 | த 59 | தா 60 | தி 61 | தீ 62 | து 63 | தூ 64 | தெ 65 | தே 66 | தை 67 | தொ 68 | தோ 69 | தௌ 70 | த் 71 | ப 72 | பா 73 | பி 74 | பீ 75 | பு 76 | பூ 77 | பெ 78 | பே 79 | பை 80 | பொ 81 | போ 82 | பௌ 83 | ப் 84 | ற 85 | றா 86 | றி 87 | றீ 88 | று 89 | றூ 90 | றெ 91 | றே 92 | றை 93 | றொ 94 | றோ 95 | றௌ 96 | ற் 97 | ய 98 | யா 99 | யி 100 | யீ 101 | யு 102 | யூ 103 | யெ 104 | யே 105 | யை 106 | யொ 107 | யோ 108 | யௌ 109 | ய் 110 | ர 111 | ரா 112 | ரி 113 | ரீ 114 | ரு 115 | ரூ 116 | ரெ 117 | ரே 118 | ரை 119 | ரொ 120 | ரோ 121 | ரௌ 122 | ர் 123 | ல 124 | லா 125 | லி 126 | லீ 127 | லு 128 | லூ 129 | லெ 130 | லே 131 | லை 132 | லொ 133 | லோ 134 | லௌ 135 | ல் 136 | வ 137 | வா 138 | வி 139 | வீ 140 | வு 141 | வூ 142 | வெ 143 | வே 144 | வை 145 | வொ 146 | வோ 147 | வௌ 148 | வ் 149 | ள 150 | ளா 151 | 
ளி 152 | ளீ 153 | ளு 154 | ளூ 155 | ளெ 156 | ளே 157 | ளை 158 | ளொ 159 | ளோ 160 | ளௌ 161 | ள் 162 | ழ 163 | ழா 164 | ழி 165 | ழீ 166 | ழு 167 | ழூ 168 | ழெ 169 | ழே 170 | ழை 171 | ழொ 172 | ழோ 173 | ழௌ 174 | ழ் 175 | ங 176 | ஙா 177 | ஙி 178 | ஙீ 179 | ஙு 180 | ஙூ 181 | ஙெ 182 | ஙே 183 | ஙை 184 | ஙொ 185 | ஙோ 186 | ஙௌ 187 | ங் 188 | ஞ 189 | ஞா 190 | ஞி 191 | ஞீ 192 | ஞு 193 | ஞூ 194 | ஞெ 195 | ஞே 196 | ஞை 197 | ஞொ 198 | ஞோ 199 | ஞௌ 200 | ஞ் 201 | ண 202 | ணா 203 | ணி 204 | ணீ 205 | ணு 206 | ணூ 207 | ணெ 208 | ணே 209 | ணை 210 | ணொ 211 | ணோ 212 | ணௌ 213 | ண் 214 | ந 215 | நா 216 | நி 217 | நீ 218 | நு 219 | நூ 220 | நெ 221 | நே 222 | நை 223 | நொ 224 | நோ 225 | நௌ 226 | ந் 227 | ம 228 | மா 229 | மி 230 | மீ 231 | மு 232 | மூ 233 | மெ 234 | மே 235 | மை 236 | மொ 237 | மோ 238 | மௌ 239 | ம் 240 | ன 241 | னா 242 | னி 243 | னீ 244 | னு 245 | னூ 246 | னெ 247 | னே 248 | னை 249 | னொ 250 | னோ 251 | னௌ 252 | ன் 253 | 254 | 255 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks 256 | திரு 257 | திருமதி 258 | வண 259 | கௌரவ 260 | 261 | 262 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) 263 | உ.ம் 264 | #கா.ம் 265 | #எ.ம் 266 | 267 | 268 | #Numbers only. These should only induce breaks when followed by a numeric sequence 269 | # add NUMERIC_ONLY after the word for this function 270 | #This case is mostly for the english "No." 
which can either be a sentence of its own, or 271 | #if followed by a number, a non-breaking prefix 272 | No #NUMERIC_ONLY# 273 | Nos 274 | Art #NUMERIC_ONLY# 275 | Nr 276 | pp #NUMERIC_ONLY# 277 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.yue: -------------------------------------------------------------------------------- 1 | # 2 | # Cantonese (Chinese) 3 | # 4 | # Anything in this file, followed by a period, 5 | # does NOT indicate an end-of-sentence marker. 6 | # 7 | # English/Euro-language given-name initials (appearing in 8 | # news, periodicals, etc.) 9 | A 10 | Ā 11 | B 12 | C 13 | Č 14 | D 15 | E 16 | Ē 17 | F 18 | G 19 | Ģ 20 | H 21 | I 22 | Ī 23 | J 24 | K 25 | Ķ 26 | L 27 | Ļ 28 | M 29 | N 30 | Ņ 31 | O 32 | P 33 | Q 34 | R 35 | S 36 | Š 37 | T 38 | U 39 | Ū 40 | V 41 | W 42 | X 43 | Y 44 | Z 45 | Ž 46 | 47 | # Numbers only. These should only induce breaks when followed by 48 | # a numeric sequence. 49 | # Add NUMERIC_ONLY after the word for this function. This case is 50 | # mostly for the english "No." which can either be a sentence of its 51 | # own, or if followed by a number, a non-breaking prefix. 52 | No #NUMERIC_ONLY# 53 | Nr #NUMERIC_ONLY# 54 | -------------------------------------------------------------------------------- /tools/nonbreaking_prefixes/nonbreaking_prefix.zh: -------------------------------------------------------------------------------- 1 | # 2 | # Mandarin (Chinese) 3 | # 4 | # Anything in this file, followed by a period, 5 | # does NOT indicate an end-of-sentence marker. 6 | # 7 | # English/Euro-language given-name initials (appearing in 8 | # news, periodicals, etc.) 
def test_rouge(cand, ref):
    """Compute ROUGE scores for aligned candidate/reference sequences.

    Both arguments are consumed once, item by item (a list of str, an
    open file, ``StringIO`` or ``sys.stdin`` all work). Every stripped
    item is written to its own file inside a scratch directory, the
    directory is handed to ``pyrouge.Rouge155``, and it is removed again
    before the function returns, whether or not scoring succeeded.

    Args:
        cand: iterable of candidate (system output) strings.
        ref: iterable of reference strings, aligned with ``cand``.

    Returns:
        The dict built by ``Rouge155.output_to_dict`` from the ROUGE
        output.

    Raises:
        AssertionError: if ``cand`` and ``ref`` differ in length.
    """
    # Local import: nothing else in the module needs it.
    import tempfile

    candidates = [line.strip() for line in cand]
    references = [line.strip() for line in ref]
    # Raised explicitly (rather than via ``assert``) so the check is not
    # stripped under ``python -O``; the exception type is unchanged.
    if len(candidates) != len(references):
        raise AssertionError(
            "candidate/reference length mismatch: {} vs {}".format(
                len(candidates), len(references)))

    current_time = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime())
    # mkdtemp guarantees a fresh, unique directory. A name built from the
    # timestamp alone collides when two runs start in the same second.
    tmp_dir = tempfile.mkdtemp(
        prefix=".rouge-tmp-{}-".format(current_time), dir=".")
    try:
        cand_dir = os.path.join(tmp_dir, "candidate")
        ref_dir = os.path.join(tmp_dir, "reference")
        os.mkdir(cand_dir)
        os.mkdir(ref_dir)
        for i, (candidate, reference) in enumerate(
                zip(candidates, references)):
            # Pairs with an empty reference are left out of the evaluation.
            if len(reference) < 1:
                continue
            with open(os.path.join(cand_dir, "cand.{}.txt".format(i)), "w",
                      encoding="utf-8") as f:
                f.write(candidate)
            with open(os.path.join(ref_dir, "ref.{}.txt".format(i)), "w",
                      encoding="utf-8") as f:
                f.write(reference)
        r = pyrouge.Rouge155()
        r.model_dir = ref_dir + "/"
        r.system_dir = cand_dir + "/"
        r.model_filename_pattern = 'ref.#ID#.txt'
        r.system_filename_pattern = r'cand.(\d+).txt'
        rouge_results = r.convert_and_evaluate()
        return r.output_to_dict(rouge_results)
    finally:
        if os.path.isdir(tmp_dir):
            shutil.rmtree(tmp_dir)


# The name matches pytest's ``test_*`` pattern; opt out of collection so a
# test run does not try to call it with fixtures named ``cand``/``ref``.
test_rouge.__test__ = False


def rouge_results_to_str(results_dict):
    """Format the F-scores of a ROUGE result dict as a one-line summary.

    Args:
        results_dict: mapping with the keys ``rouge_1_f_score``,
            ``rouge_2_f_score``, ``rouge_3_f_score``, ``rouge_l_f_score``
            and ``rouge_su*_f_score`` (fractions, scaled here by 100).

    Returns:
        A string such as ``">> ROUGE(1/2/3/L/SU4): 50.00/..."``.

    Raises:
        KeyError: if any of the expected keys is missing.
    """
    return ">> ROUGE(1/2/3/L/SU4): {:.2f}/{:.2f}/{:.2f}/{:.2f}/{:.2f}".format(
        results_dict["rouge_1_f_score"] * 100,
        results_dict["rouge_2_f_score"] * 100,
        results_dict["rouge_3_f_score"] * 100,
        results_dict["rouge_l_f_score"] * 100,
        results_dict["rouge_su*_f_score"] * 100)
def setup_seed(seed):
    """Seed every random number generator used by the entry point.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (CPU
    and all CUDA devices), then configures cuDNN for repeatable results.

    Args:
        seed: integer seed applied to all generators.
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    # With benchmarking on, cuDNN's autotuner may pick a different
    # convolution algorithm from run to run, which defeats the seeding.
    torch.backends.cudnn.benchmark = False