├── .gitignore
├── .travis.yml
├── CHANGELOG.md
├── CONTRIBUTING.md
├── LICENSE.md
├── ONMT_README.md
├── README.md
├── available_models
└── example.conf.json
├── config
├── config-rnn-summarization.yml
├── config-transformer-base-1GPU.yml
└── config-transformer-base-4GPU.yml
├── data
├── README.md
├── morph
│ ├── src.train
│ ├── src.valid
│ ├── tgt.train
│ └── tgt.valid
├── src-test.txt
├── src-train.txt
├── src-val.txt
├── test_model2.src
├── test_model2.tgt
├── tgt-train.txt
└── tgt-val.txt
├── docs
├── Makefile
├── requirements.txt
└── source
│ ├── CONTRIBUTING.md
│ ├── FAQ.md
│ ├── Library.ipynb
│ ├── Library.md
│ ├── Summarization.md
│ ├── _static
│ └── theme_overrides.css
│ ├── conf.py
│ ├── examples.rst
│ ├── extended.md
│ ├── im2text.md
│ ├── index.md
│ ├── index.rst
│ ├── main.md
│ ├── modules.rst
│ ├── onmt.inputters.rst
│ ├── onmt.modules.rst
│ ├── onmt.rst
│ ├── onmt.translate.translation_server.rst
│ ├── onmt.translation.rst
│ ├── options
│ ├── preprocess.rst
│ ├── server.rst
│ ├── train.rst
│ └── translate.rst
│ ├── quickstart.md
│ ├── ref.rst
│ ├── refs.bib
│ ├── speech2text.md
│ └── vid2text.rst
├── floyd.yml
├── floyd_requirements.txt
├── github_deploy_key_opennmt_opennmt_py.enc
├── onmt
├── __init__.py
├── bin
│ ├── __init__.py
│ ├── preprocess.py
│ ├── server.py
│ ├── train.py
│ └── translate.py
├── decoders
│ ├── __init__.py
│ ├── cnn_decoder.py
│ ├── decoder.py
│ ├── ensemble.py
│ └── transformer.py
├── encoders
│ ├── __init__.py
│ ├── audio_encoder.py
│ ├── cnn_encoder.py
│ ├── encoder.py
│ ├── image_encoder.py
│ ├── mean_encoder.py
│ ├── rnn_encoder.py
│ └── transformer.py
├── inputters
│ ├── __init__.py
│ ├── audio_dataset.py
│ ├── datareader_base.py
│ ├── dataset_base.py
│ ├── image_dataset.py
│ ├── inputter.py
│ ├── text_dataset.py
│ └── vec_dataset.py
├── model_builder.py
├── models
│ ├── __init__.py
│ ├── model.py
│ ├── model_saver.py
│ ├── sru.py
│ └── stacked_rnn.py
├── modules
│ ├── __init__.py
│ ├── average_attn.py
│ ├── conv_multi_step_attention.py
│ ├── copy_generator.py
│ ├── embeddings.py
│ ├── gate.py
│ ├── global_attention.py
│ ├── multi_headed_attn.py
│ ├── position_ffn.py
│ ├── sparse_activations.py
│ ├── sparse_losses.py
│ ├── structured_attention.py
│ ├── util_class.py
│ └── weight_norm.py
├── opts.py
├── tests
│ ├── __init__.py
│ ├── output_hyp.txt
│ ├── pull_request_chk.sh
│ ├── rebuild_test_models.sh
│ ├── sample_glove.txt
│ ├── test_attention.py
│ ├── test_audio_dataset.py
│ ├── test_beam.py
│ ├── test_beam_search.py
│ ├── test_copy_generator.py
│ ├── test_embeddings.py
│ ├── test_image_dataset.py
│ ├── test_model.pt
│ ├── test_model2.pt
│ ├── test_models.py
│ ├── test_models.sh
│ ├── test_preprocess.py
│ ├── test_random_sampling.py
│ ├── test_simple.py
│ ├── test_structured_attention.py
│ ├── test_text_dataset.py
│ ├── test_translation_server.py
│ └── utils_for_tests.py
├── train_single.py
├── trainer.py
├── translate
│ ├── __init__.py
│ ├── beam.py
│ ├── beam_search.py
│ ├── decode_strategy.py
│ ├── penalties.py
│ ├── process_zh.py
│ ├── random_sampling.py
│ ├── translation.py
│ ├── translation_server.py
│ └── translator.py
└── utils
│ ├── __init__.py
│ ├── cnn_factory.py
│ ├── distributed.py
│ ├── earlystopping.py
│ ├── logging.py
│ ├── loss.py
│ ├── misc.py
│ ├── optimizers.py
│ ├── parse.py
│ ├── report_manager.py
│ ├── rnn_factory.py
│ └── statistics.py
├── preprocess.py
├── requirement.txt
├── requirements.opt.txt
├── scripts
├── eval.py
├── inference_daily.sh
├── inference_ost.sh
├── preprocess.sh
├── train_daily.sh
└── train_ost.sh
├── server.py
├── setup.py
├── tools
├── README.md
├── apply_bpe.py
├── average_models.py
├── bpe_pipeline.sh
├── create_vocabulary.py
├── detokenize.perl
├── embeddings_to_torch.py
├── extract_embeddings.py
├── learn_bpe.py
├── multi-bleu-detok.perl
├── multi-bleu.perl
├── nonbreaking_prefixes
│ ├── README.txt
│ ├── nonbreaking_prefix.ca
│ ├── nonbreaking_prefix.cs
│ ├── nonbreaking_prefix.de
│ ├── nonbreaking_prefix.el
│ ├── nonbreaking_prefix.en
│ ├── nonbreaking_prefix.es
│ ├── nonbreaking_prefix.fi
│ ├── nonbreaking_prefix.fr
│ ├── nonbreaking_prefix.ga
│ ├── nonbreaking_prefix.hu
│ ├── nonbreaking_prefix.is
│ ├── nonbreaking_prefix.it
│ ├── nonbreaking_prefix.lt
│ ├── nonbreaking_prefix.lv
│ ├── nonbreaking_prefix.nl
│ ├── nonbreaking_prefix.pl
│ ├── nonbreaking_prefix.ro
│ ├── nonbreaking_prefix.ru
│ ├── nonbreaking_prefix.sk
│ ├── nonbreaking_prefix.sl
│ ├── nonbreaking_prefix.sv
│ ├── nonbreaking_prefix.ta
│ ├── nonbreaking_prefix.yue
│ └── nonbreaking_prefix.zh
├── release_model.py
├── test_rouge.py
├── tokenizer.perl
└── vid_feature_extractor.py
├── train.py
└── translate.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # repo-specific stuff
2 | pred.txt
3 | multi-bleu.perl
4 | *.pt
5 | \#*#
6 | .idea
7 | *.sublime-*
8 | .DS_Store
9 | data/
10 |
11 | # Byte-compiled / optimized / DLL files
12 | __pycache__/
13 | *.py[cod]
14 | *$py.class
15 |
16 | # C extensions
17 | *.so
18 |
19 | # Distribution / packaging
20 | .Python
21 | build/
22 | develop-eggs/
23 | dist/
24 | downloads/
25 | eggs/
26 | .eggs/
27 | lib/
28 | lib64/
29 | parts/
30 | sdist/
31 | var/
32 | wheels/
33 | *.egg-info/
34 | .installed.cfg
35 | *.egg
36 |
37 | # PyInstaller
38 | # Usually these files are written by a python script from a template
39 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
40 | *.manifest
41 | *.spec
42 |
43 | # Installer logs
44 | pip-log.txt
45 | pip-delete-this-directory.txt
46 |
47 | # Unit test / coverage reports
48 | htmlcov/
49 | .tox/
50 | .coverage
51 | .coverage.*
52 | .cache
53 | nosetests.xml
54 | coverage.xml
55 | *.cover
56 | .hypothesis/
57 |
58 | # Translations
59 | *.mo
60 | *.pot
61 |
62 | # Django stuff:
63 | *.log
64 | local_settings.py
65 |
66 | # Flask stuff:
67 | instance/
68 | .webassets-cache
69 |
70 | # Scrapy stuff:
71 | .scrapy
72 |
73 | # Sphinx documentation
74 | docs/_build/
75 |
76 | # PyBuilder
77 | target/
78 |
79 | # Jupyter Notebook
80 | .ipynb_checkpoints
81 |
82 | # pyenv
83 | .python-version
84 |
85 | # celery beat schedule file
86 | celerybeat-schedule
87 |
88 | # SageMath parsed files
89 | *.sage.py
90 |
91 | # Environments
92 | .env
93 | .venv
94 | env/
95 | venv/
96 | ENV/
97 |
98 | # Spyder project settings
99 | .spyderproject
100 | .spyproject
101 |
102 | # Rope project settings
103 | .ropeproject
104 |
105 | # mkdocs documentation
106 | /site
107 |
108 | # mypy
109 | .mypy_cache/
110 |
111 | # Tensorboard
112 | runs/
113 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributors
2 |
3 | OpenNMT-py is a community developed project and we love developer contributions.
4 |
5 | ## Guidelines
6 | Before sending a PR, please do this checklist first:
7 |
8 | - Please run `onmt/tests/pull_request_chk.sh` and fix any errors. When adding new functionality, also add tests to this script. Included checks:
9 | 1. flake8 check for coding style;
10 | 2. unittest;
11 | 3. continuous integration tests listed in `.travis.yml`.
12 | - When adding/modifying a class constructor, please give the arguments the same naming style as its superclass in PyTorch.
13 | - If your change is based on a paper, please include a clear comment and reference in the code (more on that below).
14 |
15 | ### Docstrings
16 | Above all, try to follow the Google docstring format
17 | ([Napoleon example](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html),
18 | [Google styleguide](http://google.github.io/styleguide/pyguide.html)).
19 | This makes it easy to include your contributions in the Sphinx documentation. And, do feel free
20 | to autodoc your contributions in the API ``.rst`` files in the `docs/source` folder! If you do, check that
21 | your additions look right.
22 |
23 | ```bash
24 | cd docs
25 | # install some dependencies if necessary:
26 | # recommonmark, sphinx_rtd_theme, sphinxcontrib-bibtex
27 | make html
28 | firefox build/html/main.html # or your browser of choice
29 | ```
30 |
31 | Some particular advice:
32 | - Try to follow Python 3 [``typing`` module](https://docs.python.org/3/library/typing.html) conventions when documenting types.
33 | - Exception: use "or" instead of unions for more readability
34 | - For external types, use the full "import name". Common abbreviations (e.g. ``np``) are acceptable.
35 | For ``torch.Tensor`` types, the ``torch.`` is optional.
36 | - Please don't use ticks like `` (`str`) `` or rst directives like `` (:obj:`str`) ``. Napoleon handles types
37 | very well without additional help, so avoid the clutter.
38 | - [Google docstrings don't support multiple returns](https://stackoverflow.com/questions/29221551/can-sphinx-napoleon-document-function-returning-multiple-arguments).
39 | For multiple returns, the following works well with Sphinx and is still very readable.
40 | ```python
41 | def foo(a, b):
42 | """This is my docstring.
43 |
44 | Args:
45 | a (object): Something.
46 | b (class): Another thing.
47 |
48 | Returns:
49 | (object, class):
50 |
51 | * a: Something or rather with a long
52 | description that spills over.
53 | * b: And another thing.
54 | """
55 |
56 | return a, b
57 | ```
58 | - When citing a paper, avoid directly linking in the docstring! Add a Bibtex entry to `docs/source/refs.bib`.
59 | E.g., to cite "Attention Is All You Need", visit [arXiv](https://arxiv.org/abs/1706.03762), choose the
60 | [bibtex](https://dblp.uni-trier.de/rec/bibtex/journals/corr/VaswaniSPUJGKP17) link, search `docs/source/refs.bib`
61 | using `CTRL-F` for `DBLP:journals/corr/VaswaniSPUJGKP17`, and if you do not find it then copy-paste the
62 | citation into `refs.bib`. Then, in your docstring, use ``:cite:`DBLP:journals/corr/VaswaniSPUJGKP17` ``.
63 | - However, a link is better than nothing.
64 | - Please document tensor shapes. Prefer the format
65 | ``` ``(a, b, c)`` ```. This style is easy to read, allows using ``x`` for multiplication, and is common
66 | (PyTorch uses a few variations on the parentheses format, AllenNLP uses exactly this format, Fairseq uses
67 | the parentheses format with single ticks).
68 | - Again, a different style is better than no shape documentation.
69 | - Please avoid unnecessary space characters, try to capitalize, and try to punctuate.
70 |
71 | For multi-line docstrings, add a blank line after the closing ``"""``.
72 | Don't use a blank line before the closing quotes.
73 |
74 | ``""" not this """`` ``"""This."""``
75 |
76 | ```python
77 | """
78 | Not this.
79 | """
80 | ```
81 | ```python
82 | """This."""
83 | ```
84 |
85 | This note is the least important. Focus on content first, but remember that consistent docs look good.
86 | - Be sensible about the first line. Generally, one stand-alone summary line (per the Google guidelines) is good.
87 | Sometimes, it's better to cut directly to the args or an extended description. It's always acceptable to have a
88 | "trailing" citation.
89 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017-Present OpenNMT
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AdaLabel
2 |
3 | Code/data for ACL'21 paper "Diversifying Dialog Generation via Adaptive Label Smoothing".
4 |
5 | We implemented an Adaptive Label Smoothing (AdaLabel) approach that can adaptively estimate a target label distribution at each time step for different contexts.
6 | Our method is an extension of the traditional MLE loss.
7 | The current implementation is designed for the task of dialogue generation.
8 | However, our approach can be readily extended to other text generation tasks such as summarization.
9 | Please refer to our paper for more details.
10 |
11 | Our implementation is based on the [OpenNMT-py](https://github.com/OpenNMT/OpenNMT-py) project,
12 | therefore most behaviors of our code follow the default settings in OpenNMT-py.
13 | Specifically, we forked from [this commit](https://github.com/OpenNMT/OpenNMT-py/tree/1bbf410a00e1d15c87fc5393b9124d531e134445) of OpenNMT-py,
14 | and implemented our code on top of it.
15 | This repo preserves all previous commits of OpenNMT-py and ignores all the follow-up commits.
16 | Our changes can be viewed by comparing the [commits](https://github.com/lemon234071/AdaLabel/commit/4b9531943a4e00f1ee8a7f4b8bf3554e2b1e0f41).
17 |
18 | Our code is tested on Ubuntu 16.04 using python 3.7.4 and PyTorch 1.7.1.
19 |
20 | ## How to use
21 |
22 | ### Step 1: Setup
23 |
24 | Install dependencies:
25 |
26 | ```bash
27 | conda create -n adalabel python=3.7.4
28 | conda activate adalabel
29 | conda install pytorch==1.7.1 cudatoolkit=10.1 -c PyTorch -n adalabel
30 | pip install -r requirement.txt
31 | ```
32 |
33 | Make folders to store training and testing files:
34 |
35 | ```bash
36 | mkdir checkpoint # Model checkpoints will be saved here
37 | mkdir log_dir # The training log will be placed here
38 | mkdir result # The inferred results will be saved here
39 | ```
40 |
41 | ### Step 2: Preprocess the data
42 |
43 | The data can be downloaded from this [link](https://drive.google.com/file/d/1U4M0h9tLNeCyu9JBfSgR3r5EE6IIqyNZ/view?usp=sharing).
44 | After downloading and unzipping, the DailyDialog and OpenSubtitle datasets used in our paper can be found in the `data_daily` and `data_ost` folders, respectively.
45 | We provide a script `scripts/preprocess.sh` to preprocess the data.
46 |
47 | ```bash
48 | bash scripts/preprocess.sh
49 | ```
50 |
51 | Note:
52 |
53 | - Before running `scripts/preprocess.sh`, remember to modify its first line (i.e., the value of `DATA_DIR`) to specify the correct data folder.
54 | - The default choice of our tokenizer is [bert-base-uncased](https://huggingface.co/bert-base-uncased)
55 |
56 | ### Step 3: Train the model
57 |
58 | The training of our model can be performed using the following script:
59 |
60 | ```bash
61 | bash scripts/train_daily.sh # Train models on the DailyDialog dataset
62 | ```
63 |
64 | or
65 |
66 | ```bash
67 | bash scripts/train_ost.sh # Train models on the OpenSubtitle dataset
68 | ```
69 |
70 | Note:
71 |
72 | - The resulting checkpoints will be written to the `checkpoint` folder.
73 | - By default, our script uses the first available GPU.
74 | - Once the training is completed, the training script will report the best-performing model on the validation set.
75 | - Experiments in our paper are performed using TITAN XP with 12GB memory.
76 |
77 | ### Step 4: Inference
78 |
79 | The inference of our model can be performed using the following script:
80 |
81 | ```bash
82 | bash scripts/inference_daily.sh {which GPU to use} {path to your model checkpoint} # Infer models on the DailyDialog dataset
83 | ```
84 |
85 | or
86 |
87 | ```bash
88 | bash scripts/inference_ost.sh {which GPU to use} {path to your model checkpoint} # Infer models on the OpenSubtitle dataset
89 | ```
90 |
91 |
92 | Note:
93 |
94 | - Inferred outputs will be saved to the `result` folder.
95 |
96 | ### Step 5: Evaluation
97 |
98 | The following script can be used to evaluate our model based on the inferred outputs obtained in Step 4:
99 |
100 | ```bash
101 | python scripts/eval.py {path to the data folder} {path to the inferred output file}
102 | ```
103 |
104 | ## Citation
105 |
106 | Please cite our paper if you find this repo useful :)
107 |
108 | ```BibTeX
109 | @inproceedings{wang2021adalabel,
110 | title={Diversifying Dialog Generation via Adaptive Label Smoothing},
111 | author={Wang, Yida and Zheng, Yinhe and Jiang, Yong and Huang, Minlie},
112 | booktitle={Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics},
113 | year={2021}
114 | }
115 | ```
116 |
117 | ----
118 |
119 | Issues and pull requests are welcome.
120 |
--------------------------------------------------------------------------------
/available_models/example.conf.json:
--------------------------------------------------------------------------------
1 | {
2 | "models_root": "./available_models",
3 | "models": [
4 | {
5 | "id": 100,
6 | "model": "model_0.pt",
7 | "timeout": 600,
8 | "on_timeout": "to_cpu",
9 | "load": true,
10 | "opt": {
11 | "gpu": 0,
12 | "beam_size": 5
13 | },
14 | "tokenizer": {
15 | "type": "sentencepiece",
16 | "model": "wmtenfr.model"
17 | }
18 | },{
19 | "model": "model_0.light.pt",
20 | "timeout": -1,
21 | "on_timeout": "unload",
22 | "model_root": "../other_models",
23 | "opt": {
24 | "batch_size": 1,
25 | "beam_size": 10
26 | }
27 | }
28 | ]
29 | }
30 |
--------------------------------------------------------------------------------
/config/config-rnn-summarization.yml:
--------------------------------------------------------------------------------
1 | data: data/cnndm/CNNDM
2 | save_model: models/cnndm
3 | save_checkpoint_steps: 10000
4 | keep_checkpoint: 10
5 | seed: 3435
6 | train_steps: 100000
7 | valid_steps: 10000
8 | report_every: 100
9 |
10 | encoder_type: brnn
11 | word_vec_size: 128
12 | rnn_size: 512
13 | layers: 1
14 |
15 | optim: adagrad
16 | learning_rate: 0.15
17 | adagrad_accumulator_init: 0.1
18 | max_grad_norm: 2
19 |
20 | batch_size: 16
21 | dropout: 0.0
22 |
23 | copy_attn: 'true'
24 | global_attention: mlp
25 | reuse_copy_attn: 'true'
26 | bridge: 'true'
27 |
28 | world_size: 2
29 | gpu_ranks:
30 | - 0
31 | - 1
32 |
--------------------------------------------------------------------------------
/config/config-transformer-base-1GPU.yml:
--------------------------------------------------------------------------------
1 | data: exp/dataset.de-en
2 | save_model: exp/model.de-en
3 | save_checkpoint_steps: 10000
4 | keep_checkpoint: 10
5 | seed: 3435
6 | train_steps: 500000
7 | valid_steps: 10000
8 | warmup_steps: 8000
9 | report_every: 100
10 |
11 | decoder_type: transformer
12 | encoder_type: transformer
13 | word_vec_size: 512
14 | rnn_size: 512
15 | layers: 6
16 | transformer_ff: 2048
17 | heads: 8
18 |
19 | accum_count: 8
20 | optim: adam
21 | adam_beta1: 0.9
22 | adam_beta2: 0.998
23 | decay_method: noam
24 | learning_rate: 2.0
25 | max_grad_norm: 0.0
26 |
27 | batch_size: 4096
28 | batch_type: tokens
29 | normalization: tokens
30 | dropout: 0.1
31 | label_smoothing: 0.1
32 |
33 | max_generator_batches: 2
34 |
35 | param_init: 0.0
36 | param_init_glorot: 'true'
37 | position_encoding: 'true'
38 |
39 | world_size: 1
40 | gpu_ranks:
41 | - 0
42 |
43 |
--------------------------------------------------------------------------------
/config/config-transformer-base-4GPU.yml:
--------------------------------------------------------------------------------
1 | data: exp/dataset.de-en
2 | save_model: exp/model.de-en
3 | save_checkpoint_steps: 10000
4 | keep_checkpoint: 10
5 | seed: 3435
6 | train_steps: 200000
7 | valid_steps: 10000
8 | warmup_steps: 8000
9 | report_every: 100
10 |
11 | decoder_type: transformer
12 | encoder_type: transformer
13 | word_vec_size: 512
14 | rnn_size: 512
15 | layers: 6
16 | transformer_ff: 2048
17 | heads: 8
18 |
19 | accum_count: 2
20 | optim: adam
21 | adam_beta1: 0.9
22 | adam_beta2: 0.998
23 | decay_method: noam
24 | learning_rate: 2.0
25 | max_grad_norm: 0.0
26 |
27 | batch_size: 4096
28 | batch_type: tokens
29 | normalization: tokens
30 | dropout: 0.1
31 | label_smoothing: 0.1
32 |
33 | max_generator_batches: 2
34 |
35 | param_init: 0.0
36 | param_init_glorot: 'true'
37 | position_encoding: 'true'
38 |
39 | world_size: 4
40 | gpu_ranks:
41 | - 0
42 | - 1
43 | - 2
44 | - 3
45 |
46 |
--------------------------------------------------------------------------------
/data/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | > python preprocess.py -train_src data/src-train.txt -train_tgt data/tgt-train.txt -valid_src data/src-val.txt -valid_tgt data/tgt-val.txt -save_data data/data -src_vocab_size 1000 -tgt_vocab_size 1000
6 |
7 | > python train.py -data data/data -save_model /n/rush_lab/data/tmp_ -world_size 1 -gpu_ranks 0 -rnn_size 100 -word_vec_size 50 -layers 1 -train_steps 100 -optim adam -learning_rate 0.001
8 |
--------------------------------------------------------------------------------
/data/morph/src.valid:
--------------------------------------------------------------------------------
1 | o p r u g a
2 | б е с т и ј а
3 | f a l s e t
4 | b e z i m e n o
5 | п р с и т и
6 | d e c e m b a r s k i
7 | м е ђ а
8 | š t a n d a r a c
9 | к р п о
10 | t a j n i k
11 | к е с и т и
12 | с р ч а н
13 | s c e n o g r a f
14 | б р а т и ћ
15 | з а у з е т и
16 | v e z a
17 | н е п а л а ц
18 | p r i l a g o d l j i v o s t
19 | g o l f e r i c a
20 | г р а ђ е в и н а ц
21 | с е к л у з и ј а
22 | р а с к р с н и ц а
23 | n e o p r a v d a n o
24 | k o m a d
25 | b e s t i j a l n o s t
26 | t u p o g l a v o
27 | с т р а н
28 | к р и в и ч а р
29 | п р о ј а х а т и
30 | о р г а н с к и
31 | r a z m i r i c a
32 | ц е с а р
33 | l j u t n j a
34 | н е в е ш т
35 | н о б е л о в а ц
36 | o s v a j a č
37 | х е д о н и с т
38 | р о ж н а т
39 | s r e b r e n
40 | p r i m e r a n
41 | и с к р и ч а в о
42 | s a g i n j a t i
43 | с и т а н
44 | v e š m a š i n a
45 | b l a m a ž a
46 | н е г и р а т и
47 | n a z d r a v i t i
48 | б о ж и
49 | r e š o
50 | к о ш у т е
51 | и м е н д а н
52 | e l e g a n t n o
53 | s k v r č i t i
54 | b e z v o l j a
55 | н а г и б
56 | k a p i t a l a n
57 | r e n o m e
58 | ф а с ц и н и р а т и
59 | с а к у п љ а т и
60 | п ш е н и ч н и
61 | u s m j e r a v a t i
62 | s k r i v a t i
63 | t i c a t i
64 | b i t n o
65 | j u r i š
66 | ч а с о п и с
67 | b e s k o n a č n o s t
68 | с а ф и р н и
69 | f a l s i f i k a t
70 | o n t o g e n e z a
71 | p r i m j e n a
72 | к р е к е т
73 | а е р о д р о м
74 | o s l o b a đ a t i
75 | с т у д и ј
76 | к р а т к о в и д
77 | л у п е ш т в о
78 | ž i v a c
79 | ј а ј а ш ц е
80 | r a z u v e r i t i
81 | z a v i s t
82 | k o n d u r a
83 | а м ф и т е а т а р
84 | а л г о н к и ј с к и
85 | у м и в а т и
86 | д а ж д е в њ а к
87 | д е ф е т и с т
88 | p r e s k a k a t i
89 | п е р у а н с к и
90 | s u p r o t s t a v l j a t i
91 | s p l j o š t e n
92 | с а ф и р
93 | п о ш т е ђ и в а т и
94 | к р а н и о л о г и ј с к и
95 | s v o j t a
96 | у п а д а т и
97 | r a ž a n j
98 | д о л и к о в а т и
99 | с а м о у в е р е н о с т
100 | х у м о р и с т и ч к и
101 | ч е ш к и
102 | s r a t i
103 | б л е б н у т и
104 | о д с е ц а т и
105 | n o v o r o đ e n
106 | b r e s k v a
107 | о б р е д
108 | k o n a č a n
109 | г р и с т и
110 | i k a v a c
111 | e p i l e p t i č a r k a
112 | d e z o r g a n i z a c i j a
113 | о т е ћ и
114 | z n a m e n j e
115 | м у љ а в
116 | i t a l i j a n s k i
117 | p r e t v o r b a
118 | g e r i j a t r i j a
119 | a l b a n i z a c i j a
120 | о ц ј е н а
121 | o d b i t i
122 | r e t o r i č n o
123 | м о љ а ц
124 | š k a f e t i n
125 | s p a n a ć
126 | у р а н
127 | n e p r i s t u p a č a n
128 | č e l n i k
129 | о п л о ђ и в а т и
130 | к и с т
131 | с а с е ћ и
132 | п о з д р а в
133 | ч а ђ а в о с т
134 | х а н
135 | п р и п о в е д а ч
136 | k i n o l o g i j a
137 | a s t r o n o m i j s k i
138 | n e i z l j e č i v o s t
139 | u s l o v a n
140 | s r p s k i
141 | e v o l u c i o n i z a m
142 | а н о т и р а т и
143 | d o s t a v i t i
144 | с а д а
145 | д о д а т а к
146 | p r o p i s i v a t i
147 | u s t v r đ i v a t i
148 | m i j e š a l i c a
149 | е г з и с т е н ц и ј а л а н
150 | и в и ц а
151 | a u t o k e f a l a n
152 | ж и в а х н о с т
153 | и з о р а в а т и
154 | х л а ч е
155 | к о н с о н а н т
156 | у з м а ћ и
157 | и м и г р и р а т и
158 | п л и т а к
159 | т е о з о ф и ј а
160 | к р а т к о в р а т
161 | s a t a n i z a m
162 | х а р а ч
163 | o t r c a n
164 | g i b a k
165 | k o s t r u š i t i
166 | o d g o v o r i t i
167 | b a l a v i t i
168 | f l a š i r a t i
169 | с л а ч и ц а
170 | t e n d e n c i j a
171 | g d a
172 | m n o ž i n a
173 | т е л е о л о г и ј а
174 | k r i z a n t e m a
175 | l j e t o
176 | к о н т р а д и к ц и ј а
177 | и н д и р е к т а н
178 | s l a m k a
179 | š t r a j h a t i
180 | п а п и р н и ч а р
181 | k u r t o a z n o
182 | o č e k i v a n j e
183 | n a m a ć i
184 | z a m a ć i
185 | b i f t e k
186 | с т о г а
187 | х о м о г е н
188 | з а м а к н у т и
189 | o k r i v i t i
190 | у ч и н и т и
191 | k i n u t i
192 | o b r t a t i
193 | t r a n s p l a n t a c i j a
194 | п р е в е л и к
195 | u k i d a t i
196 | к а м е н и
197 |
--------------------------------------------------------------------------------
/data/morph/tgt.valid:
--------------------------------------------------------------------------------
1 | o p r u ɡ a
2 | b e s t i j a
3 | f a l s e t
4 | b e z i m e n o
5 | p r s i t i
6 | d e t s e m b a r s k i
7 | m e d z a
8 | ʃ t a n d a r a t s
9 | k r p o
10 | t a j n i k
11 | k e s i t i
12 | s r t ʃ a n
13 | s t s e n o ɡ r a f
14 | b r a t i t x
15 | z a u z e t i
16 | ʋ e z a
17 | n e p a l a t s
18 | p r i l a ɡ o d ʎ i ʋ o s t
19 | ɡ o l f e r i t s a
20 | ɡ r a d z e ʋ i n a t s
21 | s e k l u z i j a
22 | r a s k r s n i t s a
23 | n e o p r a ʋ d a n o
24 | k o m a d
25 | b e s t i j a l n o s t
26 | t u p o ɡ l a ʋ o
27 | s t r a n
28 | k r i ʋ i t ʃ a r
29 | p r o j a x a t i
30 | o r ɡ a n s k i
31 | r a z m i r i t s a
32 | t s e s a r
33 | ʎ u t ɲ a
34 | n e ʋ e ʃ t
35 | n o b e l o ʋ a t s
36 | o s ʋ a j a t ʃ
37 | x e d o n i s t
38 | r o ʒ n a t
39 | s r e b r e n
40 | p r i m e r a n
41 | i s k r i t ʃ a ʋ o
42 | s a ɡ i ɲ a t i
43 | s i t a n
44 | ʋ e ʃ m a ʃ i n a
45 | b l a m a ʒ a
46 | n e ɡ i r a t i
47 | n a z d r a ʋ i t i
48 | b o ʒ i
49 | r e ʃ o
50 | k o ʃ u t e
51 | i m e n d a n
52 | e l e ɡ a n t n o
53 | s k ʋ r t ʃ i t i
54 | b e z ʋ o ʎ a
55 | n a ɡ i b
56 | k a p i t a l a n
57 | r e n o m e
58 | f a s t s i n i r a t i
59 | s a k u p ʎ a t i
60 | p ʃ e n i t ʃ n i
61 | u s m j e r a ʋ a t i
62 | s k r i ʋ a t i
63 | t i t s a t i
64 | b i t n o
65 | j u r i ʃ
66 | t ʃ a s o p i s
67 | b e s k o n a t ʃ n o s t
68 | s a f i r n i
69 | f a l s i f i k a t
70 | o n t o ɡ e n e z a
71 | p r i m j e n a
72 | k r e k e t
73 | a e r o d r o m
74 | o s l o b a d z a t i
75 | s t u d i j
76 | k r a t k o ʋ i d
77 | l u p e ʃ t ʋ o
78 | ʒ i ʋ a t s
79 | j a j a ʃ t s e
80 | r a z u ʋ e r i t i
81 | z a ʋ i s t
82 | k o n d u r a
83 | a m f i t e a t a r
84 | a l ɡ o n k i j s k i
85 | u m i ʋ a t i
86 | d a ʒ d e ʋ ɲ a k
87 | d e f e t i s t
88 | p r e s k a k a t i
89 | p e r u a n s k i
90 | s u p r o t s t a ʋ ʎ a t i
91 | s p ʎ o ʃ t e n
92 | s a f i r
93 | p o ʃ t e d z i ʋ a t i
94 | k r a n i o l o ɡ i j s k i
95 | s ʋ o j t a
96 | u p a d a t i
97 | r a ʒ a ɲ
98 | d o l i k o ʋ a t i
99 | s a m o u ʋ e r e n o s t
100 | x u m o r i s t i t ʃ k i
101 | t ʃ e ʃ k i
102 | s r a t i
103 | b l e b n u t i
104 | o d s e t s a t i
105 | n o ʋ o r o d z e n
106 | b r e s k ʋ a
107 | o b r e d
108 | k o n a t ʃ a n
109 | ɡ r i s t i
110 | i k a ʋ a t s
111 | e p i l e p t i t ʃ a r k a
112 | d e z o r ɡ a n i z a t s i j a
113 | o t e t x i
114 | z n a m e ɲ e
115 | m u ʎ a ʋ
116 | i t a l i j a n s k i
117 | p r e t ʋ o r b a
118 | ɡ e r i j a t r i j a
119 | a l b a n i z a t s i j a
120 | o t s j e n a
121 | o d b i t i
122 | r e t o r i t ʃ n o
123 | m o ʎ a t s
124 | ʃ k a f e t i n
125 | s p a n a t x
126 | u r a n
127 | n e p r i s t u p a t ʃ a n
128 | t ʃ e l n i k
129 | o p l o d z i ʋ a t i
130 | k i s t
131 | s a s e t x i
132 | p o z d r a ʋ
133 | t ʃ a d z a ʋ o s t
134 | x a n
135 | p r i p o ʋ e d a t ʃ
136 | k i n o l o ɡ i j a
137 | a s t r o n o m i j s k i
138 | n e i z ʎ e t ʃ i ʋ o s t
139 | u s l o ʋ a n
140 | s r p s k i
141 | e ʋ o l u t s i o n i z a m
142 | a n o t i r a t i
143 | d o s t a ʋ i t i
144 | s a d a
145 | d o d a t a k
146 | p r o p i s i ʋ a t i
147 | u s t ʋ r d z i ʋ a t i
148 | m i j e ʃ a l i t s a
149 | e ɡ z i s t e n t s i j a l a n
150 | i ʋ i t s a
151 | a u t o k e f a l a n
152 | ʒ i ʋ a x n o s t
153 | i z o r a ʋ a t i
154 | x l a t ʃ e
155 | k o n s o n a n t
156 | u z m a t x i
157 | i m i ɡ r i r a t i
158 | p l i t a k
159 | t e o z o f i j a
160 | k r a t k o ʋ r a t
161 | s a t a n i z a m
162 | x a r a t ʃ
163 | o t r t s a n
164 | ɡ i b a k
165 | k o s t r u ʃ i t i
166 | o d ɡ o ʋ o r i t i
167 | b a l a ʋ i t i
168 | f l a ʃ i r a t i
169 | s l a t ʃ i t s a
170 | t e n d e n t s i j a
171 | ɡ d a
172 | m n o ʒ i n a
173 | t e l e o l o ɡ i j a
174 | k r i z a n t e m a
175 | ʎ e t o
176 | k o n t r a d i k t s i j a
177 | i n d i r e k t a n
178 | s l a m k a
179 | ʃ t r a j x a t i
180 | p a p i r n i t ʃ a r
181 | k u r t o a z n o
182 | o t ʃ e k i ʋ a ɲ e
183 | n a m a t x i
184 | z a m a t x i
185 | b i f t e k
186 | s t o ɡ a
187 | x o m o ɡ e n
188 | z a m a k n u t i
189 | o k r i ʋ i t i
190 | u t ʃ i n i t i
191 | k i n u t i
192 | o b r t a t i
193 | t r a n s p l a n t a t s i j a
194 | p r e ʋ e l i k
195 | u k i d a t i
196 | k a m e n i
197 |
--------------------------------------------------------------------------------
/data/test_model2.src:
--------------------------------------------------------------------------------
1 | а з и ј а т с к и
2 | а к р о б а т с к и
3 | а л к о х о л и ч а р
4 | а р м а т у р а
5 | а у т о н о м а ш т в о
6 | б а р о к н и
7 | б е з б р о ј а н
8 | б о р о в и н а
9 | б о с а н а ц
10 | б р а у з е р
11 | в о ј н и ш т в о
12 | г д е г д е
13 | г р а б љ е
14 | г р д о с и ј а
15 | д а л е к о в и д н о
16 | д е з и н ф о р м а ц и ј а
17 | д е р и ш т е
18 | д р о б њ а к
19 | е м а н ц и п а ц и ј а
20 | ж а н д а р м е р и ј а
21 | з а в и д љ и в а ц
22 | з а к о ч и т и
23 | з а н е м а р и т и
24 | з в о н о
25 | з л о д ј е л о
26 | и г р о к а з
27 | ј е д и н и т и
28 | ј е д н о с т а в н о
29 | к о з л и н а ц
30 | к о н с т р у к т и в а н
31 | к р е к е т н у т и
32 | к у ш а ч и ц а
33 | л е г и т и м н о
34 | л и з а л и ц а
35 | л о м љ а в а
36 | м а м у р л у к
37 | м е д а љ а
38 | м о р а л н о
39 | н е г о с т о љ у б и в о с т
40 | н е д ј е љ н и
41 | н е к о ј и
42 | н е с т а н а к
43 | н е с т р у ч њ а к
44 | о д м о р и т и
45 | о п о р и ц а т и
46 | о п р и ч а т и
47 | о с а о
48 | п а њ
49 | п е р ф е к т
50 | п о н а в љ а т и
51 | п о п р и м и т и
52 | п р а с л о в е н с к и
53 | п р и г о д а н
54 | п р и п р е м а т и
55 | п с и х о п а т о л о г и ј а
56 | п с о в а ч к и
57 | п у н ч
58 | р а з а п и њ а т и
59 | с а б и т и
60 | с а г и б љ и в о с т
61 | с а к р и т и
62 | с а к р о с а н к т а н
63 | с а л а т н и
64 | с и р н и
65 | с к у п о ц ј е н
66 | с л а т к о р ј е ч и в о
67 | с н о ш љ и в о
68 | с о ч н о
69 | с т и д љ и в
70 | т а ј
71 | т а н а ц
72 | т е с т е н и н а
73 | т р а н з и т
74 | т р ч а т и
75 | ћ у м у р џ и ј а
76 | у ж и в а л а ц
77 | у к о р е н и т и
78 | у п о м о ћ
79 | у р о т н и к
80 | у с м ј е р а в а т и
81 | у с п у т
82 | у с т а ј а т и
83 | у ц е н и т и
84 | ф а н а т и ч н о с т
85 | ф о т к а
86 | х и љ а д у
87 | х и п и
88 | х у м а н и з а м
89 | ц р н е т и
90 | ш а м п и о н с к и
91 | ш и ф к а р т а
92 | ш л а ј е р
93 | ш л а ј ф а т и
94 | ш л у к
95 | ш п а ј з
96 | ш п а ј с к а р т а
97 | ш п а н с к и
98 | ш т о к а в а ц
99 | ш т р а ј х е р
100 | ш у п ч и ћ
101 | a g r e s o r s k i
102 | a k t e r
103 | a m b r o z i j a
104 | a p s t i n e n c i j a
105 | a s i m e t r i j a
106 | a v i o k o m p a n i j a
107 | d a ž d e v n j a k
108 | d e l o m
109 | d i j a l o g
110 | d o h v a t i t i
111 | d o k t o r a n d
112 | d o k t o r s k i
113 | d o v i t l j i v
114 | d o z v o l a
115 | d v o s m i s l e n
116 | e r i t r e j a
117 | e s t e t i k a
118 | e v r o p s k i
119 | f i z i o t e r a p i j s k i
120 | g a j
121 | g m a z
122 | h a j d e m o
123 | i n j e
124 | i n t o n a c i j a
125 | k e s t e n j a s t
126 | k o l e v k a
127 | k o z j i
128 | k r a l j e š a k
129 | k r a t k o
130 | k r č e v i n a
131 | k r e a t i v a n
132 | k r e š e n d o
133 | k u ć a n i c a
134 | k u ć n i
135 | l i č a n
136 | l j u l j a t i
137 | m e s n i
138 | m r l j a
139 | m u š k o s t
140 | n a b r u s i v a t i
141 | n a d o b u d a n
142 | n a k a l e m i t i
143 | n a r e č j e
144 | n a s l e đ i v a t i
145 | n e ć a k
146 | n e d e l j i v
147 | n e o s e t l j i v o
148 | n e s a v e s t a n
149 | n e s u v i s a o
150 | n e u r a s t e n i č a n
151 | n e u s t a v a n
152 | n i š a n d ž i j a
153 | o b r a ć a t i
154 | o k r e t a t i
155 | o n e s p o s o b l j i v a t i
156 | o s a k a ć i v a t i
157 | o s v e t l j i v
158 | p a l e o z o i k
159 | p e s n i k
160 | p l a t n e n
161 | p l e m e n i t
162 | p r a v o v a l j a n o s t
163 | p r a ž a n i n
164 | p r i p r a v l j a t i
165 | p r o l e t o s
166 | r a z g o v a r a t i
167 | r a z n o l i č a n
168 | r a z r e d
169 | r a z v r a t n o
170 | r i g i d i t e t
171 | r u b e š k i
172 | s e r i o z a n
173 | s i n o ć
174 | s i r u t k a
175 | s l e d i t i
176 | s m j e š t a j
177 | š n i r a t i
178 | s o k a k
179 | š p e k
180 | s r a m i t i
181 | š t i t o n o š a
182 | s t o t i n a
183 | t i n j a t i
184 | t o k s i k o l o g i j a
185 | t o l i k
186 | t r a n s a t l a n t s k i
187 | u n u t a r
188 | u z g r e d i c e
189 | v a k c i n a
190 | v a š m a š i n a
191 | v e ć e
192 | v l a d a v i n a
193 | v o k a c i j a
194 | z a b u š a n t s k i
195 | z a k l i n j a t i
196 | z a k o č i t i
197 | z n a t a n
198 | z o r a n
199 |
--------------------------------------------------------------------------------
/data/test_model2.tgt:
--------------------------------------------------------------------------------
1 | a z i j a t s k i
2 | a k r o b a t s k i
3 | a l k o x o l i t ʃ a r
4 | a r m a t u r a
5 | a u t o n o m a ʃ t ʋ o
6 | b a r o k n i
7 | b e z b r o j a n
8 | b o r o ʋ i n a
9 | b o s a n a t s
10 | b r a u z e r
11 | ʋ o j n i ʃ t ʋ o
12 | ɡ d e ɡ d e
13 | ɡ r a b ʎ e
14 | ɡ r d o s i j a
15 | d a l e k o ʋ i d n o
16 | d e z i n f o r m a t s i j a
17 | d e r i ʃ t e
18 | d r o b ɲ a k
19 | e m a n t s i p a t s i j a
20 | ʒ a n d a r m e r i j a
21 | z a ʋ i d ʎ i ʋ a t s
22 | z a k o t ʃ i t i
23 | z a n e m a r i t i
24 | z ʋ o n o
25 | z l o d j e l o
26 | i ɡ r o k a z
27 | j e d i n i t i
28 | j e d n o s t a ʋ n o
29 | k o z l i n a t s
30 | k o n s t r u k t i ʋ a n
31 | k r e k e t n u t i
32 | k u ʃ a t ʃ i t s a
33 | l e ɡ i t i m n o
34 | l i z a l i t s a
35 | l o m ʎ a ʋ a
36 | m a m u r l u k
37 | m e d a ʎ a
38 | m o r a l n o
39 | n e ɡ o s t o ʎ u b i ʋ o s t
40 | n e d j e ʎ n i
41 | n e k o j i
42 | n e s t a n a k
43 | n e s t r u t ʃ ɲ a k
44 | o d m o r i t i
45 | o p o r i t s a t i
46 | o p r i t ʃ a t i
47 | o s a o
48 | p a ɲ
49 | p e r f e k t
50 | p o n a ʋ ʎ a t i
51 | p o p r i m i t i
52 | p r a s l o ʋ e n s k i
53 | p r i ɡ o d a n
54 | p r i p r e m a t i
55 | p s i x o p a t o l o ɡ i j a
56 | p s o ʋ a t ʃ k i
57 | p u n t ʃ
58 | r a z a p i ɲ a t i
59 | s a b i t i
60 | s a ɡ i b ʎ i ʋ o s t
61 | s a k r i t i
62 | s a k r o s a n k t a n
63 | s a l a t n i
64 | s i r n i
65 | s k u p o t s j e n
66 | s l a t k o r j e t ʃ i ʋ o
67 | s n o ʃ ʎ i ʋ o
68 | s o t ʃ n o
69 | s t i d ʎ i ʋ
70 | t a j
71 | t a n a t s
72 | t e s t e n i n a
73 | t r a n z i t
74 | t r t ʃ a t i
75 | t x u m u r d ʒ i j a
76 | u ʒ i ʋ a l a t s
77 | u k o r e n i t i
78 | u p o m o t x
79 | u r o t n i k
80 | u s m j e r a ʋ a t i
81 | u s p u t
82 | u s t a j a t i
83 | u t s e n i t i
84 | f a n a t i t ʃ n o s t
85 | f o t k a
86 | x i ʎ a d u
87 | x i p i
88 | x u m a n i z a m
89 | t s r n e t i
90 | ʃ a m p i o n s k i
91 | ʃ i f k a r t a
92 | ʃ l a j e r
93 | ʃ l a j f a t i
94 | ʃ l u k
95 | ʃ p a j z
96 | ʃ p a j s k a r t a
97 | ʃ p a n s k i
98 | ʃ t o k a ʋ a t s
99 | ʃ t r a j x e r
100 | ʃ u p t ʃ i t x
101 | a ɡ r e s o r s k i
102 | a k t e r
103 | a m b r o z i j a
104 | a p s t i n e n t s i j a
105 | a s i m e t r i j a
106 | a ʋ i o k o m p a n i j a
107 | d a ʒ d e ʋ ɲ a k
108 | d e l o m
109 | d i j a l o ɡ
110 | d o x ʋ a t i t i
111 | d o k t o r a n d
112 | d o k t o r s k i
113 | d o ʋ i t ʎ i ʋ
114 | d o z ʋ o l a
115 | d ʋ o s m i s l e n
116 | e r i t r e j a
117 | e s t e t i k a
118 | e ʋ r o p s k i
119 | f i z i o t e r a p i j s k i
120 | ɡ a j
121 | ɡ m a z
122 | x a j d e m o
123 | i ɲ e
124 | i n t o n a t s i j a
125 | k e s t e ɲ a s t
126 | k o l e ʋ k a
127 | k o z j i
128 | k r a ʎ e ʃ a k
129 | k r a t k o
130 | k r t ʃ e ʋ i n a
131 | k r e a t i ʋ a n
132 | k r e ʃ e n d o
133 | k u t x a n i t s a
134 | k u t x n i
135 | l i t ʃ a n
136 | ʎ u ʎ a t i
137 | m e s n i
138 | m r ʎ a
139 | m u ʃ k o s t
140 | n a b r u s i ʋ a t i
141 | n a d o b u d a n
142 | n a k a l e m i t i
143 | n a r e t ʃ j e
144 | n a s l e d z i ʋ a t i
145 | n e t x a k
146 | n e d e ʎ i ʋ
147 | n e o s e t ʎ i ʋ o
148 | n e s a ʋ e s t a n
149 | n e s u ʋ i s a o
150 | n e u r a s t e n i t ʃ a n
151 | n e u s t a ʋ a n
152 | n i ʃ a n d ʒ i j a
153 | o b r a t x a t i
154 | o k r e t a t i
155 | o n e s p o s o b ʎ i ʋ a t i
156 | o s a k a t x i ʋ a t i
157 | o s ʋ e t ʎ i ʋ
158 | p a l e o z o i k
159 | p e s n i k
160 | p l a t n e n
161 | p l e m e n i t
162 | p r a ʋ o ʋ a ʎ a n o s t
163 | p r a ʒ a n i n
164 | p r i p r a ʋ ʎ a t i
165 | p r o l e t o s
166 | r a z ɡ o ʋ a r a t i
167 | r a z n o l i t ʃ a n
168 | r a z r e d
169 | r a z ʋ r a t n o
170 | r i ɡ i d i t e t
171 | r u b e ʃ k i
172 | s e r i o z a n
173 | s i n o t x
174 | s i r u t k a
175 | s l e d i t i
176 | s m j e ʃ t a j
177 | ʃ n i r a t i
178 | s o k a k
179 | ʃ p e k
180 | s r a m i t i
181 | ʃ t i t o n o ʃ a
182 | s t o t i n a
183 | t i ɲ a t i
184 | t o k s i k o l o ɡ i j a
185 | t o l i k
186 | t r a n s a t l a n t s k i
187 | u n u t a r
188 | u z ɡ r e d i t s e
189 | ʋ a k t s i n a
190 | ʋ a ʃ m a ʃ i n a
191 | ʋ e t x e
192 | ʋ l a d a ʋ i n a
193 | ʋ o k a t s i j a
194 | z a b u ʃ a n t s k i
195 | z a k l i ɲ a t i
196 | z a k o t ʃ i t i
197 | z n a t a n
198 | z o r a n
199 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = python3 -msphinx
7 | SPHINXPROJ = OpenNMT-py
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx
2 | sphinxcontrib.bibtex
3 | sphinxcontrib.mermaid
4 | sphinx-rtd-theme
5 | recommonmark
6 | sphinx-argparse
7 | sphinx_markdown_tables
8 |
--------------------------------------------------------------------------------
/docs/source/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributors
2 |
3 | OpenNMT-py is a community developed project and we love developer contributions.
4 |
5 | ## Guidelines
6 | Before sending a PR, please do this checklist first:
7 |
8 | - Please run `tools/pull_request_chk.sh` and fix any errors. When adding new functionality, also add tests to this script. Included checks:
9 | 1. flake8 check for coding style;
10 | 2. unittest;
11 | 3. continuous integration tests listed in `.travis.yml`.
12 | - When adding/modifying a class constructor, please name the arguments in the same style as its superclass in PyTorch.
13 | - If your change is based on a paper, please include a clear comment and reference in the code (more on that below).
14 |
15 | ### Docstrings
16 | Above all, try to follow the Google docstring format
17 | ([Napoleon example](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html),
18 | [Google styleguide](http://google.github.io/styleguide/pyguide.html)).
19 | This makes it easy to include your contributions in the Sphinx documentation. And, do feel free
20 | to autodoc your contributions in the API ``.rst`` files in the `docs/source` folder! If you do, check that
21 | your additions look right.
22 |
23 | ```bash
24 | cd docs
25 | # install some dependencies if necessary:
26 | # recommonmark, sphinx_rtd_theme, sphinxcontrib-bibtex
27 | make html
28 | firefox build/html/main.html # or your browser of choice
29 | ```
30 |
31 | Some particular advice:
32 | - Try to follow Python 3 [``typing`` module](https://docs.python.org/3/library/typing.html) conventions when documenting types.
33 | - Exception: use "or" instead of unions for more readability
34 | - For external types, use the full "import name". Common abbreviations (e.g. ``np``) are acceptable.
35 | For ``torch.Tensor`` types, the ``torch.`` is optional.
36 | - Please don't use ticks like `` (`str`) `` or rst directives like `` (:obj:`str`) ``. Napoleon handles types
37 | very well without additional help, so avoid the clutter.
38 | - [Google docstrings don't support multiple returns](https://stackoverflow.com/questions/29221551/can-sphinx-napoleon-document-function-returning-multiple-arguments).
39 | For multiple returns, the following works well with Sphinx and is still very readable.
40 | ```python
41 | def foo(a, b):
42 | """This is my docstring.
43 |
44 | Args:
45 | a (object): Something.
46 | b (class): Another thing.
47 |
48 | Returns:
49 | (object, class):
50 |
51 | * a: Something or rather with a long
52 | description that spills over.
53 | * b: And another thing.
54 | """
55 |
56 | return a, b
57 | ```
58 | - When citing a paper, avoid directly linking in the docstring! Add a Bibtex entry to `docs/source/refs.bib`.
59 | E.g., to cite "Attention Is All You Need", visit [arXiv](https://arxiv.org/abs/1706.03762), choose the
60 | [bibtex](https://dblp.uni-trier.de/rec/bibtex/journals/corr/VaswaniSPUJGKP17) link, search `docs/source/refs.bib`
61 | using `CTRL-F` for `DBLP:journals/corr/VaswaniSPUJGKP17`, and if you do not find it then copy-paste the
62 | citation into `refs.bib`. Then, in your docstring, use ``:cite:`DBLP:journals/corr/VaswaniSPUJGKP17` ``.
63 | - However, a link is better than nothing.
64 | - Please document tensor shapes. Prefer the format
65 | ``` ``(a, b, c)`` ```. This style is easy to read, allows using ``x`` for multiplication, and is common
66 | (PyTorch uses a few variations on the parentheses format, AllenNLP uses exactly this format, Fairseq uses
67 | the parentheses format with single ticks).
68 | - Again, a different style is better than no shape documentation.
69 | - Please avoid unnecessary space characters, try to capitalize, and try to punctuate.
70 |
71 | For multi-line docstrings, add a blank line after the closing ``"""``.
72 | Don't use a blank line before the closing quotes.
73 |
74 | ``""" not this """`` ``"""This."""``
75 |
76 | ```python
77 | """
78 | Not this.
79 | """
80 | ```
81 | ```python
82 | """This."""
83 | ```
84 |
85 | This note is the least important. Focus on content first, but remember that consistent docs look good.
86 | - Be sensible about the first line. Generally, one stand-alone summary line (per the Google guidelines) is good.
87 | Sometimes, it's better to cut directly to the args or an extended description. It's always acceptable to have a
88 | "trailing" citation.
--------------------------------------------------------------------------------
/docs/source/_static/theme_overrides.css:
--------------------------------------------------------------------------------
1 | /* override table width restrictions */
2 | @media screen and (min-width: 767px) {
3 |
4 | .wy-table-responsive table td {
5 | /* !important prevents the common CSS stylesheets from overriding
6 | this as on RTD they are loaded after this stylesheet */
7 | white-space: normal !important;
8 | }
9 |
10 | .wy-table-responsive {
11 | overflow: visible !important;
12 | }
13 | }
--------------------------------------------------------------------------------
/docs/source/examples.rst:
--------------------------------------------------------------------------------
1 | == Examples ==
2 |
3 |
4 | .. include:: quickstart.md
5 | .. include:: extended.md
6 |
--------------------------------------------------------------------------------
/docs/source/extended.md:
--------------------------------------------------------------------------------
1 |
2 | # Translation
3 |
4 | The example below uses the Moses tokenizer (http://www.statmt.org/moses/) to prepare the data and the Moses BLEU script for evaluation. This example is for training for the WMT'16 Multimodal Translation task (http://www.statmt.org/wmt16/multimodal-task.html).
5 |
6 | Step 0. Download the data.
7 |
8 | ```bash
9 | mkdir -p data/multi30k
10 | wget http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/training.tar.gz && tar -xf training.tar.gz -C data/multi30k && rm training.tar.gz
11 | wget http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/validation.tar.gz && tar -xf validation.tar.gz -C data/multi30k && rm validation.tar.gz
12 | wget http://www.quest.dcs.shef.ac.uk/wmt17_files_mmt/mmt_task1_test2016.tar.gz && tar -xf mmt_task1_test2016.tar.gz -C data/multi30k && rm mmt_task1_test2016.tar.gz
13 | ```
14 |
15 | Step 1. Preprocess the data.
16 |
17 | ```bash
18 | for l in en de; do for f in data/multi30k/*.$l; do if [[ "$f" != *"test"* ]]; then sed -i "$ d" $f; fi; done; done
19 | for l in en de; do for f in data/multi30k/*.$l; do perl tools/tokenizer.perl -a -no-escape -l $l -q < $f > $f.atok; done; done
20 | onmt_preprocess -train_src data/multi30k/train.en.atok -train_tgt data/multi30k/train.de.atok -valid_src data/multi30k/val.en.atok -valid_tgt data/multi30k/val.de.atok -save_data data/multi30k.atok.low -lower
21 | ```
22 |
23 | Step 2. Train the model.
24 |
25 | ```bash
26 | onmt_train -data data/multi30k.atok.low -save_model multi30k_model -gpu_ranks 0
27 | ```
28 |
29 | Step 3. Translate sentences.
30 |
31 | ```bash
32 | onmt_translate -gpu 0 -model multi30k_model_*_e13.pt -src data/multi30k/test2016.en.atok -tgt data/multi30k/test2016.de.atok -replace_unk -verbose -output multi30k.test.pred.atok
33 | ```
34 |
35 | And evaluate
36 |
37 | ```bash
38 | perl tools/multi-bleu.perl data/multi30k/test2016.de.atok < multi30k.test.pred.atok
39 | ```
40 |
--------------------------------------------------------------------------------
/docs/source/im2text.md:
--------------------------------------------------------------------------------
1 | # Image to Text
2 |
3 | A deep learning-based approach to learning the image-to-text conversion, built on top of the OpenNMT system. It is completely data-driven, hence can be used for a variety of image-to-text problems, such as image captioning, optical character recognition and LaTeX decompilation.
4 |
5 | Take LaTeX decompilation as an example, given a formula image:
6 |
7 |

8 |
9 | The goal is to infer the LaTeX source that can be compiled to such an image:
10 |
11 | ```
12 | d s _ { 1 1 } ^ { 2 } = d x ^ { + } d x ^ { - } + l _ { p } ^ { 9 } \frac { p _ { - } } { r ^ { 7 } } \delta ( x ^ { - } ) d x ^ { - } d x ^ { - } + d x _ { 1 } ^ { 2 } + \; \cdots \; + d x _ { 9 } ^ { 2 }
13 | ```
14 |
15 | The paper [[What You Get Is What You See: A Visual Markup Decompiler]](https://arxiv.org/pdf/1609.04938.pdf) provides more technical details of this model.
16 |
17 | ### Dependencies
18 |
19 | * `torchvision`: `conda install torchvision`
20 | * `Pillow`: `pip install Pillow`
21 |
22 | ### Quick Start
23 |
24 | To get started, we provide a toy Math-to-LaTeX example. We assume that the working directory is `OpenNMT-py` throughout this document.
25 |
26 | Im2Text consists of four commands:
27 |
28 | 0) Download the data.
29 |
30 | ```bash
31 | wget -O data/im2text.tgz http://lstm.seas.harvard.edu/latex/im2text_small.tgz; tar zxf data/im2text.tgz -C data/
32 | ```
33 |
34 | 1) Preprocess the data.
35 |
36 | ```bash
37 | onmt_preprocess -data_type img \
38 | -src_dir data/im2text/images/ \
39 | -train_src data/im2text/src-train.txt \
40 | -train_tgt data/im2text/tgt-train.txt -valid_src data/im2text/src-val.txt \
41 | -valid_tgt data/im2text/tgt-val.txt -save_data data/im2text/demo \
42 | -tgt_seq_length 150 \
43 | -tgt_words_min_frequency 2 \
44 | -shard_size 500 \
45 | -image_channel_size 1
46 | ```
47 |
48 | 2) Train the model.
49 |
50 | ```bash
51 | onmt_train -model_type img \
52 | -data data/im2text/demo \
53 | -save_model demo-model \
54 | -gpu_ranks 0 \
55 | -batch_size 20 \
56 | -max_grad_norm 20 \
57 | -learning_rate 0.1 \
58 | -word_vec_size 80 \
59 | -encoder_type brnn \
60 | -image_channel_size 1
61 | ```
62 |
63 | 3) Translate the images.
64 |
65 | ```bash
66 | onmt_translate -data_type img \
67 | -model demo-model_acc_x_ppl_x_e13.pt \
68 | -src_dir data/im2text/images \
69 | -src data/im2text/src-test.txt \
70 | -output pred.txt \
71 | -max_length 150 \
72 | -beam_size 5 \
73 | -gpu 0 \
74 | -verbose
75 | ```
76 |
77 | The above dataset is sampled from the [im2latex-100k-dataset](http://lstm.seas.harvard.edu/latex/im2text.tgz). We provide a trained model [[link]](http://lstm.seas.harvard.edu/latex/py-model.pt) on this dataset.
78 |
79 | ### Options
80 |
81 | * `-src_dir`: The directory containing the images.
82 |
83 | * `-train_tgt`: The file storing the tokenized labels, one label per line. It shall look like:
84 | ```
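85 | <label0_token0> <label0_token1> ... <label0_tokenN0>
86 | <label1_token0> <label1_token1> ... <label1_tokenN1>
87 | <label2_token0> <label2_token1> ... <label2_tokenN2>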
88 | ...
89 | ```
90 |
91 | * `-train_src`: The file storing the paths of the images (relative to `src_dir`).
92 | ```
93 | <image0_path>
94 | <image1_path>
95 | <image2_path>
96 | ...
97 | ```
98 |
--------------------------------------------------------------------------------
/docs/source/index.md:
--------------------------------------------------------------------------------
1 |
2 | .. toctree::
3 | :maxdepth: 2
4 |
5 | index.md
6 | quickstart.md
7 | extended.md
8 |
9 |
10 | This portal provides detailed documentation of the OpenNMT toolkit. It describes how to use the PyTorch project and how it works.
11 |
12 |
13 |
14 | ## Installation
15 |
16 | 1\. [Install PyTorch](http://pytorch.org/)
17 |
18 | 2\. Clone the OpenNMT-py repository:
19 |
20 | ```bash
21 | git clone https://github.com/OpenNMT/OpenNMT-py
22 | cd OpenNMT-py
23 | ```
24 |
25 | 3\. Install required libraries
26 |
27 | ```bash
28 | pip install -r requirements.txt
29 | ```
30 |
31 | And you are ready to go! Take a look at the [quickstart](quickstart.md) to familiarize yourself with the main training workflow.
32 |
33 | Alternatively you can use Docker to install with `nvidia-docker`. The main Dockerfile is included
34 | in the root directory.
35 |
36 | ## Citation
37 |
38 | When using OpenNMT for research please cite our
39 | [OpenNMT technical report](https://doi.org/10.18653/v1/P17-4012)
40 |
41 | ```
42 | @inproceedings{opennmt,
43 | author = {Guillaume Klein and
44 | Yoon Kim and
45 | Yuntian Deng and
46 | Jean Senellart and
47 | Alexander M. Rush},
48 | title = {OpenNMT: Open-Source Toolkit for Neural Machine Translation},
49 | booktitle = {Proc. ACL},
50 | year = {2017},
51 | url = {https://doi.org/10.18653/v1/P17-4012},
52 | doi = {10.18653/v1/P17-4012}
53 | }
54 | ```
55 |
56 | ## Additional resources
57 |
58 | You can find additional help or tutorials in the following resources:
59 |
60 | * [Gitter channel](https://gitter.im/OpenNMT/openmt-py)
61 |
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | Contents
2 | --------
3 |
4 | .. toctree::
5 | :caption: Getting Started
6 | :maxdepth: 2
7 |
8 | main.md
9 | quickstart.md
10 | FAQ.md
11 | CONTRIBUTING.md
12 | ref.rst
13 |
14 |
15 | .. toctree::
16 | :caption: Examples
17 | :maxdepth: 2
18 |
19 | Library.md
20 | extended.md
21 | Summarization.md
22 | im2text.md
23 | speech2text.md
24 | vid2text.rst
25 |
26 |
27 | .. toctree::
28 | :caption: Scripts
29 | :maxdepth: 2
30 |
31 | options/preprocess.rst
32 | options/train.rst
33 | options/translate.rst
34 | options/server.rst
35 |
36 |
37 | .. toctree::
38 | :caption: API
39 | :maxdepth: 2
40 |
41 | onmt.rst
42 | onmt.modules.rst
43 | onmt.translation.rst
44 | onmt.translate.translation_server.rst
45 | onmt.inputters.rst
--------------------------------------------------------------------------------
/docs/source/main.md:
--------------------------------------------------------------------------------
1 | # Overview
2 |
3 |
4 | This portal provides a detailed documentation of the OpenNMT toolkit. It describes how to use the PyTorch project and how it works.
5 |
6 |
7 |
8 | ## Installation
9 | Install from `pip`:
10 | Install `OpenNMT-py` from `pip`:
11 | ```bash
12 | pip install OpenNMT-py
13 | ```
14 |
15 | or from the sources:
16 | ```bash
17 | git clone https://github.com/OpenNMT/OpenNMT-py.git
18 | cd OpenNMT-py
19 | python setup.py install
20 | ```
21 |
22 | *(Optional)* Some advanced features (e.g. working with audio, image or pretrained models) require extra packages; you can install them with:
23 | ```bash
24 | pip install -r requirements.opt.txt
25 | ```
26 |
27 | And you are ready to go! Take a look at the [quickstart](quickstart) to familiarize yourself with the main training workflow.
28 |
29 | Alternatively you can use Docker to install with `nvidia-docker`. The main Dockerfile is included
30 | in the root directory.
31 |
32 | ## Citation
33 |
34 | When using OpenNMT for research please cite our
35 | [OpenNMT technical report](https://doi.org/10.18653/v1/P17-4012)
36 |
37 | ```
38 | @inproceedings{opennmt,
39 | author = {Guillaume Klein and
40 | Yoon Kim and
41 | Yuntian Deng and
42 | Jean Senellart and
43 | Alexander M. Rush},
44 | title = {OpenNMT: Open-Source Toolkit for Neural Machine Translation},
45 | booktitle = {Proc. ACL},
46 | year = {2017},
47 | url = {https://doi.org/10.18653/v1/P17-4012},
48 | doi = {10.18653/v1/P17-4012}
49 | }
50 | ```
51 |
52 | ## Additional resources
53 |
54 | You can find additional help or tutorials in the following resources:
55 |
56 | * [Gitter channel](https://gitter.im/OpenNMT/openmt-py)
57 |
58 | * [Forum](http://forum.opennmt.net/)
59 |
--------------------------------------------------------------------------------
/docs/source/modules.rst:
--------------------------------------------------------------------------------
1 | onmt
2 | ====
3 |
4 | .. toctree::
5 | :maxdepth: 4
6 |
7 | onmt
8 |
--------------------------------------------------------------------------------
/docs/source/onmt.inputters.rst:
--------------------------------------------------------------------------------
1 | Data Loaders
2 | =================
3 |
4 | Data Readers
5 | -------------
6 |
7 | .. autoexception:: onmt.inputters.datareader_base.MissingDependencyException
8 |
9 | .. autoclass:: onmt.inputters.DataReaderBase
10 | :members:
11 |
12 | .. autoclass:: onmt.inputters.TextDataReader
13 | :members:
14 |
15 | .. autoclass:: onmt.inputters.ImageDataReader
16 | :members:
17 |
18 | .. autoclass:: onmt.inputters.AudioDataReader
19 | :members:
20 |
21 |
22 | Dataset
23 | --------
24 |
25 | .. autoclass:: onmt.inputters.Dataset
26 | :members:
27 |
--------------------------------------------------------------------------------
/docs/source/onmt.modules.rst:
--------------------------------------------------------------------------------
1 | Modules
2 | =============
3 |
4 | Core Modules
5 | ------------
6 |
7 | .. autoclass:: onmt.modules.Embeddings
8 | :members:
9 |
10 |
11 | Encoders
12 | ---------
13 |
14 | .. autoclass:: onmt.encoders.EncoderBase
15 | :members:
16 |
17 | .. autoclass:: onmt.encoders.MeanEncoder
18 | :members:
19 |
20 | .. autoclass:: onmt.encoders.RNNEncoder
21 | :members:
22 |
23 |
24 | Decoders
25 | ---------
26 |
27 |
28 | .. autoclass:: onmt.decoders.DecoderBase
29 | :members:
30 |
31 | .. autoclass:: onmt.decoders.decoder.RNNDecoderBase
32 | :members:
33 |
34 | .. autoclass:: onmt.decoders.StdRNNDecoder
35 | :members:
36 |
37 | .. autoclass:: onmt.decoders.InputFeedRNNDecoder
38 | :members:
39 |
40 | Attention
41 | ----------
42 |
43 | .. autoclass:: onmt.modules.AverageAttention
44 | :members:
45 |
46 | .. autoclass:: onmt.modules.GlobalAttention
47 | :members:
48 |
49 |
50 |
51 | Architecture: Transformer
52 | ----------------------------
53 |
54 | .. autoclass:: onmt.modules.PositionalEncoding
55 | :members:
56 |
57 | .. autoclass:: onmt.modules.position_ffn.PositionwiseFeedForward
58 | :members:
59 |
60 | .. autoclass:: onmt.encoders.TransformerEncoder
61 | :members:
62 |
63 | .. autoclass:: onmt.decoders.TransformerDecoder
64 | :members:
65 |
66 | .. autoclass:: onmt.modules.MultiHeadedAttention
67 | :members:
68 | :undoc-members:
69 |
70 |
71 | Architecture: Conv2Conv
72 | ----------------------------
73 |
74 | (These methods are from a user contribution
75 | and have not been thoroughly tested.)
76 |
77 |
78 | .. autoclass:: onmt.encoders.CNNEncoder
79 | :members:
80 |
81 |
82 | .. autoclass:: onmt.decoders.CNNDecoder
83 | :members:
84 |
85 | .. autoclass:: onmt.modules.ConvMultiStepAttention
86 | :members:
87 |
88 | .. autoclass:: onmt.modules.WeightNormConv2d
89 | :members:
90 |
91 | Architecture: SRU
92 | ----------------------------
93 |
94 | .. autoclass:: onmt.models.sru.SRU
95 | :members:
96 |
97 |
98 | Alternative Encoders
99 | --------------------
100 |
101 | onmt\.encoders\.AudioEncoder
102 |
103 | .. autoclass:: onmt.encoders.AudioEncoder
104 | :members:
105 |
106 |
107 | onmt\.encoders\.ImageEncoder
108 |
109 | .. autoclass:: onmt.encoders.ImageEncoder
110 | :members:
111 |
112 |
113 | Copy Attention
114 | --------------
115 |
116 | .. autoclass:: onmt.modules.CopyGenerator
117 | :members:
118 |
119 |
120 | Structured Attention
121 | -------------------------------------------
122 |
123 | .. autoclass:: onmt.modules.structured_attention.MatrixTree
124 | :members:
125 |
--------------------------------------------------------------------------------
/docs/source/onmt.rst:
--------------------------------------------------------------------------------
1 | Framework
2 | =================
3 |
4 | Model
5 | -----
6 |
7 | .. autoclass:: onmt.models.NMTModel
8 | :members:
9 |
10 | Trainer
11 | -------
12 |
13 | .. autoclass:: onmt.Trainer
14 | :members:
15 |
16 |
17 | .. autoclass:: onmt.utils.Statistics
18 | :members:
19 |
20 | Loss
21 | ----
22 |
23 |
24 | .. autoclass:: onmt.utils.loss.LossComputeBase
25 | :members:
26 |
27 |
28 | Optimizer
29 | ---------
30 |
31 | .. autoclass:: onmt.utils.Optimizer
32 | :members:
33 |
--------------------------------------------------------------------------------
/docs/source/onmt.translate.translation_server.rst:
--------------------------------------------------------------------------------
1 | Server
2 | ======
3 |
4 |
5 | Models
6 | -------------
7 |
8 | .. autoclass:: onmt.translate.translation_server.ServerModel
9 | :members:
10 |
11 |
12 | Core Server
13 | ------------
14 |
15 | .. autoexception:: onmt.translate.translation_server.ServerModelError
16 |
17 | .. autoclass:: onmt.translate.translation_server.Timer
18 | :members:
19 |
20 | .. autoclass:: onmt.translate.translation_server.TranslationServer
21 | :members:
22 |
--------------------------------------------------------------------------------
/docs/source/onmt.translation.rst:
--------------------------------------------------------------------------------
1 | Translation
2 | ==================
3 |
4 | Translations
5 | -------------
6 |
7 | .. autoclass:: onmt.translate.Translation
8 | :members:
9 |
10 | Translator Class
11 | -----------------
12 |
13 | .. autoclass:: onmt.translate.Translator
14 | :members:
15 |
16 | .. autoclass:: onmt.translate.TranslationBuilder
17 | :members:
18 |
19 |
20 | Decoding Strategies
21 | --------------------
22 | .. autoclass:: onmt.translate.DecodeStrategy
23 | :members:
24 |
25 | .. autoclass:: onmt.translate.BeamSearch
26 | :members:
27 |
28 | .. autofunction:: onmt.translate.random_sampling.sample_with_temperature
29 |
30 | .. autoclass:: onmt.translate.RandomSampling
31 | :members:
32 |
33 | Scoring
34 | --------
35 | .. autoclass:: onmt.translate.penalties.PenaltyBuilder
36 | :members:
37 |
38 | .. autoclass:: onmt.translate.GNMTGlobalScorer
39 | :members:
40 |
--------------------------------------------------------------------------------
/docs/source/options/preprocess.rst:
--------------------------------------------------------------------------------
1 | Preprocess
2 | ==========
3 |
4 | .. argparse::
5 | :filename: ../onmt/bin/preprocess.py
6 | :func: _get_parser
7 | :prog: preprocess.py
--------------------------------------------------------------------------------
/docs/source/options/server.rst:
--------------------------------------------------------------------------------
1 | Server
2 | =========
3 |
4 | .. argparse::
5 | :filename: ../onmt/bin/server.py
6 | :func: _get_parser
7 | :prog: server.py
--------------------------------------------------------------------------------
/docs/source/options/train.rst:
--------------------------------------------------------------------------------
1 | Train
2 | =====
3 |
4 | .. argparse::
5 | :filename: ../onmt/bin/train.py
6 | :func: _get_parser
7 | :prog: train.py
--------------------------------------------------------------------------------
/docs/source/options/translate.rst:
--------------------------------------------------------------------------------
1 | Translate
2 | =========
3 |
4 | .. argparse::
5 | :filename: ../onmt/bin/translate.py
6 | :func: _get_parser
7 | :prog: translate.py
--------------------------------------------------------------------------------
/docs/source/quickstart.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # Quickstart
4 |
5 |
6 | ### Step 1: Preprocess the data
7 |
8 | ```bash
9 | onmt_preprocess -train_src data/src-train.txt -train_tgt data/tgt-train.txt -valid_src data/src-val.txt -valid_tgt data/tgt-val.txt -save_data data/demo
10 | ```
11 |
12 | We will be working with some example data in `data/` folder.
13 |
14 | The data consists of parallel source (`src`) and target (`tgt`) data containing one sentence per line with tokens separated by a space:
15 |
16 | * `src-train.txt`
17 | * `tgt-train.txt`
18 | * `src-val.txt`
19 | * `tgt-val.txt`
20 |
21 | Validation files are required and are used to evaluate the convergence of the training. They usually contain no more than 5000 sentences.
22 |
23 | ```text
24 | $ head -n 3 data/src-train.txt
25 | It is not acceptable that , with the help of the national bureaucracies , Parliament 's legislative prerogative should be made null and void by means of implementing provisions whose content , purpose and extent are not laid down in advance .
26 | Federal Master Trainer and Senior Instructor of the Italian Federation of Aerobic Fitness , Group Fitness , Postural Gym , Stretching and Pilates; from 2004 , he has been collaborating with Antiche Terme as personal Trainer and Instructor of Stretching , Pilates and Postural Gym .
27 | " Two soldiers came up to me and told me that if I refuse to sleep with them , they will kill me . They beat me and ripped my clothes .
28 | ```
29 |
30 | ### Step 2: Train the model
31 |
32 | ```bash
33 | onmt_train -data data/demo -save_model demo-model
34 | ```
35 |
36 | The main train command is quite simple. Minimally it takes a data file
37 | and a save file. This will run the default model, which consists of a
38 | 2-layer LSTM with 500 hidden units on both the encoder/decoder.
39 | If you want to train on GPU, you need to select the devices. For example, set the environment variable
40 | `CUDA_VISIBLE_DEVICES=1,3` and pass
41 | `-world_size 2 -gpu_ranks 0 1` to use (say) GPUs 1 and 3 on this node only.
42 | To know more about distributed training on single or multi nodes, read the FAQ section.
43 |
44 | ### Step 3: Translate
45 |
46 | ```bash
47 | onmt_translate -model demo-model_XYZ.pt -src data/src-test.txt -output pred.txt -replace_unk -verbose
48 | ```
49 |
50 | Now you have a model which you can use to predict on new data. We do this by running beam search. This will output predictions into `pred.txt`.
51 |
52 | Note:
53 |
54 | The predictions are going to be quite terrible, as the demo dataset is small. Try running on some larger datasets! For example you can download millions of parallel sentences for [translation](http://www.statmt.org/wmt16/translation-task.html) or [summarization](https://github.com/harvardnlp/sent-summary).
55 |
--------------------------------------------------------------------------------
/docs/source/ref.rst:
--------------------------------------------------------------------------------
1 | ==========
2 | References
3 | ==========
4 |
5 |
6 |
7 | References
8 |
9 | .. bibliography:: refs.bib
10 |
11 |
--------------------------------------------------------------------------------
/docs/source/speech2text.md:
--------------------------------------------------------------------------------
1 | # Speech to Text
2 |
3 | A deep learning-based approach to learning the speech-to-text conversion, built on top of the OpenNMT system.
4 |
5 | Given raw audio, we first apply short-time Fourier transform (STFT), then apply Convolutional Neural Networks to get the source features. Based on this source representation, we use an LSTM decoder with attention to produce the text character by character.
6 |
7 | ### Dependencies
8 |
9 | * `torchaudio`: `sudo apt-get install -y sox libsox-dev libsox-fmt-all; pip install git+https://github.com/pytorch/audio`
10 | * `librosa`: `pip install librosa`
11 |
12 | ### Quick Start
13 |
14 | To get started, we provide a toy speech-to-text example. We assume that the working directory is `OpenNMT-py` throughout this document.
15 |
16 | 0) Download the data.
17 |
18 | ```
19 | wget -O data/speech.tgz http://lstm.seas.harvard.edu/latex/speech.tgz; tar zxf data/speech.tgz -C data/
20 | ```
21 |
22 |
23 | 1) Preprocess the data.
24 |
25 | ```
26 | onmt_preprocess -data_type audio -src_dir data/speech/an4_dataset -train_src data/speech/src-train.txt -train_tgt data/speech/tgt-train.txt -valid_src data/speech/src-val.txt -valid_tgt data/speech/tgt-val.txt -shard_size 300 -save_data data/speech/demo
27 | ```
28 |
29 | 2) Train the model.
30 |
31 | ```
32 | onmt_train -model_type audio -enc_rnn_size 512 -dec_rnn_size 512 -audio_enc_pooling 1,1,2,2 -dropout 0 -enc_layers 4 -dec_layers 1 -rnn_type LSTM -data data/speech/demo -save_model demo-model -global_attention mlp -gpu_ranks 0 -batch_size 8 -optim adam -max_grad_norm 100 -learning_rate 0.0003 -learning_rate_decay 0.8 -train_steps 100000
33 | ```
34 |
35 | 3) Translate the speech.
36 |
37 | ```
38 | onmt_translate -data_type audio -model demo-model_acc_x_ppl_x_e13.pt -src_dir data/speech/an4_dataset -src data/speech/src-val.txt -output pred.txt -gpu 0 -verbose
39 | ```
40 |
41 |
42 | ### Options
43 |
44 | * `-src_dir`: The directory containing the audio files.
45 |
46 | * `-train_tgt`: The file storing the tokenized labels, one label per line. It shall look like:
47 | ```
48 | ...
49 | ...
50 | ...
51 | ...
52 | ```
53 |
54 | * `-train_src`: The file storing the paths of the audio files (relative to `src_dir`).
55 | ```
56 |
57 |
58 |
59 | ...
60 | ```
61 |
62 | * `sample_rate`: Sample rate. Default: 16000.
63 | * `window_size`: Window size for spectrogram in seconds. Default: 0.02.
64 | * `window_stride`: Window stride for spectrogram in seconds. Default: 0.01.
65 | * `window`: Window type for spectrogram generation. Default: hamming.
66 |
67 | ### Acknowledgement
68 |
69 | Our preprocessing and CNN encoder are adapted from [deepspeech.pytorch](https://github.com/SeanNaren/deepspeech.pytorch).
70 |
--------------------------------------------------------------------------------
/floyd.yml:
--------------------------------------------------------------------------------
1 | env: pytorch-0.4
2 | machine: cpu
3 |
--------------------------------------------------------------------------------
/floyd_requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/pytorch/text
2 |
--------------------------------------------------------------------------------
/github_deploy_key_opennmt_opennmt_py.enc:
--------------------------------------------------------------------------------
1 | gAAAAABaPWC5LTHR5xMoviRbhWsMCxo0FPMTXwcm4DBbG2jYaTxuqdjT78PXu1XxcEfbRuZ-xX8723WjgJMaOVFRuB6k1Oow7Qw8YlO6CV5fyjU8jJFy0D4fSEE40P6A0GbvtMwj2uVKyhrCK341_8roVVegN96S40muebu0oi3cY0sDwLybAOBQYdf_J6gQgWIxf289hPMzmV4iy332V9gRN-cNbmpUYaVxINrxv0Ce6pw3NV99mNNK5izq-g4hlpErnF7LG60Jar7Vh7bw52C0PpEVJmUXWIJOtDGy6d_SuvR4SIj64J4IEDO78s7PyI8jAyP5Nu5emcH_eOV8z7C2nszkNbx6RwtDPh5qK0HILCgGmF4nzOTVK8mE9_8gD-tlWpS7jj7y_IJwNPJB3Gnqt383sg5NIQpgQqJMzmKtacXPF-sDvczsyf4t6GEURhPYobNociBQa3ZZBtJU0O_moUtwdSRsjkk1RdUbIgG3tcX73T_SYJqMLGtMKywmyDzv1CVqFCdCAhlVAcSnLLvP2xlJ1uJSKa46dtSDoUXleWCGMR9SoLz2UpPvtnJ1zZ8YKW7UD9iQfAsznBMSG4wKGEdZdFymvCuLnZQYWmJK9UFSyoYnrW1Jy1pmOJ8a25kfyI6_LiK52iC1zr9DZcn5MP2FGgrJnz0RfuvPtcgKFtvs731LzVycUT-u1I4WftPh_6b6fPxYuSnRPdJ39m7OnaGb5VOobleElaZMkh8niXM4K654i1dQA_ItuYeWjU3HPhwN86aOif6GeZSlq_Xjp3Z2DACSmYqyxKccVBWYBdZO8WSdSt07TEeWUboDDQTu_xCPEh-E8Z-Bb-xjTjVM99jkvZSrbqJn6TeY__nH2thfl9cVMj73o7wIp0EJgSpUuKEnJqPwenPwm-VEj_ODB8qbNYC3y4QkfHBL6nbdUt8Qx6P59i8C54st2v5OdZ31bF6bqbJxE5UElJRyuASmE92vu8QqQqPGjZqLhIE9Tl6EC4JFdwJMZI53gztzfKTYMAQLkbV0zYtSoBYavbBCTwQTlG49qDeZk6r5K4DPwZh9xM-M9j32Yr3NYE6QvS4sPaikPkAGoLqTAWVrfdLDc7IdIgAmZNt1D5E3Wm2n7wlQflrdLu6VgiGT1rZgsax_C1bTvsi7InkjuQuNphzXEn3_9FlWmnatDK0Nb0MFqGtEAd0S5SDGI2cf7drLVOtJzvNw9GUgdMoqn-hutvJNS1vpIZK2KektVoFMB-gBJj4oPp4gx8WDbvmkd88Jbitk3xuQp8JmoxPcVkZhPJYYMouMHnO982N9HiJ7AsvFmML_AEe72_qQCh5jcGpsbMq_U5Cu8S2L6MpaMmcn1Piup9ClCricSNEtJD-QS9EEyn-mCHnXnQ1_z6AQ-An5wwm2eNrsEN1F1DjqLcyO3ziE5pHKNXh5W1H3Ec1_ETpInJRBoZ7DEvPpI1KFyxSnwCCrONAIZwrZzMDHPsXgJbXZZfX8_bah36380_eecZOmeCVE1UsimA2MLE3K-ziv0YhXiyHkdzROSmXruXSmzr1NW1bn26Fwy3M3L3GDmHI4Wd62eiYlPAdiOOGO2rA1H37q47X-65BBdh9XXz0k_5YRPLtQeDUavLKzd9MIHc8Ef4g2PkHJTRp9jdkertDy1NkKg3rV-QZ12fCCce97ftcMJ4BSXLgEx_jvxISTo4mB8R0fAAWYJAYCd0vFc7Q4PRFHhyJsm_5BtrwEC5JFQF6sNQllkIRbixJ-kGaieAwRZ-JKzR7gzQ3MJVjArZKcZJV6N8YYRQvKcR8sEcgLv_lr_1hQNLjmGyFeZ1RYxagaddVLAxwp8W5_vofhnKCc5JpnVcAm4W-h_l7uZd42raso-7HeRYIacW9tuFhmUi7iZBHzsNz9G0XFdsdeD2FKJb2yt30Ze4VA1crIOWwVkHsfXid2tjV4wEkR1GGQXYJ2HSHeiH5W4_9vxyYlpum8swrEWY_vLywn
v92Bqerk2pfBi6kJqE1ZyZR-8NQuZMxQO_l8pTurirI-nCeHY5Im-jhs4MmA4-zwthY6RKQqbijYCbEd3HeHHMS0k8c84NlMiVAlEd7cAQZYSvlrAxNsaUWmBazE6HAGhXlB0X5pYDYV0LDalIU4guqpVLx-B4iwvnQ7nA3EzsXSBSJDsYbtVQaOHabG_jTL-SDKpkMEdb1Fh0UAeflB02fSenwj1DmsZysiJDD16IxKq22XjGslQZKNvZqk2XivzbL7JfVkCDU6N8XgyOpImZmh28Cq5iyN0GfgzYBvUscrspXQd7QJmiatoGLA-nkCZae4XRfeEh9l0qj_jiLnDzDXxF8pz9A-2GMTUUiUFwehSw2haTZJ4Ndqj3ekItvVJZxwVPYs_Voim3orgFUKmT1SUWXy5lKWPuqpWpbhBs0W5EJ2gt5EzV_ejsnnMqyDoxS-R03-ZATHRaFtvf96Zz0qo7xP__UONT1c5l8FX4Tf_kBF5JlTFe3FbSk9fa38QJGqH3RiF1mx91VXOwXR4fw-vGy5CuZoCND3QVzrdwmYE3jqxClBo7AnAjLTXD-lUCf7gqFqHFU-on1zypAZaXhwMVmfuKeolQhPsuybzUWTlRQW5OT2rxnwI-xO_6s78sRIyBwtbQba6lcOUnNH5PF9TbGj4Z2ErzA7eBS6ZBlnEE_fx8QrHoF32x2KLbyX6ELgEG4pt6aWfroWTWWC2T1CjUrswmMEfF5F0aA0uvr-vikxFl62Ob2yIuyF39ytmr8mb_o4JBpd3Etj4m_T-5HwmrsNnAf8bUqf0hTHuQlS9ek5jJK-_pNWWL1Q3yQ7x-4eiJkppero7UYyOKXGLRqgWchry26edqEETCybJMvgjmN2kHqcrg3XBM4ItjOPw0s4XklG7YZzEVmq8O3hgp-fVozpX_RAaaFSGmDzuZcQl2R_-Yo13KzjLj8wu3KjBCfVhJoAjc4T2VZMGVL3T4AOZOEN_GXEKjT5rbrEo1E7eQUoKE_PKKxmyDeNZN3W3hULAS_FMKAURyCLT_nfQ-cKU7pg113AyV6juAS_DFnBPZkcwM-PJBKz69QsrN_D3s3M53rART78zbUAab-La7Q803g-eaSgxpGJgCZKqHHafE4OpMnhKJl1eXaO_YekbtNR-JNXxdMS5wMEA_BOpqu_ixwuw_vJx-tZxKJ1p_o75OVFK9YH9ZFT5_--ngM8G-kHZrV6u5XKc5Jymrq9m6nZaH__HdAMvQmRfMWbOsSXl3HrlyEoPK5nyBcKtlHLwANc_1WeMJp3HjpHi5HelTnqNDxi5I5Z0RWP1mU0f8mUMkTvGb5U1wW0pL0Aq_5vSfn5LQhH0QAt2JcHrFasMe_7dABIzMLb8_ph0yQQ57IAIfXUYleOwyD1ZpAFgysnh9V9duxPmg3yswRlJ9MZK9tYkwWcj_nOjq2407qR42aThqWYL4702HVycoQgErx6K4XSkF5mmJdfsZ515IIpqHJt-7Q5n_gzIPQa4Wq5ANgS5-2y97uN61NkoE9eIiLHZMY6OvuORvSdMeL6_84MuLBsKS_3OgXrOQFOgdK5mCn9Iv53UZiMkR0rLGHOLnb2hnTZGq4ao3yiNsauBqf0O4r6ecarYxGty4yWZBxB8aHLFcK-FAlFuoEL8PlRLChOEUqvUoaFs3jzyQY_iRZRyCMszPi0xPrvdiILk4VDaa0NR0XtCC-kA3tdcb_Xbdfv_Djw-wVLf7Dx6iBlPNwtjE4OzweqBaAkNkk5Ij35vk-6QQryHhAgiAHdXDGZoegdHZdKUeC_GSCMud0wpXloEPxDREskWu1VN310OXaa6VvpG0VB1B2CrUlFNvwzmal3PYCrb7XPAT1Lu5C4oSH3bTr6Hk9wtIEv0sAgt4B9RPhZ0Kq-lP85raW748Pkc0PDK1C4g4SzAxl_x7JTSTYUk_fjMnc7yEN0iBRJCMfmUq-ILtj2zOI7f3dazGCp9dXBOTVTYMVNRpcka7vW
jlGHMMuVvid3Oz6GgBZl_I3csNzGXTZEvJurp3qXSaXL_THxHmDBDn7T_uY58uPaTC-qjdvkKNDUzg2kRtzejmO7TPEGIRAQghEkVK-ruZU5llxjMg1NOTeXfhXZlRK2Ri8F9QPs6FSFuiqLgOzgbl_rlecf3E6iJ9fgTsdE8OGgekAwmF5hi7Tp5DsGNlKXpWvc4TftLO7len-b9Tqa7XYPU5NKv1hVIIobSRjYuFuW1yDSWtXY0zzzqPsdhtrv97JoM71QL8fZ3tUDBDWhvlBmpXSSfjf4qYQ0PmP7pQWLjb_DuVBDO5EDV0xblgz_stLcNvxRIYChm0ytxN8B2jCaH1n_CLEWTvFloWBP72ovnRWcd1gqbZ4bD4KrI_Tb7VcepWqUg1CO-yTRHR4zQUSBBfM=
--------------------------------------------------------------------------------
/onmt/__init__.py:
--------------------------------------------------------------------------------
1 | """ Main entry point of the ONMT library """
2 | from __future__ import division, print_function
3 |
4 | import onmt.inputters
5 | import onmt.encoders
6 | import onmt.decoders
7 | import onmt.models
8 | import onmt.utils
9 | import onmt.modules
10 | from onmt.trainer import Trainer
11 | import sys
12 | import onmt.utils.optimizers
13 | onmt.utils.optimizers.Optim = onmt.utils.optimizers.Optimizer
14 | sys.modules["onmt.Optim"] = onmt.utils.optimizers
15 |
16 | # For Flake
# Entries of ``__all__`` must be strings: listing the module objects
# themselves makes ``from onmt import *`` fail with a TypeError. The
# submodules are bound as package attributes by the imports above.
__all__ = ["inputters", "encoders", "decoders", "models",
           "utils", "modules", "Trainer"]
19 |
20 | __version__ = "1.0.0.rc2"
21 |
--------------------------------------------------------------------------------
/onmt/bin/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemon234071/AdaLabel/43ebaac16807d70117bc12bd8e4596fad4191e20/onmt/bin/__init__.py
--------------------------------------------------------------------------------
/onmt/bin/server.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import configargparse
3 |
4 | from flask import Flask, jsonify, request
5 | from onmt.translate import TranslationServer, ServerModelError
6 |
7 | STATUS_OK = "ok"
8 | STATUS_ERROR = "error"
9 |
10 |
def start(config_file,
          url_root="/translator",
          host="0.0.0.0",
          port=5000,
          debug=True):
    """Create the Flask application wrapping a TranslationServer and run it.

    Args:
        config_file (str): path handed to ``TranslationServer.start``.
        url_root (str): prefix prepended to every route. It must begin
            with ``/`` because URL rules need a leading slash; the default
            matches the ``--url_root`` default of the command line parser.
        host (str): interface the HTTP server binds to.
        port (int): port the HTTP server listens on.
        debug (bool): forwarded to ``Flask.run``.
    """
    def prefix_route(route_function, prefix='', mask='{0}{1}'):
        """Wrap ``route_function`` so each route is prefixed by ``prefix``."""
        def newroute(route, *args, **kwargs):
            return route_function(mask.format(prefix, route), *args, **kwargs)
        return newroute

    app = Flask(__name__)
    app.route = prefix_route(app.route, url_root)
    translation_server = TranslationServer()
    translation_server.start(config_file)

    @app.route('/models', methods=['GET'])
    def get_models():
        out = translation_server.list_models()
        return jsonify(out)

    @app.route('/health', methods=['GET'])
    def health():
        out = {}
        out['status'] = STATUS_OK
        return jsonify(out)

    # The handlers below take ``model_id`` as an argument, so the rule has
    # to declare the matching URL variable; without it Flask calls the view
    # with no arguments and the request fails.
    @app.route('/clone_model/<int:model_id>', methods=['POST'])
    def clone_model(model_id):
        out = {}
        data = request.get_json(force=True)
        timeout = -1
        if 'timeout' in data:
            timeout = data['timeout']
            del data['timeout']

        opt = data.get('opt', None)
        try:
            model_id, load_time = translation_server.clone_model(
                model_id, opt, timeout)
        except ServerModelError as e:
            out['status'] = STATUS_ERROR
            out['error'] = str(e)
        else:
            out['status'] = STATUS_OK
            out['model_id'] = model_id
            out['load_time'] = load_time

        return jsonify(out)

    @app.route('/unload_model/<int:model_id>', methods=['GET'])
    def unload_model(model_id):
        out = {"model_id": model_id}

        try:
            translation_server.unload_model(model_id)
            out['status'] = STATUS_OK
        except Exception as e:
            # Best effort: report any failure to the client as JSON.
            out['status'] = STATUS_ERROR
            out['error'] = str(e)

        return jsonify(out)

    @app.route('/translate', methods=['POST'])
    def translate():
        inputs = request.get_json(force=True)
        out = {}
        try:
            translation, scores, n_best, times = translation_server.run(inputs)
            assert len(translation) == len(inputs)
            assert len(scores) == len(inputs)

            # On success the payload is a list wrapping one list of
            # per-input result dicts.
            out = [[{"src": inputs[i]['src'], "tgt": translation[i],
                     "n_best": n_best,
                     "pred_score": scores[i]}
                    for i in range(len(translation))]]
        except ServerModelError as e:
            out['error'] = str(e)
            out['status'] = STATUS_ERROR

        return jsonify(out)

    @app.route('/to_cpu/<int:model_id>', methods=['GET'])
    def to_cpu(model_id):
        out = {'model_id': model_id}
        translation_server.models[model_id].to_cpu()

        out['status'] = STATUS_OK
        return jsonify(out)

    @app.route('/to_gpu/<int:model_id>', methods=['GET'])
    def to_gpu(model_id):
        out = {'model_id': model_id}
        translation_server.models[model_id].to_gpu()

        out['status'] = STATUS_OK
        return jsonify(out)

    app.run(debug=debug, host=host, port=port, use_reloader=False,
            threaded=True)
110 |
111 |
def _get_parser():
    """Build the command line / YAML config parser of the REST server.

    Returns:
        configargparse.ArgumentParser: parser exposing ``--ip``, ``--port``,
        ``--url_root``, ``--debug`` and ``--config``.
    """
    parser = configargparse.ArgumentParser(
        config_file_parser_class=configargparse.YAMLConfigFileParser,
        description="OpenNMT-py REST Server")
    parser.add_argument("--ip", type=str, default="0.0.0.0")
    # The default is given as an int so that it agrees with ``type=int``.
    parser.add_argument("--port", type=int, default=5000)
    parser.add_argument("--url_root", type=str, default="/translator")
    parser.add_argument("--debug", "-d", action="store_true")
    parser.add_argument("--config", "-c", type=str,
                        default="./available_models/conf.json")
    return parser
123 |
124 |
def main():
    """Parse the server options and launch the REST server with them."""
    args = _get_parser().parse_args()
    start(
        args.config,
        url_root=args.url_root,
        host=args.ip,
        port=args.port,
        debug=args.debug,
    )
130 |
131 |
132 | if __name__ == "__main__":
133 | main()
134 |
--------------------------------------------------------------------------------
/onmt/bin/translate.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | from __future__ import unicode_literals
5 | from itertools import repeat
6 |
7 | from onmt.utils.logging import init_logger
8 | from onmt.utils.misc import split_corpus
9 | from onmt.translate.translator import build_translator
10 |
11 | import onmt.opts as opts
12 | from onmt.utils.parse import ArgumentParser
13 |
14 |
def translate(opt):
    """Translate ``opt.src`` shard by shard with the model built from ``opt``.

    Args:
        opt: parsed translation options; validated first through
            ``ArgumentParser.validate_translate_opts``.
    """
    ArgumentParser.validate_translate_opts(opt)
    logger = init_logger(opt.log_file)

    translator = build_translator(opt, report_score=True)
    src_shards = split_corpus(opt.src, opt.shard_size)
    if opt.tgt is None:
        # No reference side: pair every source shard with ``None``.
        tgt_shards = repeat(None)
    else:
        tgt_shards = split_corpus(opt.tgt, opt.shard_size)

    for shard_idx, shard_pair in enumerate(zip(src_shards, tgt_shards)):
        src_shard, tgt_shard = shard_pair
        logger.info("Translating shard %d." % shard_idx)
        translator.translate(
            src=src_shard,
            tgt=tgt_shard,
            src_dir=opt.src_dir,
            batch_size=opt.batch_size,
            batch_type=opt.batch_type,
            attn_debug=opt.attn_debug
        )
35 |
36 |
def _get_parser():
    """Return the parser carrying the config and translate option groups."""
    parser = ArgumentParser(description='translate.py')
    # Registration order is kept: config options first, then translate ones.
    for register_opts in (opts.config_opts, opts.translate_opts):
        register_opts(parser)
    return parser
43 |
44 |
def main():
    """Command line entry point: parse the options and run translation."""
    opt = _get_parser().parse_args()
    translate(opt)
50 |
51 |
52 | if __name__ == "__main__":
53 | main()
54 |
--------------------------------------------------------------------------------
/onmt/decoders/__init__.py:
--------------------------------------------------------------------------------
1 | """Module defining decoders."""
2 | from onmt.decoders.decoder import DecoderBase, InputFeedRNNDecoder, \
3 | StdRNNDecoder
4 | from onmt.decoders.transformer import TransformerDecoder, BiTransformerDecoder
5 | from onmt.decoders.cnn_decoder import CNNDecoder
6 |
7 |
# Registry mapping a decoder type name to its class.
str2dec = {"rnn": StdRNNDecoder, "ifrnn": InputFeedRNNDecoder,
           "cnn": CNNDecoder, "transformer": TransformerDecoder,
           "bidecoder": BiTransformerDecoder}

__all__ = ["DecoderBase", "TransformerDecoder", "BiTransformerDecoder",
           "StdRNNDecoder", "CNNDecoder", "InputFeedRNNDecoder", "str2dec"]
13 |
--------------------------------------------------------------------------------
/onmt/encoders/__init__.py:
--------------------------------------------------------------------------------
1 | """Module defining encoders."""
2 | from onmt.encoders.encoder import EncoderBase
3 | from onmt.encoders.transformer import TransformerEncoder
4 | from onmt.encoders.rnn_encoder import RNNEncoder
5 | from onmt.encoders.cnn_encoder import CNNEncoder
6 | from onmt.encoders.mean_encoder import MeanEncoder
7 | from onmt.encoders.audio_encoder import AudioEncoder
8 | from onmt.encoders.image_encoder import ImageEncoder
9 |
10 |
# Registry mapping an encoder type name to its class.
str2enc = {"rnn": RNNEncoder, "brnn": RNNEncoder, "cnn": CNNEncoder,
           "transformer": TransformerEncoder, "img": ImageEncoder,
           "audio": AudioEncoder, "mean": MeanEncoder}

__all__ = ["EncoderBase", "TransformerEncoder", "RNNEncoder", "CNNEncoder",
           "MeanEncoder", "AudioEncoder", "ImageEncoder", "str2enc"]
17 |
--------------------------------------------------------------------------------
/onmt/encoders/cnn_encoder.py:
--------------------------------------------------------------------------------
1 | """
2 | Implementation of "Convolutional Sequence to Sequence Learning"
3 | """
4 | import torch.nn as nn
5 |
6 | from onmt.encoders.encoder import EncoderBase
7 | from onmt.utils.cnn_factory import shape_transform, StackedCNN
8 |
9 | SCALE_WEIGHT = 0.5 ** 0.5
10 |
11 |
class CNNEncoder(EncoderBase):
    """Encoder based on "Convolutional Sequence to Sequence Learning"
    :cite:`DBLP:journals/corr/GehringAGYD17`.

    Args:
        num_layers (int): number of layers, forwarded to ``StackedCNN``
        hidden_size (int): output size of the input projection and size
            handed to ``StackedCNN``
        cnn_kernel_width (int): kernel width, forwarded to ``StackedCNN``
        dropout (float): dropout value, forwarded to ``StackedCNN``
        embeddings (onmt.modules.Embeddings): embedding module to use
    """

    def __init__(self, num_layers, hidden_size,
                 cnn_kernel_width, dropout, embeddings):
        super(CNNEncoder, self).__init__()

        self.embeddings = embeddings
        input_size = embeddings.embedding_size
        # Projects the embeddings to the hidden size used by the CNN stack.
        self.linear = nn.Linear(input_size, hidden_size)
        self.cnn = StackedCNN(num_layers, hidden_size,
                              cnn_kernel_width, dropout)

    @classmethod
    def from_opt(cls, opt, embeddings):
        """Alternate constructor.

        ``opt.dropout`` may be a single value or a list; when it is a list
        only its first entry is used.
        """
        dropout = opt.dropout
        if isinstance(dropout, list):
            dropout = dropout[0]
        return cls(
            opt.enc_layers,
            opt.enc_rnn_size,
            opt.cnn_kernel_width,
            dropout,
            embeddings)

    def forward(self, input, lengths=None, hidden=None):
        """See :func:`EncoderBase.forward()`

        Returns:
            tuple: the projected embeddings, the output of the CNN stack
            (both with dimension 3 squeezed and the first two dimensions
            swapped back) and ``lengths`` unchanged.
        """
        self._check_args(input, lengths, hidden)

        emb = self.embeddings(input)
        # s_len, batch, emb_dim = emb.size()

        emb = emb.transpose(0, 1).contiguous()
        # Flatten the first two dimensions so the linear layer sees 2D input.
        emb_reshape = emb.view(emb.size(0) * emb.size(1), -1)
        emb_remap = self.linear(emb_reshape)
        emb_remap = emb_remap.view(emb.size(0), emb.size(1), -1)
        emb_remap = shape_transform(emb_remap)
        out = self.cnn(emb_remap)

        return emb_remap.squeeze(3).transpose(0, 1).contiguous(), \
            out.squeeze(3).transpose(0, 1).contiguous(), lengths

    def update_dropout(self, dropout):
        """Set the dropout probability of the CNN stack's dropout module."""
        self.cnn.dropout.p = dropout
56 |
--------------------------------------------------------------------------------
/onmt/encoders/encoder.py:
--------------------------------------------------------------------------------
1 | """Base class for encoders and generic multi encoders."""
2 |
3 | import torch.nn as nn
4 |
5 | from onmt.utils.misc import aeq
6 |
7 |
class EncoderBase(nn.Module):
    """
    Abstract encoder. Defines the interface shared by the concrete encoder
    types and consumed by the model classes under ``onmt.models``.

    .. mermaid::

       graph BT
          A[Input]
          subgraph RNN
            C[Pos 1]
            D[Pos 2]
            E[Pos N]
          end
          F[Memory_Bank]
          G[Final]
          A-->C
          A-->D
          A-->E
          C-->F
          D-->F
          E-->F
          E-->G
    """

    @classmethod
    def from_opt(cls, opt, embeddings=None):
        """Alternate constructor; subclasses have to provide it."""
        raise NotImplementedError

    def _check_args(self, src, lengths=None, hidden=None):
        """Check that ``lengths`` has one entry per batch element of ``src``.

        The batch size is read from dimension 1 of ``src``. ``hidden`` is
        accepted but not inspected.
        """
        batch_size = src.size(1)
        if lengths is None:
            return
        (lengths_size,) = lengths.size()
        aeq(batch_size, lengths_size)

    def forward(self, src, lengths=None):
        """
        Args:
            src (LongTensor):
               padded sequences of sparse indices ``(src_len, batch, nfeat)``
            lengths (LongTensor): length of each sequence ``(batch,)``


        Returns:
            (FloatTensor, FloatTensor):

            * final encoder state, used to initialize decoder
            * memory bank for attention, ``(src_len, batch, hidden)``

            The RNN, CNN and mean encoders in this package additionally
            return ``lengths`` as a third element.
        """

        raise NotImplementedError
59 |
--------------------------------------------------------------------------------
/onmt/encoders/mean_encoder.py:
--------------------------------------------------------------------------------
1 | """Define a minimal encoder."""
2 | from onmt.encoders.encoder import EncoderBase
3 | from onmt.utils.misc import sequence_mask
4 | import torch
5 |
6 |
class MeanEncoder(EncoderBase):
    """Minimal encoder without recurrence: mean-pools the embeddings.

    Args:
       num_layers (int): number of replicated layers
       embeddings (onmt.modules.Embeddings): embedding module to use
    """

    def __init__(self, num_layers, embeddings):
        super(MeanEncoder, self).__init__()
        self.embeddings = embeddings
        self.num_layers = num_layers

    @classmethod
    def from_opt(cls, opt, embeddings):
        """Alternate constructor."""
        return cls(opt.enc_layers, embeddings)

    def forward(self, src, lengths=None):
        """See :func:`EncoderBase.forward()`"""
        self._check_args(src, lengths)

        memory_bank = self.embeddings(src)
        _, batch, emb_dim = memory_bank.size()

        if lengths is None:
            pooled = memory_bank.mean(0)
        else:
            # Weight every position by 1/length through the sequence mask
            # so that padding does not contribute to the average.
            weights = sequence_mask(lengths).float()
            weights = weights / lengths.unsqueeze(1).float()
            by_batch = memory_bank.transpose(0, 1)
            pooled = torch.bmm(weights.unsqueeze(1), by_batch).squeeze(1)

        # The same pooled vector is repeated for every layer and used for
        # both elements of the final state.
        pooled = pooled.expand(self.num_layers, batch, emb_dim)
        return (pooled, pooled), memory_bank, lengths
46 |
--------------------------------------------------------------------------------
/onmt/encoders/rnn_encoder.py:
--------------------------------------------------------------------------------
1 | """Define RNN-based encoders."""
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | from torch.nn.utils.rnn import pack_padded_sequence as pack
6 | from torch.nn.utils.rnn import pad_packed_sequence as unpack
7 |
8 | from onmt.encoders.encoder import EncoderBase
9 | from onmt.utils.rnn_factory import rnn_factory
10 |
11 |
class RNNEncoder(EncoderBase):
    """ A generic recurrent neural network encoder.

    Args:
       rnn_type (str):
          style of recurrent unit to use, one of [RNN, LSTM, GRU, SRU]
       bidirectional (bool) : use a bidirectional RNN
       num_layers (int) : number of stacked layers
       hidden_size (int) : hidden size of each layer; it must be divisible
          by the number of directions and is split evenly between them
       dropout (float) : dropout value handed to the RNN factory
       embeddings (onmt.modules.Embeddings): embedding module to use
       use_bridge (bool) : pass the final state through an additional
          linear + ReLU layer before returning it
    """

    def __init__(self, rnn_type, bidirectional, num_layers,
                 hidden_size, dropout=0.0, embeddings=None,
                 use_bridge=False):
        super(RNNEncoder, self).__init__()
        assert embeddings is not None

        num_directions = 2 if bidirectional else 1
        assert hidden_size % num_directions == 0
        hidden_size = hidden_size // num_directions
        self.embeddings = embeddings

        self.rnn, self.no_pack_padded_seq = \
            rnn_factory(rnn_type,
                        input_size=embeddings.embedding_size,
                        hidden_size=hidden_size,
                        num_layers=num_layers,
                        dropout=dropout,
                        bidirectional=bidirectional)

        # Initialize the bridge layer
        self.use_bridge = use_bridge
        if self.use_bridge:
            self._initialize_bridge(rnn_type,
                                    hidden_size,
                                    num_layers)

    @classmethod
    def from_opt(cls, opt, embeddings):
        """Alternate constructor.

        ``opt.dropout`` may be a single value or a list; when it is a list
        only its first entry is used.
        """
        dropout = opt.dropout
        if isinstance(dropout, list):
            dropout = dropout[0]
        return cls(
            opt.rnn_type,
            opt.brnn,
            opt.enc_layers,
            opt.enc_rnn_size,
            dropout,
            embeddings,
            opt.bridge)

    def forward(self, src, lengths=None):
        """See :func:`EncoderBase.forward()`"""
        self._check_args(src, lengths)

        emb = self.embeddings(src)
        # s_len, batch, emb_dim = emb.size()

        # Pack only when lengths are known and the RNN supports it.
        use_packing = lengths is not None and not self.no_pack_padded_seq

        packed_emb = emb
        if use_packing:
            # Lengths data is wrapped inside a Tensor.
            lengths_list = lengths.view(-1).tolist()
            packed_emb = pack(emb, lengths_list)

        memory_bank, encoder_final = self.rnn(packed_emb)

        if use_packing:
            memory_bank = unpack(memory_bank)[0]

        if self.use_bridge:
            encoder_final = self._bridge(encoder_final)
        return encoder_final, memory_bank, lengths

    def _initialize_bridge(self, rnn_type,
                           hidden_size,
                           num_layers):
        """Create one square linear layer per state kind of the RNN."""

        # LSTM has hidden and cell state, other only one
        number_of_states = 2 if rnn_type == "LSTM" else 1
        # Total number of states
        self.total_hidden_dim = hidden_size * num_layers

        # Build a linear layer for each
        self.bridge = nn.ModuleList([nn.Linear(self.total_hidden_dim,
                                               self.total_hidden_dim,
                                               bias=True)
                                     for _ in range(number_of_states)])

    def _bridge(self, hidden):
        """Forward hidden state through bridge."""
        def bottle_hidden(linear, states):
            """
            Transform from 3D to 2D, apply linear and return initial size
            """
            size = states.size()
            result = linear(states.view(-1, self.total_hidden_dim))
            return F.relu(result).view(size)

        if isinstance(hidden, tuple):  # LSTM
            outs = tuple([bottle_hidden(layer, hidden[ix])
                          for ix, layer in enumerate(self.bridge)])
        else:
            outs = bottle_hidden(self.bridge[0], hidden)
        return outs

    def update_dropout(self, dropout):
        """Overwrite the ``dropout`` attribute of the wrapped RNN."""
        self.rnn.dropout = dropout
119 |
--------------------------------------------------------------------------------
/onmt/inputters/__init__.py:
--------------------------------------------------------------------------------
1 | """Module defining inputters.
2 |
3 | Inputters implement the logic of transforming raw data to vectorized inputs,
4 | e.g., from a line of text to a sequence of embeddings.
5 | """
6 | from onmt.inputters.inputter import \
7 | load_old_vocab, get_fields, OrderedIterator, \
8 | build_vocab, old_style_vocab, filter_example
9 | from onmt.inputters.dataset_base import Dataset
10 | from onmt.inputters.text_dataset import text_sort_key, TextDataReader
11 | from onmt.inputters.image_dataset import img_sort_key, ImageDataReader
12 | from onmt.inputters.audio_dataset import audio_sort_key, AudioDataReader
13 | from onmt.inputters.vec_dataset import vec_sort_key, VecDataReader
14 | from onmt.inputters.datareader_base import DataReaderBase
15 |
16 |
# Registries mapping a data type name to its reader class / sort key.
str2reader = {
    "text": TextDataReader, "img": ImageDataReader, "audio": AudioDataReader,
    "vec": VecDataReader}
str2sortkey = {
    'text': text_sort_key, 'img': img_sort_key, 'audio': audio_sort_key,
    'vec': vec_sort_key}


__all__ = ['Dataset', 'load_old_vocab', 'get_fields', 'DataReaderBase',
           'filter_example', 'old_style_vocab',
           'build_vocab', 'OrderedIterator',
           'text_sort_key', 'img_sort_key', 'audio_sort_key', 'vec_sort_key',
           'TextDataReader', 'ImageDataReader', 'AudioDataReader',
           'VecDataReader', 'str2reader', 'str2sortkey']
31 |
--------------------------------------------------------------------------------
/onmt/inputters/datareader_base.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 |
4 | # several data readers need optional dependencies. There's no
5 | # appropriate builtin exception
class MissingDependencyException(Exception):
    """Raised when an optional package needed by a data reader is missing."""
8 |
9 |
class DataReaderBase(object):
    """Base class of the readers that load raw data and yield dicts.

    Raises:
        onmt.inputters.datareader_base.MissingDependencyException: A number
            of DataReaders need specific additional packages.
            If any are missing, this will be raised.
    """

    @classmethod
    def from_opt(cls, opt):
        """Alternative constructor.

        Args:
            opt (argparse.Namespace): The parsed arguments. This base
                implementation does not read them.
        """
        reader = cls()
        return reader

    @classmethod
    def _read_file(cls, path):
        """Yield the lines of the file at ``path`` as bytes, one at a time."""
        with open(path, "rb") as handle:
            for raw_line in handle:
                yield raw_line

    @staticmethod
    def _raise_missing_dep(*missing_deps):
        """Raise missing dep exception with standard error message."""
        dep_names = ", ".join(missing_deps)
        message = ("Could not create reader. Be sure to install "
                   "the following dependencies: " + dep_names)
        raise MissingDependencyException(message)

    def read(self, data, side, src_dir):
        """Read data from file system and yield as dicts."""
        raise NotImplementedError()
46 |
--------------------------------------------------------------------------------
/onmt/inputters/image_dataset.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os
4 |
5 | import torch
6 | from torchtext.data import Field
7 |
8 | from onmt.inputters.datareader_base import DataReaderBase
9 |
10 | # domain specific dependencies
11 | try:
12 | from PIL import Image
13 | from torchvision import transforms
14 | import cv2
15 | except ImportError:
16 | Image, transforms, cv2 = None, None, None
17 |
18 |
class ImageDataReader(DataReaderBase):
    """Read image data from disk.

    Args:
        truncate (tuple[int] or NoneType): maximum img size. Use
            ``(0,0)`` or ``None`` for unlimited.
        channel_size (int): Number of channels per image.

    Raises:
        onmt.inputters.datareader_base.MissingDependencyException: If
            importing any of ``PIL``, ``torchvision``, or ``cv2`` fail.
    """

    def __init__(self, truncate=None, channel_size=3):
        self._check_deps()
        self.truncate = truncate
        self.channel_size = channel_size

    @classmethod
    def from_opt(cls, opt):
        """Alternative constructor reading ``opt.image_channel_size``."""
        return cls(channel_size=opt.image_channel_size)

    @classmethod
    def _check_deps(cls):
        """Raise if any of the optional imaging packages failed to import."""
        if Image is None or transforms is None or cv2 is None:
            cls._raise_missing_dep(
                "PIL", "torchvision", "cv2")

    def read(self, images, side, img_dir=None):
        """Read data into dicts.

        Args:
            images (str or Iterable[str or bytes]): Sequence of image paths
                or path to file containing image paths.
                In either case, the filenames may be relative to ``img_dir``
                (default behavior) or absolute.
            side (str): Prefix used in return dict. Usually
                ``"src"`` or ``"tgt"``.
            img_dir (str or NoneType): Location of source image files. See
                ``images``. When ``None`` the filenames are used as given.

        Yields:
            a dictionary containing image data, path and index for each line.
            Images larger than ``truncate`` are skipped; the index still
            counts them.
        """
        if isinstance(images, str):
            images = DataReaderBase._read_file(images)

        for i, filename in enumerate(images):
            # Lines read from a file are bytes; paths passed in directly
            # may already be text and must not be decoded again.
            if isinstance(filename, bytes):
                filename = filename.decode("utf-8")
            filename = filename.strip()

            # ``os.path.join`` rejects ``None``, so only prepend the
            # directory when one was supplied.
            if img_dir is None:
                img_path = filename
            else:
                img_path = os.path.join(img_dir, filename)
            if not os.path.exists(img_path):
                # Fall back to the filename itself (e.g. absolute paths).
                img_path = filename

            assert os.path.exists(img_path), \
                'img path %s not found' % filename

            if self.channel_size == 1:
                # Flag 0 asks cv2 for a single-channel (grayscale) image.
                img = transforms.ToTensor()(
                    Image.fromarray(cv2.imread(img_path, 0)))
            else:
                img = transforms.ToTensor()(Image.open(img_path))
            if self.truncate and self.truncate != (0, 0):
                if not (img.size(1) <= self.truncate[0]
                        and img.size(2) <= self.truncate[1]):
                    continue
            yield {side: img, side + '_path': filename, 'indices': i}
84 |
85 |
def img_sort_key(ex):
    """Return the sort key ``(src.size(2), src.size(1))`` for an example.

    The original describes this pair as (width, height).
    """
    image = ex.src
    width = image.size(2)
    height = image.size(1)
    return width, height
89 |
90 |
def batch_img(data, vocab):
    """Pad a sequence of images to a common size and stack them.

    Args:
        data (Sequence[Tensor]): images indexed as
            ``(channel, dim1, dim2)``; the channel count of the first
            image is used for the whole batch.
        vocab: unused; present to match the postprocessing signature.

    Returns:
        Tensor of shape ``(len(data), channel, max dim1, max dim2)``.
        Every image sits in the top-left corner and the padding is 1.
    """
    channels = data[0].size(0)
    max_h = max(t.size(1) for t in data)
    max_w = max(t.size(2) for t in data)
    batch = torch.ones(len(data), channels, max_h, max_w)
    for idx, img in enumerate(data):
        batch[idx, :, :img.size(1), :img.size(2)] = img
    return batch
100 |
101 |
def image_fields(**kwargs):
    """Create the field used for image inputs.

    The field is non-sequential, uses no vocabulary, holds float data and
    batches examples with ``batch_img``. ``kwargs`` is accepted but unused.
    """
    field_options = dict(
        use_vocab=False,
        dtype=torch.float,
        postprocessing=batch_img,
        sequential=False,
    )
    return Field(**field_options)
107 |
--------------------------------------------------------------------------------
/onmt/models/__init__.py:
--------------------------------------------------------------------------------
1 | """Module defining models."""
2 | from onmt.models.model_saver import build_model_saver, ModelSaver
3 | from onmt.models.model import NMTModel
4 |
5 | __all__ = ["build_model_saver", "ModelSaver", "NMTModel"]
6 |
--------------------------------------------------------------------------------
/onmt/models/model.py:
--------------------------------------------------------------------------------
1 | """ Onmt NMT Model base class definition """
2 | import torch.nn as nn
3 |
4 |
class NMTModel(nn.Module):
    """
    Core trainable object in OpenNMT. Implements a trainable interface
    for a simple, generic encoder + decoder model.

    Args:
        encoder (onmt.encoders.EncoderBase): an encoder object
        decoder (onmt.decoders.DecoderBase): a decoder object
        bidecoder (onmt.decoders.DecoderBase or NoneType): optional second
            decoder run on the same encoder outputs; ``None`` disables it.
    """

    def __init__(self, encoder, decoder, bidecoder=None):
        super(NMTModel, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.bidecoder = bidecoder

    def forward(self, src, tgt, lengths, bptt=False):
        """Forward propagate a `src` and `tgt` pair for training.
        Possible initialized with a beginning decoder state.

        Args:
            src (Tensor): A source sequence passed to encoder.
                typically for inputs this will be a padded `LongTensor`
                of size ``(len, batch, features)``. However, may be an
                image or other generic input depending on encoder.
            tgt (LongTensor): A target sequence of size ``(tgt_len, batch)``.
            lengths(LongTensor): The src lengths, pre-padding ``(batch,)``.
            bptt (Boolean): A flag indicating if truncated bptt is set.
                The decoder state is (re)initialised from the encoder
                only when this is ``False``.

        Returns:
            (FloatTensor, dict[str, FloatTensor], FloatTensor or None,
            dict or None):

            * decoder output ``(tgt_len, batch, hidden)``
            * dictionary attention dists of ``(tgt_len, batch, src_len)``
            * output of ``bidecoder``, or ``None`` when it is not set
            * attention dict of ``bidecoder``, or ``None`` when not set
        """
        tgt = tgt[:-1]  # exclude last target from inputs

        enc_state, memory_bank, lengths = self.encoder(src, lengths)

        if bptt is False:
            self.decoder.init_state(src, memory_bank, enc_state)
        dec_out, attns = self.decoder(tgt, memory_bank,
                                      memory_lengths=lengths)
        # The optional second decoder sees exactly the same inputs.
        bidec_out, bidec_attns = None, None
        if self.bidecoder is not None:
            if bptt is False:
                self.bidecoder.init_state(src, memory_bank, enc_state)
            bidec_out, bidec_attns = self.bidecoder(
                tgt, memory_bank, memory_lengths=lengths)

        return dec_out, attns, bidec_out, bidec_attns

    def update_dropout(self, dropout):
        """Propagate a new dropout value to every sub-network that exists."""
        self.encoder.update_dropout(dropout)
        self.decoder.update_dropout(dropout)
        # ``bidecoder`` defaults to None; calling through it would raise.
        if self.bidecoder is not None:
            self.bidecoder.update_dropout(dropout)
63 |
--------------------------------------------------------------------------------
/onmt/models/model_saver.py:
--------------------------------------------------------------------------------
1 | import os
2 | import torch
3 |
4 | from collections import deque
5 | from onmt.utils.logging import logger
6 |
7 | from copy import deepcopy
8 |
9 |
def build_model_saver(model_opt, opt, model, fields, optim):
    """Create the ``ModelSaver`` for a run.

    The checkpoint path prefix comes from ``opt.save_model`` and the
    retention setting from ``opt.keep_checkpoint``.
    """
    return ModelSaver(
        opt.save_model, model, model_opt, fields, optim,
        opt.keep_checkpoint)
18 |
19 |
class ModelSaverBase(object):
    """Base class for model saving operations

    Inherited classes must implement private methods:
    * `_save`
    * `_rm_checkpoint`
    """

    def __init__(self, base_path, model, model_opt, fields, optim,
                 keep_checkpoint=-1):
        self.base_path = base_path
        self.model = model
        self.model_opt = model_opt
        self.fields = fields
        self.optim = optim
        self.last_saved_step = None
        self.keep_checkpoint = keep_checkpoint
        # The rotation queue only exists when a positive limit is set;
        # 0 disables saving and a negative value keeps every checkpoint.
        if keep_checkpoint > 0:
            self.checkpoint_queue = deque([], maxlen=keep_checkpoint)

    def save(self, step, moving_average=None):
        """Main entry point for model saver

        It wraps the `_save` method with checks and apply `keep_checkpoint`
        related logic

        Args:
            step (int): step number; a repeated call for the step that was
                saved last is a no-op.
            moving_average (iterable or NoneType): when truthy, its values
                are copied, in order, into the parameters of a deep copy
                of the model and that copy is saved instead.
        """

        if self.keep_checkpoint == 0 or step == self.last_saved_step:
            return

        if moving_average:
            save_model = deepcopy(self.model)
            for avg, param in zip(moving_average, save_model.parameters()):
                param.data.copy_(avg.data)
        else:
            save_model = self.model

        chkpt, chkpt_name = self._save(step, save_model)
        self.last_saved_step = step

        if moving_average:
            del save_model

        if self.keep_checkpoint > 0:
            # Drop the oldest checkpoint before the queue would overflow.
            if len(self.checkpoint_queue) == self.checkpoint_queue.maxlen:
                todel = self.checkpoint_queue.popleft()
                self._rm_checkpoint(todel)
            self.checkpoint_queue.append(chkpt_name)

    def _save(self, step, model=None):
        """Save a resumable checkpoint.

        Args:
            step (int): step number
            model (nn.Module): the model to serialize, as passed by
                ``save``. Optional only so that older single-argument
                calls still reach ``NotImplementedError``.

        Returns:
            (object, str):

            * checkpoint: the saved object
            * checkpoint_name: name (or path) of the saved checkpoint
        """

        raise NotImplementedError()

    def _rm_checkpoint(self, name):
        """Remove a checkpoint

        Args:
            name(str): name that identifies the checkpoint
                (it may be a filepath)
        """

        raise NotImplementedError()
93 |
94 |
class ModelSaver(ModelSaverBase):
    """Simple model saver to filesystem"""

    def _save(self, step, model):
        """Serialize ``model`` and the training state to disk.

        Args:
            step (int): step number, embedded in the file name.
            model (nn.Module): model to save; must expose ``generator``
                and may expose ``bidecoder_generator``.

        Returns:
            (dict, str): the checkpoint dictionary and the path it was
            written to (``<base_path>_step_<step>.pt``).
        """
        # Generator weights are stored under their own key, so every entry
        # whose name contains 'generator' is left out of the model dict.
        model_state_dict = {k: v for k, v in model.state_dict().items()
                            if 'generator' not in k}
        generator_state_dict = model.generator.state_dict()

        # NOTE: We need to trim the vocab to remove any unk tokens that
        # were not originally here: every string mapped to index 0 other
        # than the token stored at ``itos[0]`` is dropped from ``stoi``.
        vocab = deepcopy(self.fields)
        for side in ["src", "tgt"]:
            if not hasattr(vocab[side], "fields"):
                continue
            side_vocab = vocab[side].fields[0][1].vocab
            unk_token = side_vocab.itos[0]
            keys_to_pop = [key for key, value in side_vocab.stoi.items()
                           if value == 0 and key != unk_token]
            for key in keys_to_pop:
                side_vocab.stoi.pop(key, None)

        checkpoint = {
            'model': model_state_dict,
            'generator': generator_state_dict,
            'vocab': vocab,
            'opt': self.model_opt,
            'optim': self.optim.state_dict(),
        }
        # Models without the attribute are treated like models where it is
        # None, instead of failing with AttributeError.
        bidecoder_generator = getattr(model, 'bidecoder_generator', None)
        if bidecoder_generator is not None:
            checkpoint['bidecoder_generator'] = \
                bidecoder_generator.state_dict()

        checkpoint_path = '%s_step_%d.pt' % (self.base_path, step)
        logger.info("Saving checkpoint %s" % checkpoint_path)
        torch.save(checkpoint, checkpoint_path)
        return checkpoint, checkpoint_path

    def _rm_checkpoint(self, name):
        """Delete the checkpoint file at path ``name``."""
        os.remove(name)
135 |
--------------------------------------------------------------------------------
/onmt/models/stacked_rnn.py:
--------------------------------------------------------------------------------
1 | """ Implementation of ONMT RNN for Input Feeding Decoding """
2 | import torch
3 | import torch.nn as nn
4 |
5 |
class StackedLSTM(nn.Module):
    """
    Stack of ``nn.LSTMCell`` layers stepped one time-step at a time.
    Needed for the decoder, because we do input feeding.
    """

    def __init__(self, num_layers, input_size, rnn_size, dropout):
        super(StackedLSTM, self).__init__()
        self.dropout = nn.Dropout(dropout)
        self.num_layers = num_layers

        # Only the first cell sees ``input_size``; deeper cells consume
        # the ``rnn_size`` output of the cell below.
        cells = []
        in_dim = input_size
        for _ in range(num_layers):
            cells.append(nn.LSTMCell(in_dim, rnn_size))
            in_dim = rnn_size
        self.layers = nn.ModuleList(cells)

    def forward(self, input_feed, hidden):
        """Run one step through every layer.

        Args:
            input_feed: input to the first cell.
            hidden: pair ``(h_0, c_0)`` indexable by layer.

        Returns:
            the top layer's hidden state and the pair of stacked
            per-layer ``(h_1, c_1)`` states.
        """
        prev_h, prev_c = hidden
        last = self.num_layers - 1
        next_h, next_c = [], []
        for depth, cell in enumerate(self.layers):
            h_i, c_i = cell(input_feed, (prev_h[depth], prev_c[depth]))
            next_h.append(h_i)
            next_c.append(c_i)
            # Dropout is applied between layers but not after the top one.
            input_feed = h_i if depth == last else self.dropout(h_i)

        return input_feed, (torch.stack(next_h), torch.stack(next_c))
37 |
38 |
class StackedGRU(nn.Module):
    """
    Stack of ``nn.GRUCell`` layers stepped one time-step at a time.
    Needed for the decoder, because we do input feeding.
    """

    def __init__(self, num_layers, input_size, rnn_size, dropout):
        super(StackedGRU, self).__init__()
        self.dropout = nn.Dropout(dropout)
        self.num_layers = num_layers

        # Only the first cell sees ``input_size``; deeper cells consume
        # the ``rnn_size`` output of the cell below.
        cells = []
        in_dim = input_size
        for _ in range(num_layers):
            cells.append(nn.GRUCell(in_dim, rnn_size))
            in_dim = rnn_size
        self.layers = nn.ModuleList(cells)

    def forward(self, input_feed, hidden):
        """Run one step through every layer.

        Args:
            input_feed: input to the first cell.
            hidden: 1-tuple whose only element is indexable by layer.

        Returns:
            the top layer's hidden state and a 1-tuple holding the
            stacked per-layer hidden states.
        """
        prev_h = hidden[0]
        last = self.num_layers - 1
        next_h = []
        for depth, cell in enumerate(self.layers):
            h_i = cell(input_feed, prev_h[depth])
            next_h.append(h_i)
            # Dropout is applied between layers but not after the top one.
            input_feed = h_i if depth == last else self.dropout(h_i)

        return input_feed, (torch.stack(next_h),)
66 |
--------------------------------------------------------------------------------
/onmt/modules/__init__.py:
--------------------------------------------------------------------------------
1 | """ Attention and normalization modules """
2 | from onmt.modules.util_class import Elementwise
3 | from onmt.modules.gate import context_gate_factory, ContextGate
4 | from onmt.modules.global_attention import GlobalAttention
5 | from onmt.modules.conv_multi_step_attention import ConvMultiStepAttention
6 | from onmt.modules.copy_generator import CopyGenerator, CopyGeneratorLoss, \
7 | CopyGeneratorLossCompute
8 | from onmt.modules.multi_headed_attn import MultiHeadedAttention
9 | from onmt.modules.embeddings import Embeddings, PositionalEncoding, \
10 | VecEmbedding
11 | from onmt.modules.weight_norm import WeightNormConv2d
12 | from onmt.modules.average_attn import AverageAttention
13 |
14 | __all__ = ["Elementwise", "context_gate_factory", "ContextGate",
15 | "GlobalAttention", "ConvMultiStepAttention", "CopyGenerator",
16 | "CopyGeneratorLoss", "CopyGeneratorLossCompute",
17 | "MultiHeadedAttention", "Embeddings", "PositionalEncoding",
18 | "WeightNormConv2d", "AverageAttention", "VecEmbedding"]
19 |
--------------------------------------------------------------------------------
/onmt/modules/average_attn.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """Average Attention module."""
3 |
4 | import torch
5 | import torch.nn as nn
6 |
7 | from onmt.modules.position_ffn import PositionwiseFeedForward
8 |
9 |
class AverageAttention(nn.Module):
    """
    Average Attention module from
    "Accelerating Neural Transformer via an Average Attention Network"
    :cite:`DBLP:journals/corr/abs-1805-00631`.

    Args:
        model_dim (int): the dimension of the inputs
        dropout (float): dropout parameter, used only by the optional
            feed-forward layer
        aan_useffn (bool): when true, a ``PositionwiseFeedForward`` layer
            is applied to the averaged outputs
    """

    def __init__(self, model_dim, dropout=0.1, aan_useffn=False):
        super(AverageAttention, self).__init__()
        self.model_dim = model_dim
        self.aan_useffn = aan_useffn
        if aan_useffn:
            self.average_layer = PositionwiseFeedForward(model_dim, model_dim,
                                                         dropout)
        self.gating_layer = nn.Linear(model_dim * 2, model_dim * 2)

    def cumulative_average_mask(self, batch_size, inputs_len, device):
        """
        Builds the mask to compute the cumulative average as described in
        :cite:`DBLP:journals/corr/abs-1805-00631` -- Figure 3

        Args:
            batch_size (int): batch size
            inputs_len (int): length of the inputs
            device: device on which the mask is created

        Returns:
            (FloatTensor):

            * A Tensor of shape ``(batch_size, input_len, input_len)``
              whose row ``i`` holds ``1 / (i + 1)`` in columns ``0..i``
              and zeros elsewhere.
        """

        lower = torch.tril(torch.ones(inputs_len, inputs_len,
                                      dtype=torch.float, device=device))
        counts = torch.arange(1, inputs_len + 1,
                              dtype=torch.float, device=device)
        row_weights = torch.ones(1, inputs_len,
                                 dtype=torch.float, device=device) / counts
        mask = lower * row_weights.transpose(0, 1)

        return mask.unsqueeze(0).expand(batch_size, inputs_len, inputs_len)

    def cumulative_average(self, inputs, mask_or_step,
                           layer_cache=None, step=None):
        """
        Computes the cumulative average as described in
        :cite:`DBLP:journals/corr/abs-1805-00631` -- Equations (1) (5) (6)

        Args:
            inputs (FloatTensor): sequence to average
                ``(batch_size, input_len, dimension)``
            mask_or_step: if cache is set, this is assumed
                to be the current step of the
                dynamic decoding. Otherwise, it is the mask matrix
                used to compute the cumulative average.
            layer_cache: a dictionary containing the cumulative average
                of the previous step under ``"prev_g"``; it is updated
                in place with the new average.
            step: ignored; the step is always taken from ``mask_or_step``.

        Returns:
            a tensor of the same shape and type as ``inputs``.
        """

        if layer_cache is None:
            return torch.matmul(mask_or_step.to(inputs.dtype), inputs)

        step = mask_or_step
        running = (inputs + step * layer_cache["prev_g"]) / (step + 1)
        layer_cache["prev_g"] = running
        return running

    def forward(self, inputs, mask=None, layer_cache=None, step=None):
        """
        Args:
            inputs (FloatTensor): ``(batch_size, input_len, model_dim)``
            mask: unused.
            layer_cache: optional cache dict for step-wise decoding.
            step: current decoding step, read only when ``layer_cache``
                is given.

        Returns:
            (FloatTensor, FloatTensor):

            * gating_outputs ``(batch_size, input_len, model_dim)``
            * average_outputs average attention
              ``(batch_size, input_len, model_dim)``
        """

        batch_size, inputs_len = inputs.size(0), inputs.size(1)
        if layer_cache is None:
            mask_or_step = self.cumulative_average_mask(
                batch_size, inputs_len, inputs.device)
        else:
            mask_or_step = step
        average_outputs = self.cumulative_average(
            inputs, mask_or_step, layer_cache=layer_cache)
        if self.aan_useffn:
            average_outputs = self.average_layer(average_outputs)

        gates = self.gating_layer(torch.cat((inputs, average_outputs), -1))
        input_gate, forget_gate = torch.chunk(gates, 2, dim=2)
        gating_outputs = torch.sigmoid(input_gate) * inputs + \
            torch.sigmoid(forget_gate) * average_outputs

        return gating_outputs, average_outputs
112 |
--------------------------------------------------------------------------------
/onmt/modules/conv_multi_step_attention.py:
--------------------------------------------------------------------------------
1 | """ Multi Step Attention for CNN """
2 | import torch
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 | from onmt.utils.misc import aeq
6 |
7 |
8 | SCALE_WEIGHT = 0.5 ** 0.5
9 |
10 |
def seq_linear(linear, x):
    """Apply ``linear`` over dim 1 of a 4-d tensor.

    ``x`` is unpacked as ``(batch, hidden_size, length, _)``. It is
    flattened to ``(batch * length, hidden_size)`` for the call and the
    result is reshaped back to ``(batch, hidden_size, length, 1)``.
    """
    batch, hidden_size, length, _ = x.size()
    flat = torch.transpose(x, 1, 2).contiguous().view(
        batch * length, hidden_size)
    projected = linear(flat).view(batch, length, hidden_size, 1)
    return torch.transpose(projected, 1, 2)
17 |
18 |
class ConvMultiStepAttention(nn.Module):
    """
    Conv attention takes a key matrix, a value matrix and a query vector.
    Attention weight is calculated by key matrix with the query vector
    and sum on the value matrix. And the same operation is applied
    in each decode conv layer.
    """

    def __init__(self, input_size):
        super(ConvMultiStepAttention, self).__init__()
        self.linear_in = nn.Linear(input_size, input_size)
        self.mask = None

    def apply_mask(self, mask):
        """Store the mask used by subsequent ``forward`` calls."""
        self.mask = mask

    def forward(self, base_target_emb, input_from_dec, encoder_out_top,
                encoder_out_combine):
        """
        Args:
            base_target_emb: target emb tensor
            input_from_dec: output of decode conv
            encoder_out_top: the key matrix for calculation of attention
                weight, which is the top output of encode conv
            encoder_out_combine:
                the value matrix for the attention-weighted sum,
                which is the combination of base emb and top output of encode

        Returns:
            the context tensor and the attention weights (softmax over
            dim 2 of the scores).
        """

        # Shape checks: decoder-side tensors are 4-d, encoder-side 3-d;
        # batch size and length must agree within each pair.
        batch, _, height, _ = base_target_emb.size()
        dec_batch, _, dec_height, _ = input_from_dec.size()
        aeq(batch, dec_batch)
        aeq(height, dec_height)

        enc_batch, _, enc_height = encoder_out_top.size()
        comb_batch, _, comb_height = encoder_out_combine.size()
        aeq(enc_batch, comb_batch)
        aeq(enc_height, comb_height)

        projected = seq_linear(self.linear_in, input_from_dec)
        query = (base_target_emb + projected) * SCALE_WEIGHT
        query = torch.transpose(torch.squeeze(query, 3), 1, 2)
        scores = torch.bmm(query, encoder_out_top)

        if self.mask is not None:
            # Masked positions receive zero weight after the softmax.
            scores.data.masked_fill_(self.mask, -float('inf'))

        attn = F.softmax(scores, dim=2)

        context = torch.bmm(
            attn, torch.transpose(encoder_out_combine, 1, 2))
        context_output = torch.transpose(
            torch.unsqueeze(context, 3), 1, 2)
        return context_output, attn
81 |
--------------------------------------------------------------------------------
/onmt/modules/gate.py:
--------------------------------------------------------------------------------
1 | """ ContextGate module """
2 | import torch
3 | import torch.nn as nn
4 |
5 |
def context_gate_factory(gate_type, embeddings_size, decoder_size,
                         attention_size, output_size):
    """Instantiate the context gate registered under ``gate_type``.

    ``gate_type`` must be one of ``'source'``, ``'target'`` or ``'both'``;
    the remaining arguments are forwarded to the gate's constructor.
    """

    gate_types = {
        'source': SourceContextGate,
        'target': TargetContextGate,
        'both': BothContextGate,
    }

    assert gate_type in gate_types, "Not valid ContextGate type: {0}".format(
        gate_type)
    gate_cls = gate_types[gate_type]
    return gate_cls(embeddings_size, decoder_size, attention_size,
                    output_size)
18 |
19 |
class ContextGate(nn.Module):
    """
    Context gate is a decoder module that takes as input the previous word
    embedding, the current decoder state and the attention state, and
    produces a gate.
    The gate can be used to select the input from the target side context
    (decoder state), from the source context (attention state) or both.
    """

    def __init__(self, embeddings_size, decoder_size,
                 attention_size, output_size):
        super(ContextGate, self).__init__()
        target_size = embeddings_size + decoder_size
        self.gate = nn.Linear(target_size + attention_size, output_size,
                              bias=True)
        self.sig = nn.Sigmoid()
        self.source_proj = nn.Linear(attention_size, output_size)
        self.target_proj = nn.Linear(target_size, output_size)

    def forward(self, prev_emb, dec_state, attn_state):
        """Return ``(gate, projected source, projected target)``.

        The three inputs are concatenated along dim 1 to compute the
        sigmoid gate. The source projection reads ``attn_state`` and the
        target projection reads ``prev_emb`` joined with ``dec_state``.
        """
        gate_input = torch.cat((prev_emb, dec_state, attn_state), dim=1)
        z = self.sig(self.gate(gate_input))
        proj_source = self.source_proj(attn_state)
        target_input = torch.cat((prev_emb, dec_state), dim=1)
        proj_target = self.target_proj(target_input)
        return z, proj_source, proj_target
46 |
47 |
class SourceContextGate(nn.Module):
    """Context gate that scales only the source projection."""

    def __init__(self, embeddings_size, decoder_size,
                 attention_size, output_size):
        super(SourceContextGate, self).__init__()
        self.context_gate = ContextGate(embeddings_size, decoder_size,
                                        attention_size, output_size)
        self.tanh = nn.Tanh()

    def forward(self, prev_emb, dec_state, attn_state):
        """Return ``tanh(target + z * source)``."""
        z, source, target = self.context_gate(
            prev_emb, dec_state, attn_state)
        gated_source = z * source
        return self.tanh(target + gated_source)
62 |
63 |
class TargetContextGate(nn.Module):
    """Context gate that scales only the target projection."""

    def __init__(self, embeddings_size, decoder_size,
                 attention_size, output_size):
        super(TargetContextGate, self).__init__()
        self.context_gate = ContextGate(embeddings_size, decoder_size,
                                        attention_size, output_size)
        self.tanh = nn.Tanh()

    def forward(self, prev_emb, dec_state, attn_state):
        """Return ``tanh(z * target + source)``."""
        z, source, target = self.context_gate(
            prev_emb, dec_state, attn_state)
        gated_target = z * target
        return self.tanh(gated_target + source)
77 |
78 |
class BothContextGate(nn.Module):
    """Context gate that interpolates between target and source."""

    def __init__(self, embeddings_size, decoder_size,
                 attention_size, output_size):
        super(BothContextGate, self).__init__()
        self.context_gate = ContextGate(embeddings_size, decoder_size,
                                        attention_size, output_size)
        self.tanh = nn.Tanh()

    def forward(self, prev_emb, dec_state, attn_state):
        """Return ``tanh((1 - z) * target + z * source)``."""
        z, source, target = self.context_gate(
            prev_emb, dec_state, attn_state)
        mixed = (1. - z) * target + z * source
        return self.tanh(mixed)
92 |
--------------------------------------------------------------------------------
/onmt/modules/position_ffn.py:
--------------------------------------------------------------------------------
1 | """Position feed-forward network from "Attention is All You Need"."""
2 |
3 | import torch.nn as nn
4 |
5 |
class PositionwiseFeedForward(nn.Module):
    """ A two-layer Feed-Forward-Network with residual layer norm.

    Args:
        d_model (int): the size of the input and of the output.
        d_ff (int): the size of the hidden layer between the two
            linear projections.
        dropout (float): dropout probability in :math:`[0, 1)`.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout_1 = nn.Dropout(dropout)
        self.relu = nn.ReLU()
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x):
        """Layer definition.

        Layer norm is applied to the input before the first projection,
        and the un-normalised input is added back as the residual.

        Args:
            x: ``(batch_size, input_len, model_dim)``

        Returns:
            (FloatTensor): Output ``(batch_size, input_len, model_dim)``.
        """

        normed = self.layer_norm(x)
        hidden = self.dropout_1(self.relu(self.w_1(normed)))
        projected = self.dropout_2(self.w_2(hidden))
        return projected + x

    def update_dropout(self, dropout):
        """Set the probability of both dropout layers to ``dropout``."""
        for layer in (self.dropout_1, self.dropout_2):
            layer.p = dropout
42 |
--------------------------------------------------------------------------------
/onmt/modules/sparse_activations.py:
--------------------------------------------------------------------------------
1 | """
2 | An implementation of sparsemax (Martins & Astudillo, 2016). See
3 | :cite:`DBLP:journals/corr/MartinsA16` for detailed description.
4 |
5 | By Ben Peters and Vlad Niculae
6 | """
7 |
8 | import torch
9 | from torch.autograd import Function
10 | import torch.nn as nn
11 |
12 |
13 | def _make_ix_like(input, dim=0):
14 | d = input.size(dim)
15 | rho = torch.arange(1, d + 1, device=input.device, dtype=input.dtype)
16 | view = [1] * input.dim()
17 | view[0] = -1
18 | return rho.view(view).transpose(0, dim)
19 |
20 |
def _threshold_and_support(input, dim=0):
    """Sparsemax building block: compute the threshold

    Args:
        input: any dimension
        dim: dimension along which to apply the sparsemax

    Returns:
        ``(tau, support_size)``: the threshold value and the number of
        entries in the support, both keeping ``dim`` with size 1.
    """

    sorted_vals, _ = torch.sort(input, descending=True, dim=dim)
    shifted_cumsum = sorted_vals.cumsum(dim) - 1
    ranks = _make_ix_like(input, dim)
    in_support = ranks * sorted_vals > shifted_cumsum

    support_size = in_support.sum(dim=dim).unsqueeze(dim)
    # Index of the last supported entry is ``support_size - 1``.
    tau = shifted_cumsum.gather(dim, support_size - 1)
    tau /= support_size.to(input.dtype)
    return tau, support_size
41 |
42 |
class SparsemaxFunction(Function):
    """Autograd function computing sparsemax along one dimension."""

    @staticmethod
    def forward(ctx, input, dim=0):
        """sparsemax: normalizing sparse transform (a la softmax)

        Parameters:
            input (Tensor): any shape; left unmodified
            dim: dimension along which to apply sparsemax

        Returns:
            output (Tensor): same shape as input
        """
        ctx.dim = dim
        max_val, _ = input.max(dim=dim, keepdim=True)
        # Same numerical stability trick as for softmax. Done out of
        # place: an in-place ``-=`` would overwrite the caller's tensor.
        input = input - max_val
        tau, supp_size = _threshold_and_support(input, dim=dim)
        output = torch.clamp(input - tau, min=0)
        ctx.save_for_backward(supp_size, output)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient w.r.t. ``input`` (and ``None`` for ``dim``)."""
        supp_size, output = ctx.saved_tensors
        dim = ctx.dim
        grad_input = grad_output.clone()
        grad_input[output == 0] = 0

        # Squeeze only ``dim``: a bare ``squeeze()`` would also drop other
        # size-1 axes and break broadcasting against the summed gradient.
        v_hat = grad_input.sum(dim=dim) / \
            supp_size.to(output.dtype).squeeze(dim)
        v_hat = v_hat.unsqueeze(dim)
        grad_input = torch.where(output != 0, grad_input - v_hat, grad_input)
        return grad_input, None


sparsemax = SparsemaxFunction.apply
78 |
79 |
class Sparsemax(nn.Module):
    """Module wrapper that applies ``sparsemax`` along ``dim``."""

    def __init__(self, dim=0):
        super(Sparsemax, self).__init__()
        self.dim = dim

    def forward(self, input):
        """Return ``sparsemax(input, self.dim)``."""
        result = sparsemax(input, self.dim)
        return result
88 |
89 |
class LogSparsemax(nn.Module):
    """Module wrapper returning the logarithm of ``sparsemax`` along ``dim``."""

    def __init__(self, dim=0):
        super(LogSparsemax, self).__init__()
        self.dim = dim

    def forward(self, input):
        """Return ``torch.log(sparsemax(input, self.dim))``."""
        probs = sparsemax(input, self.dim)
        return torch.log(probs)
98 |
--------------------------------------------------------------------------------
/onmt/modules/sparse_losses.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from torch.autograd import Function
4 | from onmt.modules.sparse_activations import _threshold_and_support
5 | from onmt.utils.misc import aeq
6 |
7 |
class SparsemaxLossFunction(Function):
    """Autograd function computing the per-example sparsemax loss."""

    @staticmethod
    def forward(ctx, input, target):
        """
        input (FloatTensor): ``(n, num_classes)``.
        target (LongTensor): ``(n,)``, the indices of the target classes

        Returns a ``(n,)`` tensor of non-negative losses.
        """
        input_batch, _ = input.size()
        target_batch = target.size(0)
        aeq(input_batch, target_batch)

        # Squeeze only the class axis so a batch of one stays ``(1,)``.
        z_k = input.gather(1, target.unsqueeze(1)).squeeze(1)
        tau_z, _ = _threshold_and_support(input, dim=1)
        support = input > tau_z
        x = torch.where(
            support, input**2 - tau_z**2,
            torch.tensor(0.0, device=input.device)
        ).sum(dim=1)
        ctx.save_for_backward(input, target, tau_z)
        # clamping necessary because of numerical errors: loss should be lower
        # bounded by zero, but negative values near zero are possible without
        # the clamp
        return torch.clamp(x / 2 - z_k + 0.5, min=0.0)

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient w.r.t. ``input`` (and ``None`` for ``target``).

        The local gradient ``sparsemax(input) - onehot(target)`` is scaled
        by the incoming ``grad_output`` of each example, so reductions and
        masking applied to the loss downstream reach ``input``.
        """
        input, target, tau_z = ctx.saved_tensors
        sparsemax_out = torch.clamp(input - tau_z, min=0)
        delta = torch.zeros_like(sparsemax_out)
        delta.scatter_(1, target.unsqueeze(1), 1)
        return grad_output.unsqueeze(1) * (sparsemax_out - delta), None


sparsemax_loss = SparsemaxLossFunction.apply
43 |
44 |
class SparsemaxLoss(nn.Module):
    """
    An implementation of sparsemax loss, first proposed in
    :cite:`DBLP:journals/corr/MartinsA16`. If using
    a sparse output layer, it is not possible to use negative log likelihood
    because the loss is infinite in the case the target is assigned zero
    probability. Inputs to SparsemaxLoss are arbitrary dense real-valued
    vectors (like in nn.CrossEntropyLoss), not probability vectors (like in
    nn.NLLLoss).

    Args:
        weight: stored on the module; not read by ``forward``.
        ignore_index (int): target value whose positions contribute zero
            loss; only honoured when non-negative.
        reduction (str): ``'elementwise_mean'``, ``'sum'`` or ``'none'``.
    """

    def __init__(self, weight=None, ignore_index=-100,
                 reduction='elementwise_mean'):
        allowed = ['elementwise_mean', 'sum', 'none']
        assert reduction in allowed
        # Plain attributes are assigned before Module.__init__ here.
        self.reduction = reduction
        self.weight = weight
        self.ignore_index = ignore_index
        super(SparsemaxLoss, self).__init__()

    def forward(self, input, target):
        """Compute the loss of ``input`` against ``target``.

        With ``'elementwise_mean'`` the summed loss is divided by the
        number of non-ignored targets.
        """
        loss = sparsemax_loss(input, target)
        size = float(target.size(0))
        if self.ignore_index >= 0:
            ignored_positions = target == self.ignore_index
            size = float((target.size(0) - ignored_positions.sum()).item())
            loss.masked_fill_(ignored_positions, 0.0)

        if self.reduction == 'sum':
            return loss.sum()
        if self.reduction == 'elementwise_mean':
            return loss.sum() / size
        return loss
77 |
--------------------------------------------------------------------------------
/onmt/modules/structured_attention.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | import torch
3 | import torch.cuda
4 |
5 |
class MatrixTree(nn.Module):
    """Implementation of the matrix-tree theorem for computing marginals
    of non-projective dependency parsing. This attention layer is used
    in the paper "Learning Structured Text Representations"
    :cite:`DBLP:journals/corr/LiuL17d`.

    Args:
        eps (float): constant added to the exponentiated scores.
    """

    def __init__(self, eps=1e-5):
        super(MatrixTree, self).__init__()
        self.eps = eps

    def forward(self, input):
        """Return a tensor shaped like ``input``, computed per batch item.

        ``input`` is indexed as ``(batch, n, n)``; it is not modified.
        """
        laplacian = input.exp() + self.eps
        output = input.clone()
        for b in range(input.size(0)):
            scores = input[b]
            diag_mask = torch.eye(input.size(1), device=input.device).ne(0)
            lap = laplacian[b].masked_fill(diag_mask, 0)
            lap = -lap + torch.diag(lap.sum(0))
            # store roots on diagonal
            lap[0] = scores.diag().exp()
            inv_laplacian = lap.inverse()
            inv_t = inv_laplacian.transpose(0, 1)

            factor = inv_laplacian.diag().unsqueeze(1)\
                .expand_as(scores).transpose(0, 1)
            term1 = scores.exp().mul(factor).clone()
            term2 = scores.exp().mul(inv_t).clone()
            term1[:, 0] = 0
            term2[0] = 0
            roots_output = scores.diag().exp().mul(inv_t[0])
            output[b] = term1 - term2 + torch.diag(roots_output)
        return output
39 |
--------------------------------------------------------------------------------
/onmt/modules/util_class.py:
--------------------------------------------------------------------------------
1 | """ Misc classes """
2 | import torch
3 | import torch.nn as nn
4 |
5 |
6 | # At the moment this class is only used by embeddings.Embeddings look-up tables
class Elementwise(nn.ModuleList):
    """
    Container that applies its i-th module to the i-th feature of the
    input.

    The input is a 3d Tensor whose last dimension has one entry per
    contained module. ``merge`` selects how the per-module outputs are
    combined: ``'first'`` returns only the first output, ``'concat'`` and
    ``'mlp'`` concatenate along dimension 2, ``'sum'`` adds them, and
    ``None`` returns the list of outputs as is.
    """

    def __init__(self, merge=None, *args):
        assert merge in [None, 'first', 'concat', 'sum', 'mlp']
        self.merge = merge
        super(Elementwise, self).__init__(*args)

    def forward(self, inputs):
        """Apply each module to its own feature slice and merge."""
        columns = [chunk.squeeze(2) for chunk in inputs.split(1, dim=2)]
        assert len(self) == len(columns)
        outputs = [module(column) for module, column in zip(self, columns)]

        merge = self.merge
        if merge == 'first':
            return outputs[0]
        if merge in ('concat', 'mlp'):
            return torch.cat(outputs, 2)
        if merge == 'sum':
            return sum(outputs)
        return outputs
35 |
36 |
class Cast(nn.Module):
    """
    Layer whose forward pass converts its input to a fixed data type via
    ``Tensor.to``; an input that already has that type comes back as the
    same tensor.
    """

    def __init__(self, dtype):
        super(Cast, self).__init__()
        self._dtype = dtype

    def forward(self, x):
        target_dtype = self._dtype
        return x.to(target_dtype)
49 |
--------------------------------------------------------------------------------
/onmt/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemon234071/AdaLabel/43ebaac16807d70117bc12bd8e4596fad4191e20/onmt/tests/__init__.py
--------------------------------------------------------------------------------
/onmt/tests/rebuild_test_models.sh:
--------------------------------------------------------------------------------
# # Retrain the models used for CI.
# # Should be done rarely, indicates a major breaking change.
#
# Each section below is switched on or off by editing its `if true` /
# `if false` guard. The paths are relative, so the script is presumably
# meant to be run from the repository root -- TODO confirm.
# NOTE(review): the `mv` globs expect checkpoint names ending in
# `e10.pt` / `e8.pt`; confirm they match what train.py writes.
my_python=python

############### TEST regular RNN choose either -rnn_type LSTM / GRU / SRU and set input_feed 0 for SRU
if true; then
    # Drop previously preprocessed shards, then preprocess and train.
    rm data/*.pt
    $my_python preprocess.py -train_src data/src-train.txt -train_tgt data/tgt-train.txt -valid_src data/src-val.txt -valid_tgt data/tgt-val.txt -save_data data/data -src_vocab_size 1000 -tgt_vocab_size 1000

    $my_python train.py -data data/data -save_model tmp -world_size 1 -gpu_ranks 0 -rnn_size 256 -word_vec_size 256 -layers 1 -train_steps 10000 -optim adam -learning_rate 0.001 -rnn_type LSTM -input_feed 0
    #-truncated_decoder 5
    #-label_smoothing 0.1

    # Keep the selected checkpoint as the CI model, discard the others.
    mv tmp*e10.pt onmt/tests/test_model.pt
    rm tmp*.pt
fi
#
#
############### TEST CNN
if false; then
    rm data/*.pt
    $my_python preprocess.py -train_src data/src-train.txt -train_tgt data/tgt-train.txt -valid_src data/src-val.txt -valid_tgt data/tgt-val.txt -save_data data/data -src_vocab_size 1000 -tgt_vocab_size 1000

    $my_python train.py -data data/data -save_model /tmp/tmp -world_size 1 -gpu_ranks 0 -rnn_size 256 -word_vec_size 256 -layers 2 -train_steps 10000 -optim adam -learning_rate 0.001 -encoder_type cnn -decoder_type cnn


    mv /tmp/tmp*e10.pt onmt/tests/test_model.pt

    rm /tmp/tmp*.pt
fi
#
################# MORPH DATA
if true; then
    # Same flow on data/morph; the result is stored as test_model2.pt.
    rm data/morph/*.pt
    $my_python preprocess.py -train_src data/morph/src.train -train_tgt data/morph/tgt.train -valid_src data/morph/src.valid -valid_tgt data/morph/tgt.valid -save_data data/morph/data

    $my_python train.py -data data/morph/data -save_model tmp -world_size 1 -gpu_ranks 0 -rnn_size 400 -word_vec_size 100 -layers 1 -train_steps 8000 -optim adam -learning_rate 0.001


    mv tmp*e8.pt onmt/tests/test_model2.pt

    rm tmp*.pt
fi
############### TEST TRANSFORMER
if false; then
    rm data/*.pt
    $my_python preprocess.py -train_src data/src-train.txt -train_tgt data/tgt-train.txt -valid_src data/src-val.txt -valid_tgt data/tgt-val.txt -save_data data/data -src_vocab_size 1000 -tgt_vocab_size 1000 -share_vocab


    $my_python train.py -data data/data -save_model /tmp/tmp -batch_type tokens -batch_size 1024 -accum_count 4 \
        -layers 4 -rnn_size 256 -word_vec_size 256 -encoder_type transformer -decoder_type transformer -share_embedding \
        -train_steps 10000 -world_size 1 -gpu_ranks 0 -max_generator_batches 4 -dropout 0.1 -normalization tokens \
        -max_grad_norm 0 -optim adam -decay_method noam -learning_rate 2 -label_smoothing 0.1 \
        -position_encoding -param_init 0 -warmup_steps 100 -param_init_glorot -adam_beta2 0.998
    #
    mv /tmp/tmp*e10.pt onmt/tests/test_model.pt
    rm /tmp/tmp*.pt
fi
#
# Optional check: translate the validation source with the rebuilt model.
if false; then
    $my_python translate.py -gpu 0 -model onmt/tests/test_model.pt \
        -src data/src-val.txt -output onmt/tests/output_hyp.txt -beam 5 -batch_size 16

fi
65 |
66 |
67 |
--------------------------------------------------------------------------------
/onmt/tests/test_attention.py:
--------------------------------------------------------------------------------
1 | """
2 | Here come the tests for attention types and their compatibility
3 | """
4 | import unittest
5 | import torch
6 | from torch.autograd import Variable
7 |
8 | import onmt
9 |
10 |
class TestAttention(unittest.TestCase):
    """Smoke tests for the attention modules."""

    def test_masked_global_attention(self):
        """Run GlobalAttention on a random batch with per-example lengths.

        Only checks that the forward pass runs; the assertion on masked
        positions is disabled (see the TODO at the end).
        """
        lengths = torch.IntTensor([7, 3, 5, 2])
        # Positions that should receive zero attention weight:
        # illegal_weights_mask = torch.ByteTensor([
        #     [0, 0, 0, 0, 0, 0, 0],
        #     [0, 0, 0, 1, 1, 1, 1],
        #     [0, 0, 0, 0, 0, 1, 1],
        #     [0, 0, 1, 1, 1, 1, 1]])

        n_examples = lengths.size(0)
        hidden_dim = 20

        bank_shape = (n_examples, lengths.max(), hidden_dim)
        memory_bank = Variable(torch.randn(*bank_shape))
        hidden = Variable(torch.randn(n_examples, hidden_dim))

        attention = onmt.modules.GlobalAttention(hidden_dim)

        _, alignments = attention(
            hidden, memory_bank, memory_lengths=lengths)
        # TODO: fix for pytorch 0.3
        # illegal_weights = alignments.masked_select(illegal_weights_mask)

        # self.assertEqual(0.0, illegal_weights.data.sum())
37 |
--------------------------------------------------------------------------------
/onmt/tests/test_image_dataset.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from onmt.inputters.image_dataset import ImageDataReader
3 |
4 | import os
5 | import shutil
6 |
7 | import cv2
8 | import numpy as np
9 | import torch
10 |
11 |
class TestImageDataReader(unittest.TestCase):
    """Integration test for ImageDataReader; it touches the file system.

    ``setUpClass`` writes ``_N_EXAMPLES`` random PNG images plus two list
    files: one with bare file names and one with full paths. Each test
    reads the images back through one of the list files and checks the
    tensor shape and the reported ``src_path``.

    NOTE(review): the ``src_path`` expectations assume the reader reports
    each list entry verbatim -- confirm against ImageDataReader.read.
    """
    _THIS_DIR = os.path.dirname(os.path.abspath(__file__))
    _IMG_DATA_DIRNAME = "test_image_data"
    _IMG_DATA_DIR = os.path.join(_THIS_DIR, _IMG_DATA_DIRNAME)
    _IMG_DATA_FMT = "test_img_{:d}.png"
    _IMG_DATA_PATH_FMT = os.path.join(_IMG_DATA_DIR, _IMG_DATA_FMT)

    _IMG_LIST_DIR = "test_image_filenames"
    # file to hold full paths to image data
    _IMG_LIST_PATHS_FNAME = "test_files.txt"
    _IMG_LIST_PATHS_PATH = os.path.join(
        _IMG_LIST_DIR, _IMG_LIST_PATHS_FNAME)
    # file to hold image paths relative to _IMG_DATA_DIR (i.e. file names)
    _IMG_LIST_FNAMES_FNAME = "test_fnames.txt"
    _IMG_LIST_FNAMES_PATH = os.path.join(
        _IMG_LIST_DIR, _IMG_LIST_FNAMES_FNAME)

    # it's ok if non-image files co-exist with image files in the data dir
    _JUNK_FILE = os.path.join(
        _IMG_DATA_DIR, "this_is_junk.txt")

    _N_EXAMPLES = 20
    _N_CHANNELS = 3

    @classmethod
    def setUpClass(cls):
        """Create the image directory, a junk file and both list files."""
        if not os.path.exists(cls._IMG_DATA_DIR):
            os.makedirs(cls._IMG_DATA_DIR)
        if not os.path.exists(cls._IMG_LIST_DIR):
            os.makedirs(cls._IMG_LIST_DIR)

        with open(cls._JUNK_FILE, "w") as f:
            f.write("this is some garbage\nShould have no impact.")

        # Each handle is named after the list file it actually writes:
        # full paths go to the "paths" list, bare names to the "fnames"
        # list.
        with open(cls._IMG_LIST_PATHS_PATH, "w") as f_list_paths, \
                open(cls._IMG_LIST_FNAMES_PATH, "w") as f_list_fnames:
            cls.n_rows = torch.randint(30, 314, (cls._N_EXAMPLES,))
            cls.n_cols = torch.randint(30, 314, (cls._N_EXAMPLES,))
            for i in range(cls._N_EXAMPLES):
                img = np.random.randint(
                    0, 255, (cls.n_rows[i], cls.n_cols[i], cls._N_CHANNELS))
                f_path = cls._IMG_DATA_PATH_FMT.format(i)
                cv2.imwrite(f_path, img)
                f_name_short = cls._IMG_DATA_FMT.format(i)
                f_list_fnames.write(f_name_short + "\n")
                f_list_paths.write(f_path + "\n")

    @classmethod
    def tearDownClass(cls):
        """Remove everything setUpClass created."""
        shutil.rmtree(cls._IMG_DATA_DIR)
        shutil.rmtree(cls._IMG_LIST_DIR)

    def test_read_from_dir_and_data_file_containing_filenames(self):
        """Read through the list of bare file names."""
        rdr = ImageDataReader(channel_size=self._N_CHANNELS)
        i = 0  # initialize since there's a sanity check on i
        for i, img in enumerate(rdr.read(
                self._IMG_LIST_FNAMES_PATH, "src", self._IMG_DATA_DIR)):
            self.assertEqual(
                img["src"].shape,
                (self._N_CHANNELS, self.n_rows[i], self.n_cols[i]))
            self.assertEqual(img["src_path"],
                             self._IMG_DATA_FMT.format(i))
        self.assertGreater(i, 0, "No image data was read.")

    def test_read_from_dir_and_data_file_containing_paths(self):
        """Read through the list of full image paths."""
        rdr = ImageDataReader(channel_size=self._N_CHANNELS)
        i = 0  # initialize since there's a sanity check on i
        for i, img in enumerate(rdr.read(
                self._IMG_LIST_PATHS_PATH, "src", self._IMG_DATA_DIR)):
            self.assertEqual(
                img["src"].shape,
                (self._N_CHANNELS, self.n_rows[i], self.n_cols[i]))
            self.assertEqual(img["src_path"],
                             self._IMG_DATA_PATH_FMT.format(i))
        self.assertGreater(i, 0, "No image data was read.")
89 |
90 |
class TestImageDataReader1Channel(TestImageDataReader):
    """Repeat the inherited reader tests with single-channel images."""
    _N_CHANNELS = 1
93 |
--------------------------------------------------------------------------------
/onmt/tests/test_model.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemon234071/AdaLabel/43ebaac16807d70117bc12bd8e4596fad4191e20/onmt/tests/test_model.pt
--------------------------------------------------------------------------------
/onmt/tests/test_model2.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemon234071/AdaLabel/43ebaac16807d70117bc12bd8e4596fad4191e20/onmt/tests/test_model2.pt
--------------------------------------------------------------------------------
/onmt/tests/test_simple.py:
--------------------------------------------------------------------------------
1 | import onmt
2 |
3 |
def test_load():
    """Smoke test: passes once the module-level ``import onmt`` worked."""
    # Bare reference to the imported package; evaluating it does nothing
    # else.
    onmt
7 |
--------------------------------------------------------------------------------
/onmt/tests/test_structured_attention.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from onmt.modules.structured_attention import MatrixTree
3 |
4 | import torch
5 |
6 |
class TestStructuredAttention(unittest.TestCase):
    """Checks on the MatrixTree structured-attention layer."""

    def test_matrix_tree_marg_pdfs_sum_to_1(self):
        """Marginals summed over dimension 1 must be close to one."""
        layer = MatrixTree()
        scores = torch.rand(1, 5, 5)
        marginals = layer.forward(scores)
        totals = marginals.sum(1)
        self.assertTrue(totals.allclose(torch.tensor(1.0)))
14 |
--------------------------------------------------------------------------------
/onmt/tests/utils_for_tests.py:
--------------------------------------------------------------------------------
1 | import itertools
2 |
3 |
def product_dict(**kwargs):
    """Yield one dict per combination of the given keyword value lists.

    Each keyword maps to an iterable of candidate values; every yielded
    dict maps each keyword to one value from its iterable, in
    ``itertools.product`` order.
    """
    names = list(kwargs)
    candidates = [kwargs[name] for name in names]
    for combination in itertools.product(*candidates):
        yield dict(zip(names, combination))
9 |
--------------------------------------------------------------------------------
/onmt/translate/__init__.py:
--------------------------------------------------------------------------------
1 | """ Modules for translation """
2 | from onmt.translate.translator import Translator
3 | from onmt.translate.translation import Translation, TranslationBuilder
4 | from onmt.translate.beam import Beam, GNMTGlobalScorer
5 | from onmt.translate.beam_search import BeamSearch
6 | from onmt.translate.decode_strategy import DecodeStrategy
7 | from onmt.translate.random_sampling import RandomSampling
8 | from onmt.translate.penalties import PenaltyBuilder
9 | from onmt.translate.translation_server import TranslationServer, \
10 | ServerModelError
11 |
12 | __all__ = ['Translator', 'Translation', 'Beam', 'BeamSearch',
13 | 'GNMTGlobalScorer', 'TranslationBuilder',
14 | 'PenaltyBuilder', 'TranslationServer', 'ServerModelError',
15 | "DecodeStrategy", "RandomSampling"]
16 |
--------------------------------------------------------------------------------
/onmt/translate/penalties.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | import torch
3 |
4 |
class PenaltyBuilder(object):
    """Returns the Length and Coverage Penalty function for Beam Search.

    Args:
        cov_pen (str): option name of cov pen
        length_pen (str): option name of length pen

    Attributes:
        has_cov_pen (bool): False when the coverage penalty option is
            ``None`` or ``"none"`` (applying it is a no-op), True
            otherwise. Note that True does not guarantee an effect:
            setting beta to 0 also makes the coverage penalty a no-op.
        has_len_pen (bool): False when the length penalty option is
            ``None`` or ``"none"`` (applying it is a no-op), True
            otherwise.
        coverage_penalty (callable[[FloatTensor, float], FloatTensor]):
            Calculates the coverage penalty.
        length_penalty (callable[[int, float], float]): Calculates
            the length penalty.

    Raises:
        NotImplementedError: if either option names an unknown penalty.
    """

    def __init__(self, cov_pen, length_pen):
        self.has_cov_pen = not self._pen_is_none(cov_pen)
        self.coverage_penalty = self._coverage_penalty(cov_pen)
        self.has_len_pen = not self._pen_is_none(length_pen)
        self.length_penalty = self._length_penalty(length_pen)

    @staticmethod
    def _pen_is_none(pen):
        """Return True when ``pen`` means "no penalty"."""
        return pen == "none" or pen is None

    def _coverage_penalty(self, cov_pen):
        """Map a coverage penalty option name to its implementation."""
        if cov_pen == "wu":
            return self.coverage_wu
        elif cov_pen == "summary":
            return self.coverage_summary
        elif self._pen_is_none(cov_pen):
            return self.coverage_none
        else:
            # '{}' instead of '{:s}': the latter raises ValueError for
            # non-string options and would mask the intended error.
            raise NotImplementedError("No '{}' coverage penalty.".format(
                cov_pen))

    def _length_penalty(self, length_pen):
        """Map a length penalty option name to its implementation."""
        if length_pen == "wu":
            return self.length_wu
        elif length_pen == "avg":
            return self.length_average
        elif self._pen_is_none(length_pen):
            return self.length_none
        else:
            # See _coverage_penalty for why '{}' is used here.
            raise NotImplementedError("No '{}' length penalty.".format(
                length_pen))

    # Below are all the different penalty terms implemented so far.
    # Subtract coverage penalty from topk log probs.
    # Divide topk log probs by length penalty.

    def coverage_wu(self, cov, beta=0.):
        """GNMT coverage re-ranking score.

        See "Google's Neural Machine Translation System" :cite:`wu2016google`.
        ``cov`` is expected to be sized ``(*, seq_len)``, where ``*`` is
        probably ``batch_size x beam_size`` but could be several
        dimensions like ``(batch_size, beam_size)``. If ``cov`` is attention,
        then the ``seq_len`` axis probably sums to (almost) 1.
        """

        penalty = -torch.min(cov, cov.clone().fill_(1.0)).log().sum(-1)
        return beta * penalty

    def coverage_summary(self, cov, beta=0.):
        """Our summary penalty.

        Sums ``max(cov, 1)`` over the last axis, subtracts the length of
        that axis and scales the result by ``beta``.
        """
        penalty = torch.max(cov, cov.clone().fill_(1.0)).sum(-1)
        penalty -= cov.size(-1)
        return beta * penalty

    def coverage_none(self, cov, beta=0.):
        """Returns zero as penalty.

        The result is a float tensor of shape ``(1,)`` on the device of
        ``cov``, or ``(1, 1)`` when ``cov`` is 3-dimensional.
        """
        none = torch.zeros((1,), device=cov.device,
                           dtype=torch.float)
        if cov.dim() == 3:
            none = none.unsqueeze(0)
        return none

    def length_wu(self, cur_len, alpha=0.):
        """GNMT length re-ranking score.

        See "Google's Neural Machine Translation System" :cite:`wu2016google`.
        """

        return ((5 + cur_len) / 6.0) ** alpha

    def length_average(self, cur_len, alpha=0.):
        """Returns the current sequence length."""
        return cur_len

    def length_none(self, cur_len, alpha=0.):
        """Returns 1.0, so dividing scores by it leaves them unmodified."""
        return 1.0
103 |
--------------------------------------------------------------------------------
/onmt/translate/process_zh.py:
--------------------------------------------------------------------------------
1 | from pyhanlp import HanLP
2 | from snownlp import SnowNLP
3 | import pkuseg
4 |
5 |
# Chinese segmentation
def zh_segmentator(line):
    """Segment ``line`` with pkuseg and join the tokens with spaces.

    The ``pkuseg.pkuseg()`` segmenter is created on the first call and
    cached on the function object, so later calls reuse that instance
    instead of constructing a new segmenter for every line.

    Args:
        line (str): text to segment.

    Returns:
        str: the tokens returned by ``cut``, separated by single spaces.
    """
    segmenter = getattr(zh_segmentator, "_segmenter", None)
    if segmenter is None:
        segmenter = pkuseg.pkuseg()
        zh_segmentator._segmenter = segmenter
    return " ".join(segmenter.cut(line))
9 |
10 |
# Simplified Chinese -> Traditional Chinese (standard)
def zh_traditional_standard(line):
    """Return ``HanLP.convertToTraditionalChinese`` applied to ``line``."""
    converted = HanLP.convertToTraditionalChinese(line)
    return converted
14 |
15 |
# Simplified Chinese -> Traditional Chinese (Hong Kong)
def zh_traditional_hk(line):
    """Return ``HanLP.s2hk`` applied to ``line``."""
    converted = HanLP.s2hk(line)
    return converted
19 |
20 |
# Simplified Chinese -> Traditional Chinese (Taiwan)
def zh_traditional_tw(line):
    """Return ``HanLP.s2tw`` applied to ``line``."""
    converted = HanLP.s2tw(line)
    return converted
24 |
25 |
# Traditional Chinese -> Simplified Chinese (v1, HanLP)
def zh_simplify(line):
    """Return ``HanLP.convertToSimplifiedChinese`` applied to ``line``."""
    converted = HanLP.convertToSimplifiedChinese(line)
    return converted
29 |
30 |
# Traditional Chinese -> Simplified Chinese (v2, SnowNLP)
def zh_simplify_v2(line):
    """Return the ``han`` attribute of ``SnowNLP(line)``."""
    document = SnowNLP(line)
    return document.han
34 |
--------------------------------------------------------------------------------
/onmt/utils/__init__.py:
--------------------------------------------------------------------------------
1 | """Module defining various utilities."""
2 | from onmt.utils.misc import split_corpus, aeq, use_gpu, set_random_seed
3 | from onmt.utils.report_manager import ReportMgr, build_report_manager
4 | from onmt.utils.statistics import Statistics
5 | from onmt.utils.optimizers import MultipleOptimizer, \
6 | Optimizer, AdaFactor
7 | from onmt.utils.earlystopping import EarlyStopping, scorers_from_opts
8 |
9 | __all__ = ["split_corpus", "aeq", "use_gpu", "set_random_seed", "ReportMgr",
10 | "build_report_manager", "Statistics",
11 | "MultipleOptimizer", "Optimizer", "AdaFactor", "EarlyStopping",
12 | "scorers_from_opts"]
13 |
--------------------------------------------------------------------------------
/onmt/utils/cnn_factory.py:
--------------------------------------------------------------------------------
1 | """
2 | Implementation of "Convolutional Sequence to Sequence Learning"
3 | """
4 | import torch
5 | import torch.nn as nn
6 | import torch.nn.init as init
7 |
8 | import onmt.modules
9 |
# sqrt(0.5); StackedCNN.forward multiplies each residual sum by this.
SCALE_WEIGHT = 0.5 ** 0.5
11 |
12 |
def shape_transform(x):
    """ Transform the size of the tensors to fit for conv input.

    Swaps dimensions 1 and 2 of ``x`` and appends a trailing dimension
    of size 1.
    """
    swapped = torch.transpose(x, 1, 2)
    return torch.unsqueeze(swapped, 3)
16 |
17 |
class GatedConv(nn.Module):
    """ Gated convolution for CNN class

    A weight-normalised convolution doubles the channel count; ``forward``
    splits the result in two along dimension 1 and gates the first half
    with the sigmoid of the second.
    """

    def __init__(self, input_size, width=3, dropout=0.2, nopad=False):
        super(GatedConv, self).__init__()
        # The padding along the first kernel axis is dropped when nopad
        # is set.
        pad_rows = width // 2 * (1 - nopad)
        self.conv = onmt.modules.WeightNormConv2d(
            input_size, 2 * input_size, kernel_size=(width, 1),
            stride=(1, 1), padding=(pad_rows, 0))
        init_gain = (4 * (1 - dropout))**0.5
        init.xavier_uniform_(self.conv.weight, gain=init_gain)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x_var):
        """Apply dropout, the convolution and the sigmoid gate."""
        projected = self.conv(self.dropout(x_var))
        half_channels = int(projected.size(1) / 2)
        out, gate = projected.split(half_channels, 1)
        return out * torch.sigmoid(gate)
35 |
36 |
class StackedCNN(nn.Module):
    """ Stacked CNN class

    Holds ``num_layers`` GatedConv layers and applies them in sequence,
    each with a residual connection scaled by ``SCALE_WEIGHT``.
    """

    def __init__(self, num_layers, input_size, cnn_kernel_width=3,
                 dropout=0.2):
        super(StackedCNN, self).__init__()
        self.dropout = dropout
        self.num_layers = num_layers
        self.layers = nn.ModuleList()
        for _ in range(num_layers):
            gated = GatedConv(input_size, cnn_kernel_width, dropout)
            self.layers.append(gated)

    def forward(self, x):
        """Run ``x`` through every layer and return the final tensor."""
        for layer in self.layers:
            # The residual sum is a fresh tensor, so scaling it in place
            # leaves the caller's input untouched.
            x = x.add(layer(x)).mul_(SCALE_WEIGHT)
        return x
55 |
--------------------------------------------------------------------------------
/onmt/utils/distributed.py:
--------------------------------------------------------------------------------
1 | """ Pytorch Distributed utils
2 | This piece of code was heavily inspired by the equivalent of Fairseq-py
3 | https://github.com/pytorch/fairseq
4 | """
5 |
6 |
7 | from __future__ import print_function
8 |
9 | import math
10 | import pickle
11 | import torch.distributed
12 |
13 | from onmt.utils.logging import logger
14 |
15 |
def is_master(opt, device_id):
    """Return True when ``opt.gpu_ranks[device_id]`` is rank 0."""
    rank = opt.gpu_ranks[device_id]
    return rank == 0
18 |
19 |
def multi_init(opt, device_id):
    """Initialise the torch.distributed process group for one device.

    The TCP init method is built from ``opt.master_ip`` and
    ``opt.master_port``, and the process joins with rank
    ``opt.gpu_ranks[device_id]``. The module logger is disabled on every
    process that is not the master.

    Returns:
        The rank reported by ``torch.distributed.get_rank()``.
    """
    init_url = 'tcp://{master_ip}:{master_port}'.format(
        master_ip=opt.master_ip,
        master_port=opt.master_port)
    n_processes = opt.world_size
    torch.distributed.init_process_group(
        backend=opt.gpu_backend,
        init_method=init_url,
        world_size=n_processes,
        rank=opt.gpu_ranks[device_id])
    gpu_rank = torch.distributed.get_rank()

    on_master = is_master(opt, device_id)
    if not on_master:
        logger.disabled = True

    return gpu_rank
33 |
34 |
def all_reduce_and_rescale_tensors(tensors, rescale_denom,
                                   buffer_size=10485760):
    """All-reduce and rescale tensors in chunks of the specified size.

    Small tensors are packed into one flat scratch buffer so that several
    of them share a single all-reduce call; a tensor larger than the
    buffer is all-reduced directly. Every tensor is updated in place and
    nothing is returned.

    Args:
        tensors: list of Tensors to all-reduce; ``tensors[0]`` is used to
            allocate the scratch buffer, so the list must not be empty
        rescale_denom: denominator for rescaling summed Tensors
        buffer_size: all-reduce chunk size in bytes
    """
    # buffer size in bytes, determine equiv. # of elements based on data type
    buffer_t = tensors[0].new(
        math.ceil(buffer_size / tensors[0].element_size())).zero_()
    # tensors waiting to be packed into buffer_t for the next all-reduce
    buffer = []

    def all_reduce_buffer():
        # copy tensors into buffer_t
        offset = 0
        for t in buffer:
            numel = t.numel()
            buffer_t[offset:offset+numel].copy_(t.view(-1))
            offset += numel

        # all-reduce and rescale
        # (only the filled prefix is reduced; the division covers the
        # whole scratch buffer, but only the prefix is copied back)
        torch.distributed.all_reduce(buffer_t[:offset])
        buffer_t.div_(rescale_denom)

        # copy all-reduced buffer back into tensors
        offset = 0
        for t in buffer:
            numel = t.numel()
            t.view(-1).copy_(buffer_t[offset:offset+numel])
            offset += numel

    # number of bytes currently queued in `buffer`
    filled = 0
    for t in tensors:
        sz = t.numel() * t.element_size()
        if sz > buffer_size:
            # tensor is bigger than buffer, all-reduce and rescale directly
            torch.distributed.all_reduce(t)
            t.div_(rescale_denom)
        elif filled + sz > buffer_size:
            # buffer is full, all-reduce and replace buffer with grad
            all_reduce_buffer()
            buffer = [t]
            filled = sz
        else:
            # add tensor to buffer
            buffer.append(t)
            filled += sz

    # flush whatever is still queued
    if len(buffer) > 0:
        all_reduce_buffer()
87 |
88 |
def all_gather_list(data, max_size=4096):
    """Gathers arbitrary data from all nodes into a list.

    ``data`` is pickled into a fixed-size CUDA byte buffer whose first
    two bytes encode the payload length, all-gathered across the process
    group, and unpickled once per rank.

    The buffers are cached on the function object and reused for as long
    as ``max_size`` and the world size stay the same.

    Args:
        data: any picklable object.
        max_size (int): buffer size in bytes; must be below ``255 * 256``
            and at least two bytes larger than the pickled payload.

    Returns:
        list: one unpickled object per rank, in rank order.

    Raises:
        ValueError: if the pickled payload does not fit in ``max_size``.
    """
    world_size = torch.distributed.get_world_size()
    in_buffer = getattr(all_gather_list, '_in_buffer', None)
    out_buffers = getattr(all_gather_list, '_out_buffers', None)
    # Compare element counts: ``Tensor.size()`` returns a torch.Size,
    # which never equals an int, so comparing it against ``max_size``
    # would force a fresh allocation on every call.
    if in_buffer is None or out_buffers is None or \
            in_buffer.numel() != max_size or \
            len(out_buffers) != world_size:
        in_buffer = torch.cuda.ByteTensor(max_size)
        out_buffers = [
            torch.cuda.ByteTensor(max_size)
            for _ in range(world_size)
        ]
        all_gather_list._in_buffer = in_buffer
        all_gather_list._out_buffers = out_buffers

    enc = pickle.dumps(data)
    enc_size = len(enc)
    if enc_size + 2 > max_size:
        raise ValueError(
            'encoded data exceeds max_size: {}'.format(enc_size + 2))
    assert max_size < 255*256
    in_buffer[0] = enc_size // 255  # this encoding works for max_size < 65k
    in_buffer[1] = enc_size % 255
    in_buffer[2:enc_size+2] = torch.ByteTensor(list(enc))

    torch.distributed.all_gather(out_buffers, in_buffer.cuda())

    results = []
    for out_buffer in out_buffers:
        size = (255 * out_buffer[0].item()) + out_buffer[1].item()

        bytes_list = bytes(out_buffer[2:size+2].tolist())
        # NOTE(review): pickle.loads executes whatever the peer ranks
        # sent; only safe while every rank in the group is trusted.
        results.append(pickle.loads(bytes_list))
    return results
123 |
--------------------------------------------------------------------------------
/onmt/utils/logging.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 |
4 | import logging
5 |
# Root logger (getLogger() with no name), shared by importing modules.
logger = logging.getLogger()
7 |
8 |
def init_logger(log_file=None, log_file_level=logging.NOTSET):
    """Configure the root logger and return it.

    The level is set to INFO and the handler list is replaced by a single
    console handler. When ``log_file`` is given, a file handler with
    level ``log_file_level`` is added as well. Both handlers share one
    message format.
    """
    formatter = logging.Formatter("[%(asctime)s %(levelname)s] %(message)s")
    root = logging.getLogger()
    root.setLevel(logging.INFO)

    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(formatter)
    # Assigning the list drops any handlers that were installed before.
    root.handlers = [stream_handler]

    if log_file:
        to_file = logging.FileHandler(log_file)
        to_file.setLevel(log_file_level)
        to_file.setFormatter(formatter)
        root.addHandler(to_file)

    return root
25 |
--------------------------------------------------------------------------------
/onmt/utils/misc.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import torch
4 | import random
5 | import inspect
6 | from itertools import islice
7 |
8 |
def split_corpus(path, shard_size):
    """Yield the lines of a file in shards.

    The file is opened in binary mode, so every line is a ``bytes``
    object. With ``shard_size <= 0`` the whole file is yielded as one
    list; otherwise lists of at most ``shard_size`` lines are yielded
    until the file is exhausted.
    """
    with open(path, "rb") as corpus:
        if shard_size <= 0:
            yield corpus.readlines()
            return
        # iter() with a sentinel stops at the first empty shard.
        for shard in iter(lambda: list(islice(corpus, shard_size)), []):
            yield shard
19 |
20 |
def aeq(*args):
    """
    Assert that every argument compares equal to the first one
    """
    remaining = iter(args)
    reference = next(remaining)
    for value in remaining:
        assert value == reference, \
            "Not all arguments have the same value: " + str(args)
29 |
30 |
def sequence_mask(lengths, max_len=None):
    """
    Build a mask with one row per length in which position ``j`` is set
    exactly when ``j`` is smaller than that length. ``max_len`` defaults
    to the largest length when it is missing or falsy.
    """
    n_rows = lengths.numel()
    if not max_len:
        max_len = lengths.max()
    positions = torch.arange(0, max_len, device=lengths.device)
    position_grid = positions.type_as(lengths).repeat(n_rows, 1)
    return position_grid.lt(lengths.unsqueeze(1))
41 |
42 |
def tile(x, count, dim=0):
    """
    Repeat every slice of x along dimension dim count times, keeping the
    copies of one slice next to each other.
    """
    axes = list(range(len(x.size())))
    moved = dim != 0
    if moved:
        # Bring the target dimension to the front for the tiling step.
        axes[0], axes[dim] = axes[dim], axes[0]
        x = x.permute(axes).contiguous()

    tiled_shape = list(x.size())
    tiled_shape[0] *= count
    n_slices = x.size(0)
    # Repeating each flattened slice `count` times along its own row and
    # re-viewing turns every row into `count` consecutive slices.
    x = x.view(n_slices, -1).repeat(1, count).view(*tiled_shape)

    if moved:
        x = x.permute(axes).contiguous()
    return x
63 |
64 |
def use_gpu(opt):
    """
    Return True when opt has a non-empty gpu_ranks list, or a gpu
    attribute greater than -1
    """
    if hasattr(opt, 'gpu_ranks') and len(opt.gpu_ranks) > 0:
        return True
    return hasattr(opt, 'gpu') and opt.gpu > -1
71 |
72 |
def set_random_seed(seed, is_cuda):
    """Seed the RNGs; does nothing unless ``seed`` is positive.

    Seeds torch and Python's ``random`` module, switches cuDNN to
    deterministic mode and, when ``is_cuda`` is set, seeds CUDA as well.
    """
    seeding_enabled = seed > 0
    if seeding_enabled:
        torch.manual_seed(seed)
        # Python's RNG drives the torchtext shuffled iterator; seeding it
        # keeps the dataset order identical across GPUs.
        random.seed(seed)
        # cuDNN can still be non-deterministic with a fixed seed unless
        # it is told otherwise.
        torch.backends.cudnn.deterministic = True

    if is_cuda and seeding_enabled:
        # Same initialisation on every device in multi-GPU mode.
        torch.cuda.manual_seed(seed)
87 |
88 |
def generate_relative_positions_matrix(length, max_relative_positions,
                                       cache=False):
    """Build the clipped, non-negative relative positions matrix.

    Without ``cache`` the result is ``length x length`` with entry
    ``[i, j]`` derived from ``j - i``; with ``cache`` it is a single row
    covering the offsets ``-length + 1 .. 0``. Offsets are clipped to
    ``[-max_relative_positions, max_relative_positions]`` and then
    shifted by ``max_relative_positions`` so that they are >= 0."""
    if cache:
        offsets = torch.arange(-length+1, 1, 1).unsqueeze(0)
    else:
        positions = torch.arange(length)
        # column index minus row index
        offsets = positions.unsqueeze(0) - positions.unsqueeze(1)
    clipped = torch.clamp(offsets,
                          min=-max_relative_positions,
                          max=max_relative_positions)
    return clipped + max_relative_positions
105 |
106 |
def relative_matmul(x, z, transpose):
    """Helper function for relative positions attention.

    Moves dimension 2 of the 4-d ``x`` to the front, folds dimensions 0
    and 1 together, batch-multiplies by ``z`` (with its last two
    dimensions swapped first when ``transpose`` is set) and restores the
    original dimension order.
    """
    batch_size, heads, length = x.shape[0], x.shape[1], x.shape[2]
    folded = x.permute(2, 0, 1, 3).reshape(length, heads * batch_size, -1)
    right = z.transpose(1, 2) if transpose else z
    product = torch.matmul(folded, right)
    unfolded = product.reshape(length, batch_size, heads, -1)
    return unfolded.permute(1, 2, 0, 3)
122 |
123 |
def fn_args(fun):
    """Return the argument names of ``fun`` as given by
    ``inspect.getfullargspec(fun).args``."""
    spec = inspect.getfullargspec(fun)
    return spec.args
127 |
--------------------------------------------------------------------------------
/onmt/utils/rnn_factory.py:
--------------------------------------------------------------------------------
1 | """
2 | RNN tools
3 | """
4 | import torch.nn as nn
5 | import onmt.models
6 |
7 |
def rnn_factory(rnn_type, **kwargs):
    """ Build an RNN by name.

    ``"SRU"`` maps to ``onmt.models.sru.SRU``; any other name is looked
    up on ``torch.nn``. Returns ``(rnn, no_pack_padded_seq)``, where the
    flag is True only for SRU.
    """
    if rnn_type == "SRU":
        # SRU doesn't support PackedSequence.
        return onmt.models.sru.SRU(**kwargs), True
    rnn_class = getattr(nn, rnn_type)
    return rnn_class(**kwargs), False
18 |
--------------------------------------------------------------------------------
/preprocess.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import torch
3 | import numpy as np
4 | import random
5 |
6 | from onmt.bin.preprocess import main
7 |
8 |
def setup_seed(seed):
    """Seed torch (CPU and all CUDA devices), NumPy and ``random``, then
    switch cuDNN to deterministic mode."""
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed_all,
        np.random.seed,
        random.seed,
    )
    for seed_with in seeders:
        seed_with(seed)
    torch.backends.cudnn.deterministic = True
15 |
16 |
# Runs at import time: the fixed seed is applied before main() is called.
setup_seed(2020)

if __name__ == "__main__":
    main()
21 |
--------------------------------------------------------------------------------
/requirement.txt:
--------------------------------------------------------------------------------
1 | torch==1.7.1
2 | torchtext==0.4.0
3 | nltk==3.4.3
4 | transformers==2.1.1
5 | numpy==1.17.2
6 | requests
7 | configargparse
8 |
--------------------------------------------------------------------------------
/requirements.opt.txt:
--------------------------------------------------------------------------------
1 | cffi
2 | torchvision
3 | joblib
4 | librosa
5 | Pillow
6 | git+git://github.com/pytorch/audio.git@d92de5b97fc6204db4b1e3ed20c03ac06f5d53f0
7 | pyrouge
8 | opencv-python
9 | git+https://github.com/NVIDIA/apex
10 | pretrainedmodels
11 |
--------------------------------------------------------------------------------
/scripts/inference_daily.sh:
--------------------------------------------------------------------------------
1 | DATA_DIR="data_daily"
2 | #DATA_DIR="data_ost"
3 | DATASET="bert"
4 |
5 | python3 translate.py -gpu "$1" -model "$2" \
6 | -output result/"$DATASET"_adalab_"$DATA_DIR".txt -beam 1 -batch_size 128 \
7 | -src "$DATA_DIR"/src-test.txt -max_length 30 -tokenizer bert
8 |
--------------------------------------------------------------------------------
/scripts/inference_ost.sh:
--------------------------------------------------------------------------------
1 | # DATA_DIR="data_daily"
2 | DATA_DIR="data_ost"
3 | DATASET="bert"
4 |
5 | python3 translate.py -gpu "$1" -model "$2" \
6 | -output result/"$DATASET"_adalab_"$DATA_DIR".txt -beam 1 -batch_size 128 \
7 | -src "$DATA_DIR"/src-test.txt -max_length 30 -tokenizer bert
8 |
--------------------------------------------------------------------------------
/scripts/preprocess.sh:
--------------------------------------------------------------------------------
1 | DATA_DIR="data_daily"
2 | #DATA_DIR="data_ost"
3 | DATASET="bert"
4 | VOCAB="vocab.txt"
5 |
6 |
7 | python3 preprocess.py -train_src "$DATA_DIR"/src-train.txt -train_tgt "$DATA_DIR"/tgt-train.txt \
8 | -valid_src "$DATA_DIR"/src-valid.txt -valid_tgt "$DATA_DIR"/tgt-valid.txt \
9 | -save_data "$DATA_DIR"/"$DATASET" -share_vocab \
10 | -src_vocab_size 300000 -tgt_vocab_size 300000 \
11 | -src_vocab "$DATA_DIR"/"$VOCAB" -tgt_vocab "$DATA_DIR"/"$VOCAB" \
12 | -src_seq_length 512 -tgt_seq_length 512 \
13 | -tokenizer bert
--------------------------------------------------------------------------------
/scripts/train_daily.sh:
--------------------------------------------------------------------------------
1 | DATA_DIR="data_daily"
2 | #DATA_DIR="data_ost"
3 | DATASET="bert"
4 | EMB=512
5 | STEPS=1000000
6 | BS=64
7 | ACCUM=2
8 | SAVESTEPS=1000
9 |
10 |
11 | python3 train.py -adalab -bidecoder -ada_temp 1.5 \
12 | -world_size 1 -gpu_ranks 0 \
13 | -log_file ./log_dir/"$DATASET"_transformer_adalab_"$DATA_DIR".log -data "$DATA_DIR"/"$DATASET" \
14 | -save_model checkpoint/"$DATASET"_trainsformer_adalab_"$DATA_DIR" \
15 | -train_steps "$STEPS" -save_checkpoint_steps "$SAVESTEPS" -valid_steps "$SAVESTEPS" -report_every 100 \
16 | -max_generator_batches 0 -dropout 0.1 -max_grad_norm 1 \
17 | -encoder_type transformer -decoder_type transformer -position_encoding \
18 | -param_init 0 -param_init_glorot -transformer_ff 512 -heads 8 \
19 | -batch_size "$BS" -accum_count "$ACCUM" -layers 6 -rnn_size "$EMB" -word_vec_size "$EMB" \
20 | -optim adam -learning_rate 1e-4 -start_decay_steps 100000000 -early_stopping 10
--------------------------------------------------------------------------------
/scripts/train_ost.sh:
--------------------------------------------------------------------------------
1 | #DATA_DIR="data_daily"
2 | DATA_DIR="data_ost"
3 | DATASET="bert"
4 | EMB=512
5 | STEPS=1000000
6 | BS=64
7 | ACCUM=2
8 | SAVESTEPS=1000
9 |
10 |
11 | python3 train.py -adalab -bidecoder -ada_temp 1 \
12 | -world_size 1 -gpu_ranks 0 \
13 | -log_file ./log_dir/"$DATASET"_transformer_adalab_"$DATA_DIR".log -data "$DATA_DIR"/"$DATASET" \
14 | -save_model checkpoint/"$DATASET"_trainsformer_adalab_"$DATA_DIR" \
15 | -train_steps "$STEPS" -save_checkpoint_steps "$SAVESTEPS" -valid_steps "$SAVESTEPS" -report_every 100 \
16 | -max_generator_batches 0 -dropout 0.1 -max_grad_norm 1 \
17 | -encoder_type transformer -decoder_type transformer -position_encoding \
18 | -param_init 0 -param_init_glorot -transformer_ff 512 -heads 8 \
19 | -batch_size "$BS" -accum_count "$ACCUM" -layers 6 -rnn_size "$EMB" -word_vec_size "$EMB" \
20 | -optim adam -learning_rate 1e-4 -start_decay_steps 100000000 -early_stopping 30
--------------------------------------------------------------------------------
/server.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from onmt.bin.server import main
3 |
4 |
5 | if __name__ == "__main__":
6 | main()
7 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from setuptools import setup, find_packages
3 | from os import path
4 |
5 | this_directory = path.abspath(path.dirname(__file__))
6 | with open(path.join(this_directory, 'ONMT_README.md'), encoding='utf-8') as f:
7 | long_description = f.read()
8 |
9 | setup(
10 | name='OpenNMT-py',
11 | description='A python implementation of OpenNMT',
12 | long_description=long_description,
13 | long_description_content_type='text/markdown',
14 | version='1.0.0.rc2',
15 | packages=find_packages(),
16 | project_urls={
17 | "Documentation": "http://opennmt.net/OpenNMT-py/",
18 | "Forum": "http://forum.opennmt.net/",
19 | "Gitter": "https://gitter.im/OpenNMT/OpenNMT-py",
20 | "Source": "https://github.com/OpenNMT/OpenNMT-py/"
21 | },
22 | install_requires=[
23 | "six",
24 | "tqdm~=4.30.0",
25 | "torch>=1.2",
26 | "torchtext==0.4.0",
27 | "future",
28 | "configargparse",
29 | "tensorboard>=1.14",
30 | "flask",
31 | "pyonmttok==1.*;platform_system=='Linux'",
32 | ],
33 | entry_points={
34 | "console_scripts": [
35 | "onmt_server=onmt.bin.server:main",
36 | "onmt_train=onmt.bin.train:main",
37 | "onmt_translate=onmt.bin.translate:main",
38 | "onmt_preprocess=onmt.bin.preprocess:main",
39 | ],
40 | }
41 | )
42 |
--------------------------------------------------------------------------------
/tools/README.md:
--------------------------------------------------------------------------------
1 | This directory contains scripts and tools adopted from other open source projects such as Apache Joshua and Moses Decoder.
2 |
3 | TODO: credit the authors and resolve license issues (if any)
4 |
--------------------------------------------------------------------------------
/tools/average_models.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import argparse
3 | import torch
4 |
5 |
def average_models(model_files):
    """Average the weights of several checkpoints into a single checkpoint.

    Args:
        model_files: iterable of paths to checkpoint files loadable with
            ``torch.load``. Every checkpoint must provide ``'model'`` and
            ``'generator'`` weight mappings with the same keys; the first
            one must also provide ``'vocab'`` and ``'opt'``.

    Returns:
        A dict with the keys ``"vocab"`` and ``"opt"`` (taken from the first
        checkpoint), ``"optim"`` (always ``None``), and ``"generator"`` /
        ``"model"`` holding the element-wise mean of the weights.

    Raises:
        ValueError: if ``model_files`` yields no path. Previously this
            silently produced a checkpoint made only of ``None`` values.
    """
    vocab = None
    opt = None
    avg_model = None
    avg_generator = None

    for i, model_file in enumerate(model_files):
        m = torch.load(model_file, map_location='cpu')
        model_weights = m['model']
        generator_weights = m['generator']

        if i == 0:
            # The first checkpoint seeds the running average and supplies
            # the metadata of the result.
            vocab, opt = m['vocab'], m['opt']
            avg_model = model_weights
            avg_generator = generator_weights
            continue

        # In-place running mean: after this step each tensor holds the
        # mean of the first i + 1 checkpoints.
        for k in avg_model:
            avg_model[k].mul_(i).add_(model_weights[k]).div_(i + 1)

        for k in avg_generator:
            avg_generator[k].mul_(i).add_(generator_weights[k]).div_(i + 1)

    if avg_model is None:
        raise ValueError("average_models requires at least one model file")

    final = {"vocab": vocab, "opt": opt, "optim": None,
             "generator": avg_generator, "model": avg_model}
    return final
31 |
32 |
def main():
    """Command-line entry point: average the given checkpoints and save them.

    Reads the checkpoint paths from ``-models``/``-m`` and writes the
    averaged checkpoint to the path given by ``-output``/``-o``.
    """
    arg_parser = argparse.ArgumentParser(description="")
    arg_parser.add_argument(
        "-models", "-m", nargs="+", required=True, help="List of models")
    arg_parser.add_argument(
        "-output", "-o", required=True, help="Output file")
    cli = arg_parser.parse_args()

    torch.save(average_models(cli.models), cli.output)
43 |
44 |
45 | if __name__ == "__main__":
46 | main()
47 |
--------------------------------------------------------------------------------
/tools/create_vocabulary.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import argparse
4 | import sys
5 | import os
6 |
7 |
def read_files_batch(file_list):
    """Read several files in lockstep, yielding one batch per line number.

    Args:
        file_list: paths of the text files to read in parallel.

    Yields:
        A list with one entry per file; each entry is the current line of
        that file with its trailing newline removed, split on single
        spaces. Iteration stops as soon as any file runs out of lines.

    If an ``IOError`` occurs, an error message is printed, every file opened
    so far is closed and the process exits with status -1.
    """
    handles = []    # Every file opened so far, so all of them get closed
    failed = False  # Set when an I/O error must end the program
    try:
        for filename in file_list:
            handles.append(open(filename))

        for lines in zip(*handles):
            yield [line.rstrip("\n").split(" ") for line in lines]

    except IOError:
        print("Error reading file " + filename + ".")
        failed = True

    finally:
        for handle in handles:
            handle.close()

    if failed:  # An error occurred, end execution
        sys.exit(-1)
36 |
37 |
def main():
    """Create a vocabulary file from text corpora or from a saved field file.

    With ``-file_type text`` (the default) every file given to ``-file`` is
    read, tokens are counted and written to ``-out_file`` one per line, most
    frequent first. With ``-file_type field`` the single ``-file`` argument
    is loaded with ``torch.load`` and the vocabulary stored for ``-side`` is
    written out in its stored order.

    Raises:
        ValueError: in 'field' mode, if ``-side`` is missing or more than
            one ``-file`` is given.
    """
    parser = argparse.ArgumentParser()
    # Not marked as required: the help text documents 'text' as the default,
    # and argparse can only apply a default when the flag may be omitted.
    parser.add_argument('-file_type', default='text',
                        choices=['text', 'field'],
                        help="""Options for vocabulary creation.
                        The default is 'text' where the user passes
                        a corpus or a list of corpora files for which
                        they want to create a vocabulary from.
                        If choosing the option 'field', we assume
                        the file passed is a torch file created during
                        the preprocessing stage of an already
                        preprocessed corpus. The vocabulary file created
                        will just be the vocabulary inside the field
                        corresponding to the argument 'side'.""")
    parser.add_argument("-file", type=str, nargs="+", required=True)
    parser.add_argument("-out_file", type=str, required=True)
    parser.add_argument("-side", choices=['src', 'tgt'], help="""Specifies
                        'src' or 'tgt' side for 'field' file_type.""")

    opt = parser.parse_args()

    if opt.file_type == 'text':
        from collections import Counter

        vocabulary = Counter()
        print("Reading input file...")
        for batch in read_files_batch(opt.file):
            for sentence in batch:
                vocabulary.update(sentence)

        print("Writing vocabulary file...")
        with open(opt.out_file, "w") as f:
            # Stable sort: tokens with equal counts keep first-seen order.
            for w, _ in sorted(vocabulary.items(), key=lambda x: x[1],
                               reverse=True):
                f.write("{0}\n".format(w))
    else:
        if opt.side not in ['src', 'tgt']:
            raise ValueError("If using -file_type='field', specifies "
                             "'src' or 'tgt' argument for -side.")
        import torch
        try:
            from onmt.inputters.inputter import _old_style_vocab
        except ImportError:
            # Allow running from the tools/ directory of a source checkout.
            sys.path.insert(1, os.path.join(sys.path[0], '..'))
            from onmt.inputters.inputter import _old_style_vocab

        print("Reading input file...")
        if len(opt.file) != 1:
            raise ValueError("If using -file_type='field', only pass one "
                             "argument for -file.")
        vocabs = torch.load(opt.file[0])
        voc = dict(vocabs)[opt.side]
        if _old_style_vocab(voc):
            word_list = voc.itos
        else:
            try:
                word_list = voc[0][1].base_field.vocab.itos
            except AttributeError:
                word_list = voc[0][1].vocab.itos

        print("Writing vocabulary file...")
        with open(opt.out_file, "wb") as f:
            for w in word_list:
                f.write(u"{0}\n".format(w).encode("utf-8"))
104 |
105 |
106 | if __name__ == "__main__":
107 | main()
108 |
--------------------------------------------------------------------------------
/tools/extract_embeddings.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | import torch
4 |
5 | import onmt
6 | import onmt.model_builder
7 | import onmt.inputters as inputters
8 | import onmt.opts
9 |
10 | from onmt.utils.misc import use_gpu
11 | from onmt.utils.logging import init_logger, logger
12 |
13 | parser = argparse.ArgumentParser(description='translate.py')
14 |
15 | parser.add_argument('-model', required=True,
16 | help='Path to model .pt file')
17 | parser.add_argument('-output_dir', default='.',
18 | help="""Path to output the embeddings""")
19 | parser.add_argument('-gpu', type=int, default=-1,
20 | help="Device to run on")
21 |
22 |
def write_embeddings(filename, dict, embeddings):
    """Write embeddings as text, one ``<token> <v1> <v2> ...`` line per row.

    Args:
        filename: path of the output file; it is opened in binary mode and
            the text is written UTF-8 encoded.
        dict: object whose ``itos`` sequence maps a row index to its token.
            (The name shadows the builtin but is part of the signature.)
        embeddings: sequence of rows of numbers; row ``i`` belongs to
            ``dict.itos[i]``. The length of the first row is used as the
            width of every row.

    Only the first ``min(len(embeddings), len(dict.itos))`` rows are written,
    each value formatted with ``" %5f"``.
    """
    with open(filename, 'wb') as out:
        n_rows = min(len(embeddings), len(dict.itos))
        # Guarded so that an empty input never touches embeddings[0].
        width = len(embeddings[0]) if n_rows else 0
        for i in range(n_rows):
            row = embeddings[i]
            # Build the numeric part once instead of growing a bytes
            # object value by value.
            values = "".join(" %5f" % row[j] for j in range(width))
            out.write(dict.itos[i].encode("utf-8")
                      + values.encode("utf-8") + b"\n")
30 |
31 |
def main():
    """Dump the source and target word embeddings of a checkpoint as text.

    Loads the checkpoint named by ``-model``, rebuilds the model from it and
    writes ``src_embeddings.txt`` and ``tgt_embeddings.txt`` into
    ``-output_dir``.
    """
    # Collect the default value of every model option.
    defaults_parser = argparse.ArgumentParser(description='train.py')
    onmt.opts.model_opts(defaults_parser)
    default_opt, _ = defaults_parser.parse_known_args([])

    opt = parser.parse_args()
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    checkpoint = torch.load(opt.model,
                            map_location=lambda storage, loc: storage)

    vocab = checkpoint['vocab']
    fields = (onmt.inputters.load_old_vocab(vocab)
              if inputters.old_style_vocab(vocab) else vocab)
    src_dict = fields['src'].base_field.vocab  # assumes src is text
    tgt_dict = fields['tgt'].base_field.vocab

    # Add in default model arguments, possibly added since training.
    model_opt = checkpoint['opt']
    for arg, default_value in default_opt.__dict__.items():
        if arg not in model_opt:
            model_opt.__dict__[arg] = default_value

    model = onmt.model_builder.build_base_model(
        model_opt, fields, use_gpu(opt), checkpoint)

    encoder_embeddings = \
        model.encoder.embeddings.word_lut.weight.data.tolist()
    decoder_embeddings = \
        model.decoder.embeddings.word_lut.weight.data.tolist()

    logger.info("Writing source embeddings")
    write_embeddings(opt.output_dir + "/src_embeddings.txt", src_dict,
                     encoder_embeddings)

    logger.info("Writing target embeddings")
    write_embeddings(opt.output_dir + "/tgt_embeddings.txt", tgt_dict,
                     decoder_embeddings)

    logger.info('... done.')
    logger.info('Converting model...')
77 |
78 |
79 | if __name__ == "__main__":
80 | init_logger('extract_embeddings.log')
81 | main()
82 |
--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/README.txt:
--------------------------------------------------------------------------------
1 | The language suffix can be found here:
2 |
3 | http://www.loc.gov/standards/iso639-2/php/code_list.php
4 |
5 | This code includes data from Daniel Naber's Language Tools (czech abbreviations).
6 | This code includes data from czech wiktionary (also czech abbreviations).
7 |
8 |
9 |
--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.ca:
--------------------------------------------------------------------------------
1 | Dr
2 | Dra
3 | pàg
4 | p
5 | c
6 | av
7 | Sr
8 | Sra
9 | adm
10 | esq
11 | Prof
12 | S.A
13 | S.L
14 | p.e
15 | ptes
16 | Sta
17 | St
18 | pl
19 | màx
20 | cast
21 | dir
22 | nre
23 | fra
24 | admdora
25 | Emm
26 | Excma
27 | espf
28 | dc
29 | admdor
30 | tel
31 | angl
32 | aprox
33 | ca
34 | dept
35 | dj
36 | dl
37 | dt
38 | ds
39 | dg
40 | dv
41 | ed
42 | entl
43 | al
44 | i.e
45 | maj
46 | smin
47 | n
48 | núm
49 | pta
50 | A
51 | B
52 | C
53 | D
54 | E
55 | F
56 | G
57 | H
58 | I
59 | J
60 | K
61 | L
62 | M
63 | N
64 | O
65 | P
66 | Q
67 | R
68 | S
69 | T
70 | U
71 | V
72 | W
73 | X
74 | Y
75 | Z
76 |
--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.cs:
--------------------------------------------------------------------------------
1 | Bc
2 | BcA
3 | Ing
4 | Ing.arch
5 | MUDr
6 | MVDr
7 | MgA
8 | Mgr
9 | JUDr
10 | PhDr
11 | RNDr
12 | PharmDr
13 | ThLic
14 | ThDr
15 | Ph.D
16 | Th.D
17 | prof
18 | doc
19 | CSc
20 | DrSc
21 | dr. h. c
22 | PaedDr
23 | Dr
24 | PhMr
25 | DiS
26 | abt
27 | ad
28 | a.i
29 | aj
30 | angl
31 | anon
32 | apod
33 | atd
34 | atp
35 | aut
36 | bd
37 | biogr
38 | b.m
39 | b.p
40 | b.r
41 | cca
42 | cit
43 | cizojaz
44 | c.k
45 | col
46 | čes
47 | čín
48 | čj
49 | ed
50 | facs
51 | fasc
52 | fol
53 | fot
54 | franc
55 | h.c
56 | hist
57 | hl
58 | hrsg
59 | ibid
60 | il
61 | ind
62 | inv.č
63 | jap
64 | jhdt
65 | jv
66 | koed
67 | kol
68 | korej
69 | kl
70 | krit
71 | lat
72 | lit
73 | m.a
74 | maď
75 | mj
76 | mp
77 | násl
78 | např
79 | nepubl
80 | něm
81 | no
82 | nr
83 | n.s
84 | okr
85 | odd
86 | odp
87 | obr
88 | opr
89 | orig
90 | phil
91 | pl
92 | pokrač
93 | pol
94 | port
95 | pozn
96 | př.kr
97 | př.n.l
98 | přel
99 | přeprac
100 | příl
101 | pseud
102 | pt
103 | red
104 | repr
105 | resp
106 | revid
107 | rkp
108 | roč
109 | roz
110 | rozš
111 | samost
112 | sect
113 | sest
114 | seš
115 | sign
116 | sl
117 | srv
118 | stol
119 | sv
120 | šk
121 | šk.ro
122 | špan
123 | tab
124 | t.č
125 | tis
126 | tj
127 | tř
128 | tzv
129 | univ
130 | uspoř
131 | vol
132 | vl.jm
133 | vs
134 | vyd
135 | vyobr
136 | zal
137 | zejm
138 | zkr
139 | zprac
140 | zvl
141 | n.p
142 | např
143 | než
144 | MUDr
145 | abl
146 | absol
147 | adj
148 | adv
149 | ak
150 | ak. sl
151 | akt
152 | alch
153 | amer
154 | anat
155 | angl
156 | anglosas
157 | arab
158 | arch
159 | archit
160 | arg
161 | astr
162 | astrol
163 | att
164 | bás
165 | belg
166 | bibl
167 | biol
168 | boh
169 | bot
170 | bulh
171 | círk
172 | csl
173 | č
174 | čas
175 | čes
176 | dat
177 | děj
178 | dep
179 | dět
180 | dial
181 | dór
182 | dopr
183 | dosl
184 | ekon
185 | epic
186 | etnonym
187 | eufem
188 | f
189 | fam
190 | fem
191 | fil
192 | film
193 | form
194 | fot
195 | fr
196 | fut
197 | fyz
198 | gen
199 | geogr
200 | geol
201 | geom
202 | germ
203 | gram
204 | hebr
205 | herald
206 | hist
207 | hl
208 | hovor
209 | hud
210 | hut
211 | chcsl
212 | chem
213 | ie
214 | imp
215 | impf
216 | ind
217 | indoevr
218 | inf
219 | instr
220 | interj
221 | ión
222 | iron
223 | it
224 | kanad
225 | katalán
226 | klas
227 | kniž
228 | komp
229 | konj
230 |
231 | konkr
232 | kř
233 | kuch
234 | lat
235 | lék
236 | les
237 | lid
238 | lit
239 | liturg
240 | lok
241 | log
242 | m
243 | mat
244 | meteor
245 | metr
246 | mod
247 | ms
248 | mysl
249 | n
250 | náb
251 | námoř
252 | neklas
253 | něm
254 | nesklon
255 | nom
256 | ob
257 | obch
258 | obyč
259 | ojed
260 | opt
261 | part
262 | pas
263 | pejor
264 | pers
265 | pf
266 | pl
267 | plpf
268 |
269 | práv
270 | prep
271 | předl
272 | přivl
273 | r
274 | rcsl
275 | refl
276 | reg
277 | rkp
278 | ř
279 | řec
280 | s
281 | samohl
282 | sg
283 | sl
284 | souhl
285 | spec
286 | srov
287 | stfr
288 | střv
289 | stsl
290 | subj
291 | subst
292 | superl
293 | sv
294 | sz
295 | táz
296 | tech
297 | telev
298 | teol
299 | trans
300 | typogr
301 | var
302 | vedl
303 | verb
304 | vl. jm
305 | voj
306 | vok
307 | vůb
308 | vulg
309 | výtv
310 | vztaž
311 | zahr
312 | zájm
313 | zast
314 | zejm
315 |
316 | zeměd
317 | zkr
318 | zř
319 | mj
320 | dl
321 | atp
322 | sport
323 | Mgr
324 | horn
325 | MVDr
326 | JUDr
327 | RSDr
328 | Bc
329 | PhDr
330 | ThDr
331 | Ing
332 | aj
333 | apod
334 | PharmDr
335 | pomn
336 | ev
337 | slang
338 | nprap
339 | odp
340 | dop
341 | pol
342 | st
343 | stol
344 | p. n. l
345 | před n. l
346 | n. l
347 | př. Kr
348 | po Kr
349 | př. n. l
350 | odd
351 | RNDr
352 | tzv
353 | atd
354 | tzn
355 | resp
356 | tj
357 | p
358 | br
359 | č. j
360 | čj
361 | č. p
362 | čp
363 | a. s
364 | s. r. o
365 | spol. s r. o
366 | p. o
367 | s. p
368 | v. o. s
369 | k. s
370 | o. p. s
371 | o. s
372 | v. r
373 | v z
374 | ml
375 | vč
376 | kr
377 | mld
378 | hod
379 | popř
380 | ap
381 | event
382 | rus
383 | slov
384 | rum
385 | švýc
386 | P. T
387 | zvl
388 | hor
389 | dol
390 | S.O.S
--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.de:
--------------------------------------------------------------------------------
1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3 |
4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
5 | #usually upper case letters are initials in a name
6 | #no german words end in single lower-case letters, so we throw those in too.
7 | A
8 | B
9 | C
10 | D
11 | E
12 | F
13 | G
14 | H
15 | I
16 | J
17 | K
18 | L
19 | M
20 | N
21 | O
22 | P
23 | Q
24 | R
25 | S
26 | T
27 | U
28 | V
29 | W
30 | X
31 | Y
32 | Z
33 | a
34 | b
35 | c
36 | d
37 | e
38 | f
39 | g
40 | h
41 | i
42 | j
43 | k
44 | l
45 | m
46 | n
47 | o
48 | p
49 | q
50 | r
51 | s
52 | t
53 | u
54 | v
55 | w
56 | x
57 | y
58 | z
59 |
60 |
61 | #Roman Numerals. A dot after one of these is not a sentence break in German.
62 | I
63 | II
64 | III
65 | IV
66 | V
67 | VI
68 | VII
69 | VIII
70 | IX
71 | X
72 | XI
73 | XII
74 | XIII
75 | XIV
76 | XV
77 | XVI
78 | XVII
79 | XVIII
80 | XIX
81 | XX
82 | i
83 | ii
84 | iii
85 | iv
86 | v
87 | vi
88 | vii
89 | viii
90 | ix
91 | x
92 | xi
93 | xii
94 | xiii
95 | xiv
96 | xv
97 | xvi
98 | xvii
99 | xviii
100 | xix
101 | xx
102 |
103 | #Titles and Honorifics
104 | Adj
105 | Adm
106 | Adv
107 | Asst
108 | Bart
109 | Bldg
110 | Brig
111 | Bros
112 | Capt
113 | Cmdr
114 | Col
115 | Comdr
116 | Con
117 | Corp
118 | Cpl
119 | DR
120 | Dr
121 | Ens
122 | Gen
123 | Gov
124 | Hon
125 | Hosp
126 | Insp
127 | Lt
128 | MM
129 | MR
130 | MRS
131 | MS
132 | Maj
133 | Messrs
134 | Mlle
135 | Mme
136 | Mr
137 | Mrs
138 | Ms
139 | Msgr
140 | Op
141 | Ord
142 | Pfc
143 | Ph
144 | Prof
145 | Pvt
146 | Rep
147 | Reps
148 | Res
149 | Rev
150 | Rt
151 | Sen
152 | Sens
153 | Sfc
154 | Sgt
155 | Sr
156 | St
157 | Supt
158 | Surg
159 |
160 | #Misc symbols
161 | Mio
162 | Mrd
163 | bzw
164 | v
165 | vs
166 | usw
167 | d.h
168 | z.B
169 | u.a
170 | etc
171 | Mrd
172 | MwSt
173 | ggf
174 | d.J
175 | D.h
176 | m.E
177 | vgl
178 | I.F
179 | z.T
180 | sogen
181 | ff
182 | u.E
183 | g.U
184 | g.g.A
185 | c.-à-d
186 | Buchst
187 | u.s.w
188 | sog
189 | u.ä
190 | Std
191 | evtl
192 | Zt
193 | Chr
194 | u.U
195 | o.ä
196 | Ltd
197 | b.A
198 | z.Zt
199 | spp
200 | sen
201 | SA
202 | k.o
203 | jun
204 | i.H.v
205 | dgl
206 | dergl
207 | Co
208 | zzt
209 | usf
210 | s.p.a
211 | Dkr
212 | Corp
213 | bzgl
214 | BSE
215 |
216 | #Number indicators
217 | # add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it
218 | No
219 | Nos
220 | Art
221 | Nr
222 | pp
223 | ca
224 | Ca
225 |
226 | #Ordinals are done with . in German - "1." = "1st" in English
227 | 1
228 | 2
229 | 3
230 | 4
231 | 5
232 | 6
233 | 7
234 | 8
235 | 9
236 | 10
237 | 11
238 | 12
239 | 13
240 | 14
241 | 15
242 | 16
243 | 17
244 | 18
245 | 19
246 | 20
247 | 21
248 | 22
249 | 23
250 | 24
251 | 25
252 | 26
253 | 27
254 | 28
255 | 29
256 | 30
257 | 31
258 | 32
259 | 33
260 | 34
261 | 35
262 | 36
263 | 37
264 | 38
265 | 39
266 | 40
267 | 41
268 | 42
269 | 43
270 | 44
271 | 45
272 | 46
273 | 47
274 | 48
275 | 49
276 | 50
277 | 51
278 | 52
279 | 53
280 | 54
281 | 55
282 | 56
283 | 57
284 | 58
285 | 59
286 | 60
287 | 61
288 | 62
289 | 63
290 | 64
291 | 65
292 | 66
293 | 67
294 | 68
295 | 69
296 | 70
297 | 71
298 | 72
299 | 73
300 | 74
301 | 75
302 | 76
303 | 77
304 | 78
305 | 79
306 | 80
307 | 81
308 | 82
309 | 83
310 | 84
311 | 85
312 | 86
313 | 87
314 | 88
315 | 89
316 | 90
317 | 91
318 | 92
319 | 93
320 | 94
321 | 95
322 | 96
323 | 97
324 | 98
325 | 99
326 |
--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.en:
--------------------------------------------------------------------------------
1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3 |
4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
5 | #usually upper case letters are initials in a name
6 | A
7 | B
8 | C
9 | D
10 | E
11 | F
12 | G
13 | H
14 | I
15 | J
16 | K
17 | L
18 | M
19 | N
20 | O
21 | P
22 | Q
23 | R
24 | S
25 | T
26 | U
27 | V
28 | W
29 | X
30 | Y
31 | Z
32 |
33 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
34 | Adj
35 | Adm
36 | Adv
37 | Asst
38 | Bart
39 | Bldg
40 | Brig
41 | Bros
42 | Capt
43 | Cmdr
44 | Col
45 | Comdr
46 | Con
47 | Corp
48 | Cpl
49 | DR
50 | Dr
51 | Drs
52 | Ens
53 | Gen
54 | Gov
55 | Hon
56 | Hr
57 | Hosp
58 | Insp
59 | Lt
60 | MM
61 | MR
62 | MRS
63 | MS
64 | Maj
65 | Messrs
66 | Mlle
67 | Mme
68 | Mr
69 | Mrs
70 | Ms
71 | Msgr
72 | Op
73 | Ord
74 | Pfc
75 | Ph
76 | Prof
77 | Pvt
78 | Rep
79 | Reps
80 | Res
81 | Rev
82 | Rt
83 | Sen
84 | Sens
85 | Sfc
86 | Sgt
87 | Sr
88 | St
89 | Supt
90 | Surg
91 |
92 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
93 | v
94 | vs
95 | i.e
96 | rev
97 | e.g
98 |
99 | #Numbers only. These should be treated as non-breaking only when followed by a numeric sequence
100 | # add NUMERIC_ONLY after the word for this function
101 | #This case is mostly for the english "No." which can either be a sentence of its own, or
102 | #if followed by a number, a non-breaking prefix
103 | No #NUMERIC_ONLY#
104 | Nos
105 | Art #NUMERIC_ONLY#
106 | Nr
107 | pp #NUMERIC_ONLY#
108 |
109 | #month abbreviations
110 | Jan
111 | Feb
112 | Mar
113 | Apr
114 | #May is a full word
115 | Jun
116 | Jul
117 | Aug
118 | Sep
119 | Oct
120 | Nov
121 | Dec
122 |
--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.es:
--------------------------------------------------------------------------------
1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3 |
4 | #any single upper case letter followed by a period is not a sentence ender
5 | #usually upper case letters are initials in a name
6 | A
7 | B
8 | C
9 | D
10 | E
11 | F
12 | G
13 | H
14 | I
15 | J
16 | K
17 | L
18 | M
19 | N
20 | O
21 | P
22 | Q
23 | R
24 | S
25 | T
26 | U
27 | V
28 | W
29 | X
30 | Y
31 | Z
32 |
33 | # Period-final abbreviation list from http://www.ctspanish.com/words/abbreviations.htm
34 |
35 | A.C
36 | Apdo
37 | Av
38 | Bco
39 | CC.AA
40 | Da
41 | Dep
42 | Dn
43 | Dr
44 | Dra
45 | EE.UU
46 | Excmo
47 | FF.CC
48 | Fil
49 | Gral
50 | J.C
51 | Let
52 | Lic
53 | N.B
54 | P.D
55 | P.V.P
56 | Prof
57 | Pts
58 | Rte
59 | S.A
60 | S.A.R
61 | S.E
62 | S.L
63 | S.R.C
64 | Sr
65 | Sra
66 | Srta
67 | Sta
68 | Sto
69 | T.V.E
70 | Tel
71 | Ud
72 | Uds
73 | V.B
74 | V.E
75 | Vd
76 | Vds
77 | a/c
78 | adj
79 | admón
80 | afmo
81 | apdo
82 | av
83 | c
84 | c.f
85 | c.g
86 | cap
87 | cm
88 | cta
89 | dcha
90 | doc
91 | ej
92 | entlo
93 | esq
94 | etc
95 | f.c
96 | gr
97 | grs
98 | izq
99 | kg
100 | km
101 | mg
102 | mm
103 | núm
104 | núm
105 | p
106 | p.a
107 | p.ej
108 | ptas
109 | pág
110 | págs
111 | pág
112 | págs
113 | q.e.g.e
114 | q.e.s.m
115 | s
116 | s.s.s
117 | vid
118 | vol
119 |
--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.fi:
--------------------------------------------------------------------------------
1 | #Anything in this file, followed by a period (and an upper-case word), does NOT
2 | #indicate an end-of-sentence marker. Special cases are included for prefixes
3 | #that ONLY appear before 0-9 numbers.
4 |
5 | #This list is compiled from omorfi database
6 | #by Tommi A Pirinen.
7 |
8 |
9 | #any single upper case letter followed by a period is not a sentence ender
10 | A
11 | B
12 | C
13 | D
14 | E
15 | F
16 | G
17 | H
18 | I
19 | J
20 | K
21 | L
22 | M
23 | N
24 | O
25 | P
26 | Q
27 | R
28 | S
29 | T
30 | U
31 | V
32 | W
33 | X
34 | Y
35 | Z
36 | Å
37 | Ä
38 | Ö
39 |
40 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
41 | alik
42 | alil
43 | amir
44 | apul
45 | apul.prof
46 | arkkit
47 | ass
48 | assist
49 | dipl
50 | dipl.arkkit
51 | dipl.ekon
52 | dipl.ins
53 | dipl.kielenk
54 | dipl.kirjeenv
55 | dipl.kosm
56 | dipl.urk
57 | dos
58 | erikoiseläinl
59 | erikoishammasl
60 | erikoisl
61 | erikoist
62 | ev.luutn
63 | evp
64 | fil
65 | ft
66 | hallinton
67 | hallintot
68 | hammaslääket
69 | jatk
70 | jääk
71 | kansaned
72 | kapt
73 | kapt.luutn
74 | kenr
75 | kenr.luutn
76 | kenr.maj
77 | kers
78 | kirjeenv
79 | kom
80 | kom.kapt
81 | komm
82 | konst
83 | korpr
84 | luutn
85 | maist
86 | maj
87 | Mr
88 | Mrs
89 | Ms
90 | M.Sc
91 | neuv
92 | nimim
93 | Ph.D
94 | prof
95 | puh.joht
96 | pääll
97 | res
98 | san
99 | siht
100 | suom
101 | sähköp
102 | säv
103 | toht
104 | toim
105 | toim.apul
106 | toim.joht
107 | toim.siht
108 | tuom
109 | ups
110 | vänr
111 | vääp
112 | ye.ups
113 | ylik
114 | ylil
115 | ylim
116 | ylimatr
117 | yliop
118 | yliopp
119 | ylip
120 | yliv
121 |
122 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall
123 | #into this category - it sometimes ends a sentence)
124 | e.g
125 | ent
126 | esim
127 | huom
128 | i.e
129 | ilm
130 | l
131 | mm
132 | myöh
133 | nk
134 | nyk
135 | par
136 | po
137 | t
138 | v
139 |
--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.fr:
--------------------------------------------------------------------------------
1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3 | #
4 | #any single upper case letter followed by a period is not a sentence ender
5 | #usually upper case letters are initials in a name
6 | #no French words end in single lower-case letters, so we throw those in too?
7 | A
8 | B
9 | C
10 | D
11 | E
12 | F
13 | G
14 | H
15 | I
16 | J
17 | K
18 | L
19 | M
20 | N
21 | O
22 | P
23 | Q
24 | R
25 | S
26 | T
27 | U
28 | V
29 | W
30 | X
31 | Y
32 | Z
33 | #a
34 | b
35 | c
36 | d
37 | e
38 | f
39 | g
40 | h
41 | i
42 | j
43 | k
44 | l
45 | m
46 | n
47 | o
48 | p
49 | q
50 | r
51 | s
52 | t
53 | u
54 | v
55 | w
56 | x
57 | y
58 | z
59 |
60 | # Period-final abbreviation list for French
61 | A.C.N
62 | A.M
63 | art
64 | ann
65 | apr
66 | av
67 | auj
68 | lib
69 | B.P
70 | boul
71 | ca
72 | c.-à-d
73 | cf
74 | ch.-l
75 | chap
76 | contr
77 | C.P.I
78 | C.Q.F.D
79 | C.N
80 | C.N.S
81 | C.S
82 | dir
83 | éd
84 | e.g
85 | env
86 | al
87 | etc
88 | E.V
89 | ex
90 | fasc
91 | fém
92 | fig
93 | fr
94 | hab
95 | ibid
96 | id
97 | i.e
98 | inf
99 | LL.AA
100 | LL.AA.II
101 | LL.AA.RR
102 | LL.AA.SS
103 | L.D
104 | LL.EE
105 | LL.MM
106 | LL.MM.II.RR
107 | loc.cit
108 | masc
109 | MM
110 | ms
111 | N.B
112 | N.D.A
113 | N.D.L.R
114 | N.D.T
115 | n/réf
116 | NN.SS
117 | N.S
118 | N.D
119 | N.P.A.I
120 | p.c.c
121 | pl
122 | pp
123 | p.ex
124 | p.j
125 | P.S
126 | R.A.S
127 | R.-V
128 | R.P
129 | R.I.P
130 | SS
131 | S.S
132 | S.A
133 | S.A.I
134 | S.A.R
135 | S.A.S
136 | S.E
137 | sec
138 | sect
139 | sing
140 | S.M
141 | S.M.I.R
142 | sq
143 | sqq
144 | suiv
145 | sup
146 | suppl
147 | tél
148 | T.S.V.P
149 | vb
150 | vol
151 | vs
152 | X.O
153 | Z.I
154 |
--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.ga:
--------------------------------------------------------------------------------
1 |
2 | A
3 | B
4 | C
5 | D
6 | E
7 | F
8 | G
9 | H
10 | I
11 | J
12 | K
13 | L
14 | M
15 | N
16 | O
17 | P
18 | Q
19 | R
20 | S
21 | T
22 | U
23 | V
24 | W
25 | X
26 | Y
27 | Z
28 | Á
29 | É
30 | Í
31 | Ó
32 | Ú
33 |
34 | Uacht
35 | Dr
36 | B.Arch
37 |
38 | m.sh
39 | .i
40 | Co
41 | Cf
42 | cf
43 | i.e
44 | r
45 | Chr
46 | lch #NUMERIC_ONLY#
47 | lgh #NUMERIC_ONLY#
48 | uimh #NUMERIC_ONLY#
49 |
--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.hu:
--------------------------------------------------------------------------------
1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3 |
4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
5 | #usually upper case letters are initials in a name
6 | A
7 | B
8 | C
9 | D
10 | E
11 | F
12 | G
13 | H
14 | I
15 | J
16 | K
17 | L
18 | M
19 | N
20 | O
21 | P
22 | Q
23 | R
24 | S
25 | T
26 | U
27 | V
28 | W
29 | X
30 | Y
31 | Z
32 | Á
33 | É
34 | Í
35 | Ó
36 | Ö
37 | Ő
38 | Ú
39 | Ü
40 | Ű
41 |
42 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
43 | Dr
44 | dr
45 | kb
46 | Kb
47 | vö
48 | Vö
49 | pl
50 | Pl
51 | ca
52 | Ca
53 | min
54 | Min
55 | max
56 | Max
57 | ún
58 | Ún
59 | prof
60 | Prof
61 | de
62 | De
63 | du
64 | Du
65 | Szt
66 | St
67 |
68 | #Numbers only. These should only induce breaks when followed by a numeric sequence
69 | # add NUMERIC_ONLY after the word for this function
70 | #This case is mostly for the english "No." which can either be a sentence of its own, or
71 | #if followed by a number, a non-breaking prefix
72 |
73 | # Month name abbreviations
74 | jan #NUMERIC_ONLY#
75 | Jan #NUMERIC_ONLY#
76 | Feb #NUMERIC_ONLY#
77 | feb #NUMERIC_ONLY#
78 | márc #NUMERIC_ONLY#
79 | Márc #NUMERIC_ONLY#
80 | ápr #NUMERIC_ONLY#
81 | Ápr #NUMERIC_ONLY#
82 | máj #NUMERIC_ONLY#
83 | Máj #NUMERIC_ONLY#
84 | jún #NUMERIC_ONLY#
85 | Jún #NUMERIC_ONLY#
86 | Júl #NUMERIC_ONLY#
87 | júl #NUMERIC_ONLY#
88 | aug #NUMERIC_ONLY#
89 | Aug #NUMERIC_ONLY#
90 | Szept #NUMERIC_ONLY#
91 | szept #NUMERIC_ONLY#
92 | okt #NUMERIC_ONLY#
93 | Okt #NUMERIC_ONLY#
94 | nov #NUMERIC_ONLY#
95 | Nov #NUMERIC_ONLY#
96 | dec #NUMERIC_ONLY#
97 | Dec #NUMERIC_ONLY#
98 |
99 | # Other abbreviations
100 | tel #NUMERIC_ONLY#
101 | Tel #NUMERIC_ONLY#
102 | Fax #NUMERIC_ONLY#
103 | fax #NUMERIC_ONLY#
104 |
--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.is:
--------------------------------------------------------------------------------
1 | no #NUMERIC_ONLY#
2 | No #NUMERIC_ONLY#
3 | nr #NUMERIC_ONLY#
4 | Nr #NUMERIC_ONLY#
5 | nR #NUMERIC_ONLY#
6 | NR #NUMERIC_ONLY#
7 | a
8 | b
9 | c
10 | d
11 | e
12 | f
13 | g
14 | h
15 | i
16 | j
17 | k
18 | l
19 | m
20 | n
21 | o
22 | p
23 | q
24 | r
25 | s
26 | t
27 | u
28 | v
29 | w
30 | x
31 | y
32 | z
33 | ^
34 | í
35 | á
36 | ó
37 | æ
38 | A
39 | B
40 | C
41 | D
42 | E
43 | F
44 | G
45 | H
46 | I
47 | J
48 | K
49 | L
50 | M
51 | N
52 | O
53 | P
54 | Q
55 | R
56 | S
57 | T
58 | U
59 | V
60 | W
61 | X
62 | Y
63 | Z
64 | ab.fn
65 | a.fn
66 | afs
67 | al
68 | alm
69 | alg
70 | andh
71 | ath
72 | aths
73 | atr
74 | ao
75 | au
76 | aukaf
77 | áfn
78 | áhrl.s
79 | áhrs
80 | ákv.gr
81 | ákv
82 | bh
83 | bls
84 | dr
85 | e.Kr
86 | et
87 | ef
88 | efn
89 | ennfr
90 | eink
91 | end
92 | e.st
93 | erl
94 | fél
95 | fskj
96 | fh
97 | f.hl
98 | físl
99 | fl
100 | fn
101 | fo
102 | forl
103 | frb
104 | frl
105 | frh
106 | frt
107 | fsl
108 | fsh
109 | fs
110 | fsk
111 | fst
112 | f.Kr
113 | ft
114 | fv
115 | fyrrn
116 | fyrrv
117 | germ
118 | gm
119 | gr
120 | hdl
121 | hdr
122 | hf
123 | hl
124 | hlsk
125 | hljsk
126 | hljv
127 | hljóðv
128 | hr
129 | hv
130 | hvk
131 | holl
132 | Hos
133 | höf
134 | hk
135 | hrl
136 | ísl
137 | kaf
138 | kap
139 | Khöfn
140 | kk
141 | kg
142 | kk
143 | km
144 | kl
145 | klst
146 | kr
147 | kt
148 | kgúrsk
149 | kvk
150 | leturbr
151 | lh
152 | lh.nt
153 | lh.þt
154 | lo
155 | ltr
156 | mlja
157 | mljó
158 | millj
159 | mm
160 | mms
161 | m.fl
162 | miðm
163 | mgr
164 | mst
165 | mín
166 | nf
167 | nh
168 | nhm
169 | nl
170 | nk
171 | nmgr
172 | no
173 | núv
174 | nt
175 | o.áfr
176 | o.m.fl
177 | ohf
178 | o.fl
179 | o.s.frv
180 | ófn
181 | ób
182 | óákv.gr
183 | óákv
184 | pfn
185 | PR
186 | pr
187 | Ritstj
188 | Rvík
189 | Rvk
190 | samb
191 | samhlj
192 | samn
193 | samn
194 | sbr
195 | sek
196 | sérn
197 | sf
198 | sfn
199 | sh
200 | sfn
201 | sh
202 | s.hl
203 | sk
204 | skv
205 | sl
206 | sn
207 | so
208 | ss.us
209 | s.st
210 | samþ
211 | sbr
212 | shlj
213 | sign
214 | skál
215 | st
216 | st.s
217 | stk
218 | sþ
219 | teg
220 | tbl
221 | tfn
222 | tl
223 | tvíhlj
224 | tvt
225 | till
226 | to
227 | umr
228 | uh
229 | us
230 | uppl
231 | útg
232 | vb
233 | Vf
234 | vh
235 | vkf
236 | Vl
237 | vl
238 | vlf
239 | vmf
240 | 8vo
241 | vsk
242 | vth
243 | þt
244 | þf
245 | þjs
246 | þgf
247 | þlt
248 | þolm
249 | þm
250 | þml
251 | þýð
252 |
--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.it:
--------------------------------------------------------------------------------
1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3 |
4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
5 | #usually upper case letters are initials in a name
6 | A
7 | B
8 | C
9 | D
10 | E
11 | F
12 | G
13 | H
14 | I
15 | J
16 | K
17 | L
18 | M
19 | N
20 | O
21 | P
22 | Q
23 | R
24 | S
25 | T
26 | U
27 | V
28 | W
29 | X
30 | Y
31 | Z
32 |
33 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
34 | Adj
35 | Adm
36 | Adv
37 | Amn
38 | Arch
39 | Asst
40 | Avv
41 | Bart
42 | Bcc
43 | Bldg
44 | Brig
45 | Bros
46 | C.A.P
47 | C.P
48 | Capt
49 | Cc
50 | Cmdr
51 | Co
52 | Col
53 | Comdr
54 | Con
55 | Corp
56 | Cpl
57 | DR
58 | Dott
59 | Dr
60 | Drs
61 | Egr
62 | Ens
63 | Gen
64 | Geom
65 | Gov
66 | Hon
67 | Hosp
68 | Hr
69 | Id
70 | Ing
71 | Insp
72 | Lt
73 | MM
74 | MR
75 | MRS
76 | MS
77 | Maj
78 | Messrs
79 | Mlle
80 | Mme
81 | Mo
82 | Mons
83 | Mr
84 | Mrs
85 | Ms
86 | Msgr
87 | N.B
88 | Op
89 | Ord
90 | P.S
91 | P.T
92 | Pfc
93 | Ph
94 | Prof
95 | Pvt
96 | RP
97 | RSVP
98 | Rag
99 | Rep
100 | Reps
101 | Res
102 | Rev
103 | Rif
104 | Rt
105 | S.A
106 | S.B.F
107 | S.P.M
108 | S.p.A
109 | S.r.l
110 | Sen
111 | Sens
112 | Sfc
113 | Sgt
114 | Sig
115 | Sigg
116 | Soc
117 | Spett
118 | Sr
119 | St
120 | Supt
121 | Surg
122 | V.P
123 |
124 | # other
125 | a.c
126 | acc
127 | all
128 | banc
129 | c.a
130 | c.c.p
131 | c.m
132 | c.p
133 | c.s
134 | c.v
135 | corr
136 | dott
137 | e.p.c
138 | ecc
139 | es
140 | fatt
141 | gg
142 | int
143 | lett
144 | ogg
145 | on
146 | p.c
147 | p.c.c
148 | p.es
149 | p.f
150 | p.r
151 | p.v
152 | post
153 | pp
154 | racc
155 | ric
156 | s.n.c
157 | seg
158 | sgg
159 | ss
160 | tel
161 | u.s
162 | v.r
163 | v.s
164 |
165 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
166 | v
167 | vs
168 | i.e
169 | rev
170 | e.g
171 |
172 | #Numbers only. These should only induce breaks when followed by a numeric sequence
173 | # add NUMERIC_ONLY after the word for this function
174 | #This case is mostly for the english "No." which can either be a sentence of its own, or
175 | #if followed by a number, a non-breaking prefix
176 | No #NUMERIC_ONLY#
177 | Nos
178 | Art #NUMERIC_ONLY#
179 | Nr
180 | pp #NUMERIC_ONLY#
181 |
--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.lv:
--------------------------------------------------------------------------------
1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3 |
4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
5 | #usually upper case letters are initials in a name
6 | A
7 | Ā
8 | B
9 | C
10 | Č
11 | D
12 | E
13 | Ē
14 | F
15 | G
16 | Ģ
17 | H
18 | I
19 | Ī
20 | J
21 | K
22 | Ķ
23 | L
24 | Ļ
25 | M
26 | N
27 | Ņ
28 | O
29 | P
30 | Q
31 | R
32 | S
33 | Š
34 | T
35 | U
36 | Ū
37 | V
38 | W
39 | X
40 | Y
41 | Z
42 | Ž
43 |
44 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
45 | dr
46 | Dr
47 | med
48 | prof
49 | Prof
50 | inž
51 | Inž
52 | ist.loc
53 | Ist.loc
54 | kor.loc
55 | Kor.loc
56 | v.i
57 | vietn
58 | Vietn
59 |
60 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
61 | a.l
62 | t.p
63 | pārb
64 | Pārb
65 | vec
66 | Vec
67 | inv
68 | Inv
69 | sk
70 | Sk
71 | spec
72 | Spec
73 | vienk
74 | Vienk
75 | virz
76 | Virz
77 | māksl
78 | Māksl
79 | mūz
80 | Mūz
81 | akad
82 | Akad
83 | soc
84 | Soc
85 | galv
86 | Galv
87 | vad
88 | Vad
89 | sertif
90 | Sertif
91 | folkl
92 | Folkl
93 | hum
94 | Hum
95 |
96 | #Numbers only. These should only induce breaks when followed by a numeric sequence
97 | # add NUMERIC_ONLY after the word for this function
98 | #This case is mostly for the english "No." which can either be a sentence of its own, or
99 | #if followed by a number, a non-breaking prefix
100 | Nr #NUMERIC_ONLY#
101 |
--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.nl:
--------------------------------------------------------------------------------
1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3 | #Sources: http://nl.wikipedia.org/wiki/Lijst_van_afkortingen
4 | # http://nl.wikipedia.org/wiki/Aanspreekvorm
5 | # http://nl.wikipedia.org/wiki/Titulatuur_in_het_Nederlands_hoger_onderwijs
6 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
7 | #usually upper case letters are initials in a name
8 | A
9 | B
10 | C
11 | D
12 | E
13 | F
14 | G
15 | H
16 | I
17 | J
18 | K
19 | L
20 | M
21 | N
22 | O
23 | P
24 | Q
25 | R
26 | S
27 | T
28 | U
29 | V
30 | W
31 | X
32 | Y
33 | Z
34 |
35 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
36 | bacc
37 | bc
38 | bgen
39 | c.i
40 | dhr
41 | dr
42 | dr.h.c
43 | drs
44 | drs
45 | ds
46 | eint
47 | fa
48 | Fa
49 | fam
50 | gen
51 | genm
52 | ing
53 | ir
54 | jhr
55 | jkvr
56 | jr
57 | kand
58 | kol
59 | lgen
60 | lkol
61 | Lt
62 | maj
63 | Mej
64 | mevr
65 | Mme
66 | mr
67 | mr
68 | Mw
69 | o.b.s
70 | plv
71 | prof
72 | ritm
73 | tint
74 | Vz
75 | Z.D
76 | Z.D.H
77 | Z.E
78 | Z.Em
79 | Z.H
80 | Z.K.H
81 | Z.K.M
82 | Z.M
83 | z.v
84 |
85 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
86 | #we seem to have a lot of these in Dutch, e.g.: i.p.v - in plaats van (instead of) never ends a sentence
87 | a.g.v
88 | bijv
89 | bijz
90 | bv
91 | d.w.z
92 | e.c
93 | e.g
94 | e.k
95 | ev
96 | i.p.v
97 | i.s.m
98 | i.t.t
99 | i.v.m
100 | m.a.w
101 | m.b.t
102 | m.b.v
103 | m.h.o
104 | m.i
105 | m.i.v
106 | v.w.t
107 |
108 | #Numbers only. These should only induce breaks when followed by a numeric sequence
109 | # add NUMERIC_ONLY after the word for this function
110 | #This case is mostly for the english "No." which can either be a sentence of its own, or
111 | #if followed by a number, a non-breaking prefix
112 | Nr #NUMERIC_ONLY#
113 | Nrs
114 | nrs
115 | nr #NUMERIC_ONLY#
116 |
--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.pl:
--------------------------------------------------------------------------------
1 | adw
2 | afr
3 | akad
4 | al
5 | Al
6 | am
7 | amer
8 | arch
9 | art
10 | Art
11 | artyst
12 | astr
13 | austr
14 | bałt
15 | bdb
16 | bł
17 | bm
18 | br
19 | bryg
20 | bryt
21 | centr
22 | ces
23 | chem
24 | chiń
25 | chir
26 | c.k
27 | c.o
28 | cyg
29 | cyw
30 | cyt
31 | czes
32 | czw
33 | cd
34 | Cd
35 | czyt
36 | ćw
37 | ćwicz
38 | daw
39 | dcn
40 | dekl
41 | demokr
42 | det
43 | diec
44 | dł
45 | dn
46 | dot
47 | dol
48 | dop
49 | dost
50 | dosł
51 | h.c
52 | ds
53 | dst
54 | duszp
55 | dypl
56 | egz
57 | ekol
58 | ekon
59 | elektr
60 | em
61 | ew
62 | fab
63 | farm
64 | fot
65 | fr
66 | gat
67 | gastr
68 | geogr
69 | geol
70 | gimn
71 | głęb
72 | gm
73 | godz
74 | górn
75 | gosp
76 | gr
77 | gram
78 | hist
79 | hiszp
80 | hr
81 | Hr
82 | hot
83 | id
84 | in
85 | im
86 | iron
87 | jn
88 | kard
89 | kat
90 | katol
91 | k.k
92 | kk
93 | kol
94 | kl
95 | k.p.a
96 | kpc
97 | k.p.c
98 | kpt
99 | kr
100 | k.r
101 | krak
102 | k.r.o
103 | kryt
104 | kult
105 | laic
106 | łac
107 | niem
108 | woj
109 | nb
110 | np
111 | Nb
112 | Np
113 | pol
114 | pow
115 | m.in
116 | pt
117 | ps
118 | Pt
119 | Ps
120 | cdn
121 | jw
122 | ryc
123 | rys
124 | Ryc
125 | Rys
126 | tj
127 | tzw
128 | Tzw
129 | tzn
130 | zob
131 | ang
132 | ub
133 | ul
134 | pw
135 | pn
136 | pl
137 | al
138 | k
139 | n
140 | nr #NUMERIC_ONLY#
141 | Nr #NUMERIC_ONLY#
142 | ww
143 | wł
144 | ur
145 | zm
146 | żyd
147 | żarg
148 | żyw
149 | wył
150 | bp
151 | bp
152 | wyst
153 | tow
154 | Tow
155 | o
156 | sp
157 | Sp
158 | st
159 | spółdz
160 | Spółdz
161 | społ
162 | spółgł
163 | stoł
164 | stow
165 | Stoł
166 | Stow
167 | zn
168 | zew
169 | zewn
170 | zdr
171 | zazw
172 | zast
173 | zaw
174 | zał
175 | zal
176 | zam
177 | zak
178 | zakł
179 | zagr
180 | zach
181 | adw
182 | Adw
183 | lek
184 | Lek
185 | med
186 | mec
187 | Mec
188 | doc
189 | Doc
190 | dyw
191 | dyr
192 | Dyw
193 | Dyr
194 | inż
195 | Inż
196 | mgr
197 | Mgr
198 | dh
199 | dr
200 | Dh
201 | Dr
202 | p
203 | P
204 | red
205 | Red
206 | prof
207 | prok
208 | Prof
209 | Prok
210 | hab
211 | płk
212 | Płk
213 | nadkom
214 | Nadkom
215 | podkom
216 | Podkom
217 | ks
218 | Ks
219 | gen
220 | Gen
221 | por
222 | Por
223 | reż
224 | Reż
225 | przyp
226 | Przyp
227 | śp
228 | św
229 | śW
230 | Śp
231 | Św
232 | ŚW
233 | szer
234 | Szer
235 | pkt #NUMERIC_ONLY#
236 | str #NUMERIC_ONLY#
237 | tab #NUMERIC_ONLY#
238 | Tab #NUMERIC_ONLY#
239 | tel
240 | ust #NUMERIC_ONLY#
241 | par #NUMERIC_ONLY#
242 | poz
243 | pok
244 | oo
245 | oO
246 | Oo
247 | OO
248 | r #NUMERIC_ONLY#
249 | l #NUMERIC_ONLY#
250 | s #NUMERIC_ONLY#
251 | najśw
252 | Najśw
253 | A
254 | B
255 | C
256 | D
257 | E
258 | F
259 | G
260 | H
261 | I
262 | J
263 | K
264 | L
265 | M
266 | N
267 | O
268 | P
269 | Q
270 | R
271 | S
272 | T
273 | U
274 | V
275 | W
276 | X
277 | Y
278 | Z
279 | Ś
280 | Ć
281 | Ż
282 | Ź
283 | Dz
284 |
--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.ro:
--------------------------------------------------------------------------------
1 | A
2 | B
3 | C
4 | D
5 | E
6 | F
7 | G
8 | H
9 | I
10 | J
11 | K
12 | L
13 | M
14 | N
15 | O
16 | P
17 | Q
18 | R
19 | S
20 | T
21 | U
22 | V
23 | W
24 | X
25 | Y
26 | Z
27 | dpdv
28 | etc
29 | șamd
30 | M.Ap.N
31 | dl
32 | Dl
33 | d-na
34 | D-na
35 | dvs
36 | Dvs
37 | pt
38 | Pt
39 |
--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.ru:
--------------------------------------------------------------------------------
1 | # added Cyrillic uppercase letters [А-Я]
2 | # removed 000D carriage return (this is not removed by chomp in tokenizer.perl, and prevents recognition of the prefixes)
3 | # edited by Kate Young (nspaceanalysis@earthlink.net) 21 May 2013
4 | А
5 | Б
6 | В
7 | Г
8 | Д
9 | Е
10 | Ж
11 | З
12 | И
13 | Й
14 | К
15 | Л
16 | М
17 | Н
18 | О
19 | П
20 | Р
21 | С
22 | Т
23 | У
24 | Ф
25 | Х
26 | Ц
27 | Ч
28 | Ш
29 | Щ
30 | Ъ
31 | Ы
32 | Ь
33 | Э
34 | Ю
35 | Я
36 | A
37 | B
38 | C
39 | D
40 | E
41 | F
42 | G
43 | H
44 | I
45 | J
46 | K
47 | L
48 | M
49 | N
50 | O
51 | P
52 | Q
53 | R
54 | S
55 | T
56 | U
57 | V
58 | W
59 | X
60 | Y
61 | Z
62 | 0гг
63 | 1гг
64 | 2гг
65 | 3гг
66 | 4гг
67 | 5гг
68 | 6гг
69 | 7гг
70 | 8гг
71 | 9гг
72 | 0г
73 | 1г
74 | 2г
75 | 3г
76 | 4г
77 | 5г
78 | 6г
79 | 7г
80 | 8г
81 | 9г
82 | Xвв
83 | Vвв
84 | Iвв
85 | Lвв
86 | Mвв
87 | Cвв
88 | Xв
89 | Vв
90 | Iв
91 | Lв
92 | Mв
93 | Cв
94 | 0м
95 | 1м
96 | 2м
97 | 3м
98 | 4м
99 | 5м
100 | 6м
101 | 7м
102 | 8м
103 | 9м
104 | 0мм
105 | 1мм
106 | 2мм
107 | 3мм
108 | 4мм
109 | 5мм
110 | 6мм
111 | 7мм
112 | 8мм
113 | 9мм
114 | 0см
115 | 1см
116 | 2см
117 | 3см
118 | 4см
119 | 5см
120 | 6см
121 | 7см
122 | 8см
123 | 9см
124 | 0дм
125 | 1дм
126 | 2дм
127 | 3дм
128 | 4дм
129 | 5дм
130 | 6дм
131 | 7дм
132 | 8дм
133 | 9дм
134 | 0л
135 | 1л
136 | 2л
137 | 3л
138 | 4л
139 | 5л
140 | 6л
141 | 7л
142 | 8л
143 | 9л
144 | 0км
145 | 1км
146 | 2км
147 | 3км
148 | 4км
149 | 5км
150 | 6км
151 | 7км
152 | 8км
153 | 9км
154 | 0га
155 | 1га
156 | 2га
157 | 3га
158 | 4га
159 | 5га
160 | 6га
161 | 7га
162 | 8га
163 | 9га
164 | 0кг
165 | 1кг
166 | 2кг
167 | 3кг
168 | 4кг
169 | 5кг
170 | 6кг
171 | 7кг
172 | 8кг
173 | 9кг
174 | 0т
175 | 1т
176 | 2т
177 | 3т
178 | 4т
179 | 5т
180 | 6т
181 | 7т
182 | 8т
183 | 9т
184 | 0г
185 | 1г
186 | 2г
187 | 3г
188 | 4г
189 | 5г
190 | 6г
191 | 7г
192 | 8г
193 | 9г
194 | 0мг
195 | 1мг
196 | 2мг
197 | 3мг
198 | 4мг
199 | 5мг
200 | 6мг
201 | 7мг
202 | 8мг
203 | 9мг
204 | бульв
205 | в
206 | вв
207 | г
208 | га
209 | гг
210 | гл
211 | гос
212 | д
213 | дм
214 | доп
215 | др
216 | е
217 | ед
218 | ед
219 | зам
220 | и
221 | инд
222 | исп
223 | Исп
224 | к
225 | кап
226 | кг
227 | кв
228 | кл
229 | км
230 | кол
231 | комн
232 | коп
233 | куб
234 | л
235 | лиц
236 | лл
237 | м
238 | макс
239 | мг
240 | мин
241 | мл
242 | млн
243 | млрд
244 | мм
245 | н
246 | наб
247 | нач
248 | неуд
249 | ном
250 | о
251 | обл
252 | обр
253 | общ
254 | ок
255 | ост
256 | отл
257 | п
258 | пер
259 | перераб
260 | пл
261 | пос
262 | пр
263 | просп
264 | проф
265 | р
266 | ред
267 | руб
268 | с
269 | сб
270 | св
271 | см
272 | соч
273 | ср
274 | ст
275 | стр
276 | т
277 | тел
278 | Тел
279 | тех
280 | тт
281 | туп
282 | тыс
283 | уд
284 | ул
285 | уч
286 | физ
287 | х
288 | хор
289 | ч
290 | чел
291 | шт
292 | экз
293 | э
294 |
--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.sk:
--------------------------------------------------------------------------------
1 | Bc
2 | Mgr
3 | RNDr
4 | PharmDr
5 | PhDr
6 | JUDr
7 | PaedDr
8 | ThDr
9 | Ing
10 | MUDr
11 | MDDr
12 | MVDr
13 | Dr
14 | ThLic
15 | PhD
16 | ArtD
17 | ThDr
18 | Dr
19 | DrSc
20 | CSs
21 | prof
22 | obr
23 | Obr
24 | Č
25 | č
26 | absol
27 | adj
28 | admin
29 | adr
30 | Adr
31 | adv
32 | advok
33 | afr
34 | ak
35 | akad
36 | akc
37 | akuz
38 | et
39 | al
40 | alch
41 | amer
42 | anat
43 | angl
44 | Angl
45 | anglosas
46 | anorg
47 | ap
48 | apod
49 | arch
50 | archeol
51 | archit
52 | arg
53 | art
54 | astr
55 | astrol
56 | astron
57 | atp
58 | atď
59 | austr
60 | Austr
61 | aut
62 | belg
63 | Belg
64 | bibl
65 | Bibl
66 | biol
67 | bot
68 | bud
69 | bás
70 | býv
71 | cest
72 | chem
73 | cirk
74 | csl
75 | čs
76 | Čs
77 | dat
78 | dep
79 | det
80 | dial
81 | diaľ
82 | dipl
83 | distrib
84 | dokl
85 | dosl
86 | dopr
87 | dram
88 | duš
89 | dv
90 | dvojčl
91 | dór
92 | ekol
93 | ekon
94 | el
95 | elektr
96 | elektrotech
97 | energet
98 | epic
99 | est
100 | etc
101 | etonym
102 | eufem
103 | európ
104 | Európ
105 | ev
106 | evid
107 | expr
108 | fa
109 | fam
110 | farm
111 | fem
112 | feud
113 | fil
114 | filat
115 | filoz
116 | fi
117 | fon
118 | form
119 | fot
120 | fr
121 | Fr
122 | franc
123 | Franc
124 | fraz
125 | fut
126 | fyz
127 | fyziol
128 | garb
129 | gen
130 | genet
131 | genpor
132 | geod
133 | geogr
134 | geol
135 | geom
136 | germ
137 | gr
138 | Gr
139 | gréc
140 | Gréc
141 | gréckokat
142 | hebr
143 | herald
144 | hist
145 | hlav
146 | hosp
147 | hromad
148 | hud
149 | hypok
150 | ident
151 | i.e
152 | ident
153 | imp
154 | impf
155 | indoeur
156 | inf
157 | inform
158 | instr
159 | int
160 | interj
161 | inšt
162 | inštr
163 | iron
164 | jap
165 | Jap
166 | jaz
167 | jedn
168 | juhoamer
169 | juhových
170 | juhozáp
171 | juž
172 | kanad
173 | Kanad
174 | kanc
175 | kapit
176 | kpt
177 | kart
178 | katastr
179 | knih
180 | kniž
181 | komp
182 | konj
183 | konkr
184 | kozmet
185 | krajč
186 | kresť
187 | kt
188 | kuch
189 | lat
190 | latinskoamer
191 | lek
192 | lex
193 | lingv
194 | lit
195 | litur
196 | log
197 | lok
198 | max
199 | Max
200 | maď
201 | Maď
202 | medzinár
203 | mest
204 | metr
205 | mil
206 | Mil
207 | min
208 | Min
209 | miner
210 | ml
211 | mld
212 | mn
213 | mod
214 | mytol
215 | napr
216 | nar
217 | Nar
218 | nasl
219 | nedok
220 | neg
221 | negat
222 | neklas
223 | nem
224 | Nem
225 | neodb
226 | neos
227 | neskl
228 | nesklon
229 | nespis
230 | nespráv
231 | neved
232 | než
233 | niekt
234 | niž
235 | nom
236 | náb
237 | nákl
238 | námor
239 | nár
240 | obch
241 | obj
242 | obv
243 | obyč
244 | obč
245 | občian
246 | odb
247 | odd
248 | ods
249 | ojed
250 | okr
251 | Okr
252 | opt
253 | opyt
254 | org
255 | os
256 | osob
257 | ot
258 | ovoc
259 | par
260 | part
261 | pejor
262 | pers
263 | pf
264 | Pf
265 | P.f
266 | p.f
267 | pl
268 | Plk
269 | pod
270 | podst
271 | pokl
272 | polit
273 | politol
274 | polygr
275 | pomn
276 | popl
277 | por
278 | porad
279 | porov
280 | posch
281 | potrav
282 | použ
283 | poz
284 | pozit
285 | poľ
286 | poľno
287 | poľnohosp
288 | poľov
289 | pošt
290 | pož
291 | prac
292 | predl
293 | pren
294 | prep
295 | preuk
296 | priezv
297 | Priezv
298 | privl
299 | prof
300 | práv
301 | príd
302 | príj
303 | prík
304 | príp
305 | prír
306 | prísl
307 | príslov
308 | príč
309 | psych
310 | publ
311 | pís
312 | písm
313 | pôv
314 | refl
315 | reg
316 | rep
317 | resp
318 | rozk
319 | rozlič
320 | rozpráv
321 | roč
322 | Roč
323 | ryb
324 | rádiotech
325 | rím
326 | samohl
327 | semest
328 | sev
329 | severoamer
330 | severových
331 | severozáp
332 | sg
333 | skr
334 | skup
335 | sl
336 | Sloven
337 | soc
338 | soch
339 | sociol
340 | sp
341 | spol
342 | Spol
343 | spoloč
344 | spoluhl
345 | správ
346 | spôs
347 | st
348 | star
349 | starogréc
350 | starorím
351 | s.r.o
352 | stol
353 | stor
354 | str
355 | stredoamer
356 | stredoškol
357 | subj
358 | subst
359 | superl
360 | sv
361 | sz
362 | súkr
363 | súp
364 | súvzť
365 | tal
366 | Tal
367 | tech
368 | tel
369 | Tel
370 | telef
371 | teles
372 | telev
373 | teol
374 | trans
375 | turist
376 | tuzem
377 | typogr
378 | tzn
379 | tzv
380 | ukaz
381 | ul
382 | Ul
383 | umel
384 | univ
385 | ust
386 | ved
387 | vedľ
388 | verb
389 | veter
390 | vin
391 | viď
392 | vl
393 | vod
394 | vodohosp
395 | pnl
396 | vulg
397 | vyj
398 | vys
399 | vysokoškol
400 | vzťaž
401 | vôb
402 | vých
403 | výd
404 | výrob
405 | výsk
406 | výsl
407 | výtv
408 | výtvar
409 | význ
410 | včel
411 | vš
412 | všeob
413 | zahr
414 | zar
415 | zariad
416 | zast
417 | zastar
418 | zastaráv
419 | zb
420 | zdravot
421 | združ
422 | zjemn
423 | zlat
424 | zn
425 | Zn
426 | zool
427 | zr
428 | zried
429 | zv
430 | záhr
431 | zák
432 | zákl
433 | zám
434 | záp
435 | západoeur
436 | zázn
437 | územ
438 | účt
439 | čast
440 | čes
441 | Čes
442 | čl
443 | čísl
444 | živ
445 | pr
446 | fak
447 | Kr
448 | p.n.l
449 | A
450 | B
451 | C
452 | D
453 | E
454 | F
455 | G
456 | H
457 | I
458 | J
459 | K
460 | L
461 | M
462 | N
463 | O
464 | P
465 | Q
466 | R
467 | S
468 | T
469 | U
470 | V
471 | W
472 | X
473 | Y
474 | Z
475 |
--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.sl:
--------------------------------------------------------------------------------
1 | dr
2 | Dr
3 | itd
4 | itn
5 | št #NUMERIC_ONLY#
6 | Št #NUMERIC_ONLY#
7 | d
8 | jan
9 | Jan
10 | feb
11 | Feb
12 | mar
13 | Mar
14 | apr
15 | Apr
16 | jun
17 | Jun
18 | jul
19 | Jul
20 | avg
21 | Avg
22 | sept
23 | Sept
24 | sep
25 | Sep
26 | okt
27 | Okt
28 | nov
29 | Nov
30 | dec
31 | Dec
32 | tj
33 | Tj
34 | npr
35 | Npr
36 | sl
37 | Sl
38 | op
39 | Op
40 | gl
41 | Gl
42 | oz
43 | Oz
44 | prev
45 | dipl
46 | ing
47 | prim
48 | Prim
49 | cf
50 | Cf
51 | gl
52 | Gl
53 | A
54 | B
55 | C
56 | D
57 | E
58 | F
59 | G
60 | H
61 | I
62 | J
63 | K
64 | L
65 | M
66 | N
67 | O
68 | P
69 | Q
70 | R
71 | S
72 | T
73 | U
74 | V
75 | W
76 | X
77 | Y
78 | Z
79 |
--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.sv:
--------------------------------------------------------------------------------
1 | #single upper case letters are usually initials
2 | A
3 | B
4 | C
5 | D
6 | E
7 | F
8 | G
9 | H
10 | I
11 | J
12 | K
13 | L
14 | M
15 | N
16 | O
17 | P
18 | Q
19 | R
20 | S
21 | T
22 | U
23 | V
24 | W
25 | X
26 | Y
27 | Z
28 | #misc abbreviations
29 | AB
30 | G
31 | VG
32 | dvs
33 | etc
34 | from
35 | iaf
36 | jfr
37 | kl
38 | kr
39 | mao
40 | mfl
41 | mm
42 | osv
43 | pga
44 | tex
45 | tom
46 | vs
47 |
--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.ta:
--------------------------------------------------------------------------------
1 | #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2 | #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3 |
4 | #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
5 | #usually upper case letters are initials in a name
6 | அ
7 | ஆ
8 | இ
9 | ஈ
10 | உ
11 | ஊ
12 | எ
13 | ஏ
14 | ஐ
15 | ஒ
16 | ஓ
17 | ஔ
18 | ஃ
19 | க
20 | கா
21 | கி
22 | கீ
23 | கு
24 | கூ
25 | கெ
26 | கே
27 | கை
28 | கொ
29 | கோ
30 | கௌ
31 | க்
32 | ச
33 | சா
34 | சி
35 | சீ
36 | சு
37 | சூ
38 | செ
39 | சே
40 | சை
41 | சொ
42 | சோ
43 | சௌ
44 | ச்
45 | ட
46 | டா
47 | டி
48 | டீ
49 | டு
50 | டூ
51 | டெ
52 | டே
53 | டை
54 | டொ
55 | டோ
56 | டௌ
57 | ட்
58 | த
59 | தா
60 | தி
61 | தீ
62 | து
63 | தூ
64 | தெ
65 | தே
66 | தை
67 | தொ
68 | தோ
69 | தௌ
70 | த்
71 | ப
72 | பா
73 | பி
74 | பீ
75 | பு
76 | பூ
77 | பெ
78 | பே
79 | பை
80 | பொ
81 | போ
82 | பௌ
83 | ப்
84 | ற
85 | றா
86 | றி
87 | றீ
88 | று
89 | றூ
90 | றெ
91 | றே
92 | றை
93 | றொ
94 | றோ
95 | றௌ
96 | ற்
97 | ய
98 | யா
99 | யி
100 | யீ
101 | யு
102 | யூ
103 | யெ
104 | யே
105 | யை
106 | யொ
107 | யோ
108 | யௌ
109 | ய்
110 | ர
111 | ரா
112 | ரி
113 | ரீ
114 | ரு
115 | ரூ
116 | ரெ
117 | ரே
118 | ரை
119 | ரொ
120 | ரோ
121 | ரௌ
122 | ர்
123 | ல
124 | லா
125 | லி
126 | லீ
127 | லு
128 | லூ
129 | லெ
130 | லே
131 | லை
132 | லொ
133 | லோ
134 | லௌ
135 | ல்
136 | வ
137 | வா
138 | வி
139 | வீ
140 | வு
141 | வூ
142 | வெ
143 | வே
144 | வை
145 | வொ
146 | வோ
147 | வௌ
148 | வ்
149 | ள
150 | ளா
151 | ளி
152 | ளீ
153 | ளு
154 | ளூ
155 | ளெ
156 | ளே
157 | ளை
158 | ளொ
159 | ளோ
160 | ளௌ
161 | ள்
162 | ழ
163 | ழா
164 | ழி
165 | ழீ
166 | ழு
167 | ழூ
168 | ழெ
169 | ழே
170 | ழை
171 | ழொ
172 | ழோ
173 | ழௌ
174 | ழ்
175 | ங
176 | ஙா
177 | ஙி
178 | ஙீ
179 | ஙு
180 | ஙூ
181 | ஙெ
182 | ஙே
183 | ஙை
184 | ஙொ
185 | ஙோ
186 | ஙௌ
187 | ங்
188 | ஞ
189 | ஞா
190 | ஞி
191 | ஞீ
192 | ஞு
193 | ஞூ
194 | ஞெ
195 | ஞே
196 | ஞை
197 | ஞொ
198 | ஞோ
199 | ஞௌ
200 | ஞ்
201 | ண
202 | ணா
203 | ணி
204 | ணீ
205 | ணு
206 | ணூ
207 | ணெ
208 | ணே
209 | ணை
210 | ணொ
211 | ணோ
212 | ணௌ
213 | ண்
214 | ந
215 | நா
216 | நி
217 | நீ
218 | நு
219 | நூ
220 | நெ
221 | நே
222 | நை
223 | நொ
224 | நோ
225 | நௌ
226 | ந்
227 | ம
228 | மா
229 | மி
230 | மீ
231 | மு
232 | மூ
233 | மெ
234 | மே
235 | மை
236 | மொ
237 | மோ
238 | மௌ
239 | ம்
240 | ன
241 | னா
242 | னி
243 | னீ
244 | னு
245 | னூ
246 | னெ
247 | னே
248 | னை
249 | னொ
250 | னோ
251 | னௌ
252 | ன்
253 |
254 |
255 | #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
256 | திரு
257 | திருமதி
258 | வண
259 | கௌரவ
260 |
261 |
262 | #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
263 | உ.ம்
264 | #கா.ம்
265 | #எ.ம்
266 |
267 |
268 | #Numbers only. These should only induce breaks when followed by a numeric sequence
269 | # add NUMERIC_ONLY after the word for this function
270 | #This case is mostly for the english "No." which can either be a sentence of its own, or
271 | #if followed by a number, a non-breaking prefix
272 | No #NUMERIC_ONLY#
273 | Nos
274 | Art #NUMERIC_ONLY#
275 | Nr
276 | pp #NUMERIC_ONLY#
277 |
--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.yue:
--------------------------------------------------------------------------------
1 | #
2 | # Cantonese (Chinese)
3 | #
4 | # Anything in this file, followed by a period,
5 | # does NOT indicate an end-of-sentence marker.
6 | #
7 | # English/Euro-language given-name initials (appearing in
8 | # news, periodicals, etc.)
9 | A
10 | Ā
11 | B
12 | C
13 | Č
14 | D
15 | E
16 | Ē
17 | F
18 | G
19 | Ģ
20 | H
21 | I
22 | Ī
23 | J
24 | K
25 | Ķ
26 | L
27 | Ļ
28 | M
29 | N
30 | Ņ
31 | O
32 | P
33 | Q
34 | R
35 | S
36 | Š
37 | T
38 | U
39 | Ū
40 | V
41 | W
42 | X
43 | Y
44 | Z
45 | Ž
46 |
47 | # Numbers only. These should only induce breaks when followed by
48 | # a numeric sequence.
49 | # Add NUMERIC_ONLY after the word for this function. This case is
50 | # mostly for the english "No." which can either be a sentence of its
51 | # own, or if followed by a number, a non-breaking prefix.
52 | No #NUMERIC_ONLY#
53 | Nr #NUMERIC_ONLY#
54 |
--------------------------------------------------------------------------------
/tools/nonbreaking_prefixes/nonbreaking_prefix.zh:
--------------------------------------------------------------------------------
1 | #
2 | # Mandarin (Chinese)
3 | #
4 | # Anything in this file, followed by a period,
5 | # does NOT indicate an end-of-sentence marker.
6 | #
7 | # English/Euro-language given-name initials (appearing in
8 | # news, periodicals, etc.)
9 | A
10 | Ā
11 | B
12 | C
13 | Č
14 | D
15 | E
16 | Ē
17 | F
18 | G
19 | Ģ
20 | H
21 | I
22 | Ī
23 | J
24 | K
25 | Ķ
26 | L
27 | Ļ
28 | M
29 | N
30 | Ņ
31 | O
32 | P
33 | Q
34 | R
35 | S
36 | Š
37 | T
38 | U
39 | Ū
40 | V
41 | W
42 | X
43 | Y
44 | Z
45 | Ž
46 |
47 | # Numbers only. These should only induce breaks when followed by
48 | # a numeric sequence.
49 | # Add NUMERIC_ONLY after the word for this function. This case is
50 | # mostly for the english "No." which can either be a sentence of its
51 | # own, or if followed by a number, a non-breaking prefix.
52 | No #NUMERIC_ONLY#
53 | Nr #NUMERIC_ONLY#
54 |
--------------------------------------------------------------------------------
/tools/release_model.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import argparse
3 | import torch
4 |
def release_model(model_path, output_path):
    """Write a copy of a checkpoint with its optimizer state removed.

    Args:
        model_path: filename of the checkpoint to read (*.pt).
        output_path: filename the stripped checkpoint is written to (*.pt).
    """
    # Map every storage to the CPU while loading so that a checkpoint saved
    # from a GPU run can still be released on a machine without that device.
    checkpoint = torch.load(model_path, map_location=torch.device("cpu"))
    checkpoint['optim'] = None
    torch.save(checkpoint, output_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Removes the optim data of PyTorch models")
    parser.add_argument("--model", "-m",
                        help="The model filename (*.pt)", required=True)
    parser.add_argument("--output", "-o",
                        help="The output filename (*.pt)", required=True)
    opt = parser.parse_args()

    release_model(opt.model, opt.output)
17 |
--------------------------------------------------------------------------------
/tools/test_rouge.py:
--------------------------------------------------------------------------------
1 | # -*- encoding: utf-8 -*-
2 | import argparse
3 | import os
4 | import time
5 | import pyrouge
6 | import shutil
7 | import sys
8 | import codecs
9 |
10 | from onmt.utils.logging import init_logger, logger
11 |
12 |
def test_rouge(cand, ref):
    """Calculate ROUGE scores of sequences passed as an iterator
    e.g. a list of str, an open file, StringIO or even sys.stdin

    Each candidate/reference pair is written to its own file inside a
    scratch directory created in the current working directory; pyrouge is
    then run over those files. The scratch directory is removed before
    returning, whether or not the evaluation succeeded.

    Args:
        cand: iterable of candidate lines.
        ref: iterable of reference lines; must yield as many lines as
            ``cand``. Pairs whose reference is empty after stripping are
            not written out.

    Returns:
        The dictionary built by ``pyrouge.Rouge155.output_to_dict`` from
        the evaluation output.

    Raises:
        AssertionError: if ``cand`` and ``ref`` have different lengths.
    """
    # Function-scope import: the file header does not import tempfile.
    import tempfile

    current_time = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime())
    # The timestamp alone only has one-second resolution, so two runs started
    # in the same second would share (and delete) each other's directory.
    # mkdtemp adds a unique suffix and creates the directory atomically.
    tmp_dir = tempfile.mkdtemp(
        prefix=".rouge-tmp-{}-".format(current_time), dir=".")
    try:
        candidate_dir = os.path.join(tmp_dir, "candidate")
        reference_dir = os.path.join(tmp_dir, "reference")
        os.mkdir(candidate_dir)
        os.mkdir(reference_dir)

        candidates = [line.strip() for line in cand]
        references = [line.strip() for line in ref]
        assert len(candidates) == len(references), \
            "candidate and reference line counts differ"

        for i, (candidate, reference) in enumerate(
                zip(candidates, references)):
            # Pairs with an empty reference are left out of the evaluation.
            if len(reference) < 1:
                continue
            cand_path = os.path.join(candidate_dir, "cand.{}.txt".format(i))
            with open(cand_path, "w", encoding="utf-8") as f:
                f.write(candidate)
            ref_path = os.path.join(reference_dir, "ref.{}.txt".format(i))
            with open(ref_path, "w", encoding="utf-8") as f:
                f.write(reference)

        r = pyrouge.Rouge155()
        # Keep the trailing slash the directories were originally passed with.
        r.model_dir = reference_dir + "/"
        r.system_dir = candidate_dir + "/"
        r.model_filename_pattern = 'ref.#ID#.txt'
        r.system_filename_pattern = r'cand.(\d+).txt'
        rouge_results = r.convert_and_evaluate()
        results_dict = r.output_to_dict(rouge_results)
        return results_dict
    finally:
        if os.path.isdir(tmp_dir):
            shutil.rmtree(tmp_dir)
49 |
50 |
def rouge_results_to_str(results_dict):
    """Format five F-scores from ``results_dict`` as a one-line summary.

    Args:
        results_dict: mapping that contains the keys ``rouge_1_f_score``,
            ``rouge_2_f_score``, ``rouge_3_f_score``, ``rouge_l_f_score``
            and ``rouge_su*_f_score``.

    Returns:
        str: the scores multiplied by 100 and printed with two decimals,
        separated by slashes after a fixed label.

    Raises:
        KeyError: if one of the keys above is missing.
    """
    score_keys = (
        "rouge_1_f_score",
        "rouge_2_f_score",
        "rouge_3_f_score",
        "rouge_l_f_score",
        "rouge_su*_f_score",
    )
    # Scale each score to a percentage, in the order the label lists them.
    percentages = [results_dict[key] * 100 for key in score_keys]
    template = ">> ROUGE(1/2/3/L/SU4): {:.2f}/{:.2f}/{:.2f}/{:.2f}/{:.2f}"
    return template.format(*percentages)
58 |
59 |
if __name__ == "__main__":
    # Command-line entry: score a candidate file (or stdin) against a
    # reference file and log the formatted ROUGE summary.
    init_logger('test_rouge.log')
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', type=str, default="candidate.txt",
                        help='candidate file')
    parser.add_argument('-r', type=str, default="reference.txt",
                        help='reference file')
    args = parser.parse_args()

    # "-c STDIN" (any letter case) reads the candidates from standard input.
    read_stdin = args.c.upper() == "STDIN"
    if read_stdin:
        candidates = sys.stdin
    else:
        candidates = codecs.open(args.c, encoding="utf-8")
    try:
        with codecs.open(args.r, encoding="utf-8") as references:
            results_dict = test_rouge(candidates, references)
    finally:
        # Close only the handle opened here; sys.stdin is left untouched.
        if not read_stdin:
            candidates.close()

    logger.info(rouge_results_to_str(results_dict))
76 |
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import torch
3 | import numpy as np
4 | import random
5 |
6 | from onmt.bin.train import main
7 |
8 |
def setup_seed(seed):
    """Seed the torch, torch.cuda, NumPy and stdlib ``random`` generators.

    Also sets ``torch.backends.cudnn.deterministic`` to ``True``.

    Args:
        seed: seed value passed unchanged to every seeding function.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed_all,
        np.random.seed,
        random.seed,
    )
    for seed_rng in seeders:
        seed_rng(seed)
    torch.backends.cudnn.deterministic = True
15 |
16 |
# Seed every RNG with a fixed value as soon as this module is loaded,
# whether it is executed as a script or imported.
setup_seed(seed=2020)

if __name__ == "__main__":
    # Delegate to the entry point imported from onmt.bin.train.
    main()
21 |
--------------------------------------------------------------------------------
/translate.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import torch
3 | import numpy as np
4 | import random
5 | from onmt.bin.translate import main
6 |
7 |
def setup_seed(seed):
    """Apply one seed to torch, torch.cuda, NumPy and stdlib ``random``.

    After seeding, ``torch.backends.cudnn.deterministic`` is set to ``True``.

    Args:
        seed: seed value passed unchanged to every seeding function.
    """
    seed_functions = [
        torch.manual_seed,
        torch.cuda.manual_seed_all,
        np.random.seed,
        random.seed,
    ]
    for apply_seed in seed_functions:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True
14 |
15 |
# Seed every RNG with a fixed value as soon as this module is loaded,
# whether it is executed as a script or imported.
setup_seed(seed=2020)

if __name__ == "__main__":
    # Delegate to the entry point imported from onmt.bin.translate.
    main()
20 |
--------------------------------------------------------------------------------