├── LICENSE
├── README.md
├── encdec
│ ├── LICENSE
│ ├── README.md
│ ├── distributed_train.py
│ ├── docs
│ │ ├── Makefile
│ │ ├── _static
│ │ │ └── theme_overrides.css
│ │ ├── command_line_tools.rst
│ │ ├── conf.py
│ │ ├── criterions.rst
│ │ ├── data.rst
│ │ ├── docutils.conf
│ │ ├── getting_started.rst
│ │ ├── index.rst
│ │ ├── lr_scheduler.rst
│ │ ├── make.bat
│ │ ├── models.rst
│ │ ├── modules.rst
│ │ ├── optim.rst
│ │ ├── overview.rst
│ │ ├── requirements.txt
│ │ ├── tasks.rst
│ │ ├── tutorial_classifying_names.rst
│ │ └── tutorial_simple_lstm.rst
│ ├── eval_lm.py
│ ├── examples
│ │ ├── .gitignore
│ │ ├── language_model
│ │ │ ├── README.md
│ │ │ └── prepare-wikitext-103.sh
│ │ ├── stories
│ │ │ └── README.md
│ │ └── translation
│ │   ├── README.md
│ │   ├── prepare-iwslt14.sh
│ │   ├── prepare-wmt14en2de.sh
│ │   └── prepare-wmt14en2fr.sh
│ ├── fairseq
│ │ ├── __init__.py
│ │ ├── bleu.py
│ │ ├── clib
│ │ │ └── libbleu
│ │ │   ├── libbleu.cpp
│ │ │   └── module.cpp
│ │ ├── criterions
│ │ │ ├── __init__.py
│ │ │ ├── adaptive_loss.py
│ │ │ ├── cross_entropy.py
│ │ │ ├── fairseq_criterion.py
│ │ │ └── label_smoothed_cross_entropy.py
│ │ ├── data
│ │ │ ├── __init__.py
│ │ │ ├── append_eos_dataset.py
│ │ │ ├── backtranslation_dataset.py
│ │ │ ├── concat_dataset.py
│ │ │ ├── data_utils.py
│ │ │ ├── dictionary.py
│ │ │ ├── fairseq_dataset.py
│ │ │ ├── indexed_dataset.py
│ │ │ ├── iterators.py
│ │ │ ├── language_pair_dataset.py
│ │ │ ├── monolingual_dataset.py
│ │ │ ├── noising.py
│ │ │ └── token_block_dataset.py
│ │ ├── distributed_utils.py
│ │ ├── meters.py
│ │ ├── models
│ │ │ ├── __init__.py
│ │ │ ├── composite_encoder.py
│ │ │ ├── distributed_fairseq_model.py
│ │ │ ├── fairseq_decoder.py
│ │ │ ├── fairseq_encoder.py
│ │ │ ├── fairseq_incremental_decoder.py
│ │ │ ├── fairseq_model.py
│ │ │ ├── fconv.py
│ │ │ ├── fconv_self_att.py
│ │ │ ├── lstm.py
│ │ │ └── transformer.py
│ │ ├── modules
│ │ │ ├── __init__.py
│ │ │ ├── adaptive_softmax.py
│ │ │ ├── beamable_mm.py
│ │ │ ├── character_token_embedder.py
│ │ │ ├── conv_tbc.py
│ │ │ ├── downsampled_multihead_attention.py
│ │ │ ├── grad_multiply.py
│ │ │ ├── highway.py
│ │ │ ├── learned_positional_embedding.py
│ │ │ ├── linearized_convolution.py
│ │ │ ├── multihead_attention.py
│ │ │ ├── scalar_bias.py
│ │ │ └── sinusoidal_positional_embedding.py
│ │ ├── multiprocessing_pdb.py
│ │ ├── optim
│ │ │ ├── __init__.py
│ │ │ ├── adagrad.py
│ │ │ ├── adam.py
│ │ │ ├── fairseq_optimizer.py
│ │ │ ├── fp16_optimizer.py
│ │ │ ├── lr_scheduler
│ │ │ │ ├── __init__.py
│ │ │ │ ├── cosine_lr_scheduler.py
│ │ │ │ ├── fairseq_lr_scheduler.py
│ │ │ │ ├── fixed_schedule.py
│ │ │ │ ├── inverse_square_root_schedule.py
│ │ │ │ ├── reduce_lr_on_plateau.py
│ │ │ │ └── triangular_lr_scheduler.py
│ │ │ ├── nag.py
│ │ │ └── sgd.py
│ │ ├── options.py
│ │ ├── progress_bar.py
│ │ ├── search.py
│ │ ├── sequence_generator.py
│ │ ├── sequence_scorer.py
│ │ ├── tasks
│ │ │ ├── __init__.py
│ │ │ ├── fairseq_task.py
│ │ │ ├── language_modeling.py
│ │ │ └── translation.py
│ │ ├── tokenizer.py
│ │ ├── trainer.py
│ │ └── utils.py
│ ├── generate.py
│ ├── interactive.py
│ ├── multiprocessing_train.py
│ ├── preprocess.py
│ ├── requirements.txt
│ ├── rerank.py
│ ├── score.py
│ ├── scripts
│ │ ├── __init__.py
│ │ ├── average_checkpoints.py
│ │ ├── build_sym_alignment.py
│ │ ├── convert_dictionary.lua
│ │ ├── convert_model.lua
│ │ └── read_binarized.py
│ ├── setup.py
│ ├── tests
│ │ ├── __init__.py
│ │ ├── test_average_checkpoints.py
│ │ ├── test_backtranslation_dataset.py
│ │ ├── test_binaries.py
│ │ ├── test_character_token_embedder.py
│ │ ├── test_convtbc.py
│ │ ├── test_dictionary.py
│ │ ├── test_iterators.py
│ │ ├── test_label_smoothing.py
│ │ ├── test_noising.py
│ │ ├── test_reproducibility.py
│ │ ├── test_sequence_generator.py
│ │ ├── test_sequence_scorer.py
│ │ ├── test_train.py
│ │ ├── test_utils.py
│ │ └── utils.py
│ └── train.py
└── eval
  ├── LICENSE
  ├── README.md
  ├── calculate_variance_from_fixlength.py
  ├── eval.sh
  ├── make_rouge.py
  └── prepare4rouge-simple.pl
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2019, Sho Takase
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | 1. Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | 3. Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Positional Encoding to Control Output Sequence Length
2 |
3 | This repository contains source files we used in our paper
4 | >[Positional Encoding to Control Output Sequence Length](https://www.aclweb.org/anthology/N19-1401)
5 |
6 | >Sho Takase, Naoaki Okazaki
7 |
8 | > Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies
9 |
10 |
11 | ## Requirements
12 |
13 | - Python 3.6 or later for training
14 | - Python 2.7 for calculating ROUGE
15 | - PyTorch 0.4
16 | - To use a newer version of PyTorch (e.g., 1.4.0), please use [this code](https://github.com/takase/alone_seq2seq) without the one-emb option.
17 |
18 | ## Test data
19 |
20 | Test data used in our paper for each length
21 |
22 | - [https://drive.google.com/open?id=1teets0SZ82cdwQG0s454Y7JFuoutOawb](https://drive.google.com/open?id=1teets0SZ82cdwQG0s454Y7JFuoutOawb)
23 | - Each file contains lines of the form ```SOURCE PART <tab> HEADLINE```
24 |
25 | ## Pre-trained model
26 |
27 | The following file contains a pre-trained LRPE + PE model for the English dataset. This model outputs ``` @@@@ ``` as a space, i.e., a word segmentation marker.
28 |
29 | The file also contains the BPE code used to split plain English text into BPE units with [this code](https://github.com/rsennrich/subword-nmt).
30 |
31 | [https://drive.google.com/file/d/15Sy8rv6Snw6Nso7T5MxYHSAZDdieXpE7/view?usp=sharing](https://drive.google.com/file/d/15Sy8rv6Snw6Nso7T5MxYHSAZDdieXpE7/view?usp=sharing)
32 |
33 | ## Acknowledgements
34 |
35 | A large portion of this repo is borrowed from the following repos: [https://github.com/pytorch/fairseq](https://github.com/pytorch/fairseq) and [https://github.com/facebookarchive/NAMAS](https://github.com/facebookarchive/NAMAS).
36 |
--------------------------------------------------------------------------------
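Editor's note: the test files described in the README above are tab-separated (source part, then headline). A minimal sketch of reading that format and checking headline lengths in characters; this snippet is not part of the repository and the file name is hypothetical.

```
# Sketch only: read a length-specific test file whose lines are
# "SOURCE PART<tab>HEADLINE" and report headline lengths in characters.
def read_pairs(path):
    with open(path, encoding="utf-8") as f:
        for line in f:
            source, headline = line.rstrip("\n").split("\t", 1)
            yield source, headline

if __name__ == "__main__":
    pairs = list(read_pairs("test_length75.txt"))  # hypothetical file name
    lengths = [len(headline) for _, headline in pairs]
    print(len(pairs), "pairs; longest headline:", max(lengths), "characters")
```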
/encdec/LICENSE:
--------------------------------------------------------------------------------
1 | BSD License
2 |
3 | For fairseq software
4 |
5 | Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
6 |
7 | Redistribution and use in source and binary forms, with or without modification,
8 | are permitted provided that the following conditions are met:
9 |
10 | * Redistributions of source code must retain the above copyright notice, this
11 | list of conditions and the following disclaimer.
12 |
13 | * Redistributions in binary form must reproduce the above copyright notice,
14 | this list of conditions and the following disclaimer in the documentation
15 | and/or other materials provided with the distribution.
16 |
17 | * Neither the name Facebook nor the names of its contributors may be used to
18 | endorse or promote products derived from this software without specific
19 | prior written permission.
20 |
21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
28 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 |
--------------------------------------------------------------------------------
/encdec/README.md:
--------------------------------------------------------------------------------
1 | ## Preprocessing
2 |
3 | - Construction of binarized data with shared vocabulary
4 |
5 | - Input data is plain text, as in the following example
6 |
7 | ```
8 | australia 's current account deficit shrunk by a record 7.07 billion dollars -lrb- 6.04 billion us -rrb- in the june quarter due to soaring commodity prices , figures released monday showed .
9 | at least two people were killed in a suspected bomb attack on a passenger bus in the strife-torn southern philippines on monday , the military said .
10 | australian shares closed down 0.3 percent monday following a weak lead from the united states and lower commodity prices , dealers said .
11 | ```
12 |
13 | ```
14 | python preprocess.py --source-lang SOURCE_SUFFIX --target-lang TARGET_SUFFIX
15 | --trainpref PREFIX_PATH_TO_TRAIN_DATA --validpref PREFIX_PATH_TO_VALID_DATA
16 | --joined-dictionary --destdir PREPROCESS_PATH
17 | ```
18 |
19 | - If the source file of the training data is named text.source and the target file is named text.target, set SOURCE_SUFFIX=source, TARGET_SUFFIX=target, and PREFIX_PATH_TO_TRAIN_DATA=text
20 |
21 | - Preprocessing the test file
22 |
23 | ```
24 | python preprocess.py --source-lang SOURCE_SUFFIX --target-lang TARGET_SUFFIX
25 | --tgtdict PATH_TO_TARGET_DICT --srcdict PATH_TO_SOURCE_DICT
26 | --testpref PREFIX_PATH_TO_TEST_DATA --destdir PREPROCESS_TEST_PATH
27 | ```
28 |
29 | ## Training
30 |
31 | - E.g., training Transformer + LRPE + PE on a 4-GPU machine
32 |
33 | - +LRPE: --represent-length-by-lrpe
34 |
35 | - +LDPE: --represent-length-by-ldpe
36 |
37 | - +PE: --ordinary-sinpos
38 |
39 | ```
40 | python train.py PREPROCESS_PATH --source-lang SOURCE_SUFFIX --target-lang TARGET_SUFFIX
41 | --arch transformer_wmt_en_de --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0
42 | --lr-scheduler inverse_sqrt --warmup-init-lr 1e-07 --warmup-updates 4000 --lr 0.001 --min-lr 1e-09
43 | --dropout 0.3 --weight-decay 0.0 --criterion label_smoothed_cross_entropy --label-smoothing 0.1
44 | --max-tokens 3584 --seed 2723 --max-epoch 100 --update-freq 16 --share-all-embeddings
45 | --represent-length-by-lrpe --ordinary-sinpos --save-dir PATH_TO_SAVE_MODEL
46 | ```
47 |
48 | - If you run training on 1 GPU, please change the update frequency from 16 to 64 (--update-freq 64)
49 |
50 | - Averaging the last 10 checkpoints
51 |
52 | ```
53 | python scripts/average_checkpoints.py --inputs PATH_TO_SAVE_MODEL --num-epoch-checkpoints 10 --output PATH_TO_AVERAGED_MODEL
54 | ```
55 |
56 | ## Generation
57 |
58 | 1. Generate headlines under a constraint of 75 characters
59 |
60 | ```
61 | python generate.py PREPROCESS_TEST_PATH --source-lang SOURCE_SUFFIX --target-lang TARGET_SUFFIX
62 | --path PATH_TO_AVERAGED_MODEL --desired-length 75 --batch-size 32 --beam 5
63 | | grep '^H' | sed 's/^H\-//g' | sort -t 'TAB' -k1,1 -n | cut -f 3-
64 | ```
65 |
66 | 2. Generate n-best headlines and re-rank them
67 |
68 | - Generate n-best headlines (n = 20 in the following example)
69 |
70 | ```
71 | python generate.py PREPROCESS_TEST_PATH --source-lang SOURCE_SUFFIX --target-lang TARGET_SUFFIX
72 | --path PATH_TO_AVERAGED_MODEL --batch-size 32 --beam 20 --nbest 20 --desired-length 75 > nbest.txt
73 | ```
74 |
75 | - Re-ranking n-best headlines
76 |
77 | ```
78 | python rerank.py --cand nbest.txt -m --source SOURCE_FILE
79 | ```
80 |
--------------------------------------------------------------------------------
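Editor's note: since generation above is run with `--desired-length 75`, a quick sanity check on the extracted headlines can be useful. A minimal sketch, not part of the repository; it assumes one headline per line and counts raw characters including spaces.

```
# Sketch only: count how many generated headlines exceed the desired
# character budget passed to generate.py via --desired-length.
import sys

def main(path, budget=75):
    over = 0
    total = 0
    for line in open(path, encoding="utf-8"):
        total += 1
        if len(line.rstrip("\n")) > budget:
            over += 1
    print("{}/{} headlines exceed {} characters".format(over, total, budget))

if __name__ == "__main__":
    main(sys.argv[1], int(sys.argv[2]) if len(sys.argv) > 2 else 75)
```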
/encdec/distributed_train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3 -u
2 | # Copyright (c) 2017-present, Facebook, Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the license found in the LICENSE file in
6 | # the root directory of this source tree. An additional grant of patent rights
7 | # can be found in the PATENTS file in the same directory.
8 |
9 | import os
10 | import socket
11 | import subprocess
12 |
13 | from train import main as single_process_main
14 | from fairseq import distributed_utils, options
15 |
16 |
17 | def main(args):
18 | if args.distributed_init_method is None and args.distributed_port > 0:
19 | # We can determine the init method automatically for Slurm.
20 | node_list = os.environ.get('SLURM_JOB_NODELIST')
21 | if node_list is not None:
22 | try:
23 | hostnames = subprocess.check_output(['scontrol', 'show', 'hostnames', node_list])
24 | args.distributed_init_method = 'tcp://{host}:{port}'.format(
25 | host=hostnames.split()[0].decode('utf-8'),
26 | port=args.distributed_port)
27 | args.distributed_rank = int(os.environ.get('SLURM_PROCID'))
28 | args.device_id = int(os.environ.get('SLURM_LOCALID'))
29 | except subprocess.CalledProcessError as e: # scontrol failed
30 | raise e
31 | except FileNotFoundError as e: # Slurm is not installed
32 | pass
33 | if args.distributed_init_method is None and args.distributed_port is None:
34 | raise ValueError('--distributed-init-method or --distributed-port '
35 | 'must be specified for distributed training')
36 |
37 | args.distributed_rank = distributed_utils.distributed_init(args)
38 | print('| initialized host {} as rank {}'.format(socket.gethostname(), args.distributed_rank))
39 | single_process_main(args)
40 |
41 |
42 | if __name__ == '__main__':
43 | parser = options.get_training_parser()
44 | args = options.parse_args_and_arch(parser)
45 | main(args)
46 |
--------------------------------------------------------------------------------
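Editor's note: the Slurm branch in distributed_train.py above turns `SLURM_JOB_NODELIST` into a TCP init method by asking `scontrol` for the expanded hostnames and taking the first one. A minimal sketch of that resolution in isolation; the node list and port are made-up example values and the snippet only runs where `scontrol` is available.

```
# Sketch only: how the script derives the rendezvous address under Slurm.
import subprocess

node_list = "node[017-018]"  # example value of SLURM_JOB_NODELIST
port = 12345                 # example value of --distributed-port
hostnames = subprocess.check_output(['scontrol', 'show', 'hostnames', node_list])
init_method = 'tcp://{host}:{port}'.format(
    host=hostnames.split()[0].decode('utf-8'), port=port)
print(init_method)  # e.g. tcp://node017:12345
```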
/encdec/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = python -msphinx
7 | SPHINXPROJ = fairseq
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/encdec/docs/_static/theme_overrides.css:
--------------------------------------------------------------------------------
1 | .wy-table-responsive table td kbd {
2 | white-space: nowrap;
3 | }
4 | .wy-table-responsive table td {
5 | white-space: normal !important;
6 | }
7 | .wy-table-responsive {
8 | overflow: visible !important;
9 | }
10 |
--------------------------------------------------------------------------------
/encdec/docs/command_line_tools.rst:
--------------------------------------------------------------------------------
1 | .. _Command-line Tools:
2 |
3 | Command-line Tools
4 | ==================
5 |
6 | Fairseq provides several command-line tools for training and evaluating models:
7 |
8 | - :ref:`preprocess.py`: Data pre-processing: build vocabularies and binarize training data
9 | - :ref:`train.py`: Train a new model on one or multiple GPUs
10 | - :ref:`generate.py`: Translate pre-processed data with a trained model
11 | - :ref:`interactive.py`: Translate raw text with a trained model
12 | - :ref:`score.py`: BLEU scoring of generated translations against reference translations
13 | - :ref:`eval_lm.py`: Language model evaluation
14 |
15 |
16 | .. _preprocess.py:
17 |
18 | preprocess.py
19 | ~~~~~~~~~~~~~
20 | .. automodule:: preprocess
21 |
22 | .. argparse::
23 | :module: preprocess
24 | :func: get_parser
25 | :prog: preprocess.py
26 |
27 |
28 | .. _train.py:
29 |
30 | train.py
31 | ~~~~~~~~
32 | .. automodule:: train
33 |
34 | .. argparse::
35 | :module: fairseq.options
36 | :func: get_training_parser
37 | :prog: train.py
38 |
39 |
40 | .. _generate.py:
41 |
42 | generate.py
43 | ~~~~~~~~~~~
44 | .. automodule:: generate
45 |
46 | .. argparse::
47 | :module: fairseq.options
48 | :func: get_generation_parser
49 | :prog: generate.py
50 |
51 |
52 | .. _interactive.py:
53 |
54 | interactive.py
55 | ~~~~~~~~~~~~~~
56 | .. automodule:: interactive
57 |
58 | .. argparse::
59 | :module: fairseq.options
60 | :func: get_interactive_generation_parser
61 | :prog: interactive.py
62 |
63 |
64 | .. _score.py:
65 |
66 | score.py
67 | ~~~~~~~~
68 | .. automodule:: score
69 |
70 | .. argparse::
71 | :module: score
72 | :func: get_parser
73 | :prog: score.py
74 |
75 |
76 | .. _eval_lm.py:
77 |
78 | eval_lm.py
79 | ~~~~~~~~~~
80 | .. automodule:: eval_lm
81 |
82 | .. argparse::
83 | :module: fairseq.options
84 | :func: get_eval_lm_parser
85 | :prog: eval_lm.py
86 |
--------------------------------------------------------------------------------
/encdec/docs/conf.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | #
4 | # fairseq documentation build configuration file, created by
5 | # sphinx-quickstart on Fri Aug 17 21:45:30 2018.
6 | #
7 | # This file is execfile()d with the current directory set to its
8 | # containing dir.
9 | #
10 | # Note that not all possible configuration values are present in this
11 | # autogenerated file.
12 | #
13 | # All configuration values have a default; values that are commented out
14 | # serve to show the default.
15 |
16 | # If extensions (or modules to document with autodoc) are in another directory,
17 | # add these directories to sys.path here. If the directory is relative to the
18 | # documentation root, use os.path.abspath to make it absolute, like shown here.
19 |
20 | import os
21 | import sys
22 |
23 | # source code directory, relative to this file, for sphinx-autobuild
24 | sys.path.insert(0, os.path.abspath('..'))
25 |
26 | source_suffix = ['.rst']
27 |
28 | # -- General configuration ------------------------------------------------
29 |
30 | # If your documentation needs a minimal Sphinx version, state it here.
31 | #
32 | # needs_sphinx = '1.0'
33 |
34 | # Add any Sphinx extension module names here, as strings. They can be
35 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
36 | # ones.
37 | extensions = [
38 | 'sphinx.ext.autodoc',
39 | 'sphinx.ext.intersphinx',
40 | 'sphinx.ext.viewcode',
41 | 'sphinx.ext.napoleon',
42 | 'sphinxarg.ext',
43 | ]
44 |
45 | # Add any paths that contain templates here, relative to this directory.
46 | templates_path = ['_templates']
47 |
48 | # The master toctree document.
49 | master_doc = 'index'
50 |
51 | # General information about the project.
52 | project = 'fairseq'
53 | copyright = '2018, Facebook AI Research (FAIR)'
54 | author = 'Facebook AI Research (FAIR)'
55 |
56 | github_doc_root = 'https://github.com/pytorch/fairseq/tree/master/docs/'
57 |
58 | # The version info for the project you're documenting, acts as replacement for
59 | # |version| and |release|, also used in various other places throughout the
60 | # built documents.
61 | #
62 | # The short X.Y version.
63 | version = '0.6.0'
64 | # The full version, including alpha/beta/rc tags.
65 | release = '0.6.0'
66 |
67 | # The language for content autogenerated by Sphinx. Refer to documentation
68 | # for a list of supported languages.
69 | #
70 | # This is also used if you do content translation via gettext catalogs.
71 | # Usually you set "language" from the command line for these cases.
72 | language = None
73 |
74 | # List of patterns, relative to source directory, that match files and
75 | # directories to ignore when looking for source files.
76 | # This patterns also effect to html_static_path and html_extra_path
77 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
78 |
79 | # The name of the Pygments (syntax highlighting) style to use.
80 | pygments_style = 'sphinx'
81 | highlight_language = 'python'
82 |
83 | # If true, `todo` and `todoList` produce output, else they produce nothing.
84 | todo_include_todos = False
85 |
86 |
87 | # -- Options for HTML output ----------------------------------------------
88 |
89 | # The theme to use for HTML and HTML Help pages. See the documentation for
90 | # a list of builtin themes.
91 | #
92 | html_theme = 'sphinx_rtd_theme'
93 |
94 | # Theme options are theme-specific and customize the look and feel of a theme
95 | # further. For a list of options available for each theme, see the
96 | # documentation.
97 | #
98 | # html_theme_options = {}
99 |
100 | # Add any paths that contain custom static files (such as style sheets) here,
101 | # relative to this directory. They are copied after the builtin static files,
102 | # so a file named "default.css" will overwrite the builtin "default.css".
103 | html_static_path = ['_static']
104 |
105 | html_context = {
106 | 'css_files': [
107 | '_static/theme_overrides.css', # override wide tables in RTD theme
108 | ],
109 | }
110 |
111 | # Custom sidebar templates, must be a dictionary that maps document names
112 | # to template names.
113 | #
114 | # This is required for the alabaster theme
115 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
116 | #html_sidebars = {
117 | # '**': [
118 | # 'about.html',
119 | # 'navigation.html',
120 | # 'relations.html', # needs 'show_related': True theme option to display
121 | # 'searchbox.html',
122 | # 'donate.html',
123 | # ]
124 | #}
125 |
126 |
127 | # Example configuration for intersphinx: refer to the Python standard library.
128 | intersphinx_mapping = {
129 | 'numpy': ('http://docs.scipy.org/doc/numpy/', None),
130 | 'python': ('https://docs.python.org/', None),
131 | 'torch': ('https://pytorch.org/docs/master/', None),
132 | }
133 |
--------------------------------------------------------------------------------
/encdec/docs/criterions.rst:
--------------------------------------------------------------------------------
1 | .. role:: hidden
2 | :class: hidden-section
3 |
4 | .. _Criterions:
5 |
6 | Criterions
7 | ==========
8 |
9 | .. automodule:: fairseq.criterions
10 | :members:
11 | .. autoclass:: fairseq.criterions.FairseqCriterion
12 | :members:
13 | :undoc-members:
14 |
--------------------------------------------------------------------------------
/encdec/docs/data.rst:
--------------------------------------------------------------------------------
1 | .. role:: hidden
2 | :class: hidden-section
3 |
4 | .. module:: fairseq.data
5 |
6 | Data Loading and Utilities
7 | ==========================
8 |
9 | .. _datasets:
10 |
11 | Datasets
12 | --------
13 |
14 | **Datasets** define the data format and provide helpers for creating
15 | mini-batches.
16 |
17 | .. autoclass:: fairseq.data.FairseqDataset
18 | :members:
19 | .. autoclass:: fairseq.data.LanguagePairDataset
20 | :members:
21 | .. autoclass:: fairseq.data.MonolingualDataset
22 | :members:
23 |
24 |
25 | Dictionary
26 | ----------
27 |
28 | .. autoclass:: fairseq.data.Dictionary
29 | :members:
30 |
31 |
32 | Iterators
33 | ---------
34 |
35 | .. autoclass:: fairseq.data.CountingIterator
36 | :members:
37 | .. autoclass:: fairseq.data.EpochBatchIterator
38 | :members:
39 | .. autoclass:: fairseq.data.GroupedIterator
40 | :members:
41 | .. autoclass:: fairseq.data.ShardedIterator
42 | :members:
43 |
--------------------------------------------------------------------------------
/encdec/docs/docutils.conf:
--------------------------------------------------------------------------------
1 | [writers]
2 | option-limit=0
3 |
--------------------------------------------------------------------------------
/encdec/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. fairseq documentation master file, created by
2 | sphinx-quickstart on Fri Aug 17 21:45:30 2018.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | :github_url: https://github.com/pytorch/fairseq
7 |
8 |
9 | fairseq documentation
10 | =====================
11 |
12 | Fairseq is a sequence modeling toolkit written in `PyTorch
13 | `_ that allows researchers and developers to
14 | train custom models for translation, summarization, language modeling and other
15 | text generation tasks.
16 |
17 | .. toctree::
18 | :maxdepth: 1
19 | :caption: Getting Started
20 |
21 | getting_started
22 | command_line_tools
23 |
24 | .. toctree::
25 | :maxdepth: 1
26 | :caption: Extending Fairseq
27 |
28 | overview
29 | tutorial_simple_lstm
30 | tutorial_classifying_names
31 |
32 | .. toctree::
33 | :maxdepth: 2
34 | :caption: Library Reference
35 |
36 | tasks
37 | models
38 | criterions
39 | optim
40 | lr_scheduler
41 | data
42 | modules
43 |
44 |
45 | Indices and tables
46 | ==================
47 |
48 | * :ref:`genindex`
49 | * :ref:`search`
50 |
--------------------------------------------------------------------------------
/encdec/docs/lr_scheduler.rst:
--------------------------------------------------------------------------------
1 | .. role:: hidden
2 | :class: hidden-section
3 |
4 | .. _Learning Rate Schedulers:
5 |
6 | Learning Rate Schedulers
7 | ========================
8 |
9 | TODO
10 |
11 | .. automodule:: fairseq.optim.lr_scheduler
12 | :members:
13 |
--------------------------------------------------------------------------------
/encdec/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=python -msphinx
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | set SPHINXPROJ=fairseq
13 |
14 | if "%1" == "" goto help
15 |
16 | %SPHINXBUILD% >NUL 2>NUL
17 | if errorlevel 9009 (
18 | echo.
19 | echo.The Sphinx module was not found. Make sure you have Sphinx installed,
20 | echo.then set the SPHINXBUILD environment variable to point to the full
21 | echo.path of the 'sphinx-build' executable. Alternatively you may add the
22 | echo.Sphinx directory to PATH.
23 | echo.
24 | echo.If you don't have Sphinx installed, grab it from
25 | echo.http://sphinx-doc.org/
26 | exit /b 1
27 | )
28 |
29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
30 | goto end
31 |
32 | :help
33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
34 |
35 | :end
36 | popd
37 |
--------------------------------------------------------------------------------
/encdec/docs/models.rst:
--------------------------------------------------------------------------------
1 | .. role:: hidden
2 | :class: hidden-section
3 |
4 | .. module:: fairseq.models
5 |
6 | .. _Models:
7 |
8 | Models
9 | ======
10 |
11 | A Model defines the neural network's ``forward()`` method and encapsulates all
12 | of the learnable parameters in the network. Each model also provides a set of
13 | named *architectures* that define the precise network configuration (e.g.,
14 | embedding dimension, number of layers, etc.).
15 |
16 | Both the model type and architecture are selected via the ``--arch``
17 | command-line argument. Once selected, a model may expose additional command-line
18 | arguments for further configuration.
19 |
20 | .. note::
21 |
22 | All fairseq Models extend :class:`BaseFairseqModel`, which in turn extends
23 | :class:`torch.nn.Module`. Thus any fairseq Model can be used as a
24 | stand-alone Module in other PyTorch code.
25 |
26 |
27 | Convolutional Neural Networks (CNN)
28 | -----------------------------------
29 |
30 | .. module:: fairseq.models.fconv
31 | .. autoclass:: fairseq.models.fconv.FConvModel
32 | :members:
33 | .. autoclass:: fairseq.models.fconv.FConvEncoder
34 | :members:
35 | :undoc-members:
36 | .. autoclass:: fairseq.models.fconv.FConvDecoder
37 | :members:
38 |
39 |
40 | Long Short-Term Memory (LSTM) networks
41 | --------------------------------------
42 |
43 | .. module:: fairseq.models.lstm
44 | .. autoclass:: fairseq.models.lstm.LSTMModel
45 | :members:
46 | .. autoclass:: fairseq.models.lstm.LSTMEncoder
47 | :members:
48 | .. autoclass:: fairseq.models.lstm.LSTMDecoder
49 | :members:
50 |
51 |
52 | Transformer (self-attention) networks
53 | -------------------------------------
54 |
55 | .. module:: fairseq.models.transformer
56 | .. autoclass:: fairseq.models.transformer.TransformerModel
57 | :members:
58 | .. autoclass:: fairseq.models.transformer.TransformerEncoder
59 | :members:
60 | .. autoclass:: fairseq.models.transformer.TransformerEncoderLayer
61 | :members:
62 | .. autoclass:: fairseq.models.transformer.TransformerDecoder
63 | :members:
64 | .. autoclass:: fairseq.models.transformer.TransformerDecoderLayer
65 | :members:
66 |
67 |
68 | Adding new models
69 | -----------------
70 |
71 | .. currentmodule:: fairseq.models
72 | .. autofunction:: fairseq.models.register_model
73 | .. autofunction:: fairseq.models.register_model_architecture
74 | .. autoclass:: fairseq.models.BaseFairseqModel
75 | :members:
76 | :undoc-members:
77 | .. autoclass:: fairseq.models.FairseqModel
78 | :members:
79 | :undoc-members:
80 | .. autoclass:: fairseq.models.FairseqLanguageModel
81 | :members:
82 | :undoc-members:
83 | .. autoclass:: fairseq.models.FairseqEncoder
84 | :members:
85 | .. autoclass:: fairseq.models.CompositeEncoder
86 | :members:
87 | .. autoclass:: fairseq.models.FairseqDecoder
88 | :members:
89 |
90 |
91 | .. _Incremental decoding:
92 |
93 | Incremental decoding
94 | --------------------
95 |
96 | .. autoclass:: fairseq.models.FairseqIncrementalDecoder
97 | :members:
98 | :undoc-members:
99 |
--------------------------------------------------------------------------------
/encdec/docs/modules.rst:
--------------------------------------------------------------------------------
1 | Modules
2 | =======
3 |
4 | Fairseq provides several stand-alone :class:`torch.nn.Module` s that may be
5 | helpful when implementing a new :class:`FairseqModel`.
6 |
7 | .. automodule:: fairseq.modules
8 | :members:
9 | :undoc-members:
10 |
--------------------------------------------------------------------------------
/encdec/docs/optim.rst:
--------------------------------------------------------------------------------
1 | .. role:: hidden
2 | :class: hidden-section
3 |
4 | .. _optimizers:
5 |
6 | Optimizers
7 | ==========
8 |
9 | .. automodule:: fairseq.optim
10 | :members:
11 |
--------------------------------------------------------------------------------
/encdec/docs/overview.rst:
--------------------------------------------------------------------------------
1 | Overview
2 | ========
3 |
4 | Fairseq can be extended through user-supplied `plug-ins
5 | <https://en.wikipedia.org/wiki/Plug-in_(computing)>`_. We support five kinds of
6 | plug-ins:
7 |
8 | - :ref:`Models` define the neural network architecture and encapsulate all of the
9 | learnable parameters.
10 | - :ref:`Criterions` compute the loss function given the model outputs and targets.
11 | - :ref:`Tasks` store dictionaries and provide helpers for loading/iterating over
12 | Datasets, initializing the Model/Criterion and calculating the loss.
13 | - :ref:`Optimizers` update the Model parameters based on the gradients.
14 | - :ref:`Learning Rate Schedulers` update the learning rate over the course of
15 | training.
16 |
17 | **Training Flow**
18 |
19 | Given a ``model``, ``criterion``, ``task``, ``optimizer`` and ``lr_scheduler``,
20 | fairseq implements the following high-level training flow::
21 |
22 | for epoch in range(num_epochs):
23 | itr = task.get_batch_iterator(task.dataset('train'))
24 | for num_updates, batch in enumerate(itr):
25 | loss = criterion(model, batch)
26 | optimizer.backward(loss)
27 | optimizer.step()
28 | lr_scheduler.step_update(num_updates)
29 | lr_scheduler.step(epoch)
30 |
31 | **Registering new plug-ins**
32 |
33 | New plug-ins are *registered* through a set of ``@register`` function
34 | decorators, for example::
35 |
36 | @register_model('my_lstm')
37 | class MyLSTM(FairseqModel):
38 | (...)
39 |
40 | Once registered, new plug-ins can be used with the existing :ref:`Command-line
41 | Tools`. See the Tutorial sections for more detailed walkthroughs of how to add
42 | new plug-ins.
43 |
--------------------------------------------------------------------------------
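Editor's note: to make the registration pattern in overview.rst concrete, here is a minimal sketch of registering a model variant and a named architecture. The names are illustrative only; it assumes the plug-in APIs exposed by the fairseq copy in this repository (`register_model`, `register_model_architecture`, and `base_architecture` from `fairseq.models.lstm`).

```
# Sketch only: register a model and an --arch entry for it.
from fairseq.models import register_model, register_model_architecture
from fairseq.models.lstm import LSTMModel, base_architecture

@register_model('my_lstm')
class MyLSTM(LSTMModel):
    pass

@register_model_architecture('my_lstm', 'my_lstm_small')
def my_lstm_small(args):
    # Overrides applied when --arch my_lstm_small is selected.
    args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 256)
    base_architecture(args)
```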
/encdec/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx<2.0
2 | sphinx-argparse
3 |
--------------------------------------------------------------------------------
/encdec/docs/tasks.rst:
--------------------------------------------------------------------------------
1 | .. role:: hidden
2 | :class: hidden-section
3 |
4 | .. module:: fairseq.tasks
5 |
6 | .. _Tasks:
7 |
8 | Tasks
9 | =====
10 |
11 | Tasks store dictionaries and provide helpers for loading/iterating over
12 | Datasets, initializing the Model/Criterion and calculating the loss.
13 |
14 | Tasks can be selected via the ``--task`` command-line argument. Once selected, a
15 | task may expose additional command-line arguments for further configuration.
16 |
17 | Example usage::
18 |
19 | # setup the task (e.g., load dictionaries)
20 | task = fairseq.tasks.setup_task(args)
21 |
22 | # build model and criterion
23 | model = task.build_model(args)
24 | criterion = task.build_criterion(args)
25 |
26 | # load datasets
27 | task.load_dataset('train')
28 | task.load_dataset('valid')
29 |
30 | # iterate over mini-batches of data
31 | batch_itr = task.get_batch_iterator(
32 | task.dataset('train'), max_tokens=4096,
33 | )
34 | for batch in batch_itr:
35 | # compute the loss
36 | loss, sample_size, logging_output = task.get_loss(
37 | model, criterion, batch,
38 | )
39 | loss.backward()
40 |
41 |
42 | Translation
43 | -----------
44 |
45 | .. autoclass:: fairseq.tasks.translation.TranslationTask
46 |
47 | .. _language modeling:
48 |
49 | Language Modeling
50 | -----------------
51 |
52 | .. autoclass:: fairseq.tasks.language_modeling.LanguageModelingTask
53 |
54 |
55 | Adding new tasks
56 | ----------------
57 |
58 | .. autofunction:: fairseq.tasks.register_task
59 | .. autoclass:: fairseq.tasks.FairseqTask
60 | :members:
61 | :undoc-members:
62 |
--------------------------------------------------------------------------------
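Editor's note: as a counterpart to the "Adding new tasks" pointer in tasks.rst, a minimal registration sketch. The task name and body are illustrative only; a real task would build dictionaries in `setup_task()` and implement `load_dataset()`.

```
# Sketch only: the @register_task pattern for a new task.
from fairseq.tasks import FairseqTask, register_task

@register_task('my_headline_task')
class MyHeadlineTask(FairseqTask):

    @classmethod
    def setup_task(cls, args, **kwargs):
        # A real task would load source/target dictionaries here.
        return cls(args)
```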
/encdec/examples/.gitignore:
--------------------------------------------------------------------------------
1 | */*
2 | !*/*.sh
3 | !*/*.md
4 |
--------------------------------------------------------------------------------
/encdec/examples/language_model/README.md:
--------------------------------------------------------------------------------
1 | Sample data processing scripts for the FAIR Sequence-to-Sequence Toolkit
2 |
3 | These scripts provide an example of pre-processing data for the Language Modeling task.
4 |
5 | # prepare-wikitext-103.sh
6 |
7 | Provides an example of pre-processing for [WikiText-103 language modeling task](https://einstein.ai/research/the-wikitext-long-term-dependency-language-modeling-dataset):
8 |
9 | Example usage:
10 | ```
11 | $ cd examples/language_model/
12 | $ bash prepare-wikitext-103.sh
13 | $ cd ../..
14 |
15 | # Binarize the dataset:
16 | $ TEXT=examples/language_model/wikitext-103
17 |
18 | $ python preprocess.py --only-source \
19 | --trainpref $TEXT/wiki.train.tokens --validpref $TEXT/wiki.valid.tokens --testpref $TEXT/wiki.test.tokens \
20 | --destdir data-bin/wikitext-103
21 |
22 | # Train the model:
23 | # If it runs out of memory, try to reduce max-tokens and max-target-positions
24 | $ mkdir -p checkpoints/wikitext-103
25 | $ python train.py --task language_modeling data-bin/wikitext-103 \
26 | --max-epoch 35 --arch fconv_lm_dauphin_wikitext103 --optimizer nag \
27 | --lr 1.0 --lr-scheduler reduce_lr_on_plateau --lr-shrink 0.5 \
28 | --clip-norm 0.1 --dropout 0.2 --weight-decay 5e-06 --criterion adaptive_loss \
29 | --adaptive-softmax-cutoff 10000,20000,200000 --max-tokens 1024 --tokens-per-sample 1024
30 |
31 | # Evaluate:
32 | $ python eval_lm.py data-bin/wikitext-103 --path 'checkpoints/wiki103/checkpoint_best.pt'
33 |
34 | ```
35 |
--------------------------------------------------------------------------------
/encdec/examples/language_model/prepare-wikitext-103.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Adapted from https://github.com/facebookresearch/MIXER/blob/master/prepareData.sh
3 |
4 | URLS=(
5 | "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip"
6 | )
7 | FILES=(
8 | "wikitext-103-v1.zip"
9 | )
10 |
11 | for ((i=0;i<${#URLS[@]};++i)); do
12 | file=${FILES[i]}
13 | if [ -f $file ]; then
14 | echo "$file already exists, skipping download"
15 | else
16 | url=${URLS[i]}
17 | wget "$url"
18 | if [ -f $file ]; then
19 | echo "$url successfully downloaded."
20 | else
21 | echo "$url not successfully downloaded."
22 | exit -1
23 | fi
24 | if [ ${file: -4} == ".tgz" ]; then
25 | tar zxvf $file
26 | elif [ ${file: -4} == ".tar" ]; then
27 | tar xvf $file
28 | elif [ ${file: -4} == ".zip" ]; then
29 | unzip $file
30 | fi
31 | fi
32 | done
33 | cd ..
34 |
--------------------------------------------------------------------------------
/encdec/examples/stories/README.md:
--------------------------------------------------------------------------------
1 | FAIR Sequence-to-Sequence Toolkit for Story Generation
2 |
3 | The following commands provide an example of pre-processing data, training a model, and generating text for story generation with the WritingPrompts dataset.
4 |
5 | The dataset can be downloaded like this:
6 |
7 | ```
8 | curl https://s3.amazonaws.com/fairseq-py/data/writingPrompts.tar.gz | tar xvzf -
9 | ```
10 |
11 | and contains a train, test, and valid split. The dataset is described here: https://arxiv.org/abs/1805.04833. We model only the first 1000 words of each story, including one newLine token.
12 |
13 |
14 | Example usage:
15 | ```
16 | # Preprocess the dataset:
17 | # Note that the dataset release is the full data, but the paper models the first 1000 words of each story
18 | # Here is some example code that can trim the dataset to the first 1000 words of each story
19 | $ python
20 | $ data = ["train", "test", "valid"]
21 | $ for name in data:
22 | $ with open(name + ".wp_target") as f:
23 | $ stories = f.readlines()
24 | $ stories = [" ".join(i.split()[0:1000]) for i in stories]
25 | $ with open(name + ".wp_target", "w") as o:
26 | $ for line in stories:
27 | $ o.write(line.strip() + "\n")
28 |
29 | # Binarize the dataset:
30 | $ TEXT=examples/stories/writingPrompts
31 | $ python preprocess.py --source-lang wp_source --target-lang wp_target \
32 | --trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test \
33 | --destdir data-bin/writingPrompts --padding-factor 1 --thresholdtgt 10 --thresholdsrc 10
34 |
35 | # Train the model:
36 | $ python train.py data-bin/writingPrompts -a fconv_self_att_wp --lr 0.25 --clip-norm 0.1 --max-tokens 1500 --lr-scheduler reduce_lr_on_plateau --decoder-attention True --encoder-attention False --criterion label_smoothed_cross_entropy --weight-decay .0000001 --label-smoothing 0 --source-lang wp_source --target-lang wp_target --gated-attention True --self-attention True --project-input True --pretrained False
37 |
38 | # Train a fusion model:
39 | # add the arguments: --pretrained True --pretrained-checkpoint path/to/checkpoint
40 |
41 | # Generate:
42 | # Note: to load the pretrained model at generation time, you need to pass in a model-override argument to communicate to the fusion model at generation time where you have placed the pretrained checkpoint. By default, it will load the exact path of the fusion model's pretrained model from training time. You should use model-override if you have moved the pretrained model (or are using our provided models). If you are generating from a non-fusion model, the model-override argument is not necessary.
43 |
44 | $ python generate.py data-bin/writingPrompts --path /path/to/trained/model/checkpoint_best.pt --batch-size 32 --beam 1 --sampling --sampling-topk 10 --sampling-temperature 0.8 --nbest 1 --model-overrides "{'pretrained_checkpoint':'/path/to/pretrained/model/checkpoint'}"
45 | ```
46 |
--------------------------------------------------------------------------------
/encdec/examples/translation/prepare-iwslt14.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #
3 | # Adapted from https://github.com/facebookresearch/MIXER/blob/master/prepareData.sh
4 |
5 | echo 'Cloning Moses github repository (for tokenization scripts)...'
6 | git clone https://github.com/moses-smt/mosesdecoder.git
7 |
8 | echo 'Cloning Subword NMT repository (for BPE pre-processing)...'
9 | git clone https://github.com/rsennrich/subword-nmt.git
10 |
11 | SCRIPTS=mosesdecoder/scripts
12 | TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
13 | LC=$SCRIPTS/tokenizer/lowercase.perl
14 | CLEAN=$SCRIPTS/training/clean-corpus-n.perl
15 | BPEROOT=subword-nmt
16 | BPE_TOKENS=10000
17 |
18 | URL="https://wit3.fbk.eu/archive/2014-01/texts/de/en/de-en.tgz"
19 | GZ=de-en.tgz
20 |
21 | if [ ! -d "$SCRIPTS" ]; then
22 | echo "Please set SCRIPTS variable correctly to point to Moses scripts."
23 | exit
24 | fi
25 |
26 | src=de
27 | tgt=en
28 | lang=de-en
29 | prep=iwslt14.tokenized.de-en
30 | tmp=$prep/tmp
31 | orig=orig
32 |
33 | mkdir -p $orig $tmp $prep
34 |
35 | echo "Downloading data from ${URL}..."
36 | cd $orig
37 | wget "$URL"
38 |
39 | if [ -f $GZ ]; then
40 | echo "Data successfully downloaded."
41 | else
42 | echo "Data not successfully downloaded."
43 | exit
44 | fi
45 |
46 | tar zxvf $GZ
47 | cd ..
48 |
49 | echo "pre-processing train data..."
50 | for l in $src $tgt; do
51 | f=train.tags.$lang.$l
52 | tok=train.tags.$lang.tok.$l
53 |
54 | cat $orig/$lang/$f | \
55 | grep -v '<url>' | \
56 | grep -v '<talkid>' | \
57 | grep -v '<keywords>' | \
58 | sed -e 's/<title>//g' | \
59 | sed -e 's/<\/title>//g' | \
60 | sed -e 's/<description>//g' | \
61 | sed -e 's/<\/description>//g' | \
62 | perl $TOKENIZER -threads 8 -l $l > $tmp/$tok
63 | echo ""
64 | done
65 | perl $CLEAN -ratio 1.5 $tmp/train.tags.$lang.tok $src $tgt $tmp/train.tags.$lang.clean 1 175
66 | for l in $src $tgt; do
67 | perl $LC < $tmp/train.tags.$lang.clean.$l > $tmp/train.tags.$lang.$l
68 | done
69 |
70 | echo "pre-processing valid/test data..."
71 | for l in $src $tgt; do
72 | for o in `ls $orig/$lang/IWSLT14.TED*.$l.xml`; do
73 | fname=${o##*/}
74 | f=$tmp/${fname%.*}
75 | echo $o $f
76 | grep '<seg id' $o | \
77 | sed -e 's/<seg id="[0-9]*">\s*//g' | \
78 | sed -e 's/\s*<\/seg>\s*//g' | \
79 | sed -e "s/\’/\'/g" | \
80 | perl $TOKENIZER -threads 8 -l $l | \
81 | perl $LC > $f
82 | echo ""
83 | done
84 | done
85 |
86 |
87 | echo "creating train, valid, test..."
88 | for l in $src $tgt; do
89 | awk '{if (NR%23 == 0) print $0; }' $tmp/train.tags.de-en.$l > $tmp/valid.$l
90 | awk '{if (NR%23 != 0) print $0; }' $tmp/train.tags.de-en.$l > $tmp/train.$l
91 |
92 | cat $tmp/IWSLT14.TED.dev2010.de-en.$l \
93 | $tmp/IWSLT14.TEDX.dev2012.de-en.$l \
94 | $tmp/IWSLT14.TED.tst2010.de-en.$l \
95 | $tmp/IWSLT14.TED.tst2011.de-en.$l \
96 | $tmp/IWSLT14.TED.tst2012.de-en.$l \
97 | > $tmp/test.$l
98 | done
99 |
100 | TRAIN=$tmp/train.en-de
101 | BPE_CODE=$prep/code
102 | rm -f $TRAIN
103 | for l in $src $tgt; do
104 | cat $tmp/train.$l >> $TRAIN
105 | done
106 |
107 | echo "learn_bpe.py on ${TRAIN}..."
108 | python $BPEROOT/learn_bpe.py -s $BPE_TOKENS < $TRAIN > $BPE_CODE
109 |
110 | for L in $src $tgt; do
111 | for f in train.$L valid.$L test.$L; do
112 | echo "apply_bpe.py to ${f}..."
113 | python $BPEROOT/apply_bpe.py -c $BPE_CODE < $tmp/$f > $prep/$f
114 | done
115 | done
116 |
--------------------------------------------------------------------------------
/encdec/examples/translation/prepare-wmt14en2de.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Adapted from https://github.com/facebookresearch/MIXER/blob/master/prepareData.sh
3 |
4 | echo 'Cloning Moses github repository (for tokenization scripts)...'
5 | git clone https://github.com/moses-smt/mosesdecoder.git
6 |
7 | echo 'Cloning Subword NMT repository (for BPE pre-processing)...'
8 | git clone https://github.com/rsennrich/subword-nmt.git
9 |
10 | SCRIPTS=mosesdecoder/scripts
11 | TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
12 | CLEAN=$SCRIPTS/training/clean-corpus-n.perl
13 | NORM_PUNC=$SCRIPTS/tokenizer/normalize-punctuation.perl
14 | REM_NON_PRINT_CHAR=$SCRIPTS/tokenizer/remove-non-printing-char.perl
15 | BPEROOT=subword-nmt
16 | BPE_TOKENS=40000
17 |
18 | URLS=(
19 | "http://statmt.org/wmt13/training-parallel-europarl-v7.tgz"
20 | "http://statmt.org/wmt13/training-parallel-commoncrawl.tgz"
21 | "http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12.tgz"
22 | "http://data.statmt.org/wmt17/translation-task/dev.tgz"
23 | "http://statmt.org/wmt14/test-full.tgz"
24 | )
25 | FILES=(
26 | "training-parallel-europarl-v7.tgz"
27 | "training-parallel-commoncrawl.tgz"
28 | "training-parallel-nc-v12.tgz"
29 | "dev.tgz"
30 | "test-full.tgz"
31 | )
32 | CORPORA=(
33 | "training/europarl-v7.de-en"
34 | "commoncrawl.de-en"
35 | "training/news-commentary-v12.de-en"
36 | )
37 |
38 | # This will make the dataset compatible to the one used in "Convolutional Sequence to Sequence Learning"
39 | # https://arxiv.org/abs/1705.03122
40 | if [ "$1" == "--icml17" ]; then
41 | URLS[2]="http://statmt.org/wmt14/training-parallel-nc-v9.tgz"
42 | FILES[2]="training-parallel-nc-v9.tgz"
43 | CORPORA[2]="training/news-commentary-v9.de-en"
44 | fi
45 |
46 | if [ ! -d "$SCRIPTS" ]; then
47 | echo "Please set SCRIPTS variable correctly to point to Moses scripts."
48 | exit
49 | fi
50 |
51 | src=en
52 | tgt=de
53 | lang=en-de
54 | prep=wmt14_en_de
55 | tmp=$prep/tmp
56 | orig=orig
57 | dev=dev/newstest2013
58 |
59 | mkdir -p $orig $tmp $prep
60 |
61 | cd $orig
62 |
63 | for ((i=0;i<${#URLS[@]};++i)); do
64 | file=${FILES[i]}
65 | if [ -f $file ]; then
66 | echo "$file already exists, skipping download"
67 | else
68 | url=${URLS[i]}
69 | wget "$url"
70 | if [ -f $file ]; then
71 | echo "$url successfully downloaded."
72 | else
73 | echo "$url not successfully downloaded."
74 | exit -1
75 | fi
76 | if [ ${file: -4} == ".tgz" ]; then
77 | tar zxvf $file
78 | elif [ ${file: -4} == ".tar" ]; then
79 | tar xvf $file
80 | fi
81 | fi
82 | done
83 | cd ..
84 |
85 | echo "pre-processing train data..."
86 | for l in $src $tgt; do
87 | rm $tmp/train.tags.$lang.tok.$l
88 | for f in "${CORPORA[@]}"; do
89 | cat $orig/$f.$l | \
90 | perl $NORM_PUNC $l | \
91 | perl $REM_NON_PRINT_CHAR | \
92 | perl $TOKENIZER -threads 8 -a -l $l >> $tmp/train.tags.$lang.tok.$l
93 | done
94 | done
95 |
96 | echo "pre-processing test data..."
97 | for l in $src $tgt; do
98 | if [ "$l" == "$src" ]; then
99 | t="src"
100 | else
101 | t="ref"
102 | fi
103 | grep '<seg id' $orig/test-full/newstest2014-deen-$t.$l.sgm | \
104 | sed -e 's/<seg id="[0-9]*">\s*//g' | \
105 | sed -e 's/\s*<\/seg>\s*//g' | \
106 | sed -e "s/\’/\'/g" | \
107 | perl $TOKENIZER -threads 8 -a -l $l > $tmp/test.$l
108 | echo ""
109 | done
110 |
111 | echo "splitting train and valid..."
112 | for l in $src $tgt; do
113 | awk '{if (NR%100 == 0) print $0; }' $tmp/train.tags.$lang.tok.$l > $tmp/valid.$l
114 | awk '{if (NR%100 != 0) print $0; }' $tmp/train.tags.$lang.tok.$l > $tmp/train.$l
115 | done
116 |
117 | TRAIN=$tmp/train.de-en
118 | BPE_CODE=$prep/code
119 | rm -f $TRAIN
120 | for l in $src $tgt; do
121 | cat $tmp/train.$l >> $TRAIN
122 | done
123 |
124 | echo "learn_bpe.py on ${TRAIN}..."
125 | python $BPEROOT/learn_bpe.py -s $BPE_TOKENS < $TRAIN > $BPE_CODE
126 |
127 | for L in $src $tgt; do
128 | for f in train.$L valid.$L test.$L; do
129 | echo "apply_bpe.py to ${f}..."
130 | python $BPEROOT/apply_bpe.py -c $BPE_CODE < $tmp/$f > $tmp/bpe.$f
131 | done
132 | done
133 |
134 | perl $CLEAN -ratio 1.5 $tmp/bpe.train $src $tgt $prep/train 1 250
135 | perl $CLEAN -ratio 1.5 $tmp/bpe.valid $src $tgt $prep/valid 1 250
136 |
137 | for L in $src $tgt; do
138 | cp $tmp/bpe.test.$L $prep/test.$L
139 | done
140 |
--------------------------------------------------------------------------------
/encdec/examples/translation/prepare-wmt14en2fr.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Adapted from https://github.com/facebookresearch/MIXER/blob/master/prepareData.sh
3 |
4 | echo 'Cloning Moses github repository (for tokenization scripts)...'
5 | git clone https://github.com/moses-smt/mosesdecoder.git
6 |
7 | echo 'Cloning Subword NMT repository (for BPE pre-processing)...'
8 | git clone https://github.com/rsennrich/subword-nmt.git
9 |
10 | SCRIPTS=mosesdecoder/scripts
11 | TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
12 | CLEAN=$SCRIPTS/training/clean-corpus-n.perl
13 | NORM_PUNC=$SCRIPTS/tokenizer/normalize-punctuation.perl
14 | REM_NON_PRINT_CHAR=$SCRIPTS/tokenizer/remove-non-printing-char.perl
15 | BPEROOT=subword-nmt
16 | BPE_TOKENS=40000
17 |
18 | URLS=(
19 | "http://statmt.org/wmt13/training-parallel-europarl-v7.tgz"
20 | "http://statmt.org/wmt13/training-parallel-commoncrawl.tgz"
21 | "http://statmt.org/wmt13/training-parallel-un.tgz"
22 | "http://statmt.org/wmt14/training-parallel-nc-v9.tgz"
23 | "http://statmt.org/wmt10/training-giga-fren.tar"
24 | "http://statmt.org/wmt14/test-full.tgz"
25 | )
26 | FILES=(
27 | "training-parallel-europarl-v7.tgz"
28 | "training-parallel-commoncrawl.tgz"
29 | "training-parallel-un.tgz"
30 | "training-parallel-nc-v9.tgz"
31 | "training-giga-fren.tar"
32 | "test-full.tgz"
33 | )
34 | CORPORA=(
35 | "training/europarl-v7.fr-en"
36 | "commoncrawl.fr-en"
37 | "un/undoc.2000.fr-en"
38 | "training/news-commentary-v9.fr-en"
39 | "giga-fren.release2.fixed"
40 | )
41 |
42 | if [ ! -d "$SCRIPTS" ]; then
43 | echo "Please set SCRIPTS variable correctly to point to Moses scripts."
44 | exit
45 | fi
46 |
47 | src=en
48 | tgt=fr
49 | lang=en-fr
50 | prep=wmt14_en_fr
51 | tmp=$prep/tmp
52 | orig=orig
53 |
54 | mkdir -p $orig $tmp $prep
55 |
56 | cd $orig
57 |
58 | for ((i=0;i<${#URLS[@]};++i)); do
59 | file=${FILES[i]}
60 | if [ -f $file ]; then
61 | echo "$file already exists, skipping download"
62 | else
63 | url=${URLS[i]}
64 | wget "$url"
65 | if [ -f $file ]; then
66 | echo "$url successfully downloaded."
67 | else
68 | echo "$url not successfully downloaded."
69 | exit -1
70 | fi
71 | if [ ${file: -4} == ".tgz" ]; then
72 | tar zxvf $file
73 | elif [ ${file: -4} == ".tar" ]; then
74 | tar xvf $file
75 | fi
76 | fi
77 | done
78 |
79 | gunzip giga-fren.release2.fixed.*.gz
80 | cd ..
81 |
82 | echo "pre-processing train data..."
83 | for l in $src $tgt; do
84 | rm $tmp/train.tags.$lang.tok.$l
85 | for f in "${CORPORA[@]}"; do
86 | cat $orig/$f.$l | \
87 | perl $NORM_PUNC $l | \
88 | perl $REM_NON_PRINT_CHAR | \
89 | perl $TOKENIZER -threads 8 -a -l $l >> $tmp/train.tags.$lang.tok.$l
90 | done
91 | done
92 |
93 | echo "pre-processing test data..."
94 | for l in $src $tgt; do
95 | if [ "$l" == "$src" ]; then
96 | t="src"
97 | else
98 | t="ref"
99 | fi
100 | grep '<seg id' $orig/test-full/newstest2014-fren-$t.$l.sgm | \
101 | sed -e 's/<seg id="[0-9]*">\s*//g' | \
102 | sed -e 's/\s*<\/seg>\s*//g' | \
103 | sed -e "s/\’/\'/g" | \
104 | perl $TOKENIZER -threads 8 -a -l $l > $tmp/test.$l
105 | echo ""
106 | done
107 |
108 | echo "splitting train and valid..."
109 | for l in $src $tgt; do
110 | awk '{if (NR%1333 == 0) print $0; }' $tmp/train.tags.$lang.tok.$l > $tmp/valid.$l
111 | awk '{if (NR%1333 != 0) print $0; }' $tmp/train.tags.$lang.tok.$l > $tmp/train.$l
112 | done
113 |
114 | TRAIN=$tmp/train.fr-en
115 | BPE_CODE=$prep/code
116 | rm -f $TRAIN
117 | for l in $src $tgt; do
118 | cat $tmp/train.$l >> $TRAIN
119 | done
120 |
121 | echo "learn_bpe.py on ${TRAIN}..."
122 | python $BPEROOT/learn_bpe.py -s $BPE_TOKENS < $TRAIN > $BPE_CODE
123 |
124 | for L in $src $tgt; do
125 | for f in train.$L valid.$L test.$L; do
126 | echo "apply_bpe.py to ${f}..."
127 | python $BPEROOT/apply_bpe.py -c $BPE_CODE < $tmp/$f > $tmp/bpe.$f
128 | done
129 | done
130 |
131 | perl $CLEAN -ratio 1.5 $tmp/bpe.train $src $tgt $prep/train 1 250
132 | perl $CLEAN -ratio 1.5 $tmp/bpe.valid $src $tgt $prep/valid 1 250
133 |
134 | for L in $src $tgt; do
135 | cp $tmp/bpe.test.$L $prep/test.$L
136 | done
137 |
--------------------------------------------------------------------------------
/encdec/fairseq/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017-present, Facebook, Inc.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the LICENSE file in
5 | # the root directory of this source tree. An additional grant of patent rights
6 | # can be found in the PATENTS file in the same directory.
7 |
8 | from .multiprocessing_pdb import pdb
9 |
10 | __all__ = ['pdb']
11 |
12 | import fairseq.criterions
13 | import fairseq.models
14 | import fairseq.modules
15 | import fairseq.optim
16 | import fairseq.optim.lr_scheduler
17 | import fairseq.tasks
18 |
--------------------------------------------------------------------------------
/encdec/fairseq/bleu.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017-present, Facebook, Inc.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the LICENSE file in
5 | # the root directory of this source tree. An additional grant of patent rights
6 | # can be found in the PATENTS file in the same directory.
7 |
8 | import ctypes
9 | import math
10 | import torch
11 |
12 | try:
13 | from fairseq import libbleu
14 | except ImportError as e:
15 | import sys
16 | sys.stderr.write('ERROR: missing libbleu.so. run `python setup.py install`\n')
17 | raise e
18 |
19 |
20 | C = ctypes.cdll.LoadLibrary(libbleu.__file__)
21 |
22 |
23 | class BleuStat(ctypes.Structure):
24 | _fields_ = [
25 | ('reflen', ctypes.c_size_t),
26 | ('predlen', ctypes.c_size_t),
27 | ('match1', ctypes.c_size_t),
28 | ('count1', ctypes.c_size_t),
29 | ('match2', ctypes.c_size_t),
30 | ('count2', ctypes.c_size_t),
31 | ('match3', ctypes.c_size_t),
32 | ('count3', ctypes.c_size_t),
33 | ('match4', ctypes.c_size_t),
34 | ('count4', ctypes.c_size_t),
35 | ]
36 |
37 |
38 | class Scorer(object):
39 | def __init__(self, pad, eos, unk):
40 | self.stat = BleuStat()
41 | self.pad = pad
42 | self.eos = eos
43 | self.unk = unk
44 | self.reset()
45 |
46 | def reset(self, one_init=False):
47 | if one_init:
48 | C.bleu_one_init(ctypes.byref(self.stat))
49 | else:
50 | C.bleu_zero_init(ctypes.byref(self.stat))
51 |
52 | def add(self, ref, pred):
53 | if not isinstance(ref, torch.IntTensor):
54 | raise TypeError('ref must be a torch.IntTensor (got {})'
55 | .format(type(ref)))
56 | if not isinstance(pred, torch.IntTensor):
57 | raise TypeError('pred must be a torch.IntTensor (got {})'
58 | .format(type(pred)))
59 |
60 | # don't match unknown words
61 | rref = ref.clone()
62 | assert not rref.lt(0).any()
63 | rref[rref.eq(self.unk)] = -999
64 |
65 | rref = rref.contiguous().view(-1)
66 | pred = pred.contiguous().view(-1)
67 |
68 | C.bleu_add(
69 | ctypes.byref(self.stat),
70 | ctypes.c_size_t(rref.size(0)),
71 | ctypes.c_void_p(rref.data_ptr()),
72 | ctypes.c_size_t(pred.size(0)),
73 | ctypes.c_void_p(pred.data_ptr()),
74 | ctypes.c_int(self.pad),
75 | ctypes.c_int(self.eos))
76 |
77 | def score(self, order=4):
78 | psum = sum(math.log(p) if p > 0 else float('-Inf')
79 | for p in self.precision()[:order])
80 | return self.brevity() * math.exp(psum / order) * 100
81 |
82 | def precision(self):
83 | def ratio(a, b):
84 | return a / b if b > 0 else 0
85 |
86 | return [
87 | ratio(self.stat.match1, self.stat.count1),
88 | ratio(self.stat.match2, self.stat.count2),
89 | ratio(self.stat.match3, self.stat.count3),
90 | ratio(self.stat.match4, self.stat.count4),
91 | ]
92 |
93 | def brevity(self):
94 | r = self.stat.reflen / self.stat.predlen
95 | return min(1, math.exp(1 - r))
96 |
97 | def result_string(self, order=4):
98 | assert order <= 4, "BLEU scores for order > 4 aren't supported"
99 | fmt = 'BLEU{} = {:2.2f}, {:2.1f}'
100 | for _ in range(1, order):
101 | fmt += '/{:2.1f}'
102 | fmt += ' (BP={:.3f}, ratio={:.3f}, syslen={}, reflen={})'
103 | bleup = [p * 100 for p in self.precision()[:order]]
104 | return fmt.format(order, self.score(order=order), *bleup,
105 | self.brevity(), self.stat.predlen/self.stat.reflen,
106 | self.stat.predlen, self.stat.reflen)
107 |
--------------------------------------------------------------------------------
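Editor's note: a minimal usage sketch for the `Scorer` defined in bleu.py above. The token ids are made up; in practice `pad`/`eos`/`unk` come from a fairseq `Dictionary`, and the compiled `libbleu` extension must be installed (`python setup.py install`).

```
# Sketch only: score one hypothesis against one reference with the C-backed Scorer.
import torch
from fairseq import bleu

scorer = bleu.Scorer(pad=1, eos=2, unk=3)
ref = torch.IntTensor([10, 11, 12, 2])   # reference token ids (ends with eos)
hyp = torch.IntTensor([10, 11, 13, 2])   # system output token ids
scorer.add(ref, hyp)
print(scorer.result_string(order=4))
```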
/encdec/fairseq/clib/libbleu/libbleu.cpp:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2017-present, Facebook, Inc.
3 | * All rights reserved.
4 | *
5 | * This source code is licensed under the license found in the
6 | * LICENSE file in the root directory of this source tree.
7 | */
8 |
9 | #include