├── .coveragerc
├── .flake8
├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── bug_report.md
│   │   └── feature_request.md
│   ├── PULL_REQUEST_TEMPLATE.md
│   └── workflows
│       ├── buildwebsite.yml
│       ├── data-pipeline.yml
│       ├── nightly-test.yml
│       ├── unittests-gpu.yml
│       └── unittests.yml
├── .gitignore
├── .pylintrc
├── .pytype.cfg
├── CODEOWNERS
├── CODE_OF_CONDUCT.md
├── LICENSE
├── Makefile
├── README.md
├── conftest.py
├── docs
│   ├── .gitignore
│   ├── .nojekyll
│   ├── 404.rst
│   ├── Doxyfile
│   ├── Makefile
│   ├── README.txt
│   ├── _static
│   │   ├── 404.jpg
│   │   ├── custom.css
│   │   ├── gluon-logo.svg
│   │   ├── gluon.ico
│   │   ├── google_analytics.js
│   │   ├── hidebib.js
│   │   └── install-options.js
│   ├── api
│   │   ├── attention.rst
│   │   ├── data.rst
│   │   ├── embedding.rst
│   │   ├── index.rst
│   │   ├── layers.rst
│   │   ├── models.rst
│   │   ├── operators.rst
│   │   ├── sequence_sampler.rst
│   │   └── utils.rst
│   ├── conf.py
│   ├── examples.rst
│   ├── genindex.rst
│   ├── index.rst
│   ├── install.rst
│   ├── install
│   │   ├── install-include.rst
│   │   └── install-more.rst
│   ├── md2ipynb.py
│   ├── model_zoo
│   ├── tutorials
│   │   ├── deep_learning_compiler
│   │   │   ├── index.rst
│   │   │   └── tvm_basic.md
│   │   ├── index.rst
│   │   ├── pretrained_models
│   │   │   ├── index.rst
│   │   │   ├── pretrained_t5_mlm.md
│   │   │   └── pretraining_objectives.png
│   │   ├── question_answering
│   │   │   ├── index.rst
│   │   │   ├── offsets_match.png
│   │   │   ├── qa1.png
│   │   │   ├── qa2.png
│   │   │   ├── question_answering.md
│   │   │   └── squad_utils.py
│   │   ├── text_prediction
│   │   │   ├── bert_illustration.png
│   │   │   ├── index.rst
│   │   │   ├── merge_input.png
│   │   │   ├── text_prediction_part1.md
│   │   │   └── text_prediction_part2.md
│   │   ├── tokenization
│   │   │   ├── index.rst
│   │   │   ├── tokenization_part1.md
│   │   │   ├── tokenization_part2.md
│   │   │   └── tokenization_part3.md
│   │   └── word_embedding
│   │       ├── index.rst
│   │       └── word_embedding.md
│   └── website
│       ├── configuration.rst
│       ├── contribute.rst
│       ├── git.rst
│       ├── index.rst
│       └── release.rst
├── pytest.ini
├── scripts
│   ├── __init__.py
│   ├── benchmarks
│   │   ├── README.md
│   │   ├── benchmark_gluonnlp.py
│   │   ├── benchmark_gluonnlp.sh
│   │   ├── benchmark_gluonnlp_fp16.sh
│   │   ├── benchmark_gluonnlp_tvm.sh
│   │   ├── benchmark_hf.py
│   │   ├── benchmark_utils.py
│   │   ├── requirements.txt
│   │   └── run_backbone_benchmark.sh
│   ├── classification
│   │   ├── README.md
│   │   ├── classification.py
│   │   ├── classification_utils.py
│   │   └── train_classification.py
│   ├── conversion_toolkits
│   │   ├── README.md
│   │   ├── bert_base_config.json
│   │   ├── bert_large_config.json
│   │   ├── convert_albert.sh
│   │   ├── convert_bart.sh
│   │   ├── convert_bert.sh
│   │   ├── convert_bert_torch.sh
│   │   ├── convert_electra.py
│   │   ├── convert_electra.sh
│   │   ├── convert_fairseq_bart.py
│   │   ├── convert_fairseq_roberta.py
│   │   ├── convert_fairseq_xlmr.py
│   │   ├── convert_gpt2.py
│   │   ├── convert_gpt2.sh
│   │   ├── convert_mobilebert.py
│   │   ├── convert_mobilebert.sh
│   │   ├── convert_mt5.py
│   │   ├── convert_mt5.sh
│   │   ├── convert_roberta.sh
│   │   ├── convert_t5.py
│   │   ├── convert_t5.sh
│   │   ├── convert_tf_hub_model.py
│   │   └── convert_xlmr.sh
│   ├── datasets
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── __main__.py
│   │   ├── general_nlp_benchmark
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   ├── prepare_glue.py
│   │   │   └── prepare_text_classification.py
│   │   ├── language_modeling
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   └── prepare_lm.py
│   │   ├── machine_translation
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   ├── prepare_wmt.py
│   │   │   ├── wmt2014_ende.sh
│   │   │   └── wmt2017_zhen.sh
│   │   ├── music_generation
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   └── prepare_music_midi.py
│   │   ├── pretrain_corpus
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   ├── prepare_bookcorpus.py
│   │   │   ├── prepare_gutenberg.py
│   │   │   ├── prepare_openwebtext.py
│   │   │   └── prepare_wikipedia.py
│   │   ├── question_answering
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   ├── prepare_hotpotqa.py
│   │   │   ├── prepare_naturalquestions.py
│   │   │   ├── prepare_searchqa.py
│   │   │   ├── prepare_squad.py
│   │   │   └── prepare_triviaqa.py
│   │   ├── update_download_stats.py
│   │   └── url_checksums
│   │       ├── bookcorpus.txt
│   │       ├── glue.txt
│   │       ├── gutenberg.txt
│   │       ├── hotpotqa.txt
│   │       ├── language_model.txt
│   │       ├── mirror
│   │       │   └── wmt.json
│   │       ├── music_midi.txt
│   │       ├── naturalquestions.txt
│   │       ├── searchqa.txt
│   │       ├── squad.txt
│   │       ├── superglue.txt
│   │       ├── text_classification.txt
│   │       ├── triviaqa.txt
│   │       ├── wikipedia.txt
│   │       └── wmt.txt
│   ├── generation
│   │   ├── README.md
│   │   ├── calculate_metrics.py
│   │   ├── generate_unconditional_gpt2_samples.py
│   │   └── interactive_conditional_gpt2_samples.py
│   ├── index.rst
│   ├── machine_translation
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── evaluate_epochs_wmt2014_ende.sh
│   │   ├── evaluate_transformer.py
│   │   ├── train_transformer.py
│   │   ├── transformer_enc12_dec1.yml
│   │   └── wmt2014_back_translation.sh
│   ├── pretraining
│   │   ├── README.md
│   │   ├── bert
│   │   │   ├── README.md
│   │   │   ├── covert_bookcorpus_format.py
│   │   │   ├── create_pretraining_data.py
│   │   │   ├── pretraining_utils.py
│   │   │   └── run_pretraining.py
│   │   ├── convert_electra_pretrain_backbone.py
│   │   ├── data_preprocessing.py
│   │   ├── pretraining_utils.py
│   │   ├── run_electra.py
│   │   └── torch
│   │       └── bert
│   │           ├── README.md
│   │           ├── prepare_quickthought.py
│   │           └── run_pretraining.py
│   ├── processing
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── __main__.py
│   │   ├── apply_subword.py
│   │   ├── clean_tok_corpus.py
│   │   ├── learn_subword.py
│   │   └── segment_sentences.py
│   └── question_answering
│       ├── README.md
│       ├── albert_custom.yaml
│       ├── commands
│       │   ├── README.md
│       │   ├── generate_commands.py
│       │   ├── run_squad.template
│       │   ├── run_squad2_albert_base.sh
│       │   ├── run_squad2_albert_large.sh
│       │   ├── run_squad2_albert_xlarge.sh
│       │   ├── run_squad2_albert_xxlarge.sh
│       │   ├── run_squad2_electra_base.sh
│       │   ├── run_squad2_electra_large.sh
│       │   ├── run_squad2_electra_small.sh
│       │   ├── run_squad2_gluon_en_cased_bert_base_v1.sh
│       │   ├── run_squad2_mobilebert.sh
│       │   ├── run_squad2_roberta_large.sh
│       │   ├── run_squad2_uncased_bert_base.sh
│       │   ├── run_squad2_uncased_bert_large.sh
│       │   └── run_squad2_uncased_bert_wwm_large.sh
│       ├── custom_strategy.py
│       ├── eval_utils.py
│       ├── models.py
│       ├── run_squad.py
│       ├── run_squad_albert.py
│       └── squad_utils.py
├── setup.py
├── src
│   └── gluonnlp
│       ├── __init__.py
│       ├── attention_cell.py
│       ├── base.py
│       ├── cli
│       │   ├── __init__.py
│       │   ├── average_checkpoint.py
│       │   ├── data
│       │   └── process
│       ├── data
│       │   ├── __init__.py
│       │   ├── batchify.py
│       │   ├── filtering.py
│       │   ├── loading.py
│       │   ├── sampler.py
│       │   ├── tokenizers
│       │   │   ├── __init__.py
│       │   │   ├── base.py
│       │   │   ├── huggingface.py
│       │   │   ├── jieba.py
│       │   │   ├── moses.py
│       │   │   ├── sentencepiece.py
│       │   │   ├── spacy.py
│       │   │   ├── subword_nmt.py
│       │   │   ├── whitespace.py
│       │   │   └── yttm.py
│       │   └── vocab.py
│       ├── embedding
│       │   ├── __init__.py
│       │   ├── _constants.py
│       │   └── embed_loader.py
│       ├── initializer.py
│       ├── layers.py
│       ├── loss.py
│       ├── lr_scheduler.py
│       ├── models
│       │   ├── __init__.py
│       │   ├── albert.py
│       │   ├── bart.py
│       │   ├── base.py
│       │   ├── bert.py
│       │   ├── electra.py
│       │   ├── gpt2.py
│       │   ├── mobilebert.py
│       │   ├── model_zoo_checksums
│       │   │   ├── albert.txt
│       │   │   ├── bart.txt
│       │   │   ├── bert.txt
│       │   │   ├── electra.txt
│       │   │   ├── gpt2.txt
│       │   │   ├── mobilebert.txt
│       │   │   ├── mt5.txt
│       │   │   ├── roberta.txt
│       │   │   ├── t5.txt
│       │   │   └── xlmr.txt
│       │   ├── mt5.py
│       │   ├── roberta.py
│       │   ├── t5.py
│       │   ├── transformer.py
│       │   ├── transformer_xl.py
│       │   └── xlmr.py
│       ├── op.py
│       ├── sequence_sampler.py
│       ├── third_party
│       │   ├── __init__.py
│       │   ├── sentencepiece_model_pb2.py
│       │   └── sentencepiece_pb2.py
│       ├── torch
│       │   ├── __init__.py
│       │   ├── attention_cell.py
│       │   ├── clib
│       │   │   ├── amp_C_frontend.cpp
│       │   │   ├── compat.h
│       │   │   ├── multi_tensor_apply.cuh
│       │   │   ├── multi_tensor_l2norm_kernel.cu
│       │   │   ├── multi_tensor_lans.cu
│       │   │   └── type_shim.h
│       │   ├── data
│       │   │   ├── __init__.py
│       │   │   └── batchify.py
│       │   ├── layers.py
│       │   ├── models
│       │   │   ├── __init__.py
│       │   │   ├── bert.py
│       │   │   └── transformer.py
│       │   ├── optimizers
│       │   │   ├── __init__.py
│       │   │   ├── fused_lans.py
│       │   │   └── schedules.py
│       │   └── utils.py
│       └── utils
│           ├── __init__.py
│           ├── config.py
│           ├── lazy_imports.py
│           ├── misc.py
│           ├── parameter.py
│           ├── preprocessing.py
│           ├── registry.py
│           ├── shm.py
│           ├── testing.py
│           └── tvm_utils.py
├── tests
│   ├── README.md
│   ├── data_cli
│   │   ├── test_glue.py
│   │   └── test_wikipedia.py
│   ├── process_cli
│   │   ├── data
│   │   │   ├── wmt19-test-de-en.de
│   │   │   ├── wmt19-test-de-en.en
│   │   │   └── wmt19-test-zh-en.zh.jieba
│   │   ├── test_average_checkpoint.py
│   │   └── test_learn_apply_subword.py
│   ├── test_attention_cell.py
│   ├── test_data_batchify.py
│   ├── test_data_filtering.py
│   ├── test_data_loading.py
│   ├── test_data_sampler.py
│   ├── test_data_tokenizers.py
│   ├── test_data_vocab.py
│   ├── test_embedding.py
│   ├── test_gluon_block.py
│   ├── test_initializer.py
│   ├── test_layers.py
│   ├── test_loss.py
│   ├── test_models.py
│   ├── test_models_albert.py
│   ├── test_models_bart.py
│   ├── test_models_bert.py
│   ├── test_models_electra.py
│   ├── test_models_gpt2.py
│   ├── test_models_mobilebert.py
│   ├── test_models_mt5.py
│   ├── test_models_roberta.py
│   ├── test_models_t5.py
│   ├── test_models_transformer.py
│   ├── test_models_transformer_xl.py
│   ├── test_models_xlmr.py
│   ├── test_op.py
│   ├── test_pytest.py
│   ├── test_sequence_sampler.py
│   ├── test_utils_misc.py
│   ├── test_utils_parameter.py
│   ├── test_utils_preprocessing.py
│   ├── test_utils_registry.py
│   └── torch
│       ├── test_attention_cell_torch.py
│       ├── test_bert_torch.py
│       └── test_layers_torch.py
└── tools
    ├── batch
    │   ├── README.md
    │   ├── backbone_benchmark
    │   │   └── run_batch_backbone_benchmark.sh
    │   ├── batch_states
    │   │   ├── compile_notebooks.sh
    │   │   ├── test.sh
    │   │   └── test_data_pipeline.sh
    │   ├── hello_world.py
    │   ├── question_answering
    │   │   ├── parse_squad_results.py
    │   │   └── run_batch_squad.sh
    │   ├── run_batch_conversion.sh
    │   ├── submit-job.py
    │   ├── sync_batch_result.sh
    │   └── wait-job.py
    ├── diagnose.py
    └── docker
        ├── README.md
        ├── devel_entrypoint.sh
        ├── gluon_nlp_job.sh
        ├── install
        │   ├── install_horovod.sh
        │   ├── install_jupyter_lab.sh
        │   ├── install_llvm.sh
        │   ├── install_openmpi.sh
        │   ├── install_python_packages.sh
        │   ├── install_tvm_cpu.sh
        │   ├── install_tvm_gpu.sh
        │   └── install_ubuntu18.04_core.sh
        ├── start_jupyter.sh
        ├── ubuntu18.04-cpu.Dockerfile
        └── ubuntu18.04-gpu.Dockerfile
/.coveragerc:
--------------------------------------------------------------------------------
1 | # .coveragerc to control coverage.py
2 | [run]
3 | omit =
4 | tests/*
5 | scripts/*
6 | concurrency =
7 | multiprocessing
8 | thread
9 |
10 | [report]
11 | ignore_errors = True
12 |
13 | [html]
14 | directory = coverage_html_report
15 |
--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 100
3 | max-complexity = 18
4 | exclude = tests,__init__.py
5 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: 'bug'
6 | assignees: ''
7 |
8 | ---
9 | ## Description
10 | (A clear and concise description of what the bug is.)
11 |
12 | ### Error Message
13 | (Paste the complete error message, including stack trace.)
14 |
15 | ## To Reproduce
16 | (If you developed your own code, please provide a short script that reproduces the error. For existing examples, please provide link.)
17 |
18 | ### Steps to reproduce
19 | (Paste the commands you ran that produced the error.)
20 |
21 | 1.
22 | 2.
23 |
24 | ## What have you tried to solve it?
25 |
26 | 1.
27 | 2.
28 |
29 | ## Environment
30 |
31 | We recommend using our script for collecting the diagnostic information. Run the following command and paste the outputs below:
32 | ```
33 | curl --retry 10 -s https://raw.githubusercontent.com/dmlc/gluon-nlp/master/tools/diagnose.py | python
34 |
35 | # paste outputs here
36 | ```
37 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: 'enhancement'
6 | assignees: ''
7 |
8 | ---
9 |
10 | ## Description
11 | (A clear and concise description of what the feature is.)
12 | - If the proposal is about a new model, provide a description of the model.
13 | - If the proposal is about an API, provide mock examples if possible.
14 |
15 | ## References
16 | - list references and related literature
17 | - list known implementations
18 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | ## Description ##
2 | (Brief description on what this PR is about)
3 |
4 | ## Checklist ##
5 | ### Essentials ###
6 | - [ ] PR's title starts with a category (e.g. [BUGFIX], [MODEL], [TUTORIAL], [FEATURE], [DOC], etc)
7 | - [ ] Changes are complete (i.e. I finished coding on this PR)
8 | - [ ] All changes have test coverage
9 | - [ ] Code is well-documented
10 |
11 | ### Changes ###
12 | - [ ] Feature1, tests, (and when applicable, API doc)
13 | - [ ] Feature2, tests, (and when applicable, API doc)
14 |
15 | ## Comments ##
16 | - If this change is backward incompatible, explain why it must be made.
17 | - Interesting edge cases to note here
18 |
19 | cc @dmlc/gluon-nlp-team
20 |
--------------------------------------------------------------------------------
/.github/workflows/data-pipeline.yml:
--------------------------------------------------------------------------------
1 | name: data pipeline end-to-end
2 |
3 | on:
4 | schedule:
5 | - cron: '00 18 * * *' # At 18:00 UTC every day; see https://crontab.guru/
6 |
7 | defaults:
8 | run:
9 | shell: bash
10 |
11 | jobs:
12 | unittest:
13 | runs-on: ubuntu-latest
14 | strategy:
15 | fail-fast: false
16 | matrix:
17 | python-version: [ '3.7' ]
18 | steps:
19 | - name: Checkout repository
20 | uses: actions/checkout@v2
21 |
22 | - name: Install Other Dependencies
23 | run: |
24 | python -m pip install --upgrade pip
25 | python -m pip install boto3
26 |
27 | - name: Configure AWS Credentials
28 | uses: aws-actions/configure-aws-credentials@v1
29 | with:
30 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
31 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
32 | aws-region: us-east-1
33 |
34 | - name: Test Data Pipeline on AWS Batch
35 | run: |
36 | python ./tools/batch/submit-job.py --region us-east-1 \
37 | --job-type c5n.4x \
38 | --source-ref ${{ github.ref }} \
39 | --work-dir tools/batch/batch_states \
40 | --remote https://github.com/${{ github.repository }} \
41 | --command "bash test_data_pipeline.sh" --wait
42 |
--------------------------------------------------------------------------------
/.github/workflows/nightly-test.yml:
--------------------------------------------------------------------------------
1 | name: nightly test
2 |
3 | on:
4 | schedule:
5 | - cron: '30 23 * * *' # At 23:30 UTC every day; see https://crontab.guru/
6 |
7 | defaults:
8 | run:
9 | shell: bash
10 |
11 | jobs:
12 | unittest:
13 | runs-on: ubuntu-latest
14 | strategy:
15 | fail-fast: false
16 | matrix:
17 | python-version: [ '3.7' ]
18 | steps:
19 | - name: Checkout repository
20 | uses: actions/checkout@v2
21 |
22 | - name: Install Other Dependencies
23 | run: |
24 | python -m pip install --upgrade pip
25 | python -m pip install boto3
26 |
27 | - name: Configure AWS Credentials
28 | uses: aws-actions/configure-aws-credentials@v1
29 | with:
30 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
31 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
32 | aws-region: us-east-1
33 |
34 | - name: Test GluonNLP on MXNet nightly release
35 | run: |
36 | echo "Start submitting job"
37 | python ./tools/batch/submit-job.py --region us-east-1 \
38 | --job-type g4dn.4x \
39 | --name GluonNLP-Nightly-Test \
40 | --source-ref ${{ github.ref }} \
41 | --work-dir . \
42 | --remote https://github.com/${{ github.repository }} \
43 | --command "python3 -m pip install pytest-forked \
44 | && python3 -m pip install -U --pre 'mxnet-cu102>=2.0.0b20210418' -f https://dist.mxnet.io/python/cu102 \
45 | && python3 -m pytest --forked --durations=50 --device="cpu" --device="gpu" --runslow ./tests/" \
46 | --wait | tee batch_job.log
47 |
48 | - name: Upload Cloud Watch Log
49 | if: ${{ failure() || success() }}
50 | uses: actions/upload-artifact@v2
51 | with:
52 | name: Test_Log
53 | path: ./batch_job.log
54 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 |
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 |
73 | # pyenv
74 | .python-version
75 |
76 | # pycharm
77 | .idea
78 |
79 | # celery beat schedule file
80 | celerybeat-schedule
81 |
82 | # SageMath parsed files
83 | *.sage.py
84 |
85 | # dotenv
86 | .env
87 |
88 | # virtualenv
89 | .venv
90 | venv/
91 | ENV/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
106 | # test data
107 | tests/data/
108 | tests/data/embedding/
109 | tests/data/my_embed/
110 | tests/externaldata/
111 | .pytest_cache
112 |
113 | # docs
114 | docs/html
115 |
116 | # release
117 | scripts/*.zip
118 | docs/tutorials/*.zip
119 | docs/tutorials/*/*.ipynb
120 |
121 | conda
122 |
123 | # temp files
124 | *.swp
125 |
126 | # vscode
127 | .vscode
128 |
129 | # Mac
130 | .DS_Store
131 |
132 | # license checker
133 | ci/rat/apache-rat/
134 | ci/rat/apache-rat.jar
135 |
--------------------------------------------------------------------------------
/.pytype.cfg:
--------------------------------------------------------------------------------
1 | # NOTE: All relative paths are relative to the location of this file.
2 | [pytype]
3 | # Space-separated list of files or directories to process.
4 | inputs =
5 | src/gluonnlp
6 |
7 | # Python version (major.minor) of the target code.
8 | python_version = 3.6
9 |
--------------------------------------------------------------------------------
/CODEOWNERS:
--------------------------------------------------------------------------------
1 | # Watchers and contributors to DMLC GluonNLP repo directories/packages/files
2 | # Please see documentation of use of CODEOWNERS file at
3 | # https://help.github.com/articles/about-codeowners/ and
4 | # https://github.com/blog/2392-introducing-code-owners
5 | #
6 | # Anybody can add themselves or a team as additional watcher or contributor
7 | # to get notified about changes in a specific package.
8 | # See https://help.github.com/articles/about-teams for how to set up teams.
9 |
10 |
11 | # Global owners
12 | * @dmlc/gluon-nlp-committers @dmlc/gluon-nlp-reviewers
13 |
--------------------------------------------------------------------------------
/docs/.gitignore:
--------------------------------------------------------------------------------
1 | doxygen
2 | _build
3 | gen_modules
4 | tutorials
5 | doctrees
6 |
--------------------------------------------------------------------------------
/docs/.nojekyll:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/docs/.nojekyll
--------------------------------------------------------------------------------
/docs/404.rst:
--------------------------------------------------------------------------------
1 | :orphan:
2 |
3 | Page Not Found
4 | --------------
5 |
6 | You stumbled upon a page that's making us scratch our brains right now. Before any of us panics,
7 | we'd like you to know that you are being redirected to a better known and cozy place, in just a few seconds.
8 |
9 | .. image:: _static/404.jpg
10 | :alt: Page Not Found
11 | :width: 60%
12 | :align: center
13 | :target: ./index.html
14 |
15 | .. raw:: html
16 |
17 |
26 |
--------------------------------------------------------------------------------
/docs/README.txt:
--------------------------------------------------------------------------------
1 | The documentation of GluonNLP is generated with recommonmark and sphinx.
2 |
3 | - pip install sphinx>=1.5.5 sphinx-gallery sphinx_rtd_theme matplotlib Image recommonmark
4 |
5 | For more details, refer to [website/configuration.rst](website/configuration.rst)
6 |
--------------------------------------------------------------------------------
/docs/_static/404.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/docs/_static/404.jpg
--------------------------------------------------------------------------------
/docs/_static/custom.css:
--------------------------------------------------------------------------------
1 | .Logos {
2 | display: inline;
3 | margin: 1em;
4 | max-width: 120px;
5 | }
6 |
7 | .install {
8 | max-width: 800px;
9 | }
10 | .install .title {
11 | display: inline-block;
12 | min-width: 100px;
13 | text-transform: uppercase;
14 | font-size: 90%;
15 | color: #555;
16 | }
17 |
18 | .install .option {
19 | margin: 5px;
20 | }
21 |
22 | @media (max-width: 650px) {
23 | .install .option, .install .title {
24 | width: 90%;
25 | }
26 |
27 | .install .title {
28 | margin-top: 1em;
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/docs/_static/gluon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/docs/_static/gluon.ico
--------------------------------------------------------------------------------
/docs/_static/google_analytics.js:
--------------------------------------------------------------------------------
1 | (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
2 | (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
3 | m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
4 | })(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
5 |
6 | ga('create', 'UA-96378503-8', 'auto');
7 | ga('send', 'pageview');
8 |
--------------------------------------------------------------------------------
/docs/_static/hidebib.js:
--------------------------------------------------------------------------------
1 | // adapted from: http://www.robots.ox.ac.uk/~vedaldi/assets/hidebib.js
2 | function hideallbibs()
3 | {
4 | var el = document.getElementsByTagName("div") ;
5 | for (var i = 0 ; i < el.length ; ++i) {
6 | if (el[i].className == "paper") {
7 | var bib = el[i].getElementsByTagName("pre") ;
8 | if (bib.length > 0) {
9 | bib [0] .style.display = 'none' ;
10 | }
11 | }
12 | }
13 | }
14 |
15 | function togglebib(paperid)
16 | {
17 | var paper = document.getElementById(paperid) ;
18 | var bib = paper.getElementsByTagName('pre') ;
19 | if (bib.length > 0) {
20 | if (bib [0] .style.display == 'none') {
21 | bib [0] .style.display = 'block' ;
22 | } else {
23 | bib [0] .style.display = 'none' ;
24 | }
25 | }
26 | }
27 |
28 | function toggleblock(blockId)
29 | {
30 | var block = document.getElementById(blockId);
31 | if (block.style.display == 'none') {
32 | block.style.display = 'block' ;
33 | } else {
34 | block.style.display = 'none' ;
35 | }
36 | }
37 |
38 | function hideblock(blockId)
39 | {
40 | var block = document.getElementById(blockId);
41 | block.style.display = 'none' ;
42 | }
43 |
--------------------------------------------------------------------------------
/docs/_static/install-options.js:
--------------------------------------------------------------------------------
1 | $(document).ready(function () {
2 |
3 | function label(lbl) {
4 | return $.trim(lbl.replace(/[ .]/g, '-').replace('+-', '').toLowerCase());
5 | }
6 |
7 | // a hack: macos doesn't support cuda, so disable all cuda options when it
8 | // is selected.
9 | function disableCuda() {
10 | $('.install .option').each(function(){
11 | if (label($(this).text()).indexOf("cuda") != -1) {
12 | $(this).addClass('disabled');
13 | }
14 | });
15 | }
16 | function enableCuda() {
17 | $('.install .option').each(function(){
18 | if (label($(this).text()).indexOf("cuda") != -1) {
19 | $(this).removeClass('disabled');
20 | }
21 | });
22 | }
23 |
24 | // find the user os, and set the according option to active
25 | function setActiveOSButton() {
26 | var os = "linux"
27 | var agent = window.navigator.userAgent.toLowerCase();
28 | if (agent.indexOf("win") != -1) {
29 | os = "windows"
30 | } else if (agent.indexOf("mac") != -1) {
31 | os = "macos"
32 | }
33 | if (os == "macos") {
34 | disableCuda();
35 | }
36 | $('.install .option').each(function(){
37 | if (label($(this).text()).indexOf(os) != -1) {
38 | $(this).addClass('active');
39 | }
40 | });
41 | }
42 |
43 | setActiveOSButton();
44 |
45 | // apply theme
46 | function setTheme() {
47 | $('.opt-group .option').each(function(){
48 | $(this).addClass('mdl-button mdl-js-button mdl-js-ripple-effect mdl-button--raised ');
49 | $(this).attr('id', label($(this).text()));
50 | });
51 | $('.opt-group .active').each(function(){
52 | $(this).addClass('mdl-button--colored');
53 | });
54 | }
55 | setTheme();
56 |
57 |
58 | // show the command according to the active options
59 | function showCommand() {
60 | $('.opt-group .option').each(function(){
61 | $('.'+label($(this).text())).hide();
62 | // console.log('disable '+label($(this).text()));
63 | });
64 | $('.opt-group .active').each(function(){
65 | $('.'+label($(this).text())).show();
66 | // console.log('enable '+label($(this).text()));
67 | });
68 | }
69 | showCommand();
70 |
71 | function setOptions() {
72 | var el = $(this);
73 | el.siblings().removeClass('active');
74 | el.siblings().removeClass('mdl-button--colored');
75 | el.addClass('active');
76 | el.addClass('mdl-button--colored');
77 | // console.log('enable'+el.text())
78 | // console.log('disable'+el.siblings().text())
79 | console.log($('.install #macos').hasClass('active') )
80 | if ($('.install #macos').hasClass('active') == true) {
81 | disableCuda();
82 | } else {
83 | enableCuda();
84 | }
85 | showCommand();
86 | }
87 |
88 | $('.opt-group').on('click', '.option', setOptions);
89 |
90 | });
91 |
--------------------------------------------------------------------------------
/docs/api/attention.rst:
--------------------------------------------------------------------------------
1 | gluonnlp.attention_cell
2 | =======================
3 |
4 | GluonNLP Toolkit provides ways to implement the attention mechanism that is prevailing in NLP models.
5 |
6 | .. currentmodule:: gluonnlp.attention_cell
7 |
8 | Attention Mechanism
9 | -------------------
10 |
11 | .. automodule:: gluonnlp.attention_cell
12 | :members:
13 | :imported-members:
14 | :special-members: __contains__, __getitem__, __setitem__
15 |
16 |
--------------------------------------------------------------------------------
/docs/api/data.rst:
--------------------------------------------------------------------------------
1 | gluonnlp.data
2 | =============
3 |
4 | GluonNLP Toolkit provides tools for building efficient data pipelines for NLP tasks.
5 |
6 | .. currentmodule:: gluonnlp.data
7 |
8 | Tokenizers
9 | ----------
10 | .. automodule:: gluonnlp.data.tokenizers
11 | :members:
12 | :imported-members:
13 | :special-members: __contains__, __getitem__, __setitem__
14 |
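A minimal usage sketch (assuming the ``WhitespaceTokenizer`` exported by
``gluonnlp.data.tokenizers``, cf. ``tokenizers/whitespace.py`` in the source tree):

.. code-block:: python

   from gluonnlp.data.tokenizers import WhitespaceTokenizer

   # Splits on whitespace; encode() maps a string to a list of tokens.
   tokenizer = WhitespaceTokenizer()
   print(tokenizer.encode('Hello world !'))  # ['Hello', 'world', '!']
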
15 | Vocabulary
16 | ----------
17 | .. automodule:: gluonnlp.data.vocab
18 | :members:
19 | :imported-members:
20 | :special-members: __contains__, __getitem__, __setitem__
21 |
22 | Batchify Function
23 | -----------------
24 | .. automodule:: gluonnlp.data.batchify
25 | :members:
26 |
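A minimal sketch of composing batchify functions (assuming ``Pad``, ``Stack``
and ``Tuple`` keep their usual GluonNLP semantics):

.. code-block:: python

   from gluonnlp.data import batchify

   # Each sample is (token_ids, length): pad the ids, stack the lengths.
   batchify_fn = batchify.Tuple(batchify.Pad(val=0), batchify.Stack())
   token_ids, lengths = batchify_fn([([1, 2, 3], 3), ([4, 5], 2)])
   print(token_ids)  # shape (2, 3); second row padded with 0
   print(lengths)    # [3, 2]
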
27 | Data Sampler
28 | ------------
29 | .. automodule:: gluonnlp.data.sampler
30 | :members:
31 | :imported-members:
32 |
33 | Text Filtering
34 | --------------
35 | .. automodule:: gluonnlp.data.filtering
36 | :members:
37 | :imported-members:
38 |
39 | Data Loading
40 | ------------
41 | .. automodule:: gluonnlp.data.loading
42 | :members:
43 | :imported-members:
44 |
--------------------------------------------------------------------------------
/docs/api/embedding.rst:
--------------------------------------------------------------------------------
1 | gluonnlp.embedding
2 | ==================
3 |
4 | GluonNLP Toolkit provides tools for working with embeddings.
5 |
6 | .. currentmodule:: gluonnlp.embedding
7 |
8 | This page describes the ``gluonnlp`` APIs for text embedding, such as loading
9 | pre-trained embedding vectors for text tokens and storing them in the
10 | ``numpy.ndarray`` format.
11 |
12 |
13 | Pre-trained Embeddings
14 | ----------------------
15 |
16 | .. currentmodule:: gluonnlp.embedding
17 | .. autosummary::
18 | :nosignatures:
19 |
20 | list_sources
21 | load_embeddings
22 | get_fasttext_model
23 |
24 |
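A minimal usage sketch (assuming the helpers above keep their documented roles;
the source name ``'glove.6B.50d'`` is illustrative):

.. code-block:: python

   from gluonnlp.data import Vocab
   from gluonnlp.embedding import list_sources, load_embeddings

   print(list_sources())                  # names of the available pre-trained sources
   vocab = Vocab(['hello', 'world'])
   matrix = load_embeddings(vocab, 'glove.6B.50d')
   print(matrix.shape)                    # (len(vocab), dim) numpy.ndarray
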
25 | API Reference
26 | -------------
27 |
28 | .. automodule:: gluonnlp.embedding
29 | :members:
30 | :imported-members:
31 | :special-members: __contains__, __getitem__, __setitem__
32 |
33 |
34 |
--------------------------------------------------------------------------------
/docs/api/index.rst:
--------------------------------------------------------------------------------
1 | API Documentation
2 | =================
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 |
7 | data
8 | embedding
9 | models
10 | attention
11 | layers
12 | operators
13 | sequence_sampler
14 | utils
15 |
--------------------------------------------------------------------------------
/docs/api/layers.rst:
--------------------------------------------------------------------------------
1 | gluonnlp.layers
2 | ===============
3 |
4 | GluonNLP Toolkit provides some common layers that can help you build NLP models.
5 |
6 | .. currentmodule:: gluonnlp.layers
7 |
8 | Layers
9 | ------
10 |
11 | .. automodule:: gluonnlp.layers
12 | :members:
13 | :imported-members:
14 | :special-members: __contains__, __getitem__, __setitem__
15 |
16 |
--------------------------------------------------------------------------------
/docs/api/models.rst:
--------------------------------------------------------------------------------
1 | gluonnlp.models
2 | ===============
3 |
4 | GluonNLP Toolkit supplies models for common NLP tasks with pre-trained weights. By default,
5 | all requested pre-trained weights are downloaded from a public repository and stored in `~/.gluonnlp/models/`.
6 |
7 | .. currentmodule:: gluonnlp.models
8 | .. autosummary::
9 |
10 | Models
11 | ------
12 | .. automodule:: gluonnlp.models
13 | :members:
14 | :no-inherited-members:
15 | :imported-members:
16 | :special-members: __contains__, __getitem__, __setitem__
17 |
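As a quick orientation, here is a minimal sketch of loading a pretrained backbone
(assuming the ``get_backbone``/``list_backbone_names`` helpers exported by
``gluonnlp.models``; the exact tuple returned by ``get_backbone`` may differ):

.. code-block:: python

   from gluonnlp.models import get_backbone, list_backbone_names

   print(list_backbone_names()[:5])
   # The first call downloads the weights to ~/.gluonnlp/models/.
   model_cls, cfg, tokenizer, params_path, _ = get_backbone('google_en_uncased_bert_base')
   model = model_cls.from_cfg(cfg)
   model.load_parameters(params_path)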
--------------------------------------------------------------------------------
/docs/api/operators.rst:
--------------------------------------------------------------------------------
1 | gluonnlp.op
2 | ===============
3 |
4 | GluonNLP Toolkit provides some functions that can help you build NLP architectures and training pipelines.
5 |
6 | .. currentmodule:: gluonnlp.op
7 |
8 | Operators
9 | ---------
10 |
11 | .. automodule:: gluonnlp.op
12 | :members:
13 | :imported-members:
14 | :special-members: __contains__, __getitem__, __setitem__
15 |
16 |
--------------------------------------------------------------------------------
/docs/api/sequence_sampler.rst:
--------------------------------------------------------------------------------
1 | gluonnlp.sequence_sampler
2 | =========================
3 |
4 | GluonNLP Toolkit provides ways to sample from a sequence generator.
5 |
6 | .. currentmodule:: gluonnlp.sequence_sampler
7 |
8 | Sequence Sampler
9 | ----------------
10 |
11 | .. automodule:: gluonnlp.sequence_sampler
12 | :members:
13 | :imported-members:
14 | :special-members: __contains__, __getitem__, __setitem__
15 |
16 |
--------------------------------------------------------------------------------
/docs/api/utils.rst:
--------------------------------------------------------------------------------
1 | gluonnlp.utils
2 | ==============
3 |
4 | GluonNLP Toolkit provides miscellaneous utilities for configuration, parameter handling, preprocessing, and testing.
5 |
6 | .. currentmodule:: gluonnlp.utils
7 |
8 | API Reference
9 | -------------
10 |
11 | .. automodule:: gluonnlp.utils
12 | :members:
13 | :imported-members:
14 | :special-members: __contains__, __getitem__, __setitem__
15 |
--------------------------------------------------------------------------------
/docs/examples.rst:
--------------------------------------------------------------------------------
1 | Examples
2 | --------
3 |
4 | .. container:: cards
5 |
6 | .. card::
7 | :title: Benchmarking the Performance of NLP Backbones
8 | :link: model_zoo/benchmarks/index.html
9 |
10 | Benchmarking the performance of NLP models.
11 | We released the benchmarking script that compares different NLP packages.
12 |
13 | .. card::
14 | :title: Conversion Scripts
15 | :link: model_zoo/conversion_toolkits/index.html
16 |
17 | Converting NLP models from other frameworks to GluonNLP.
18 |
19 | .. card::
20 | :title: Datasets
21 | :link: model_zoo/datasets/index.html
22 |
23 | Examples of the datasets supported by `nlp_data`.
24 |
25 | .. card::
26 | :title: Generation
27 | :link: model_zoo/generation/index.html
28 |
29 | Example of how to generate text from a pretrained GPT-2 model with GluonNLP.
30 | We provide the generation script and compare different sampling methods.
31 |
32 | .. card::
33 | :title: Machine Translation
34 | :link: model_zoo/machine_translation/index.html
35 |
36 | Train machine translation model with GluonNLP.
37 |
38 | .. card::
39 | :title: Data Preprocessing Toolkit in GluonNLP
40 | :link: model_zoo/processing/index.html
41 |
42 | Examples of the data processing toolkit (`nlp_process`) offered in GluonNLP.
43 |
44 | .. card::
45 | :title: Pretraining Models
46 | :link: model_zoo/pretraining/index.html
47 |
48 | Examples of pretraining your own backbones.
49 |
50 | .. card::
51 | :title: Question Answering Examples
52 | :link: model_zoo/question_answering/index.html
53 |
54 | Run SQuAD 1.1 and 2.0 finetuning with GluonNLP. You will learn how to run the models with
55 | mixed-precision training (AMP) and Horovod.
56 |
57 |
--------------------------------------------------------------------------------
/docs/genindex.rst:
--------------------------------------------------------------------------------
1 | Index
2 | =====
3 |
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | GluonNLP: NLP made easy
2 | =======================
3 |
4 | Get Started: A Quick Example
5 | ----------------------------
6 |
7 | Here is a quick example that downloads and creates a word embedding model and then
8 | computes the cosine similarity between two words.
9 |
10 | (You can click the play button below to run this example.)
11 |
12 | .. container:: demo
13 | :name: frontpage-demo
14 |
15 | `Word Embedding `_
16 |
17 | .. raw:: html
18 |
19 |
38 |
39 |
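The same computation as a minimal offline sketch (assuming the ``Vocab`` and
``load_embeddings`` APIs documented under the API section; the source name
``'glove.6B.50d'`` is illustrative):

.. code-block:: python

   import numpy as np
   from gluonnlp.data import Vocab
   from gluonnlp.embedding import load_embeddings

   vocab = Vocab(['baby', 'infant'])
   matrix = load_embeddings(vocab, 'glove.6B.50d')  # rows follow vocab order

   def cos_sim(a, b):
       return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

   print(cos_sim(matrix[vocab['baby']], matrix[vocab['infant']]))
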
40 | .. include:: examples.rst
41 |
42 | And more in :doc:`tutorials `.
43 |
44 |
45 | .. include:: install.rst
46 |
47 |
48 | About GluonNLP
49 | --------------
50 |
51 | .. hint::
52 |
53 | You can find the documentation for our master development branch `here `_.
54 |
55 | GluonNLP provides implementations of the state-of-the-art (SOTA) deep learning
56 | models in NLP, and building blocks for text data pipelines and models.
57 | It is designed for engineers, researchers, and students to quickly prototype
58 | research ideas and products based on these models. This toolkit offers four main features:
59 |
60 | 1. Carefully designed APIs that greatly reduce the implementation complexity.
61 | 2. Pre-trained models for common NLP tasks.
62 | 3. Tutorials to help get started on new NLP tasks.
63 | 4. Community support.
64 |
65 | This toolkit assumes that users have basic knowledge about deep learning and
66 | NLP. Otherwise, please refer to an introductory course such as
67 | `Dive into Deep Learning `_ or
68 | `Stanford CS224n `_.
69 | If you are not familiar with Gluon, check out the `Gluon documentation
70 | `__.
71 | You may find the 60-min Gluon crash course linked from there especially helpful.
72 |
73 |
74 | .. toctree::
75 | :hidden:
76 | :maxdepth: 2
77 |
78 | tutorials/index
79 | model_zoo/index
80 | api/index
81 | website/index
82 | genindex
83 |
--------------------------------------------------------------------------------
/docs/install.rst:
--------------------------------------------------------------------------------
1 | Installation
2 | ------------
3 |
4 | .. Ignore prerequisites to make the index page concise, which will be shown at
5 | the install page
6 |
7 | .. raw:: html
8 |
9 |
10 |
11 | .. include:: install/install-include.rst
12 |
13 | .. raw:: html
14 |
15 |
16 |
17 |
18 | Check :doc:`install/install-more` for more installation instructions and options.
19 |
--------------------------------------------------------------------------------
/docs/install/install-more.rst:
--------------------------------------------------------------------------------
1 | :orphan:
2 |
3 | Installation
4 | ------------
5 |
6 | .. include:: install-include.rst
7 |
8 | .. raw:: html
9 |
10 |
11 |
12 |
13 |
14 | Next steps
15 | ----------
16 |
17 | - Check out Apache MXNet `Get Started `_ for more options such as ARM devices and docker images.
18 | - `Verify your MXNet installation `_
19 | - `Configure MXNet environment variables `_
20 | - For new users: MXNet `Crash Course `_ and `other tutorials `_.
21 | - For experienced users: `Packages & Modules `_ and `Performance tips `_.
22 | - For advanced users: Apache MXNet `API `_ and `GluonNLP API <../api/index.html>`_.
23 |
24 | ..
25 | TODO: write a new directive `no-local-toc` for it
26 |
27 | .. raw:: html
28 |
29 |
30 |
--------------------------------------------------------------------------------
/docs/md2ipynb.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import time
4 |
5 | import nbformat
6 | import notedown
7 |
8 | parser = argparse.ArgumentParser(description='Convert md file to ipynb files.')
9 | parser.add_argument('input', help='input.md', type=str)
10 | parser.add_argument('-d', '--disable_compute',
11 | help='Disable computing python scripts', action="store_true")
12 | args = parser.parse_args()
13 |
14 | # timeout for each notebook, in sec
15 | timeout = 90 * 60
16 |
17 | # files listed here will be skipped during execution
18 | ignore_execution = []
19 |
20 | # Change working directory to directory of input file
21 | input_dir, input_fn = os.path.split(args.input)
22 | if input_dir:
23 | os.chdir(input_dir)
24 |
25 | output_fn = '.'.join(input_fn.split('.')[:-1] + ['ipynb'])
26 |
27 | reader = notedown.MarkdownReader()
28 |
29 | # read
30 | with open(input_fn, encoding='utf-8', mode='r') as f:
31 | notebook = reader.read(f)
32 |
33 | if not any([i in input_fn for i in ignore_execution]):
34 | tic = time.time()
35 | if not args.disable_compute:
36 | notedown.run(notebook, timeout)
37 | print('=== Finished evaluation in %f sec' % (time.time() - tic))
38 |
39 | # write
40 | # need to add language info for syntax highlighting
41 | notebook['metadata'].update({'language_info': {'name': 'ipython', 'version': 3}})
42 |
43 | notebook_json = nbformat.writes(notebook)
44 |
45 | with open(output_fn, encoding='utf-8', mode='w') as f:
46 | f.write(notebook_json)
47 |
--------------------------------------------------------------------------------
/docs/model_zoo:
--------------------------------------------------------------------------------
1 | ../scripts
--------------------------------------------------------------------------------
/docs/tutorials/deep_learning_compiler/index.rst:
--------------------------------------------------------------------------------
1 | Compile NLP Models
2 | ==================
3 |
4 | .. container:: cards
5 |
6 | .. card::
7 | :title: Compile and accelerate NLP models via TVM
8 | :link: tvm_basic.html
9 |
10 | Basic example of how to use TVM to compile backbone models in GluonNLP.
11 |
12 | .. toctree::
13 | :hidden:
14 | :maxdepth: 2
15 |
16 | tvm_basic.ipynb
17 |
--------------------------------------------------------------------------------
/docs/tutorials/index.rst:
--------------------------------------------------------------------------------
1 | Tutorials
2 | =========
3 |
4 | Interested in getting started in a new NLP area? Here are some tutorials to help get started.
5 |
6 |
7 | Embedding
8 | -----------------------
9 |
10 | .. container:: cards
11 |
12 | .. card::
13 | :title: Using Pre-trained Word Embeddings
14 | :link: word_embedding/word_embedding.html
15 |
16 | Basics on how to use word embedding with vocab in GluonNLP and apply it on word similarity and
17 | analogy problems.
18 |
19 |
20 | Text Prediction
21 | -----------------------
22 |
23 | .. container:: cards
24 |
25 | .. card::
26 | :title: Text Prediction Part1
27 | :link: text_prediction/text_prediction_part1.html
28 |
29 | Load pretrained NLP backbones.
30 |
31 | .. card::
32 | :title: Text Prediction Part2
33 | :link: text_prediction/text_prediction_part2.html
34 |
35 | An example that finetunes MobileBERT for sentiment analysis and sentence similarity.
36 |
37 |
38 | Question Answering
39 | -----------------------
40 |
41 | .. container:: cards
42 |
43 | .. card::
44 | :title: Question Answering with GluonNLP
45 | :link: question_answering/question_answering.html
46 |
47 | Learn how to build a model for Question Answering (QA) based on the backbone provided in GluonNLP.
48 |
49 |
50 | Tokenization
51 | -----------------------
52 |
53 | .. container:: cards
54 |
55 | .. card::
56 | :title: Tokenization Part1
57 | :link: tokenization/tokenization_part1.html
58 |
59 | The basic usage of tokenizers in GluonNLP.
60 |
61 | .. card::
62 | :title: Tokenization Part2
63 | :link: tokenization/tokenization_part2.html
64 |
65 | Try out different subword learning algorithms.
66 |
67 |
68 | Using Pretrained Models
69 | -----------------------
70 |
71 | .. container:: cards
72 |
73 | .. card::
74 | :title: T5 for Masked Language Modeling
75 | :link: pretrained_models/pretrained_t5_mlm.html
76 |
77 | An example of using pretrained models in GluonNLP.
78 |
79 |
80 | Compiling NLP Models
81 | --------------------
82 |
83 | .. container:: cards
84 |
85 | .. card::
86 | :title: Compile and accelerate NLP models via TVM
87 | :link: deep_learning_compiler/tvm_basic.html
88 |
89 | Basic example of how to use TVM to compile backbone models in GluonNLP.
90 |
91 |
92 | .. toctree::
93 | :hidden:
94 | :maxdepth: 2
95 |
96 | word_embedding/index
97 | text_prediction/index
98 | question_answering/index
99 | tokenization/index
100 | pretrained_models/index
101 | deep_learning_compiler/index
102 |
--------------------------------------------------------------------------------
/docs/tutorials/pretrained_models/index.rst:
--------------------------------------------------------------------------------
1 | Using Pretrained Models
2 | =======================
3 |
4 | .. container:: cards
5 |
6 | .. card::
7 | :title: T5 for Masked Language Modeling
8 | :link: pretrained_t5_mlm.html
9 |
10 | Use a pretrained T5 for MLM with noise spans.
11 |
12 | .. toctree::
13 | :hidden:
14 | :maxdepth: 2
15 |
16 | pretrained_t5_mlm.ipynb
17 |
--------------------------------------------------------------------------------
/docs/tutorials/pretrained_models/pretraining_objectives.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/docs/tutorials/pretrained_models/pretraining_objectives.png
--------------------------------------------------------------------------------
/docs/tutorials/question_answering/index.rst:
--------------------------------------------------------------------------------
1 | Question Answering
2 | =======================
3 |
4 | .. container:: cards
5 |
6 | .. card::
7 | :title: Question Answering with GluonNLP
8 | :link: question_answering.html
9 |
10 | Learn how to build a model for Question Answering (QA) based on the backbone provided in GluonNLP.
11 |
12 | .. toctree::
13 | :hidden:
14 | :maxdepth: 2
15 |
16 | question_answering.ipynb
17 |
--------------------------------------------------------------------------------
/docs/tutorials/question_answering/offsets_match.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/docs/tutorials/question_answering/offsets_match.png
--------------------------------------------------------------------------------
/docs/tutorials/question_answering/qa1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/docs/tutorials/question_answering/qa1.png
--------------------------------------------------------------------------------
/docs/tutorials/question_answering/qa2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/docs/tutorials/question_answering/qa2.png
--------------------------------------------------------------------------------
/docs/tutorials/text_prediction/bert_illustration.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/docs/tutorials/text_prediction/bert_illustration.png
--------------------------------------------------------------------------------
/docs/tutorials/text_prediction/index.rst:
--------------------------------------------------------------------------------
1 | Text Prediction
2 | =======================
3 |
4 | .. container:: cards
5 |
6 | .. card::
7 | :title: Text Prediction Part1
8 | :link: text_prediction_part1.html
9 |
10 | Load pretrained NLP backbones.
11 |
12 | .. card::
13 | :title: Text Prediction Part2
14 | :link: text_prediction_part2.html
15 |
16 | An example that finetunes MobileBERT for sentiment analysis and sentence similarity.
17 |
18 |
19 | .. toctree::
20 | :hidden:
21 | :maxdepth: 2
22 |
23 | text_prediction_part1.ipynb
24 | text_prediction_part2.ipynb
25 |
--------------------------------------------------------------------------------
/docs/tutorials/text_prediction/merge_input.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/docs/tutorials/text_prediction/merge_input.png
--------------------------------------------------------------------------------
/docs/tutorials/tokenization/index.rst:
--------------------------------------------------------------------------------
1 | Tokenization
2 | =======================
3 |
4 | .. container:: cards
5 |
6 | .. card::
7 | :title: Tokenization Part1
8 | :link: tokenization_part1.html
9 |
10 | The basic usage of tokenizers in GluonNLP.
11 |
12 |
13 | .. card::
14 | :title: Tokenization Part2
15 | :link: tokenization_part2.html
16 |
17 | Try out different subword learning algorithms.
18 |
19 |
20 | .. card::
21 | :title: Tokenization Part3
22 | :link: tokenization_part3.html
23 |
24 | Tutorial that downloads Wikipedia data and learns a subword model.
25 |
26 |
27 | .. toctree::
28 | :hidden:
29 | :maxdepth: 2
30 |
31 | tokenization_part1.ipynb
32 | tokenization_part2.ipynb
33 | tokenization_part3.ipynb
34 |
--------------------------------------------------------------------------------
/docs/tutorials/tokenization/tokenization_part3.md:
--------------------------------------------------------------------------------
1 | # Part3: Download Data from Wikipedia and Learn Subword
2 |
3 | In this tutorial, we will download the Wikipedia classical Chinese dataset with `nlp_data` and learn a customized sentencepiece vocabulary.
4 |
5 | ## Download Data
6 |
7 | ```{.shell .input}
8 | !nlp_data prepare_wikipedia --mode download+format --lang zh-classical --date latest --quiet -o wiki_zh_classical
9 | ```
10 |
11 | To save time, we will use the first 10000 sentences for training the subword model.
12 |
13 |
14 | ```{.shell .input}
15 | !head -10000 wiki_zh_classical/prepared_wikipedia/wikipedia-prepared-0000.txt > train_corpus.txt
16 | ```
17 |
18 | ```{.shell .input}
19 | !nlp_process learn_subword --model spm --corpus train_corpus.txt --vocab-size 10000 \
20 | --disable-bos --disable-eos \
21 | --custom-special-tokens "cls_token=[CLS]" "sep_token=[SEP]" "mask_token=[MASK]"
22 | ```
23 |
24 | The model is saved in the "spm" folder.
25 |
26 | ```{.shell .input}
27 | !ls spm
28 | ```
29 |
30 | ## Build the Tokenizer with the Saved Model
31 |
32 |
33 | ```{.python .input}
34 | import gluonnlp
35 | import json
36 | from gluonnlp.data.tokenizers import SentencepieceTokenizer
37 | tokenizer = SentencepieceTokenizer(model_path='spm/spm.model', vocab="spm/spm.vocab")
38 | print(tokenizer)
39 | print()
40 | print('The first 10 tokens in the vocabulary:')
41 | print('--------------------------------------')
42 | print(tokenizer.vocab.all_tokens[:10])
43 | ```
44 |
45 | You can use the tokenizer directly.
46 |
47 |
48 | ```{.python .input}
49 | tokenizer.encode('賈夫人仙逝揚州城 ·')
50 | ```
51 |
52 |
53 | ```{.python .input}
54 | tokenizer.encode_with_offsets('賈夫人仙逝揚州城 ·')
55 | ```
56 |
57 | ## Explore More Options
58 |
59 | To explore more options, you may check the README.
60 |
61 |
62 | ```{.shell .input}
63 | !nlp_process learn_subword --help
64 | ```
65 |
--------------------------------------------------------------------------------
/docs/tutorials/word_embedding/index.rst:
--------------------------------------------------------------------------------
1 | Representation Learning
2 | =======================
3 |
4 | .. container:: cards
5 |
6 | .. card::
7 | :title: Using Pre-trained Word Embeddings
8 | :link: word_embedding.html
9 |
10 | Basics on how to use word embedding with vocab in GluonNLP and apply it on word similarity and
11 | analogy problems.
12 |
13 |
14 | .. toctree::
15 | :hidden:
16 | :maxdepth: 2
17 |
18 | word_embedding.ipynb
19 |
--------------------------------------------------------------------------------
/docs/website/index.rst:
--------------------------------------------------------------------------------
1 | Community
2 | =========
3 |
4 | .. card::
5 | :title: Community
6 | :is_head: true
7 | :link: https://www.apache.org/foundation/policies/conduct
8 |
9 | Welcome to the GluonNLP community. We strive to foster a collaborative and welcoming community. We
10 | expect all members to follow the `code of conduct `__.
11 |
12 |
13 | .. container:: cards
14 |
15 | .. card::
16 | :title: GitHub Issues
17 | :link: https://github.com/dmlc/gluon-nlp/issues
18 |
19 | Feature requests, bug reports, design and roadmap discussion.
20 |
21 | .. card::
22 | :title: GluonNLP Slack Channel
23 | :link: https://apache-mxnet.slack.com/messages/CCCDM10V9
24 |
25 | #gluon-nlp Slack channel. Click the `sign-up link `_ to register.
26 |
27 |
28 | Interested in contributing to GluonNLP? Check our contribution guide:
29 |
30 | .. toctree::
31 | :maxdepth: 3
32 |
33 | contribute
34 | git
35 | release
36 | configuration
--------------------------------------------------------------------------------
/docs/website/release.rst:
--------------------------------------------------------------------------------
1 | Release Checklist
2 | =================
3 |
4 | Below is the checklist for releasing a new minor version of GluonNLP:
5 |
6 | - Create a new release branch $major.$minor.x with commits from the master branch
7 | - Bump the version in the master branch to $major.$minor+1.$patch.dev
8 | - Bump the version in the release branch to $major.$minor.$patch
9 | - Update the installation from source instruction in the release branch
10 | - Draft the release note, highlight important events/models/features, as well as breaking changes
11 | - Publish the release on Github, creating a tag $major.$minor.$patch
12 | - Check the content at http://gluon-nlp.mxnet.io/$major.$minor.x/index.html
13 | - Upload and refresh the default version website
14 | - Prepare pip package
15 | - Make an announcement (Twitter, etc.)
16 |
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | markers =
3 | seed: set the python, numpy and mxnet random seeds to a specified value for test reproducibility
4 | serial: mark a test that requires more resources to run and is thus only suitable for a serial run.
5 | remote_required: mark a test that requires internet access.
6 | gpu: mark a test that requires GPU.
7 | integration: mark an integration test
8 | skip_master: mark a test that is temporarily skipped for mxnet master validation.
9 |
--------------------------------------------------------------------------------
/scripts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/scripts/__init__.py
--------------------------------------------------------------------------------
/scripts/benchmarks/README.md:
--------------------------------------------------------------------------------
1 | # Benchmarking the Performance of NLP Backbones
2 |
3 | We benchmark the latency and peak memory usage of a single training (forward + backward) and inference (forward-only) step
4 | of the NLP backbones.
5 | For comparison, we also provide the numbers for the corresponding models in HuggingFace.
6 |
7 | ## Backbones in HuggingFace
8 |
9 | We use the [huggingface benchmark](https://github.com/huggingface/transformers/tree/master/examples/benchmarking)
10 | to benchmark the training + inference speed of common workloads in NLP.
11 |
12 | ```bash
13 | python3 -m pip install -U -r requirements.txt
14 | python3 benchmark_hf.py
15 | ```
16 |
17 | It will generate a list of csv files:
18 |
19 | ```
20 | ├── pytorch_train_fp32.csv
21 | ├── pytorch_train_fp16.csv
22 | ├── pytorch_infer_fp32.csv
23 | ├── pytorch_infer_fp16.csv
24 | ├── pytorch_infer_fp32_ts.csv
25 | ```
26 |
27 | ## GluonNLP Backbones based on MXNet-2.0
28 |
29 | We profile three options: `NT` layout, `NT` layout with `TN` layout as the compute layout,
30 | and `TN` layout.
31 |
32 | ```bash
33 | python3 -m pip install -U -r requirements.txt
34 | bash benchmark_gluonnlp.sh
35 | ```
36 |
37 | It will generate csv files with `gluonnlp_` as the prefix
38 | ```
39 | ├── gluonnlp_train_fp32_NT_NT.csv
40 | ├── gluonnlp_train_fp32_NT_TN.csv
41 | ├── gluonnlp_train_fp32_TN_TN.csv
42 | ├── gluonnlp_infer_fp32_NT_NT_tvm0.csv
43 | ├── gluonnlp_infer_fp32_NT_TN_tvm0.csv
44 | ├── gluonnlp_infer_fp32_TN_TN_tvm0.csv
45 | ```
46 |
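As a rough illustration of what `--layout` / `--compute_layout` control, here is a
minimal sketch (assuming the MXNet 2.0 based `gluonnlp.models.get_backbone` API and
a backbone config that exposes `MODEL.layout` / `MODEL.compute_layout`):

```python
import mxnet as mx
from gluonnlp.models import get_backbone

model_cls, cfg, tokenizer, _, _ = get_backbone('google_en_uncased_bert_base')
cfg.defrost()
cfg.MODEL.layout = 'TN'          # input layout: (sequence_length, batch_size)
cfg.MODEL.compute_layout = 'TN'  # layout used internally by the heavy ops
cfg.freeze()
model = model_cls.from_cfg(cfg)
model.initialize()

tokens = mx.np.ones((128, 4), dtype='int32')        # (seq_len, batch) for TN
token_types = mx.np.zeros((128, 4), dtype='int32')
valid_length = mx.np.full((4,), 128, dtype='int32')
out = model(tokens, token_types, valid_length)
```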
47 | ## GluonNLP + TVM for Inference
48 |
49 | Install TVM as described in https://tvm.apache.org/docs/install/index.html
50 |
51 | ```bash
52 | bash benchmark_gluonnlp_tvm.sh
53 | ```
54 |
55 | ```
56 | ├── gluonnlp_infer_fp32_NT_NT_tvm1.csv
57 | ├── gluonnlp_infer_fp32_NT_TN_tvm1.csv
58 | ├── gluonnlp_infer_fp32_TN_TN_tvm1.csv
59 | ```
60 |
61 | ## Generate the Benchmark Report
62 |
--------------------------------------------------------------------------------
/scripts/benchmarks/benchmark_gluonnlp.sh:
--------------------------------------------------------------------------------
1 | for mode in train inference
2 | do
3 | python3 benchmark_gluonnlp.py --layout NT --compute_layout NT --mode $mode
4 | done
5 |
6 | for mode in train inference
7 | do
8 | python3 benchmark_gluonnlp.py --layout NT --compute_layout TN --mode $mode
9 | done
10 |
11 | for mode in train inference
12 | do
13 | python3 benchmark_gluonnlp.py --layout TN --compute_layout TN --mode $mode
14 | done
15 |
--------------------------------------------------------------------------------
/scripts/benchmarks/benchmark_gluonnlp_fp16.sh:
--------------------------------------------------------------------------------
1 | for mode in train inference
2 | do
3 | python3 benchmark_gluonnlp.py --layout NT --compute_layout NT --mode $mode --use_fp16
4 | done
5 |
6 | for mode in train inference
7 | do
8 | python3 benchmark_gluonnlp.py --layout NT --compute_layout TN --mode $mode --use_fp16
9 | done
10 |
11 | for mode in train inference
12 | do
13 | python3 benchmark_gluonnlp.py --layout TN --compute_layout TN --mode $mode --use_fp16
14 | done
15 |
--------------------------------------------------------------------------------
/scripts/benchmarks/benchmark_gluonnlp_tvm.sh:
--------------------------------------------------------------------------------
1 | python3 benchmark_gluonnlp.py --layout NT --compute_layout NT --mode inference --use_tvm --instance_type g4
2 | python3 benchmark_gluonnlp.py --layout NT --compute_layout TN --mode inference --use_tvm --instance_type g4
3 | python3 benchmark_gluonnlp.py --layout TN --compute_layout TN --mode inference --use_tvm --instance_type g4
4 |
--------------------------------------------------------------------------------
/scripts/benchmarks/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers
2 | py3nvml
3 | torch
4 | torchvision
5 |
--------------------------------------------------------------------------------
/scripts/benchmarks/run_backbone_benchmark.sh:
--------------------------------------------------------------------------------
1 | python3 -m pip install -U -r requirements.txt
2 | python3 benchmark_hf.py
3 | bash benchmark_gluonnlp.sh
4 | bash benchmark_gluonnlp_fp16.sh
5 |
--------------------------------------------------------------------------------
/scripts/classification/README.md:
--------------------------------------------------------------------------------
1 | # Fine-tune Classification
2 | ## Prepare Datasets
3 | Use `nlp_data` to prepare the data first, e.g. for CoLA:
4 | ```bash
5 | nlp_data prepare_glue --benchmark glue -t cola
6 | ```
7 | ## Fine-tune Scripts
8 | Then run the script to fine-tune:
9 | ```bash
10 | python train_classification.py \
11 |     --model_name google_en_uncased_bert_base \
12 |     --task_name cola \
13 |     --lr 2e-5 \
14 |     --batch_size 32 \
15 |     --do_train \
16 |     --do_eval \
17 |     --seed 7800 \
18 |     --epochs 10 \
19 |     --optimizer adamw \
20 |     --train_dir glue/cola/train.parquet \
21 |     --eval_dir glue/cola/dev.parquet \
22 |     --gpus 0
23 | ```
24 | Alternatively, because some tasks (like MNLI) are slow to train, you can use Horovod to accelerate training:
25 | ```bash
26 | horovodrun -np 4 -H localhost:4 python train_classification.py \
27 |     --comm_backend horovod \
28 |     --model_name google_en_uncased_bert_base \
29 |     --task_name mnli \
30 |     --lr 2e-4 \
31 |     --batch_size 32 \
32 |     --do_train \
33 |     --do_eval \
34 |     --epochs 5 \
35 |     --log_interval 500 \
36 |     --warmup_ratio 0.1 \
37 |     --optimizer adamw \
38 |     --train_dir glue/mnli/train.parquet \
39 |     --eval_dir glue/mnli/dev_matched.parquet \
40 |     --gpus 0,1,2,3
41 | ```
42 |
43 | ## Some Results
44 | Here are some results together with the hyperparameters used to obtain them:
45 |
46 | | Task Name | Metric | Learning Rate | Batch Size | Seed | Epochs | Result | TensorBoard |
47 | |-----------|--------|---------------|------------|------|--------|--------|-------------|
48 | | SST | Accuracy | 2e-5 | 32 | 7800 | 5 | 93.23 | https://tensorboard.dev/experiment/eKVI0DC6SEWBbHzS8ZphNg/ |
49 | | STS | Pearson Corr. | 2e-5 | 32 | 24 | 10 | 89.26 | https://tensorboard.dev/experiment/kPOnlNeiQ4W5EmFlkqjC6A/ |
50 | | CoLA | Matthews Corr. | 2e-5 | 32 | 7800 | 10 | 59.23 | https://tensorboard.dev/experiment/33euRGh9SrW3p15JWgILnw/ |
51 | | RTE | Accuracy | 2e-5 | 32 | 1800 | 10 | 69.67 | https://tensorboard.dev/experiment/XjTxr5anRrC1LMukLJJQ3g/ |
52 | | MRPC | Accuracy/F1 | 3e-5 | 32 | 7800 | 5 | 85.38/87.31 | https://tensorboard.dev/experiment/jEJFq2XXQ8SvCxt6eKIjwg/ |
53 | | MNLI | Accuracy (m/mm) | 2e-5 | 48 | 7800 | 5 | 84.90/85.10 | https://tensorboard.dev/experiment/CZQlOBedRQeTZwn5o5fbKQ/ |
--------------------------------------------------------------------------------
/scripts/classification/classification.py:
--------------------------------------------------------------------------------
1 | import gluonnlp
2 | import numpy as np
3 | import mxnet as mx
4 | import pandas as pd
5 | import matplotlib.pyplot as plt
6 | from gluonnlp.data.sampler import SplitSampler
7 | from tqdm import tqdm
8 | from mxnet.gluon import nn
9 | from gluonnlp.models import get_backbone
10 | from gluonnlp.utils.parameter import clip_grad_global_norm
11 | from gluonnlp.utils.preprocessing import get_trimmed_lengths
12 | from gluonnlp.utils.misc import get_mxnet_visible_ctx, grouper, repeat
13 | from mxnet.gluon.data import batchify as bf
14 | from mxnet.gluon.data import DataLoader
15 | from mxnet.lr_scheduler import PolyScheduler
16 | from gluonnlp.utils import set_seed
17 |
18 | class TextPredictionNet(nn.HybridBlock):
19 |     def __init__(self, backbone, output_size=2):
20 | super().__init__()
21 | self.backbone = backbone
22 | self.output_size = output_size
23 | self.out_proj = nn.Dense(in_units=backbone.units,
24 | units=self.output_size,
25 | flatten=False)
26 |
27 |
28 | def forward(self, data, token_types, valid_length):
29 | _, pooled_out = self.backbone(data, token_types, valid_length)
30 | out = self.out_proj(pooled_out)
31 | return out
32 |
33 | def initialize_with_pretrained_backbone(self, backbone_params_path, ctx=None):
34 | self.backbone.load_parameters(backbone_params_path, ctx=ctx)
35 | self.out_proj.initialize(ctx=ctx)
36 |
37 |
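For orientation, a minimal usage sketch of `TextPredictionNet` (hedged: the model name follows the repo's conventions, and the real preprocessing, including special tokens, trimming, and batching, is handled by `train_classification.py`):

```python
# Run from scripts/classification/ so that `classification` is importable.
import mxnet as mx
from gluonnlp.models import get_backbone
from classification import TextPredictionNet

# Build the backbone and attach the classification head.
model_cls, cfg, tokenizer, backbone_param_path, _ = get_backbone('google_en_uncased_bert_base')
net = TextPredictionNet(model_cls.from_cfg(cfg), output_size=2)
net.initialize_with_pretrained_backbone(backbone_param_path, ctx=mx.cpu())

# Encode one sentence and run a forward pass (batch_size=1).
token_ids = tokenizer.encode('hello world', int)
data = mx.np.array([token_ids], dtype='int32')
token_types = mx.np.zeros_like(data)
valid_length = mx.np.array([len(token_ids)], dtype='int32')
logits = net(data, token_types, valid_length)  # shape (1, output_size)
```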
--------------------------------------------------------------------------------
/scripts/conversion_toolkits/README.md:
--------------------------------------------------------------------------------
1 | # Conversion Scripts
2 |
3 | In GluonNLP, we provide shared scripts to convert the model checkpoints in other repositories to GluonNLP.
4 |
5 | At this stage, the model needs to be downloaded locally first, since the conversion scripts accept
6 | only a file directory as the argument and do not support URLs. Both TensorFlow fine-tuned models that
7 | can be loaded as TF1 Hub modules and TF2 SavedModels are accepted, although the parameters of the masked
8 | language model are not provided in TF2 SavedModels in most cases, in which case
9 | the differences of these parameters are not tested after conversion.
10 |
11 | The testing step mentioned above is controlled by the flag `--test`, which enforces a maximum
12 | tolerance of 1e-3 between the Gluon model with converted weights and the original TensorFlow model.
13 | In addition, a GPU can be used in all conversion scripts by adding `--gpu 0`.
14 |
15 | For the RoBERTa, XLM-R and BART models, we rely on the master version of the [fairseq](https://github.com/pytorch/fairseq#requirements-and-installation) package, installed locally via `pip install git+https://github.com/pytorch/fairseq.git@master`.
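As a rough illustration of what the `--test` flag checks, a minimal sketch of the tolerance comparison (illustrative only; the actual comparison logic lives inside each `convert_*.py` script):

```python
import numpy as np

def assert_converted_close(gluon_out, tf_out, tol=1e-3):
    """Compare an output of the converted Gluon model against the original model."""
    np.testing.assert_allclose(gluon_out, tf_out, rtol=tol, atol=tol)
```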
16 |
17 | ## BERT
18 | Convert model from [BERT LIST](https://tfhub.dev/google/collections/bert/1).
19 |
20 | You can use the script provided in [convert_bert.sh](convert_bert.sh).
21 | The following command gives you a rough idea of the process.
22 |
23 | ```bash
24 | bash convert_bert.sh
25 | ```
26 |
27 | In the process, we download the configuration file `bert_config.json` from the [official repo](https://github.com/google-research/bert#pre-trained-models)
28 | and move it into `${case}_bert_${model}/assets/`.
29 |
30 | ## ALBERT
31 | You can use the command in [convert_albert.sh](convert_albert.sh):
32 | ```bash
33 | bash convert_albert.sh
34 | ```
35 |
36 | ## ELECTRA
37 | TF Hub models are currently not available for ELECTRA.
38 | Thus, you will need to clone the [electra repository](https://github.com/ZheyuYe/electra)
39 | and download the checkpoints; the parameters are converted from the local checkpoints.
40 | By running the following command, you can convert and verify the ELECTRA model with both the discriminator and the generator.
41 |
42 | Note: set `--electra_path` to the cloned path if you'd like to use `convert_electra.py` directly.
43 |
44 | ```bash
45 | bash convert_electra.sh
46 | ```
47 |
48 | ## MobileBert
49 | ```bash
50 | bash convert_mobilebert.sh
51 | ```
52 |
53 | ## RoBERTa
54 | ```bash
55 | bash convert_roberta.sh
56 | ```
57 |
58 | ## XLM-R
59 | ```bash
60 | bash convert_xlmr.sh
61 | ```
62 |
63 | ## BART
64 | ```bash
65 | bash convert_bart.sh
66 | ```
67 |
68 | ## GPT-2
69 | ```bash
70 | bash convert_gpt2.sh
71 | ```
72 |
73 | ## T5
74 | ```bash
75 | bash convert_t5.sh
76 | ```
77 |
--------------------------------------------------------------------------------
/scripts/conversion_toolkits/bert_base_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "attention_probs_dropout_prob": 0.1,
3 | "hidden_act": "gelu",
4 | "hidden_dropout_prob": 0.1,
5 | "hidden_size": 768,
6 | "initializer_range": 0.02,
7 | "intermediate_size": 3072,
8 | "max_position_embeddings": 512,
9 | "num_attention_heads": 12,
10 | "num_hidden_layers": 12,
11 | "type_vocab_size": 2,
12 | "vocab_size": 30522
13 | }
14 |
--------------------------------------------------------------------------------
/scripts/conversion_toolkits/bert_large_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "attention_probs_dropout_prob": 0.1,
3 | "hidden_act": "gelu",
4 | "hidden_dropout_prob": 0.1,
5 | "hidden_size": 1024,
6 | "initializer_range": 0.02,
7 | "intermediate_size": 4096,
8 | "max_position_embeddings": 512,
9 | "num_attention_heads": 16,
10 | "num_hidden_layers": 24,
11 | "type_vocab_size": 2,
12 | "vocab_size": 30522
13 | }
14 |
--------------------------------------------------------------------------------
/scripts/conversion_toolkits/convert_albert.sh:
--------------------------------------------------------------------------------
1 | set -ex
2 |
3 | python3 -m pip install tensorflow==1.15 --upgrade --user
4 | python3 -m pip install tensorflow_hub --upgrade --user
5 | export TF_FORCE_GPU_ALLOW_GROWTH="true"
6 | for model in base large xlarge xxlarge
7 | do
8 | hub_directory="google_albert_${model}_v2"
9 | mkdir -p ${hub_directory}
10 | wget "https://tfhub.dev/google/albert_${model}/3?tf-hub-format=compressed" -O "${hub_directory}.tar.gz"
11 | tar -xvf ${hub_directory}.tar.gz --directory ${hub_directory}
12 | python3 convert_tf_hub_model.py --tf_hub_model_path ${hub_directory} --model_type albert --test
13 | done
14 |
--------------------------------------------------------------------------------
/scripts/conversion_toolkits/convert_bart.sh:
--------------------------------------------------------------------------------
1 | python3 -m pip install git+https://github.com/pytorch/fairseq.git@master --upgrade --user
2 | for model in base large
3 | do
4 | mkdir bart_${model}
5 | wget "https://dl.fbaipublicfiles.com/fairseq/models/bart.${model}.tar.gz"
6 | tar zxf bart.${model}.tar.gz --directory bart_${model}
7 | python3 convert_fairseq_bart.py --fairseq_model_path bart_${model}/bart.${model} --test
8 | done
9 |
--------------------------------------------------------------------------------
/scripts/conversion_toolkits/convert_bert.sh:
--------------------------------------------------------------------------------
1 | set -ex
2 |
3 | python3 -m pip install 'tensorflow<3' --upgrade --user
4 | python3 -m pip install tensorflow_hub --upgrade --user
5 | export TF_FORCE_GPU_ALLOW_GROWTH="true"
6 |
7 | # Conversion for English Models
8 | for model in base large
9 | do
10 | for case in cased uncased
11 | do
12 | hub_directory="google_en_${case}_bert_${model}"
13 | mkdir -p ${hub_directory}
14 | if [ ${model} == base ];then
15 | url="https://tfhub.dev/google/bert_${case}_L-12_H-768_A-12/1?tf-hub-format=compressed"
16 | else
17 | url="https://tfhub.dev/google/bert_${case}_L-24_H-1024_A-16/1?tf-hub-format=compressed"
18 | fi
19 | wget ${url} -O "${hub_directory}.tar.gz"
20 | tar -xvf ${hub_directory}.tar.gz --directory ${hub_directory}
21 | cp bert_${model}_config.json ${hub_directory}/assets/
22 | python3 convert_tf_hub_model.py --tf_hub_model_path ${hub_directory} --model_type bert --test
23 | done
24 | done
25 |
26 | # Conversion for Chinese Models
27 | url="https://tfhub.dev/tensorflow/bert_zh_L-12_H-768_A-12/2?tf-hub-format=compressed"
28 | hub_directory="google_zh_bert_base"
29 | mkdir -p ${hub_directory}
30 | wget ${url} -O "${hub_directory}.tar.gz"
31 | tar -xvf ${hub_directory}.tar.gz --directory ${hub_directory}
32 | cp bert_base_config.json ${hub_directory}/assets/
33 | python3 convert_tf_hub_model.py --tf_hub_model_path ${hub_directory} --model_type bert --test
34 |
35 | # Conversion for Multi-lingual Models
36 | url="https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/2?tf-hub-format=compressed"
37 | hub_directory="google_multi_cased_bert_base"
38 | mkdir -p ${hub_directory}
39 | wget ${url} -O "${hub_directory}.tar.gz"
40 | tar -xvf ${hub_directory}.tar.gz --directory ${hub_directory}
41 | cp bert_base_config.json ${hub_directory}/assets/
42 | python3 convert_tf_hub_model.py --tf_hub_model_path ${hub_directory} --model_type bert --test
43 |
44 | # Conversion for Whole-word-masking Models
45 | for case in cased uncased
46 | do
47 | hub_directory="google_en_${case}_bert_wwm_large"
48 | mkdir -p ${hub_directory}
49 | url="https://tfhub.dev/tensorflow/bert_en_wwm_${case}_L-24_H-1024_A-16/2?tf-hub-format=compressed"
50 | wget ${url} -O "${hub_directory}.tar.gz"
51 | tar -xvf ${hub_directory}.tar.gz --directory ${hub_directory}
52 | cp bert_large_config.json ${hub_directory}/assets/
53 | python3 convert_tf_hub_model.py --tf_hub_model_path ${hub_directory} --model_type bert --test
54 | done
55 |
--------------------------------------------------------------------------------
/scripts/conversion_toolkits/convert_bert_torch.sh:
--------------------------------------------------------------------------------
1 | set -ex
2 |
3 | python3 -m pip install 'tensorflow<3' --upgrade --user
4 | python3 -m pip install tensorflow_hub --upgrade --user
5 | export TF_FORCE_GPU_ALLOW_GROWTH="true"
6 |
7 | # Conversion for English Models
8 | for model in base large
9 | do
10 | for case in cased uncased
11 | do
12 | hub_directory="google_en_${case}_bert_${model}"
13 | mkdir -p ${hub_directory}
14 | if [ ${model} == base ];then
15 | url="https://tfhub.dev/google/bert_${case}_L-12_H-768_A-12/1?tf-hub-format=compressed"
16 | else
17 | url="https://tfhub.dev/google/bert_${case}_L-24_H-1024_A-16/1?tf-hub-format=compressed"
18 | fi
19 | wget ${url} -O "${hub_directory}.tar.gz"
20 | tar -xvf ${hub_directory}.tar.gz --directory ${hub_directory}
21 | cp bert_${model}_config.json ${hub_directory}/assets/
22 | python3 convert_tf_hub_model.py --tf_hub_model_path ${hub_directory} --model_type bert --test --torch
23 | done
24 | done
25 |
--------------------------------------------------------------------------------
/scripts/conversion_toolkits/convert_electra.sh:
--------------------------------------------------------------------------------
1 | python3 -m pip install tensorflow==1.15 --upgrade --user
2 | export TF_FORCE_GPU_ALLOW_GROWTH="true"
3 | git clone https://github.com/ZheyuYe/electra.git
4 | cd electra
5 | git checkout 923179410471f9e1820b3f0771c239e1752e4e18
6 | cd ..
7 | for model in small base large
8 | do
9 | wget https://storage.googleapis.com/electra-data/electra_${model}.zip
10 | unzip electra_${model}.zip
11 | python3 convert_electra.py --tf_model_path electra_${model} --electra_path electra --model_size ${model} --test
12 | done
13 |
--------------------------------------------------------------------------------
/scripts/conversion_toolkits/convert_gpt2.sh:
--------------------------------------------------------------------------------
1 | python3 -m pip install tensorflow==1.15 --upgrade --user
2 | git clone https://github.com/openai/gpt-2.git gpt_2
3 | for model in 124M 355M 774M 1558M
4 | do
5 | python3 gpt_2/download_model.py ${model}
6 | mkdir gpt2_${model}
7 | CUDA_VISIBLE_DEVICES="" python3 convert_gpt2.py --tf_model_path models/${model} --save_dir gpt2_${model} --test
8 | done
9 |
--------------------------------------------------------------------------------
/scripts/conversion_toolkits/convert_mobilebert.sh:
--------------------------------------------------------------------------------
1 | python3 -m pip install tensorflow==1.15 --upgrade --user
2 | export TF_FORCE_GPU_ALLOW_GROWTH="true"
3 | svn checkout https://github.com/google-research/google-research/trunk/mobilebert
4 |
5 | mkdir mobilebert_model
6 | url='https://storage.googleapis.com/cloud-tpu-checkpoints/mobilebert/uncased_L-24_H-128_B-512_A-4_F-4_OPT.tar.gz'
7 | wget ${url} -O "mobilebert.tar.gz"
8 | tar -xvf mobilebert.tar.gz --directory mobilebert_model
9 | python3 convert_mobilebert.py --tf_model_path mobilebert_model/mobilebert --mobilebert_dir mobilebert --test
10 |
--------------------------------------------------------------------------------
/scripts/conversion_toolkits/convert_mt5.sh:
--------------------------------------------------------------------------------
1 | python3 -m pip install git+https://github.com/huggingface/transformers.git --upgrade
2 | for model in small base large xl xxl
3 | do
4 | dest_dir="google_mt5_${model}"
5 | mkdir ${dest_dir}
6 | python3 convert_mt5.py "google/mt5-${model}" ${dest_dir} --test
7 | done
8 |
--------------------------------------------------------------------------------
/scripts/conversion_toolkits/convert_roberta.sh:
--------------------------------------------------------------------------------
1 | python3 -m pip install git+https://github.com/pytorch/fairseq.git@master --upgrade --user
2 | for model in base large
3 | do
4 | mkdir roberta_${model}
5 | wget "https://dl.fbaipublicfiles.com/fairseq/models/roberta.${model}.tar.gz"
6 | tar zxf roberta.${model}.tar.gz --directory roberta_${model}
7 | python3 convert_fairseq_roberta.py --fairseq_model_path roberta_${model}/roberta.${model} --test
8 | done
9 |
--------------------------------------------------------------------------------
/scripts/conversion_toolkits/convert_t5.sh:
--------------------------------------------------------------------------------
1 | python3 -m pip install git+https://github.com/huggingface/transformers.git --upgrade
2 | for model in small base large 3B 11B
3 | do
4 | dest_dir="google_t5_${model}"
5 | mkdir ${dest_dir}
6 | python3 convert_t5.py "t5-${model,,}" ${dest_dir} --test
7 | done
8 |
--------------------------------------------------------------------------------
/scripts/conversion_toolkits/convert_xlmr.sh:
--------------------------------------------------------------------------------
1 | python3 -m pip install fairseq==0.10.1 --upgrade --user
2 | for model in base large
3 | do
4 | mkdir xlmr_${model}
5 | wget "https://dl.fbaipublicfiles.com/fairseq/models/xlmr.${model}.tar.gz"
6 | tar zxf xlmr.${model}.tar.gz --directory xlmr_${model}
7 | python3 convert_fairseq_xlmr.py --fairseq_model_path xlmr_${model}/xlmr.${model} --model_size ${model} --test
8 | done
9 |
--------------------------------------------------------------------------------
/scripts/datasets/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/scripts/datasets/__init__.py
--------------------------------------------------------------------------------
/scripts/datasets/__main__.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import importlib
3 | import os
4 |
5 | SUBCOMMAND_DICT = dict()
6 |
7 | # Find all modules starting with `prepare_`
8 | CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
9 | for root, dirs, files in os.walk(CURR_DIR, topdown=False):
10 | for name in files:
11 | if name.startswith('prepare_') and name.endswith('.py'):
12 | command = name[:-3]
13 | path = os.path.join(root, name)
14 | relpath = os.path.relpath(path, CURR_DIR)[:-3]
15 | if relpath.startswith(os.sep):
16 |             relpath = relpath[len(os.sep):]
17 | subpackage = relpath.replace(os.sep, '.')
18 | SUBCOMMAND_DICT[command] = 'gluonnlp.cli.data.' + subpackage
19 |
20 |
21 | def cli_main():
22 | parser = argparse.ArgumentParser(
23 | description='Build-in scripts for downloading and preparing the data in GluonNLP.',
24 | prog='nlp_data', add_help=False)
25 | parser.add_argument('command', type=str,
26 | choices=sorted(SUBCOMMAND_DICT.keys()) + ['help'],
27 | metavar='[subcommand]',
28 | help='The subcommand to use. '
29 | 'Choices are {}.'.format(sorted(SUBCOMMAND_DICT.keys()) + ['help']))
30 | args, other_args = parser.parse_known_args()
31 | if args.command == 'help':
32 | parser.print_help()
33 | else:
34 | mod = importlib.import_module(SUBCOMMAND_DICT[args.command])
35 | parser = mod.get_parser()
36 | sub_args = parser.parse_args(other_args)
37 | mod.main(sub_args)
38 |
39 |
40 | if __name__ == '__main__':
41 | cli_main()
42 |
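Each discovered `prepare_*.py` module becomes an `nlp_data` subcommand. For example, using the language-modeling script documented in the README below:

```bash
# List every registered subcommand
nlp_data help

# Dispatch to scripts/datasets/language_modeling/prepare_lm.py
nlp_data prepare_lm --dataset wikitext2
```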
--------------------------------------------------------------------------------
/scripts/datasets/general_nlp_benchmark/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/scripts/datasets/general_nlp_benchmark/__init__.py
--------------------------------------------------------------------------------
/scripts/datasets/language_modeling/README.md:
--------------------------------------------------------------------------------
1 | # Language Modeling Benchmark
2 |
3 | Prepare the language modeling benchmarking datasets.
4 | In order to help reproduce the papers, we use
5 | the tokenized corpus as the training/validation/testing dataset.
6 |
7 | ```bash
8 | # WikiText-2
9 | nlp_data prepare_lm --dataset wikitext2
10 |
11 | # WikiText-103
12 | nlp_data prepare_lm --dataset wikitext103
13 |
14 | # enwik8
15 | nlp_data prepare_lm --dataset enwik8
16 |
17 | # Text-8
18 | nlp_data prepare_lm --dataset text8
19 |
20 | # Google One-Billion-Word
21 | nlp_data prepare_lm --dataset gbw
22 | ```
23 |
24 | Happy language modeling :)
25 |
--------------------------------------------------------------------------------
/scripts/datasets/language_modeling/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/scripts/datasets/language_modeling/__init__.py
--------------------------------------------------------------------------------
/scripts/datasets/machine_translation/README.md:
--------------------------------------------------------------------------------
1 | # Machine Translation
2 |
3 | In machine translation, we train a model to map a sentence from the source language, e.g., English,
4 | to the target language, e.g., Chinese. Here, we provide scripts to download the common benchmark
5 | datasets for machine translation. The downloaded datasets are stored as a pair of corpus files,
6 | one for the source and the other for the target.
7 |
8 | ## WMT
9 | You can use [prepare_wmt.py](prepare_wmt.py) to download and prepare the raw training corpus and
10 | then use [clean_parallel_corpus.py](../../preprocess/clean_parallel_corpus.py) to clean and
11 | filter the corpus.
12 |
13 | You may download the raw WMT2014 en-de data as follows:
14 | ```bash
15 | nlp_data prepare_wmt \
16 | --dataset wmt2014 \
17 | --lang-pair en-de \
18 | --save-path wmt2014_en_de
19 | ```
20 |
21 | By combining `nlp_data` and `nlp_process`, we provide the example for preparing the
22 | WMT2014 en-de training dataset: [wmt2014_ende.sh](wmt2014_ende.sh). This involves three steps:
23 | - Download the raw text data
24 | - Clean and tokenize the data
25 | - Learn a subword model and apply it
26 |
27 | ```bash
28 | bash wmt2014_ende.sh yttm
29 | ```
30 |
31 | We support the following subword learning algorithms:
32 |
33 | ```bash
34 | # BPE from YouTokenToMe
35 | bash wmt2014_ende.sh yttm
36 |
37 | # BPE from Huggingface
38 | bash wmt2014_ende.sh hf_bpe
39 |
40 | # BPE from subword-nmt
41 | bash wmt2014_ende.sh subword_nmt
42 |
43 | # Byte-level BPE
44 | bash wmt2014_ende.sh hf_bytebpe
45 |
46 | # Sentencepiece
47 | bash wmt2014_ende.sh spm
48 |
49 | # WordPiece
50 | bash wmt2014_ende.sh hf_wordpiece
51 | ```
52 |
53 |
54 | Apart from WMT2014 EN-DE, we also provide the script for preparing the training data for the
55 | WMT2017 ZH-EN task:
56 | [wmt2017_zhen.sh](wmt2017_zhen.sh).
57 |
58 | ### Monolingual Corpus
59 | In the WMT competition, there are additional monolingual corpora that can help you train NMT models.
60 | You may download the raw monolingual corpus by adding the `--mono` flag.
61 |
62 | One example is to download the newscrawl monolingual corpus in German:
63 |
64 | ```bash
65 | nlp_data prepare_wmt \
66 | --mono \
67 | --mono_lang de \
68 | --dataset newscrawl \
69 | --save-path wmt2014_mono
70 | ```
71 |
72 |
73 | ### Directory Structure of Translation Dataset
74 |
75 | The basic structure of a translation dataset is like the following:
76 | ```
77 | folder_name
78 | ├── train.raw.{src}
79 | ├── train.raw.{tgt}
80 | ├── train.tok.{src}
81 | ├── train.tok.{tgt}
82 | ├── train.tok.{subword_model}.{src}
83 | ├── train.tok.{subword_model}.{tgt}
84 | ├── ...
85 | ├── ... Repeat for valid and test
86 | ├── ...
87 | ├── {subword_model}.model
88 | ├── {subword_model}.path
89 | ```
90 |
--------------------------------------------------------------------------------
/scripts/datasets/machine_translation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/scripts/datasets/machine_translation/__init__.py
--------------------------------------------------------------------------------
/scripts/datasets/music_generation/README.md:
--------------------------------------------------------------------------------
1 | # Music Generation
2 |
3 | We provide datasets for training a music generation model.
4 |
5 | ## Maestro
6 |
7 | See https://magenta.tensorflow.org/datasets/maestro for detailed introduction.
8 |
9 | ```
10 | # Get V1 Dataset
11 | nlp_data prepare_music_midi --dataset maestro_v1
12 |
13 | # Get V2 Dataset
14 | nlp_data prepare_music_midi --dataset maestro_v2
15 | ```
16 |
17 | ## LakhMIDI
18 |
19 | See https://colinraffel.com/projects/lmd/ for more details
20 |
21 | ```
22 | # Get Lakh MIDI Full Dataset
23 | nlp_data prepare_music_midi --dataset lmd_full
24 |
25 | # Get the subset of 45,129 files from LMD-full
26 | # which have been matched to entries in the Million Song Dataset
27 | nlp_data prepare_music_midi --dataset lmd_matched
28 |
29 | # Get the aligned version of lmd_matched
30 | nlp_data prepare_music_midi --dataset lmd_aligned
31 |
32 | # Get the clean midi data
33 | nlp_data prepare_music_midi --dataset clean_midi
34 | ```
35 |
36 | ## Geocities
37 |
38 | The Geocities collection of MIDI files.
39 | See https://archive.org/details/archiveteam-geocities-midi-collection-2009 for more details.
40 | ```
41 | nlp_data prepare_music_midi --dataset geocities
42 | ```
43 |
--------------------------------------------------------------------------------
/scripts/datasets/music_generation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/scripts/datasets/music_generation/__init__.py
--------------------------------------------------------------------------------
/scripts/datasets/pretrain_corpus/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/scripts/datasets/pretrain_corpus/__init__.py
--------------------------------------------------------------------------------
/scripts/datasets/question_answering/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/scripts/datasets/question_answering/__init__.py
--------------------------------------------------------------------------------
/scripts/datasets/question_answering/prepare_hotpotqa.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | from gluonnlp.utils.misc import download, load_checksum_stats
4 | from gluonnlp.base import get_data_home_dir
5 |
6 | _CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
7 | _BASE_DATASET_PATH = os.path.join(get_data_home_dir(), 'hotpotqa')
8 | _URL_FILE_STATS_PATH = os.path.join(_CURR_DIR, '..', 'url_checksums', 'hotpotqa.txt')
9 | _URL_FILE_STATS = load_checksum_stats(_URL_FILE_STATS_PATH)
10 |
11 |
12 | _CITATIONS = """
13 | @inproceedings{yang2018hotpotqa,
14 | title={{HotpotQA}: A Dataset for Diverse, Explainable Multi-hop Question Answering},
15 | author={Yang, Zhilin and Qi, Peng and Zhang, Saizheng and Bengio, Yoshua and Cohen, William W. and Salakhutdinov, Ruslan and Manning, Christopher D.},
16 | booktitle={Conference on Empirical Methods in Natural Language Processing ({EMNLP})},
17 | year={2018}
18 | }
19 |
20 | """
21 |
22 | _URLS = {
23 | 'train': 'http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_train_v1.1.json',
24 | 'dev_fullwiki': 'http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_fullwiki_v1.json',
25 | 'dev_distractor': 'http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_distractor_v1.json',
26 | 'test_fullwiki': 'http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_test_fullwiki_v1.json',
27 | }
28 |
29 |
30 | def get_parser():
31 | parser = argparse.ArgumentParser(description='Downloading the HotpotQA Dataset.')
32 | parser.add_argument('--save-path', type=str, default='hotpotqa')
33 | parser.add_argument('--cache-path', type=str, default=_BASE_DATASET_PATH,
34 | help='The path to download the dataset.')
35 | parser.add_argument('--overwrite', action='store_true')
36 | return parser
37 |
38 |
39 | def main(args):
40 | if not os.path.exists(args.save_path):
41 | os.makedirs(args.save_path)
42 | for url in _URLS.values():
43 | file_name = url[url.rfind('/') + 1:]
44 | file_hash = _URL_FILE_STATS[url]
45 | download(url, path=os.path.join(args.cache_path, file_name), sha1_hash=file_hash)
46 |         dst_path = os.path.join(args.save_path, file_name)
47 |         if args.overwrite and args.save_path != args.cache_path and os.path.lexists(dst_path):
48 |             os.remove(dst_path)  # drop any stale file/link so the symlink below cannot fail
49 |         if not os.path.lexists(dst_path):
50 |             os.symlink(os.path.join(args.cache_path, file_name), dst_path)
51 |
52 |
53 | def cli_main():
54 |     parser = get_parser()
55 |     args = parser.parse_args()
56 |     main(args)
57 |
58 |
59 | if __name__ == '__main__':
60 |     cli_main()
61 |
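A usage sketch via the `nlp_data` entry point (the flags are the ones defined in `get_parser()` above):

```bash
# Download HotpotQA into ./hotpotqa; files are cached under the data home and symlinked
nlp_data prepare_hotpotqa --save-path hotpotqa
```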
--------------------------------------------------------------------------------
/scripts/datasets/question_answering/prepare_searchqa.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | from gluonnlp.utils.misc import download, load_checksum_stats
4 | from gluonnlp.base import get_data_home_dir, get_repo_url
5 |
6 | _CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
7 | _BASE_DATASET_PATH = os.path.join(get_data_home_dir(), 'searchqa')
8 | _URL_FILE_STATS_PATH = os.path.join(_CURR_DIR, '..', 'url_checksums', 'searchqa.txt')
9 | _URL_FILE_STATS = load_checksum_stats(_URL_FILE_STATS_PATH)
10 |
11 |
12 | _CITATIONS = """
13 | @article{dunn2017searchqa,
14 | title={Searchqa: A new q\&a dataset augmented with context from a search engine},
15 | author={Dunn, Matthew and Sagun, Levent and Higgins, Mike and Guney, V Ugur and Cirik, Volkan and Cho, Kyunghyun},
16 | journal={arXiv preprint arXiv:1704.05179},
17 | year={2017}
18 | }
19 |
20 | """
21 |
22 | _URLS = {
23 | 'train': get_repo_url() + 'datasets/question_answering/searchqa/train.txt',
24 | 'val': get_repo_url() + 'datasets/question_answering/searchqa/val.txt',
25 | 'test': get_repo_url() + 'datasets/question_answering/searchqa/test.txt'
26 | }
27 |
28 |
29 | def get_parser():
30 | parser = argparse.ArgumentParser(description='Downloading the SearchQA Dataset.')
31 | parser.add_argument('--save-path', type=str, default='searchqa')
32 | parser.add_argument('--cache-path', type=str, default=_BASE_DATASET_PATH,
33 | help='The path to download the dataset.')
34 | parser.add_argument('--overwrite', action='store_true')
35 | return parser
36 |
37 |
38 | def main(args):
39 | if not os.path.exists(args.save_path):
40 | os.makedirs(args.save_path)
41 | for url in _URLS.values():
42 | file_name = url[url.rfind('/') + 1:]
43 | file_hash = _URL_FILE_STATS[url]
44 | download(url, path=os.path.join(args.cache_path, file_name), sha1_hash=file_hash)
45 |         dst_path = os.path.join(args.save_path, file_name)
46 |         if args.overwrite and args.save_path != args.cache_path and os.path.lexists(dst_path):
47 |             os.remove(dst_path)  # drop any stale file/link so the symlink below cannot fail
48 |         if not os.path.lexists(dst_path):
49 |             os.symlink(os.path.join(args.cache_path, file_name), dst_path)
50 |
51 |
52 | def cli_main():
53 |     parser = get_parser()
54 |     args = parser.parse_args()
55 |     main(args)
56 |
57 |
58 | if __name__ == '__main__':
59 |     cli_main()
60 |
--------------------------------------------------------------------------------
/scripts/datasets/question_answering/prepare_triviaqa.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tarfile
3 | import argparse
4 | from gluonnlp.utils.misc import download, load_checksum_stats
5 | from gluonnlp.base import get_data_home_dir
6 |
7 | _CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
8 | _BASE_DATASET_PATH = os.path.join(get_data_home_dir(), 'triviaqa')
9 | _URL_FILE_STATS_PATH = os.path.join(_CURR_DIR, '..', 'url_checksums', 'triviaqa.txt')
10 | _URL_FILE_STATS = load_checksum_stats(_URL_FILE_STATS_PATH)
11 |
12 |
13 | _CITATIONS = """
14 | @InProceedings{JoshiTriviaQA2017,
15 | author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. and Zettlemoyer, Luke},
16 | title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},
17 | booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics},
18 | month = {July},
19 | year = {2017},
20 | address = {Vancouver, Canada},
21 | publisher = {Association for Computational Linguistics},
22 | }
23 |
24 | """
25 |
26 | _URLS = {
27 | 'rc': 'https://nlp.cs.washington.edu/triviaqa/data/triviaqa-rc.tar.gz',
28 | 'unfiltered': 'https://nlp.cs.washington.edu/triviaqa/data/triviaqa-unfiltered.tar.gz'
29 | }
30 |
31 |
32 | def get_parser():
33 | parser = argparse.ArgumentParser(description='Downloading the TriviaQA Dataset.')
34 | parser.add_argument('--type', type=str, choices=['rc', 'unfiltered'], default='rc',
35 | help='type of the triviaqa dataset.')
36 | parser.add_argument('--save-path', type=str, default='triviaqa')
37 | parser.add_argument('--cache-path', type=str, default=_BASE_DATASET_PATH,
38 | help='The path to download the dataset.')
39 | parser.add_argument('--overwrite', action='store_true')
40 | return parser
41 |
42 |
43 | def main(args):
44 |
45 | def extract(tar_path, target_path):
46 | try:
47 | tar = tarfile.open(tar_path, "r:gz")
48 | file_names = tar.getnames()
49 | for file_name in file_names:
50 | tar.extract(file_name, target_path)
51 | tar.close()
52 | except Exception as e:
53 | print(e)
54 |
55 | tar_url = _URLS[args.type]
56 | file_name = tar_url[tar_url.rfind('/') + 1:]
57 | file_hash = _URL_FILE_STATS[tar_url]
58 | download(tar_url, path=os.path.join(args.cache_path, file_name), sha1_hash=file_hash)
59 | if not os.path.exists(args.save_path):
60 | os.makedirs(args.save_path)
61 |     dst_path = os.path.join(args.save_path, file_name)
62 |     if args.overwrite and args.save_path != args.cache_path and os.path.lexists(dst_path):
63 |         os.remove(dst_path)  # drop any stale file/link so the symlink below cannot fail
64 |     if not os.path.lexists(dst_path):
65 |         os.symlink(os.path.join(args.cache_path, file_name), dst_path)
66 |     extract(dst_path, args.save_path)
67 |
68 |
69 | def cli_main():
70 |     parser = get_parser()
71 |     args = parser.parse_args()
72 |     main(args)
73 |
74 |
75 | if __name__ == '__main__':
76 |     cli_main()
77 |
--------------------------------------------------------------------------------
/scripts/datasets/url_checksums/bookcorpus.txt:
--------------------------------------------------------------------------------
1 | https://the-eye.eu/public/AI/pile_preliminary_components/books1.tar.gz 87ca37e83fd7ea58573a1630ebf9d1da9ee34a41 2404269430
--------------------------------------------------------------------------------
/scripts/datasets/url_checksums/glue.txt:
--------------------------------------------------------------------------------
1 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/glue/cola.zip 19096246cd2a06d8fe2d13880d6cec61149f77c7 376971
2 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/glue/sst.zip 44f5954391612a8b3d9d65f6d4a824e9ae8d19ce 7439277
3 | https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt 716e0f67af962f08220b7e97d229b293077ef41f 1047044
4 | https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc 506c7a1a5e0dd551ceec2f84070fa1a8c2bc4b41 6222
5 | https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt 4265196c15cf75620b0b592b8b921f543bda7e6c 441275
6 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/glue/qqp.zip d775bd543ee78e3f64892a43ada949daf93e003d 41696084
7 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/glue/sts.zip cc66d8533052de6d7475ac56dfce300751e070a4 802872
8 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/glue/mnli.zip c22c684daa5cc9fad949d09d10ecedf94a2ce053 312783507
9 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/glue/snli.zip c60db4cc8820749e6af9f713f4d55109dd46e8c1 129820157
10 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/glue/qnli.zip 6700cb1d2536bf512314b01350f9ac382439218e 10627589
11 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/glue/rte.zip 2eb8630df898b7d8df14ca9130c1ac1cf79eb376 697150
12 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/glue/wnli.zip fc9834b5a8af4e1d8412e48bc38b477510a8c2d0 28999
13 | https://storage.googleapis.com/mtl-sentence-representations.appspot.com/tsvsWithoutLabels%2FAX.tsv?GoogleAccessId=firebase-adminsdk-0khhl@mtl-sentence-representations.iam.gserviceaccount.com&Expires=2498860800&Signature=DuQ2CSPt2Yfre0C%2BiISrVYrIFaZH1Lc7hBVZDD4ZyR7fZYOMNOUGpi8QxBmTNOrNPjR3z1cggo7WXFfrgECP6FBJSsURv8Ybrue8Ypt%2FTPxbuJ0Xc2FhDi%2BarnecCBFO77RSbfuz%2Bs95hRrYhTnByqu3U%2FYZPaj3tZt5QdfpH2IUROY8LiBXoXS46LE%2FgOQc%2FKN%2BA9SoscRDYsnxHfG0IjXGwHN%2Bf88q6hOmAxeNPx6moDulUF6XMUAaXCSFU%2BnRO2RDL9CapWxj%2BDl7syNyHhB7987hZ80B%2FwFkQ3MEs8auvt5XW1%2Bd4aCU7ytgM69r8JDCwibfhZxpaa4gd50QXQ%3D%3D c137a2020ab489011dc38fde9ee429f4e2c71257 222257
14 | https://www.dropbox.com/s/ju7d95ifb072q9f/diagnostic-full.tsv?dl=1 2f46c4b80fea8d3ea52a28e05467af3332fa65d9 265530
15 |
--------------------------------------------------------------------------------
/scripts/datasets/url_checksums/gutenberg.txt:
--------------------------------------------------------------------------------
1 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/pretrain_corpus/Gutenberg.zip 91e842dc3671ed5a917b7ff6a60f5f87397780e2 461506225
2 |
--------------------------------------------------------------------------------
/scripts/datasets/url_checksums/hotpotqa.txt:
--------------------------------------------------------------------------------
1 | http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_train_v1.1.json 08c42431c22984f362e94de0e635c7b858c3cff0 566426227
2 | http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_distractor_v1.json 825b6cfc34a61db41e82bbb14d978d5a834925f8 46320117
3 | http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_fullwiki_v1.json 96a41025612e8cb15989251102dc05efe9647eda 47454698
4 | http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_test_fullwiki_v1.json b30e4ff0d8b7bd808240e5609410f9c36279ef36 46213747
5 |
--------------------------------------------------------------------------------
/scripts/datasets/url_checksums/language_model.txt:
--------------------------------------------------------------------------------
1 | https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip 3c914d17d80b1459be871a5039ac23e752a53cbe 4475746
2 | https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip 0aec09a7537b58d4bb65362fee27650eeaba625a 190229076
3 | http://mattmahoney.net/dc/enwik8.zip d856b1ccd937c51aeb9c342e47666fb8c38e7e72 36445475
4 | http://mattmahoney.net/dc/text8.zip 6c70299b93b7e1f927b42cd8f6ac1a31547c7a2e 31344016
5 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/language_modeling/1-billion-word-language-modeling-benchmark-r13output.tar.gz 4df859766482e12264a5a9d9fb7f0e276020447d 1792209805
6 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/language_modeling/1b_word_vocab.txt aa2322a3da82ef628011336c9b5c6059e4f56c3f 9507106
7 |
--------------------------------------------------------------------------------
/scripts/datasets/url_checksums/music_midi.txt:
--------------------------------------------------------------------------------
1 | http://hog.ee.columbia.edu/craffel/lmd/lmd_full.tar.gz 330b3c67f24f9280f81e1f7ab12749087dd83f08 1768163879
2 | http://hog.ee.columbia.edu/craffel/lmd/lmd_matched.tar.gz 218b7c82ecb230a6679053e48e87714f0bd4836f 1407072670
3 | http://hog.ee.columbia.edu/craffel/lmd/lmd_aligned.tar.gz 9873e84dd5a531ba3623e0a24ce33a81681cba80 272169548
4 | http://hog.ee.columbia.edu/craffel/lmd/clean_midi.tar.gz ae47e29dfc18d7779d95697a6461d759504c7a1c 234283029
5 | https://storage.googleapis.com/magentadata/datasets/maestro/v1.0.0/maestro-v1.0.0-midi.zip e189d8a0b6769f3be576a036da840adafe489327 46579421
6 | https://storage.googleapis.com/magentadata/datasets/maestro/v2.0.0/maestro-v2.0.0-midi.zip 13808bf9503c72371d38e9705e93ce8623b21c01 59243107
7 | https://archive.org/download/archiveteam-geocities-midi-collection-2009/2009.GeoCities.MIDI.ArchiveTeam.zip 493880759c648dd96167a2f4d394421e6fa33874 437506993
8 |
--------------------------------------------------------------------------------
/scripts/datasets/url_checksums/naturalquestions.txt:
--------------------------------------------------------------------------------
1 | s3://gluonnlp-numpy-data/NaturalQuestions/v1.0-simplified_simplified-nq-train.jsonl.gz 9ae896ea4b29370fe157aea61a088ffdc0fbda8f 4715820286
2 | s3://gluonnlp-numpy-data/NaturalQuestions/nq-dev-all.jsonl.gz b4cc081a2d065f84d630a1338dead7faad77eeff 1068038975
3 |
--------------------------------------------------------------------------------
/scripts/datasets/url_checksums/searchqa.txt:
--------------------------------------------------------------------------------
1 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/question_answering/searchqa/train.txt c7e1eb8c34d0525547b91e18b3f8f4d855e35c16 1226681217
2 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/question_answering/searchqa/test.txt 08a928e0f8c129d5b3ca43bf46df117e38be0c27 332064988
3 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/question_answering/searchqa/val.txt c2f65d6b83c26188d5998ab96bc6a38c1a127fcc 170835902
4 |
--------------------------------------------------------------------------------
/scripts/datasets/url_checksums/squad.txt:
--------------------------------------------------------------------------------
1 | https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json 1faea1252438a64f9718412a55036b786cfcc636 30288272
2 | https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json e1621aae0683b346ee9743bd5609266ba0cc34fc 4854279
3 | https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json ceb2acdea93b9d82ab1829c7b1e03bee9e302c99 42123633
4 | https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json 53ebaeb15bc5cab36645150f6f65d074348e2f3d 4370528
5 |
--------------------------------------------------------------------------------
/scripts/datasets/url_checksums/superglue.txt:
--------------------------------------------------------------------------------
1 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/superglue/cb.zip c16fa0a46f0f888d59767851c44d8db397896fe5 75482
2 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/superglue/copa.zip ef110b215d7ff95a2fd2d0133f0959d324e9eec3 43986
3 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/superglue/multirc.zip 05bfcb1da7ea06742266f24503342fc20b2ab88a 1116225
4 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/superglue/rte.zip 66105efeccc3fc54f9c5539de4c2d393d5bb4d36 750920
5 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/superglue/wic.zip 5b95487a3690abc718bc173ccd35bf084c43b10a 396213
6 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/superglue/wsc.zip 829ec3dd532284281cc19bacf9cded6c11d3452d 32751
7 | https://dl.fbaipublicfiles.com/glue/superglue/data/v2/AX-b.zip 8c8874dcace4942dd00cf9f76c2537ea0e2026eb 33950
8 | https://dl.fbaipublicfiles.com/glue/superglue/data/v2/AX-g.zip 949909079262bc4f6fb66bd889707aa71218975f 10413
9 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/superglue/boolq.zip 90bf152c8012869d326260709404ce5111a76b46 4118001
10 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/superglue/record.zip af2825be511efa8fbc7813756e768efffb8fcc11 51757880
11 |
--------------------------------------------------------------------------------
/scripts/datasets/url_checksums/text_classification.txt:
--------------------------------------------------------------------------------
1 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/ag_news_csv.tar.gz 00b73919ec0527118ca35d819029985c33ca4005 11784327
2 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/imdb.tar.gz af11c368141a0cec4d49563000a2a54f9afdc38d 35673480
3 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/dbpedia_csv.tar.gz f39ead1841501739a34a5bbb22d405677e3165f7 68341698
4 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/yelp_review_polarity_csv.tar.gz dd08ed616d28c633b1ff7a5e12d900426e5db779 166373322
5 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/yelp_review_full_csv.tar.gz d0a1011a88be15254054e94144c83e92a048e318 196146693
6 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/amazon_review_polarity_csv.tar.gz 9689538a9ee0630340da8aa456a0888cc6733919 688340758
7 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/amazon_review_full_csv.tar.gz e85b2d264aa8d8d3cc4dbe08adba88c0db92ff5b 643695117
8 |
--------------------------------------------------------------------------------
/scripts/datasets/url_checksums/triviaqa.txt:
--------------------------------------------------------------------------------
1 | https://nlp.cs.washington.edu/triviaqa/data/triviaqa-rc.tar.gz aa7d8c01d4a5e563caaeb648e8c1f506e353ebd6 2665779500
2 | https://nlp.cs.washington.edu/triviaqa/data/triviaqa-unfiltered.tar.gz 670ba904b286865e25bb67ebd31c25e7c74c18ae 632549060
3 |
--------------------------------------------------------------------------------
/scripts/datasets/url_checksums/wikipedia.txt:
--------------------------------------------------------------------------------
1 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/pretrain_corpus/wikipedia-en-20200620.tar.gz 1e1d77c31622744aaa45ff5bfbfca397154d9186 5068070627
2 |
--------------------------------------------------------------------------------
/scripts/index.rst:
--------------------------------------------------------------------------------
1 | Examples
2 | ========
3 |
4 | .. container:: cards
5 |
6 | .. card::
7 | :title: Benchmarking the Performance of NLP Backbones
8 | :link: benchmarks/index.html
9 |
10 | NLP Benchmark.
11 |
12 | .. card::
13 | :title: Classification Scripts
14 | :link: classification/index.html
15 |
16 | NLP Classification example.
17 |
18 | .. card::
19 | :title: Conversion Scripts
20 | :link: conversion_toolkits/index.html
21 |
22 | Converting NLP models from other frameworks to GluonNLP.
23 |
24 | .. card::
25 | :title: Datasets
26 | :link: datasets/index.html
27 |
28 | Datasets in GluonNLP.
29 |
30 | .. card::
31 | :title: Generation
32 | :link: generation/index.html
33 |
34 | Sequence generation with GPT-2.
35 |
36 | .. card::
37 | :title: Machine Translation
38 | :link: machine_translation/index.html
39 |
40 | Machine Translation examples.
41 |
42 | .. card::
43 | :title: Data Preprocessing Toolkit in GluonNLP
44 | :link: processing/index.html
45 |
46 | Data preprocessing examples.
47 |
48 | .. card::
49 | :title: Pretraining Model
50 | :link: pretraining/index.html
51 |
52 | Pretraining examples.
53 |
54 | .. card::
55 | :title: Question Answering Examples
56 | :link: question_answering/index.html
57 |
58 | Question Answering Example.
59 |
60 | .. toctree::
61 | :hidden:
62 | :maxdepth: 1
63 |
64 |
65 | benchmarks/index
66 | conversion_toolkits/index
67 | datasets/index
68 | classification/index
69 | generation/index
70 | machine_translation/index
71 | pretraining/index
72 | processing/index
73 | question_answering/index
74 |
75 |
--------------------------------------------------------------------------------
/scripts/machine_translation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/scripts/machine_translation/__init__.py
--------------------------------------------------------------------------------
/scripts/machine_translation/evaluate_epochs_wmt2014_ende.sh:
--------------------------------------------------------------------------------
1 | SAVE_DIR=$1
2 | SUBWORD_ALGO=${2:-yttm}
3 | EPOCH_BEGIN=${3:-30}
4 | EPOCH_END=${4:-60}
5 | STOCHASTIC=${5:-0}
6 | LP_ALPHA=${6:-0.6}
7 | LP_K=${7:-5}
8 | BEAM_SIZE=${8:-4}
9 |
10 |
11 | for epoch in $( seq ${EPOCH_BEGIN} ${EPOCH_END})
12 | do
13 | for fold in dev test
14 | do
15 | python3 evaluate_transformer.py \
16 | --param_path ${SAVE_DIR}/epoch${epoch}.params \
17 | --src_lang en \
18 | --tgt_lang de \
19 | --cfg ${SAVE_DIR}/config.yml \
20 | --src_tokenizer ${SUBWORD_ALGO} \
21 | --tgt_tokenizer ${SUBWORD_ALGO} \
22 | --src_subword_model_path wmt2014_ende/${SUBWORD_ALGO}.model \
23 | --tgt_subword_model_path wmt2014_ende/${SUBWORD_ALGO}.model \
24 | --src_vocab_path wmt2014_ende/${SUBWORD_ALGO}.vocab \
25 | --tgt_vocab_path wmt2014_ende/${SUBWORD_ALGO}.vocab \
26 | --src_corpus wmt2014_ende/${fold}.raw.en \
27 | --tgt_corpus wmt2014_ende/${fold}.raw.de \
28 | --lp_alpha ${LP_ALPHA} \
29 | --lp_k ${LP_K} \
30 | --beam-size ${BEAM_SIZE} \
31 | --save_dir ${SAVE_DIR}/epoch${epoch}_evaluation_${fold}_alpha${LP_ALPHA}_K${LP_K}_beam${BEAM_SIZE} \
32 | --fp16
33 | done
34 | done
35 |
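A hedged usage sketch (positional arguments as defined at the top of the script; the save directory name is hypothetical):

```bash
# Evaluate epochs 30..60 of a run stored in transformer_wmt2014_ende with the yttm
# subword model, non-stochastic decoding, lp_alpha=0.6, lp_k=5, and beam size 4
bash evaluate_epochs_wmt2014_ende.sh transformer_wmt2014_ende yttm 30 60 0 0.6 5 4
```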
--------------------------------------------------------------------------------
/scripts/machine_translation/transformer_enc12_dec1.yml:
--------------------------------------------------------------------------------
1 | MODEL:
2 | dropout: 0.2
3 | DECODER:
4 | pre_norm: false
5 | num_layers: 1
6 | ENCODER:
7 | pre_norm: false
8 | num_layers: 12
9 |
--------------------------------------------------------------------------------
/scripts/pretraining/bert/covert_bookcorpus_format.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 |
4 | class BookscorpusTextFormatting:
5 |     def __init__(self, books_path, output_filename, recursive=False, interval=500):
6 |         self.books_path = books_path
7 |         self.recursive = recursive
8 |         self.output_filename = output_filename.split('.')
9 |         self.interval = interval
10 |
11 |     # This puts one book per line and starts a new output shard
12 |     # every `interval` books.
13 |     def merge(self):
14 |         pattern = '/**/*.txt' if self.recursive else '/*.txt'
15 |         ofile = None
16 |         for count, filename in enumerate(
17 |                 glob.glob(self.books_path + pattern, recursive=self.recursive)):
18 |             if count % self.interval == 0:
19 |                 if ofile is not None:
20 |                     ofile.close()
21 |                 ofile_name = '.'.join([self.output_filename[0] + '-' + str(count // self.interval),
22 |                                        self.output_filename[1]])
23 |                 ofile = open(ofile_name, mode='w', encoding='utf-8-sig', newline='\n')
24 |             with open(filename, mode='r', encoding='utf-8-sig', newline='\n') as book_file:
25 |                 for line in book_file:
26 |                     if line.strip() != '':
27 |                         ofile.write(line.strip() + ' ')
28 |             ofile.write("\n\n")
29 |         if ofile is not None:
30 |             ofile.close()
31 |
32 |
33 | data_dir = 'BookCorpus/books1/epubtxt/'
34 | output_name_format = 'BookCorpus/after_prepare/bookcorpus.txt'
35 |
36 | format_tool = BookscorpusTextFormatting(data_dir, output_name_format)
37 | format_tool.merge()
38 |
--------------------------------------------------------------------------------
/scripts/pretraining/convert_electra_pretrain_backbone.py:
--------------------------------------------------------------------------------
1 | """Convert pre-trained model parameters from ElectraForPretrain to ElectraModel"""
2 |
3 | import os
4 | import argparse
5 | import mxnet as mx
6 |
7 | from pretraining_utils import get_electra_pretraining_model
8 |
9 |
10 | def parse_args():
11 | parser = argparse.ArgumentParser(description=__doc__)
12 | group = parser.add_mutually_exclusive_group(required=True)
13 | group.add_argument('--model-name', type=str, default='google_electra_small',
14 | help='Name of the pretrained model.')
15 | parser.add_argument('--params-file', type=str, required=True,
16 | help='Path to the pretrained parameter file.')
17 | parser.add_argument('--out-file', type=str, default=None,
18 | help='Output file path.')
19 | parser.add_argument('--generator_units_scale', type=float, default=None,
20 | help='The scale size of the generator units, same as used in pretraining.')
21 | parser.add_argument('--generator_layers_scale', type=float, default=None,
22 | help='The scale size of the generator layer, same as used in pretraining.')
23 |
24 | args = parser.parse_args()
25 | return args
26 |
27 |
28 | def convert_params(model_name, generator_units_scale, generator_layers_scale,
29 | params_path, out_path):
30 | _, _, pretrain_model = get_electra_pretraining_model(model_name, [mx.cpu()],
31 | generator_units_scale=generator_units_scale,
32 | generator_layers_scale=generator_layers_scale,
33 | params_path=params_path)
34 | backbone_model = pretrain_model.disc_backbone
35 | backbone_model.save_parameters(out_path)
36 |
37 |
38 | if __name__ == '__main__':
39 | args = parse_args()
40 | out_path = args.out_file
41 | if not out_path:
42 | params_file = args.params_file
43 | file_name_sep = os.path.basename(params_file).split(os.path.extsep)
44 | file_name_sep.insert(-1, 'backbone')
45 | out_path = os.path.join(
46 | os.path.dirname(params_file),
47 | os.path.extsep.join(file_name_sep))
48 | convert_params(args.model_name, args.generator_units_scale, args.generator_layers_scale,
49 | args.params_file, out_path)
50 |
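A hedged usage sketch (the parameter file name is hypothetical; the scale flags, if given, should mirror the values used during pretraining):

```bash
python3 convert_electra_pretrain_backbone.py \
    --model-name google_electra_small \
    --params-file electra_small_pretrain.params
```

Since `--out-file` is omitted, this writes `electra_small_pretrain.backbone.params` next to the input file.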
--------------------------------------------------------------------------------
/scripts/pretraining/torch/bert/README.md:
--------------------------------------------------------------------------------
1 | NOTE: GluonNLP uses `/dev/shm/gluonnlp` shared memory filesystem to share
2 | datasets among multi-process workloads. At this time, `/dev/shm/gluonnlp` is not
3 | cleaned up automatically after the workload completes and manual deletion is
4 | needed to free up memory. Sometimes you may not want to delete
5 | `/dev/shm/gluonnlp` after running a workload, for example when you intend to run
6 | another workload based on the same dataset later and it is useful to keep the
7 | dataset in shared memory.
8 |
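When you do want to reclaim the memory, deleting the directory suffices:

```bash
rm -rf /dev/shm/gluonnlp
```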
9 | # BERT
10 |
11 | 0. p4 instance preparation
12 |
13 | ```bash
14 | sudo mkfs.btrfs /dev/nvme1n1 /dev/nvme2n1 /dev/nvme3n1 /dev/nvme4n1 /dev/nvme5n1 /dev/nvme6n1 /dev/nvme7n1 /dev/nvme8n1
15 | sudo mount /dev/nvme1n1 /mnt
16 | sudo chown ubuntu:ubuntu /mnt/
17 | ```
18 |
19 | 1. Get the dataset
20 |
21 | ```bash
22 | nlp_data prepare_bookcorpus --segment_sentences --segment_num_worker 16
23 | nlp_data prepare_wikipedia --mode download_prepared --segment_sentences --segment_num_worker 16
24 | find wikicorpus/one_sentence_per_line BookCorpus/one_sentence_per_line -type f > input_reference
25 | ```
26 |
27 | 2. Prepare batches
28 |
29 | ```bash
30 | python3 prepare_quickthought.py \
31 | --input-reference input_reference \
32 | --output /mnt/out_quickthought_128 \
33 | --model-name google_en_cased_bert_base \
34 | --max-seq-length 128
35 | ```
36 |
37 |
38 | 3. Phase 1 training with sequence length 128
39 |
40 | ```bash
41 | python3 -m torch.distributed.launch --nproc_per_node=8 run_pretraining.py \
42 | --model_name google_en_cased_bert_base \
43 | --lr 0.005 \
44 | --batch_size 128 \
45 | --num_accumulated 96 \
46 | --num_dataloader_workers 4 \
47 | --num_steps 3870 \
48 | --input-files /mnt/out_quickthought_128/*feather \
49 | --mmap-folder /mnt/gluonnlp_mmap \
50 | --ckpt_dir /mnt/ckpt_dir \
51 | --ckpt_interval 1000 2>&1| tee train.log;
52 | ```
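With 8 processes, a per-GPU batch size of 128, and 96 gradient accumulation steps, this corresponds to an effective global batch size of 8 * 128 * 96 = 98,304 sequences per update.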
53 |
54 | 4. Phase 2 training with sequence length 512
55 |
56 | TBD
57 |
58 | Finally, we obtain a folder with the following structure:
59 |
60 | ```
61 | coder_base
62 | ├── vocab-{short_hash}.json
63 | ├── model-{short_hash}.params
64 | └── model-{short_hash}.yml
65 | ```
66 |
--------------------------------------------------------------------------------
/scripts/processing/README.md:
--------------------------------------------------------------------------------
1 | # Data Processing Toolkit in GluonNLP
2 | We provide a set of sharable data preprocessing utilities.
3 |
4 | ## Clean and Tokenize a Parallel Corpus
5 |
6 | To clean and tokenize a parallel corpus, use
7 | ```
8 | nlp_process clean_tok_para_corpus --help
9 | ```
10 |
11 | ## Learn a subword model
12 |
13 | To learn a subword tokenizer, use
14 | ```
15 | nlp_process learn_subword --help
16 | ```
17 |
18 |
19 | ## Apply the learned subword model
20 | To apply the learned subword tokenizer, use
21 | ```
22 | nlp_process apply_subword --help
23 | ```
24 |
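A sketch of a typical `learn_subword` invocation (the corpus path and flag values are placeholders; treat the exact flag names as assumptions and consult `--help` for the authoritative list):

```bash
nlp_process learn_subword --corpus train_corpus.txt \
                          --model yttm \
                          --vocab-size 32000 \
                          --save-dir .
```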
--------------------------------------------------------------------------------
/scripts/processing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/scripts/processing/__init__.py
--------------------------------------------------------------------------------
/scripts/processing/__main__.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import textwrap
3 |
4 | from . import (
5 | clean_tok_corpus,
6 | learn_subword,
7 | apply_subword
8 | )
9 |
10 |
11 | SUBCOMMANDS = ['clean_tok_para_corpus', 'clean_tok_mono_corpus',
12 | 'learn_subword', 'apply_subword', 'help']
13 |
14 |
15 | def cli_main():
16 | parser = argparse.ArgumentParser(
17 | description='Sharable data preprocessing utilities in GluonNLP.',
18 | prog='nlp_process', add_help=False)
19 | parser.add_argument('command', type=str,
20 | choices=SUBCOMMANDS,
21 | metavar='[subcommand]',
22 | help='The subcommand to use. '
23 | 'Choices are {}.'.format(SUBCOMMANDS))
24 | args, other_args = parser.parse_known_args()
25 | if args.command == 'clean_tok_para_corpus':
26 | parser = clean_tok_corpus.get_para_parser()
27 | sub_args = parser.parse_args(other_args)
28 | clean_tok_corpus.main_para(sub_args)
29 | elif args.command == 'clean_tok_mono_corpus':
30 | parser = clean_tok_corpus.get_mono_parser()
31 | sub_args = parser.parse_args(other_args)
32 | clean_tok_corpus.main_mono(sub_args)
33 | elif args.command == 'learn_subword':
34 | parser = learn_subword.get_parser()
35 | sub_args = parser.parse_args(other_args)
36 | learn_subword.main(sub_args)
37 | elif args.command == 'apply_subword':
38 | parser = apply_subword.get_parser()
39 | sub_args = parser.parse_args(other_args)
40 | apply_subword.main(sub_args)
41 | elif args.command == 'help':
42 | parser.print_help()
43 | else:
44 | parser.print_help()
45 |
46 |
47 | if __name__ == '__main__':
48 | cli_main()
49 |
--------------------------------------------------------------------------------
/scripts/question_answering/albert_custom.yaml:
--------------------------------------------------------------------------------
1 | version: 1.0
2 |
3 | model:
4 | name: albert_base_v2
5 | framework: mxnet
6 |
7 | tuning:
8 | strategy:
9 | name: mycustom
10 | accuracy_criterion:
11 | relative: 0.02
12 | exit_policy:
13 | timeout: 0
14 | max_trials: 1000
15 | random_seed: 9527
16 |
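This follows the tuning-configuration format of Intel Neural Compressor (formerly LPOT): `accuracy_criterion: relative: 0.02` accepts a tuned model only if its accuracy drops by at most 2% relative to the FP32 baseline, `timeout: 0` typically means no wall-clock limit (tuning runs until the criterion is met or `max_trials` is exhausted), and `strategy: name: mycustom` refers to a user-registered tuning strategy.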
--------------------------------------------------------------------------------
/scripts/question_answering/commands/README.md:
--------------------------------------------------------------------------------
1 | # Commands For Training on SQuAD
2 |
3 | All commands are generated by parsing the template in [run_squad.template](run_squad.template).
4 | To generate all commands, use the following code.
5 |
6 | ```bash
7 | python3 generate_commands.py
8 | ```
9 |
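The `{{ ... }}` placeholders in the template are filled in per model by `generate_commands.py`, producing the `run_squad2_*.sh` scripts in this directory.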
--------------------------------------------------------------------------------
/scripts/question_answering/commands/run_squad.template:
--------------------------------------------------------------------------------
1 | # Generated by "generate_commands.py"
2 |
3 | USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod
4 | VERSION=${2:-2.0} # SQuAD Version
5 | DTYPE=${3:-{{ dtype }}} # Default training data type
6 | MODEL_NAME={{ model_name }}
7 | BATCH_SIZE={{ batch_size }}
8 | NUM_ACCUMULATED={{ num_accumulated }}
9 | EPOCHS={{ epochs }}
10 | LR={{ lr }}
11 | WARMUP_RATIO={{ warmup_ratio }}
12 | WD={{ wd }}
13 | MAX_SEQ_LENGTH={{ max_seq_length }}
14 | MAX_GRAD_NORM={{ max_grad_norm }}
15 | LAYERWISE_DECAY={{ layerwise_decay }}
16 |
17 | # Prepare the Data
18 | nlp_data prepare_squad --version ${VERSION}
19 |
20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py
21 |
22 | # Run the script
23 | if [ ${USE_HOROVOD} -eq 0 ];
24 | then
25 | RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3"
26 | else
27 | RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod"
28 | fi
29 | ${RUN_COMMAND} \
30 | --model_name ${MODEL_NAME} \
31 | --data_dir squad \
32 | --output_dir finetune_${MODEL_NAME}_squad_${VERSION} \
33 | --version ${VERSION} \
34 | --do_eval \
35 | --do_train \
36 | --batch_size ${BATCH_SIZE} \
37 | --num_accumulated ${NUM_ACCUMULATED} \
38 | --layerwise_decay ${LAYERWISE_DECAY} \
39 | --epochs ${EPOCHS} \
40 | --lr ${LR} \
41 | --warmup_ratio ${WARMUP_RATIO} \
42 | --wd ${WD} \
43 | --max_seq_length ${MAX_SEQ_LENGTH} \
44 | --max_grad_norm ${MAX_GRAD_NORM} \
45 | --dtype ${DTYPE} \
46 | --overwrite_cache
47 |
--------------------------------------------------------------------------------
/scripts/question_answering/commands/run_squad2_albert_base.sh:
--------------------------------------------------------------------------------
1 | # Generated by "generate_commands.py"
2 |
3 | USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod
4 | VERSION=${2:-2.0} # SQuAD Version
5 | DTYPE=${3:-float32} # Default training data type
6 | MODEL_NAME=google_albert_base_v2
7 | BATCH_SIZE=4
8 | NUM_ACCUMULATED=3
9 | EPOCHS=3
10 | LR=2e-05
11 | WARMUP_RATIO=0.1
12 | WD=0.01
13 | MAX_SEQ_LENGTH=512
14 | MAX_GRAD_NORM=1.0
15 | LAYERWISE_DECAY=-1
16 |
17 | # Prepare the Data
18 | nlp_data prepare_squad --version ${VERSION}
19 |
20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py
21 |
22 | # Run the script
23 | if [ ${USE_HOROVOD} -eq 0 ];
24 | then
25 | RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3"
26 | else
27 | RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod"
28 | fi
29 | ${RUN_COMMAND} \
30 | --model_name ${MODEL_NAME} \
31 | --data_dir squad \
32 | --output_dir finetune_${MODEL_NAME}_squad_${VERSION} \
33 | --version ${VERSION} \
34 | --do_eval \
35 | --do_train \
36 | --batch_size ${BATCH_SIZE} \
37 | --num_accumulated ${NUM_ACCUMULATED} \
38 | --layerwise_decay ${LAYERWISE_DECAY} \
39 | --epochs ${EPOCHS} \
40 | --lr ${LR} \
41 | --warmup_ratio ${WARMUP_RATIO} \
42 | --wd ${WD} \
43 | --max_seq_length ${MAX_SEQ_LENGTH} \
44 | --max_grad_norm ${MAX_GRAD_NORM} \
45 | --dtype ${DTYPE} \
46 | --overwrite_cache
47 |
--------------------------------------------------------------------------------
/scripts/question_answering/commands/run_squad2_albert_large.sh:
--------------------------------------------------------------------------------
1 | # Generated by "generate_commands.py"
2 |
3 | USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod
4 | VERSION=${2:-2.0} # SQuAD Version
5 | DTYPE=${3:-float32} # Default training data type
6 | MODEL_NAME=google_albert_large_v2
7 | BATCH_SIZE=3
8 | NUM_ACCUMULATED=4
9 | EPOCHS=3
10 | LR=2e-05
11 | WARMUP_RATIO=0.1
12 | WD=0.01
13 | MAX_SEQ_LENGTH=512
14 | MAX_GRAD_NORM=1.0
15 | LAYERWISE_DECAY=-1
16 |
17 | # Prepare the Data
18 | nlp_data prepare_squad --version ${VERSION}
19 |
20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py
21 |
22 | # Run the script
23 | if [ ${USE_HOROVOD} -eq 0 ];
24 | then
25 | RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3"
26 | else
27 | RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod"
28 | fi
29 | ${RUN_COMMAND} \
30 | --model_name ${MODEL_NAME} \
31 | --data_dir squad \
32 | --output_dir finetune_${MODEL_NAME}_squad_${VERSION} \
33 | --version ${VERSION} \
34 | --do_eval \
35 | --do_train \
36 | --batch_size ${BATCH_SIZE} \
37 | --num_accumulated ${NUM_ACCUMULATED} \
38 | --layerwise_decay ${LAYERWISE_DECAY} \
39 | --epochs ${EPOCHS} \
40 | --lr ${LR} \
41 | --warmup_ratio ${WARMUP_RATIO} \
42 | --wd ${WD} \
43 | --max_seq_length ${MAX_SEQ_LENGTH} \
44 | --max_grad_norm ${MAX_GRAD_NORM} \
45 | --dtype ${DTYPE} \
46 | --overwrite_cache
47 |
--------------------------------------------------------------------------------
/scripts/question_answering/commands/run_squad2_albert_xlarge.sh:
--------------------------------------------------------------------------------
1 | # Generated by "generate_commands.py"
2 |
3 | USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod
4 | VERSION=${2:-2.0} # SQuAD Version
5 | DTYPE=${3:-float32} # Default training data type
6 | MODEL_NAME=google_albert_xlarge_v2
7 | BATCH_SIZE=1
8 | NUM_ACCUMULATED=12
9 | EPOCHS=3
10 | LR=2e-05
11 | WARMUP_RATIO=0.1
12 | WD=0.01
13 | MAX_SEQ_LENGTH=512
14 | MAX_GRAD_NORM=0.1
15 | LAYERWISE_DECAY=-1
16 |
17 | # Prepare the Data
18 | nlp_data prepare_squad --version ${VERSION}
19 |
20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py
21 |
22 | # Run the script
23 | if [ ${USE_HOROVOD} -eq 0 ];
24 | then
25 | RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3"
26 | else
27 | RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod"
28 | fi
29 | ${RUN_COMMAND} \
30 | --model_name ${MODEL_NAME} \
31 | --data_dir squad \
32 | --output_dir finetune_${MODEL_NAME}_squad_${VERSION} \
33 | --version ${VERSION} \
34 | --do_eval \
35 | --do_train \
36 | --batch_size ${BATCH_SIZE} \
37 | --num_accumulated ${NUM_ACCUMULATED} \
38 | --layerwise_decay ${LAYERWISE_DECAY} \
39 | --epochs ${EPOCHS} \
40 | --lr ${LR} \
41 | --warmup_ratio ${WARMUP_RATIO} \
42 | --wd ${WD} \
43 | --max_seq_length ${MAX_SEQ_LENGTH} \
44 | --max_grad_norm ${MAX_GRAD_NORM} \
45 | --dtype ${DTYPE} \
46 | --overwrite_cache
47 |
--------------------------------------------------------------------------------
/scripts/question_answering/commands/run_squad2_albert_xxlarge.sh:
--------------------------------------------------------------------------------
1 | # Generated by "generate_commands.py"
2 |
3 | USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod
4 | VERSION=${2:-2.0} # SQuAD Version
5 | DTYPE=${3:-float32} # Default training data type
6 | MODEL_NAME=google_albert_xxlarge_v2
7 | BATCH_SIZE=1
8 | NUM_ACCUMULATED=12
9 | EPOCHS=3
10 | LR=2e-05
11 | WARMUP_RATIO=0.1
12 | WD=0.01
13 | MAX_SEQ_LENGTH=512
14 | MAX_GRAD_NORM=0.1
15 | LAYERWISE_DECAY=-1
16 |
17 | # Prepare the Data
18 | nlp_data prepare_squad --version ${VERSION}
19 |
20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py
21 |
22 | # Run the script
23 | if [ ${USE_HOROVOD} -eq 0 ];
24 | then
25 | RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3"
26 | else
27 | RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod"
28 | fi
29 | ${RUN_COMMAND} \
30 | --model_name ${MODEL_NAME} \
31 | --data_dir squad \
32 | --output_dir finetune_${MODEL_NAME}_squad_${VERSION} \
33 | --version ${VERSION} \
34 | --do_eval \
35 | --do_train \
36 | --batch_size ${BATCH_SIZE} \
37 | --num_accumulated ${NUM_ACCUMULATED} \
38 | --layerwise_decay ${LAYERWISE_DECAY} \
39 | --epochs ${EPOCHS} \
40 | --lr ${LR} \
41 | --warmup_ratio ${WARMUP_RATIO} \
42 | --wd ${WD} \
43 | --max_seq_length ${MAX_SEQ_LENGTH} \
44 | --max_grad_norm ${MAX_GRAD_NORM} \
45 | --dtype ${DTYPE} \
46 | --overwrite_cache
47 |
--------------------------------------------------------------------------------
/scripts/question_answering/commands/run_squad2_electra_base.sh:
--------------------------------------------------------------------------------
1 | # Generated by "generate_commands.py"
2 |
3 | USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod
4 | VERSION=${2:-2.0} # SQuAD Version
5 | DTYPE=${3:-float32} # Default training data type
6 | MODEL_NAME=google_electra_base
7 | BATCH_SIZE=8
8 | NUM_ACCUMULATED=1
9 | EPOCHS=2
10 | LR=0.0001
11 | WARMUP_RATIO=0.1
12 | WD=0
13 | MAX_SEQ_LENGTH=512
14 | MAX_GRAD_NORM=1.0
15 | LAYERWISE_DECAY=0.8
16 |
17 | # Prepare the Data
18 | nlp_data prepare_squad --version ${VERSION}
19 |
20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py
21 |
22 | # Run the script
23 | if [ ${USE_HOROVOD} -eq 0 ];
24 | then
25 | RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3"
26 | else
27 | RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod"
28 | fi
29 | ${RUN_COMMAND} \
30 | --model_name ${MODEL_NAME} \
31 | --data_dir squad \
32 | --output_dir finetune_${MODEL_NAME}_squad_${VERSION} \
33 | --version ${VERSION} \
34 | --do_eval \
35 | --do_train \
36 | --batch_size ${BATCH_SIZE} \
37 | --num_accumulated ${NUM_ACCUMULATED} \
38 | --layerwise_decay ${LAYERWISE_DECAY} \
39 | --epochs ${EPOCHS} \
40 | --lr ${LR} \
41 | --warmup_ratio ${WARMUP_RATIO} \
42 | --wd ${WD} \
43 | --max_seq_length ${MAX_SEQ_LENGTH} \
44 | --max_grad_norm ${MAX_GRAD_NORM} \
45 | --dtype ${DTYPE} \
46 | --overwrite_cache
47 |
--------------------------------------------------------------------------------
/scripts/question_answering/commands/run_squad2_electra_large.sh:
--------------------------------------------------------------------------------
1 | # Generated by "generate_commands.py"
2 |
3 | USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod
4 | VERSION=${2:-2.0} # SQuAD Version
5 | DTYPE=${3:-float32} # Default training data type
6 | MODEL_NAME=google_electra_large
7 | BATCH_SIZE=2
8 | NUM_ACCUMULATED=4
9 | EPOCHS=2
10 | LR=5e-05
11 | WARMUP_RATIO=0.1
12 | WD=0
13 | MAX_SEQ_LENGTH=512
14 | MAX_GRAD_NORM=1
15 | LAYERWISE_DECAY=0.9
16 |
17 | # Prepare the Data
18 | nlp_data prepare_squad --version ${VERSION}
19 |
20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py
21 |
22 | # Run the script
23 | if [ ${USE_HOROVOD} -eq 0 ];
24 | then
25 | RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3"
26 | else
27 | RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod"
28 | fi
29 | ${RUN_COMMAND} \
30 | --model_name ${MODEL_NAME} \
31 | --data_dir squad \
32 | --output_dir finetune_${MODEL_NAME}_squad_${VERSION} \
33 | --version ${VERSION} \
34 | --do_eval \
35 | --do_train \
36 | --batch_size ${BATCH_SIZE} \
37 | --num_accumulated ${NUM_ACCUMULATED} \
38 | --layerwise_decay ${LAYERWISE_DECAY} \
39 | --epochs ${EPOCHS} \
40 | --lr ${LR} \
41 | --warmup_ratio ${WARMUP_RATIO} \
42 | --wd ${WD} \
43 | --max_seq_length ${MAX_SEQ_LENGTH} \
44 | --max_grad_norm ${MAX_GRAD_NORM} \
45 | --dtype ${DTYPE} \
46 | --overwrite_cache
47 |
--------------------------------------------------------------------------------
/scripts/question_answering/commands/run_squad2_electra_small.sh:
--------------------------------------------------------------------------------
1 | # Generated by "generate_commands.py"
2 |
3 | USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod
4 | VERSION=${2:-2.0} # SQuAD Version
5 | DTYPE=${3:-float32} # Default training data type
6 | MODEL_NAME=google_electra_small
7 | BATCH_SIZE=8
8 | NUM_ACCUMULATED=1
9 | EPOCHS=2
10 | LR=0.0003
11 | WARMUP_RATIO=0.1
12 | WD=0
13 | MAX_SEQ_LENGTH=512
14 | MAX_GRAD_NORM=1.0
15 | LAYERWISE_DECAY=0.8
16 |
17 | # Prepare the Data
18 | nlp_data prepare_squad --version ${VERSION}
19 |
20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py
21 |
22 | # Run the script
23 | if [ ${USE_HOROVOD} -eq 0 ];
24 | then
25 | RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3"
26 | else
27 | RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod"
28 | fi
29 | ${RUN_COMMAND} \
30 | --model_name ${MODEL_NAME} \
31 | --data_dir squad \
32 | --output_dir finetune_${MODEL_NAME}_squad_${VERSION} \
33 | --version ${VERSION} \
34 | --do_eval \
35 | --do_train \
36 | --batch_size ${BATCH_SIZE} \
37 | --num_accumulated ${NUM_ACCUMULATED} \
38 | --layerwise_decay ${LAYERWISE_DECAY} \
39 | --epochs ${EPOCHS} \
40 | --lr ${LR} \
41 | --warmup_ratio ${WARMUP_RATIO} \
42 | --wd ${WD} \
43 | --max_seq_length ${MAX_SEQ_LENGTH} \
44 | --max_grad_norm ${MAX_GRAD_NORM} \
45 | --dtype ${DTYPE} \
46 | --overwrite_cache
47 |
--------------------------------------------------------------------------------
/scripts/question_answering/commands/run_squad2_gluon_en_cased_bert_base_v1.sh:
--------------------------------------------------------------------------------
1 | # Generated by "generate_commands.py"
2 |
3 | USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod
4 | VERSION=${2:-2.0} # SQuAD Version
5 | DTYPE=${3:-float32} # Default training data type
6 | MODEL_NAME=gluon_en_cased_bert_base_v1
7 | BATCH_SIZE=6
8 | NUM_ACCUMULATED=2
9 | EPOCHS=3
10 | LR=3e-05
11 | WARMUP_RATIO=0.1
12 | WD=0.01
13 | MAX_SEQ_LENGTH=512
14 | MAX_GRAD_NORM=1.0
15 | LAYERWISE_DECAY=-1
16 |
17 | # Prepare the Data
18 | nlp_data prepare_squad --version ${VERSION}
19 |
20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py
21 |
22 | # Run the script
23 | if [ ${USE_HOROVOD} -eq 0 ];
24 | then
25 | RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3"
26 | else
27 | RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod"
28 | fi
29 | ${RUN_COMMAND} \
30 | --model_name ${MODEL_NAME} \
31 | --data_dir squad \
32 | --output_dir finetune_${MODEL_NAME}_squad_${VERSION} \
33 | --version ${VERSION} \
34 | --do_eval \
35 | --do_train \
36 | --batch_size ${BATCH_SIZE} \
37 | --num_accumulated ${NUM_ACCUMULATED} \
38 | --layerwise_decay ${LAYERWISE_DECAY} \
39 | --epochs ${EPOCHS} \
40 | --lr ${LR} \
41 | --warmup_ratio ${WARMUP_RATIO} \
42 | --wd ${WD} \
43 | --max_seq_length ${MAX_SEQ_LENGTH} \
44 | --max_grad_norm ${MAX_GRAD_NORM} \
45 | --dtype ${DTYPE} \
46 | --overwrite_cache
47 |
--------------------------------------------------------------------------------
/scripts/question_answering/commands/run_squad2_mobilebert.sh:
--------------------------------------------------------------------------------
1 | # Generated by "generate_commands.py"
2 |
3 | USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod
4 | VERSION=${2:-2.0} # SQuAD Version
5 | DTYPE=${3:-float32} # Default training data type
6 | MODEL_NAME=google_uncased_mobilebert
7 | BATCH_SIZE=8
8 | NUM_ACCUMULATED=1
9 | EPOCHS=5
10 | LR=4e-05
11 | WARMUP_RATIO=0.1
12 | WD=0.01
13 | MAX_SEQ_LENGTH=384
14 | MAX_GRAD_NORM=1.0
15 | LAYERWISE_DECAY=-1
16 |
17 | # Prepare the Data
18 | nlp_data prepare_squad --version ${VERSION}
19 |
20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py
21 |
22 | # Run the script
23 | if [ ${USE_HOROVOD} -eq 0 ];
24 | then
25 | RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3"
26 | else
27 | RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod"
28 | fi
29 | ${RUN_COMMAND} \
30 | --model_name ${MODEL_NAME} \
31 | --data_dir squad \
32 | --output_dir finetune_${MODEL_NAME}_squad_${VERSION} \
33 | --version ${VERSION} \
34 | --do_eval \
35 | --do_train \
36 | --batch_size ${BATCH_SIZE} \
37 | --num_accumulated ${NUM_ACCUMULATED} \
38 | --layerwise_decay ${LAYERWISE_DECAY} \
39 | --epochs ${EPOCHS} \
40 | --lr ${LR} \
41 | --warmup_ratio ${WARMUP_RATIO} \
42 | --wd ${WD} \
43 | --max_seq_length ${MAX_SEQ_LENGTH} \
44 | --max_grad_norm ${MAX_GRAD_NORM} \
45 | --dtype ${DTYPE} \
46 | --overwrite_cache
47 |
--------------------------------------------------------------------------------
/scripts/question_answering/commands/run_squad2_roberta_large.sh:
--------------------------------------------------------------------------------
1 | # Generated by "generate_commands.py"
2 |
3 | USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod
4 | VERSION=${2:-2.0} # SQuAD Version
5 | DTYPE=${3:-float32} # Default training data type
6 | MODEL_NAME=fairseq_roberta_large
7 | BATCH_SIZE=2
8 | NUM_ACCUMULATED=6
9 | EPOCHS=3
10 | LR=3e-05
11 | WARMUP_RATIO=0.2
12 | WD=0.01
13 | MAX_SEQ_LENGTH=512
14 | MAX_GRAD_NORM=1.0
15 | LAYERWISE_DECAY=-1
16 |
17 | # Prepare the Data
18 | nlp_data prepare_squad --version ${VERSION}
19 |
20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py
21 |
22 | # Run the script
23 | if [ ${USE_HOROVOD} -eq 0 ];
24 | then
25 | RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3"
26 | else
27 | RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod"
28 | fi
29 | ${RUN_COMMAND} \
30 | --model_name ${MODEL_NAME} \
31 | --data_dir squad \
32 | --output_dir finetune_${MODEL_NAME}_squad_${VERSION} \
33 | --version ${VERSION} \
34 | --do_eval \
35 | --do_train \
36 | --batch_size ${BATCH_SIZE} \
37 | --num_accumulated ${NUM_ACCUMULATED} \
38 | --layerwise_decay ${LAYERWISE_DECAY} \
39 | --epochs ${EPOCHS} \
40 | --lr ${LR} \
41 | --warmup_ratio ${WARMUP_RATIO} \
42 | --wd ${WD} \
43 | --max_seq_length ${MAX_SEQ_LENGTH} \
44 | --max_grad_norm ${MAX_GRAD_NORM} \
45 | --dtype ${DTYPE} \
46 | --overwrite_cache
47 |
--------------------------------------------------------------------------------
/scripts/question_answering/commands/run_squad2_uncased_bert_base.sh:
--------------------------------------------------------------------------------
1 | # Generated by "generate_commands.py"
2 |
3 | USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod
4 | VERSION=${2:-2.0} # SQuAD Version
5 | DTYPE=${3:-float32} # Default training data type
6 | MODEL_NAME=google_en_uncased_bert_base
7 | BATCH_SIZE=6
8 | NUM_ACCUMULATED=2
9 | EPOCHS=3
10 | LR=3e-05
11 | WARMUP_RATIO=0.1
12 | WD=0.01
13 | MAX_SEQ_LENGTH=512
14 | MAX_GRAD_NORM=1.0
15 | LAYERWISE_DECAY=-1
16 |
17 | # Prepare the Data
18 | nlp_data prepare_squad --version ${VERSION}
19 |
20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py
21 |
22 | # Run the script
23 | if [ ${USE_HOROVOD} -eq 0 ];
24 | then
25 | RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3"
26 | else
27 | RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod"
28 | fi
29 | ${RUN_COMMAND} \
30 | --model_name ${MODEL_NAME} \
31 | --data_dir squad \
32 | --output_dir finetune_${MODEL_NAME}_squad_${VERSION} \
33 | --version ${VERSION} \
34 | --do_eval \
35 | --do_train \
36 | --batch_size ${BATCH_SIZE} \
37 | --num_accumulated ${NUM_ACCUMULATED} \
38 | --layerwise_decay ${LAYERWISE_DECAY} \
39 | --epochs ${EPOCHS} \
40 | --lr ${LR} \
41 | --warmup_ratio ${WARMUP_RATIO} \
42 | --wd ${WD} \
43 | --max_seq_length ${MAX_SEQ_LENGTH} \
44 | --max_grad_norm ${MAX_GRAD_NORM} \
45 | --dtype ${DTYPE} \
46 | --overwrite_cache
47 |
--------------------------------------------------------------------------------
/scripts/question_answering/commands/run_squad2_uncased_bert_large.sh:
--------------------------------------------------------------------------------
1 | # Generated by "generate_commands.py"
2 |
3 | USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod
4 | VERSION=${2:-2.0} # SQuAD Version
5 | DTYPE=${3:-float32} # Default training data type
6 | MODEL_NAME=google_en_uncased_bert_large
7 | BATCH_SIZE=2
8 | NUM_ACCUMULATED=6
9 | EPOCHS=3
10 | LR=3e-05
11 | WARMUP_RATIO=0.1
12 | WD=0.01
13 | MAX_SEQ_LENGTH=512
14 | MAX_GRAD_NORM=1.0
15 | LAYERWISE_DECAY=-1
16 |
17 | # Prepare the Data
18 | nlp_data prepare_squad --version ${VERSION}
19 |
20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py
21 |
22 | # Run the script
23 | if [ ${USE_HOROVOD} -eq 0 ];
24 | then
25 | RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3"
26 | else
27 | RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod"
28 | fi
29 | ${RUN_COMMAND} \
30 | --model_name ${MODEL_NAME} \
31 | --data_dir squad \
32 | --output_dir finetune_${MODEL_NAME}_squad_${VERSION} \
33 | --version ${VERSION} \
34 | --do_eval \
35 | --do_train \
36 | --batch_size ${BATCH_SIZE} \
37 | --num_accumulated ${NUM_ACCUMULATED} \
38 | --layerwise_decay ${LAYERWISE_DECAY} \
39 | --epochs ${EPOCHS} \
40 | --lr ${LR} \
41 | --warmup_ratio ${WARMUP_RATIO} \
42 | --wd ${WD} \
43 | --max_seq_length ${MAX_SEQ_LENGTH} \
44 | --max_grad_norm ${MAX_GRAD_NORM} \
45 | --dtype ${DTYPE} \
46 | --overwrite_cache
47 |
--------------------------------------------------------------------------------
/scripts/question_answering/commands/run_squad2_uncased_bert_wwm_large.sh:
--------------------------------------------------------------------------------
1 | # Generated by "generate_commands.py"
2 |
3 | USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod
4 | VERSION=${2:-2.0} # SQuAD Version
5 | DTYPE=${3:-float32} # Default training data type
6 | MODEL_NAME=google_en_uncased_bert_wwm_large
7 | BATCH_SIZE=3
8 | NUM_ACCUMULATED=2
9 | EPOCHS=2
10 | LR=3e-05
11 | WARMUP_RATIO=0.1
12 | WD=0.01
13 | MAX_SEQ_LENGTH=512
14 | MAX_GRAD_NORM=1.0
15 | LAYERWISE_DECAY=-1
16 |
17 | # Prepare the Data
18 | nlp_data prepare_squad --version ${VERSION}
19 |
20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py
21 |
22 | # Run the script
23 | if [ ${USE_HOROVOD} -eq 0 ];
24 | then
25 | RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3"
26 | else
27 | RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod"
28 | fi
29 | ${RUN_COMMAND} \
30 | --model_name ${MODEL_NAME} \
31 | --data_dir squad \
32 | --output_dir finetune_${MODEL_NAME}_squad_${VERSION} \
33 | --version ${VERSION} \
34 | --do_eval \
35 | --do_train \
36 | --batch_size ${BATCH_SIZE} \
37 | --num_accumulated ${NUM_ACCUMULATED} \
38 | --layerwise_decay ${LAYERWISE_DECAY} \
39 | --epochs ${EPOCHS} \
40 | --lr ${LR} \
41 | --warmup_ratio ${WARMUP_RATIO} \
42 | --wd ${WD} \
43 | --max_seq_length ${MAX_SEQ_LENGTH} \
44 | --max_grad_norm ${MAX_GRAD_NORM} \
45 | --dtype ${DTYPE} \
46 | --overwrite_cache
47 |
--------------------------------------------------------------------------------
/src/gluonnlp/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = '1.0.0.dev'
2 | from . import base
3 | from . import data
4 | from . import models
5 | from . import utils
6 | from . import attention_cell
7 | from . import initializer as init
8 | from . import layers
9 | from . import loss
10 | from . import lr_scheduler
11 | from . import op
12 | from . import torch
13 | from . import sequence_sampler
14 | from . import embedding
15 |
--------------------------------------------------------------------------------
/src/gluonnlp/base.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | # pylint: disable=abstract-method
19 | """Helper functions."""
20 |
21 | import os
22 | import numpy as np
23 |
24 | __all__ = ['get_home_dir', 'get_data_home_dir']
25 |
26 | INT_TYPES = (int, np.int32, np.int64)
27 | FLOAT_TYPES = (float, np.float16, np.float32, np.float64)
28 |
29 |
30 | def get_home_dir():
31 | """Get home directory for storing datasets/models/pre-trained word embeddings"""
32 | _home_dir = os.environ.get('GLUONNLP_HOME', os.path.join('~', '.gluonnlp'))
33 | # expand ~ to actual path
34 | _home_dir = os.path.expanduser(_home_dir)
35 | return _home_dir
36 |
37 |
38 | def get_data_home_dir():
39 | """Get home directory for storing the datasets"""
40 | home_dir = get_home_dir()
41 | return os.path.join(home_dir, 'datasets')
42 |
43 |
44 | def get_model_zoo_home_dir():
45 | """Get the local directory for storing pretrained models"""
46 | home_dir = get_home_dir()
47 | return os.path.join(home_dir, 'models')
48 |
49 |
50 | def get_model_zoo_checksum_dir():
51 | """Get the directory that stores the checksums of the artifacts in the model zoo """
52 | curr_dir = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
53 | check_sum_dir = os.path.join(curr_dir, 'models', 'model_zoo_checksums')
54 | return check_sum_dir
55 |
56 |
57 | def get_repo_url():
58 | """Return the base URL for Gluon dataset and model repository """
59 | default_repo = 's3://gluonnlp-numpy-data'
60 | repo_url = os.environ.get('GLUONNLP_REPO_URL', default_repo)
61 | if repo_url[-1] != '/':
62 | repo_url = repo_url + '/'
63 | return repo_url
64 |
65 |
66 | def get_repo_model_zoo_url():
67 | """Return the base URL for GluonNLP Model Zoo"""
68 | repo_url = get_repo_url()
69 | model_zoo_url = repo_url + 'models/'
70 | return model_zoo_url
71 |
72 |
73 | def use_einsum_optimization():
74 | """Whether to use einsum for attention. This will potentially accelerate the
75 | attention cell.
76 |
77 | Returns
78 | -------
79 | flag
80 | Whether the einsum optimization is enabled.
81 |
82 | """
83 | flag = os.environ.get('GLUONNLP_USE_EINSUM', 'False').lower() in ('1', 'true')
84 | return flag
85 |
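A minimal sketch of how these locations can be redirected (the cache path below is hypothetical):

```python
import os

# Must be set before the directories are queried; get_home_dir() reads the
# environment on every call, so setting it at process start is safest.
os.environ['GLUONNLP_HOME'] = '/mnt/gluonnlp_cache'

from gluonnlp.base import get_home_dir, get_data_home_dir

print(get_home_dir())       # /mnt/gluonnlp_cache
print(get_data_home_dir())  # /mnt/gluonnlp_cache/datasets
```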
--------------------------------------------------------------------------------
/src/gluonnlp/cli/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/src/gluonnlp/cli/__init__.py
--------------------------------------------------------------------------------
/src/gluonnlp/cli/average_checkpoint.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import mxnet as mx
3 | import os
4 |
5 | mx.npx.set_np()
6 |
7 |
8 | def get_parser():
9 | parser = argparse.ArgumentParser(description='Script to average the checkpoints')
10 | parser.add_argument('--checkpoints', type=str, required=True, nargs='+',
11 | help='Checkpoint file paths; supports two formats: '
12 | '--checkpoints folder/epoch*.params or --checkpoints folder/update*.params')
13 | parser.add_argument('--ids', type=int, required=False, nargs='+',
14 | help='The IDs of the checkpoints.')
15 | parser.add_argument('--begin', type=int, required=False,
16 | default=None,
17 | help='begin number of checkpoints')
18 | parser.add_argument('--end', type=int, required=False,
19 | default=None,
20 | help='end number of checkpoints. '
21 | 'We select the checkpoints with ID >= begin and <= end.')
22 | parser.add_argument('--save-path', type=str, required=True, help='Path of the output file')
23 | return parser
24 |
25 |
26 | def main(args):
27 | if args.begin is not None or args.end is not None or args.ids is not None:
28 | print(f'Before filtering, the checkpoints are {args.checkpoints}')
29 | prefix = os.path.commonprefix(args.checkpoints)
30 | postfix = os.path.commonprefix([ele[::-1] for ele in args.checkpoints])[::-1]
31 | checkpoint_id_l = [int(ele[len(prefix):-len(postfix)]) for ele in args.checkpoints]
32 | ckpt_paths = []
33 | if args.ids is not None:
34 | for ele in args.ids:
35 | assert ele in checkpoint_id_l
36 | ckpt_paths.append(f'{prefix}{ele}{postfix}')
37 | else:
38 | assert args.begin is not None and args.end is not None, \
39 | 'Must specify both begin and end if you want to select a range!'
40 | assert args.begin >= 0
41 | assert args.end >= args.begin
42 | for ele in checkpoint_id_l:
43 | if ele >= args.begin and ele <= args.end:
44 | ckpt_paths.append(f'{prefix}{ele}{postfix}')
45 | else:
46 | ckpt_paths = args.checkpoints
47 | print(f'Load models from {ckpt_paths}')
48 | print('Averaging the models and saving the result to {}'.format(args.save_path))
49 | assert len(ckpt_paths) > 0, 'Cannot find checkpoints. You may need to check the inputs again.'
50 | res = mx.npx.load(ckpt_paths[0])
51 | keys = res.keys()
52 | for ckpt_path in ckpt_paths[1:]:
53 | ckpt = mx.npx.load(ckpt_path)
54 | for key in keys:
55 | res[key] += ckpt[key]
56 | for key in keys:
57 | res[key] /= len(ckpt_paths)
58 | mx.npx.savez(args.save_path, **res)
59 |
60 |
61 | def cli_main():
62 | parser = get_parser()
63 | args = parser.parse_args()
64 | main(args)
65 |
66 |
67 | if __name__ == '__main__':
68 | cli_main()
69 |
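A usage sketch (the folder and file names are hypothetical; the shell expands the glob into the path list consumed by `--checkpoints`):

```bash
python3 -m gluonnlp.cli.average_checkpoint \
    --checkpoints ckpt_dir/update*.params \
    --begin 10 --end 20 \
    --save-path ckpt_dir/avg_10_20.params
```

The script recovers each checkpoint's numeric ID by stripping the longest common prefix and suffix from the given paths, then keeps only IDs in `[begin, end]` (or exactly the ones passed via `--ids`) before averaging.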
--------------------------------------------------------------------------------
/src/gluonnlp/cli/data:
--------------------------------------------------------------------------------
1 | ../../../scripts/datasets
--------------------------------------------------------------------------------
/src/gluonnlp/cli/process:
--------------------------------------------------------------------------------
1 | ../../../scripts/processing
--------------------------------------------------------------------------------
/src/gluonnlp/data/__init__.py:
--------------------------------------------------------------------------------
1 | from . import vocab
2 | from . import tokenizers
3 | from . import batchify
4 | from .vocab import *
5 |
6 | __all__ = ['batchify', 'tokenizers'] + vocab.__all__
7 |
--------------------------------------------------------------------------------
/src/gluonnlp/data/tokenizers/__init__.py:
--------------------------------------------------------------------------------
1 | """Tokenizers"""
2 | from .base import *
3 | from .huggingface import *
4 | from .jieba import *
5 | from .moses import *
6 | from .sentencepiece import *
7 | from .spacy import *
8 | from .subword_nmt import *
9 | from .whitespace import *
10 | from .yttm import *
11 |
12 |
13 | __all__ = base.__all__ + \
14 | huggingface.__all__ + \
15 | jieba.__all__ + \
16 | moses.__all__ + \
17 | sentencepiece.__all__ + \
18 | spacy.__all__ + \
19 | subword_nmt.__all__ + \
20 | whitespace.__all__ + \
21 | yttm.__all__
22 |
--------------------------------------------------------------------------------
/src/gluonnlp/embedding/__init__.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 |
18 | # pylint: disable=wildcard-import
19 | """Word embeddings."""
20 |
21 | from . import embed_loader
22 | from .embed_loader import *
23 |
24 | __all__ = embed_loader.__all__
25 |
--------------------------------------------------------------------------------
/src/gluonnlp/loss.py:
--------------------------------------------------------------------------------
1 | from mxnet.gluon import HybridBlock
2 | from mxnet import npx
3 |
4 |
5 | class LabelSmoothCrossEntropyLoss(HybridBlock):
6 | r"""Computes the softmax cross entropy loss with label-smoothing
7 |
8 | .. math::
9 |
10 | \DeclareMathOperator{\softmax}{softmax}
11 |
12 | lp = \log \softmax({pred})
13 |
14 | L_i = - \left[ (1 - \alpha) \, lp_{i, {label}_i} + \frac{\alpha}{N} \sum_{j=1}^{N} lp_{i, j} \right]
15 |
16 | where :math:`N` is the number of labels and :math:`i` indexes the samples. To reduce
17 | complexity, instead of materializing the smoothed label distribution, we implement the
18 | loss directly as this weighted combination of the picked log-likelihood
19 | :math:`lp_{i, {label}_i}` (weight :math:`1 - \alpha`) and the sum of all
20 | log-probabilities :math:`\sum_j lp_{i, j}` (weight :math:`\alpha / N`).
21 |
22 | Parameters
23 | ----------
24 | num_labels
25 | The number of possible labels. For example, in NLP, it can be the size of the vocabulary.
26 | alpha
27 | The amount of uncertainty injected into the labels. Each negative label is
28 | treated with probability :math:`\alpha / N`.
29 | from_logits
30 | Whether input is a log probability (usually from log_softmax) instead of unnormalized numbers.
31 | """
32 | def __init__(self, num_labels: int, alpha: float = 0.1, from_logits: bool = False, **kwargs):
33 | super().__init__(**kwargs)
34 | self._num_labels = num_labels
35 | self._alpha = alpha
36 | self._from_logits = from_logits
37 |
38 | def forward(self, pred, label):
39 | """
40 |
41 | Parameters
42 | ----------
43 | pred :
44 | The predictions of the network. Shape (..., V)
45 | label :
46 | The labels. Shape (..., )
47 |
48 | Returns
49 | -------
50 | loss :
51 | Shape (..., )
52 | """
53 | if not self._from_logits:
54 | pred = npx.log_softmax(pred, axis=-1)
55 | log_likelihood = npx.pick(pred, label, axis=-1)
56 | all_scores = pred.sum(axis=-1)
57 | loss = - (1 - self._alpha) * log_likelihood\
58 | - self._alpha / float(self._num_labels) * all_scores
59 | return loss
60 |
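A minimal usage sketch under the repo's numpy-mode convention:

```python
import mxnet as mx
from gluonnlp.loss import LabelSmoothCrossEntropyLoss

mx.npx.set_np()

loss_fn = LabelSmoothCrossEntropyLoss(num_labels=5, alpha=0.1)
pred = mx.np.random.normal(size=(2, 5))  # unnormalized scores, shape (..., V)
label = mx.np.array([1, 3])              # class indices, shape (...,)
loss = loss_fn(pred, label)              # per-sample loss, shape (2,)
```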
--------------------------------------------------------------------------------
/src/gluonnlp/lr_scheduler.py:
--------------------------------------------------------------------------------
1 | import math
2 | from mxnet import lr_scheduler
3 |
4 |
5 | class InverseSquareRootScheduler(lr_scheduler.LRScheduler):
6 | """ Reduce the learning rate according to a polynomial of given power.
7 |
8 | During warmup
9 | Increase the learning rate linearly from warmup_init_lr to base_lr,
10 | After warmup
11 | Decay the learning rate with
12 | lr = base_lr * sqrt(warmup_steps) / sqrt(num_update)
13 |
14 | Parameters
15 | ----------
16 | warmup_steps
17 | maximum number of updates before the decay reaches final learning rate.
18 | base_lr
19 | The final learning rate in the warm-up stage. The learning rate starts to decay after
20 | the lr reaches warmup_end_lr
21 | warmup_init_lr
22 | The initial learning rate of the scheduler. The warm up starts at this point.
23 | """
24 |
25 | def __init__(self, warmup_steps: int, base_lr: float = 1E-3, warmup_init_lr: float = 0.0):
26 | super().__init__(
27 | base_lr, warmup_steps, warmup_init_lr, 'linear')
28 | self.base_lr = base_lr
29 | self.warmup_steps = warmup_steps
30 |
31 | def __call__(self, num_update):
32 | if num_update < self.warmup_steps:
33 | return self.get_warmup_lr(num_update)
34 | else:
35 | return self.base_lr * math.sqrt(self.warmup_steps) / math.sqrt(num_update)
36 |
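A quick sanity check of the schedule (the values follow directly from the formulas above):

```python
from gluonnlp.lr_scheduler import InverseSquareRootScheduler

scheduler = InverseSquareRootScheduler(warmup_steps=1000, base_lr=1e-3, warmup_init_lr=0.0)
print(scheduler(500))    # mid warm-up, linear ramp:      5e-4
print(scheduler(1000))   # end of warm-up:                1e-3
print(scheduler(4000))   # decay by sqrt(1000/4000) = 0.5: 5e-4
```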
--------------------------------------------------------------------------------
/src/gluonnlp/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .base import *
2 | from .albert import *
3 | from .bert import *
4 | from .electra import *
5 | from .gpt2 import *
6 | from .mobilebert import *
7 | from .roberta import *
8 | from .transformer import *
9 | from .transformer_xl import *
10 | from .xlmr import *
11 | from .bart import *
12 | from .t5 import *
13 | from .mt5 import *
14 |
15 | __all__ = base.__all__ + \
16 | albert.__all__ + \
17 | bert.__all__ + \
18 | electra.__all__ + \
19 | gpt2.__all__ + \
20 | mobilebert.__all__ + \
21 | roberta.__all__ + \
22 | transformer.__all__ + \
23 | transformer_xl.__all__ + \
24 | xlmr.__all__ + \
25 | bart.__all__ + \
26 | t5.__all__ + \
27 | mt5.__all__
28 |
--------------------------------------------------------------------------------
/src/gluonnlp/models/base.py:
--------------------------------------------------------------------------------
1 | __all__ = ['list_backbone_names', 'get_backbone', 'BACKBONE_REGISTRY']
2 |
3 | from typing import Tuple, List
4 | from ..base import get_model_zoo_home_dir
5 | from ..data.tokenizers import BaseTokenizer
6 | from ..utils.registry import Registry
7 | from mxnet.gluon import Block
8 |
9 | BACKBONE_REGISTRY = Registry('Backbone Models')
10 |
11 |
12 | def list_backbone_names():
13 | all_keys = []
14 | for backbone_type in BACKBONE_REGISTRY.list_keys():
15 | all_keys.extend(BACKBONE_REGISTRY.get(backbone_type)[-1]())
16 | return all_keys
17 |
18 |
19 | def get_backbone(model_name: str,
20 | root: str = get_model_zoo_home_dir(),
21 | **kwargs) -> Tuple['Block', str, BaseTokenizer, str, List]:
22 | """Get the backbone network
23 |
24 | Parameters
25 | ----------
26 | model_name
27 | The name of the pretrained model
28 | root
29 | Downloaded directory of the model zoo
30 |
31 | Returns
32 | -------
33 | model_cls
34 | The class to construct the backbone network
35 | cfg
36 | The configuration of the backbone, as consumed by ``model_cls.from_cfg``
37 | tokenizer
38 | The tokenizer that is bound to the backbone model
39 | backbone_param_path
40 | The path to the pretrained backbone weights
41 | others
42 | The other items returned by the create function.
43 | Will be wrapped into a list
44 |
45 | Examples
46 | --------
47 |
48 | >>> from gluonnlp.models import get_backbone
49 | >>> model_cls, cfg, tokenizer, backbone_param_path, _ = get_backbone('google_en_cased_bert_base')
50 | >>> model = model_cls.from_cfg(cfg)
51 | >>> model.load_parameters(backbone_param_path)
52 | """
53 | model_cls, local_create_fn = None, None
54 |
55 | for backbone_type in BACKBONE_REGISTRY.list_keys():
56 | ele_model_cls, ele_local_create_fn, list_key_fn = BACKBONE_REGISTRY.get(backbone_type)
57 | if model_name in list_key_fn():
58 | model_cls = ele_model_cls
59 | local_create_fn = ele_local_create_fn
60 | if model_cls is None or local_create_fn is None:
61 | raise KeyError('The backbone model "{}" is not found! '
62 | 'Here are all available backbone models = {}'
63 | .format(model_name,
64 | list_backbone_names()))
65 | cfg, tokenizer, local_params_path, *others = local_create_fn(model_name=model_name, root=root,
66 | **kwargs)
67 | return model_cls, cfg, tokenizer, local_params_path, others
68 |
--------------------------------------------------------------------------------
/src/gluonnlp/models/model_zoo_checksums/albert.txt:
--------------------------------------------------------------------------------
1 | google_albert_base_v2/model-125be477.params 125be477d1cecc6843245eafe46ca1dc5961ffb5 46736016
2 | google_albert_base_v2/model-8767fdc9.yml 8767fdc9e1190606dc9aa17725438b4ae33704c4 436
3 | google_albert_base_v2/model_mlm-fe20650e.params fe20650e289fcd1a36c09d39e1d5cf5ffa64ba32 47251372
4 | google_albert_base_v2/spm-65999e5d.model 65999e5d811d9dc77a93bd712c8cb28e3addd852 760289
5 | google_albert_base_v2/vocab-2ee53ae7.json 2ee53ae76a9d8f478e67abc28a4cb9ec7444f090 372576
6 | google_albert_large_v2/model-ad60bcd5.params ad60bcd55cbba463c6e85062769fce846dd9fcf0 70737552
7 | google_albert_large_v2/model-e2e9b974.yml e2e9b9748ffe2b147cd92cbc8edba129ed9e98c1 388
8 | google_albert_large_v2/model_mlm-6a5015ee.params 6a5015ee845f874c1201b5a954275a489e0ed10c 71383980
9 | google_albert_large_v2/spm-65999e5d.model 65999e5d811d9dc77a93bd712c8cb28e3addd852 760289
10 | google_albert_large_v2/vocab-2ee53ae7.json 2ee53ae76a9d8f478e67abc28a4cb9ec7444f090 372576
11 | google_albert_xlarge_v2/model-4149c9e2.params 4149c9e2793dbd9352d27ab11d67f84b0763f4b2 234901136
12 | google_albert_xlarge_v2/model-8123bffd.yml 8123bffda684857ddac48ebeaaa18aba0e1503fb 437
13 | google_albert_xlarge_v2/model_mlm-ee184d38.params ee184d389424bab1adf17cc1feb86c69ba0791ff 236071852
14 | google_albert_xlarge_v2/spm-65999e5d.model 65999e5d811d9dc77a93bd712c8cb28e3addd852 760289
15 | google_albert_xlarge_v2/vocab-2ee53ae7.json 2ee53ae76a9d8f478e67abc28a4cb9ec7444f090 372576
16 | google_albert_xxlarge_v2/model-5601a0ed.params 5601a0edddb11d324aecccca7f496ef09013481e 890384016
17 | google_albert_xxlarge_v2/model-07fbeebc.yml 07fbeebcdee60e2362040807d56c572ae7dd7f03 438
18 | google_albert_xxlarge_v2/model_mlm-d2e2b06f.params d2e2b06f68668cab9c37dd60dca82f00e2e248ab 892603308
19 | google_albert_xxlarge_v2/spm-65999e5d.model 65999e5d811d9dc77a93bd712c8cb28e3addd852 760289
20 | google_albert_xxlarge_v2/vocab-2ee53ae7.json 2ee53ae76a9d8f478e67abc28a4cb9ec7444f090 372576
21 |
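Each line of these checksum files describes one model-zoo artifact: its path relative to the model zoo root, its SHA-1 hash, and its size in bytes.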
--------------------------------------------------------------------------------
/src/gluonnlp/models/model_zoo_checksums/bart.txt:
--------------------------------------------------------------------------------
1 | fairseq_bart_base/model-8f4929b5.params 8f4929b54f2f77619885cea9f3bd7dba51a27f38 560560748
2 | fairseq_bart_base/gpt2-396d4d8e.merges 396d4d8ec90cb02f4d56e049e0e4add868bcd943 456318
3 | fairseq_bart_base/model-251bf089.yml 251bf08944d18cc29b59a4a854bdbccf601dabb5 754
4 | fairseq_bart_base/gpt2-f4dedacb.vocab f4dedacb076b1df441c9c7398ed9acd3c19865f3 575079
5 | fairseq_bart_large/model-862277b1.params 862277b1489ed95140cb63279fbd0098ef2dea90 1625180962
6 | fairseq_bart_large/gpt2-396d4d8e.merges 396d4d8ec90cb02f4d56e049e0e4add868bcd943 456318
7 | fairseq_bart_large/model-a2932dea.yml a2932deaf9737d95891755841fae3e388f3d698a 746
8 | fairseq_bart_large/gpt2-f1335494.vocab f1335494f47917829e3b1d08e579ff2c3fe4fd60 558231
9 |
--------------------------------------------------------------------------------
/src/gluonnlp/models/model_zoo_checksums/electra.txt:
--------------------------------------------------------------------------------
1 | google_electra_small/vocab-e6d2b21d.json e6d2b21d910ccb356aa18f27a1c7d70660edc058 323235
2 | google_electra_small/model-2654c8b4.params 2654c8b4e240a5713078d2bd79582285c3f1b333 53945262
3 | google_electra_small/gen_model-0c30d1c5.params 0c30d1c5678154937dee1d11bef8db6f43d4d767 54202512
4 | google_electra_small/model-9ffb21c8.yml 9ffb21c8885bdb3e5f62c3f7a670d406167ec10c 472
5 | google_electra_small/disc_model-137714b6.params 137714b6c7f327e642861a7380dd94c8b3dbf1ea 54211975
6 | google_electra_base/vocab-e6d2b21d.json e6d2b21d910ccb356aa18f27a1c7d70660edc058 323235
7 | google_electra_base/model-31c235cc.params 31c235cc6da6f1872adffb31efe9318600b89ae5 435579680
8 | google_electra_base/gen_model-253a62c9.params 253a62c9aa9de24d85e09a9ae62ef88501e53dff 134978192
9 | google_electra_base/model-5b35ca0b.yml 5b35ca0b7f117978e372cfd8d98970d2d726e6c0 477
10 | google_electra_base/disc_model-514bd353.params 514bd353f9d42bc907bfa7e1175f4013b0147d7e 437947611
11 | google_electra_large/vocab-e6d2b21d.json e6d2b21d910ccb356aa18f27a1c7d70660edc058 323235
12 | google_electra_large/model-9baf9ff5.params 9baf9ff55cee0195b7754aee7fcb3a1019c99f45 1336395080
13 | google_electra_large/gen_model-82c1b17b.params 82c1b17b4b5ac19700c272858b0b211437f72855 205211944
14 | google_electra_large/model-31b7dfdd.yml 31b7dfdd343bd2b2e43e200a735c83b0af1963f1 476
15 | google_electra_large/disc_model-5b820c02.params 5b820c026aa2ad779c1e9a41ff4ff1408fefacbf 1340602227
16 | gluon_electra_small_owt/vocab-e6d2b21d.json e6d2b21d910ccb356aa18f27a1c7d70660edc058 323235
17 | gluon_electra_small_owt/model-e9636891.params e9636891daae9f2940b2b3210cca3c34c3d8f21e 53748654
18 | gluon_electra_small_owt/model-6e276d98.yml 6e276d98360fbb7c379d28bac34a3ca2918a90ab 473
19 | gluon_electra_small_owt/gen_model-45a6fb67.params 45a6fb67e1e6cb65d22b80498f2152ce9780d579 33926624
20 | gluon_electra_small_owt/disc_model-87836017.params 878360174ac71c3fdc7071be7835bea532c09b8d 54015367
21 |
--------------------------------------------------------------------------------
/src/gluonnlp/models/model_zoo_checksums/gpt2.txt:
--------------------------------------------------------------------------------
1 | gpt2_124M/model_lm-99b90604.params 99b9060488b4542ccd045c28401da10a3158ca80 497771820
2 | gpt2_124M/gpt2-396d4d8e.merges 396d4d8ec90cb02f4d56e049e0e4add868bcd943 456318
3 | gpt2_124M/gpt2-9dc62091.vocab 9dc620913410d5ec1a988abf852891e1c9f0f649 558055
4 | gpt2_124M/model-bfed311d.params bfed311d5c980ba475f90ccf7f536d25c3b40386 497769466
5 | gpt2_355M/model_lm-eed0e964.params eed0e964f4222823a557acfee2c106f228ce0188 1419317644
6 | gpt2_355M/gpt2-396d4d8e.merges 396d4d8ec90cb02f4d56e049e0e4add868bcd943 456318
7 | gpt2_355M/gpt2-9dc62091.vocab 9dc620913410d5ec1a988abf852891e1c9f0f649 558055
8 | gpt2_355M/model-81dee612.params 81dee612413733899f6e5fbbeac91da781805e1b 1419312986
9 | gpt2_774M/model_lm-cfbfa641.params cfbfa6419aaf1eae480fba5a1a7c8ea6096d43d6 3096157676
10 | gpt2_774M/gpt2-396d4d8e.merges 396d4d8ec90cb02f4d56e049e0e4add868bcd943 456318
11 | gpt2_774M/gpt2-9dc62091.vocab 9dc620913410d5ec1a988abf852891e1c9f0f649 558055
12 | gpt2_774M/model-9917e24e.params 9917e24e89c651793adea69042d6cceddfc7973c 3096150714
13 | gpt2_1558M/model_lm-c8489dcb.params c8489dcbdb0d39bc3eac6d1d62e0e3dace9faa8f 6230494540
14 | gpt2_1558M/gpt2-396d4d8e.merges 396d4d8ec90cb02f4d56e049e0e4add868bcd943 456318
15 | gpt2_1558M/gpt2-9dc62091.vocab 9dc620913410d5ec1a988abf852891e1c9f0f649 558055
16 | gpt2_1558M/model-af3dd713.params af3dd71313b55b4be5f52bdd538c9db054c1e190 6230485274
17 |
--------------------------------------------------------------------------------
/src/gluonnlp/models/model_zoo_checksums/mobilebert.txt:
--------------------------------------------------------------------------------
1 | google_uncased_mobilebert/model-1c33216b.yml 1c33216b256a76713e0906b7ceefb3b37d4d35a0 510
2 | google_uncased_mobilebert/vocab-e6d2b21d.json e6d2b21d910ccb356aa18f27a1c7d70660edc058 323235
3 | google_uncased_mobilebert/model-c8346cf2.params c8346cf2caf9cc422f081f03b50bc69945328894 98424130
4 | google_uncased_mobilebert/model_mlm-53948e82.params 53948e82d8ec091927af357387b36ade0e42b34c 146503986
5 |
--------------------------------------------------------------------------------
/src/gluonnlp/models/model_zoo_checksums/mt5.txt:
--------------------------------------------------------------------------------
1 | google_mt5_small/mt5-2730df74.vocab 2730df74056f29388cc4c8c912af6e97ac54bab2 4309802
2 | google_mt5_small/model-23352279.yml 23352279d13971a536847aebe31b34c4a0b80dd8 242
3 | google_mt5_small/model-b20e24d7.params b20e24d75d097e9eea647f4b9a0dc53b956a9d1a 688633650
4 | google_mt5_base/mt5-2730df74.vocab 2730df74056f29388cc4c8c912af6e97ac54bab2 4309802
5 | google_mt5_base/model-da71d108.yml da71d1084d75af5648e1b9247fecfa74e0361da0 244
6 | google_mt5_base/model-91eaa894.params 91eaa89444e062e2fc3953b1184e15ccf5375385 1561555474
7 | google_mt5_large/mt5-2730df74.vocab 2730df74056f29388cc4c8c912af6e97ac54bab2 4309802
8 | google_mt5_large/model-1226608e.yml 1226608ec2c53cc6dcf2303a8f1b19c59f43cbfe 245
9 | google_mt5_large/model-6b46e841.params 6b46e841e9b1b4c8ad97b071b316f9c52c2731e6 3894572546
10 | google_mt5_xl/mt5-2730df74.vocab 2730df74056f29388cc4c8c912af6e97ac54bab2 4309802
11 | google_mt5_xl/model-089b83a2.yml 089b83a2c893bd901fe26180f2fbfd2f52804ae0 245
12 | google_mt5_xl/model-7655ea81.params 7655ea81d4b7c9787dd1bfa902e96cdf9e124e3d 12922784462
13 | google_mt5_xxl/mt5-2730df74.vocab 2730df74056f29388cc4c8c912af6e97ac54bab2 4309802
14 | google_mt5_xxl/model-65e24812.yml 65e248120fbdcbaced58fb6f6c21f8143f9e97be 246
15 | google_mt5_xxl/model-2e9e44b9.params 2e9e44b9fc10d8a4c7133fa5e67ecadedfbfb692 47588620878
16 |
--------------------------------------------------------------------------------
/src/gluonnlp/models/model_zoo_checksums/roberta.txt:
--------------------------------------------------------------------------------
1 | fairseq_roberta_base/model-565d1db7.yml 565d1db71b0452fa2c28f155b8e9d90754f4f40a 401
2 | fairseq_roberta_base/gpt2-396d4d8e.merges 396d4d8ec90cb02f4d56e049e0e4add868bcd943 456318
3 | fairseq_roberta_base/gpt2-f1335494.vocab f1335494f47917829e3b1d08e579ff2c3fe4fd60 558231
4 | fairseq_roberta_base/model-09a1520a.params 09a1520adf652468c07e43a6ed28908418fa58a7 496222787
5 | fairseq_roberta_base/model_mlm-29889e2b.params 29889e2b4ef20676fda117bb7b754e1693d0df25 498794868
6 | fairseq_roberta_large/model-6b043b91.params 6b043b91a6a781a12ea643d0644d32300db38ec8 1417251819
7 | fairseq_roberta_large/gpt2-396d4d8e.merges 396d4d8ec90cb02f4d56e049e0e4add868bcd943 456318
8 | fairseq_roberta_large/model-6e66dc4a.yml 6e66dc4a450560a93aaf3d0ba9e0d447495d778a 402
9 | fairseq_roberta_large/gpt2-f1335494.vocab f1335494f47917829e3b1d08e579ff2c3fe4fd60 558231
10 | fairseq_roberta_large/model_mlm-119f38e1.params 119f38e1249bd28bea7dd2e90c09b8f4b879fa19 1421664140
11 |
--------------------------------------------------------------------------------
/src/gluonnlp/models/model_zoo_checksums/t5.txt:
--------------------------------------------------------------------------------
1 | google_t5_small/t5-5f05e7c5.vocab 5f05e7c57adf916bdba74912b0b37dea5c585988 791656
2 | google_t5_small/model-3cc6e5f7.yml 3cc6e5f7c6ccc3e2ac174d899b1aed74d7de65e0 235
3 | google_t5_small/model-e34f6fbd.params e34f6fbda666c02f0ffd5e15fec02056d3e3014d 242141346
4 | google_t5_base/t5-5f05e7c5.vocab 5f05e7c57adf916bdba74912b0b37dea5c585988 791656
5 | google_t5_base/model-ca5cc26c.yml ca5cc26c9dfe31295c97ef536b3f6f954ef1a447 237
6 | google_t5_base/model-e1956ac9.params e1956ac9670263b6803672bd0d7579f71d7494c6 891901274
7 | google_t5_large/t5-5f05e7c5.vocab 5f05e7c57adf916bdba74912b0b37dea5c585988 791656
8 | google_t5_large/model-01c5d9ae.yml 01c5d9ae5476b18c3516ebbe3a505b966982027d 238
9 | google_t5_large/model-bf5fc813.params bf5fc8138a04aa5f3bc495cacb010c873e59e909 2951363690
10 | google_t5_3B/t5-5f05e7c5.vocab 5f05e7c57adf916bdba74912b0b37dea5c585988 791656
11 | google_t5_3B/model-791f2e90.yml 791f2e90057fcccfa83bf8130034196d3550fb77 240
12 | google_t5_3B/model-48ba7250.params 48ba72501239c8d2d355282eebdebd0935556780 11407098198
13 | google_t5_11B/t5-5f05e7c5.vocab 5f05e7c57adf916bdba74912b0b37dea5c585988 791656
14 | google_t5_11B/model-2e50d93e.yml 2e50d93effc258aa75af162e9598be60ae13a83e 241
15 | google_t5_11B/model-1936031c.params 1936031c6db581ae866f41ec6d3c1c6de2049823 45229995126
16 |
--------------------------------------------------------------------------------
/src/gluonnlp/models/model_zoo_checksums/xlmr.txt:
--------------------------------------------------------------------------------
1 | fairseq_xlmr_base/model-3fa134e9.params 3fa134e9a13e2329ffa7b8d39612695ed8397c9d 1109814851
2 | fairseq_xlmr_base/model-b893d178.yml b893d178fa859fb6c708a08fc970b9980e047825 402
3 | fairseq_xlmr_base/model_mlm-86e37954.params 86e379542a6430cd988ff4b6a25966949afc241a 1113185880
4 | fairseq_xlmr_base/sentencepiece-18e17bae.model 18e17bae37be115135d4cf4ad9dfcc4f3b12cb80 5069075
5 | fairseq_xlmr_large/model-b62b074c.params b62b074cdd41e682075e2407f842be6578696b26 2235374571
6 | fairseq_xlmr_large/model-01fc59fb.yml 01fc59fb3a805f09d2aa11369d5b57e0be931fdd 403
7 | fairseq_xlmr_large/model_mlm-887506c2.params 887506c20bda452cf13ef04390eaa57a55602a92 2240585840
8 | fairseq_xlmr_large/sentencepiece-18e17bae.model 18e17bae37be115135d4cf4ad9dfcc4f3b12cb80 5069075
9 |
--------------------------------------------------------------------------------
/src/gluonnlp/third_party/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/src/gluonnlp/third_party/__init__.py
--------------------------------------------------------------------------------
/src/gluonnlp/torch/__init__.py:
--------------------------------------------------------------------------------
1 | from . import attention_cell
2 | from . import data
3 | from . import layers
4 | from . import optimizers
5 | from . import models
6 | from . import utils
7 |
--------------------------------------------------------------------------------
/src/gluonnlp/torch/clib/amp_C_frontend.cpp:
--------------------------------------------------------------------------------
1 | #include <torch/extension.h>
2 |
3 | void multi_tensor_lans_cuda(
4 | int chunk_size,
5 | at::Tensor noop_flag,
6 | std::vector<std::vector<at::Tensor>> tensor_lists,
7 | const float lr,
8 | const float beta1,
9 | const float beta2,
10 | const float epsilon,
11 | const int step,
12 | const int bias_correction,
13 | const float weight_decay,
14 | const int grad_averaging,
15 | const int mode,
16 | const bool normalize_grad);
17 |
18 |
19 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
20 | m.def("multi_tensor_lans", &multi_tensor_lans_cuda,
21 | "Computes and apply update for LANS optimizer");
22 | }
23 |
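The frontend above only declares `multi_tensor_lans_cuda` and binds it via pybind11; the kernel itself lives in a separate CUDA source compiled together with this file. A rough sketch of how such an extension can be JIT-built with PyTorch (the `.cu` file name and paths here are hypothetical, not taken from the repository):

```python
# Hypothetical JIT build of the extension declared above.
from torch.utils.cpp_extension import load

amp_C = load(
    name='amp_C',
    sources=['amp_C_frontend.cpp', 'multi_tensor_lans.cu'],  # .cu name is assumed
    extra_cuda_cflags=['-O3'],
)
# The bound function then matches the declaration above:
# amp_C.multi_tensor_lans(chunk_size, noop_flag, tensor_lists, lr, beta1, beta2,
#                         epsilon, step, bias_correction, weight_decay,
#                         grad_averaging, mode, normalize_grad)
```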
--------------------------------------------------------------------------------
/src/gluonnlp/torch/clib/compat.h:
--------------------------------------------------------------------------------
1 | #ifndef TORCH_CHECK
2 | #define TORCH_CHECK AT_CHECK
3 | #endif
4 | // DATA_PTR maps to the modern Tensor::data_ptr accessor
5 | #define DATA_PTR data_ptr
6 |
--------------------------------------------------------------------------------
/src/gluonnlp/torch/data/__init__.py:
--------------------------------------------------------------------------------
1 | from . import batchify
2 |
--------------------------------------------------------------------------------
/src/gluonnlp/torch/models/__init__.py:
--------------------------------------------------------------------------------
1 | from . import transformer
2 | from . import bert
3 |
--------------------------------------------------------------------------------
/src/gluonnlp/torch/optimizers/__init__.py:
--------------------------------------------------------------------------------
1 | from . import schedules
2 | from . import fused_lans
3 |
4 | from .fused_lans import FusedLANS
5 |
--------------------------------------------------------------------------------
/src/gluonnlp/torch/optimizers/schedules.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020, Amazon. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Optimization for BERT model."""
15 |
16 | from torch.optim.lr_scheduler import LambdaLR
17 |
18 | __all__ = ['get_warmup_linear_const_decay_poly_schedule']
19 |
20 |
21 | def get_warmup_linear_const_decay_poly_schedule(optimizer, total_steps, warmup_ratio=0.002,
22 | const_ratio=0., degree=1.0, last_epoch=-1):
23 | """Create a schedule with a learning rate that decreases linearly from the
24 | initial lr set in the optimizer to 0, after a warmup period during which it
25 | increases linearly from 0 to the initial lr set in the optimizer and a
26 | constant period.
27 |
28 | Args:
29 | optimizer (:class:`~torch.optim.Optimizer`):
30 | The optimizer for which to schedule the learning rate.
31 | total_steps (:obj:`int`):
32 | The total number of training steps.
33 | warmup_ratio (:obj:`float`):
34 | The fraction of total steps used for the linear warmup phase.
35 | const_ratio (:obj:`float`):
36 | The fraction of total steps, after warmup, during which the learning rate stays constant; the remaining steps decay polynomially with exponent ``degree``.
37 | last_epoch (:obj:`int`, `optional`, defaults to -1):
38 | The index of the last epoch when resuming training.
39 |
40 | Return:
41 | :obj:`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
42 |
43 | """
44 | def lr_lambda(global_step: int):
45 | x = global_step / total_steps
46 | if warmup_ratio == 0.0:
47 | return 1.0
48 | elif x < warmup_ratio:
49 | return x / warmup_ratio
50 | elif x < warmup_ratio + const_ratio:
51 | return 1.0
52 | return ((1.0 - x) / (1.0 - warmup_ratio - const_ratio))**degree
53 |
54 | return LambdaLR(optimizer, lr_lambda, last_epoch)
55 |
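The multiplier returned by `lr_lambda` is easiest to see numerically: with `warmup_ratio=0.1` and `const_ratio=0.2`, the rate rises linearly to the base lr over the first 10% of steps, stays there through 30%, then decays to 0 with the given polynomial degree. A minimal sketch, assuming a working gluonnlp.torch and PyTorch installation:

```python
# Minimal sketch exercising the schedule above; the numbers in the
# comments follow directly from lr_lambda.
import torch
from gluonnlp.torch.optimizers.schedules import get_warmup_linear_const_decay_poly_schedule

param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.SGD([param], lr=0.1)
scheduler = get_warmup_linear_const_decay_poly_schedule(
    optimizer, total_steps=1000, warmup_ratio=0.1, const_ratio=0.2)

# step 50  (x=0.05): multiplier 0.5 -> lr 0.05  (mid-warmup)
# step 200 (x=0.20): multiplier 1.0 -> lr 0.10  (constant phase)
# step 650 (x=0.65): multiplier 0.5 -> lr 0.05  (linear decay, degree=1)
for _ in range(1000):
    optimizer.step()
    scheduler.step()
```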
--------------------------------------------------------------------------------
/src/gluonnlp/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from . import config
2 | from . import shm
3 | from . import lazy_imports
4 | from . import preprocessing
5 | from . import registry
6 | from . import testing
7 | from .parameter import *
8 | from .misc import *
9 |
--------------------------------------------------------------------------------
/src/gluonnlp/utils/config.py:
--------------------------------------------------------------------------------
1 | import yacs.config
2 |
3 |
4 | class CfgNode(yacs.config.CfgNode):
5 | def clone_merge(self, cfg_filename_or_other_cfg):
6 | """Create a new cfg by cloning and merging with the given cfg
7 |
8 | Parameters
9 | ----------
10 | cfg_filename_or_other_cfg
11 | The path to a yaml config file, another CfgNode instance, or None.
12 | Returns
13 | -------
14 | The merged configuration; the original config is left unchanged.
15 | """
16 | ret = self.clone()
17 | if isinstance(cfg_filename_or_other_cfg, str):
18 | ret.merge_from_file(cfg_filename_or_other_cfg)
19 | return ret
20 | elif isinstance(cfg_filename_or_other_cfg, CfgNode):
21 | ret.merge_from_other_cfg(cfg_filename_or_other_cfg)
22 | return ret
23 | elif cfg_filename_or_other_cfg is None:
24 | return ret
25 | else:
26 | raise TypeError('Type of config path is not supported!')
27 |
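Since `clone_merge` never mutates `self`, it is convenient for deriving model-specific configs from a shared base. A minimal sketch, assuming only yacs and the class above:

```python
# Minimal sketch of clone_merge with an in-memory override config.
from gluonnlp.utils.config import CfgNode

base = CfgNode()
base.MODEL = CfgNode()
base.MODEL.num_layers = 12

override = CfgNode()
override.MODEL = CfgNode()
override.MODEL.num_layers = 24

merged = base.clone_merge(override)      # merge another CfgNode
assert base.MODEL.num_layers == 12       # the base config is untouched
assert merged.MODEL.num_layers == 24
cloned = base.clone_merge(None)          # None simply returns a clone
```

Passing a string instead of a CfgNode merges from a yaml file on disk, and any other type raises a TypeError.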
--------------------------------------------------------------------------------
/src/gluonnlp/utils/shm.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import mmap
3 |
4 | if pickle.HIGHEST_PROTOCOL < 5:
5 | del pickle
6 | import pickle5 as pickle
7 |
8 |
9 | def serialize(path, tbl):
10 | """Serialize tbl with out-of-band data to path for zero-copy shared memory usage.
11 |
12 | If the object to be serialized, or the objects it uses for data storage
13 | (such as numpy arrays), implements pickle protocol version 5 and exposes
14 | pickle.PickleBuffer objects via __reduce_ex__, then this function stores
15 | these buffers out-of-band as files in `path` so that they can subsequently
16 | be re-used for zero-copy sharing across processes.
17 |
18 | Parameters
19 | ----------
20 | path : pathlib.Path
21 | Empty folder used to save serialized data. Usually a folder /dev/shm
22 | tbl : object
23 | Object to serialize. For example a PyArrow Table, a Pandas Dataframe or
24 | any type that relies on NumPy to store the binary data.
25 |
26 | """
27 | idx = 0
28 |
29 | def buffer_callback(buf):
30 | nonlocal idx
31 | with open(path / f'{idx}.bin', 'wb') as f:
32 | f.write(buf)
33 | idx += 1
34 |
35 | with open(path / 'meta.pkl', 'wb') as f:
36 | pickle.dump(tbl, f, protocol=5, buffer_callback=buffer_callback)
37 |
38 |
39 | def load(path):
40 | """Load serialized object with out-of-band data from path based on zero-copy shared memory.
41 |
42 | Parameters
43 | ----------
44 | path : pathlib.Path
45 | Folder used to save serialized data with serialize(). Usually a folder /dev/shm
46 |
47 | """
48 | num_buffers = len(list(path.iterdir())) - 1 # exclude meta.pkl
49 | buffers = []
50 | for idx in range(num_buffers):
51 | f = open(path / f'{idx}.bin', 'rb')
52 | buffers.append(mmap.mmap(f.fileno(), 0, prot=mmap.PROT_READ))
53 | with open(path / 'meta.pkl', 'rb') as f:
54 | return pickle.load(f, buffers=buffers)
55 |
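A round-trip sketch of `serialize`/`load`, assuming numpy (whose arrays emit out-of-band `pickle.PickleBuffer`s under protocol 5) and a POSIX system for `mmap.PROT_READ`; in real use `path` would point into `/dev/shm`:

```python
# Round-trip sketch: out-of-band serialization and zero-copy reload.
import pathlib
import tempfile
import numpy as np
from gluonnlp.utils.shm import serialize, load

with tempfile.TemporaryDirectory() as tmp:   # use a folder under /dev/shm in practice
    path = pathlib.Path(tmp)
    tbl = {'embeddings': np.arange(12, dtype=np.float32).reshape(3, 4)}
    serialize(path, tbl)                     # writes meta.pkl plus 0.bin, 1.bin, ...
    restored = load(path)                    # buffers come back mmap-backed
    np.testing.assert_array_equal(restored['embeddings'], tbl['embeddings'])
```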
--------------------------------------------------------------------------------
/src/gluonnlp/utils/tvm_utils.py:
--------------------------------------------------------------------------------
1 | __all__ = ['get_ec2_tvm_flags', 'update_tvm_convert_map']
2 |
3 | import tvm.relay.op as _op
4 | import tvm.relay.expr as _expr
5 | from typing import Dict
6 | from tvm.relay.frontend.mxnet import _convert_map
7 | from tvm.relay.frontend.common import infer_type as _infer_type
8 |
9 | def get_ec2_tvm_flags() -> Dict[str, Dict]:
10 | r"""Return the recommended flags for TVM compilation in AWS EC2 instances.
11 |
12 | Including C4, C5, G4, P3.
13 |
14 | For more details about AWS EC2 instances, refer to https://aws.amazon.com/ec2/instance-types/.
15 |
16 | Returns
17 | -------
18 | info_dict
19 | A dictionary that contains the mapping between instance type and the
20 | corresponding compilation flags.
21 | Each element includes:
22 |
23 | - target
24 | The compilation target
25 | - use_gpu
26 | Whether it's a GPU instance
27 | - opt_level
28 | The optimization level in compilation
29 | - required_pass
30 | Additional graph passes for further improvement.
31 | """
32 | instance_info = {
33 | 'g4': {'target': "cuda -model=t4 -libs=cublas,cudnn",
34 | 'use_gpu': True,
35 | 'opt_level': 3,
36 | 'required_pass': ["FastMath"]},
37 | 'c4': {'target': 'llvm -mcpu=core-avx2 -libs=cblas',
38 | 'use_gpu': False,
39 | 'opt_level': 3,
40 | 'required_pass': ["FastMath"]},
41 | 'c5': {'target': 'llvm -mcpu=skylake-avx512 -libs=cblas',
42 | 'use_gpu': False,
43 | 'opt_level': 3,
44 | 'required_pass': ["FastMath"]},
45 | 'p3': {'target': 'cuda -model=v100 -libs=cublas,cudnn',
46 | 'use_gpu': True,
47 | 'opt_level': 3,
48 | 'required_pass': ["FastMath"]}
49 | }
50 | return instance_info
51 |
52 |
53 | def update_tvm_convert_map() -> None:
54 | """A Monkey Patch to update convert map in tvm/relay/frontend/mxnet.py"""
55 | op = (('masked_softmax', _mx_masked_softmax),)
56 | _convert_map.update({key: value for key, value in op})
57 |
58 |
59 | def _mx_masked_softmax(inputs, attrs):
60 | assert len(inputs) == 1 or len(inputs) == 2
61 | axis = attrs.get_int("axis")
62 | temperature = attrs.get_float("temperature")
63 | if len(inputs) == 1:
64 | result = _op.nn.softmax(inputs[0] / _expr.const(temperature), axis=axis)
65 | else:
66 | neg = -1e18
67 | att_score, mask = inputs
68 | att_score_dtype = _infer_type(att_score).checked_type.dtype
69 | if att_score_dtype == "float16":
70 | neg = -1e4
71 | temp = _op.where(mask,
72 | att_score,
73 | _expr.const(neg))
74 | result = _op.multiply(_op.nn.softmax(temp / _expr.const(temperature), axis=axis), mask.astype("float32"))
75 | return result
76 |
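The returned dictionary is keyed by instance family, so picking flags is a plain lookup. A small sketch (importing this module requires a working TVM installation, since tvm is imported at module load time):

```python
# Sketch: look up the recommended TVM compilation flags for a C5 instance.
from gluonnlp.utils.tvm_utils import get_ec2_tvm_flags

flags = get_ec2_tvm_flags()['c5']
print(flags['target'])         # llvm -mcpu=skylake-avx512 -libs=cblas
print(flags['use_gpu'])        # False
print(flags['opt_level'])      # 3
print(flags['required_pass'])  # ['FastMath']
```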
--------------------------------------------------------------------------------
/tests/README.md:
--------------------------------------------------------------------------------
1 | # Unit Tests
2 |
3 | To run the unit tests, use the following command
4 |
5 | ```bash
6 | python3 -m pytest --forked --device="cpu" .
7 | ```
8 |
9 | To test a specific file, e.g., `test_models_transformer.py`, use the following command
10 |
11 | ```bash
12 | python3 -m pytest --forked --device="cpu" test_models_transformer.py
13 | ```
14 |
15 | To test only on the GPU, use the following command
16 |
17 | ```bash
18 | python3 -m pytest --forked --device="gpu" test_models_transformer.py
19 | ```
20 |
21 | To test on both the CPU and the GPU, use the following command
22 |
23 | ```bash
24 | python3 -m pytest --forked --device="cpu" --device="gpu" test_models_transformer.py
25 | ```
26 |
27 | In addition, to run all the tests, including those marked as slow, add the `--runslow` flag
28 |
29 | ```bash
30 | python3 -m pytest --forked --device="gpu" --runslow test_models.py
31 | ```
32 |
33 | Refer to the [official guide of pytest](https://docs.pytest.org/en/latest/) for more details.
34 |
35 | # Naming Convention
36 |
37 | The naming convention for the tests is `test_{module_name}.py`.
38 | For example, the test of [models/transformer.py](../src/gluonnlp/models/transformer.py) will be in
39 | `test_models_transformer.py`. The test of [models/__init__.py](../src/gluonnlp/models/__init__.py)
40 | is `test_models.py`.
41 |
42 | Also, we include the scheduled testing scripts for `nlp_process` in [process_cli](process_cli)
43 | and for `nlp_data` in [data_cli](data_cli).
44 |
45 |
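Following the naming convention and the markers used throughout this suite, a new test module might look like the sketch below (`test_example.py` and its contents are illustrative, not an existing file):

```python
# tests/test_example.py -- illustrative only
import pytest


def test_fast_path():
    """Always collected; runs in the default CPU test pass."""
    assert 1 + 1 == 2


@pytest.mark.slow             # collected only when --runslow is passed
@pytest.mark.remote_required  # needs network access, e.g. to download models
def test_slow_remote_path():
    assert True
```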
--------------------------------------------------------------------------------
/tests/data_cli/test_glue.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import tempfile
3 | import pandas as pd
4 | from gluonnlp.cli.data.general_nlp_benchmark import prepare_glue
5 |
6 |
7 | @pytest.mark.remote_required
8 | @pytest.mark.parametrize('task', ["cola", "sst", "mrpc", "qqp", "sts", "mnli",
9 | "snli", "qnli", "rte", "wnli", "diagnostic"])
10 | def test_glue(task):
11 | parser = prepare_glue.get_parser()
12 | with tempfile.TemporaryDirectory() as root:
13 | args = parser.parse_args(['--benchmark', 'glue',
14 | '--tasks', task,
15 | '--data_dir', root])
16 | prepare_glue.main(args)
17 |
18 |
19 | @pytest.mark.remote_required
20 | @pytest.mark.parametrize('task', ["cb", "copa", "multirc", "rte", "wic", "wsc", "boolq", "record",
21 | 'broadcoverage-diagnostic', 'winogender-diagnostic'])
22 | def test_superglue(task):
23 | parser = prepare_glue.get_parser()
24 | with tempfile.TemporaryDirectory() as root:
25 | args = parser.parse_args(['--benchmark', 'superglue',
26 | '--tasks', task,
27 | '--data_dir', root])
28 | prepare_glue.main(args)
29 |
--------------------------------------------------------------------------------
/tests/data_cli/test_wikipedia.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import tempfile
3 | from gluonnlp.cli.data.pretrain_corpus import prepare_wikipedia
4 |
5 |
6 | @pytest.mark.remote_required
7 | # Test for zh-classical (文言) + wuu (吴语), which are smaller compared with English
8 | @pytest.mark.parametrize('lang', ['zh-classical', 'wuu'])
9 | def test_download_format(lang):
10 | parser = prepare_wikipedia.get_parser()
11 | with tempfile.TemporaryDirectory() as root:
12 | download_args = parser.parse_args(['--mode', 'download+format',
13 | '--lang', lang,
14 | '--date', 'latest', '-o', root])
15 | prepare_wikipedia.main(download_args)
16 |
--------------------------------------------------------------------------------
/tests/process_cli/test_average_checkpoint.py:
--------------------------------------------------------------------------------
1 | import os
2 | from gluonnlp.cli import average_checkpoint
3 | from mxnet.gluon import nn
4 | from numpy.testing import assert_allclose
5 |
6 | _CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
7 |
8 | def test_avg_ckpt():
9 | try:
10 | average_checkpoint.cli_main()
11 | except:
12 | pass  # invoking the CLI without required arguments is expected to fail
13 | num_ckpts = 5
14 | model = nn.Dense(units=10, in_units=10)
15 | model.initialize()
16 | params = model.collect_params()
17 | gd_avg = {}
18 | for key in params.keys():
19 | gd_avg[key] = params[key].data().asnumpy()
20 | model.save_parameters(os.path.join(_CURR_DIR, 'update0.params'))
21 |
22 | for i in range(1, num_ckpts):
23 | model.initialize(force_reinit=True)
24 | params = model.collect_params()
25 | for key in gd_avg.keys():
26 | gd_avg[key] += params[key].data().asnumpy()
27 | model.save_parameters(os.path.join(_CURR_DIR, 'update{}.params'.format(i)))
28 |
29 | for key in gd_avg.keys():
30 | gd_avg[key] /= num_ckpts
31 |
32 | parser = average_checkpoint.get_parser()
33 | args = parser.parse_args(['--checkpoints', None,
34 | '--begin', '0',
35 | '--end', str(num_ckpts-1),
36 | '--save-path', os.path.join(_CURR_DIR, 'avg.params')])
37 | args.checkpoints = ['fake', 'ckpt']
38 | try:
39 | average_checkpoint.main(args)
40 | except:
41 | pass  # averaging nonexistent checkpoint paths is expected to fail
42 | args.checkpoints = [os.path.join(_CURR_DIR, 'update{}.params'.format(i)) \
43 | for i in range(0, num_ckpts)]
44 | average_checkpoint.main(args)
45 |
46 | model.load_parameters(os.path.join(_CURR_DIR, 'avg.params'))
47 | params = model.collect_params()
48 |
49 | for key in gd_avg.keys():
50 | assert_allclose(gd_avg[key], params[key].data().asnumpy(), 1E-7, 1E-7)
51 |
--------------------------------------------------------------------------------
/tests/test_data_filtering.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from gluonnlp.data.filtering import ProfanityFilter, MosesNormalizer, LanguageIdentifier
3 | import multiprocessing
4 |
5 |
6 | def test_profanity_filter():
7 | profanity_filter = ProfanityFilter('en')
8 | filter_word = 'anal'
9 | unfilter_word = 'analysis'
10 | for text in [' ' + filter_word, ' ' + filter_word + ' ',
11 | filter_word, filter_word + ' ' + unfilter_word]:
12 | assert profanity_filter.match(text) is True
13 | for text in [' ' + unfilter_word, unfilter_word, unfilter_word + ' ']:
14 | assert profanity_filter.match(text) is False
15 |
16 |
17 | def test_sentence_normalizer():
18 | normalizer = MosesNormalizer('en')
19 | assert normalizer(' hello world!!".\t\t\r') == ' hello world!!." '
20 | assert normalizer(
21 | b'We therefore defend, and will continue to defend wherever necessary, our position of \xe2\x80\x98no diversion\xe2\x80\x99.\n'.decode('utf-8')) == \
22 | "We therefore defend, and will continue to defend wherever necessary, our position of 'no diversion'. "
23 | normalizer = MosesNormalizer('en', remove_non_printable_char=False)
24 | assert normalizer(' hello world!!".\t\t\r') == ' hello world!!."\t\t'
25 | normalizer = MosesNormalizer('en', remove_non_printable_char=False, unicode_norm_form='NFKC')
26 | assert normalizer(' hello world!!"⁵.\t\t\r') == ' hello world!!"5.\t\t'
27 |
28 |
29 | @pytest.mark.parametrize('algo', ['fasttext', 'fasttext_compressed', 'langid'])
30 | def test_language_identifier(algo):
31 | lang_id_model = LanguageIdentifier(algo=algo)
32 | lang_label, score = lang_id_model('你好,世界')
33 | assert lang_label == 'zh'
34 | with multiprocessing.Pool(2) as pool:
35 | out = pool.map(lang_id_model, ['你好,世界', 'Hello World'])
36 | assert out[0][0] == 'zh'
37 | assert out[1][0] == 'en'
38 |
--------------------------------------------------------------------------------
/tests/test_embedding.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import collections
3 | import os
4 | import tempfile
5 | import pytest
6 | from gluonnlp.embedding import load_embeddings, get_fasttext_model
7 | from gluonnlp.data import Vocab
8 |
9 | def test_load_embeddings():
10 | text_data = ['hello', 'world', 'hello', 'nice', 'world', 'hi', 'world', 'sadgood']
11 | counter = collections.Counter(text_data)
12 | vocab1 = Vocab(counter)
13 | # load with vocab
14 | matrix1 = load_embeddings(vocab1)
15 | assert len(matrix1) == len(vocab1)
16 | # load without vocab
17 | matrix2, vocab2 = load_embeddings()
18 | assert len(matrix2) == len(vocab2)
19 | np.testing.assert_almost_equal(matrix1[vocab1["hello"]], matrix2[vocab2["hello"]])
20 |
21 | # test_unk_method
22 | def simple(words):
23 | return np.ones((len(words), 50))
24 | matrix3 = load_embeddings(vocab1, unk_method=simple)
25 | assert sum(matrix3[vocab1['sadgood']] == 1) == matrix3.shape[-1]
26 | np.testing.assert_almost_equal(matrix3[vocab1["hello"]], matrix2[vocab2["hello"]])
27 |
28 | # load txt
29 | with tempfile.TemporaryDirectory() as root:
30 | path = os.path.join(root, "tmp.txt")
31 | with open(path, "w") as f:
32 | f.write("{} {}\n".format(matrix1.shape[0], matrix1.shape[1]))
33 | for word, vec in zip(vocab1.all_tokens, matrix1):
34 | f.write(word + " ")
35 | f.write(" ".join([str(num) for num in vec.tolist()]))
36 | f.write("\n")
37 | matrix4 = load_embeddings(vocab1, path)
38 | np.testing.assert_almost_equal(matrix4, matrix1)
39 |
40 |
41 | @pytest.mark.slow
42 | @pytest.mark.remote_required
43 | def test_get_fasttext_model():
44 | text_data = ['hello', 'world', 'hello', 'nice', 'world', 'hi', 'world']
45 | counter = collections.Counter(text_data)
46 | vocab1 = Vocab(counter)
47 | matrix1 = load_embeddings(vocab1, 'wiki.en')
48 | ft = get_fasttext_model('wiki.en')
49 | np.testing.assert_almost_equal(matrix1[vocab1["hello"]], ft['hello'], decimal=4)
50 | with pytest.raises(ValueError):
51 | get_fasttext_model('wiki.multi.ar')
52 |
53 |
--------------------------------------------------------------------------------
/tests/test_gluon_block.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import mxnet as mx
3 | from mxnet import nd, np, npx
4 | from mxnet.test_utils import assert_allclose
5 | from mxnet.gluon import HybridBlock, Constant
6 | from mxnet.gluon.data import DataLoader
7 | import itertools
8 | mx.npx.set_np()
9 |
10 |
11 | def test_const():
12 | class Foo(HybridBlock):
13 | def __init__(self):
14 | super().__init__()
15 | self.weight = Constant(np.ones((10, 10)))
16 |
17 | def forward(self, x, weight):
18 | return x, weight.astype(np.float32)
19 |
20 | foo = Foo()
21 | foo.hybridize()
22 | foo.initialize()
23 |
24 |
25 | def test_scalar():
26 | class Foo(HybridBlock):
27 | def forward(self, x):
28 | return x * x * 2
29 |
30 | foo = Foo()
31 | foo.hybridize()
32 | foo.initialize()
33 | out = foo(mx.np.array(1.0))
34 | assert_allclose(out.asnumpy(), np.array(2.0))
35 |
36 |
37 | def test_gluon_nonzero_hybridize():
38 | class Foo(HybridBlock):
39 | def __init__(self):
40 | super().__init__()
41 |
42 | def forward(self, x):
43 | dat = npx.nonzero(x)
44 | return dat.sum() + dat
45 |
46 | foo = Foo()
47 | foo.hybridize()
48 | out = foo(mx.np.array([1, 0, 2, 0, 3, 0]))
49 | out.wait_to_read()
50 | out = foo(mx.np.array([0, 0, 0, 0, 0, 0]))
51 | out.wait_to_read()
52 |
53 |
54 | @pytest.mark.xfail(reason='Expected to fail due to MXNet bug https://github.com/apache/'
55 | 'incubator-mxnet/issues/19659')
56 | def test_gluon_boolean_mask():
57 | class Foo(HybridBlock):
58 | def forward(self, data, indices):
59 | mask = indices < 3
60 | data = npx.reshape(data, (-1, -2), reverse=True)
61 | mask = np.reshape(mask, (-1,))
62 | sel = nd.np._internal.boolean_mask(data, mask)
63 | return sel
64 | data = mx.np.random.normal(0, 1, (5, 5, 5, 5, 16))
65 | indices = mx.np.random.randint(0, 5, (5, 5, 5, 5))
66 | data.attach_grad()
67 | indices.attach_grad()
68 | foo = Foo()
69 | foo.hybridize()
70 | with mx.autograd.record():
71 | out = foo(data, indices)
72 | out.backward()
73 | out.wait_to_read()
74 |
75 |
76 | def test_basic_dataloader():
77 | def grouper(iterable, n, fillvalue=None):
78 | """Collect data into fixed-length chunks or blocks"""
79 | # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
80 | args = [iter(iterable)] * n
81 | return itertools.zip_longest(*args, fillvalue=fillvalue)
82 | ctx_l = [mx.cpu(i) for i in range(8)]
83 | dataset = [mx.np.ones((2,)) * i for i in range(1000)]
84 | dataloader = DataLoader(dataset, 2, num_workers=4, prefetch=10)
85 |
86 | for i, data_l in enumerate(grouper(dataloader, len(ctx_l))):
87 | for data, ctx in zip(data_l, ctx_l):
88 | if data is None:
89 | continue
90 | data = data.as_in_ctx(ctx)
91 | mx.npx.waitall()
92 |
--------------------------------------------------------------------------------
/tests/test_initializer.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from gluonnlp import initializer
3 | import mxnet as mx
4 | from mxnet.gluon import nn
5 | mx.npx.set_np()
6 |
7 |
8 | def test_truncnorm_string_alias_works():
9 | try:
10 | layer = nn.Dense(in_units=1, units=1, weight_initializer='truncnorm')
11 | layer.initialize()
12 | except RuntimeError:
13 | pytest.fail('Layer couldn\'t be initialized')
14 |
15 |
16 | def test_truncnorm_all_values_inside_boundaries():
17 | mean = 0
18 | std = 0.01
19 | layer = nn.Dense(in_units=1, units=1000)
20 | layer.initialize(init=initializer.TruncNorm(mean, std))
21 | assert (layer.weight.data() <= 2 * std).asnumpy().all()
22 | assert (layer.weight.data() >= -2 * std).asnumpy().all()
23 |
24 |
25 | def test_truncnorm_generates_values_with_defined_mean_and_std():
26 | from scipy import stats
27 |
28 | mean = 10
29 | std = 5
30 | layer = nn.Dense(in_units=1, units=100000)
31 | layer.initialize(init=initializer.TruncNorm(mean, std))
32 | samples = layer.weight.data().reshape((-1, )).asnumpy()
33 |
34 | p_value = stats.kstest(samples, 'truncnorm', args=(-2, 2, mean, std)).pvalue
35 | assert p_value > 0.0001
36 |
--------------------------------------------------------------------------------
/tests/test_loss.py:
--------------------------------------------------------------------------------
1 | import mxnet as mx
2 | import numpy as np
3 | import pytest
4 | from numpy.testing import assert_allclose
5 | import scipy.special as sspecial
6 | from gluonnlp.loss import LabelSmoothCrossEntropyLoss
7 | mx.npx.set_np()
8 |
9 |
10 | @pytest.mark.parametrize('label_shape', [(5, 3), (3,), (2, 3, 2)])
11 | @pytest.mark.parametrize('alpha', [0.0, 0.1])
12 | @pytest.mark.parametrize('from_logits', [True, False])
13 | @pytest.mark.parametrize('hybridize', [True, False])
14 | def test_label_smoothing(label_shape, alpha, from_logits, hybridize):
15 | def _np_label_smoothing(pred, labels, alpha, from_logits):
16 | flatten_pred = pred.reshape((-1, pred.shape[-1]))
17 | flatten_labels = labels.reshape((-1,))
18 | smoothed_labels = np.full_like(flatten_pred,
19 | fill_value=alpha / flatten_pred.shape[-1])
20 | smoothed_labels[np.arange(flatten_pred.shape[0]), flatten_labels]\
21 | = 1 - alpha + alpha / flatten_pred.shape[-1]
22 | if not from_logits:
23 | flatten_logits = np.log(sspecial.softmax(flatten_pred, axis=-1))
24 | else:
25 | flatten_logits = flatten_pred
26 | # Calculate cross-entropy
27 | loss = - (smoothed_labels * flatten_logits).sum(axis=-1)
28 | return loss.reshape(labels.shape)
29 | label_num = 5
30 | loss = LabelSmoothCrossEntropyLoss(num_labels=label_num, alpha=alpha, from_logits=from_logits)
31 | if hybridize:
32 | loss.hybridize()
33 | if from_logits:
34 | pred = mx.np.random.uniform(-10, -1, label_shape + (label_num,))
35 | else:
36 | pred = mx.np.random.normal(0, 1, label_shape + (label_num,))
37 | labels = mx.np.random.randint(0, label_num, label_shape)
38 | out = loss(pred, labels)
39 | np_out = _np_label_smoothing(pred.asnumpy(), labels.asnumpy(), alpha, from_logits)
40 | assert_allclose(np_out, out.asnumpy(), 1E-4, 1E-4)
41 |
42 |
--------------------------------------------------------------------------------
/tests/test_models_mt5.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import tempfile
3 |
4 | from gluonnlp.models.mt5 import (
5 | MT5Model, MT5Inference, mt5_cfg_reg, list_pretrained_mt5, get_pretrained_mt5
6 | )
7 |
8 | def test_list_pretrained_mt5():
9 | assert len(list_pretrained_mt5()) > 0
10 |
11 |
12 | @pytest.mark.parametrize('cfg_key', mt5_cfg_reg.list_keys())
13 | def test_mt5_model_and_inference(cfg_key, ctx):
14 | # since MT5Model, MT5Inference simply inherits the T5Model, T5Inference,
15 | # we just want to make sure the model can be properly loaded, and leave
16 | # the correctness tests to test_model_t5.py
17 | with ctx:
18 | cfg = MT5Model.get_cfg(cfg_key)
19 | if cfg_key != 'google_mt5_small':
20 | cfg.defrost()
21 | cfg.MODEL.vocab_size = 256
22 | cfg.MODEL.d_model = 128
23 | cfg.MODEL.d_ff = 512
24 | cfg.MODEL.num_layers = 2
25 | cfg.MODEL.num_heads = 4
26 | cfg.freeze()
27 | mt5_model = MT5Model.from_cfg(cfg)
28 | mt5_model.initialize()
29 | mt5_model.hybridize()
30 | if cfg_key == 'google_mt5_small':
31 | inference_model = MT5Inference(mt5_model)
32 | inference_model.hybridize()
33 |
34 |
35 | def test_mt5_get_pretrained(ctx):
36 | with tempfile.TemporaryDirectory() as root, ctx:
37 | cfg, tokenizer, backbone_params_path, _ = get_pretrained_mt5('google_mt5_small')
38 | # we exclude <extra_id>s in the comparison below by avoiding len(tokenizer.vocab)
39 | assert cfg.MODEL.vocab_size >= len(tokenizer._sp_model)
40 | mt5_model = MT5Model.from_cfg(cfg)
41 | mt5_model.load_parameters(backbone_params_path)
42 | mt5_model.hybridize()
43 | mt5_inference_model = MT5Inference(mt5_model)
44 | mt5_inference_model.hybridize()
45 |
--------------------------------------------------------------------------------
/tests/test_models_xlmr.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy as np
3 | import mxnet as mx
4 | import tempfile
5 | from gluonnlp.models.xlmr import XLMRModel, \
6 | list_pretrained_xlmr, get_pretrained_xlmr
7 | from gluonnlp.loss import LabelSmoothCrossEntropyLoss
8 |
9 | mx.npx.set_np()
10 |
11 |
12 | def test_list_pretrained_xlmr():
13 | assert len(list_pretrained_xlmr()) > 0
14 |
15 |
16 | # We choose to not test amp for XLMR because it's the same as RoBERTa.
17 | @pytest.mark.slow
18 | @pytest.mark.remote_required
19 | @pytest.mark.parametrize('model_name', list_pretrained_xlmr())
20 | def test_xlmr(model_name, ctx):
21 | # test from pretrained
22 | assert len(list_pretrained_xlmr()) > 0
23 | with ctx:
24 | with tempfile.TemporaryDirectory() as root:
25 | cfg, tokenizer, params_path, mlm_params_path =\
26 | get_pretrained_xlmr(model_name, load_backbone=True, load_mlm=False, root=root)
27 | assert cfg.MODEL.vocab_size == len(tokenizer.vocab)
28 | # test backbone
29 | xlmr_model = XLMRModel.from_cfg(cfg)
30 | xlmr_model.load_parameters(params_path)
31 | # the mlm model is skipped since load_mlm=False
32 |
33 | # test forward
34 | batch_size = 1
35 | seq_length = 4
36 | vocab_size = len(tokenizer.vocab)
37 | input_ids = mx.np.array(
38 | np.random.randint(
39 | 2,
40 | vocab_size,
41 | (batch_size, seq_length)
42 | ),
43 | dtype=np.int32
44 | )
45 | valid_length = mx.np.array(
46 | np.random.randint(
47 | seq_length // 2,
48 | seq_length,
49 | (batch_size,)
50 | ),
51 | dtype=np.int32
52 | )
53 | contextual_embeddings, pooled_out = xlmr_model(input_ids, valid_length)
54 | mx.npx.waitall()
55 | # test backward
56 | label_smooth_loss = LabelSmoothCrossEntropyLoss(num_labels=vocab_size)
57 | with mx.autograd.record():
58 | contextual_embeddings, pooled_out = xlmr_model(input_ids, valid_length)
59 | loss = label_smooth_loss(contextual_embeddings, input_ids)
60 | loss.backward()
61 | mx.npx.waitall()
62 |
--------------------------------------------------------------------------------
/tests/test_pytest.py:
--------------------------------------------------------------------------------
1 | import random
2 | import pytest
3 | import numpy as np
4 | import mxnet as mx
5 |
6 |
7 | @pytest.mark.seed(1)
8 | def test_test():
9 | """Test that fixing a random seed works."""
10 | py_rnd = random.randint(0, 100)
11 | np_rnd = np.random.randint(0, 100)
12 | mx_rnd = mx.nd.random_uniform(shape=(1, )).asscalar()
13 |
14 | random.seed(1)
15 | mx.random.seed(1)
16 | np.random.seed(1)
17 |
18 | assert py_rnd == random.randint(0, 100)
19 | assert np_rnd == np.random.randint(0, 100)
20 | assert mx_rnd == mx.nd.random_uniform(shape=(1, )).asscalar()
21 |
--------------------------------------------------------------------------------
/tests/test_utils_preprocessing.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy as np
3 | from numpy.testing import assert_allclose
4 | from gluonnlp.utils.preprocessing import get_trimmed_lengths, match_tokens_with_char_spans
5 |
6 |
7 | def test_get_trimmed_lengths():
8 | for lengths, do_merge, max_length, gt_trimmed_lengths in\
9 | [([10, 5, 4, 8], False, 6, [6, 5, 4, 6]),
10 | ([10, 5, 4, 8], True, 6, [2, 2, 1, 1]),
11 | ([20], False, 30, [20]),
12 | ([20], True, 30, [20]),
13 | ([15, 20], False, 30, [15, 20]),
14 | ([15, 20], True, 30, [15, 15])]:
15 | trimmed_lengths = get_trimmed_lengths(lengths,
16 | max_length=max_length,
17 | do_merge=do_merge)
18 | assert_allclose(trimmed_lengths, np.array(gt_trimmed_lengths))
19 |
20 |
21 | def test_match_tokens_with_char_spans():
22 | token_offsets = np.array([(0, 1), (1, 2), (3, 4), (5, 6)])
23 | spans = np.array([(0, 3), (4, 6)])
24 | out = match_tokens_with_char_spans(token_offsets, spans)
25 | assert_allclose(out, np.array([[0, 2],
26 | [2, 3]]))
27 |
28 | token_offsets = np.array([(5, 10), (10, 20), (20, 25), (26, 30)])
29 | spans = np.array([(0, 3), (4, 6), (10, 30),
30 | (22, 23), (15, 25),
31 | (10, 35), (36, 38)])
32 | out = match_tokens_with_char_spans(token_offsets, spans)
33 | assert_allclose(out, np.array([[0, 0],
34 | [0, 0],
35 | [1, 3],
36 | [2, 2],
37 | [1, 2],
38 | [1, 3],
39 | [3, 3]]))
40 |
--------------------------------------------------------------------------------
/tests/test_utils_registry.py:
--------------------------------------------------------------------------------
1 | from gluonnlp.utils.registry import Registry
2 |
3 |
4 | def test_registry():
5 | MODEL_REGISTRY = Registry('MODEL')
6 | @MODEL_REGISTRY.register()
7 | class MyModel:
8 | def __init__(self, a, b):
9 | self.a = a
10 | self.b = b
11 |
12 | @MODEL_REGISTRY.register()
13 | def my_model():
14 | return
15 |
16 | @MODEL_REGISTRY.register('test_class')
17 | class MyModelWithNickName:
18 | def __init__(self, a, b, c):
19 | self.a = a
20 | self.b = b
21 | self.c = c
22 |
23 | @MODEL_REGISTRY.register('test_function')
24 | def my_model_with_nick_name():
25 | return
26 |
27 | class MyModel2:
28 | pass
29 |
30 | MODEL_REGISTRY.register(MyModel2)
31 | MODEL_REGISTRY.register('my_model2', MyModel2)
32 | assert MODEL_REGISTRY.list_keys() ==\
33 | ['MyModel', 'my_model', 'test_class', 'test_function', 'MyModel2', 'my_model2']
34 | model = MODEL_REGISTRY.create('MyModel', 1, 2)
35 | assert model.a == 1 and model.b == 2
36 | model = MODEL_REGISTRY.create('MyModel', a=2, b=3)
37 | assert model.a == 2 and model.b == 3
38 | model = MODEL_REGISTRY.create_with_json('MyModel', '[4, 5]')
39 | assert model.a == 4 and model.b == 5
40 | model = MODEL_REGISTRY.create_with_json('test_class',
41 | '{"a": 100, "b": 200, "c": 300}')
42 | assert model.a == 100 and model.b == 200 and model.c == 300
43 | assert MODEL_REGISTRY.get('test_class') == MyModelWithNickName
44 |
45 |
46 |
--------------------------------------------------------------------------------
/tests/torch/test_layers_torch.py:
--------------------------------------------------------------------------------
1 | import torch as th
2 | import numpy as np
3 | from gluonnlp.torch.layers import SinusoidalPositionalEmbedding
4 |
5 |
6 | def test_sinusoidal_pos_embed():
7 | embed1 = SinusoidalPositionalEmbedding(128, learnable=False)
8 | embed2 = SinusoidalPositionalEmbedding(128, learnable=True)
9 | assert len([(name, param) for name, param in embed1.named_parameters()
10 | if param.requires_grad]) == 0
11 | assert len([(name, param) for name, param in embed2.named_parameters()
12 | if param.requires_grad]) == 1
13 | inputs = th.randint(0, 128, (8, 4))
14 | np.testing.assert_allclose(embed1(inputs).detach().cpu().numpy(),
15 | embed2(inputs).detach().cpu().numpy(), 1E-3, 1E-3)
16 |
--------------------------------------------------------------------------------
/tools/batch/backbone_benchmark/run_batch_backbone_benchmark.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -ex
4 | INSTANCE_TYPE=${1:-g4dn.2x}
5 | LOG_PATH=${2:-submit_backbone_benchmark.log}
6 | SUBMIT_SCRIPT_PATH=$(dirname "$0")/../../../tools/batch/submit-job.py
7 |
8 | python3 ${SUBMIT_SCRIPT_PATH} \
9 | --region us-east-1 \
10 | --source-ref fix_benchmark3 \
11 | --job-type ${INSTANCE_TYPE} \
12 | --save-path temp \
13 | --name test_backbone_benchmark_${INSTANCE_TYPE} \
14 | --work-dir scripts/benchmarks \
15 | --remote https://github.com/sxjscience/gluon-nlp/ \
16 | --command "bash run_backbone_benchmark.sh 2>&1 | tee stdout.log" \
17 | | perl -pe 's/Submitted job \[([0-9|a-z|_].+)\] to the job queue .+/$1/' \
18 | | sed -e 's/ - / /g' >> ${LOG_PATH}
19 |
--------------------------------------------------------------------------------
/tools/batch/batch_states/compile_notebooks.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Shell script for submitting AWS Batch jobs to compile notebooks
3 |
4 | event=$1
5 | ref=$2
6 |
7 | FAIL=0
8 |
9 | compile_notebook () {
10 | local MDFILE=$1
11 | DIR=$(dirname $MDFILE)
12 | BASENAME=$(basename $MDFILE)
13 | TARGETNAME=$(dirname $MDFILE)/${BASENAME%.md}.ipynb
14 | LOGNAME=$(dirname $MDFILE)/${BASENAME%.md}.stdout.log
15 |
16 | echo Compiling $BASENAME ...
17 |
18 | python3 docs/md2ipynb.py ${MDFILE} &> $LOGNAME
19 |
20 | EXIT_CODE=$?
21 |
22 | if [ $EXIT_CODE -ne 0 ]; then
23 | echo "Compiling $BASENAME failed; please download Notebook_Logs in the build artifacts for more details."
24 | else
25 | echo "Compiling $BASENAME succeeded"
26 | fi
27 | exit $EXIT_CODE
28 | }
29 |
30 | pids=()
31 |
32 | for f in $(find docs/tutorials -type f -name '*.md' -print); do
33 | compile_notebook "$f" &
34 | pids+=($!)
35 | done;
36 |
37 | for pid in "${pids[@]}"; do
38 | wait "$pid" || let "FAIL+=1"
39 | done;
40 |
41 | if [ "$FAIL" == "0" ]; then
42 | echo Building Website
43 | make docs_local
44 | EXIT_CODE=$?
45 | if [ $EXIT_CODE -ne 0 ]; then
46 | echo Building Website Failed.
47 | exit $EXIT_CODE
48 | else
49 | echo Building Website Succeeded.
50 | if [ "$1" == "push" ]; then
51 | echo "Uploading docs to s3://gluon-nlp/$2/"
52 | aws s3 sync --delete ./docs/_build/html/ s3://gluon-nlp/$2/ --quiet --acl public-read
53 | else
54 | echo "Uploading docs to s3://gluon-nlp-staging/PR$1/$2/"
55 | aws s3 sync --delete ./docs/_build/html/ s3://gluon-nlp-staging/PR$1/$2/ --quiet --acl public-read
56 | fi
57 | fi
58 | else
59 | exit 1
60 | fi
61 |
--------------------------------------------------------------------------------
/tools/batch/batch_states/test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Shell script for installing dependencies and running test on AWS Batch
3 | set -ex
4 |
5 | echo $PWD
6 | SCRIPTPATH="$( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
7 | REPODIR="$( readlink -f ${SCRIPTPATH}/../../../../gluon-nlp)"
8 |
9 | python3 -m pip install --upgrade --user pytest pytest-cov contextvars
10 | python3 -m pytest --cov=$REPODIR --cov-config=$REPODIR/.coveragerc --cov-report=xml --durations=50 --device="gpu" --runslow $REPODIR/tests/
11 |
--------------------------------------------------------------------------------
/tools/batch/batch_states/test_data_pipeline.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Shell script for testing the data preprocessing on AWS Batch
3 |
4 | set -ex
5 | export PYTHONIOENCODING=utf8
6 | echo $PWD
7 |
8 | for MODEL in spm yttm
9 | do
10 | bash ../../../scripts/datasets/machine_translation/wmt2014_ende.sh ${MODEL}
11 | done
12 | for MODEL in spm yttm
13 | do
14 | bash ../../../scripts/datasets/machine_translation/wmt2017_zhen.sh ${MODEL}
15 | done
16 |
--------------------------------------------------------------------------------
/tools/batch/hello_world.py:
--------------------------------------------------------------------------------
1 | from gluonnlp.data.vocab import Vocab
2 | import mxnet as mx
3 |
4 |
5 | if __name__ == '__main__':
6 | vocab = Vocab(['Hello', 'World!'], unk_token=None)
7 | print(vocab)
8 | num_gpus = mx.context.num_gpus()
9 | print('Number of GPUS:', num_gpus)
10 |
11 |
--------------------------------------------------------------------------------
/tools/batch/question_answering/parse_squad_results.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import pandas as pd
4 | import glob
5 | import math
6 | import argparse
7 | from datetime import datetime
8 |
9 | parser = argparse.ArgumentParser(description='Parse SQuAD results generated by '
10 | '"sync_batch_result.sh" to csv.')
11 | parser.add_argument('--dir', type=str, required=True,
12 | help='The basic directory to analyze the results.')
13 | parser.add_argument('--save_path', type=str, default=None, help='The path to save the results.')
14 | args = parser.parse_args()
15 |
16 | if args.save_path is None:
17 | args.save_path = os.path.basename(os.path.realpath(args.dir)) + '.csv'
18 |
19 | base_dir = args.dir
20 | prefix = 'test_squad2_'
21 |
22 | dat_l = []
23 | datetime_parser = '%Y-%m-%d %H:%M:%S,%f'
24 |
25 | for folder in sorted(os.listdir(base_dir)):
26 | if folder.startswith(prefix):
27 | model_name = folder[len(prefix):]
28 | log_path_l = glob.glob(os.path.join(base_dir, folder, 'fintune*/finetune*.log'))
29 | param_path_l = sorted(glob.glob(os.path.join(base_dir, folder, 'fintune*/*.params')))
30 | if len(param_path_l) == 0 or len(log_path_l) == 0:
31 | best_f1_threshold = math.nan
32 | best_exact_threshold = math.nan
33 | best_f1 = math.nan
34 | best_em = math.nan
35 | time_spent_in_hours = math.nan
36 | else:
37 | log_path = log_path_l[0]
38 | result_file = glob.glob(os.path.join(base_dir, folder, 'fintune*/best_results.json'))[0]
39 | with open(result_file, 'r') as in_f:
40 | result_dat = json.load(in_f)
41 | if 'best_f1_thresh' in result_dat:
42 | best_f1_threshold = result_dat['best_f1_thresh']
43 | best_exact_threshold = result_dat['best_exact_thresh']
44 | best_f1 = result_dat['best_f1']
45 | best_em = result_dat['best_exact']
46 | else:
47 | best_f1_threshold = math.nan
48 | best_exact_threshold = math.nan
49 | best_f1 = result_dat['f1']
50 | best_em = result_dat['exact']
51 | with open(log_path, 'r') as in_f:
52 | log_lines = in_f.readlines()
53 | start_time_str = ' '.join(log_lines[0].split()[0:2])
54 | end_time_str = ' '.join(log_lines[-1].split()[0:2])
55 | start_time = datetime.strptime(start_time_str, datetime_parser)
56 | end_time = datetime.strptime(end_time_str, datetime_parser)
57 | time_spent = end_time - start_time
58 | time_spent_in_hours = time_spent.total_seconds() / 3600
59 | dat_l.append({'name': model_name,
60 | 'best_f1': best_f1,
61 | 'best_em': best_em,
62 | 'best_f1_thresh': best_f1_threshold,
63 | 'best_em_thresh': best_exact_threshold,
64 | 'time_spent_in_hours': time_spent_in_hours})
65 | df = pd.DataFrame(dat_l)
66 | print(df)
67 | print('Saving to {}'.format(args.save_path))
68 | df.to_csv(args.save_path)
69 |
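A typical invocation, assuming the results were pulled down with sync_batch_result.sh below into its default `squad_2.0` folder:

```bash
# Summarize all test_squad2_* result folders into a csv (paths illustrative)
python3 parse_squad_results.py --dir squad_2.0 --save_path squad_2.0.csv
```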
--------------------------------------------------------------------------------
/tools/batch/question_answering/run_batch_squad.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -ex
4 |
5 | USE_HOROVOD=${1:-0}
6 | VERSION=${2:-2.0}
7 | LOG_PATH=${3:-submit_squad_v2.log}
8 | DTYPE=${4:-float32}
9 | SUBMIT_SCRIPT_PATH=$(dirname "$0")/../../../tools/batch/submit-job.py
10 |
11 |
12 | for MODEL_NAME in albert_base \
13 | albert_large \
14 | albert_xlarge \
15 | albert_xxlarge \
16 | electra_base \
17 | electra_large \
18 | electra_small \
19 | roberta_large \
20 | uncased_bert_base \
21 | uncased_bert_large \
22 | uncased_bert_wwm_large \
23 | gluon_en_cased_bert_base_v1 \
24 | mobilebert
25 | do
26 | python3 ${SUBMIT_SCRIPT_PATH} \
27 | --region us-east-1 \
28 | --source-ref master \
29 | --job-type g4dn.12x \
30 | --save-path temp \
31 | --name test_squad2_${MODEL_NAME} \
32 | --work-dir scripts/question_answering \
33 | --remote https://github.com/dmlc/gluon-nlp/ \
34 | --command "bash commands/run_squad2_${MODEL_NAME}.sh ${USE_HOROVOD} ${VERSION} ${DTYPE} | tee stdout.log" \
35 | | perl -pe 's/Submitted job \[([0-9|a-z|_].+)\] to the job queue .+/$1/' \
36 | | sed -e 's/ - / /g' >> ${LOG_PATH}
37 | done
38 |
--------------------------------------------------------------------------------
/tools/batch/run_batch_conversion.sh:
--------------------------------------------------------------------------------
1 | for MODEL_NAME in bert albert electra mobilebert roberta xlmr bart
2 | do
3 | python3 submit-job.py \
4 | --region us-east-1 \
5 | --source-ref master \
6 | --job-type c5n.4x \
7 | --name convert_${MODEL_NAME} \
8 | --work-dir scripts/conversion_toolkits \
9 | --remote https://github.com/dmlc/gluon-nlp/ \
10 | --command 'bash convert_'${MODEL_NAME}'.sh | tee stdout.log' >> log.info
11 | done
12 |
--------------------------------------------------------------------------------
/tools/batch/sync_batch_result.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -ex
4 |
5 | LOG_PATH=$1
6 | SAVE_DIR_NAME=${2:-squad_2.0}
7 |
8 | while read -r job_name job_id; do
9 | aws s3 sync s3://gluon-nlp-dev/batch/${job_id}/temp ${SAVE_DIR_NAME}/${job_name}
10 | done < ${LOG_PATH}
11 |
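The `job_name job_id` pairs read here are exactly the lines that run_batch_squad.sh appends to its log after the perl/sed rewrite, so the two scripts chain directly; a sketch of the handoff:

```bash
# Sketch of the submit -> sync handoff using the defaults above
bash question_answering/run_batch_squad.sh 0 2.0 submit_squad_v2.log float32
# ... wait for the AWS Batch jobs to finish ...
bash sync_batch_result.sh submit_squad_v2.log squad_2.0
```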
--------------------------------------------------------------------------------
/tools/docker/devel_entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | source /start_jupyter.sh
4 |
5 | exec "$@"
6 |
--------------------------------------------------------------------------------
/tools/docker/gluon_nlp_job.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | date
4 | echo "Args: $@"
5 | env
6 | echo "jobId: $AWS_BATCH_JOB_ID"
7 | echo "jobQueue: $AWS_BATCH_JQ_NAME"
8 | echo "computeEnvironment: $AWS_BATCH_CE_NAME"
9 |
10 | SOURCE_REF=$1
11 | WORK_DIR=$2
12 | COMMAND=$3
13 | SAVED_OUTPUT=$4
14 | SAVE_PATH=$5
15 | REMOTE=$6
16 | DEVICE=${7:-gpu}
17 |
18 | if [ -n "$REMOTE" ]; then
19 | git remote set-url origin $REMOTE
20 | fi;
21 |
22 | git fetch origin $SOURCE_REF:working
23 | git checkout working
24 |
25 | if [ $DEVICE == "cpu" ]; then
26 | python3 -m pip uninstall --quiet mxnet -y
27 | python3 -m pip install -U --quiet --pre "mxnet>=2.0.0b20210121" -f https://dist.mxnet.io/python
28 | else
29 | python3 -m pip uninstall --quiet mxnet-cu102 -y
30 | python3 -m pip install -U --quiet --pre "mxnet-cu102>=2.0.0a" --user
31 | fi
32 |
33 | python3 -m pip install --quiet -e .[extras,dev]
34 |
35 | cd $WORK_DIR
36 | /bin/bash -o pipefail -c "$COMMAND"
37 | COMMAND_EXIT_CODE=$?
38 | if [[ -f $SAVED_OUTPUT ]]; then
39 | aws s3 cp $SAVED_OUTPUT s3://gluon-nlp-dev/batch/$AWS_BATCH_JOB_ID/$SAVE_PATH --quiet;
40 | elif [[ -d $SAVED_OUTPUT ]]; then
41 | aws s3 cp --recursive $SAVED_OUTPUT s3://gluon-nlp-dev/batch/$AWS_BATCH_JOB_ID/$SAVE_PATH --quiet;
42 | fi;
43 | exit $COMMAND_EXIT_CODE
44 |
--------------------------------------------------------------------------------
/tools/docker/install/install_horovod.sh:
--------------------------------------------------------------------------------
1 | set -euo pipefail
2 |
3 | # Install Horovod
4 | HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL HOROVOD_WITHOUT_GLOO=1 \
5 | HOROVOD_WITH_MPI=1 HOROVOD_WITH_MXNET=1 HOROVOD_WITH_PYTORCH=1 \
6 | HOROVOD_WITHOUT_TENSORFLOW=1 python3 -m pip install --no-cache-dir horovod==0.20.3 --user
7 | # Debug horovod by default
8 | echo NCCL_DEBUG=INFO >> /etc/nccl.conf
9 |
--------------------------------------------------------------------------------
/tools/docker/install/install_jupyter_lab.sh:
--------------------------------------------------------------------------------
1 | set -euo pipefail
2 |
3 | # Install NodeJS + Tensorboard + TensorboardX
4 |
5 | curl -sL https://deb.nodesource.com/setup_14.x | bash - \
6 | && apt-get install -y nodejs
7 |
8 | apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev
9 |
10 | python3 -m pip install --no-cache --upgrade \
11 | soundfile==0.10.2 \
12 | ipywidgets==7.5.1 \
13 | jupyter_tensorboard==0.2.0 \
14 | widgetsnbextension==3.5.1 \
15 | tensorboard==2.1.1 \
16 | tensorboardX==2.1 --user
17 | jupyter labextension install jupyterlab_tensorboard \
18 | && jupyter nbextension enable --py widgetsnbextension \
19 | && jupyter labextension install @jupyter-widgets/jupyterlab-manager
20 |
21 | # Revise default shell to /bin/bash
22 | jupyter notebook --generate-config \
23 | && echo "c.NotebookApp.terminado_settings = { 'shell_command': ['/bin/bash'] }" >> /root/.jupyter/jupyter_notebook_config.py
24 |
--------------------------------------------------------------------------------
/tools/docker/install/install_llvm.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Licensed to the Apache Software Foundation (ASF) under one
3 | # or more contributor license agreements. See the NOTICE file
4 | # distributed with this work for additional information
5 | # regarding copyright ownership. The ASF licenses this file
6 | # to you under the Apache License, Version 2.0 (the
7 | # "License"); you may not use this file except in compliance
8 | # with the License. You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing,
13 | # software distributed under the License is distributed on an
14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 | # KIND, either express or implied. See the License for the
16 | # specific language governing permissions and limitations
17 | # under the License.
18 |
19 | set -e
20 | set -u
21 | set -o pipefail
22 |
23 | echo deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-9 main\
24 | >> /etc/apt/sources.list.d/llvm.list
25 | echo deb-src http://apt.llvm.org/bionic/ llvm-toolchain-bionic-9 main\
26 | >> /etc/apt/sources.list.d/llvm.list
27 |
28 |
29 | echo deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-10 main\
30 | >> /etc/apt/sources.list.d/llvm.list
31 | echo deb-src http://apt.llvm.org/bionic/ llvm-toolchain-bionic-10 main\
32 | >> /etc/apt/sources.list.d/llvm.list
33 |
34 | echo deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-11 main\
35 | >> /etc/apt/sources.list.d/llvm.list
36 | echo deb-src http://apt.llvm.org/bionic/ llvm-toolchain-bionic-11 main\
37 | >> /etc/apt/sources.list.d/llvm.list
38 |
39 | echo deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic main\
40 | >> /etc/apt/sources.list.d/llvm.list
41 | echo deb-src http://apt.llvm.org/bionic/ llvm-toolchain-bionic main\
42 | >> /etc/apt/sources.list.d/llvm.list
43 |
44 | wget -q -O - http://apt.llvm.org/llvm-snapshot.gpg.key|apt-key add -
45 | apt-get update && apt-get install -y llvm-9 llvm-10 llvm-11 clang-9 clang-10 clang-11
46 |
--------------------------------------------------------------------------------
/tools/docker/install/install_openmpi.sh:
--------------------------------------------------------------------------------
1 | set -euo pipefail
2 |
3 | mkdir /tmp/openmpi \
4 | && cd /tmp/openmpi \
5 | && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \
6 | && tar zxf openmpi-4.0.1.tar.gz \
7 | && cd openmpi-4.0.1 \
8 | && ./configure --enable-orterun-prefix-by-default \
9 | && make -j $(nproc) all \
10 | && make install \
11 | && ldconfig \
12 | && rm -rf /tmp/openmpi
13 |
14 | # Create a wrapper for OpenMPI to allow running as root by default
15 | mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \
16 | && echo '#!/bin/bash' > /usr/local/bin/mpirun \
17 | && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \
18 | && chmod a+x /usr/local/bin/mpirun
19 |
20 | echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \
21 | && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf
22 |
--------------------------------------------------------------------------------
/tools/docker/install/install_python_packages.sh:
--------------------------------------------------------------------------------
1 | set -euo pipefail
2 |
3 |
4 | python3 -m pip --no-cache-dir install --upgrade \
5 | pip \
6 | setuptools \
7 | wheel
8 |
9 | # python-dateutil==2.8.0 to satisfy botocore associated with latest awscli
10 | python3 -m pip install --no-cache --upgrade \
11 | numpy==1.19.1 \
12 | pandas==0.25.1 \
13 | cython \
14 | pytest \
15 | pytest-cov \
16 | Pillow \
17 | requests==2.22.0 \
18 | scikit-learn==0.20.4 \
19 | scipy==1.2.2 \
20 | urllib3==1.25.8 \
21 | python-dateutil==2.8.0 \
22 | sagemaker-experiments==0.* \
23 | PyYAML==5.3.1 \
24 | mpi4py==3.0.2 \
25 | jupyterlab==2.2.4 \
26 | contextvars \
27 | cmake \
28 | awscli --user
29 |
--------------------------------------------------------------------------------
/tools/docker/install/install_tvm_cpu.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Licensed to the Apache Software Foundation (ASF) under one
3 | # or more contributor license agreements. See the NOTICE file
4 | # distributed with this work for additional information
5 | # regarding copyright ownership. The ASF licenses this file
6 | # to you under the Apache License, Version 2.0 (the
7 | # "License"); you may not use this file except in compliance
8 | # with the License. You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing,
13 | # software distributed under the License is distributed on an
14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 | # KIND, either express or implied. See the License for the
16 | # specific language governing permissions and limitations
17 | # under the License.
18 |
19 | set -e
20 | set -u
21 | set -o pipefail
22 |
23 | cd ${WORKDIR}
24 | git clone https://github.com/apache/incubator-tvm tvm --recursive
25 | cd ${WORKDIR}/tvm
26 | # checkout a hash-tag
27 | git checkout bf862d4c4355eae4f18d89b3b6b98ed0a2c18e9c
28 |
29 | mkdir -p build
30 | cp cmake/config.cmake build
31 | echo set\(USE_LLVM llvm-config-10\) >> build/config.cmake
32 | echo set\(USE_GRAPH_EXECUTOR ON\) >> build/config.cmake
33 | echo set\(USE_BLAS openblas\) >> build/config.cmake
34 |
35 | cd build
36 | cmake .. -GNinja
37 | ninja
38 |
39 | # install python binding
40 | cd ..
41 | cd python
42 | python3 -m pip install -U -e . --user
43 |
--------------------------------------------------------------------------------
/tools/docker/install/install_tvm_gpu.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Licensed to the Apache Software Foundation (ASF) under one
3 | # or more contributor license agreements. See the NOTICE file
4 | # distributed with this work for additional information
5 | # regarding copyright ownership. The ASF licenses this file
6 | # to you under the Apache License, Version 2.0 (the
7 | # "License"); you may not use this file except in compliance
8 | # with the License. You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing,
13 | # software distributed under the License is distributed on an
14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 | # KIND, either express or implied. See the License for the
16 | # specific language governing permissions and limitations
17 | # under the License.
18 |
19 | set -e
20 | set -u
21 | set -o pipefail
22 |
23 | cd ${WORKDIR}
24 | git clone https://github.com/apache/incubator-tvm tvm --recursive
25 | cd ${WORKDIR}/tvm
26 | # checkout a hash-tag
27 | git checkout bf862d4c4355eae4f18d89b3b6b98ed0a2c18e9c
28 |
29 |
30 | mkdir -p build
31 | cp cmake/config.cmake build
32 | echo set\(USE_LLVM llvm-config-10\) >> build/config.cmake
33 | echo set\(USE_CUDA ON\) >> build/config.cmake
34 | echo set\(USE_CUDNN ON\) >> build/config.cmake
35 | echo set\(USE_CUBLAS ON\) >> build/config.cmake
36 | echo set\(USE_GRAPH_EXECUTOR ON\) >> build/config.cmake
37 | echo set\(USE_BLAS openblas\) >> build/config.cmake
38 |
39 | cd build
40 | cmake -GNinja -DCUDA_CUBLAS_LIBRARY=/usr/lib/x86_64-linux-gnu/libcublas.so ..
41 | ninja
42 |
43 | # install python binding
44 | cd ..
45 | cd python
46 | python3 -m pip install -U -e . --user
47 |
--------------------------------------------------------------------------------
/tools/docker/install/install_ubuntu18.04_core.sh:
--------------------------------------------------------------------------------
1 | set -e
2 | set -u
3 | set -o pipefail
4 |
5 | export DEBIAN_FRONTEND=noninteractive
6 |
7 | rm -rf /var/lib/apt/lists/* \
8 | && apt-get clean \
9 | && apt-get update \
10 | && apt-get install -y --no-install-recommends \
11 | software-properties-common \
12 | build-essential \
13 | ca-certificates \
14 | curl \
15 | emacs \
16 | subversion \
17 | locales \
18 | cmake \
19 | git \
20 | libopencv-dev \
21 | htop \
22 | vim \
23 | wget \
24 | unzip \
25 | less \
26 | libopenblas-dev \
27 | gpg-agent \
28 | ninja-build \
29 | openssh-client \
30 | openssh-server \
31 | python3-dev \
32 | python3-pip \
33 | python3-setuptools \
34 | libxft-dev \
35 | zlib1g-dev \
36 | && apt-get clean \
37 | && rm -rf /var/lib/apt/lists/*
38 |
39 | ln -s "$(which python3)" /usr/local/bin/python
40 |
--------------------------------------------------------------------------------
/tools/docker/start_jupyter.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Run JupyterLab in the foreground when $JUPYTER_FG is set to "true"
4 | if [[ "${JUPYTER_FG:-}" == "true" ]]; then
5 | jupyter-lab --allow-root --ip=0.0.0.0 --no-browser --NotebookApp.token=''
6 | exit 0
7 | else
8 | nohup jupyter-lab --allow-root --ip=0.0.0.0 --no-browser --NotebookApp.token='' > /dev/null 2>&1 &
9 |
10 | echo "Notebook server successfully started, a JupyterLab instance has been executed!"
11 | echo "Make local folders visible by volume mounting to /workspace/notebook"
12 | echo "To access visit http://localhost:8888 on your host machine."
13 | echo 'Ensure the following arguments to "docker run" are added to expose the server ports to your host machine:
14 | -p 8888:8888 -p 8787:8787 -p 8786:8786'
15 | fi
16 |
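The hints echoed above correspond to a docker run invocation along these lines (a sketch; the image name and tag are assumptions, and setting JUPYTER_FG=true switches the server to the foreground):

    docker run -it --rm \
        -e JUPYTER_FG=true \
        -p 8888:8888 -p 8787:8787 -p 8786:8786 \
        -v "$(pwd)":/workspace/notebook \
        gluonai/gluon-nlp:cpu-latest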
--------------------------------------------------------------------------------
/tools/docker/ubuntu18.04-cpu.Dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:18.04 as base
2 |
3 | LABEL maintainer="GluonNLP Team"
4 | COPY install /install
5 |
6 | ENV PYTHONDONTWRITEBYTECODE=1 \
7 | PYTHONUNBUFFERED=1 \
8 | LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib" \
9 | PYTHONIOENCODING=UTF-8 \
10 | LANG=C.UTF-8 \
11 | LC_ALL=C.UTF-8
12 |
13 | ENV WORKDIR=/workspace
14 | ENV SHELL=/bin/bash
15 |
16 | RUN mkdir -p ${WORKDIR}
17 |
18 |
19 | RUN bash /install/install_ubuntu18.04_core.sh
20 |
21 | # Install Open MPI
22 | RUN bash /install/install_openmpi.sh
23 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH
24 | ENV PATH=/usr/local/openmpi/bin/:/usr/local/bin:/root/.local/bin:$PATH
25 |
26 | # Install LLVM
27 | RUN bash /install/install_llvm.sh
28 |
29 | # Install Python Packages
30 | RUN bash /install/install_python_packages.sh
31 |
32 | # Install TVM
33 | RUN bash /install/install_tvm_cpu.sh
34 |
35 | # Install MXNet
36 | RUN python3 -m pip install -U --pre "mxnet>=2.0.0a" --user
37 |
38 | # Install PyTorch
39 | RUN python3 -m pip install "torch==1.7.1+cpu" torchvision -f https://download.pytorch.org/whl/torch_stable.html
40 |
41 | # Install Jupyter Lab
42 | RUN bash /install/install_jupyter_lab.sh
43 |
44 | RUN mkdir -p ${WORKDIR}/data
45 | RUN mkdir -p /.init
46 | RUN cd ${WORKDIR} \
47 | && git clone https://github.com/dmlc/gluon-nlp \
48 | && cd gluon-nlp \
49 | && git checkout master \
50 | && python3 -m pip install -U -e ."[extras,dev]"
51 |
52 |
53 | # Stage-CI
54 | FROM base as ci
55 | WORKDIR ${WORKDIR}/gluon-nlp
56 | ADD gluon_nlp_job.sh .
57 | RUN chmod +x gluon_nlp_job.sh
58 |
59 |
60 | # Stage-Devel
61 | FROM base as devel
62 | COPY start_jupyter.sh /start_jupyter.sh
63 | COPY devel_entrypoint.sh /devel_entrypoint.sh
64 | RUN chmod +x /devel_entrypoint.sh
65 |
66 | EXPOSE 8888
67 | EXPOSE 8787
68 | EXPOSE 8786
69 |
70 | WORKDIR ${WORKDIR}
71 |
72 | # Add Tini
73 | ARG TINI_VERSION=v0.19.0
74 | ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
75 | RUN chmod +x /tini
76 | ENTRYPOINT [ "/tini", "--", "/devel_entrypoint.sh" ]
77 | CMD ["/bin/bash"]
78 |
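The ci and devel stages defined above can be built separately with docker build --target; a minimal sketch, assuming the build context is the tools/docker directory (the image tags are illustrative):

    docker build -f ubuntu18.04-cpu.Dockerfile --target ci -t gluon-nlp:cpu-ci .
    docker build -f ubuntu18.04-cpu.Dockerfile --target devel -t gluon-nlp:cpu-devel .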
--------------------------------------------------------------------------------
/tools/docker/ubuntu18.04-gpu.Dockerfile:
--------------------------------------------------------------------------------
1 | FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 as base
2 |
3 | LABEL maintainer="GluonNLP Team"
4 | COPY install /install
5 |
6 | ENV PYTHONDONTWRITEBYTECODE=1 \
7 | PYTHONUNBUFFERED=1 \
8 | LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib" \
9 | PYTHONIOENCODING=UTF-8 \
10 | LANG=C.UTF-8 \
11 | LC_ALL=C.UTF-8
12 |
13 | ENV WORKDIR=/workspace
14 | ENV SHELL=/bin/bash
15 |
16 | RUN mkdir -p ${WORKDIR}
17 |
18 | RUN bash /install/install_ubuntu18.04_core.sh
19 |
20 | # Install Open MPI
21 | RUN bash /install/install_openmpi.sh
22 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH
23 | ENV PATH=/usr/local/openmpi/bin/:/usr/local/bin:/root/.local/bin:/usr/bin:$PATH
24 |
25 | # Install LLVM
26 | RUN bash /install/install_llvm.sh
27 |
28 | # Install Python Packages
29 | RUN bash /install/install_python_packages.sh
30 |
31 | # Install TVM
32 | RUN bash /install/install_tvm_gpu.sh
33 |
34 | # Install MXNet
35 | RUN python3 -m pip install -U --pre "mxnet-cu102>=2.0.0a" --user
36 |
37 | # Install PyTorch
38 | RUN python3 -m pip install "torch==1.8.1+cu102" torchvision -f https://download.pytorch.org/whl/torch_stable.html
39 |
40 | # Install Horovod
41 | RUN bash /install/install_horovod.sh
42 |
43 | # Install Jupyter Lab
44 | RUN bash /install/install_jupyter_lab.sh
45 |
46 | RUN mkdir -p ${WORKDIR}/data
47 | RUN mkdir -p /.init
48 | RUN cd ${WORKDIR} \
49 | && git clone https://github.com/dmlc/gluon-nlp \
50 | && cd gluon-nlp \
51 | && git checkout master \
52 | && python3 -m pip install -U -e ."[extras,dev]"
53 |
54 | # Stage-CI
55 | FROM base as ci
56 | WORKDIR ${WORKDIR}/gluon-nlp
57 | ADD gluon_nlp_job.sh .
58 | RUN chmod +x gluon_nlp_job.sh
59 |
60 | # Stage-Devel
61 | FROM base as devel
62 | COPY start_jupyter.sh /start_jupyter.sh
63 | COPY devel_entrypoint.sh /devel_entrypoint.sh
64 | RUN chmod +x /devel_entrypoint.sh
65 |
66 | EXPOSE 8888
67 | EXPOSE 8787
68 | EXPOSE 8786
69 |
70 | WORKDIR ${WORKDIR}
71 |
72 | # Add Tini
73 | ARG TINI_VERSION=v0.19.0
74 | ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
75 | RUN chmod +x /tini
76 | ENTRYPOINT [ "/tini", "--", "/devel_entrypoint.sh" ]
77 | CMD ["/bin/bash"]
78 |
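A sketch of building and running the devel stage of this GPU image (the tag is illustrative; --gpus all requires Docker 19.03+ with the NVIDIA Container Toolkit installed on the host):

    docker build -f ubuntu18.04-gpu.Dockerfile --target devel -t gluon-nlp:gpu-devel .
    docker run -it --rm --gpus all -p 8888:8888 gluon-nlp:gpu-devel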
--------------------------------------------------------------------------------