├── .coveragerc ├── .flake8 ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── buildwebsite.yml │ ├── data-pipeline.yml │ ├── nightly-test.yml │ ├── unittests-gpu.yml │ └── unittests.yml ├── .gitignore ├── .pylintrc ├── .pytype.cfg ├── CODEOWNERS ├── CODE_OF_CONDUCT.md ├── LICENSE ├── Makefile ├── README.md ├── conftest.py ├── docs ├── .gitignore ├── .nojekyll ├── 404.rst ├── Doxyfile ├── Makefile ├── README.txt ├── _static │ ├── 404.jpg │ ├── custom.css │ ├── gluon-logo.svg │ ├── gluon.ico │ ├── google_analytics.js │ ├── hidebib.js │ └── install-options.js ├── api │ ├── attention.rst │ ├── data.rst │ ├── embedding.rst │ ├── index.rst │ ├── layers.rst │ ├── models.rst │ ├── operators.rst │ ├── sequence_sampler.rst │ └── utils.rst ├── conf.py ├── examples.rst ├── genindex.rst ├── index.rst ├── install.rst ├── install │ ├── install-include.rst │ └── install-more.rst ├── md2ipynb.py ├── model_zoo ├── tutorials │ ├── deep_learning_compiler │ │ ├── index.rst │ │ └── tvm_basic.md │ ├── index.rst │ ├── pretrained_models │ │ ├── index.rst │ │ ├── pretrained_t5_mlm.md │ │ └── pretraining_objectives.png │ ├── question_answering │ │ ├── index.rst │ │ ├── offsets_match.png │ │ ├── qa1.png │ │ ├── qa2.png │ │ ├── question_answering.md │ │ └── squad_utils.py │ ├── text_prediction │ │ ├── bert_illustration.png │ │ ├── index.rst │ │ ├── merge_input.png │ │ ├── text_prediction_part1.md │ │ └── text_prediction_part2.md │ ├── tokenization │ │ ├── index.rst │ │ ├── tokenization_part1.md │ │ ├── tokenization_part2.md │ │ └── tokenization_part3.md │ └── word_embedding │ │ ├── index.rst │ │ └── word_embedding.md └── website │ ├── configuration.rst │ ├── contribute.rst │ ├── git.rst │ ├── index.rst │ └── release.rst ├── pytest.ini ├── scripts ├── __init__.py ├── benchmarks │ ├── README.md │ ├── benchmark_gluonnlp.py │ ├── benchmark_gluonnlp.sh │ ├── benchmark_gluonnlp_fp16.sh │ ├── benchmark_gluonnlp_tvm.sh │ ├── benchmark_hf.py │ ├── benchmark_utils.py │ ├── requirements.txt │ └── run_backbone_benchmark.sh ├── classification │ ├── README.md │ ├── classification.py │ ├── classification_utils.py │ └── train_classification.py ├── conversion_toolkits │ ├── README.md │ ├── bert_base_config.json │ ├── bert_large_config.json │ ├── convert_albert.sh │ ├── convert_bart.sh │ ├── convert_bert.sh │ ├── convert_bert_torch.sh │ ├── convert_electra.py │ ├── convert_electra.sh │ ├── convert_fairseq_bart.py │ ├── convert_fairseq_roberta.py │ ├── convert_fairseq_xlmr.py │ ├── convert_gpt2.py │ ├── convert_gpt2.sh │ ├── convert_mobilebert.py │ ├── convert_mobilebert.sh │ ├── convert_mt5.py │ ├── convert_mt5.sh │ ├── convert_roberta.sh │ ├── convert_t5.py │ ├── convert_t5.sh │ ├── convert_tf_hub_model.py │ └── convert_xlmr.sh ├── datasets │ ├── README.md │ ├── __init__.py │ ├── __main__.py │ ├── general_nlp_benchmark │ │ ├── README.md │ │ ├── __init__.py │ │ ├── prepare_glue.py │ │ └── prepare_text_classification.py │ ├── language_modeling │ │ ├── README.md │ │ ├── __init__.py │ │ └── prepare_lm.py │ ├── machine_translation │ │ ├── README.md │ │ ├── __init__.py │ │ ├── prepare_wmt.py │ │ ├── wmt2014_ende.sh │ │ └── wmt2017_zhen.sh │ ├── music_generation │ │ ├── README.md │ │ ├── __init__.py │ │ └── prepare_music_midi.py │ ├── pretrain_corpus │ │ ├── README.md │ │ ├── __init__.py │ │ ├── prepare_bookcorpus.py │ │ ├── prepare_gutenberg.py │ │ ├── prepare_openwebtext.py │ │ └── prepare_wikipedia.py │ ├── question_answering │ │ ├── README.md │ │ ├── __init__.py │ 
│ ├── prepare_hotpotqa.py │ │ ├── prepare_naturalquestions.py │ │ ├── prepare_searchqa.py │ │ ├── prepare_squad.py │ │ └── prepare_triviaqa.py │ ├── update_download_stats.py │ └── url_checksums │ │ ├── bookcorpus.txt │ │ ├── glue.txt │ │ ├── gutenberg.txt │ │ ├── hotpotqa.txt │ │ ├── language_model.txt │ │ ├── mirror │ │ └── wmt.json │ │ ├── music_midi.txt │ │ ├── naturalquestions.txt │ │ ├── searchqa.txt │ │ ├── squad.txt │ │ ├── superglue.txt │ │ ├── text_classification.txt │ │ ├── triviaqa.txt │ │ ├── wikipedia.txt │ │ └── wmt.txt ├── generation │ ├── README.md │ ├── calculate_metrics.py │ ├── generate_unconditional_gpt2_samples.py │ └── interactive_conditional_gpt2_samples.py ├── index.rst ├── machine_translation │ ├── README.md │ ├── __init__.py │ ├── evaluate_epochs_wmt2014_ende.sh │ ├── evaluate_transformer.py │ ├── train_transformer.py │ ├── transformer_enc12_dec1.yml │ └── wmt2014_back_translation.sh ├── pretraining │ ├── README.md │ ├── bert │ │ ├── README.md │ │ ├── covert_bookcorpus_format.py │ │ ├── create_pretraining_data.py │ │ ├── pretraining_utils.py │ │ └── run_pretraining.py │ ├── convert_electra_pretrain_backbone.py │ ├── data_preprocessing.py │ ├── pretraining_utils.py │ ├── run_electra.py │ └── torch │ │ └── bert │ │ ├── README.md │ │ ├── prepare_quickthought.py │ │ └── run_pretraining.py ├── processing │ ├── README.md │ ├── __init__.py │ ├── __main__.py │ ├── apply_subword.py │ ├── clean_tok_corpus.py │ ├── learn_subword.py │ └── segment_sentences.py └── question_answering │ ├── README.md │ ├── albert_custom.yaml │ ├── commands │ ├── README.md │ ├── generate_commands.py │ ├── run_squad.template │ ├── run_squad2_albert_base.sh │ ├── run_squad2_albert_large.sh │ ├── run_squad2_albert_xlarge.sh │ ├── run_squad2_albert_xxlarge.sh │ ├── run_squad2_electra_base.sh │ ├── run_squad2_electra_large.sh │ ├── run_squad2_electra_small.sh │ ├── run_squad2_gluon_en_cased_bert_base_v1.sh │ ├── run_squad2_mobilebert.sh │ ├── run_squad2_roberta_large.sh │ ├── run_squad2_uncased_bert_base.sh │ ├── run_squad2_uncased_bert_large.sh │ └── run_squad2_uncased_bert_wwm_large.sh │ ├── custom_strategy.py │ ├── eval_utils.py │ ├── models.py │ ├── run_squad.py │ ├── run_squad_albert.py │ └── squad_utils.py ├── setup.py ├── src └── gluonnlp │ ├── __init__.py │ ├── attention_cell.py │ ├── base.py │ ├── cli │ ├── __init__.py │ ├── average_checkpoint.py │ ├── data │ └── process │ ├── data │ ├── __init__.py │ ├── batchify.py │ ├── filtering.py │ ├── loading.py │ ├── sampler.py │ ├── tokenizers │ │ ├── __init__.py │ │ ├── base.py │ │ ├── huggingface.py │ │ ├── jieba.py │ │ ├── moses.py │ │ ├── sentencepiece.py │ │ ├── spacy.py │ │ ├── subword_nmt.py │ │ ├── whitespace.py │ │ └── yttm.py │ └── vocab.py │ ├── embedding │ ├── __init__.py │ ├── _constants.py │ └── embed_loader.py │ ├── initializer.py │ ├── layers.py │ ├── loss.py │ ├── lr_scheduler.py │ ├── models │ ├── __init__.py │ ├── albert.py │ ├── bart.py │ ├── base.py │ ├── bert.py │ ├── electra.py │ ├── gpt2.py │ ├── mobilebert.py │ ├── model_zoo_checksums │ │ ├── albert.txt │ │ ├── bart.txt │ │ ├── bert.txt │ │ ├── electra.txt │ │ ├── gpt2.txt │ │ ├── mobilebert.txt │ │ ├── mt5.txt │ │ ├── roberta.txt │ │ ├── t5.txt │ │ └── xlmr.txt │ ├── mt5.py │ ├── roberta.py │ ├── t5.py │ ├── transformer.py │ ├── transformer_xl.py │ └── xlmr.py │ ├── op.py │ ├── sequence_sampler.py │ ├── third_party │ ├── __init__.py │ ├── sentencepiece_model_pb2.py │ └── sentencepiece_pb2.py │ ├── torch │ ├── __init__.py │ ├── attention_cell.py │ ├── clib │ │ ├── 
amp_C_frontend.cpp │ │ ├── compat.h │ │ ├── multi_tensor_apply.cuh │ │ ├── multi_tensor_l2norm_kernel.cu │ │ ├── multi_tensor_lans.cu │ │ └── type_shim.h │ ├── data │ │ ├── __init__.py │ │ └── batchify.py │ ├── layers.py │ ├── models │ │ ├── __init__.py │ │ ├── bert.py │ │ └── transformer.py │ ├── optimizers │ │ ├── __init__.py │ │ ├── fused_lans.py │ │ └── schedules.py │ └── utils.py │ └── utils │ ├── __init__.py │ ├── config.py │ ├── lazy_imports.py │ ├── misc.py │ ├── parameter.py │ ├── preprocessing.py │ ├── registry.py │ ├── shm.py │ ├── testing.py │ └── tvm_utils.py ├── tests ├── README.md ├── data_cli │ ├── test_glue.py │ └── test_wikipedia.py ├── process_cli │ ├── data │ │ ├── wmt19-test-de-en.de │ │ ├── wmt19-test-de-en.en │ │ └── wmt19-test-zh-en.zh.jieba │ ├── test_average_checkpoint.py │ └── test_learn_apply_subword.py ├── test_attention_cell.py ├── test_data_batchify.py ├── test_data_filtering.py ├── test_data_loading.py ├── test_data_sampler.py ├── test_data_tokenizers.py ├── test_data_vocab.py ├── test_embedding.py ├── test_gluon_block.py ├── test_initializer.py ├── test_layers.py ├── test_loss.py ├── test_models.py ├── test_models_albert.py ├── test_models_bart.py ├── test_models_bert.py ├── test_models_electra.py ├── test_models_gpt2.py ├── test_models_mobilebert.py ├── test_models_mt5.py ├── test_models_roberta.py ├── test_models_t5.py ├── test_models_transformer.py ├── test_models_transformer_xl.py ├── test_models_xlmr.py ├── test_op.py ├── test_pytest.py ├── test_sequence_sampler.py ├── test_utils_misc.py ├── test_utils_parameter.py ├── test_utils_preprocessing.py ├── test_utils_registry.py └── torch │ ├── test_attention_cell_torch.py │ ├── test_bert_torch.py │ └── test_layers_torch.py └── tools ├── batch ├── README.md ├── backbone_benchmark │ └── run_batch_backbone_benchmark.sh ├── batch_states │ ├── compile_notebooks.sh │ ├── test.sh │ └── test_data_pipeline.sh ├── hello_world.py ├── question_answering │ ├── parse_squad_results.py │ └── run_batch_squad.sh ├── run_batch_conversion.sh ├── submit-job.py ├── sync_batch_result.sh └── wait-job.py ├── diagnose.py └── docker ├── README.md ├── devel_entrypoint.sh ├── gluon_nlp_job.sh ├── install ├── install_horovod.sh ├── install_jupyter_lab.sh ├── install_llvm.sh ├── install_openmpi.sh ├── install_python_packages.sh ├── install_tvm_cpu.sh ├── install_tvm_gpu.sh └── install_ubuntu18.04_core.sh ├── start_jupyter.sh ├── ubuntu18.04-cpu.Dockerfile └── ubuntu18.04-gpu.Dockerfile /.coveragerc: -------------------------------------------------------------------------------- 1 | # .coveragerc to control coverage.py 2 | [run] 3 | omit = 4 | tests/* 5 | scripts/* 6 | concurrency = 7 | multiprocessing 8 | thread 9 | 10 | [report] 11 | ignore_errors = True 12 | 13 | [html] 14 | directory = coverage_html_report 15 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 100 3 | max-complexity = 18 4 | exclude = tests,__init__.py 5 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: 'bug' 6 | assignees: '' 7 | 8 | --- 9 | ## Description 10 | (A clear and concise description of what the bug is.) 
11 | 12 | ### Error Message 13 | (Paste the complete error message, including stack trace.) 14 | 15 | ## To Reproduce 16 | (If you developed your own code, please provide a short script that reproduces the error. For existing examples, please provide a link.) 17 | 18 | ### Steps to reproduce 19 | (Paste the commands you ran that produced the error.) 20 | 21 | 1. 22 | 2. 23 | 24 | ## What have you tried to solve it? 25 | 26 | 1. 27 | 2. 28 | 29 | ## Environment 30 | 31 | We recommend using our script for collecting the diagnostic information. Run the following command and paste the outputs below: 32 | ``` 33 | curl --retry 10 -s https://raw.githubusercontent.com/dmlc/gluon-nlp/master/tools/diagnose.py | python 34 | 35 | # paste outputs here 36 | ``` 37 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: 'enhancement' 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## Description 11 | (A clear and concise description of what the feature is.) 12 | - If the proposal is about a new model, provide a description of what the model is. 13 | - If the proposal is about an API, provide mock examples if possible. 14 | 15 | ## References 16 | - list references and related literature 17 | - list known implementations 18 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Description ## 2 | (Brief description of what this PR is about) 3 | 4 | ## Checklist ## 5 | ### Essentials ### 6 | - [ ] PR's title starts with a category (e.g. [BUGFIX], [MODEL], [TUTORIAL], [FEATURE], [DOC], etc.) 7 | - [ ] Changes are complete (i.e. I finished coding on this PR) 8 | - [ ] All changes have test coverage 9 | - [ ] Code is well-documented 10 | 11 | ### Changes ### 12 | - [ ] Feature1, tests, (and when applicable, API doc) 13 | - [ ] Feature2, tests, (and when applicable, API doc) 14 | 15 | ## Comments ## 16 | - If this change is a backward-incompatible change, why must this change be made?
17 | - Interesting edge cases to note here 18 | 19 | cc @dmlc/gluon-nlp-team 20 | -------------------------------------------------------------------------------- /.github/workflows/data-pipeline.yml: -------------------------------------------------------------------------------- 1 | name: data pipeline end-to-end 2 | 3 | on: 4 | schedule: 5 | - cron: '00 18 * * *' # At 18:00 UTC every day; see https://crontab.guru/ 6 | 7 | defaults: 8 | run: 9 | shell: bash 10 | 11 | jobs: 12 | unittest: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | fail-fast: false 16 | matrix: 17 | python-version: [ '3.7' ] 18 | steps: 19 | - name: Checkout repository 20 | uses: actions/checkout@v2 21 | 22 | - name: Install Other Dependencies 23 | run: | 24 | python -m pip install --upgrade pip 25 | python -m pip install boto3 26 | 27 | - name: Configure AWS Credentials 28 | uses: aws-actions/configure-aws-credentials@v1 29 | with: 30 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 31 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 32 | aws-region: us-east-1 33 | 34 | - name: Test Data Pipeline on AWS Batch 35 | run: | 36 | python ./tools/batch/submit-job.py --region us-east-1 \ 37 | --job-type c5n.4x \ 38 | --source-ref ${{ github.ref }} \ 39 | --work-dir tools/batch/batch_states \ 40 | --remote https://github.com/${{ github.repository }} \ 41 | --command "bash test_data_pipeline.sh" --wait 42 | -------------------------------------------------------------------------------- /.github/workflows/nightly-test.yml: -------------------------------------------------------------------------------- 1 | name: nightly test 2 | 3 | on: 4 | schedule: 5 | - cron: '30 23 * * *' # At 23:30 UTC every day; see https://crontab.guru/ 6 | 7 | defaults: 8 | run: 9 | shell: bash 10 | 11 | jobs: 12 | unittest: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | fail-fast: false 16 | matrix: 17 | python-version: [ '3.7' ] 18 | steps: 19 | - name: Checkout repository 20 | uses: actions/checkout@v2 21 | 22 | - name: Install Other Dependencies 23 | run: | 24 | python -m pip install --upgrade pip 25 | python -m pip install boto3 26 | 27 | - name: Configure AWS Credentials 28 | uses: aws-actions/configure-aws-credentials@v1 29 | with: 30 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 31 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 32 | aws-region: us-east-1 33 | 34 | - name: Test GluonNLP on the MXNet nightly release 35 | run: | 36 | echo "Start submitting job" 37 | python ./tools/batch/submit-job.py --region us-east-1 \ 38 | --job-type g4dn.4x \ 39 | --name GluonNLP-Nightly-Test \ 40 | --source-ref ${{ github.ref }} \ 41 | --work-dir .
\ 42 | --remote https://github.com/${{ github.repository }} \ 43 | --command "python3 -m pip install pytest-forked \ 44 | && python3 -m pip install -U --pre 'mxnet-cu102>=2.0.0b20210418' -f https://dist.mxnet.io/python/cu102 \ 45 | && python3 -m pytest --forked --durations=50 --device="cpu" --device="gpu" --runslow ./tests/" \ 46 | --wait | tee batch_job.log 47 | 48 | - name: Upload Cloud Watch Log 49 | if: ${{ failure() || success() }} 50 | uses: actions/upload-artifact@v2 51 | with: 52 | name: Test_Log 53 | path: ./batch_job.log 54 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # pycharm 77 | .idea 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # dotenv 86 | .env 87 | 88 | # virtualenv 89 | .venv 90 | venv/ 91 | ENV/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # test data 107 | tests/data/ 108 | tests/data/embedding/ 109 | tests/data/my_embed/ 110 | tests/externaldata/ 111 | .pytest_cache 112 | 113 | # docs 114 | docs/html 115 | 116 | # release 117 | scripts/*.zip 118 | docs/tutorials/*.zip 119 | docs/tutorials/*/*.ipynb 120 | 121 | conda 122 | 123 | # temp files 124 | *.swp 125 | 126 | # vscode 127 | .vscode 128 | 129 | # Mac 130 | .DS_Store 131 | 132 | # license checker 133 | ci/rat/apache-rat/ 134 | ci/rat/apache-rat.jar 135 | -------------------------------------------------------------------------------- /.pytype.cfg: -------------------------------------------------------------------------------- 1 | # NOTE: All relative paths are relative to the location of this file. 2 | [pytype] 3 | # Space-separated list of files or directories to process. 4 | inputs = 5 | src/gluonnlp 6 | 7 | # Python version (major.minor) of the target code. 
8 | python_version = 3.6 9 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Watchers and contributors to DMLC GluonNLP repo directories/packages/files 2 | # Please see documentation of use of CODEOWNERS file at 3 | # https://help.github.com/articles/about-codeowners/ and 4 | # https://github.com/blog/2392-introducing-code-owners 5 | # 6 | # Anybody can add themselves or a team as an additional watcher or contributor 7 | # to get notified about changes in a specific package. 8 | # See https://help.github.com/articles/about-teams for how to set up teams. 9 | 10 | 11 | # Global owners 12 | * @dmlc/gluon-nlp-committers @dmlc/gluon-nlp-reviewers 13 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | doxygen 2 | _build 3 | gen_modules 4 | tutorials 5 | doctrees 6 | -------------------------------------------------------------------------------- /docs/.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/docs/.nojekyll -------------------------------------------------------------------------------- /docs/404.rst: -------------------------------------------------------------------------------- 1 | :orphan: 2 | 3 | Page Not Found 4 | -------------- 5 | 6 | You stumbled upon a page that's making us scratch our brains right now. Before any of us panics, 7 | we'd like you to know that you are being redirected to a better-known and cozy place, in just a few seconds. 8 | 9 | .. image:: _static/404.jpg 10 | :alt: Page Not Found 11 | :width: 60% 12 | :align: center 13 | :target: ./index.html 14 | 15 | .. raw:: html 16 | 17 | 26 | -------------------------------------------------------------------------------- /docs/README.txt: -------------------------------------------------------------------------------- 1 | The documentation of GluonNLP is generated with recommonmark and sphinx.
2 | 3 | - pip install sphinx>=1.5.5 sphinx-gallery sphinx_rtd_theme matplotlib Image recommonmark 4 | 5 | For more details, refer to [website/configuration.rst](website/configuration.rst) 6 | -------------------------------------------------------------------------------- /docs/_static/404.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/docs/_static/404.jpg -------------------------------------------------------------------------------- /docs/_static/custom.css: -------------------------------------------------------------------------------- 1 | .Logos { 2 | display: inline; 3 | margin: 1em; 4 | max-width: 120px; 5 | } 6 | 7 | .install { 8 | max-width: 800px; 9 | } 10 | .install .title { 11 | display: inline-block; 12 | min-width: 100px; 13 | text-transform: uppercase; 14 | font-size: 90%; 15 | color: #555; 16 | } 17 | 18 | .install .option { 19 | margin: 5px; 20 | } 21 | 22 | @media (max-width: 650px) { 23 | .install .option, .install .title { 24 | width: 90%; 25 | } 26 | 27 | .install .title { 28 | margin-top: 1em; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /docs/_static/gluon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/docs/_static/gluon.ico -------------------------------------------------------------------------------- /docs/_static/google_analytics.js: -------------------------------------------------------------------------------- 1 | (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ 2 | (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), 3 | m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) 4 | })(window,document,'script','https://www.google-analytics.com/analytics.js','ga'); 5 | 6 | ga('create', 'UA-96378503-8', 'auto'); 7 | ga('send', 'pageview'); 8 | -------------------------------------------------------------------------------- /docs/_static/hidebib.js: -------------------------------------------------------------------------------- 1 | // adapted from: http://www.robots.ox.ac.uk/~vedaldi/assets/hidebib.js 2 | function hideallbibs() 3 | { 4 | var el = document.getElementsByTagName("div") ; 5 | for (var i = 0 ; i < el.length ; ++i) { 6 | if (el[i].className == "paper") { 7 | var bib = el[i].getElementsByTagName("pre") ; 8 | if (bib.length > 0) { 9 | bib [0] .style.display = 'none' ; 10 | } 11 | } 12 | } 13 | } 14 | 15 | function togglebib(paperid) 16 | { 17 | var paper = document.getElementById(paperid) ; 18 | var bib = paper.getElementsByTagName('pre') ; 19 | if (bib.length > 0) { 20 | if (bib [0] .style.display == 'none') { 21 | bib [0] .style.display = 'block' ; 22 | } else { 23 | bib [0] .style.display = 'none' ; 24 | } 25 | } 26 | } 27 | 28 | function toggleblock(blockId) 29 | { 30 | var block = document.getElementById(blockId); 31 | if (block.style.display == 'none') { 32 | block.style.display = 'block' ; 33 | } else { 34 | block.style.display = 'none' ; 35 | } 36 | } 37 | 38 | function hideblock(blockId) 39 | { 40 | var block = document.getElementById(blockId); 41 | block.style.display = 'none' ; 42 | } 43 | -------------------------------------------------------------------------------- /docs/_static/install-options.js: 
-------------------------------------------------------------------------------- 1 | $(document).ready(function () { 2 | 3 | function label(lbl) { 4 | return $.trim(lbl.replace(/[ .]/g, '-').replace('+-', '').toLowerCase()); 5 | } 6 | 7 | // a hack: macos doesn't support cuda, so disable all cuda options when it 8 | // is selected. 9 | function disableCuda() { 10 | $('.install .option').each(function(){ 11 | if (label($(this).text()).indexOf("cuda") != -1) { 12 | $(this).addClass('disabled'); 13 | } 14 | }); 15 | } 16 | function enableCuda() { 17 | $('.install .option').each(function(){ 18 | if (label($(this).text()).indexOf("cuda") != -1) { 19 | $(this).removeClass('disabled'); 20 | } 21 | }); 22 | } 23 | 24 | // find the user os, and set the according option to active 25 | function setActiveOSButton() { 26 | var os = "linux" 27 | var agent = window.navigator.userAgent.toLowerCase(); 28 | if (agent.indexOf("win") != -1) { 29 | os = "windows" 30 | } else if (agent.indexOf("mac") != -1) { 31 | os = "macos" 32 | } 33 | if (os == "macos") { 34 | disableCuda(); 35 | } 36 | $('.install .option').each(function(){ 37 | if (label($(this).text()).indexOf(os) != -1) { 38 | $(this).addClass('active'); 39 | } 40 | }); 41 | } 42 | 43 | setActiveOSButton(); 44 | 45 | // apply theme 46 | function setTheme() { 47 | $('.opt-group .option').each(function(){ 48 | $(this).addClass('mdl-button mdl-js-button mdl-js-ripple-effect mdl-button--raised '); 49 | $(this).attr('id', label($(this).text())); 50 | }); 51 | $('.opt-group .active').each(function(){ 52 | $(this).addClass('mdl-button--colored'); 53 | }); 54 | } 55 | setTheme(); 56 | 57 | 58 | // show the command according to the active options 59 | function showCommand() { 60 | $('.opt-group .option').each(function(){ 61 | $('.'+label($(this).text())).hide(); 62 | // console.log('disable '+label($(this).text())); 63 | }); 64 | $('.opt-group .active').each(function(){ 65 | $('.'+label($(this).text())).show(); 66 | // console.log('enable '+label($(this).text())); 67 | }); 68 | } 69 | showCommand(); 70 | 71 | function setOptions() { 72 | var el = $(this); 73 | el.siblings().removeClass('active'); 74 | el.siblings().removeClass('mdl-button--colored'); 75 | el.addClass('active'); 76 | el.addClass('mdl-button--colored'); 77 | // console.log('enable'+el.text()) 78 | // console.log('disable'+el.siblings().text()) 79 | console.log($('.install #macos').hasClass('active') ) 80 | if ($('.install #macos').hasClass('active') == true) { 81 | disableCuda(); 82 | } else { 83 | enableCuda(); 84 | } 85 | showCommand(); 86 | } 87 | 88 | $('.opt-group').on('click', '.option', setOptions); 89 | 90 | }); 91 | -------------------------------------------------------------------------------- /docs/api/attention.rst: -------------------------------------------------------------------------------- 1 | gluonnlp.attention_cell 2 | ======================= 3 | 4 | GluonNLP Toolkit provides ways to implement the attention mechanism that is prevailing in NLP models. 5 | 6 | .. currentmodule:: gluonnlp.attention_cell 7 | 8 | Attention Mechanism 9 | ------------------- 10 | 11 | .. 
automodule:: gluonnlp.attention_cell 12 | :members: 13 | :imported-members: 14 | :special-members: __contains__, __getitem__, __setitem__ 15 | 16 | -------------------------------------------------------------------------------- /docs/api/data.rst: -------------------------------------------------------------------------------- 1 | gluonnlp.data 2 | ============= 3 | 4 | GluonNLP Toolkit provides tools for building efficient data pipelines for NLP tasks. 5 | 6 | .. currentmodule:: gluonnlp.data 7 | 8 | Tokenizers 9 | ---------- 10 | .. automodule:: gluonnlp.data.tokenizers 11 | :members: 12 | :imported-members: 13 | :special-members: __contains__, __getitem__, __setitem__ 14 | 15 | Vocabulary 16 | ---------- 17 | .. automodule:: gluonnlp.data.vocab 18 | :members: 19 | :imported-members: 20 | :special-members: __contains__, __getitem__, __setitem__ 21 | 22 | Batchify Function 23 | ----------------- 24 | .. automodule:: gluonnlp.data.batchify 25 | :members: 26 | 27 | Data Sampler 28 | ------------ 29 | .. automodule:: gluonnlp.data.sampler 30 | :members: 31 | :imported-members: 32 | 33 | Text Filtering 34 | -------------- 35 | .. automodule:: gluonnlp.data.filtering 36 | :members: 37 | :imported-members: 38 | 39 | Data Loading 40 | ------------ 41 | .. automodule:: gluonnlp.data.loading 42 | :members: 43 | :imported-members: 44 | -------------------------------------------------------------------------------- /docs/api/embedding.rst: -------------------------------------------------------------------------------- 1 | gluonnlp.embedding 2 | ================== 3 | 4 | GluonNLP Toolkit provides tools for working with embeddings. 5 | 6 | .. currentmodule:: gluonnlp.embedding 7 | 8 | This page describes the ``gluonnlp`` APIs for text embedding, such as loading 9 | pre-trained embedding vectors for text tokens and storing them in the 10 | ``numpy.ndarray`` format. 11 | 12 | 13 | Pre-trained Embeddings 14 | ---------------------- 15 | 16 | .. currentmodule:: gluonnlp.embedding 17 | .. autosummary:: 18 | :nosignatures: 19 | 20 | list_sources 21 | load_embeddings 22 | get_fasttext_model 23 | 24 | 25 | API Reference 26 | ------------- 27 | 28 | .. automodule:: gluonnlp.embedding 29 | :members: 30 | :imported-members: 31 | :special-members: __contains__, __getitem__, __setitem__ 32 | 33 | 34 | -------------------------------------------------------------------------------- /docs/api/index.rst: -------------------------------------------------------------------------------- 1 | API Documentation 2 | ================= 3 | 4 | .. toctree:: 5 | :maxdepth: 3 6 | 7 | data 8 | embedding 9 | models 10 | attention 11 | layers 12 | operators 13 | sequence_sampler 14 | utils 15 | -------------------------------------------------------------------------------- /docs/api/layers.rst: -------------------------------------------------------------------------------- 1 | gluonnlp.layers 2 | =============== 3 | 4 | GluonNLP Toolkit provides some common layers that can help you build NLP models. 5 | 6 | .. currentmodule:: gluonnlp.layers 7 | 8 | Layers 9 | ------ 10 | 11 | .. automodule:: gluonnlp.layers 12 | :members: 13 | :imported-members: 14 | :special-members: __contains__, __getitem__, __setitem__ 15 | 16 | -------------------------------------------------------------------------------- /docs/api/models.rst: -------------------------------------------------------------------------------- 1 | gluonnlp.models 2 | =============== 3 | 4 | GluonNLP Toolkit supplies models for common NLP tasks with pre-trained weights. 
By default, 5 | all requested pre-trained weights are downloaded from a public repo and stored in `~/.gluonnlp/models/`. 6 | 7 | .. currentmodule:: gluonnlp.models 8 | .. autosummary:: 9 | 10 | Models 11 | ------ 12 | .. automodule:: gluonnlp.models 13 | :members: 14 | :no-inherited-members: 15 | :imported-members: 16 | :special-members: __contains__, __getitem__, __setitem__ 17 | -------------------------------------------------------------------------------- /docs/api/operators.rst: -------------------------------------------------------------------------------- 1 | gluonnlp.op 2 | =============== 3 | 4 | GluonNLP Toolkit provides some functions that can help you build NLP architectures and training pipelines. 5 | 6 | .. currentmodule:: gluonnlp.op 7 | 8 | Operators 9 | --------- 10 | 11 | .. automodule:: gluonnlp.op 12 | :members: 13 | :imported-members: 14 | :special-members: __contains__, __getitem__, __setitem__ 15 | 16 | -------------------------------------------------------------------------------- /docs/api/sequence_sampler.rst: -------------------------------------------------------------------------------- 1 | gluonnlp.sequence_sampler 2 | ========================= 3 | 4 | GluonNLP Toolkit provides ways to sample from a sequence generator. 5 | 6 | .. currentmodule:: gluonnlp.sequence_sampler 7 | 8 | Sequence Sampler 9 | ---------------- 10 | 11 | .. automodule:: gluonnlp.sequence_sampler 12 | :members: 13 | :imported-members: 14 | :special-members: __contains__, __getitem__, __setitem__ 15 | 16 | -------------------------------------------------------------------------------- /docs/api/utils.rst: -------------------------------------------------------------------------------- 1 | gluonnlp.utils 2 | ============== 3 | 4 | GluonNLP Toolkit provides tools for easily setting up task-specific losses. 5 | 6 | .. currentmodule:: gluonnlp.utils 7 | 8 | API Reference 9 | ------------- 10 | 11 | .. automodule:: gluonnlp.utils 12 | :members: 13 | :imported-members: 14 | :special-members: __contains__, __getitem__, __setitem__ 15 | -------------------------------------------------------------------------------- /docs/examples.rst: -------------------------------------------------------------------------------- 1 | Examples 2 | -------- 3 | 4 | .. container:: cards 5 | 6 | .. card:: 7 | :title: Benchmarking the Performance of NLP Backbones 8 | :link: model_zoo/benchmarks/index.html 9 | 10 | Benchmarking the performance of NLP models. 11 | We provide a benchmarking script that compares different NLP packages. 12 | 13 | .. card:: 14 | :title: Conversion Scripts 15 | :link: model_zoo/conversion_toolkits/index.html 16 | 17 | Converting NLP models from other frameworks to GluonNLP. 18 | 19 | .. card:: 20 | :title: Datasets 21 | :link: model_zoo/datasets/index.html 22 | 23 | Examples of the datasets supported by `nlp_data`. 24 | 25 | .. card:: 26 | :title: Generation 27 | :link: model_zoo/generation/index.html 28 | 29 | An example of how to generate from a pretrained GPT-2 model with GluonNLP. 30 | We provide the generation script and compare different sampling methods. 31 | 32 | .. card:: 33 | :title: Machine Translation 34 | :link: model_zoo/machine_translation/index.html 35 | 36 | Train a machine translation model with GluonNLP. 37 | 38 | .. card:: 39 | :title: Data Preprocessing Toolkit in GluonNLP 40 | :link: model_zoo/processing/index.html 41 | 42 | An example of the data processing toolkit (`nlp_process`) offered in GluonNLP. 43 | 44 | .. 
card:: 45 | :title: Pretraining Model 46 | :link: model_zoo/pretraining/index.html 47 | 48 | Examples of pretraining your own backbones. 49 | 50 | .. card:: 51 | :title: Question Answering Examples 52 | :link: model_zoo/question_answering/index.html 53 | 54 | Run SQuAD 1.1 and 2.0 finetuning with GluonNLP. You will learn how to run the models with 55 | mixed-precision training (AMP) and Horovod. 56 | 57 | -------------------------------------------------------------------------------- /docs/genindex.rst: -------------------------------------------------------------------------------- 1 | Index 2 | ===== 3 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | GluonNLP: NLP made easy 2 | ======================= 3 | 4 | Get Started: A Quick Example 5 | ---------------------------- 6 | 7 | Here is a quick example that downloads and creates a word embedding model and then 8 | computes the cosine similarity between two words. 9 | 10 | (You can click the play button below to run this example.) 11 | 12 | .. container:: demo 13 | :name: frontpage-demo 14 | 15 | `Word Embedding `_ 16 | 17 | .. raw:: html 18 | 38 | 39 | 40 | .. include:: examples.rst 41 | 42 | And more in :doc:`tutorials `. 43 | 44 | 45 | .. include:: install.rst 46 | 47 | 48 | About GluonNLP 49 | -------------- 50 | 51 | .. hint:: 52 | 53 | You can find the documentation for our master development branch `here `_. 54 | 55 | GluonNLP provides implementations of the state-of-the-art (SOTA) deep learning 56 | models in NLP, and building blocks for text data pipelines and models. 57 | It is designed for engineers, researchers, and students to quickly prototype 58 | research ideas and products based on these models. This toolkit offers four main features: 59 | 60 | 1. Carefully designed APIs that greatly reduce the implementation complexity. 61 | 2. Pre-trained models for common NLP tasks. 62 | 3. Tutorials to help get started on new NLP tasks. 63 | 4. Community support. 64 | 65 | This toolkit assumes that users have basic knowledge about deep learning and 66 | NLP. Otherwise, please refer to an introductory course such as 67 | `Dive into Deep Learning `_ or 68 | `Stanford CS224n `_. 69 | If you are not familiar with Gluon, check out the `Gluon documentation 70 | `__. 71 | You may find the 60-min Gluon crash course linked from there especially helpful. 72 | 73 | 74 | .. toctree:: 75 | :hidden: 76 | :maxdepth: 2 77 | 78 | tutorials/index 79 | model_zoo/index 80 | api/index 81 | website/index 82 | genindex 83 | -------------------------------------------------------------------------------- /docs/install.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ------------ 3 | 4 | .. Ignore prerequisites to make the index page concise, which will be shown at 5 | the install page 6 | 7 | .. raw:: html 8 | 9 | 10 | 11 | .. include:: install/install-include.rst 12 | 13 | .. raw:: html 14 | 15 | 16 | 17 | 18 | Check :doc:`install/install-more` for more installation instructions and options. 19 | -------------------------------------------------------------------------------- /docs/install/install-more.rst: -------------------------------------------------------------------------------- 1 | :orphan: 2 | 3 | Installation 4 | ------------ 5 | 6 | .. include:: install-include.rst 7 | 8 | .. 
raw:: html 9 | 10 | 11 | 12 | 13 | 14 | Next steps 15 | ---------- 16 | 17 | - Check out Apache MXNet `Get Started `_ for more options such as ARM devices and Docker images. 18 | - `Verify your MXNet installation `_ 19 | - `Configure MXNet environment variables `_ 20 | - For new users: MXNet `Crash Course `_ and `other tutorials `_. 21 | - For experienced users: `Packages & Modules `_ and `Performance tips `_. 22 | - For advanced users: Apache MXNet `API `_ and `GluonNLP API <../api/index.html>`_. 23 | 24 | .. 25 | TODO: write a new directive `no-local-toc` for it 26 | 27 | .. raw:: html 28 | 29 | 30 | -------------------------------------------------------------------------------- /docs/md2ipynb.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import time 4 | 5 | import nbformat 6 | import notedown 7 | 8 | parser = argparse.ArgumentParser(description='Convert a Markdown file to an ipynb file.') 9 | parser.add_argument('input', help='input.md', type=str) 10 | parser.add_argument('-d', '--disable_compute', 11 | help='Disable computing python scripts', action="store_true") 12 | args = parser.parse_args() 13 | 14 | # timeout for each notebook, in sec 15 | timeout = 90 * 60 16 | 17 | # files in this list will be skipped during execution 18 | ignore_execution = [] 19 | 20 | # Change working directory to directory of input file 21 | input_dir, input_fn = os.path.split(args.input) 22 | if input_dir: 23 | os.chdir(input_dir) 24 | 25 | output_fn = '.'.join(input_fn.split('.')[:-1] + ['ipynb']) 26 | 27 | reader = notedown.MarkdownReader() 28 | 29 | # read 30 | with open(input_fn, encoding='utf-8', mode='r') as f: 31 | notebook = reader.read(f) 32 | 33 | if not any([i in input_fn for i in ignore_execution]): 34 | tic = time.time() 35 | if not args.disable_compute: 36 | notedown.run(notebook, timeout) 37 | print('=== Finished evaluation in %f sec' % (time.time() - tic)) 38 | 39 | # write 40 | # need to add language info for syntax highlighting 41 | notebook['metadata'].update({'language_info': {'name': 'ipython', 'version': 3}}) 42 | 43 | notebook_json = nbformat.writes(notebook) 44 | 45 | with open(output_fn, encoding='utf-8', mode='w') as f: 46 | f.write(notebook_json) 47 | -------------------------------------------------------------------------------- /docs/model_zoo: -------------------------------------------------------------------------------- 1 | ../scripts -------------------------------------------------------------------------------- /docs/tutorials/deep_learning_compiler/index.rst: -------------------------------------------------------------------------------- 1 | Compile NLP Models 2 | ================== 3 | 4 | .. container:: cards 5 | 6 | .. card:: 7 | :title: Compile and accelerate NLP models via TVM 8 | :link: tvm_basic.html 9 | 10 | A basic example of how to use TVM to compile backbone models in GluonNLP. 11 | 12 | .. toctree:: 13 | :hidden: 14 | :maxdepth: 2 15 | 16 | tvm_basic.ipynb 17 | -------------------------------------------------------------------------------- /docs/tutorials/index.rst: -------------------------------------------------------------------------------- 1 | Tutorials 2 | ========= 3 | 4 | Interested in getting started in a new NLP area? Here are some tutorials to help get started. 5 | 6 | 7 | Embedding 8 | ----------------------- 9 | 10 | .. container:: cards 11 | 12 | .. 
card:: 13 | :title: Using Pre-trained Word Embeddings 14 | :link: word_embedding/word_embedding.html 15 | 16 | Basics on how to use word embeddings with a vocab in GluonNLP and apply them to word similarity and 17 | analogy problems. 18 | 19 | 20 | Text Prediction 21 | ----------------------- 22 | 23 | .. container:: cards 24 | 25 | .. card:: 26 | :title: Text Prediction Part1 27 | :link: text_prediction/text_prediction_part1.html 28 | 29 | Load pretrained NLP backbones. 30 | 31 | .. card:: 32 | :title: Text Prediction Part2 33 | :link: text_prediction/text_prediction_part2.html 34 | 35 | An example that finetunes MobileBERT for sentiment analysis and sentence similarity. 36 | 37 | 38 | Question Answering 39 | ----------------------- 40 | 41 | .. container:: cards 42 | 43 | .. card:: 44 | :title: Question Answering with GluonNLP 45 | :link: question_answering/question_answering.html 46 | 47 | Learn how to build a model for Question Answering (QA) based on the backbones provided in GluonNLP. 48 | 49 | 50 | Tokenization 51 | ----------------------- 52 | 53 | .. container:: cards 54 | 55 | .. card:: 56 | :title: Tokenization Part1 57 | :link: tokenization/tokenization_part1.html 58 | 59 | The basic usage of tokenizers in GluonNLP. 60 | 61 | .. card:: 62 | :title: Tokenization Part2 63 | :link: tokenization/tokenization_part2.html 64 | 65 | Try out different subword learning algorithms. 66 | 67 | 68 | Using Pretrained Models 69 | ----------------------- 70 | 71 | .. container:: cards 72 | 73 | .. card:: 74 | :title: T5 for Masked Language Modeling 75 | :link: pretrained_models/pretrained_t5_mlm.html 76 | 77 | An example of using pretrained models in GluonNLP. 78 | 79 | 80 | Compiling NLP Models 81 | -------------------- 82 | 83 | .. container:: cards 84 | 85 | .. card:: 86 | :title: Compile and accelerate NLP models via TVM 87 | :link: deep_learning_compiler/tvm_basic.html 88 | 89 | A basic example of how to use TVM to compile backbone models in GluonNLP. 90 | 91 | 92 | .. toctree:: 93 | :hidden: 94 | :maxdepth: 2 95 | 96 | word_embedding/index 97 | text_prediction/index 98 | question_answering/index 99 | tokenization/index 100 | pretrained_models/index 101 | deep_learning_compiler/index 102 | -------------------------------------------------------------------------------- /docs/tutorials/pretrained_models/index.rst: -------------------------------------------------------------------------------- 1 | Using Pretrained Models 2 | ======================= 3 | 4 | .. container:: cards 5 | 6 | .. card:: 7 | :title: T5 for Masked Language Modeling 8 | :link: pretrained_t5_mlm.html 9 | 10 | Use a pretrained T5 for MLM with noise spans. 11 | 12 | .. toctree:: 13 | :hidden: 14 | :maxdepth: 2 15 | 16 | pretrained_t5_mlm.ipynb 17 | -------------------------------------------------------------------------------- /docs/tutorials/pretrained_models/pretraining_objectives.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/docs/tutorials/pretrained_models/pretraining_objectives.png -------------------------------------------------------------------------------- /docs/tutorials/question_answering/index.rst: -------------------------------------------------------------------------------- 1 | Question Answering 2 | ======================= 3 | 4 | .. container:: cards 5 | 6 | .. 
card:: 7 | :title: Question Answering with GluonNLP 8 | :link: question_answering.html 9 | 10 | Learn how to build a model for Question Answering (QA) based on the backbones provided in GluonNLP. 11 | 12 | .. toctree:: 13 | :hidden: 14 | :maxdepth: 2 15 | 16 | question_answering.ipynb 17 | -------------------------------------------------------------------------------- /docs/tutorials/question_answering/offsets_match.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/docs/tutorials/question_answering/offsets_match.png -------------------------------------------------------------------------------- /docs/tutorials/question_answering/qa1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/docs/tutorials/question_answering/qa1.png -------------------------------------------------------------------------------- /docs/tutorials/question_answering/qa2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/docs/tutorials/question_answering/qa2.png -------------------------------------------------------------------------------- /docs/tutorials/text_prediction/bert_illustration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/docs/tutorials/text_prediction/bert_illustration.png -------------------------------------------------------------------------------- /docs/tutorials/text_prediction/index.rst: -------------------------------------------------------------------------------- 1 | Text Prediction 2 | ======================= 3 | 4 | .. container:: cards 5 | 6 | .. card:: 7 | :title: Text Prediction Part1 8 | :link: text_prediction_part1.html 9 | 10 | Load pretrained NLP backbones. 11 | 12 | .. card:: 13 | :title: Text Prediction Part2 14 | :link: text_prediction_part2.html 15 | 16 | An example that finetunes MobileBERT for sentiment analysis and sentence similarity. 17 | 18 | 19 | .. toctree:: 20 | :hidden: 21 | :maxdepth: 2 22 | 23 | text_prediction_part1.ipynb 24 | text_prediction_part2.ipynb 25 | -------------------------------------------------------------------------------- /docs/tutorials/text_prediction/merge_input.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/docs/tutorials/text_prediction/merge_input.png -------------------------------------------------------------------------------- /docs/tutorials/tokenization/index.rst: -------------------------------------------------------------------------------- 1 | Tokenization 2 | ======================= 3 | 4 | .. container:: cards 5 | 6 | .. card:: 7 | :title: Tokenization Part1 8 | :link: tokenization_part1.html 9 | 10 | The basic usage of tokenizers in GluonNLP. 11 | 12 | 13 | .. card:: 14 | :title: Tokenization Part2 15 | :link: tokenization_part2.html 16 | 17 | Try out different subword learning algorithms. 18 | 19 | 20 | .. card:: 21 | :title: Tokenization Part3 22 | :link: tokenization_part3.html 23 | 24 | A tutorial that downloads Wikipedia data and learns a subword model. 25 | 26 | 27 | .. 
toctree:: 28 | :hidden: 29 | :maxdepth: 2 30 | 31 | tokenization_part1.ipynb 32 | tokenization_part2.ipynb 33 | tokenization_part3.ipynb 34 | -------------------------------------------------------------------------------- /docs/tutorials/tokenization/tokenization_part3.md: -------------------------------------------------------------------------------- 1 | # Part3: Download Data from Wikipedia and Learn Subword 2 | 3 | In this tutorial, we will download the Wikipedia classical Chinese dataset with `nlp_data` and learn a custom sentencepiece vocabulary. 4 | 5 | ## Download Data 6 | 7 | ```{.shell .input} 8 | !nlp_data prepare_wikipedia --mode download+format --lang zh-classical --date latest --quiet -o wiki_zh_classical 9 | ``` 10 | 11 | To save time, we will use the first 10000 sentences for training the subword model. 12 | 13 | 14 | ```{.shell .input} 15 | !head -10000 wiki_zh_classical/prepared_wikipedia/wikipedia-prepared-0000.txt > train_corpus.txt 16 | ``` 17 | 18 | ```{.shell .input} 19 | !nlp_process learn_subword --model spm --corpus train_corpus.txt --vocab-size 10000 \ 20 | --disable-bos --disable-eos \ 21 | --custom-special-tokens "cls_token=[CLS]" "sep_token=[SEP]" "mask_token=[MASK]" 22 | ``` 23 | 24 | The model files are saved in the "spm" folder. 25 | 26 | ```{.shell .input} 27 | !ls spm 28 | ``` 29 | 30 | ## Build the Tokenizer with the Saved Model 31 | 32 | 33 | ```{.python .input} 34 | import gluonnlp 35 | import json 36 | from gluonnlp.data.tokenizers import SentencepieceTokenizer 37 | tokenizer = SentencepieceTokenizer(model_path='spm/spm.model', vocab="spm/spm.vocab") 38 | print(tokenizer) 39 | print() 40 | print('The first 10 tokens in the vocabulary:') 41 | print('--------------------------------------') 42 | print(tokenizer.vocab.all_tokens[:10]) 43 | ``` 44 | 45 | You can use the tokenizer directly. 46 | 47 | 48 | ```{.python .input} 49 | tokenizer.encode('賈夫人仙逝揚州城 ·') 50 | ``` 51 | 52 | 53 | ```{.python .input} 54 | tokenizer.encode_with_offsets('賈夫人仙逝揚州城 ·') 55 | ``` 56 | 57 | ## Explore More Options 58 | 59 | To explore more options, you may check the README. 60 | 61 | 62 | ```{.shell .input} 63 | !nlp_process learn_subword --help 64 | ``` 65 | -------------------------------------------------------------------------------- /docs/tutorials/word_embedding/index.rst: -------------------------------------------------------------------------------- 1 | Representation Learning 2 | ======================= 3 | 4 | .. container:: cards 5 | 6 | .. card:: 7 | :title: Using Pre-trained Word Embeddings 8 | :link: word_embedding.html 9 | 10 | Basics on how to use word embeddings with a vocab in GluonNLP and apply them to word similarity and 11 | analogy problems. 12 | 13 | 14 | .. toctree:: 15 | :hidden: 16 | :maxdepth: 2 17 | 18 | word_embedding.ipynb 19 | -------------------------------------------------------------------------------- /docs/website/index.rst: -------------------------------------------------------------------------------- 1 | Community 2 | ========= 3 | 4 | .. card:: 5 | :title: Community 6 | :is_head: true 7 | :link: https://www.apache.org/foundation/policies/conduct 8 | 9 | Welcome to the GluonNLP community. We strive to foster a collaborative and welcoming community. We 10 | expect all members to follow the `code of conduct `__. 11 | 12 | 13 | .. container:: cards 14 | 15 | .. card:: 16 | :title: GitHub Issues 17 | :link: https://github.com/dmlc/gluon-nlp/issues 18 | 19 | Feature requests, bug reports, design and roadmap discussion. 20 | 21 | .. 
card:: 22 | :title: GluonNLP Slack Channel 23 | :link: https://apache-mxnet.slack.com/messages/CCCDM10V9 24 | 25 | #gluon-nlp Slack channel. Click the `sign-up link `_ to register. 26 | 27 | 28 | Interested in contributing to GluonNLP? Check our contribution guide: 29 | 30 | .. toctree:: 31 | :maxdepth: 3 32 | 33 | contribute 34 | git 35 | release 36 | configuration -------------------------------------------------------------------------------- /docs/website/release.rst: -------------------------------------------------------------------------------- 1 | Release Checklist 2 | ================= 3 | 4 | Below is the checklist for releasing a new minor version of GluonNLP: 5 | 6 | - Create a new release branch $major.$minor.x with commits from the master branch 7 | - Bump the version in the master branch to $major.$minor+1.$patch.dev 8 | - Bump the version in the release branch to $major.$minor.$patch 9 | - Update the installation-from-source instructions in the release branch 10 | - Draft the release notes, highlighting important events/models/features as well as breaking changes 11 | - Publish the release on GitHub, creating a tag $major.$minor.$patch 12 | - Check the content at http://gluon-nlp.mxnet.io/$major.$minor.x/index.html 13 | - Upload and refresh the default version website 14 | - Prepare the pip package 15 | - Make the announcement (Twitter, etc.) 16 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | markers = 3 | seed: set the python, numpy and mxnet random seeds to a specified value for test reproducibility 4 | serial: mark a test that requires more resources to run and is thus only suitable for a serial run. 5 | remote_required: mark a test that requires internet access. 6 | gpu: mark a test that requires GPU. 7 | integration: mark an integration test 8 | skip_master: mark a test that is temporarily skipped for mxnet master validation. 9 | -------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/scripts/__init__.py -------------------------------------------------------------------------------- /scripts/benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarking the Performance of NLP Backbones 2 | 3 | We benchmark the latency and peak memory usage of a single training (forward + backward) and inference (forward-only) step 4 | of the NLP backbones. 5 | For comparison, we also provide the numbers for the corresponding models in HuggingFace. 6 | 7 | ## Backbones in HuggingFace 8 | 9 | We use the [huggingface benchmark](https://github.com/huggingface/transformers/tree/master/examples/benchmarking) 10 | to benchmark the training + inference speed of common workloads in NLP. 11 | 12 | ```bash 13 | python3 -m pip install -U -r requirements.txt 14 | python3 benchmark_hf.py 15 | ``` 16 | 17 | It will generate a list of csv files: 18 | 19 | ``` 20 | ├── pytorch_train_fp32.csv 21 | ├── pytorch_train_fp16.csv 22 | ├── pytorch_infer_fp32.csv 23 | ├── pytorch_infer_fp16.csv 24 | ├── pytorch_infer_fp32_ts.csv 25 | ``` 26 | 27 | ## GluonNLP Backbones based on MXNet-2.0 28 | 29 | We profile three options: `NT` layout, `NT` layout with `TN` layout as the compute layout, 30 | and `TN` layout.
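To make the layout options concrete: `NT` keeps the batch axis first, while `TN` keeps the time (sequence) axis first. Below is a minimal sketch of the shape convention only (plain NumPy purely for illustration; the actual backbones are constructed with the `--layout`/`--compute_layout` options shown in the scripts below, and nothing in this snippet is a GluonNLP API call):

```python
import numpy as np

batch_size, seq_length, units = 8, 128, 768

# NT layout: token ids are (batch, time); hidden states are (batch, time, units)
tokens_nt = np.zeros((batch_size, seq_length), dtype=np.int32)

# TN layout: token ids are (time, batch); hidden states are (time, batch, units)
tokens_tn = tokens_nt.T  # a transpose converts between the two layouts

# "NT layout with TN compute layout" means the model accepts NT inputs/outputs
# but transposes internally so the matmul-heavy layers run time-major.
```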
31 | 32 | ```bash 33 | python3 -m pip install -U -r requirements.txt 34 | bash benchmark_gluonnlp.sh 35 | ``` 36 | 37 | It will generate csv files with `gluonnlp_` as the prefix: 38 | ``` 39 | ├── gluonnlp_train_fp32_NT_NT.csv 40 | ├── gluonnlp_train_fp32_NT_TN.csv 41 | ├── gluonnlp_train_fp32_TN_TN.csv 42 | ├── gluonnlp_infer_fp32_NT_NT_tvm0.csv 43 | ├── gluonnlp_infer_fp32_NT_TN_tvm0.csv 44 | ├── gluonnlp_infer_fp32_TN_TN_tvm0.csv 45 | ``` 46 | 47 | ## GluonNLP + TVM for Inference 48 | 49 | Install TVM as described in https://tvm.apache.org/docs/install/index.html 50 | 51 | ```bash 52 | bash benchmark_gluonnlp_tvm.sh 53 | ``` 54 | 55 | ``` 56 | ├── gluonnlp_infer_fp32_NT_NT_tvm1.csv 57 | ├── gluonnlp_infer_fp32_NT_TN_tvm1.csv 58 | ├── gluonnlp_infer_fp32_TN_TN_tvm1.csv 59 | ``` 60 | 61 | ## Generate the Benchmark Report 62 | -------------------------------------------------------------------------------- /scripts/benchmarks/benchmark_gluonnlp.sh: -------------------------------------------------------------------------------- 1 | for mode in train inference 2 | do 3 | python3 benchmark_gluonnlp.py --layout NT --compute_layout NT --mode $mode 4 | done 5 | 6 | for mode in train inference 7 | do 8 | python3 benchmark_gluonnlp.py --layout NT --compute_layout TN --mode $mode 9 | done 10 | 11 | for mode in train inference 12 | do 13 | python3 benchmark_gluonnlp.py --layout TN --compute_layout TN --mode $mode 14 | done 15 | -------------------------------------------------------------------------------- /scripts/benchmarks/benchmark_gluonnlp_fp16.sh: -------------------------------------------------------------------------------- 1 | for mode in train inference 2 | do 3 | python3 benchmark_gluonnlp.py --layout NT --compute_layout NT --mode $mode --use_fp16 4 | done 5 | 6 | for mode in train inference 7 | do 8 | python3 benchmark_gluonnlp.py --layout NT --compute_layout TN --mode $mode --use_fp16 9 | done 10 | 11 | for mode in train inference 12 | do 13 | python3 benchmark_gluonnlp.py --layout TN --compute_layout TN --mode $mode --use_fp16 14 | done 15 | -------------------------------------------------------------------------------- /scripts/benchmarks/benchmark_gluonnlp_tvm.sh: -------------------------------------------------------------------------------- 1 | python3 benchmark_gluonnlp.py --layout NT --compute_layout NT --mode inference --use_tvm --instance_type g4 2 | python3 benchmark_gluonnlp.py --layout NT --compute_layout TN --mode inference --use_tvm --instance_type g4 3 | python3 benchmark_gluonnlp.py --layout TN --compute_layout TN --mode inference --use_tvm --instance_type g4 4 | -------------------------------------------------------------------------------- /scripts/benchmarks/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | py3nvml 3 | torch 4 | torchvision 5 | -------------------------------------------------------------------------------- /scripts/benchmarks/run_backbone_benchmark.sh: -------------------------------------------------------------------------------- 1 | python3 -m pip install -U -r requirements.txt 2 | python3 benchmark_hf.py 3 | bash benchmark_gluonnlp.sh 4 | bash benchmark_gluonnlp_fp16.sh 5 | -------------------------------------------------------------------------------- /scripts/classification/README.md: -------------------------------------------------------------------------------- 1 | # Finetune Classification 2 | ## Prepare datasets 3 | Use `nlp_data` to prepare the datasets first.
4 | ```bash 5 | nlp_data prepare_glue --benchmark glue -t cola 6 | ``` 7 | ## Finetune Scripts 8 | Then run the script to finetune: 9 | ```bash 10 | python train_classification.py \ 11 | --model_name google_en_uncased_bert_base \ 12 | --task_name cola \ 13 | --lr 2e-5 \ 14 | --batch_size 32 \ 15 | --do_train \ 16 | --do_eval \ 17 | --seed 7800 \ 18 | --epochs 10 \ 19 | --optimizer adamw \ 20 | --train_dir glue/cola/train.parquet \ 21 | --eval_dir glue/cola/dev.parquet \ 22 | --gpus 0 23 | ``` 24 | Alternatively, because some tasks (like MNLI) are slow to train, you can use Horovod to accelerate: 25 | ```bash 26 | horovodrun -np 4 -H localhost:4 python train_classification.py \ 27 | --comm_backend horovod \ 28 | --model_name google_en_uncased_bert_base \ 29 | --task_name mnli \ 30 | --lr 2e-4 \ 31 | --batch_size 32 \ 32 | --do_train \ 33 | --do_eval \ 34 | --epochs 5 \ 35 | --log_interval 500 \ 36 | --warmup_ratio 0.1 \ 37 | --optimizer adamw \ 38 | --train_dir glue/mnli/train.parquet \ 39 | --eval_dir glue/mnli/dev_matched.parquet \ 40 | --gpus 0,1,2,3 41 | ``` 42 | 43 | ## Some Results 44 | Here are some results along with their hyperparameters: 45 | 46 | | Task Name | Metric | Learning Rate | Batch Size | Seed | Epochs | Result | TensorBoard Dev | 47 | |-----------|-------------|---------------|--------------|---------|-------|------|-----| 48 | | SST | Accuracy | 2e-5 | 32 | 7800 | 5 | 93.23 | https://tensorboard.dev/experiment/eKVI0DC6SEWBbHzS8ZphNg/| 49 | | STS | Pearson Corr. | 2e-5 | 32 | 24 | 10 | 89.26 | https://tensorboard.dev/experiment/kPOnlNeiQ4W5EmFlkqjC6A/| 50 | | CoLA | Matthews Corr. | 2e-5 | 32 | 7800 | 10 | 59.23 | https://tensorboard.dev/experiment/33euRGh9SrW3p15JWgILnw/ | 51 | | RTE | Accuracy | 2e-5 | 32 | 1800 | 10 | 69.67 | https://tensorboard.dev/experiment/XjTxr5anRrC1LMukLJJQ3g/| 52 | | MRPC | Accuracy/F1 | 3e-5 | 32 | 7800 | 5 | 85.38/87.31 | https://tensorboard.dev/experiment/jEJFq2XXQ8SvCxt6eKIjwg/ | 53 | | MNLI | Accuracy(m/mm) | 2e-5 | 48 | 7800 | 5 | 84.90/85.10 | https://tensorboard.dev/experiment/CZQlOBedRQeTZwn5o5fbKQ/ | -------------------------------------------------------------------------------- /scripts/classification/classification.py: -------------------------------------------------------------------------------- 1 | import gluonnlp 2 | import numpy as np 3 | import mxnet as mx 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | from gluonnlp.data.sampler import SplitSampler 7 | from tqdm import tqdm 8 | from mxnet.gluon import nn 9 | from gluonnlp.models import get_backbone 10 | from gluonnlp.utils.parameter import clip_grad_global_norm 11 | from gluonnlp.utils.preprocessing import get_trimmed_lengths 12 | from gluonnlp.utils.misc import get_mxnet_visible_ctx, grouper, repeat 13 | from mxnet.gluon.data import batchify as bf 14 | from mxnet.gluon.data import DataLoader 15 | from mxnet.lr_scheduler import PolyScheduler 16 | from gluonnlp.utils import set_seed 17 | 18 | class TextPredictionNet(nn.HybridBlock): 19 | def __init__(self, backbone, output_size=2): 20 | super().__init__() 21 | self.backbone = backbone 22 | self.output_size = output_size 23 | self.out_proj = nn.Dense(in_units=backbone.units, 24 | units=self.output_size, 25 | flatten=False) 26 | 27 | 28 | def forward(self, data, token_types, valid_length): 29 | _, pooled_out = self.backbone(data, token_types, valid_length) 30 | out = self.out_proj(pooled_out) 31 | return out 32 | 33 | def initialize_with_pretrained_backbone(self, backbone_params_path,
ctx=None): 34 | self.backbone.load_parameters(backbone_params_path, ctx=ctx) 35 | self.out_proj.initialize(ctx=ctx) 36 | 37 | -------------------------------------------------------------------------------- /scripts/conversion_toolkits/README.md: -------------------------------------------------------------------------------- 1 | # Conversion Scripts 2 | 3 | In GluonNLP, we provide scripts to convert model checkpoints from other repositories to GluonNLP. 4 | 5 | At this stage, the model needs to be downloaded locally, and the conversion scripts accept only a local file directory as the argument; 6 | downloading directly from a URL is not supported. In addition, TensorFlow models that 7 | can be loaded as TF1 Hub modules or as TF2 SavedModels are both accepted, although the parameters of the masked 8 | language model are not provided in TF2 SavedModels in most cases, in which case 9 | the differences of these parameters are not tested after conversion. 10 | 11 | The testing step mentioned above is controlled by the flag `--test`, which enforces a maximum 12 | tolerance of 1e-3 between the Gluon model with converted weights and the original TensorFlow model. 13 | In addition, we can use a GPU in all conversion scripts by adding `--gpu 0`. 14 | 15 | For the RoBERTa, XLM-R and BART models, we rely on the master version of the [fairseq](https://github.com/pytorch/fairseq#requirements-and-installation) package, installed locally via `pip install git+https://github.com/pytorch/fairseq.git@master`. 16 | 17 | ## BERT 18 | Convert models from the [BERT LIST](https://tfhub.dev/google/collections/bert/1). 19 | 20 | You can use the script provided in [convert_bert.sh](convert_bert.sh). 21 | The following command gives you a rough idea of the process. 22 | 23 | ```bash 24 | bash convert_bert.sh 25 | ``` 26 | 27 | In the process, we download the configuration file `bert_config.json` from the [official repo](https://github.com/google-research/bert#pre-trained-models) 28 | and move it into `${case}_bert_${model}/assets/`. 29 | 30 | ## ALBERT 31 | You can use the command described in [convert_albert.sh](convert_albert.sh): 32 | ```bash 33 | bash convert_albert.sh 34 | ``` 35 | 36 | ## ELECTRA 37 | TF Hub does not currently host the ELECTRA model. 38 | Thus, you will need to clone the [electra repository](https://github.com/ZheyuYe/electra) 39 | and download the checkpoint; the parameters are converted from these local checkpoints. 40 | By running the following command, you can convert and verify the ELECTRA model with both the discriminator and the generator. 41 | 42 | Note: please set `--electra_path` to the cloned repository path if you'd like to use `convert_electra.py` directly.
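For reference, a direct invocation for the small model would look roughly like the following, mirroring what `convert_electra.sh` does (the sketch assumes the checkpoint was unzipped to `electra_small` and the repository cloned to `electra`):

```bash
# Paths are assumptions -- adjust to where you unzipped the checkpoint and cloned the repo.
python3 convert_electra.py --tf_model_path electra_small --electra_path electra --model_size small --test
```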
43 | 44 | ```bash 45 | bash convert_electra.sh 46 | ``` 47 | 48 | ## MobileBert 49 | ```bash 50 | bash convert_mobilebert.sh 51 | ``` 52 | 53 | ## RoBERTa 54 | ```bash 55 | bash convert_roberta.sh 56 | ``` 57 | 58 | ## XLM-R 59 | ```bash 60 | bash convert_xlmr.sh 61 | ``` 62 | 63 | ## BART 64 | ```bash 65 | bash convert_bart.sh 66 | ``` 67 | 68 | ## GPT-2 69 | ```bash 70 | bash convert_gpt2.sh 71 | ``` 72 | 73 | ## T5 74 | ```bash 75 | bash convert_t5.sh 76 | ``` 77 | -------------------------------------------------------------------------------- /scripts/conversion_toolkits/bert_base_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.1, 3 | "hidden_act": "gelu", 4 | "hidden_dropout_prob": 0.1, 5 | "hidden_size": 768, 6 | "initializer_range": 0.02, 7 | "intermediate_size": 3072, 8 | "max_position_embeddings": 512, 9 | "num_attention_heads": 12, 10 | "num_hidden_layers": 12, 11 | "type_vocab_size": 2, 12 | "vocab_size": 30522 13 | } 14 | -------------------------------------------------------------------------------- /scripts/conversion_toolkits/bert_large_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.1, 3 | "hidden_act": "gelu", 4 | "hidden_dropout_prob": 0.1, 5 | "hidden_size": 1024, 6 | "initializer_range": 0.02, 7 | "intermediate_size": 4096, 8 | "max_position_embeddings": 512, 9 | "num_attention_heads": 16, 10 | "num_hidden_layers": 24, 11 | "type_vocab_size": 2, 12 | "vocab_size": 30522 13 | } 14 | -------------------------------------------------------------------------------- /scripts/conversion_toolkits/convert_albert.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | 3 | python3 -m pip install tensorflow==1.15 --upgrade --user 4 | python3 -m pip install tensorflow_hub --upgrade --user 5 | export TF_FORCE_GPU_ALLOW_GROWTH="true" 6 | for model in base large xlarge xxlarge 7 | do 8 | hub_directory="google_albert_${model}_v2" 9 | mkdir -p ${hub_directory} 10 | wget "https://tfhub.dev/google/albert_${model}/3?tf-hub-format=compressed" -O "${hub_directory}.tar.gz" 11 | tar -xvf ${hub_directory}.tar.gz --directory ${hub_directory} 12 | python3 convert_tf_hub_model.py --tf_hub_model_path ${hub_directory} --model_type albert --test 13 | done 14 | -------------------------------------------------------------------------------- /scripts/conversion_toolkits/convert_bart.sh: -------------------------------------------------------------------------------- 1 | python3 -m pip install git+https://github.com/pytorch/fairseq.git@master --upgrade --user 2 | for model in base large 3 | do 4 | mkdir bart_${model} 5 | wget "https://dl.fbaipublicfiles.com/fairseq/models/bart.${model}.tar.gz" 6 | tar zxf bart.${model}.tar.gz --directory bart_${model} 7 | python3 convert_fairseq_bart.py --fairseq_model_path bart_${model}/bart.${model} --test 8 | done 9 | -------------------------------------------------------------------------------- /scripts/conversion_toolkits/convert_bert.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | 3 | python3 -m pip install 'tensorflow<3' --upgrade --user 4 | python3 -m pip install tensorflow_hub --upgrade --user 5 | export TF_FORCE_GPU_ALLOW_GROWTH="true" 6 | 7 | # Conversion for English Models 8 | for model in base large 9 | do 10 | for case in cased uncased 11 | do 12 | 
hub_directory="google_en_${case}_bert_${model}" 13 | mkdir -p ${hub_directory} 14 | if [ ${model} == base ];then 15 | url="https://tfhub.dev/google/bert_${case}_L-12_H-768_A-12/1?tf-hub-format=compressed" 16 | else 17 | url="https://tfhub.dev/google/bert_${case}_L-24_H-1024_A-16/1?tf-hub-format=compressed" 18 | fi 19 | wget ${url} -O "${hub_directory}.tar.gz" 20 | tar -xvf ${hub_directory}.tar.gz --directory ${hub_directory} 21 | cp bert_${model}_config.json ${hub_directory}/assets/ 22 | python3 convert_tf_hub_model.py --tf_hub_model_path ${hub_directory} --model_type bert --test 23 | done 24 | done 25 | 26 | # Conversion for Chinese Models 27 | url="https://tfhub.dev/tensorflow/bert_zh_L-12_H-768_A-12/2?tf-hub-format=compressed" 28 | hub_directory="google_zh_bert_base" 29 | mkdir -p ${hub_directory} 30 | wget ${url} -O "${hub_directory}.tar.gz" 31 | tar -xvf ${hub_directory}.tar.gz --directory ${hub_directory} 32 | cp bert_base_config.json ${hub_directory}/assets/ 33 | python3 convert_tf_hub_model.py --tf_hub_model_path ${hub_directory} --model_type bert --test 34 | 35 | # Conversion for Multi-lingual Models 36 | url="https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/2?tf-hub-format=compressed" 37 | hub_directory="google_multi_cased_bert_base" 38 | mkdir -p ${hub_directory} 39 | wget ${url} -O "${hub_directory}.tar.gz" 40 | tar -xvf ${hub_directory}.tar.gz --directory ${hub_directory} 41 | cp bert_base_config.json ${hub_directory}/assets/ 42 | python3 convert_tf_hub_model.py --tf_hub_model_path ${hub_directory} --model_type bert --test 43 | 44 | # Conversion for Whole-word-masking Models 45 | for case in cased uncased 46 | do 47 | hub_directory="google_en_${case}_bert_wwm_large" 48 | mkdir -p ${hub_directory} 49 | url="https://tfhub.dev/tensorflow/bert_en_wwm_${case}_L-24_H-1024_A-16/2?tf-hub-format=compressed" 50 | wget ${url} -O "${hub_directory}.tar.gz" 51 | tar -xvf ${hub_directory}.tar.gz --directory ${hub_directory} 52 | cp bert_large_config.json ${hub_directory}/assets/ 53 | python3 convert_tf_hub_model.py --tf_hub_model_path ${hub_directory} --model_type bert --test 54 | done 55 | -------------------------------------------------------------------------------- /scripts/conversion_toolkits/convert_bert_torch.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | 3 | python3 -m pip install 'tensorflow<3' --upgrade --user 4 | python3 -m pip install tensorflow_hub --upgrade --user 5 | export TF_FORCE_GPU_ALLOW_GROWTH="true" 6 | 7 | # Conversion for English Models 8 | for model in base large 9 | do 10 | for case in cased uncased 11 | do 12 | hub_directory="google_en_${case}_bert_${model}" 13 | mkdir -p ${hub_directory} 14 | if [ ${model} == base ];then 15 | url="https://tfhub.dev/google/bert_${case}_L-12_H-768_A-12/1?tf-hub-format=compressed" 16 | else 17 | url="https://tfhub.dev/google/bert_${case}_L-24_H-1024_A-16/1?tf-hub-format=compressed" 18 | fi 19 | wget ${url} -O "${hub_directory}.tar.gz" 20 | tar -xvf ${hub_directory}.tar.gz --directory ${hub_directory} 21 | cp bert_${model}_config.json ${hub_directory}/assets/ 22 | python3 convert_tf_hub_model.py --tf_hub_model_path ${hub_directory} --model_type bert --test --torch 23 | done 24 | done 25 | -------------------------------------------------------------------------------- /scripts/conversion_toolkits/convert_electra.sh: -------------------------------------------------------------------------------- 1 | python3 -m pip install tensorflow==1.15 --upgrade --user 2 | export 
TF_FORCE_GPU_ALLOW_GROWTH="true" 3 | git clone https://github.com/ZheyuYe/electra.git 4 | cd electra 5 | git checkout 923179410471f9e1820b3f0771c239e1752e4e18 6 | cd .. 7 | for model in small base large 8 | do 9 | wget https://storage.googleapis.com/electra-data/electra_${model}.zip 10 | unzip electra_${model}.zip 11 | python3 convert_electra.py --tf_model_path electra_${model} --electra_path electra --model_size ${model} --test 12 | done 13 | -------------------------------------------------------------------------------- /scripts/conversion_toolkits/convert_gpt2.sh: -------------------------------------------------------------------------------- 1 | python3 -m pip install tensorflow==1.15 --upgrade --user 2 | git clone https://github.com/openai/gpt-2.git gpt_2 3 | for model in 124M 355M 774M 1558M 4 | do 5 | python3 gpt_2/download_model.py ${model} 6 | mkdir gpt2_${model} 7 | CUDA_VISIBLE_DEVICES="" python3 convert_gpt2.py --tf_model_path models/${model} --save_dir gpt2_${model} --test 8 | done 9 | -------------------------------------------------------------------------------- /scripts/conversion_toolkits/convert_mobilebert.sh: -------------------------------------------------------------------------------- 1 | python3 -m pip install tensorflow==1.15 --upgrade --user 2 | export TF_FORCE_GPU_ALLOW_GROWTH="true" 3 | svn checkout https://github.com/google-research/google-research/trunk/mobilebert 4 | 5 | mkdir mobilebert_model 6 | url='https://storage.googleapis.com/cloud-tpu-checkpoints/mobilebert/uncased_L-24_H-128_B-512_A-4_F-4_OPT.tar.gz' 7 | wget ${url} -O "mobilebert.tar.gz" 8 | tar -xvf mobilebert.tar.gz --directory mobilebert_model 9 | python3 convert_mobilebert.py --tf_model_path mobilebert_model/mobilebert --mobilebert_dir mobilebert --test 10 | -------------------------------------------------------------------------------- /scripts/conversion_toolkits/convert_mt5.sh: -------------------------------------------------------------------------------- 1 | python3 -m pip install git+https://github.com/huggingface/transformers.git --upgrade 2 | for model in small base large xl xxl 3 | do 4 | dest_dir="google_mt5_${model}" 5 | mkdir ${dest_dir} 6 | python3 convert_mt5.py "google/mt5-${model}" ${dest_dir} --test 7 | done 8 | -------------------------------------------------------------------------------- /scripts/conversion_toolkits/convert_roberta.sh: -------------------------------------------------------------------------------- 1 | python3 -m pip install git+https://github.com/pytorch/fairseq.git@master --upgrade --user 2 | for model in base large 3 | do 4 | mkdir roberta_${model} 5 | wget "https://dl.fbaipublicfiles.com/fairseq/models/roberta.${model}.tar.gz" 6 | tar zxf roberta.${model}.tar.gz --directory roberta_${model} 7 | python3 convert_fairseq_roberta.py --fairseq_model_path roberta_${model}/roberta.${model} --test 8 | done 9 | -------------------------------------------------------------------------------- /scripts/conversion_toolkits/convert_t5.sh: -------------------------------------------------------------------------------- 1 | python3 -m pip install git+https://github.com/huggingface/transformers.git --upgrade 2 | for model in small base large 3B 11B 3 | do 4 | dest_dir="google_t5_${model}" 5 | mkdir ${dest_dir} 6 | python3 convert_t5.py "t5-${model,,}" ${dest_dir} --test 7 | done 8 | -------------------------------------------------------------------------------- /scripts/conversion_toolkits/convert_xlmr.sh: 
-------------------------------------------------------------------------------- 1 | python3 -m pip install fairseq==0.10.1 --upgrade --user 2 | for model in base large 3 | do 4 | mkdir xlmr_${model} 5 | wget "https://dl.fbaipublicfiles.com/fairseq/models/xlmr.${model}.tar.gz" 6 | tar zxf xlmr.${model}.tar.gz --directory xlmr_${model} 7 | python3 convert_fairseq_xlmr.py --fairseq_model_path xlmr_${model}/xlmr.${model} --model_size ${model} --test 8 | done 9 | -------------------------------------------------------------------------------- /scripts/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/scripts/datasets/__init__.py -------------------------------------------------------------------------------- /scripts/datasets/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import importlib 3 | import os 4 | 5 | SUBCOMMAND_DICT = dict() 6 | 7 | # Find all modules starting with `prepare_` 8 | CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__))) 9 | for root, dirs, files in os.walk(CURR_DIR, topdown=False): 10 | for name in files: 11 | if name.startswith('prepare_') and name.endswith('.py'): 12 | command = name[:-3] 13 | path = os.path.join(root, name) 14 | relpath = os.path.relpath(path, CURR_DIR)[:-3] 15 | if relpath.startswith(os.sep): 16 | relpath = relpath[len(os.sep):] 17 | subpackage = relpath.replace(os.sep, '.') 18 | SUBCOMMAND_DICT[command] = 'gluonnlp.cli.data.' + subpackage 19 | 20 | 21 | def cli_main(): 22 | parser = argparse.ArgumentParser( 23 | description='Built-in scripts for downloading and preparing the data in GluonNLP.', 24 | prog='nlp_data', add_help=False) 25 | parser.add_argument('command', type=str, 26 | choices=sorted(SUBCOMMAND_DICT.keys()) + ['help'], 27 | metavar='[subcommand]', 28 | help='The subcommand to use. ' 29 | 'Choices are {}.'.format(sorted(SUBCOMMAND_DICT.keys()) + ['help'])) 30 | args, other_args = parser.parse_known_args() 31 | if args.command == 'help': 32 | parser.print_help() 33 | else: 34 | mod = importlib.import_module(SUBCOMMAND_DICT[args.command]) 35 | parser = mod.get_parser() 36 | sub_args = parser.parse_args(other_args) 37 | mod.main(sub_args) 38 | 39 | 40 | if __name__ == '__main__': 41 | cli_main() 42 | -------------------------------------------------------------------------------- /scripts/datasets/general_nlp_benchmark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/scripts/datasets/general_nlp_benchmark/__init__.py -------------------------------------------------------------------------------- /scripts/datasets/language_modeling/README.md: -------------------------------------------------------------------------------- 1 | # Language Modeling Benchmark 2 | 3 | Prepare the language modeling benchmark datasets. 4 | To help reproduce results from the papers, we use 5 | the tokenized corpora as the training/validation/testing datasets.
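Since the corpora come pre-tokenized, downstream code can recover tokens with a plain whitespace split. A minimal sketch (the save path `wikitext2` and file name `wiki.train.tokens` below are assumptions; check where the commands that follow actually place the files):

```python
# Minimal sketch: stream a prepared split and split each line on whitespace.
# The path below is an assumption -- adjust it to wherever `nlp_data prepare_lm`
# placed the files on your machine.
from itertools import islice

with open('wikitext2/wiki.train.tokens', encoding='utf-8') as f:
    for line in islice(f, 5):      # peek at the first few lines
        tokens = line.split()      # the corpus is already tokenized
        print(len(tokens), tokens[:8])
```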
6 | 7 | ```bash 8 | # WikiText-2 9 | nlp_data prepare_lm --dataset wikitext2 10 | 11 | # WikiText-103 12 | nlp_data prepare_lm --dataset wikitext103 13 | 14 | # enwik8 15 | nlp_data prepare_lm --dataset enwik8 16 | 17 | # Text8 18 | nlp_data prepare_lm --dataset text8 19 | 20 | # Google One-Billion-Word 21 | nlp_data prepare_lm --dataset gbw 22 | ``` 23 | 24 | Happy language modeling :) 25 | -------------------------------------------------------------------------------- /scripts/datasets/language_modeling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/scripts/datasets/language_modeling/__init__.py -------------------------------------------------------------------------------- /scripts/datasets/machine_translation/README.md: -------------------------------------------------------------------------------- 1 | # Machine Translation 2 | 3 | In machine translation, we train a model to map a sentence from the source language, e.g., English, 4 | to the target language, e.g., Chinese. Here, we provide scripts to download the common benchmark 5 | datasets for machine translation. The downloaded datasets are stored as a pair of corpus files, 6 | one for the source and the other for the target. 7 | 8 | ## WMT 9 | You can use [prepare_wmt.py](prepare_wmt.py) to download and prepare the raw training corpus and 10 | then use [clean_parallel_corpus.py](../../preprocess/clean_parallel_corpus.py) to clean and 11 | filter the corpus. 12 | 13 | You may download the raw WMT2014 en-de data as follows: 14 | ```bash 15 | nlp_data prepare_wmt \ 16 | --dataset wmt2014 \ 17 | --lang-pair en-de \ 18 | --save-path wmt2014_en_de 19 | ``` 20 | 21 | By combining `nlp_data` and `nlp_process`, we provide an example of preparing the 22 | WMT2014 en-de training dataset: [wmt2014_ende.sh](wmt2014_ende.sh). This involves three steps: 23 | - Download the raw text data 24 | - Clean and tokenize the data 25 | - Learn a subword model and apply it to the corpus. 26 | 27 | ```bash 28 | bash wmt2014_ende.sh yttm 29 | ``` 30 | 31 | We support the following subword learning algorithms: 32 | 33 | ```bash 34 | # BPE from YouTokenToMe 35 | bash wmt2014_ende.sh yttm 36 | 37 | # BPE from Huggingface 38 | bash wmt2014_ende.sh hf_bpe 39 | 40 | # BPE from subword-nmt 41 | bash wmt2014_ende.sh subword_nmt 42 | 43 | # Byte-level BPE 44 | bash wmt2014_ende.sh hf_bytebpe 45 | 46 | # Sentencepiece 47 | bash wmt2014_ende.sh spm 48 | 49 | # WordPiece 50 | bash wmt2014_ende.sh hf_wordpiece 51 | ``` 52 | 53 | 54 | Apart from WMT2014 EN-DE, we also provide the script for preparing the training data for the 55 | WMT2017 ZH-EN task: 56 | [wmt2017_zhen.sh](wmt2017_zhen.sh). 57 | 58 | ### Monolingual Corpus 59 | In the WMT competition, there are additional monolingual corpora that help you train NMT models. 60 | You may download the raw monolingual corpus by adding the `--mono` flag.
61 | 62 | One example is to download the newscrawl monolingual corpus in German: 63 | 64 | ```bash 65 | nlp_data prepare_wmt \ 66 | --mono \ 67 | --mono_lang de \ 68 | --dataset newscrawl \ 69 | --save-path wmt2014_mono 70 | ``` 71 | 72 | 73 | ### Directory Structure of Translation Dataset 74 | 75 | The basic structure of a translation dataset is as follows: 76 | ``` 77 | folder_name 78 | ├── train.raw.{src} 79 | ├── train.raw.{tgt} 80 | ├── train.tok.{src} 81 | ├── train.tok.{tgt} 82 | ├── train.tok.{subword_model}.{src} 83 | ├── train.tok.{subword_model}.{tgt} 84 | ├── ... 85 | ├── ... Repeat for valid and test 86 | ├── ... 87 | ├── {subword_model}.model 88 | ├── {subword_model}.path 89 | ``` 90 | -------------------------------------------------------------------------------- /scripts/datasets/machine_translation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/scripts/datasets/machine_translation/__init__.py -------------------------------------------------------------------------------- /scripts/datasets/music_generation/README.md: -------------------------------------------------------------------------------- 1 | # Music Generation 2 | 3 | We provide datasets for training a music generation model. 4 | 5 | ## Maestro 6 | 7 | See https://magenta.tensorflow.org/datasets/maestro for a detailed introduction. 8 | 9 | ``` 10 | # Get V1 Dataset 11 | nlp_data prepare_music_midi --dataset maestro_v1 12 | 13 | # Get V2 Dataset 14 | nlp_data prepare_music_midi --dataset maestro_v2 15 | ``` 16 | 17 | ## Lakh MIDI 18 | 19 | See https://colinraffel.com/projects/lmd/ for more details. 20 | 21 | ``` 22 | # Get Lakh MIDI Full Dataset 23 | nlp_data prepare_music_midi --dataset lmd_full 24 | 25 | # Get the subset of 45,129 files from LMD-full 26 | # which have been matched to entries in the Million Song Dataset 27 | nlp_data prepare_music_midi --dataset lmd_matched 28 | 29 | # Get the aligned version of lmd_matched 30 | nlp_data prepare_music_midi --dataset lmd_aligned 31 | 32 | # Get the clean midi data 33 | nlp_data prepare_music_midi --dataset clean_midi 34 | ``` 35 | 36 | ## Geocities 37 | 38 | The Geocities collection of MIDI files. 39 | See https://archive.org/details/archiveteam-geocities-midi-collection-2009 for more details.
40 | ``` 41 | nlp_data prepare_music_midi --dataset geocities 42 | ``` 43 | -------------------------------------------------------------------------------- /scripts/datasets/music_generation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/scripts/datasets/music_generation/__init__.py -------------------------------------------------------------------------------- /scripts/datasets/pretrain_corpus/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/scripts/datasets/pretrain_corpus/__init__.py -------------------------------------------------------------------------------- /scripts/datasets/question_answering/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/scripts/datasets/question_answering/__init__.py -------------------------------------------------------------------------------- /scripts/datasets/question_answering/prepare_hotpotqa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from gluonnlp.utils.misc import download, load_checksum_stats 4 | from gluonnlp.base import get_data_home_dir 5 | 6 | _CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__))) 7 | _BASE_DATASET_PATH = os.path.join(get_data_home_dir(), 'hotpotqa') 8 | _URL_FILE_STATS_PATH = os.path.join(_CURR_DIR, '..', 'url_checksums', 'hotpotqa.txt') 9 | _URL_FILE_STATS = load_checksum_stats(_URL_FILE_STATS_PATH) 10 | 11 | 12 | _CITATIONS = """ 13 | @inproceedings{yang2018hotpotqa, 14 | title={{HotpotQA}: A Dataset for Diverse, Explainable Multi-hop Question Answering}, 15 | author={Yang, Zhilin and Qi, Peng and Zhang, Saizheng and Bengio, Yoshua and Cohen, William W. 
and Salakhutdinov, Ruslan and Manning, Christopher D.}, 16 | booktitle={Conference on Empirical Methods in Natural Language Processing ({EMNLP})}, 17 | year={2018} 18 | } 19 | 20 | """ 21 | 22 | _URLS = { 23 | 'train': 'http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_train_v1.1.json', 24 | 'dev_fullwiki': 'http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_fullwiki_v1.json', 25 | 'dev_distractor': 'http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_distractor_v1.json', 26 | 'test_fullwiki': 'http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_test_fullwiki_v1.json', 27 | } 28 | 29 | 30 | def get_parser(): 31 | parser = argparse.ArgumentParser(description='Downloading the HotpotQA Dataset.') 32 | parser.add_argument('--save-path', type=str, default='hotpotqa') 33 | parser.add_argument('--cache-path', type=str, default=_BASE_DATASET_PATH, 34 | help='The path to download the dataset.') 35 | parser.add_argument('--overwrite', action='store_true') 36 | return parser 37 | 38 | 39 | def main(args): 40 | if not os.path.exists(args.save_path): 41 | os.makedirs(args.save_path) 42 | for url in _URLS.values(): 43 | file_name = url[url.rfind('/') + 1:] 44 | file_hash = _URL_FILE_STATS[url] 45 | download(url, path=os.path.join(args.cache_path, file_name), sha1_hash=file_hash) 46 | if not os.path.exists(os.path.join(args.save_path, file_name))\ 47 | or (args.overwrite and args.save_path != args.cache_path): 48 | os.symlink(os.path.join(args.cache_path, file_name), 49 | os.path.join(args.save_path, file_name)) 50 | 51 | 52 | def cli_main(): 53 | parser = get_parser() 54 | args = parser.parse_args() 55 | main(args) 56 | 57 | 58 | if __name__ == '__main__': 59 | cli_main() 60 | -------------------------------------------------------------------------------- /scripts/datasets/question_answering/prepare_searchqa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from gluonnlp.utils.misc import download, load_checksum_stats 4 | from gluonnlp.base import get_data_home_dir, get_repo_url 5 | 6 | _CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__))) 7 | _BASE_DATASET_PATH = os.path.join(get_data_home_dir(), 'searchqa') 8 | _URL_FILE_STATS_PATH = os.path.join(_CURR_DIR, '..', 'url_checksums', 'searchqa.txt') 9 | _URL_FILE_STATS = load_checksum_stats(_URL_FILE_STATS_PATH) 10 | 11 | 12 | _CITATIONS = """ 13 | @article{dunn2017searchqa, 14 | title={Searchqa: A new q\&a dataset augmented with context from a search engine}, 15 | author={Dunn, Matthew and Sagun, Levent and Higgins, Mike and Guney, V Ugur and Cirik, Volkan and Cho, Kyunghyun}, 16 | journal={arXiv preprint arXiv:1704.05179}, 17 | year={2017} 18 | } 19 | 20 | """ 21 | 22 | _URLS = { 23 | 'train': get_repo_url() + 'datasets/question_answering/searchqa/train.txt', 24 | 'val': get_repo_url() + 'datasets/question_answering/searchqa/val.txt', 25 | 'test': get_repo_url() + 'datasets/question_answering/searchqa/test.txt' 26 | } 27 | 28 | 29 | def get_parser(): 30 | parser = argparse.ArgumentParser(description='Downloading the SearchQA Dataset.') 31 | parser.add_argument('--save-path', type=str, default='searchqa') 32 | parser.add_argument('--cache-path', type=str, default=_BASE_DATASET_PATH, 33 | help='The path to download the dataset.') 34 | parser.add_argument('--overwrite', action='store_true') 35 | return parser 36 | 37 | 38 | def main(args): 39 | if not os.path.exists(args.save_path): 40 | os.makedirs(args.save_path) 41 | for url in _URLS.values(): 42 | file_name = 
url[url.rfind('/') + 1:] 43 | file_hash = _URL_FILE_STATS[url] 44 | download(url, path=os.path.join(args.cache_path, file_name), sha1_hash=file_hash) 45 | if not os.path.exists(os.path.join(args.save_path, file_name))\ 46 | or (args.overwrite and args.save_path != args.cache_path): 47 | os.symlink(os.path.join(args.cache_path, file_name), 48 | os.path.join(args.save_path, file_name)) 49 | 50 | 51 | def cli_main(): 52 | parser = get_parser() 53 | args = parser.parse_args() 54 | main(args) 55 | 56 | 57 | if __name__ == '__main__': 58 | cli_main() 59 | -------------------------------------------------------------------------------- /scripts/datasets/question_answering/prepare_triviaqa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tarfile 3 | import argparse 4 | from gluonnlp.utils.misc import download, load_checksum_stats 5 | from gluonnlp.base import get_data_home_dir 6 | 7 | _CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__))) 8 | _BASE_DATASET_PATH = os.path.join(get_data_home_dir(), 'triviaqa') 9 | _URL_FILE_STATS_PATH = os.path.join(_CURR_DIR, '..', 'url_checksums', 'triviaqa.txt') 10 | _URL_FILE_STATS = load_checksum_stats(_URL_FILE_STATS_PATH) 11 | 12 | 13 | _CITATIONS = """ 14 | @InProceedings{JoshiTriviaQA2017, 15 | author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. and Zettlemoyer, Luke}, 16 | title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension}, 17 | booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics}, 18 | month = {July}, 19 | year = {2017}, 20 | address = {Vancouver, Canada}, 21 | publisher = {Association for Computational Linguistics}, 22 | } 23 | 24 | """ 25 | 26 | _URLS = { 27 | 'rc': 'https://nlp.cs.washington.edu/triviaqa/data/triviaqa-rc.tar.gz', 28 | 'unfiltered': 'https://nlp.cs.washington.edu/triviaqa/data/triviaqa-unfiltered.tar.gz' 29 | } 30 | 31 | 32 | def get_parser(): 33 | parser = argparse.ArgumentParser(description='Downloading the TriviaQA Dataset.') 34 | parser.add_argument('--type', type=str, choices=['rc', 'unfiltered'], default='rc', 35 | help='type of the triviaqa dataset.') 36 | parser.add_argument('--save-path', type=str, default='triviaqa') 37 | parser.add_argument('--cache-path', type=str, default=_BASE_DATASET_PATH, 38 | help='The path to download the dataset.') 39 | parser.add_argument('--overwrite', action='store_true') 40 | return parser 41 | 42 | 43 | def main(args): 44 | 45 | def extract(tar_path, target_path): 46 | try: 47 | tar = tarfile.open(tar_path, "r:gz") 48 | file_names = tar.getnames() 49 | for file_name in file_names: 50 | tar.extract(file_name, target_path) 51 | tar.close() 52 | except Exception as e: 53 | print(e) 54 | 55 | tar_url = _URLS[args.type] 56 | file_name = tar_url[tar_url.rfind('/') + 1:] 57 | file_hash = _URL_FILE_STATS[tar_url] 58 | download(tar_url, path=os.path.join(args.cache_path, file_name), sha1_hash=file_hash) 59 | if not os.path.exists(args.save_path): 60 | os.makedirs(args.save_path) 61 | if not os.path.exists(os.path.join(args.save_path, file_name))\ 62 | or (args.overwrite and args.save_path != args.cache_path): 63 | os.symlink(os.path.join(args.cache_path, file_name), 64 | os.path.join(args.save_path, file_name)) 65 | extract(os.path.join(args.save_path, file_name), args.save_path) 66 | 67 | 68 | def cli_main(): 69 | parser = get_parser() 70 | args = parser.parse_args() 71 | main(args) 72 | 73 | 74 | if __name__ == 
'__main__': 75 | cli_main() 76 | -------------------------------------------------------------------------------- /scripts/datasets/url_checksums/bookcorpus.txt: -------------------------------------------------------------------------------- 1 | https://the-eye.eu/public/AI/pile_preliminary_components/books1.tar.gz 87ca37e83fd7ea58573a1630ebf9d1da9ee34a41 2404269430 -------------------------------------------------------------------------------- /scripts/datasets/url_checksums/glue.txt: -------------------------------------------------------------------------------- 1 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/glue/cola.zip 19096246cd2a06d8fe2d13880d6cec61149f77c7 376971 2 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/glue/sst.zip 44f5954391612a8b3d9d65f6d4a824e9ae8d19ce 7439277 3 | https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt 716e0f67af962f08220b7e97d229b293077ef41f 1047044 4 | https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc 506c7a1a5e0dd551ceec2f84070fa1a8c2bc4b41 6222 5 | https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt 4265196c15cf75620b0b592b8b921f543bda7e6c 441275 6 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/glue/qqp.zip d775bd543ee78e3f64892a43ada949daf93e003d 41696084 7 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/glue/sts.zip cc66d8533052de6d7475ac56dfce300751e070a4 802872 8 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/glue/mnli.zip c22c684daa5cc9fad949d09d10ecedf94a2ce053 312783507 9 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/glue/snli.zip c60db4cc8820749e6af9f713f4d55109dd46e8c1 129820157 10 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/glue/qnli.zip 6700cb1d2536bf512314b01350f9ac382439218e 10627589 11 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/glue/rte.zip 2eb8630df898b7d8df14ca9130c1ac1cf79eb376 697150 12 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/glue/wnli.zip fc9834b5a8af4e1d8412e48bc38b477510a8c2d0 28999 13 | https://storage.googleapis.com/mtl-sentence-representations.appspot.com/tsvsWithoutLabels%2FAX.tsv?GoogleAccessId=firebase-adminsdk-0khhl@mtl-sentence-representations.iam.gserviceaccount.com&Expires=2498860800&Signature=DuQ2CSPt2Yfre0C%2BiISrVYrIFaZH1Lc7hBVZDD4ZyR7fZYOMNOUGpi8QxBmTNOrNPjR3z1cggo7WXFfrgECP6FBJSsURv8Ybrue8Ypt%2FTPxbuJ0Xc2FhDi%2BarnecCBFO77RSbfuz%2Bs95hRrYhTnByqu3U%2FYZPaj3tZt5QdfpH2IUROY8LiBXoXS46LE%2FgOQc%2FKN%2BA9SoscRDYsnxHfG0IjXGwHN%2Bf88q6hOmAxeNPx6moDulUF6XMUAaXCSFU%2BnRO2RDL9CapWxj%2BDl7syNyHhB7987hZ80B%2FwFkQ3MEs8auvt5XW1%2Bd4aCU7ytgM69r8JDCwibfhZxpaa4gd50QXQ%3D%3D c137a2020ab489011dc38fde9ee429f4e2c71257 222257 14 | https://www.dropbox.com/s/ju7d95ifb072q9f/diagnostic-full.tsv?dl=1 2f46c4b80fea8d3ea52a28e05467af3332fa65d9 265530 15 | -------------------------------------------------------------------------------- /scripts/datasets/url_checksums/gutenberg.txt: -------------------------------------------------------------------------------- 1 | 
https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/pretrain_corpus/Gutenberg.zip 91e842dc3671ed5a917b7ff6a60f5f87397780e2 461506225 2 | -------------------------------------------------------------------------------- /scripts/datasets/url_checksums/hotpotqa.txt: -------------------------------------------------------------------------------- 1 | http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_train_v1.1.json 08c42431c22984f362e94de0e635c7b858c3cff0 566426227 2 | http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_distractor_v1.json 825b6cfc34a61db41e82bbb14d978d5a834925f8 46320117 3 | http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_fullwiki_v1.json 96a41025612e8cb15989251102dc05efe9647eda 47454698 4 | http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_test_fullwiki_v1.json b30e4ff0d8b7bd808240e5609410f9c36279ef36 46213747 5 | -------------------------------------------------------------------------------- /scripts/datasets/url_checksums/language_model.txt: -------------------------------------------------------------------------------- 1 | https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip 3c914d17d80b1459be871a5039ac23e752a53cbe 4475746 2 | https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip 0aec09a7537b58d4bb65362fee27650eeaba625a 190229076 3 | http://mattmahoney.net/dc/enwik8.zip d856b1ccd937c51aeb9c342e47666fb8c38e7e72 36445475 4 | http://mattmahoney.net/dc/text8.zip 6c70299b93b7e1f927b42cd8f6ac1a31547c7a2e 31344016 5 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/language_modeling/1-billion-word-language-modeling-benchmark-r13output.tar.gz 4df859766482e12264a5a9d9fb7f0e276020447d 1792209805 6 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/language_modeling/1b_word_vocab.txt aa2322a3da82ef628011336c9b5c6059e4f56c3f 9507106 7 | -------------------------------------------------------------------------------- /scripts/datasets/url_checksums/music_midi.txt: -------------------------------------------------------------------------------- 1 | http://hog.ee.columbia.edu/craffel/lmd/lmd_full.tar.gz 330b3c67f24f9280f81e1f7ab12749087dd83f08 1768163879 2 | http://hog.ee.columbia.edu/craffel/lmd/lmd_matched.tar.gz 218b7c82ecb230a6679053e48e87714f0bd4836f 1407072670 3 | http://hog.ee.columbia.edu/craffel/lmd/lmd_aligned.tar.gz 9873e84dd5a531ba3623e0a24ce33a81681cba80 272169548 4 | http://hog.ee.columbia.edu/craffel/lmd/clean_midi.tar.gz ae47e29dfc18d7779d95697a6461d759504c7a1c 234283029 5 | https://storage.googleapis.com/magentadata/datasets/maestro/v1.0.0/maestro-v1.0.0-midi.zip e189d8a0b6769f3be576a036da840adafe489327 46579421 6 | https://storage.googleapis.com/magentadata/datasets/maestro/v2.0.0/maestro-v2.0.0-midi.zip 13808bf9503c72371d38e9705e93ce8623b21c01 59243107 7 | https://archive.org/download/archiveteam-geocities-midi-collection-2009/2009.GeoCities.MIDI.ArchiveTeam.zip 493880759c648dd96167a2f4d394421e6fa33874 437506993 8 | -------------------------------------------------------------------------------- /scripts/datasets/url_checksums/naturalquestions.txt: -------------------------------------------------------------------------------- 1 | s3://gluonnlp-numpy-data/NaturalQuestions/v1.0-simplified_simplified-nq-train.jsonl.gz 9ae896ea4b29370fe157aea61a088ffdc0fbda8f 4715820286 2 | s3://gluonnlp-numpy-data/NaturalQuestions/nq-dev-all.jsonl.gz b4cc081a2d065f84d630a1338dead7faad77eeff 1068038975 3 | -------------------------------------------------------------------------------- 
/scripts/datasets/url_checksums/searchqa.txt: -------------------------------------------------------------------------------- 1 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/question_answering/searchqa/train.txt c7e1eb8c34d0525547b91e18b3f8f4d855e35c16 1226681217 2 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/question_answering/searchqa/test.txt 08a928e0f8c129d5b3ca43bf46df117e38be0c27 332064988 3 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/question_answering/searchqa/val.txt c2f65d6b83c26188d5998ab96bc6a38c1a127fcc 170835902 4 | -------------------------------------------------------------------------------- /scripts/datasets/url_checksums/squad.txt: -------------------------------------------------------------------------------- 1 | https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json 1faea1252438a64f9718412a55036b786cfcc636 30288272 2 | https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json e1621aae0683b346ee9743bd5609266ba0cc34fc 4854279 3 | https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json ceb2acdea93b9d82ab1829c7b1e03bee9e302c99 42123633 4 | https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json 53ebaeb15bc5cab36645150f6f65d074348e2f3d 4370528 5 | -------------------------------------------------------------------------------- /scripts/datasets/url_checksums/superglue.txt: -------------------------------------------------------------------------------- 1 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/superglue/cb.zip c16fa0a46f0f888d59767851c44d8db397896fe5 75482 2 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/superglue/copa.zip ef110b215d7ff95a2fd2d0133f0959d324e9eec3 43986 3 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/superglue/multirc.zip 05bfcb1da7ea06742266f24503342fc20b2ab88a 1116225 4 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/superglue/rte.zip 66105efeccc3fc54f9c5539de4c2d393d5bb4d36 750920 5 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/superglue/wic.zip 5b95487a3690abc718bc173ccd35bf084c43b10a 396213 6 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/superglue/wsc.zip 829ec3dd532284281cc19bacf9cded6c11d3452d 32751 7 | https://dl.fbaipublicfiles.com/glue/superglue/data/v2/AX-b.zip 8c8874dcace4942dd00cf9f76c2537ea0e2026eb 33950 8 | https://dl.fbaipublicfiles.com/glue/superglue/data/v2/AX-g.zip 949909079262bc4f6fb66bd889707aa71218975f 10413 9 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/superglue/boolq.zip 90bf152c8012869d326260709404ce5111a76b46 4118001 10 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/superglue/record.zip af2825be511efa8fbc7813756e768efffb8fcc11 51757880 11 | -------------------------------------------------------------------------------- /scripts/datasets/url_checksums/text_classification.txt: -------------------------------------------------------------------------------- 1 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/ag_news_csv.tar.gz 00b73919ec0527118ca35d819029985c33ca4005 11784327 2 | 
https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/imdb.tar.gz af11c368141a0cec4d49563000a2a54f9afdc38d 35673480 3 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/dbpedia_csv.tar.gz f39ead1841501739a34a5bbb22d405677e3165f7 68341698 4 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/yelp_review_polarity_csv.tar.gz dd08ed616d28c633b1ff7a5e12d900426e5db779 166373322 5 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/yelp_review_full_csv.tar.gz d0a1011a88be15254054e94144c83e92a048e318 196146693 6 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/amazon_review_polarity_csv.tar.gz 9689538a9ee0630340da8aa456a0888cc6733919 688340758 7 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/amazon_review_full_csv.tar.gz e85b2d264aa8d8d3cc4dbe08adba88c0db92ff5b 643695117 8 | -------------------------------------------------------------------------------- /scripts/datasets/url_checksums/triviaqa.txt: -------------------------------------------------------------------------------- 1 | https://nlp.cs.washington.edu/triviaqa/data/triviaqa-rc.tar.gz aa7d8c01d4a5e563caaeb648e8c1f506e353ebd6 2665779500 2 | https://nlp.cs.washington.edu/triviaqa/data/triviaqa-unfiltered.tar.gz 670ba904b286865e25bb67ebd31c25e7c74c18ae 632549060 3 | -------------------------------------------------------------------------------- /scripts/datasets/url_checksums/wikipedia.txt: -------------------------------------------------------------------------------- 1 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/pretrain_corpus/wikipedia-en-20200620.tar.gz 1e1d77c31622744aaa45ff5bfbfca397154d9186 5068070627 2 | -------------------------------------------------------------------------------- /scripts/index.rst: -------------------------------------------------------------------------------- 1 | Examples 2 | ======== 3 | 4 | .. container:: cards 5 | 6 | .. card:: 7 | :title: Benchmarking the Performance of NLP Backbones 8 | :link: benchmarks/index.html 9 | 10 | NLP Benchmark. 11 | 12 | .. card:: 13 | :title: Classification Scripts 14 | :link: classification/index.html 15 | 16 | NLP Classification example. 17 | 18 | .. card:: 19 | :title: Conversion Scripts 20 | :link: conversion_toolkits/index.html 21 | 22 | Converting NLP models from other frameworks to GluonNLP. 23 | 24 | .. card:: 25 | :title: Datasets 26 | :link: datasets/index.html 27 | 28 | Datasets in GluonNLP. 29 | 30 | .. card:: 31 | :title: Generation 32 | :link: generation/index.html 33 | 34 | Sequence generation with GPT-2 35 | 36 | .. card:: 37 | :title: Machine Translation 38 | :link: machine_translation/index.html 39 | 40 | Machine Translation examples. 41 | 42 | .. card:: 43 | :title: Data Preprocessing Toolkit in GluonNLP 44 | :link: processing/index.html 45 | 46 | Data preprocessing examples. 47 | 48 | .. card:: 49 | :title: Pretraining Model 50 | :link: pretraining/index.html 51 | 52 | Pretraining examples. 53 | 54 | .. card:: 55 | :title: Question Answering Examples 56 | :link: question_answering/index.html 57 | 58 | Question Answering Example. 59 | 60 | .. 
toctree:: 61 | :hidden: 62 | :maxdepth: 1 63 | 64 | 65 | benchmarks/index 66 | conversion_toolkits/index 67 | datasets/index 68 | classification/index 69 | generation/index 70 | machine_translation/index 71 | pretraining/index 72 | processing/index 73 | question_answering/index 74 | 75 | -------------------------------------------------------------------------------- /scripts/machine_translation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/scripts/machine_translation/__init__.py -------------------------------------------------------------------------------- /scripts/machine_translation/evaluate_epochs_wmt2014_ende.sh: -------------------------------------------------------------------------------- 1 | SAVE_DIR=$1 2 | SUBWORD_ALGO=${2:-yttm} 3 | EPOCH_BEGIN=${3:-30} 4 | EPOCH_END=${4:-60} 5 | STOCHASTIC=${5:-0} 6 | LP_ALPHA=${6:-0.6} 7 | LP_K=${7:-5} 8 | BEAM_SIZE=${8:-4} 9 | 10 | 11 | for epoch in $( seq ${EPOCH_BEGIN} ${EPOCH_END}) 12 | do 13 | for fold in dev test 14 | do 15 | python3 evaluate_transformer.py \ 16 | --param_path ${SAVE_DIR}/epoch${epoch}.params \ 17 | --src_lang en \ 18 | --tgt_lang de \ 19 | --cfg ${SAVE_DIR}/config.yml \ 20 | --src_tokenizer ${SUBWORD_ALGO} \ 21 | --tgt_tokenizer ${SUBWORD_ALGO} \ 22 | --src_subword_model_path wmt2014_ende/${SUBWORD_ALGO}.model \ 23 | --tgt_subword_model_path wmt2014_ende/${SUBWORD_ALGO}.model \ 24 | --src_vocab_path wmt2014_ende/${SUBWORD_ALGO}.vocab \ 25 | --tgt_vocab_path wmt2014_ende/${SUBWORD_ALGO}.vocab \ 26 | --src_corpus wmt2014_ende/${fold}.raw.en \ 27 | --tgt_corpus wmt2014_ende/${fold}.raw.de \ 28 | --lp_alpha ${LP_ALPHA} \ 29 | --lp_k ${LP_K} \ 30 | --beam-size ${BEAM_SIZE} \ 31 | --save_dir ${SAVE_DIR}/epoch${epoch}_evaluation_${fold}_alpha${LP_ALPHA}_K${LP_K}_beam${BEAM_SIZE} \ 32 | --fp16 33 | done 34 | done 35 | -------------------------------------------------------------------------------- /scripts/machine_translation/transformer_enc12_dec1.yml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | dropout: 0.2 3 | DECODER: 4 | pre_norm: false 5 | num_layers: 1 6 | ENCODER: 7 | pre_norm: false 8 | num_layers: 12 9 | -------------------------------------------------------------------------------- /scripts/pretraining/bert/covert_bookcorpus_format.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | 4 | class BookscorpusTextFormatting: 5 | def __init__(self, books_path, output_filename, recursive=False, interval=500): 6 | self.books_path = books_path 7 | self.recursive = recursive 8 | self.output_filename = output_filename.split('.') 9 | self.interval = interval 10 | 11 | # Merge the .txt books into shards of `interval` books, writing one book per line 12 | 13 | def merge(self): 14 | count = 0 15 | for filename in glob.glob(self.books_path + '/' + '*.txt', recursive=self.recursive): 16 | if count == 0: 17 | ofile_name = '.'.join([self.output_filename[0]+'-'+str(count//self.interval), self.output_filename[1]]) 18 | ofile = open(ofile_name, mode='w', encoding='utf-8-sig', newline='\n') 19 | elif count % self.interval == 0: 20 | print(count) 21 | ofile.close() 22 | ofile_name = '.'.join([self.output_filename[0]+'-'+str(count//self.interval), self.output_filename[1]]) 23 | ofile = open(ofile_name, mode='w', encoding='utf-8-sig', newline='\n') 24 | file = open(filename, mode='r', encoding='utf-8-sig', newline='\n') 25 | for line in file: 26 | if line.strip() !=
'': 27 | ofile.write(line.strip() + ' ') 28 | ofile.write("\n\n") 29 | count += 1 30 | ofile.close() 31 | 32 | data_dir = 'BookCorpus/books1/epubtxt/' 33 | output_name_format = 'BookCorpus/after_prepare/bookcorpus.txt' 34 | 35 | FormatTool = BookscorpusTextFormatting(data_dir, output_name_format) 36 | FormatTool.merge() 37 | 38 | 39 | -------------------------------------------------------------------------------- /scripts/pretraining/convert_electra_pretrain_backbone.py: -------------------------------------------------------------------------------- 1 | """Convert pre-trained model parameters from ElectraForPretrain to ElectraModel""" 2 | 3 | import os 4 | import argparse 5 | import mxnet as mx 6 | 7 | from pretraining_utils import get_electra_pretraining_model 8 | 9 | 10 | def parse_args(): 11 | parser = argparse.ArgumentParser(description=__doc__) 12 | group = parser.add_mutually_exclusive_group(required=True) 13 | group.add_argument('--model-name', type=str, default='google_electra_small', 14 | help='Name of the pretrained model.') 15 | parser.add_argument('--params-file', type=str, required=True, 16 | help='Path to the pretrained parameter file.') 17 | parser.add_argument('--out-file', type=str, default=None, 18 | help='Output file path.') 19 | parser.add_argument('--generator_units_scale', type=float, default=None, 20 | help='The scale size of the generator units, same as used in pretraining.') 21 | parser.add_argument('--generator_layers_scale', type=float, default=None, 22 | help='The scale size of the generator layer, same as used in pretraining.') 23 | 24 | args = parser.parse_args() 25 | return args 26 | 27 | 28 | def convert_params(model_name, generator_units_scale, generator_layers_scale, 29 | params_path, out_path): 30 | _, _, pretrain_model = get_electra_pretraining_model(model_name, [mx.cpu()], 31 | generator_units_scale=generator_units_scale, 32 | generator_layers_scale=generator_layers_scale, 33 | params_path=params_path) 34 | backbone_model = pretrain_model.disc_backbone 35 | backbone_model.save_parameters(out_path) 36 | 37 | 38 | if __name__ == '__main__': 39 | args = parse_args() 40 | out_path = args.out_file 41 | if not out_path: 42 | params_file = args.params_file 43 | file_name_sep = os.path.basename(params_file).split(os.path.extsep) 44 | file_name_sep.insert(-1, 'backbone') 45 | out_path = os.path.join( 46 | os.path.dirname(params_file), 47 | os.path.extsep.join(file_name_sep)) 48 | convert_params(args.model_name, args.generator_units_scale, args.generator_layers_scale, 49 | args.params_file, out_path) 50 | -------------------------------------------------------------------------------- /scripts/pretraining/torch/bert/README.md: -------------------------------------------------------------------------------- 1 | NOTE: GluonNLP uses the `/dev/shm/gluonnlp` shared-memory filesystem to share 2 | datasets among multi-process workloads. At this time, `/dev/shm/gluonnlp` is not 3 | cleaned up automatically after the workload completes and manual deletion is 4 | needed to free up memory. Sometimes you may not want to delete 5 | `/dev/shm/gluonnlp` after running a workload, as you intend to run a workload 6 | based on the same dataset later and it's useful to keep the dataset in shared 7 | memory. 8 | 9 | # BERT 10 | 11 | 1. 
p4 instance preparation 12 | 13 | ```bash 14 | sudo mkfs.btrfs /dev/nvme1n1 /dev/nvme2n1 /dev/nvme3n1 /dev/nvme4n1 /dev/nvme5n1 /dev/nvme6n1 /dev/nvme7n1 /dev/nvme8n1 15 | sudo mount /dev/nvme1n1 /mnt 16 | sudo chown ubuntu:ubuntu /mnt/ 17 | ``` 18 | 19 | 2. Get the dataset 20 | 21 | ```bash 22 | nlp_data prepare_bookcorpus --segment_sentences --segment_num_worker 16 23 | nlp_data prepare_wikipedia --mode download_prepared --segment_sentences --segment_num_worker 16 24 | find wikicorpus/one_sentence_per_line BookCorpus/one_sentence_per_line -type f > input_reference 25 | ``` 26 | 27 | 3. Prepare batches 28 | 29 | ```bash 30 | python3 prepare_quickthought.py \ 31 | --input-reference input_reference \ 32 | --output /mnt/out_quickthought_128 \ 33 | --model-name google_en_cased_bert_base \ 34 | --max-seq-length 128 35 | ``` 36 | 37 | 38 | 4. Phase 1 training with sequence length 128 39 | 40 | ```bash 41 | python3 -m torch.distributed.launch --nproc_per_node=8 run_pretraining.py \ 42 | --model_name google_en_cased_bert_base \ 43 | --lr 0.005 \ 44 | --batch_size 128 \ 45 | --num_accumulated 96 \ 46 | --num_dataloader_workers 4 \ 47 | --num_steps 3870 \ 48 | --input-files /mnt/out_quickthought_128/*feather \ 49 | --mmap-folder /mnt/gluonnlp_mmap \ 50 | --ckpt_dir /mnt/ckpt_dir \ 51 | --ckpt_interval 1000 2>&1| tee train.log; 52 | ``` 53 | 54 | 5. Phase 2 training with sequence length 512 55 | 56 | TBD 57 | 58 | Finally, we obtain a folder with the following structure: 59 | 60 | ``` 61 | coder_base 62 | ├── vocab-{short_hash}.json 63 | ├── model-{short_hash}.params 64 | ├── model-{short_hash}.yml 65 | ``` 66 | -------------------------------------------------------------------------------- /scripts/processing/README.md: -------------------------------------------------------------------------------- 1 | # Data Processing Toolkit in GluonNLP 2 | We provide a collection of sharable data preprocessing utilities. 3 | 4 | ## Clean and Tokenize a Parallel Corpus 5 | 6 | To clean and tokenize a parallel corpus, use: 7 | ``` 8 | nlp_process clean_tok_para_corpus --help 9 | ``` 10 | 11 | ## Learn a subword model 12 | 13 | To learn a subword tokenizer, use: 14 | ``` 15 | nlp_process learn_subword --help 16 | ``` 17 | 18 | 19 | ## Apply the learned subword model 20 | To apply the learned subword tokenizer, use: 21 | ``` 22 | nlp_process apply_subword --help 23 | ``` 24 | -------------------------------------------------------------------------------- /scripts/processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/scripts/processing/__init__.py -------------------------------------------------------------------------------- /scripts/processing/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import textwrap 3 | 4 | from . import ( 5 | clean_tok_corpus, 6 | learn_subword, 7 | apply_subword 8 | ) 9 | 10 | 11 | SUBCOMMANDS = ['clean_tok_para_corpus', 'clean_tok_mono_corpus', 12 | 'learn_subword', 'apply_subword', 'help'] 13 | 14 | 15 | def cli_main(): 16 | parser = argparse.ArgumentParser( 17 | description='Sharable data preprocessing utilities in GluonNLP.', 18 | prog='nlp_process', add_help=False) 19 | parser.add_argument('command', type=str, 20 | choices=SUBCOMMANDS, 21 | metavar='[subcommand]', 22 | help='The subcommand to use. 
-------------------------------------------------------------------------------- /scripts/processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/scripts/processing/__init__.py -------------------------------------------------------------------------------- /scripts/processing/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import textwrap 3 | 4 | from . import ( 5 | clean_tok_corpus, 6 | learn_subword, 7 | apply_subword 8 | ) 9 | 10 | 11 | SUBCOMMANDS = ['clean_tok_para_corpus', 'clean_tok_mono_corpus', 12 | 'learn_subword', 'apply_subword', 'help'] 13 | 14 | 15 | def cli_main(): 16 | parser = argparse.ArgumentParser( 17 | description='Sharable data preprocessing utilities in GluonNLP.', 18 | prog='nlp_process', add_help=False) 19 | parser.add_argument('command', type=str, 20 | choices=SUBCOMMANDS, 21 | metavar='[subcommand]', 22 | help='The subcommand to use. ' 23 | 'Choices are {}.'.format(SUBCOMMANDS)) 24 | args, other_args = parser.parse_known_args() 25 | if args.command == 'clean_tok_para_corpus': 26 | parser = clean_tok_corpus.get_parser_para() 27 | sub_args = parser.parse_args(other_args) 28 | clean_tok_corpus.main_para(sub_args) 29 | elif args.command == 'clean_tok_mono_corpus': 30 | parser = clean_tok_corpus.get_parser_mono() 31 | sub_args = parser.parse_args(other_args) 32 | clean_tok_corpus.main_mono(sub_args) 33 | elif args.command == 'learn_subword': 34 | parser = learn_subword.get_parser() 35 | sub_args = parser.parse_args(other_args) 36 | learn_subword.main(sub_args) 37 | elif args.command == 'apply_subword': 38 | parser = apply_subword.get_parser() 39 | sub_args = parser.parse_args(other_args) 40 | apply_subword.main(sub_args) 41 | elif args.command == 'help': 42 | parser.print_help() 43 | else: 44 | parser.print_help() 45 | 46 | 47 | if __name__ == '__main__': 48 | cli_main() 49 | -------------------------------------------------------------------------------- /scripts/question_answering/albert_custom.yaml: -------------------------------------------------------------------------------- 1 | version: 1.0 2 | 3 | model: 4 | name: albert_base_v2 5 | framework: mxnet 6 | 7 | tuning: 8 | strategy: 9 | name: mycustom 10 | accuracy_criterion: 11 | relative: 0.02 12 | exit_policy: 13 | timeout: 0 14 | max_trials: 1000 15 | random_seed: 9527 16 | -------------------------------------------------------------------------------- /scripts/question_answering/commands/README.md: -------------------------------------------------------------------------------- 1 | # Commands For Training on SQuAD 2 | 3 | All commands are generated by parsing the template in [run_squad.template](run_squad.template). 4 | To generate all commands, run 5 | 6 | ```bash 7 | python3 generate_commands.py 8 | ``` 9 | 
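Each generated script takes the Horovod flag, the SQuAD version, and the training dtype as optional positional arguments (see the `USE_HOROVOD`, `VERSION`, and `DTYPE` defaults at the top of every script). For example:

```bash
# Fine-tune ALBERT-base on SQuAD 2.0 with 4 Horovod workers and float32 training
bash run_squad2_albert_base.sh 1 2.0 float32
```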
-------------------------------------------------------------------------------- /scripts/question_answering/commands/run_squad.template: -------------------------------------------------------------------------------- 1 | # Generated by "generate_commands.py" 2 | 3 | USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod 4 | VERSION=${2:-2.0} # SQuAD Version 5 | DTYPE=${3:-{{ dtype }}} # Default training data type 6 | MODEL_NAME={{ model_name }} 7 | BATCH_SIZE={{ batch_size }} 8 | NUM_ACCUMULATED={{ num_accumulated }} 9 | EPOCHS={{ epochs }} 10 | LR={{ lr }} 11 | WARMUP_RATIO={{ warmup_ratio }} 12 | WD={{ wd }} 13 | MAX_SEQ_LENGTH={{ max_seq_length }} 14 | MAX_GRAD_NORM={{ max_grad_norm }} 15 | LAYERWISE_DECAY={{ layerwise_decay }} 16 | 17 | # Prepare the Data 18 | nlp_data prepare_squad --version ${VERSION} 19 | 20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py 21 | 22 | # Run the script 23 | if [ ${USE_HOROVOD} -eq 0 ]; 24 | then 25 | RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" 26 | else 27 | RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" 28 | fi 29 | ${RUN_COMMAND} \ 30 | --model_name ${MODEL_NAME} \ 31 | --data_dir squad \ 32 | --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ 33 | --version ${VERSION} \ 34 | --do_eval \ 35 | --do_train \ 36 | --batch_size ${BATCH_SIZE} \ 37 | --num_accumulated ${NUM_ACCUMULATED} \ 38 | --layerwise_decay ${LAYERWISE_DECAY} \ 39 | --epochs ${EPOCHS} \ 40 | --lr ${LR} \ 41 | --warmup_ratio ${WARMUP_RATIO} \ 42 | --wd ${WD} \ 43 | --max_seq_length ${MAX_SEQ_LENGTH} \ 44 | --max_grad_norm ${MAX_GRAD_NORM} \ 45 | --dtype ${DTYPE} \ 46 | --overwrite_cache 47 | -------------------------------------------------------------------------------- /scripts/question_answering/commands/run_squad2_albert_base.sh: -------------------------------------------------------------------------------- 1 | # Generated by "generate_commands.py" 2 | 3 | USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod 4 | VERSION=${2:-2.0} # SQuAD Version 5 | DTYPE=${3:-float32} # Default training data type 6 | MODEL_NAME=google_albert_base_v2 7 | BATCH_SIZE=4 8 | NUM_ACCUMULATED=3 9 | EPOCHS=3 10 | LR=2e-05 11 | WARMUP_RATIO=0.1 12 | WD=0.01 13 | MAX_SEQ_LENGTH=512 14 | MAX_GRAD_NORM=1.0 15 | LAYERWISE_DECAY=-1 16 | 17 | # Prepare the Data 18 | nlp_data prepare_squad --version ${VERSION} 19 | 20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py 21 | 22 | # Run the script 23 | if [ ${USE_HOROVOD} -eq 0 ]; 24 | then 25 | RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" 26 | else 27 | RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" 28 | fi 29 | ${RUN_COMMAND} \ 30 | --model_name ${MODEL_NAME} \ 31 | --data_dir squad \ 32 | --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ 33 | --version ${VERSION} \ 34 | --do_eval \ 35 | --do_train \ 36 | --batch_size ${BATCH_SIZE} \ 37 | --num_accumulated ${NUM_ACCUMULATED} \ 38 | --layerwise_decay ${LAYERWISE_DECAY} \ 39 | --epochs ${EPOCHS} \ 40 | --lr ${LR} \ 41 | --warmup_ratio ${WARMUP_RATIO} \ 42 | --wd ${WD} \ 43 | --max_seq_length ${MAX_SEQ_LENGTH} \ 44 | --max_grad_norm ${MAX_GRAD_NORM} \ 45 | --dtype ${DTYPE} \ 46 | --overwrite_cache 47 | -------------------------------------------------------------------------------- /scripts/question_answering/commands/run_squad2_albert_large.sh: -------------------------------------------------------------------------------- 1 | # Generated by "generate_commands.py" 2 | 3 | USE_HOROVOD=${1:-0} # Horovod flag. 
0 --> not use horovod, 1 --> use horovod 4 | VERSION=${2:-2.0} # SQuAD Version 5 | DTYPE=${3:-float32} # Default training data type 6 | MODEL_NAME=google_albert_large_v2 7 | BATCH_SIZE=3 8 | NUM_ACCUMULATED=4 9 | EPOCHS=3 10 | LR=2e-05 11 | WARMUP_RATIO=0.1 12 | WD=0.01 13 | MAX_SEQ_LENGTH=512 14 | MAX_GRAD_NORM=1.0 15 | LAYERWISE_DECAY=-1 16 | 17 | # Prepare the Data 18 | nlp_data prepare_squad --version ${VERSION} 19 | 20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py 21 | 22 | # Run the script 23 | if [ ${USE_HOROVOD} -eq 0 ]; 24 | then 25 | RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" 26 | else 27 | RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" 28 | fi 29 | ${RUN_COMMAND} \ 30 | --model_name ${MODEL_NAME} \ 31 | --data_dir squad \ 32 | --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ 33 | --version ${VERSION} \ 34 | --do_eval \ 35 | --do_train \ 36 | --batch_size ${BATCH_SIZE} \ 37 | --num_accumulated ${NUM_ACCUMULATED} \ 38 | --layerwise_decay ${LAYERWISE_DECAY} \ 39 | --epochs ${EPOCHS} \ 40 | --lr ${LR} \ 41 | --warmup_ratio ${WARMUP_RATIO} \ 42 | --wd ${WD} \ 43 | --max_seq_length ${MAX_SEQ_LENGTH} \ 44 | --max_grad_norm ${MAX_GRAD_NORM} \ 45 | --dtype ${DTYPE} \ 46 | --overwrite_cache 47 | -------------------------------------------------------------------------------- /scripts/question_answering/commands/run_squad2_albert_xlarge.sh: -------------------------------------------------------------------------------- 1 | # Generated by "generate_commands.py" 2 | 3 | USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod 4 | VERSION=${2:-2.0} # SQuAD Version 5 | DTYPE=${3:-float32} # Default training data type 6 | MODEL_NAME=google_albert_xlarge_v2 7 | BATCH_SIZE=1 8 | NUM_ACCUMULATED=12 9 | EPOCHS=3 10 | LR=2e-05 11 | WARMUP_RATIO=0.1 12 | WD=0.01 13 | MAX_SEQ_LENGTH=512 14 | MAX_GRAD_NORM=0.1 15 | LAYERWISE_DECAY=-1 16 | 17 | # Prepare the Data 18 | nlp_data prepare_squad --version ${VERSION} 19 | 20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py 21 | 22 | # Run the script 23 | if [ ${USE_HOROVOD} -eq 0 ]; 24 | then 25 | RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" 26 | else 27 | RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" 28 | fi 29 | ${RUN_COMMAND} \ 30 | --model_name ${MODEL_NAME} \ 31 | --data_dir squad \ 32 | --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ 33 | --version ${VERSION} \ 34 | --do_eval \ 35 | --do_train \ 36 | --batch_size ${BATCH_SIZE} \ 37 | --num_accumulated ${NUM_ACCUMULATED} \ 38 | --layerwise_decay ${LAYERWISE_DECAY} \ 39 | --epochs ${EPOCHS} \ 40 | --lr ${LR} \ 41 | --warmup_ratio ${WARMUP_RATIO} \ 42 | --wd ${WD} \ 43 | --max_seq_length ${MAX_SEQ_LENGTH} \ 44 | --max_grad_norm ${MAX_GRAD_NORM} \ 45 | --dtype ${DTYPE} \ 46 | --overwrite_cache 47 | -------------------------------------------------------------------------------- /scripts/question_answering/commands/run_squad2_albert_xxlarge.sh: -------------------------------------------------------------------------------- 1 | # Generated by "generate_commands.py" 2 | 3 | USE_HOROVOD=${1:-0} # Horovod flag. 
0 --> not use horovod, 1 --> use horovod 4 | VERSION=${2:-2.0} # SQuAD Version 5 | DTYPE=${3:-float32} # Default training data type 6 | MODEL_NAME=google_albert_xxlarge_v2 7 | BATCH_SIZE=1 8 | NUM_ACCUMULATED=12 9 | EPOCHS=3 10 | LR=2e-05 11 | WARMUP_RATIO=0.1 12 | WD=0.01 13 | MAX_SEQ_LENGTH=512 14 | MAX_GRAD_NORM=0.1 15 | LAYERWISE_DECAY=-1 16 | 17 | # Prepare the Data 18 | nlp_data prepare_squad --version ${VERSION} 19 | 20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py 21 | 22 | # Run the script 23 | if [ ${USE_HOROVOD} -eq 0 ]; 24 | then 25 | RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" 26 | else 27 | RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" 28 | fi 29 | ${RUN_COMMAND} \ 30 | --model_name ${MODEL_NAME} \ 31 | --data_dir squad \ 32 | --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ 33 | --version ${VERSION} \ 34 | --do_eval \ 35 | --do_train \ 36 | --batch_size ${BATCH_SIZE} \ 37 | --num_accumulated ${NUM_ACCUMULATED} \ 38 | --layerwise_decay ${LAYERWISE_DECAY} \ 39 | --epochs ${EPOCHS} \ 40 | --lr ${LR} \ 41 | --warmup_ratio ${WARMUP_RATIO} \ 42 | --wd ${WD} \ 43 | --max_seq_length ${MAX_SEQ_LENGTH} \ 44 | --max_grad_norm ${MAX_GRAD_NORM} \ 45 | --dtype ${DTYPE} \ 46 | --overwrite_cache 47 | -------------------------------------------------------------------------------- /scripts/question_answering/commands/run_squad2_electra_base.sh: -------------------------------------------------------------------------------- 1 | # Generated by "generate_commands.py" 2 | 3 | USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod 4 | VERSION=${2:-2.0} # SQuAD Version 5 | DTYPE=${3:-float32} # Default training data type 6 | MODEL_NAME=google_electra_base 7 | BATCH_SIZE=8 8 | NUM_ACCUMULATED=1 9 | EPOCHS=2 10 | LR=0.0001 11 | WARMUP_RATIO=0.1 12 | WD=0 13 | MAX_SEQ_LENGTH=512 14 | MAX_GRAD_NORM=1.0 15 | LAYERWISE_DECAY=0.8 16 | 17 | # Prepare the Data 18 | nlp_data prepare_squad --version ${VERSION} 19 | 20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py 21 | 22 | # Run the script 23 | if [ ${USE_HOROVOD} -eq 0 ]; 24 | then 25 | RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" 26 | else 27 | RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" 28 | fi 29 | ${RUN_COMMAND} \ 30 | --model_name ${MODEL_NAME} \ 31 | --data_dir squad \ 32 | --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ 33 | --version ${VERSION} \ 34 | --do_eval \ 35 | --do_train \ 36 | --batch_size ${BATCH_SIZE} \ 37 | --num_accumulated ${NUM_ACCUMULATED} \ 38 | --layerwise_decay ${LAYERWISE_DECAY} \ 39 | --epochs ${EPOCHS} \ 40 | --lr ${LR} \ 41 | --warmup_ratio ${WARMUP_RATIO} \ 42 | --wd ${WD} \ 43 | --max_seq_length ${MAX_SEQ_LENGTH} \ 44 | --max_grad_norm ${MAX_GRAD_NORM} \ 45 | --dtype ${DTYPE} \ 46 | --overwrite_cache 47 | -------------------------------------------------------------------------------- /scripts/question_answering/commands/run_squad2_electra_large.sh: -------------------------------------------------------------------------------- 1 | # Generated by "generate_commands.py" 2 | 3 | USE_HOROVOD=${1:-0} # Horovod flag. 
0 --> not use horovod, 1 --> use horovod 4 | VERSION=${2:-2.0} # SQuAD Version 5 | DTYPE=${3:-float32} # Default training data type 6 | MODEL_NAME=google_electra_large 7 | BATCH_SIZE=2 8 | NUM_ACCUMULATED=4 9 | EPOCHS=2 10 | LR=5e-05 11 | WARMUP_RATIO=0.1 12 | WD=0 13 | MAX_SEQ_LENGTH=512 14 | MAX_GRAD_NORM=1 15 | LAYERWISE_DECAY=0.9 16 | 17 | # Prepare the Data 18 | nlp_data prepare_squad --version ${VERSION} 19 | 20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py 21 | 22 | # Run the script 23 | if [ ${USE_HOROVOD} -eq 0 ]; 24 | then 25 | RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" 26 | else 27 | RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" 28 | fi 29 | ${RUN_COMMAND} \ 30 | --model_name ${MODEL_NAME} \ 31 | --data_dir squad \ 32 | --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ 33 | --version ${VERSION} \ 34 | --do_eval \ 35 | --do_train \ 36 | --batch_size ${BATCH_SIZE} \ 37 | --num_accumulated ${NUM_ACCUMULATED} \ 38 | --layerwise_decay ${LAYERWISE_DECAY} \ 39 | --epochs ${EPOCHS} \ 40 | --lr ${LR} \ 41 | --warmup_ratio ${WARMUP_RATIO} \ 42 | --wd ${WD} \ 43 | --max_seq_length ${MAX_SEQ_LENGTH} \ 44 | --max_grad_norm ${MAX_GRAD_NORM} \ 45 | --dtype ${DTYPE} \ 46 | --overwrite_cache 47 | -------------------------------------------------------------------------------- /scripts/question_answering/commands/run_squad2_electra_small.sh: -------------------------------------------------------------------------------- 1 | # Generated by "generate_commands.py" 2 | 3 | USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod 4 | VERSION=${2:-2.0} # SQuAD Version 5 | DTYPE=${3:-float32} # Default training data type 6 | MODEL_NAME=google_electra_small 7 | BATCH_SIZE=8 8 | NUM_ACCUMULATED=1 9 | EPOCHS=2 10 | LR=0.0003 11 | WARMUP_RATIO=0.1 12 | WD=0 13 | MAX_SEQ_LENGTH=512 14 | MAX_GRAD_NORM=1.0 15 | LAYERWISE_DECAY=0.8 16 | 17 | # Prepare the Data 18 | nlp_data prepare_squad --version ${VERSION} 19 | 20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py 21 | 22 | # Run the script 23 | if [ ${USE_HOROVOD} -eq 0 ]; 24 | then 25 | RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" 26 | else 27 | RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" 28 | fi 29 | ${RUN_COMMAND} \ 30 | --model_name ${MODEL_NAME} \ 31 | --data_dir squad \ 32 | --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ 33 | --version ${VERSION} \ 34 | --do_eval \ 35 | --do_train \ 36 | --batch_size ${BATCH_SIZE} \ 37 | --num_accumulated ${NUM_ACCUMULATED} \ 38 | --layerwise_decay ${LAYERWISE_DECAY} \ 39 | --epochs ${EPOCHS} \ 40 | --lr ${LR} \ 41 | --warmup_ratio ${WARMUP_RATIO} \ 42 | --wd ${WD} \ 43 | --max_seq_length ${MAX_SEQ_LENGTH} \ 44 | --max_grad_norm ${MAX_GRAD_NORM} \ 45 | --dtype ${DTYPE} \ 46 | --overwrite_cache 47 | -------------------------------------------------------------------------------- /scripts/question_answering/commands/run_squad2_gluon_en_cased_bert_base_v1.sh: -------------------------------------------------------------------------------- 1 | # Generated by "generate_commands.py" 2 | 3 | USE_HOROVOD=${1:-0} # Horovod flag. 
0 --> not use horovod, 1 --> use horovod 4 | VERSION=${2:-2.0} # SQuAD Version 5 | DTYPE=${3:-float32} # Default training data type 6 | MODEL_NAME=gluon_en_cased_bert_base_v1 7 | BATCH_SIZE=6 8 | NUM_ACCUMULATED=2 9 | EPOCHS=3 10 | LR=3e-05 11 | WARMUP_RATIO=0.1 12 | WD=0.01 13 | MAX_SEQ_LENGTH=512 14 | MAX_GRAD_NORM=1.0 15 | LAYERWISE_DECAY=-1 16 | 17 | # Prepare the Data 18 | nlp_data prepare_squad --version ${VERSION} 19 | 20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py 21 | 22 | # Run the script 23 | if [ ${USE_HOROVOD} -eq 0 ]; 24 | then 25 | RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" 26 | else 27 | RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" 28 | fi 29 | ${RUN_COMMAND} \ 30 | --model_name ${MODEL_NAME} \ 31 | --data_dir squad \ 32 | --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ 33 | --version ${VERSION} \ 34 | --do_eval \ 35 | --do_train \ 36 | --batch_size ${BATCH_SIZE} \ 37 | --num_accumulated ${NUM_ACCUMULATED} \ 38 | --layerwise_decay ${LAYERWISE_DECAY} \ 39 | --epochs ${EPOCHS} \ 40 | --lr ${LR} \ 41 | --warmup_ratio ${WARMUP_RATIO} \ 42 | --wd ${WD} \ 43 | --max_seq_length ${MAX_SEQ_LENGTH} \ 44 | --max_grad_norm ${MAX_GRAD_NORM} \ 45 | --dtype ${DTYPE} \ 46 | --overwrite_cache 47 | -------------------------------------------------------------------------------- /scripts/question_answering/commands/run_squad2_mobilebert.sh: -------------------------------------------------------------------------------- 1 | # Generated by "generate_commands.py" 2 | 3 | USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod 4 | VERSION=${2:-2.0} # SQuAD Version 5 | DTYPE=${3:-float32} # Default training data type 6 | MODEL_NAME=google_uncased_mobilebert 7 | BATCH_SIZE=8 8 | NUM_ACCUMULATED=1 9 | EPOCHS=5 10 | LR=4e-05 11 | WARMUP_RATIO=0.1 12 | WD=0.01 13 | MAX_SEQ_LENGTH=384 14 | MAX_GRAD_NORM=1.0 15 | LAYERWISE_DECAY=-1 16 | 17 | # Prepare the Data 18 | nlp_data prepare_squad --version ${VERSION} 19 | 20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py 21 | 22 | # Run the script 23 | if [ ${USE_HOROVOD} -eq 0 ]; 24 | then 25 | RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" 26 | else 27 | RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" 28 | fi 29 | ${RUN_COMMAND} \ 30 | --model_name ${MODEL_NAME} \ 31 | --data_dir squad \ 32 | --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ 33 | --version ${VERSION} \ 34 | --do_eval \ 35 | --do_train \ 36 | --batch_size ${BATCH_SIZE} \ 37 | --num_accumulated ${NUM_ACCUMULATED} \ 38 | --layerwise_decay ${LAYERWISE_DECAY} \ 39 | --epochs ${EPOCHS} \ 40 | --lr ${LR} \ 41 | --warmup_ratio ${WARMUP_RATIO} \ 42 | --wd ${WD} \ 43 | --max_seq_length ${MAX_SEQ_LENGTH} \ 44 | --max_grad_norm ${MAX_GRAD_NORM} \ 45 | --dtype ${DTYPE} \ 46 | --overwrite_cache 47 | -------------------------------------------------------------------------------- /scripts/question_answering/commands/run_squad2_roberta_large.sh: -------------------------------------------------------------------------------- 1 | # Generated by "generate_commands.py" 2 | 3 | USE_HOROVOD=${1:-0} # Horovod flag. 
0 --> not use horovod, 1 --> use horovod 4 | VERSION=${2:-2.0} # SQuAD Version 5 | DTYPE=${3:-float32} # Default training data type 6 | MODEL_NAME=fairseq_roberta_large 7 | BATCH_SIZE=2 8 | NUM_ACCUMULATED=6 9 | EPOCHS=3 10 | LR=3e-05 11 | WARMUP_RATIO=0.2 12 | WD=0.01 13 | MAX_SEQ_LENGTH=512 14 | MAX_GRAD_NORM=1.0 15 | LAYERWISE_DECAY=-1 16 | 17 | # Prepare the Data 18 | nlp_data prepare_squad --version ${VERSION} 19 | 20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py 21 | 22 | # Run the script 23 | if [ ${USE_HOROVOD} -eq 0 ]; 24 | then 25 | RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" 26 | else 27 | RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" 28 | fi 29 | ${RUN_COMMAND} \ 30 | --model_name ${MODEL_NAME} \ 31 | --data_dir squad \ 32 | --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ 33 | --version ${VERSION} \ 34 | --do_eval \ 35 | --do_train \ 36 | --batch_size ${BATCH_SIZE} \ 37 | --num_accumulated ${NUM_ACCUMULATED} \ 38 | --layerwise_decay ${LAYERWISE_DECAY} \ 39 | --epochs ${EPOCHS} \ 40 | --lr ${LR} \ 41 | --warmup_ratio ${WARMUP_RATIO} \ 42 | --wd ${WD} \ 43 | --max_seq_length ${MAX_SEQ_LENGTH} \ 44 | --max_grad_norm ${MAX_GRAD_NORM} \ 45 | --dtype ${DTYPE} \ 46 | --overwrite_cache 47 | -------------------------------------------------------------------------------- /scripts/question_answering/commands/run_squad2_uncased_bert_base.sh: -------------------------------------------------------------------------------- 1 | # Generated by "generate_commands.py" 2 | 3 | USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod 4 | VERSION=${2:-2.0} # SQuAD Version 5 | DTYPE=${3:-float32} # Default training data type 6 | MODEL_NAME=google_en_uncased_bert_base 7 | BATCH_SIZE=6 8 | NUM_ACCUMULATED=2 9 | EPOCHS=3 10 | LR=3e-05 11 | WARMUP_RATIO=0.1 12 | WD=0.01 13 | MAX_SEQ_LENGTH=512 14 | MAX_GRAD_NORM=1.0 15 | LAYERWISE_DECAY=-1 16 | 17 | # Prepare the Data 18 | nlp_data prepare_squad --version ${VERSION} 19 | 20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py 21 | 22 | # Run the script 23 | if [ ${USE_HOROVOD} -eq 0 ]; 24 | then 25 | RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" 26 | else 27 | RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" 28 | fi 29 | ${RUN_COMMAND} \ 30 | --model_name ${MODEL_NAME} \ 31 | --data_dir squad \ 32 | --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ 33 | --version ${VERSION} \ 34 | --do_eval \ 35 | --do_train \ 36 | --batch_size ${BATCH_SIZE} \ 37 | --num_accumulated ${NUM_ACCUMULATED} \ 38 | --layerwise_decay ${LAYERWISE_DECAY} \ 39 | --epochs ${EPOCHS} \ 40 | --lr ${LR} \ 41 | --warmup_ratio ${WARMUP_RATIO} \ 42 | --wd ${WD} \ 43 | --max_seq_length ${MAX_SEQ_LENGTH} \ 44 | --max_grad_norm ${MAX_GRAD_NORM} \ 45 | --dtype ${DTYPE} \ 46 | --overwrite_cache 47 | -------------------------------------------------------------------------------- /scripts/question_answering/commands/run_squad2_uncased_bert_large.sh: -------------------------------------------------------------------------------- 1 | # Generated by "generate_commands.py" 2 | 3 | USE_HOROVOD=${1:-0} # Horovod flag. 
0 --> not use horovod, 1 --> use horovod 4 | VERSION=${2:-2.0} # SQuAD Version 5 | DTYPE=${3:-float32} # Default training data type 6 | MODEL_NAME=google_en_uncased_bert_large 7 | BATCH_SIZE=2 8 | NUM_ACCUMULATED=6 9 | EPOCHS=3 10 | LR=3e-05 11 | WARMUP_RATIO=0.1 12 | WD=0.01 13 | MAX_SEQ_LENGTH=512 14 | MAX_GRAD_NORM=1.0 15 | LAYERWISE_DECAY=-1 16 | 17 | # Prepare the Data 18 | nlp_data prepare_squad --version ${VERSION} 19 | 20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py 21 | 22 | # Run the script 23 | if [ ${USE_HOROVOD} -eq 0 ]; 24 | then 25 | RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" 26 | else 27 | RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" 28 | fi 29 | ${RUN_COMMAND} \ 30 | --model_name ${MODEL_NAME} \ 31 | --data_dir squad \ 32 | --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ 33 | --version ${VERSION} \ 34 | --do_eval \ 35 | --do_train \ 36 | --batch_size ${BATCH_SIZE} \ 37 | --num_accumulated ${NUM_ACCUMULATED} \ 38 | --layerwise_decay ${LAYERWISE_DECAY} \ 39 | --epochs ${EPOCHS} \ 40 | --lr ${LR} \ 41 | --warmup_ratio ${WARMUP_RATIO} \ 42 | --wd ${WD} \ 43 | --max_seq_length ${MAX_SEQ_LENGTH} \ 44 | --max_grad_norm ${MAX_GRAD_NORM} \ 45 | --dtype ${DTYPE} \ 46 | --overwrite_cache 47 | -------------------------------------------------------------------------------- /scripts/question_answering/commands/run_squad2_uncased_bert_wwm_large.sh: -------------------------------------------------------------------------------- 1 | # Generated by "generate_commands.py" 2 | 3 | USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod 4 | VERSION=${2:-2.0} # SQuAD Version 5 | DTYPE=${3:-float32} # Default training data type 6 | MODEL_NAME=google_en_uncased_bert_wwm_large 7 | BATCH_SIZE=3 8 | NUM_ACCUMULATED=2 9 | EPOCHS=2 10 | LR=3e-05 11 | WARMUP_RATIO=0.1 12 | WD=0.01 13 | MAX_SEQ_LENGTH=512 14 | MAX_GRAD_NORM=1.0 15 | LAYERWISE_DECAY=-1 16 | 17 | # Prepare the Data 18 | nlp_data prepare_squad --version ${VERSION} 19 | 20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py 21 | 22 | # Run the script 23 | if [ ${USE_HOROVOD} -eq 0 ]; 24 | then 25 | RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" 26 | else 27 | RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" 28 | fi 29 | ${RUN_COMMAND} \ 30 | --model_name ${MODEL_NAME} \ 31 | --data_dir squad \ 32 | --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ 33 | --version ${VERSION} \ 34 | --do_eval \ 35 | --do_train \ 36 | --batch_size ${BATCH_SIZE} \ 37 | --num_accumulated ${NUM_ACCUMULATED} \ 38 | --layerwise_decay ${LAYERWISE_DECAY} \ 39 | --epochs ${EPOCHS} \ 40 | --lr ${LR} \ 41 | --warmup_ratio ${WARMUP_RATIO} \ 42 | --wd ${WD} \ 43 | --max_seq_length ${MAX_SEQ_LENGTH} \ 44 | --max_grad_norm ${MAX_GRAD_NORM} \ 45 | --dtype ${DTYPE} \ 46 | --overwrite_cache 47 | -------------------------------------------------------------------------------- /src/gluonnlp/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '1.0.0.dev' 2 | from . import base 3 | from . import data 4 | from . import models 5 | from . import utils 6 | from . import attention_cell 7 | from . import initializer as init 8 | from . import layers 9 | from . import loss 10 | from . import lr_scheduler 11 | from . import op 12 | from . import torch 13 | from . import sequence_sampler 14 | from . 
import embedding 15 | -------------------------------------------------------------------------------- /src/gluonnlp/base.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | # pylint: disable=abstract-method 19 | """Helper functions.""" 20 | 21 | import os 22 | import numpy as np 23 | 24 | __all__ = ['get_home_dir', 'get_data_home_dir'] 25 | 26 | INT_TYPES = (int, np.int32, np.int64) 27 | FLOAT_TYPES = (float, np.float16, np.float32, np.float64) 28 | 29 | 30 | def get_home_dir(): 31 | """Get home directory for storing datasets/models/pre-trained word embeddings""" 32 | _home_dir = os.environ.get('GLUONNLP_HOME', os.path.join('~', '.gluonnlp')) 33 | # expand ~ to actual path 34 | _home_dir = os.path.expanduser(_home_dir) 35 | return _home_dir 36 | 37 | 38 | def get_data_home_dir(): 39 | """Get home directory for storing the datasets""" 40 | home_dir = get_home_dir() 41 | return os.path.join(home_dir, 'datasets') 42 | 43 | 44 | def get_model_zoo_home_dir(): 45 | """Get the local directory for storing pretrained models""" 46 | home_dir = get_home_dir() 47 | return os.path.join(home_dir, 'models') 48 | 49 | 50 | def get_model_zoo_checksum_dir(): 51 | """Get the directory that stores the checksums of the artifacts in the model zoo """ 52 | curr_dir = os.path.realpath(os.path.dirname(os.path.realpath(__file__))) 53 | check_sum_dir = os.path.join(curr_dir, 'models', 'model_zoo_checksums') 54 | return check_sum_dir 55 | 56 | 57 | def get_repo_url(): 58 | """Return the base URL for Gluon dataset and model repository """ 59 | default_repo = 's3://gluonnlp-numpy-data' 60 | repo_url = os.environ.get('GLUONNLP_REPO_URL', default_repo) 61 | if repo_url[-1] != '/': 62 | repo_url = repo_url + '/' 63 | return repo_url 64 | 65 | 66 | def get_repo_model_zoo_url(): 67 | """Return the base URL for GluonNLP Model Zoo""" 68 | repo_url = get_repo_url() 69 | model_zoo_url = repo_url + 'models/' 70 | return model_zoo_url 71 | 72 | 73 | def use_einsum_optimization(): 74 | """Whether to use einsum for attention. 
This will potentially accelerate the 75 | attention cell. 76 | 77 | Returns 78 | ------- 79 | flag 80 | Whether to use einsum. Set the environment variable GLUONNLP_USE_EINSUM to "1" or "true" to enable it. 81 | 82 | """ 83 | flag = os.environ.get('GLUONNLP_USE_EINSUM', '0').lower() in ('1', 'true') 84 | return flag 85 | -------------------------------------------------------------------------------- /src/gluonnlp/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/src/gluonnlp/cli/__init__.py -------------------------------------------------------------------------------- /src/gluonnlp/cli/average_checkpoint.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import mxnet as mx 3 | import os 4 | 5 | mx.npx.set_np() 6 | 7 | 8 | def get_parser(): 9 | parser = argparse.ArgumentParser(description='Script to average the checkpoints') 10 | parser.add_argument('--checkpoints', type=str, required=True, nargs='+', 11 | help='checkpoint file paths; supports two formats: ' 12 | '--checkpoints folder/epoch*.params or --checkpoints folder/update*.params') 13 | parser.add_argument('--ids', type=int, required=False, nargs='+', 14 | help='The IDs of the checkpoints.') 15 | parser.add_argument('--begin', type=int, required=False, 16 | default=None, 17 | help='begin number of checkpoints') 18 | parser.add_argument('--end', type=int, required=False, 19 | default=None, 20 | help='end number of checkpoints. ' 21 | 'We select the checkpoints with ID >= begin and <= end.') 22 | parser.add_argument('--save-path', type=str, required=True, help='Path of the output file') 23 | return parser 24 | 25 | 26 | def main(args): 27 | if args.begin is not None or args.end is not None or args.ids is not None: 28 | print(f'Before filtering, the checkpoints are {args.checkpoints}') 29 | prefix = os.path.commonprefix(args.checkpoints) 30 | postfix = os.path.commonprefix([ele[::-1] for ele in args.checkpoints])[::-1] 31 | checkpoint_id_l = [int(ele[len(prefix):-len(postfix)]) for ele in args.checkpoints] 32 | ckpt_paths = [] 33 | if args.ids is not None: 34 | for ele in args.ids: 35 | assert ele in checkpoint_id_l 36 | ckpt_paths.append(f'{prefix}{ele}{postfix}') 37 | else: 38 | assert args.begin is not None and args.end is not None, \ 39 | 'Must specify both begin and end if you want to select a range!' 40 | assert args.begin >= 0 41 | assert args.end >= args.begin 42 | for ele in checkpoint_id_l: 43 | if ele >= args.begin and ele <= args.end: 44 | ckpt_paths.append(f'{prefix}{ele}{postfix}') 45 | else: 46 | ckpt_paths = args.checkpoints 47 | print(f'Load models from {ckpt_paths}') 48 | print('Average the models and save the result to {}'.format(args.save_path)) 49 | assert len(ckpt_paths) > 0, 'Cannot find checkpoints. You may need to check the inputs again.' 50 | res = mx.npx.load(ckpt_paths[0]) 51 | keys = res.keys() 52 | for ckpt_path in ckpt_paths[1:]: 53 | ckpt = mx.npx.load(ckpt_path) 54 | for key in keys: 55 | res[key] += ckpt[key] 56 | for key in keys: 57 | res[key] /= len(ckpt_paths) 58 | mx.npx.savez(args.save_path, **res) 59 | 60 | 61 | def cli_main(): 62 | parser = get_parser() 63 | args = parser.parse_args() 64 | main(args) 65 | 66 | 67 | if __name__ == '__main__': 68 | cli_main() 69 | 
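Because the script guards its entry point, it can be invoked as a module. A minimal sketch, assuming checkpoints named `epoch5.params` through `epoch9.params` live in `folder/`:

```bash
# Average the checkpoints with IDs 5..9 into a single parameter file
python3 -m gluonnlp.cli.average_checkpoint \
    --checkpoints folder/epoch*.params \
    --begin 5 --end 9 \
    --save-path folder/avg_5_9.params
```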
-------------------------------------------------------------------------------- /src/gluonnlp/cli/data: -------------------------------------------------------------------------------- 1 | ../../../scripts/datasets -------------------------------------------------------------------------------- /src/gluonnlp/cli/process: -------------------------------------------------------------------------------- 1 | ../../../scripts/processing -------------------------------------------------------------------------------- /src/gluonnlp/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . import vocab 2 | from . import tokenizers 3 | from . import batchify 4 | from .vocab import * 5 | 6 | __all__ = ['batchify', 'tokenizers'] + vocab.__all__ 7 | -------------------------------------------------------------------------------- /src/gluonnlp/data/tokenizers/__init__.py: -------------------------------------------------------------------------------- 1 | """Tokenizers""" 2 | from .base import * 3 | from .huggingface import * 4 | from .jieba import * 5 | from .moses import * 6 | from .sentencepiece import * 7 | from .spacy import * 8 | from .subword_nmt import * 9 | from .whitespace import * 10 | from .yttm import * 11 | 12 | 13 | __all__ = base.__all__ + \ 14 | huggingface.__all__ + \ 15 | jieba.__all__ + \ 16 | moses.__all__ + \ 17 | sentencepiece.__all__ + \ 18 | spacy.__all__ + \ 19 | subword_nmt.__all__ + \ 20 | whitespace.__all__ + \ 21 | yttm.__all__ 22 | 
-------------------------------------------------------------------------------- /src/gluonnlp/embedding/__init__.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | # pylint: disable=wildcard-import 19 | """Word embeddings.""" 20 | 21 | from . import embed_loader 22 | from .embed_loader import * 23 | 24 | __all__ = embed_loader.__all__ 25 | -------------------------------------------------------------------------------- /src/gluonnlp/loss.py: -------------------------------------------------------------------------------- 1 | from mxnet.gluon import HybridBlock 2 | from mxnet import npx 3 | 4 | 5 | class LabelSmoothCrossEntropyLoss(HybridBlock): 6 | r"""Computes the softmax cross entropy loss with label-smoothing 7 | 8 | .. math:: 9 | 10 | \DeclareMathOperator{softmax}{softmax} 11 | 12 | lp = \log \softmax({pred}) 13 | 14 | L = - \sum_{i} [(1 - \alpha) lp_{i, {label}_i} + \frac{\alpha}{N} \sum_{j=1}^{N} lp_{i, j}] 15 | 16 | Grouping the per-class weights, the true label receives total weight 1 - \alpha + \frac{\alpha}{N} while every other label receives \frac{\alpha}{N}, i.e. 17 | 18 | .. math:: 19 | 20 | L = - \sum_{i} [(1 - \alpha + \frac{\alpha}{N}) lp_{i, {label}_i} + \frac{\alpha}{N} \sum_{j \neq {label}_i} lp_{i, j}] 21 | 22 | Parameters 23 | ---------- 24 | num_labels 25 | The number of possible labels. For example, in NLP, it can be the size of the vocabulary. 26 | alpha 27 | The uncertainty that will be injected into the labels. Each negative label will be 28 | assigned probability mass \frac{\alpha}{N}. 29 | from_logits 30 | Whether input is a log probability (usually from log_softmax) instead of unnormalized numbers. 31 | """ 32 | def __init__(self, num_labels: int, alpha: float = 0.1, from_logits: bool = False, **kwargs): 33 | super().__init__(**kwargs) 34 | self._num_labels = num_labels 35 | self._alpha = alpha 36 | self._from_logits = from_logits 37 | 38 | def forward(self, pred, label): 39 | """ 40 | 41 | Parameters 42 | ---------- 43 | pred : 44 | The predictions of the network. Shape (..., V) 45 | label : 46 | The labels. Shape (..., ) 47 | 48 | Returns 49 | ------- 50 | loss : 51 | Shape (..., ) 52 | """ 53 | if not self._from_logits: 54 | pred = npx.log_softmax(pred, axis=-1) 55 | log_likelihood = npx.pick(pred, label, axis=-1) 56 | all_scores = pred.sum(axis=-1) 57 | loss = - (1 - self._alpha) * log_likelihood\ 58 | - self._alpha / float(self._num_labels) * all_scores 59 | return loss 60 | 
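A minimal sketch of how the loss is used; the shapes and label values are made up for illustration:

```python
import mxnet as mx
from gluonnlp.loss import LabelSmoothCrossEntropyLoss

mx.npx.set_np()

# 5 possible labels, 10% smoothing mass spread over all classes
loss_fn = LabelSmoothCrossEntropyLoss(num_labels=5, alpha=0.1)
pred = mx.np.random.normal(size=(2, 5))  # unnormalized scores, shape (batch, V)
label = mx.np.array([1, 3])              # gold labels, shape (batch,)
loss = loss_fn(pred, label)              # per-sample loss, shape (batch,)
```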
23 | """ 24 | 25 | def __init__(self, warmup_steps: int, base_lr: float = 1E-3, warmup_init_lr: float = 0.0): 26 | super().__init__( 27 | base_lr, warmup_steps, warmup_init_lr, 'linear') 28 | self.base_lr = base_lr 29 | self.warmup_steps = warmup_steps 30 | 31 | def __call__(self, num_update): 32 | if num_update < self.warmup_steps: 33 | return self.get_warmup_lr(num_update) 34 | else: 35 | return self.base_lr * math.sqrt(self.warmup_steps) / math.sqrt(num_update) 36 | -------------------------------------------------------------------------------- /src/gluonnlp/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import * 2 | from .albert import * 3 | from .bert import * 4 | from .electra import * 5 | from .gpt2 import * 6 | from .mobilebert import * 7 | from .roberta import * 8 | from .transformer import * 9 | from .transformer_xl import * 10 | from .xlmr import * 11 | from .bart import * 12 | from .t5 import * 13 | from .mt5 import * 14 | 15 | __all__ = base.__all__ + \ 16 | albert.__all__ + \ 17 | bert.__all__ + \ 18 | electra.__all__ + \ 19 | gpt2.__all__ +\ 20 | mobilebert.__all__ + \ 21 | roberta.__all__ + \ 22 | transformer.__all__ + \ 23 | transformer_xl.__all__ + \ 24 | t5.__all__ + \ 25 | mt5.__all__ 26 | -------------------------------------------------------------------------------- /src/gluonnlp/models/base.py: -------------------------------------------------------------------------------- 1 | __all__ = ['list_backbone_names', 'get_backbone', 'BACKBONE_REGISTRY'] 2 | 3 | from typing import Tuple, List 4 | from ..base import get_model_zoo_home_dir 5 | from ..data.tokenizers import BaseTokenizer 6 | from ..utils.registry import Registry 7 | from mxnet.gluon import Block 8 | 9 | BACKBONE_REGISTRY = Registry('Backbone Models') 10 | 11 | 12 | def list_backbone_names(): 13 | all_keys = [] 14 | for backbone_type in BACKBONE_REGISTRY.list_keys(): 15 | all_keys.extend(BACKBONE_REGISTRY.get(backbone_type)[-1]()) 16 | return all_keys 17 | 18 | 19 | def get_backbone(model_name: str, 20 | root: str = get_model_zoo_home_dir(), 21 | **kwargs) -> Tuple['Block', str, BaseTokenizer, str, List]: 22 | """Get the backbone network 23 | 24 | Parameters 25 | ---------- 26 | model_name 27 | The name of the pretrained model 28 | root 29 | Downloaded directory of the model zoo 30 | 31 | Returns 32 | ------- 33 | model_cls 34 | The class to construct the backbone network 35 | cfg 36 | Path to the config file of the backbone 37 | tokenizer 38 | The tokenizer that is bound to the backbone model 39 | backbone_param_path 40 | The path to the pretrained backbone weights 41 | others 42 | The other items returned by the create function. 43 | Will be wrapped into a list 44 | 45 | Examples 46 | -------- 47 | 48 | >>> from gluonnlp.models import get_backbone 49 | >>> model_cls, cfg, tokenizer, backbone_param_path, _ = get_backbone('google_en_cased_bert_base') 50 | >>> model = model_cls.from_cfg(cfg) 51 | >>> model.load_parameters(backbone_param_path) 52 | """ 53 | model_cls, local_create_fn = None, None 54 | 55 | for backbone_type in BACKBONE_REGISTRY.list_keys(): 56 | ele_model_cls, ele_local_create_fn, list_key_fn = BACKBONE_REGISTRY.get(backbone_type) 57 | if model_name in list_key_fn(): 58 | model_cls = ele_model_cls 59 | local_create_fn = ele_local_create_fn 60 | if model_cls is None or local_create_fn is None: 61 | raise KeyError('The backbone model "{}" is not found! 
' 62 | 'Here are all available backbone models = {}' 63 | .format(model_name, 64 | list_backbone_names())) 65 | cfg, tokenizer, local_params_path, *others = local_create_fn(model_name=model_name, root=root, 66 | **kwargs) 67 | return model_cls, cfg, tokenizer, local_params_path, others 68 | -------------------------------------------------------------------------------- /src/gluonnlp/models/model_zoo_checksums/albert.txt: -------------------------------------------------------------------------------- 1 | google_albert_base_v2/model-125be477.params 125be477d1cecc6843245eafe46ca1dc5961ffb5 46736016 2 | google_albert_base_v2/model-8767fdc9.yml 8767fdc9e1190606dc9aa17725438b4ae33704c4 436 3 | google_albert_base_v2/model_mlm-fe20650e.params fe20650e289fcd1a36c09d39e1d5cf5ffa64ba32 47251372 4 | google_albert_base_v2/spm-65999e5d.model 65999e5d811d9dc77a93bd712c8cb28e3addd852 760289 5 | google_albert_base_v2/vocab-2ee53ae7.json 2ee53ae76a9d8f478e67abc28a4cb9ec7444f090 372576 6 | google_albert_large_v2/model-ad60bcd5.params ad60bcd55cbba463c6e85062769fce846dd9fcf0 70737552 7 | google_albert_large_v2/model-e2e9b974.yml e2e9b9748ffe2b147cd92cbc8edba129ed9e98c1 388 8 | google_albert_large_v2/model_mlm-6a5015ee.params 6a5015ee845f874c1201b5a954275a489e0ed10c 71383980 9 | google_albert_large_v2/spm-65999e5d.model 65999e5d811d9dc77a93bd712c8cb28e3addd852 760289 10 | google_albert_large_v2/vocab-2ee53ae7.json 2ee53ae76a9d8f478e67abc28a4cb9ec7444f090 372576 11 | google_albert_xlarge_v2/model-4149c9e2.params 4149c9e2793dbd9352d27ab11d67f84b0763f4b2 234901136 12 | google_albert_xlarge_v2/model-8123bffd.yml 8123bffda684857ddac48ebeaaa18aba0e1503fb 437 13 | google_albert_xlarge_v2/model_mlm-ee184d38.params ee184d389424bab1adf17cc1feb86c69ba0791ff 236071852 14 | google_albert_xlarge_v2/spm-65999e5d.model 65999e5d811d9dc77a93bd712c8cb28e3addd852 760289 15 | google_albert_xlarge_v2/vocab-2ee53ae7.json 2ee53ae76a9d8f478e67abc28a4cb9ec7444f090 372576 16 | google_albert_xxlarge_v2/model-5601a0ed.params 5601a0edddb11d324aecccca7f496ef09013481e 890384016 17 | google_albert_xxlarge_v2/model-07fbeebc.yml 07fbeebcdee60e2362040807d56c572ae7dd7f03 438 18 | google_albert_xxlarge_v2/model_mlm-d2e2b06f.params d2e2b06f68668cab9c37dd60dca82f00e2e248ab 892603308 19 | google_albert_xxlarge_v2/spm-65999e5d.model 65999e5d811d9dc77a93bd712c8cb28e3addd852 760289 20 | google_albert_xxlarge_v2/vocab-2ee53ae7.json 2ee53ae76a9d8f478e67abc28a4cb9ec7444f090 372576 21 | -------------------------------------------------------------------------------- /src/gluonnlp/models/model_zoo_checksums/bart.txt: -------------------------------------------------------------------------------- 1 | fairseq_bart_base/model-8f4929b5.params 8f4929b54f2f77619885cea9f3bd7dba51a27f38 560560748 2 | fairseq_bart_base/gpt2-396d4d8e.merges 396d4d8ec90cb02f4d56e049e0e4add868bcd943 456318 3 | fairseq_bart_base/model-251bf089.yml 251bf08944d18cc29b59a4a854bdbccf601dabb5 754 4 | fairseq_bart_base/gpt2-f4dedacb.vocab f4dedacb076b1df441c9c7398ed9acd3c19865f3 575079 5 | fairseq_bart_large/model-862277b1.params 862277b1489ed95140cb63279fbd0098ef2dea90 1625180962 6 | fairseq_bart_large/gpt2-396d4d8e.merges 396d4d8ec90cb02f4d56e049e0e4add868bcd943 456318 7 | fairseq_bart_large/model-a2932dea.yml a2932deaf9737d95891755841fae3e388f3d698a 746 8 | fairseq_bart_large/gpt2-f1335494.vocab f1335494f47917829e3b1d08e579ff2c3fe4fd60 558231 9 | -------------------------------------------------------------------------------- /src/gluonnlp/models/model_zoo_checksums/electra.txt: 
-------------------------------------------------------------------------------- 1 | google_electra_small/vocab-e6d2b21d.json e6d2b21d910ccb356aa18f27a1c7d70660edc058 323235 2 | google_electra_small/model-2654c8b4.params 2654c8b4e240a5713078d2bd79582285c3f1b333 53945262 3 | google_electra_small/gen_model-0c30d1c5.params 0c30d1c5678154937dee1d11bef8db6f43d4d767 54202512 4 | google_electra_small/model-9ffb21c8.yml 9ffb21c8885bdb3e5f62c3f7a670d406167ec10c 472 5 | google_electra_small/disc_model-137714b6.params 137714b6c7f327e642861a7380dd94c8b3dbf1ea 54211975 6 | google_electra_base/vocab-e6d2b21d.json e6d2b21d910ccb356aa18f27a1c7d70660edc058 323235 7 | google_electra_base/model-31c235cc.params 31c235cc6da6f1872adffb31efe9318600b89ae5 435579680 8 | google_electra_base/gen_model-253a62c9.params 253a62c9aa9de24d85e09a9ae62ef88501e53dff 134978192 9 | google_electra_base/model-5b35ca0b.yml 5b35ca0b7f117978e372cfd8d98970d2d726e6c0 477 10 | google_electra_base/disc_model-514bd353.params 514bd353f9d42bc907bfa7e1175f4013b0147d7e 437947611 11 | google_electra_large/vocab-e6d2b21d.json e6d2b21d910ccb356aa18f27a1c7d70660edc058 323235 12 | google_electra_large/model-9baf9ff5.params 9baf9ff55cee0195b7754aee7fcb3a1019c99f45 1336395080 13 | google_electra_large/gen_model-82c1b17b.params 82c1b17b4b5ac19700c272858b0b211437f72855 205211944 14 | google_electra_large/model-31b7dfdd.yml 31b7dfdd343bd2b2e43e200a735c83b0af1963f1 476 15 | google_electra_large/disc_model-5b820c02.params 5b820c026aa2ad779c1e9a41ff4ff1408fefacbf 1340602227 16 | gluon_electra_small_owt/vocab-e6d2b21d.json e6d2b21d910ccb356aa18f27a1c7d70660edc058 323235 17 | gluon_electra_small_owt/model-e9636891.params e9636891daae9f2940b2b3210cca3c34c3d8f21e 53748654 18 | gluon_electra_small_owt/model-6e276d98.yml 6e276d98360fbb7c379d28bac34a3ca2918a90ab 473 19 | gluon_electra_small_owt/gen_model-45a6fb67.params 45a6fb67e1e6cb65d22b80498f2152ce9780d579 33926624 20 | gluon_electra_small_owt/disc_model-87836017.params 878360174ac71c3fdc7071be7835bea532c09b8d 54015367 21 | -------------------------------------------------------------------------------- /src/gluonnlp/models/model_zoo_checksums/gpt2.txt: -------------------------------------------------------------------------------- 1 | gpt2_124M/model_lm-99b90604.params 99b9060488b4542ccd045c28401da10a3158ca80 497771820 2 | gpt2_124M/gpt2-396d4d8e.merges 396d4d8ec90cb02f4d56e049e0e4add868bcd943 456318 3 | gpt2_124M/gpt2-9dc62091.vocab 9dc620913410d5ec1a988abf852891e1c9f0f649 558055 4 | gpt2_124M/model-bfed311d.params bfed311d5c980ba475f90ccf7f536d25c3b40386 497769466 5 | gpt2_355M/model_lm-eed0e964.params eed0e964f4222823a557acfee2c106f228ce0188 1419317644 6 | gpt2_355M/gpt2-396d4d8e.merges 396d4d8ec90cb02f4d56e049e0e4add868bcd943 456318 7 | gpt2_355M/gpt2-9dc62091.vocab 9dc620913410d5ec1a988abf852891e1c9f0f649 558055 8 | gpt2_355M/model-81dee612.params 81dee612413733899f6e5fbbeac91da781805e1b 1419312986 9 | gpt2_774M/model_lm-cfbfa641.params cfbfa6419aaf1eae480fba5a1a7c8ea6096d43d6 3096157676 10 | gpt2_774M/gpt2-396d4d8e.merges 396d4d8ec90cb02f4d56e049e0e4add868bcd943 456318 11 | gpt2_774M/gpt2-9dc62091.vocab 9dc620913410d5ec1a988abf852891e1c9f0f649 558055 12 | gpt2_774M/model-9917e24e.params 9917e24e89c651793adea69042d6cceddfc7973c 3096150714 13 | gpt2_1558M/model_lm-c8489dcb.params c8489dcbdb0d39bc3eac6d1d62e0e3dace9faa8f 6230494540 14 | gpt2_1558M/gpt2-396d4d8e.merges 396d4d8ec90cb02f4d56e049e0e4add868bcd943 456318 15 | gpt2_1558M/gpt2-9dc62091.vocab 9dc620913410d5ec1a988abf852891e1c9f0f649 558055 16 
| gpt2_1558M/model-af3dd713.params af3dd71313b55b4be5f52bdd538c9db054c1e190 6230485274 17 | -------------------------------------------------------------------------------- /src/gluonnlp/models/model_zoo_checksums/mobilebert.txt: -------------------------------------------------------------------------------- 1 | google_uncased_mobilebert/model-1c33216b.yml 1c33216b256a76713e0906b7ceefb3b37d4d35a0 510 2 | google_uncased_mobilebert/vocab-e6d2b21d.json e6d2b21d910ccb356aa18f27a1c7d70660edc058 323235 3 | google_uncased_mobilebert/model-c8346cf2.params c8346cf2caf9cc422f081f03b50bc69945328894 98424130 4 | google_uncased_mobilebert/model_mlm-53948e82.params 53948e82d8ec091927af357387b36ade0e42b34c 146503986 5 | -------------------------------------------------------------------------------- /src/gluonnlp/models/model_zoo_checksums/mt5.txt: -------------------------------------------------------------------------------- 1 | google_mt5_small/mt5-2730df74.vocab 2730df74056f29388cc4c8c912af6e97ac54bab2 4309802 2 | google_mt5_small/model-23352279.yml 23352279d13971a536847aebe31b34c4a0b80dd8 242 3 | google_mt5_small/model-b20e24d7.params b20e24d75d097e9eea647f4b9a0dc53b956a9d1a 688633650 4 | google_mt5_base/mt5-2730df74.vocab 2730df74056f29388cc4c8c912af6e97ac54bab2 4309802 5 | google_mt5_base/model-da71d108.yml da71d1084d75af5648e1b9247fecfa74e0361da0 244 6 | google_mt5_base/model-91eaa894.params 91eaa89444e062e2fc3953b1184e15ccf5375385 1561555474 7 | google_mt5_large/mt5-2730df74.vocab 2730df74056f29388cc4c8c912af6e97ac54bab2 4309802 8 | google_mt5_large/model-1226608e.yml 1226608ec2c53cc6dcf2303a8f1b19c59f43cbfe 245 9 | google_mt5_large/model-6b46e841.params 6b46e841e9b1b4c8ad97b071b316f9c52c2731e6 3894572546 10 | google_mt5_xl/mt5-2730df74.vocab 2730df74056f29388cc4c8c912af6e97ac54bab2 4309802 11 | google_mt5_xl/model-089b83a2.yml 089b83a2c893bd901fe26180f2fbfd2f52804ae0 245 12 | google_mt5_xl/model-7655ea81.params 7655ea81d4b7c9787dd1bfa902e96cdf9e124e3d 12922784462 13 | google_mt5_xxl/mt5-2730df74.vocab 2730df74056f29388cc4c8c912af6e97ac54bab2 4309802 14 | google_mt5_xxl/model-65e24812.yml 65e248120fbdcbaced58fb6f6c21f8143f9e97be 246 15 | google_mt5_xxl/model-2e9e44b9.params 2e9e44b9fc10d8a4c7133fa5e67ecadedfbfb692 47588620878 16 | -------------------------------------------------------------------------------- /src/gluonnlp/models/model_zoo_checksums/roberta.txt: -------------------------------------------------------------------------------- 1 | fairseq_roberta_base/model-565d1db7.yml 565d1db71b0452fa2c28f155b8e9d90754f4f40a 401 2 | fairseq_roberta_base/gpt2-396d4d8e.merges 396d4d8ec90cb02f4d56e049e0e4add868bcd943 456318 3 | fairseq_roberta_base/gpt2-f1335494.vocab f1335494f47917829e3b1d08e579ff2c3fe4fd60 558231 4 | fairseq_roberta_base/model-09a1520a.params 09a1520adf652468c07e43a6ed28908418fa58a7 496222787 5 | fairseq_roberta_base/model_mlm-29889e2b.params 29889e2b4ef20676fda117bb7b754e1693d0df25 498794868 6 | fairseq_roberta_large/model-6b043b91.params 6b043b91a6a781a12ea643d0644d32300db38ec8 1417251819 7 | fairseq_roberta_large/gpt2-396d4d8e.merges 396d4d8ec90cb02f4d56e049e0e4add868bcd943 456318 8 | fairseq_roberta_large/model-6e66dc4a.yml 6e66dc4a450560a93aaf3d0ba9e0d447495d778a 402 9 | fairseq_roberta_large/gpt2-f1335494.vocab f1335494f47917829e3b1d08e579ff2c3fe4fd60 558231 10 | fairseq_roberta_large/model_mlm-119f38e1.params 119f38e1249bd28bea7dd2e90c09b8f4b879fa19 1421664140 11 | -------------------------------------------------------------------------------- 
/src/gluonnlp/models/model_zoo_checksums/t5.txt: -------------------------------------------------------------------------------- 1 | google_t5_small/t5-5f05e7c5.vocab 5f05e7c57adf916bdba74912b0b37dea5c585988 791656 2 | google_t5_small/model-3cc6e5f7.yml 3cc6e5f7c6ccc3e2ac174d899b1aed74d7de65e0 235 3 | google_t5_small/model-e34f6fbd.params e34f6fbda666c02f0ffd5e15fec02056d3e3014d 242141346 4 | google_t5_base/t5-5f05e7c5.vocab 5f05e7c57adf916bdba74912b0b37dea5c585988 791656 5 | google_t5_base/model-ca5cc26c.yml ca5cc26c9dfe31295c97ef536b3f6f954ef1a447 237 6 | google_t5_base/model-e1956ac9.params e1956ac9670263b6803672bd0d7579f71d7494c6 891901274 7 | google_t5_large/t5-5f05e7c5.vocab 5f05e7c57adf916bdba74912b0b37dea5c585988 791656 8 | google_t5_large/model-01c5d9ae.yml 01c5d9ae5476b18c3516ebbe3a505b966982027d 238 9 | google_t5_large/model-bf5fc813.params bf5fc8138a04aa5f3bc495cacb010c873e59e909 2951363690 10 | google_t5_3B/t5-5f05e7c5.vocab 5f05e7c57adf916bdba74912b0b37dea5c585988 791656 11 | google_t5_3B/model-791f2e90.yml 791f2e90057fcccfa83bf8130034196d3550fb77 240 12 | google_t5_3B/model-48ba7250.params 48ba72501239c8d2d355282eebdebd0935556780 11407098198 13 | google_t5_11B/t5-5f05e7c5.vocab 5f05e7c57adf916bdba74912b0b37dea5c585988 791656 14 | google_t5_11B/model-2e50d93e.yml 2e50d93effc258aa75af162e9598be60ae13a83e 241 15 | google_t5_11B/model-1936031c.params 1936031c6db581ae866f41ec6d3c1c6de2049823 45229995126 16 | -------------------------------------------------------------------------------- /src/gluonnlp/models/model_zoo_checksums/xlmr.txt: -------------------------------------------------------------------------------- 1 | fairseq_xlmr_base/model-3fa134e9.params 3fa134e9a13e2329ffa7b8d39612695ed8397c9d 1109814851 2 | fairseq_xlmr_base/model-b893d178.yml b893d178fa859fb6c708a08fc970b9980e047825 402 3 | fairseq_xlmr_base/model_mlm-86e37954.params 86e379542a6430cd988ff4b6a25966949afc241a 1113185880 4 | fairseq_xlmr_base/sentencepiece-18e17bae.model 18e17bae37be115135d4cf4ad9dfcc4f3b12cb80 5069075 5 | fairseq_xlmr_large/model-b62b074c.params b62b074cdd41e682075e2407f842be6578696b26 2235374571 6 | fairseq_xlmr_large/model-01fc59fb.yml 01fc59fb3a805f09d2aa11369d5b57e0be931fdd 403 7 | fairseq_xlmr_large/model_mlm-887506c2.params 887506c20bda452cf13ef04390eaa57a55602a92 2240585840 8 | fairseq_xlmr_large/sentencepiece-18e17bae.model 18e17bae37be115135d4cf4ad9dfcc4f3b12cb80 5069075 9 | -------------------------------------------------------------------------------- /src/gluonnlp/third_party/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/src/gluonnlp/third_party/__init__.py -------------------------------------------------------------------------------- /src/gluonnlp/torch/__init__.py: -------------------------------------------------------------------------------- 1 | from . import attention_cell 2 | from . import data 3 | from . import layers 4 | from . import optimizers 5 | from . import models 6 | from . 
import utils 7 | -------------------------------------------------------------------------------- /src/gluonnlp/torch/clib/amp_C_frontend.cpp: -------------------------------------------------------------------------------- 1 | #include <torch/extension.h> 2 | 3 | void multi_tensor_lans_cuda( 4 | int chunk_size, 5 | at::Tensor noop_flag, 6 | std::vector<std::vector<at::Tensor>> tensor_lists, 7 | const float lr, 8 | const float beta1, 9 | const float beta2, 10 | const float epsilon, 11 | const int step, 12 | const int bias_correction, 13 | const float weight_decay, 14 | const int grad_averaging, 15 | const int mode, 16 | const bool normalize_grad); 17 | 18 | 19 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 20 | m.def("multi_tensor_lans", &multi_tensor_lans_cuda, 21 | "Computes and applies the update for the LANS optimizer"); 22 | } 23 | -------------------------------------------------------------------------------- /src/gluonnlp/torch/clib/compat.h: -------------------------------------------------------------------------------- 1 | #ifndef TORCH_CHECK 2 | #define TORCH_CHECK AT_CHECK 3 | #endif 4 | 5 | #define DATA_PTR data_ptr 6 | -------------------------------------------------------------------------------- /src/gluonnlp/torch/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . import batchify 2 | -------------------------------------------------------------------------------- /src/gluonnlp/torch/models/__init__.py: -------------------------------------------------------------------------------- 1 | from . import transformer 2 | from . import bert 3 | -------------------------------------------------------------------------------- /src/gluonnlp/torch/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | from . import schedules 2 | from . import fused_lans 3 | 4 | from .fused_lans import FusedLANS 5 | -------------------------------------------------------------------------------- /src/gluonnlp/torch/optimizers/schedules.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020, Amazon. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Optimization for BERT model.""" 15 | 16 | from torch.optim.lr_scheduler import LambdaLR 17 | 18 | __all__ = ['get_warmup_linear_const_decay_poly_schedule'] 19 | 20 | 21 | def get_warmup_linear_const_decay_poly_schedule(optimizer, total_steps, warmup_ratio=0.002, 22 | const_ratio=0., degree=1.0, last_epoch=-1): 23 | """Create a schedule where the learning rate increases linearly from 0 to the 24 | initial lr set in the optimizer during the warmup phase, is held constant for a 25 | configurable period, and then decays polynomially to 0 (linearly when 26 | :obj:`degree` is 1.0). 27 | 28 | Args: 29 | optimizer (:class:`~torch.optim.Optimizer`): 30 | The optimizer for which to schedule the learning rate. 31 | total_steps (:obj:`int`): 32 | The total number of training steps. 33 | warmup_ratio (:obj:`float`): 34 | The fraction of total steps used for the linear warmup phase. 35 | const_ratio (:obj:`float`): 36 | The fraction of total steps during which the learning rate is held constant 37 | after warmup. 38 | degree (:obj:`float`): 39 | The power of the polynomial decay. 1.0 corresponds to linear decay. 40 | last_epoch (:obj:`int`, `optional`, defaults to -1): 41 | The index of the last epoch when resuming training. 42 | 43 | Return: 44 | :obj:`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule. 45 | 46 | """ 47 | def lr_lambda(global_step: int): 48 | x = global_step / total_steps 49 | if warmup_ratio == 0.0: 50 | return 1.0 51 | elif x < warmup_ratio: 52 | return x / warmup_ratio 53 | elif x < warmup_ratio + const_ratio: 54 | return 1.0 55 | return ((1.0 - x) / (1.0 - warmup_ratio - const_ratio))**degree 56 | 57 | return LambdaLR(optimizer, lr_lambda, last_epoch) 58 | 
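A minimal sketch of driving the schedule with a toy optimizer (the model and hyper-parameters are made up for illustration):

```python
import torch
from gluonnlp.torch.optimizers.schedules import get_warmup_linear_const_decay_poly_schedule

model = torch.nn.Linear(8, 8)  # toy model purely for illustration
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
total_steps = 1000
# 2% linear warm-up, 8% constant plateau, then linear decay to zero
scheduler = get_warmup_linear_const_decay_poly_schedule(
    optimizer, total_steps, warmup_ratio=0.02, const_ratio=0.08)
for step in range(total_steps):
    optimizer.step()   # apply the parameter update with the current lr
    scheduler.step()   # advance the schedule by one step
```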
51 | """ 52 | def lr_lambda(global_step: int): 53 | x = global_step / total_steps 54 | if warmup_ratio == 0.0: 55 | return 1.0 56 | elif x < warmup_ratio: 57 | return x / warmup_ratio 58 | elif x < warmup_ratio + const_ratio: 59 | return 1.0 60 | return ((1.0 - x) / (1.0 - warmup_ratio - const_ratio))**degree 61 | 62 | return LambdaLR(optimizer, lr_lambda, last_epoch) 63 | -------------------------------------------------------------------------------- /src/gluonnlp/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from . import config 2 | from . import shm 3 | from . import lazy_imports 4 | from . import preprocessing 5 | from . import registry 6 | from . import testing 7 | from .parameter import * 8 | from .misc import * 9 | -------------------------------------------------------------------------------- /src/gluonnlp/utils/config.py: -------------------------------------------------------------------------------- 1 | import yacs.config 2 | 3 | 4 | class CfgNode(yacs.config.CfgNode): 5 | def clone_merge(self, cfg_filename_or_other_cfg): 6 | """Create a new cfg by cloning and merging with the given cfg 7 | 8 | Parameters 9 | ---------- 10 | cfg_filename_or_other_cfg 11 | The path of a yaml config file, another CfgNode, or None. 12 | 13 | Returns 14 | ------- 15 | ret 16 | The new config created by cloning self and merging in the argument. 17 | """ 18 | ret = self.clone() 19 | if isinstance(cfg_filename_or_other_cfg, str): 20 | ret.merge_from_file(cfg_filename_or_other_cfg) 21 | return ret 22 | elif isinstance(cfg_filename_or_other_cfg, CfgNode): 23 | ret.merge_from_other_cfg(cfg_filename_or_other_cfg) 24 | return ret 25 | elif cfg_filename_or_other_cfg is None: 26 | return ret 27 | else: 28 | raise TypeError('Type of cfg_filename_or_other_cfg is not supported!') 29 | -------------------------------------------------------------------------------- /src/gluonnlp/utils/shm.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import mmap 3 | 4 | if pickle.HIGHEST_PROTOCOL < 5: 5 | del pickle 6 | import pickle5 as pickle 7 | 8 | 9 | def serialize(path, tbl): 10 | """Serialize tbl with out-of-band data to path for zero-copy shared memory usage. 11 | 12 | If the object to be serialized, or the objects it uses for data 13 | storage (such as NumPy arrays), implements the pickle protocol version 5 14 | pickle.PickleBuffer type in __reduce_ex__, then this function can store 15 | these buffers out-of-band as files in `path` so that they can subsequently be 16 | re-used for zero-copy sharing across processes. 17 | 18 | Parameters 19 | ---------- 20 | path : pathlib.Path 21 | Empty folder used to save serialized data. Usually a folder under /dev/shm 22 | tbl : object 23 | Object to serialize. For example a PyArrow Table, a Pandas DataFrame or 24 | any type that relies on NumPy to store the binary data. 
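25 | 
26 | Example
27 | -------
28 | A sketch of the intended round trip (assuming a pandas DataFrame and an
29 | empty folder created under /dev/shm)::
30 | 
31 | >>> import pandas as pd, pathlib, tempfile
32 | >>> d = pathlib.Path(tempfile.mkdtemp(dir='/dev/shm'))
33 | >>> serialize(d, pd.DataFrame({'a': [1, 2, 3]}))
34 | >>> tbl = load(d)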
35 | 36 | """ 37 | idx = 0 38 | 39 | def buffer_callback(buf): 40 | nonlocal idx 41 | with open(path / f'{idx}.bin', 'wb') as f: 42 | f.write(buf) 43 | idx += 1 44 | 45 | with open(path / 'meta.pkl', 'wb') as f: 46 | pickle.dump(tbl, f, protocol=5, buffer_callback=buffer_callback) 47 | 48 | 49 | def load(path): 50 | """Load a serialized object with out-of-band data from path based on zero-copy shared memory. 51 | 52 | Parameters 53 | ---------- 54 | path : pathlib.Path 55 | Folder used to save serialized data with serialize(). Usually a folder under /dev/shm 56 | 57 | """ 58 | num_buffers = len(list(path.iterdir())) - 1 # exclude meta.pkl 59 | buffers = [] 60 | for idx in range(num_buffers): 61 | f = open(path / f'{idx}.bin', 'rb') 62 | buffers.append(mmap.mmap(f.fileno(), 0, prot=mmap.PROT_READ)) 63 | with open(path / 'meta.pkl', 'rb') as f: 64 | return pickle.load(f, buffers=buffers) 65 | -------------------------------------------------------------------------------- /src/gluonnlp/utils/tvm_utils.py: -------------------------------------------------------------------------------- 1 | __all__ = ['get_ec2_tvm_flags', 'update_tvm_convert_map'] 2 | 3 | import tvm.relay.op as _op 4 | import tvm.relay.expr as _expr 5 | from typing import Dict 6 | from tvm.relay.frontend.mxnet import _convert_map 7 | from tvm.relay.frontend.common import infer_type as _infer_type 8 | 9 | def get_ec2_tvm_flags() -> Dict[str, Dict]: 10 | r"""Return the recommended flags for TVM compilation on AWS EC2 instances. 11 | 12 | These include the C4, C5, G4, and P3 instance families. 13 | 14 | For more details about AWS EC2 instances, refer to https://aws.amazon.com/ec2/instance-types/. 15 | 16 | Returns 17 | ------- 18 | info_dict 19 | A dictionary that contains the mapping between instance type and the 20 | corresponding compilation flags. 21 | Each element includes: 22 | 23 | - target 24 | The compilation target 25 | - use_gpu 26 | Whether it's a GPU instance 27 | - opt_level 28 | The optimization level in compilation 29 | - required_pass 30 | Additional graph passes for further improvement. 
31 | """ 32 | instance_info = { 33 | 'g4': {'target': "cuda -model=t4 -libs=cublas,cudnn", 34 | 'use_gpu': True, 35 | 'opt_level': 3, 36 | 'required_pass': ["FastMath"]}, 37 | 'c4': {'target': 'llvm -mcpu=core-avx2 -libs=cblas', 38 | 'use_gpu': False, 39 | 'opt_level': 3, 40 | 'required_pass': ["FastMath"]}, 41 | 'c5': {'target': 'llvm -mcpu=skylake-avx512 -libs=cblas', 42 | 'use_gpu': False, 43 | 'opt_level': 3, 44 | 'required_pass': ["FastMath"]}, 45 | 'p3': {'target': 'cuda -model=v100 -libs=cublas,cudnn', 46 | 'use_gpu': True, 47 | 'opt_level': 3, 48 | 'required_pass': ["FastMath"]} 49 | } 50 | return instance_info 51 | 52 | 53 | def update_tvm_convert_map() -> None: 54 | """Monkey-patch the convert map in tvm/relay/frontend/mxnet.py to support masked_softmax.""" 55 | _convert_map.update({'masked_softmax': _mx_masked_softmax}) 56 | 57 | 58 | def _mx_masked_softmax(inputs, attrs): 59 | assert len(inputs) == 1 or len(inputs) == 2 60 | axis = attrs.get_int("axis") 61 | temperature = attrs.get_float("temperature") 62 | if len(inputs) == 1: 63 | result = _op.nn.softmax(inputs[0] / _expr.const(temperature), axis=axis) 64 | else: 65 | # Replace masked-out positions with a large negative constant before the softmax; 66 | # float16 needs a smaller magnitude to stay within its representable range. 67 | neg = -1e18 68 | att_score, mask = inputs 69 | att_score_dtype = _infer_type(att_score).checked_type.dtype 70 | if att_score_dtype == "float16": 71 | neg = -1e4 72 | temp = _op.where(mask, 73 | att_score, 74 | _expr.const(neg)) 75 | result = _op.multiply(_op.nn.softmax(temp / _expr.const(temperature), axis=axis), mask.astype("float32")) 76 | return result 77 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # Unit Tests 2 | 3 | To run the unit tests, use the following command 4 | 5 | ```bash 6 | python3 -m pytest --forked --device="cpu" . 7 | ``` 8 | 9 | To run the tests in a specific file, e.g., `test_models_transformer.py`, use the following command 10 | 11 | ```bash 12 | python3 -m pytest --forked --device="cpu" test_models_transformer.py 13 | ``` 14 | 15 | To run the tests on GPU only, use the following command 16 | 17 | ```bash 18 | python3 -m pytest --forked --device="gpu" test_models_transformer.py 19 | ``` 20 | 21 | To run the tests on both CPU and GPU, use the following command 22 | 23 | ```bash 24 | python3 -m pytest --forked --device="cpu" --device="gpu" test_models_transformer.py 25 | ``` 26 | 27 | In addition, to run all the tests, including the slow ones, add the `--runslow` flag 28 | 29 | ```bash 30 | python3 -m pytest --forked --device="gpu" --runslow test_models.py 31 | ``` 32 | 33 | Refer to the [official guide of pytest](https://docs.pytest.org/en/latest/) for more details. 34 | 35 | # Naming Convention 36 | 37 | The naming convention of the tests is `test_{module_name}.py`. 38 | For example, the test of [models/transformer.py](../src/gluonnlp/models/transformer.py) will be in 39 | `test_models_transformer.py`. The test of [models/__init__.py](../src/gluonnlp/models/__init__.py) 40 | is `test_models.py`. An example skeleton is given below. 41 | 42 | Also, we include the scheduled testing scripts for `nlp_process` in [process_cli](process_cli), 43 | and for `nlp_data` in [data_cli](data_cli). 
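44 | 
45 | For instance, a new test module following this convention might look like the
46 | following sketch (hypothetical module and test names; `ctx` is the device
47 | fixture used throughout these tests):
48 | 
49 | ```python
50 | # tests/test_models_foo.py -- hypothetical skeleton
51 | import mxnet as mx
52 | import pytest
53 | 
54 | mx.npx.set_np()
55 | 
56 | 
57 | @pytest.mark.parametrize('hybridize', [True, False])
58 | def test_foo_model(hybridize, ctx):
59 |     with ctx:
60 |         # build the block, optionally hybridize it, and run a forward pass
61 |         ...
62 | ```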
63 | 64 | -------------------------------------------------------------------------------- /tests/data_cli/test_glue.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import tempfile 3 | import pandas as pd 4 | from gluonnlp.cli.data.general_nlp_benchmark import prepare_glue 5 | 6 | 7 | @pytest.mark.remote_required 8 | @pytest.mark.parametrize('task', ["cola", "sst", "mrpc", "qqp", "sts", "mnli", 9 | "snli", "qnli", "rte", "wnli", "diagnostic"]) 10 | def test_glue(task): 11 | parser = prepare_glue.get_parser() 12 | with tempfile.TemporaryDirectory() as root: 13 | args = parser.parse_args(['--benchmark', 'glue', 14 | '--tasks', task, 15 | '--data_dir', root]) 16 | prepare_glue.main(args) 17 | 18 | 19 | @pytest.mark.remote_required 20 | @pytest.mark.parametrize('task', ["cb", "copa", "multirc", "rte", "wic", "wsc", "boolq", "record", 21 | 'broadcoverage-diagnostic', 'winogender-diagnostic']) 22 | def test_superglue(task): 23 | parser = prepare_glue.get_parser() 24 | with tempfile.TemporaryDirectory() as root: 25 | args = parser.parse_args(['--benchmark', 'superglue', 26 | '--tasks', task, 27 | '--data_dir', root]) 28 | prepare_glue.main(args) 29 | -------------------------------------------------------------------------------- /tests/data_cli/test_wikipedia.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import tempfile 3 | from gluonnlp.cli.data.pretrain_corpus import prepare_wikipedia 4 | 5 | 6 | @pytest.mark.remote_required 7 | # Test for zh-classical (文言) + wuu (吴语), which are smaller compared with English 8 | @pytest.mark.parametrize('lang', ['zh-classical', 'wuu']) 9 | def test_download_format(lang): 10 | parser = prepare_wikipedia.get_parser() 11 | with tempfile.TemporaryDirectory() as root: 12 | download_args = parser.parse_args(['--mode', 'download+format', 13 | '--lang', lang, 14 | '--date', 'latest', '-o', root]) 15 | prepare_wikipedia.main(download_args) 16 | -------------------------------------------------------------------------------- /tests/process_cli/test_average_checkpoint.py: -------------------------------------------------------------------------------- 1 | import os 2 | from gluonnlp.cli import average_checkpoint 3 | from mxnet.gluon import nn 4 | from numpy.testing import assert_allclose 5 | 6 | _CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__))) 7 | 8 | def test_avg_ckpt(): 9 | try: 10 | average_checkpoint.cli_main() 11 | except BaseException:  # calling the CLI without arguments is expected to fail 12 | pass 13 | num_ckpts = 5 14 | model = nn.Dense(units=10, in_units=10) 15 | model.initialize() 16 | params = model.collect_params() 17 | gd_avg = {} 18 | for key in params.keys(): 19 | gd_avg[key] = params[key].data().asnumpy() 20 | model.save_parameters(os.path.join(_CURR_DIR, 'update0.params')) 21 | 22 | for i in range(1, num_ckpts): 23 | model.initialize(force_reinit=True) 24 | params = model.collect_params() 25 | for key in gd_avg.keys(): 26 | gd_avg[key] += params[key].data().asnumpy() 27 | model.save_parameters(os.path.join(_CURR_DIR, 'update{}.params'.format(i))) 28 | 29 | for key in gd_avg.keys(): 30 | gd_avg[key] /= num_ckpts 31 | 32 | parser = average_checkpoint.get_parser() 33 | args = parser.parse_args(['--checkpoints', None, 34 | '--begin', '0', 35 | '--end', str(num_ckpts-1), 36 | '--save-path', os.path.join(_CURR_DIR, 'avg.params')]) 37 | args.checkpoints = ['fake', 'ckpt'] 38 | try: 39 | average_checkpoint.main(args) 40 | except BaseException:  # averaging nonexistent checkpoints should raise 41 | pass 42 | args.checkpoints = [os.path.join(_CURR_DIR, 
'update{}.params'.format(i)) \ 43 | for i in range(0, num_ckpts)] 44 | average_checkpoint.main(args) 45 | 46 | model.load_parameters(os.path.join(_CURR_DIR, 'avg.params')) 47 | params = model.collect_params() 48 | 49 | for key in gd_avg.keys(): 50 | assert_allclose(gd_avg[key], params[key].data().asnumpy(), 1E-7, 1E-7) 51 | -------------------------------------------------------------------------------- /tests/test_data_filtering.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from gluonnlp.data.filtering import ProfanityFilter, MosesNormalizer, LanguageIdentifier 3 | import multiprocessing 4 | 5 | 6 | def test_profanity_filter(): 7 | profanity_filter = ProfanityFilter('en') 8 | filter_word = 'anal' 9 | unfilter_word = 'analysis' 10 | for text in [' ' + filter_word, ' ' + filter_word + ' ', 11 | filter_word, filter_word + ' ' + unfilter_word]: 12 | assert profanity_filter.match(text) is True 13 | for text in [' ' + unfilter_word, unfilter_word, unfilter_word + ' ']: 14 | assert profanity_filter.match(text) is False 15 | 16 | 17 | def test_sentence_normalizer(): 18 | normalizer = MosesNormalizer('en') 19 | assert normalizer(' hello world!!".\t\t\r') == ' hello world!!." ' 20 | assert normalizer( 21 | b'We therefore defend, and will continue to defend wherever necessary, our position of \xe2\x80\x98no diversion\xe2\x80\x99.\n'.decode('utf-8')) == \ 22 | "We therefore defend, and will continue to defend wherever necessary, our position of 'no diversion'. " 23 | normalizer = MosesNormalizer('en', remove_non_printable_char=False) 24 | assert normalizer(' hello world!!".\t\t\r') == ' hello world!!."\t\t' 25 | normalizer = MosesNormalizer('en', remove_non_printable_char=False, unicode_norm_form='NFKC') 26 | assert normalizer(' hello world!!"⁵.\t\t\r') == ' hello world!!"5.\t\t' 27 | 28 | 29 | @pytest.mark.parametrize('algo', ['fasttext', 'fasttext_compressed', 'langid']) 30 | def test_language_identifier(algo): 31 | lang_id_model = LanguageIdentifier(algo=algo) 32 | lang_label, score = lang_id_model('你好,世界') 33 | assert lang_label == 'zh' 34 | with multiprocessing.Pool(2) as pool: 35 | out = pool.map(lang_id_model, ['你好,世界', 'Hello World']) 36 | assert out[0][0] == 'zh' 37 | assert out[1][0] == 'en' 38 | -------------------------------------------------------------------------------- /tests/test_embedding.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import collections 3 | import os 4 | import tempfile 5 | import pytest 6 | from gluonnlp.embedding import load_embeddings, get_fasttext_model 7 | from gluonnlp.data import Vocab 8 | 9 | def test_load_embeddings(): 10 | text_data = ['hello', 'world', 'hello', 'nice', 'world', 'hi', 'world', 'sadgood'] 11 | counter = collections.Counter(text_data) 12 | vocab1 = Vocab(counter) 13 | # load with vocab 14 | matrix1 = load_embeddings(vocab1) 15 | assert len(matrix1) == len(vocab1) 16 | # load without vocab 17 | matrix2, vocab2 = load_embeddings() 18 | assert len(matrix2) == len(vocab2) 19 | np.testing.assert_almost_equal(matrix1[vocab1["hello"]], matrix2[vocab2["hello"]]) 20 | 21 | # test_unk_method 22 | def simple(words): 23 | return np.ones((len(words), 50)) 24 | matrix3 = load_embeddings(vocab1, unk_method=simple) 25 | assert sum(matrix3[vocab1['sadgood']] == 1) == matrix3.shape[-1] 26 | np.testing.assert_almost_equal(matrix3[vocab1["hello"]], matrix2[vocab2["hello"]]) 27 | 28 | # load txt 29 | with tempfile.TemporaryDirectory() as 
root: 30 | path = os.path.join(root, "tmp.txt") 31 | with open(path, "w") as f: 32 | f.write("{} {}\n".format(matrix1.shape[0], matrix1.shape[1])) 33 | for word, vec in zip(vocab1.all_tokens, matrix1): 34 | f.write(word + " ") 35 | f.write(" ".join([str(num) for num in vec.tolist()])) 36 | f.write("\n") 37 | matrix4 = load_embeddings(vocab1, path) 38 | np.testing.assert_almost_equal(matrix4, matrix1) 39 | 40 | 41 | @pytest.mark.slow 42 | @pytest.mark.remote_required 43 | def test_get_fasttext_model(): 44 | text_data = ['hello', 'world', 'hello', 'nice', 'world', 'hi', 'world'] 45 | counter = collections.Counter(text_data) 46 | vocab1 = Vocab(counter) 47 | matrix1 = load_embeddings(vocab1, 'wiki.en') 48 | ft = get_fasttext_model('wiki.en') 49 | np.testing.assert_almost_equal(matrix1[vocab1["hello"]], ft['hello'], decimal=4) 50 | with pytest.raises(ValueError): 51 | get_fasttext_model('wiki.multi.ar') 52 | 53 | -------------------------------------------------------------------------------- /tests/test_gluon_block.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import mxnet as mx 3 | from mxnet import nd, np, npx 4 | from mxnet.test_utils import assert_allclose 5 | from mxnet.gluon import HybridBlock, Constant 6 | from mxnet.gluon.data import DataLoader 7 | import itertools 8 | mx.npx.set_np() 9 | 10 | 11 | def test_const(): 12 | class Foo(HybridBlock): 13 | def __init__(self): 14 | super().__init__() 15 | self.weight = Constant(np.ones((10, 10))) 16 | 17 | def forward(self, x, weight): 18 | return x, weight.astype(np.float32) 19 | 20 | foo = Foo() 21 | foo.hybridize() 22 | foo.initialize() 23 | 24 | 25 | def test_scalar(): 26 | class Foo(HybridBlock): 27 | def forward(self, x): 28 | return x * x * 2 29 | 30 | foo = Foo() 31 | foo.hybridize() 32 | foo.initialize() 33 | out = foo(mx.np.array(1.0)) 34 | assert_allclose(out.asnumpy(), np.array(2.0)) 35 | 36 | 37 | def test_gluon_nonzero_hybridize(): 38 | class Foo(HybridBlock): 39 | def __init__(self): 40 | super().__init__() 41 | 42 | def forward(self, x): 43 | dat = npx.nonzero(x) 44 | return dat.sum() + dat 45 | 46 | foo = Foo() 47 | foo.hybridize() 48 | out = foo(mx.np.array([1, 0, 2, 0, 3, 0])) 49 | out.wait_to_read() 50 | out = foo(mx.np.array([0, 0, 0, 0, 0, 0])) 51 | out.wait_to_read() 52 | 53 | 54 | @pytest.mark.xfail(reason='Expected to fail due to MXNet bug https://github.com/apache/' 55 | 'incubator-mxnet/issues/19659') 56 | def test_gluon_boolean_mask(): 57 | class Foo(HybridBlock): 58 | def forward(self, data, indices): 59 | mask = indices < 3 60 | data = npx.reshape(data, (-1, -2), reverse=True) 61 | mask = np.reshape(mask, (-1,)) 62 | sel = nd.np._internal.boolean_mask(data, mask) 63 | return sel 64 | data = mx.np.random.normal(0, 1, (5, 5, 5, 5, 16)) 65 | indices = mx.np.random.randint(0, 5, (5, 5, 5, 5)) 66 | data.attach_grad() 67 | indices.attach_grad() 68 | foo = Foo() 69 | foo.hybridize() 70 | with mx.autograd.record(): 71 | out = foo(data, indices) 72 | out.backward() 73 | out.wait_to_read() 74 | 75 | 76 | def test_basic_dataloader(): 77 | def grouper(iterable, n, fillvalue=None): 78 | """Collect data into fixed-length chunks or blocks""" 79 | # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx 80 | args = [iter(iterable)] * n 81 | return itertools.zip_longest(*args, fillvalue=fillvalue) 82 | ctx_l = [mx.cpu(i) for i in range(8)] 83 | dataset = [mx.np.ones((2,)) * i for i in range(1000)] 84 | dataloader = DataLoader(dataset, 2, num_workers=4, prefetch=10) 85 | 86 | for i, 
data_l in enumerate(grouper(dataloader, len(ctx_l))): 87 | for data, ctx in zip(data_l, ctx_l): 88 | if data is None: 89 | continue 90 | data = data.as_in_ctx(ctx) 91 | mx.npx.waitall() 92 | -------------------------------------------------------------------------------- /tests/test_initializer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from gluonnlp import initializer 3 | import mxnet as mx 4 | from mxnet.gluon import nn 5 | mx.npx.set_np() 6 | 7 | 8 | def test_truncnorm_string_alias_works(): 9 | try: 10 | layer = nn.Dense(in_units=1, units=1, weight_initializer='truncnorm') 11 | layer.initialize() 12 | except RuntimeError: 13 | pytest.fail('Layer couldn\'t be initialized') 14 | 15 | 16 | def test_truncnorm_all_values_inside_boundaries(): 17 | mean = 0 18 | std = 0.01 19 | layer = nn.Dense(in_units=1, units=1000) 20 | layer.initialize(init=initializer.TruncNorm(mean, std)) 21 | assert (layer.weight.data() <= 2 * std).asnumpy().all() 22 | assert (layer.weight.data() >= -2 * std).asnumpy().all() 23 | 24 | 25 | def test_truncnorm_generates_values_with_defined_mean_and_std(): 26 | from scipy import stats 27 | 28 | mean = 10 29 | std = 5 30 | layer = nn.Dense(in_units=1, units=100000) 31 | layer.initialize(init=initializer.TruncNorm(mean, std)) 32 | samples = layer.weight.data().reshape((-1, )).asnumpy() 33 | 34 | p_value = stats.kstest(samples, 'truncnorm', args=(-2, 2, mean, std)).pvalue 35 | assert p_value > 0.0001 36 | -------------------------------------------------------------------------------- /tests/test_loss.py: -------------------------------------------------------------------------------- 1 | import mxnet as mx 2 | import numpy as np 3 | import pytest 4 | from numpy.testing import assert_allclose 5 | import scipy.special as sspecial 6 | from gluonnlp.loss import LabelSmoothCrossEntropyLoss 7 | mx.npx.set_np() 8 | 9 | 10 | @pytest.mark.parametrize('label_shape', [(5, 3), (3,), (2, 3, 2)]) 11 | @pytest.mark.parametrize('alpha', [0.0, 0.1]) 12 | @pytest.mark.parametrize('from_logits', [True, False]) 13 | @pytest.mark.parametrize('hybridize', [True, False]) 14 | def test_label_smoothing(label_shape, alpha, from_logits, hybridize): 15 | def _np_label_smoothing(pred, labels, alpha, from_logits): 16 | flatten_pred = pred.reshape((-1, pred.shape[-1])) 17 | flatten_labels = labels.reshape((-1,)) 18 | smoothed_labels = np.full_like(flatten_pred, 19 | fill_value=alpha / flatten_pred.shape[-1]) 20 | smoothed_labels[np.arange(flatten_pred.shape[0]), flatten_labels]\ 21 | = 1 - alpha + alpha / flatten_pred.shape[-1] 22 | if not from_logits: 23 | flatten_logits = np.log(sspecial.softmax(flatten_pred, axis=-1)) 24 | else: 25 | flatten_logits = flatten_pred 26 | # Calculate cross-entropy 27 | loss = - (smoothed_labels * flatten_logits).sum(axis=-1) 28 | return loss.reshape(labels.shape) 29 | label_num = 5 30 | loss = LabelSmoothCrossEntropyLoss(num_labels=label_num, alpha=alpha, from_logits=from_logits) 31 | if hybridize: 32 | loss.hybridize() 33 | if from_logits: 34 | pred = mx.np.random.uniform(-10, -1, label_shape + (label_num,)) 35 | else: 36 | pred = mx.np.random.normal(0, 1, label_shape + (label_num,)) 37 | labels = mx.np.random.randint(0, label_num, label_shape) 38 | out = loss(pred, labels) 39 | np_out = _np_label_smoothing(pred.asnumpy(), labels.asnumpy(), alpha, from_logits) 40 | assert_allclose(np_out, out.asnumpy(), 1E-4, 1E-4) 41 | 42 | -------------------------------------------------------------------------------- 
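The smoothing rule that `_np_label_smoothing` above verifies is compact enough to state on its own. The following standalone NumPy sketch (illustrative only, not part of the test suite) builds the smoothed targets the same way and checks that each row remains a proper distribution:

```python
import numpy as np

# With K classes and smoothing alpha, every class receives alpha / K probability
# mass, and the gold class keeps 1 - alpha + alpha / K.
K, alpha = 5, 0.1
labels = np.array([2, 0])
smoothed = np.full((len(labels), K), alpha / K)
smoothed[np.arange(len(labels)), labels] = 1 - alpha + alpha / K
print(smoothed.sum(axis=-1))  # -> [1. 1.]; each row still sums to one
```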
/tests/test_models_mt5.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import tempfile 3 | 4 | from gluonnlp.models.mt5 import ( 5 | MT5Model, MT5Inference, mt5_cfg_reg, list_pretrained_mt5, get_pretrained_mt5 6 | ) 7 | 8 | def test_list_pretrained_mt5(): 9 | assert len(list_pretrained_mt5()) > 0 10 | 11 | 12 | @pytest.mark.parametrize('cfg_key', mt5_cfg_reg.list_keys()) 13 | def test_mt5_model_and_inference(cfg_key, ctx): 14 | # since MT5Model and MT5Inference simply inherit from T5Model and T5Inference, 15 | # we just want to make sure the model can be loaded properly, and leave 16 | # the correctness tests to test_models_t5.py 17 | with ctx: 18 | cfg = MT5Model.get_cfg(cfg_key) 19 | if cfg_key != 'google_mt5_small': 20 | cfg.defrost() 21 | cfg.MODEL.vocab_size = 256 22 | cfg.MODEL.d_model = 128 23 | cfg.MODEL.d_ff = 512 24 | cfg.MODEL.num_layers = 2 25 | cfg.MODEL.num_heads = 4 26 | cfg.freeze() 27 | mt5_model = MT5Model.from_cfg(cfg) 28 | mt5_model.initialize() 29 | mt5_model.hybridize() 30 | if cfg_key == 'google_mt5_small': 31 | inference_model = MT5Inference(mt5_model) 32 | inference_model.hybridize() 33 | 34 | 35 | def test_mt5_get_pretrained(ctx): 36 | with tempfile.TemporaryDirectory() as root, ctx: 37 | cfg, tokenizer, backbone_params_path, _ = get_pretrained_mt5('google_mt5_small') 38 | # we exclude the <extra_id> sentinel tokens in the comparison below by avoiding len(tokenizer.vocab) 39 | assert cfg.MODEL.vocab_size >= len(tokenizer._sp_model) 40 | mt5_model = MT5Model.from_cfg(cfg) 41 | mt5_model.load_parameters(backbone_params_path) 42 | mt5_model.hybridize() 43 | mt5_inference_model = MT5Inference(mt5_model) 44 | mt5_inference_model.hybridize() 45 | -------------------------------------------------------------------------------- /tests/test_models_xlmr.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import mxnet as mx 4 | import tempfile 5 | from gluonnlp.models.xlmr import XLMRModel, \ 6 | list_pretrained_xlmr, get_pretrained_xlmr 7 | from gluonnlp.loss import LabelSmoothCrossEntropyLoss 8 | 9 | mx.npx.set_np() 10 | 11 | 12 | def test_list_pretrained_xlmr(): 13 | assert len(list_pretrained_xlmr()) > 0 14 | 15 | 16 | # We choose not to test amp for XLMR because it's the same as RoBERTa. 
17 | @pytest.mark.slow 18 | @pytest.mark.remote_required 19 | @pytest.mark.parametrize('model_name', list_pretrained_xlmr()) 20 | def test_xlmr(model_name, ctx): 21 | # test from pretrained 22 | assert len(list_pretrained_xlmr()) > 0 23 | with ctx: 24 | with tempfile.TemporaryDirectory() as root: 25 | cfg, tokenizer, params_path, mlm_params_path =\ 26 | get_pretrained_xlmr(model_name, load_backbone=True, load_mlm=False, root=root) 27 | assert cfg.MODEL.vocab_size == len(tokenizer.vocab) 28 | # test backbone 29 | xlmr_model = XLMRModel.from_cfg(cfg) 30 | xlmr_model.load_parameters(params_path) 31 | # pass the mlm model 32 | 33 | # test forward 34 | batch_size = 1 35 | seq_length = 4 36 | vocab_size = len(tokenizer.vocab) 37 | input_ids = mx.np.array( 38 | np.random.randint( 39 | 2, 40 | vocab_size, 41 | (batch_size, seq_length) 42 | ), 43 | dtype=np.int32 44 | ) 45 | valid_length = mx.np.array( 46 | np.random.randint( 47 | seq_length // 2, 48 | seq_length, 49 | (batch_size,) 50 | ), 51 | dtype=np.int32 52 | ) 53 | contextual_embeddings, pooled_out = xlmr_model(input_ids, valid_length) 54 | mx.npx.waitall() 55 | # test backward 56 | label_smooth_loss = LabelSmoothCrossEntropyLoss(num_labels=vocab_size) 57 | with mx.autograd.record(): 58 | contextual_embeddings, pooled_out = xlmr_model(input_ids, valid_length) 59 | loss = label_smooth_loss(contextual_embeddings, input_ids) 60 | loss.backward() 61 | mx.npx.waitall() 62 | -------------------------------------------------------------------------------- /tests/test_pytest.py: -------------------------------------------------------------------------------- 1 | import random 2 | import pytest 3 | import numpy as np 4 | import mxnet as mx 5 | 6 | 7 | @pytest.mark.seed(1) 8 | def test_test(): 9 | """Test that fixing a random seed works.""" 10 | py_rnd = random.randint(0, 100) 11 | np_rnd = np.random.randint(0, 100) 12 | mx_rnd = mx.nd.random_uniform(shape=(1, )).asscalar() 13 | 14 | random.seed(1) 15 | mx.random.seed(1) 16 | np.random.seed(1) 17 | 18 | assert py_rnd == random.randint(0, 100) 19 | assert np_rnd == np.random.randint(0, 100) 20 | assert mx_rnd == mx.nd.random_uniform(shape=(1, )).asscalar() 21 | -------------------------------------------------------------------------------- /tests/test_utils_preprocessing.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | from numpy.testing import assert_allclose 4 | from gluonnlp.utils.preprocessing import get_trimmed_lengths, match_tokens_with_char_spans 5 | 6 | 7 | def test_get_trimmed_lengths(): 8 | for lengths, do_merge, max_length, gt_trimmed_lengths in\ 9 | [([10, 5, 4, 8], False, 6, [6, 5, 4, 6]), 10 | ([10, 5, 4, 8], True, 6, [2, 2, 1, 1]), 11 | ([20], False, 30, [20]), 12 | ([20], True, 30, [20]), 13 | ([15, 20], False, 30, [15, 20]), 14 | ([15, 20], True, 30, [15, 15])]: 15 | trimmed_lengths = get_trimmed_lengths(lengths, 16 | max_length=max_length, 17 | do_merge=do_merge) 18 | assert_allclose(trimmed_lengths, np.array(gt_trimmed_lengths)) 19 | 20 | 21 | def test_match_tokens_with_char_spans(): 22 | token_offsets = np.array([(0, 1), (1, 2), (3, 4), (5, 6)]) 23 | spans = np.array([(0, 3), (4, 6)]) 24 | out = match_tokens_with_char_spans(token_offsets, spans) 25 | assert_allclose(out, np.array([[0, 2], 26 | [2, 3]])) 27 | 28 | token_offsets = np.array([(5, 10), (10, 20), (20, 25), (26, 30)]) 29 | spans = np.array([(0, 3), (4, 6), (10, 30), 30 | (22, 23), (15, 25), 31 | (10, 35), (36, 38)]) 32 | out = 
match_tokens_with_char_spans(token_offsets, spans) 33 | assert_allclose(out, np.array([[0, 0], 34 | [0, 0], 35 | [1, 3], 36 | [2, 2], 37 | [1, 2], 38 | [1, 3], 39 | [3, 3]])) 40 | -------------------------------------------------------------------------------- /tests/test_utils_registry.py: -------------------------------------------------------------------------------- 1 | from gluonnlp.utils.registry import Registry 2 | 3 | 4 | def test_registry(): 5 | MODEL_REGISTRY = Registry('MODEL') 6 | @MODEL_REGISTRY.register() 7 | class MyModel: 8 | def __init__(self, a, b): 9 | self.a = a 10 | self.b = b 11 | 12 | @MODEL_REGISTRY.register() 13 | def my_model(): 14 | return 15 | 16 | @MODEL_REGISTRY.register('test_class') 17 | class MyModelWithNickName: 18 | def __init__(self, a, b, c): 19 | self.a = a 20 | self.b = b 21 | self.c = c 22 | 23 | @MODEL_REGISTRY.register('test_function') 24 | def my_model_with_nick_name(): 25 | return 26 | 27 | class MyModel2: 28 | pass 29 | 30 | MODEL_REGISTRY.register(MyModel2) 31 | MODEL_REGISTRY.register('my_model2', MyModel2) 32 | assert MODEL_REGISTRY.list_keys() ==\ 33 | ['MyModel', 'my_model', 'test_class', 'test_function', 'MyModel2', 'my_model2'] 34 | model = MODEL_REGISTRY.create('MyModel', 1, 2) 35 | assert model.a == 1 and model.b == 2 36 | model = MODEL_REGISTRY.create('MyModel', a=2, b=3) 37 | assert model.a == 2 and model.b == 3 38 | model = MODEL_REGISTRY.create_with_json('MyModel', '[4, 5]') 39 | assert model.a == 4 and model.b == 5 40 | model = MODEL_REGISTRY.create_with_json('test_class', 41 | '{"a": 100, "b": 200, "c": 300}') 42 | assert model.a == 100 and model.b == 200 and model.c == 300 43 | assert MODEL_REGISTRY.get('test_class') == MyModelWithNickName 44 | 45 | 46 | -------------------------------------------------------------------------------- /tests/torch/test_layers_torch.py: -------------------------------------------------------------------------------- 1 | import torch as th 2 | import numpy as np 3 | from gluonnlp.torch.layers import SinusoidalPositionalEmbedding 4 | 5 | 6 | def test_sinusoidal_pos_embed(): 7 | embed1 = SinusoidalPositionalEmbedding(128, learnable=False) 8 | embed2 = SinusoidalPositionalEmbedding(128, learnable=True) 9 | assert len([(name, param) for name, param in embed1.named_parameters() 10 | if param.requires_grad]) == 0 11 | assert len([(name, param) for name, param in embed2.named_parameters() 12 | if param.requires_grad]) == 1 13 | inputs = th.randint(0, 128, (8, 4)) 14 | np.testing.assert_allclose(embed1(inputs).detach().cpu().numpy(), 15 | embed2(inputs).detach().cpu().numpy(), 1E-3, 1E-3) 16 | -------------------------------------------------------------------------------- /tools/batch/backbone_benchmark/run_batch_backbone_benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | INSTANCE_TYPE=${1:-g4dn.2x} 5 | LOG_PATH=${2:-submit_backbone_benchmark.log} 6 | SUBMIT_SCRIPT_PATH=$(dirname "$0")/../../../tools/batch/submit-job.py 7 | 8 | python3 ${SUBMIT_SCRIPT_PATH} \ 9 | --region us-east-1 \ 10 | --source-ref fix_benchmark3 \ 11 | --job-type ${INSTANCE_TYPE} \ 12 | --save-path temp \ 13 | --name test_backbone_benchmark_${INSTANCE_TYPE} \ 14 | --work-dir scripts/benchmarks \ 15 | --remote https://github.com/sxjscience/gluon-nlp/ \ 16 | --command "bash run_backbone_benchmark.sh 2>&1 | tee stdout.log" \ 17 | | perl -pe 's/Submitted job \[([0-9|a-z|_].+)\] to the job queue .+/$1/' \ 18 | | sed -e 's/ - / /g' >> ${LOG_PATH} 19 | 
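20 | # Example invocation (hypothetical log name): benchmark on a g4dn.12x instance
21 | # and append the submitted job id to a custom log:
22 | #   bash run_batch_backbone_benchmark.sh g4dn.12x my_benchmark.log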
-------------------------------------------------------------------------------- /tools/batch/batch_states/compile_notebooks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Shell script for submitting AWS Batch jobs to compile notebooks 3 | 4 | event=$1 5 | ref=$2 6 | 7 | FAIL=0 8 | 9 | compile_notebook () { 10 | local MDFILE=$1 11 | DIR=$(dirname $MDFILE) 12 | BASENAME=$(basename $MDFILE) 13 | TARGETNAME=$(dirname $MDFILE)/${BASENAME%.md}.ipynb 14 | LOGNAME=$(dirname $MDFILE)/${BASENAME%.md}.stdout.log 15 | 16 | echo Compiling $BASENAME ... 17 | 18 | python3 docs/md2ipynb.py ${MDFILE} &> $LOGNAME 19 | 20 | EXIT_CODE=$? 21 | 22 | if [ $EXIT_CODE -ne 0 ]; then 23 | echo Compiling $BASENAME Failed, please download Notebook_Logs in build Artifacts for more details. 24 | else 25 | echo Compiling $BASENAME Succeeded 26 | fi 27 | exit $EXIT_CODE 28 | } 29 | 30 | pids=() 31 | 32 | for f in $(find docs/tutorials -type f -name '*.md' -print); do 33 | compile_notebook "$f" & 34 | pids+=($!) 35 | done; 36 | 37 | for pid in "${pids[@]}"; do 38 | wait "$pid" || let "FAIL+=1" 39 | done; 40 | 41 | if [ "$FAIL" == "0" ]; then 42 | echo Building Website 43 | make docs_local 44 | EXIT_CODE=$? 45 | if [ $EXIT_CODE -ne 0 ]; then 46 | echo Building Website Failed. 47 | exit $EXIT_CODE 48 | else 49 | echo Building Website Succeeded. 50 | if [ "$1" == "push" ]; then 51 | echo "Uploading docs to s3://gluon-nlp/$2/" 52 | aws s3 sync --delete ./docs/_build/html/ s3://gluon-nlp/$2/ --quiet --acl public-read 53 | else 54 | echo "Uploading docs to s3://gluon-nlp-staging/PR$1/$2/" 55 | aws s3 sync --delete ./docs/_build/html/ s3://gluon-nlp-staging/PR$1/$2/ --quiet --acl public-read 56 | fi 57 | fi 58 | else 59 | exit 1 60 | fi 61 | -------------------------------------------------------------------------------- /tools/batch/batch_states/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Shell script for installing dependencies and running test on AWS Batch 3 | set -ex 4 | 5 | echo $PWD 6 | SCRIPTPATH="$( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" 7 | REPODIR="$( readlink -f ${SCRIPTPATH}/../../../../gluon-nlp)" 8 | 9 | python3 -m pip install --upgrade --user pytest pytest-cov contextvars 10 | python3 -m pytest --cov=$REPODIR --cov-config=$REPODIR/.coveragerc --cov-report=xml --durations=50 --device="gpu" --runslow $REPODIR/tests/ 11 | -------------------------------------------------------------------------------- /tools/batch/batch_states/test_data_pipeline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Shell script for testing the data preprocessing on AWS Batch 3 | 4 | set -ex 5 | export PYTHONIOENCODING=utf8 6 | echo $PWD 7 | 8 | for MODEL in spm yttm 9 | do 10 | bash ../../../scripts/datasets/machine_translation/wmt2014_ende.sh ${MODEL} 11 | done 12 | for MODEL in spm yttm 13 | do 14 | bash ../../../scripts/datasets/machine_translation/wmt2017_zhen.sh ${MODEL} 15 | done 16 | -------------------------------------------------------------------------------- /tools/batch/hello_world.py: -------------------------------------------------------------------------------- 1 | from gluonnlp.data.vocab import Vocab 2 | import mxnet as mx 3 | 4 | 5 | if __name__ == '__main__': 6 | vocab = Vocab(['Hello', 'World!'], unk_token=None) 7 | print(vocab) 8 | num_gpus = mx.context.num_gpus() 9 | print('Number of GPUS:', num_gpus) 10 | 11 | 
-------------------------------------------------------------------------------- /tools/batch/question_answering/parse_squad_results.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import pandas as pd 4 | import glob 5 | import math 6 | import argparse 7 | from datetime import datetime 8 | 9 | parser = argparse.ArgumentParser(description='Parse SQuAD results generated by ' 10 | '"sync_batch_result.sh" to csv.') 11 | parser.add_argument('--dir', type=str, required=True, 12 | help='The basic directory to analyze the results.') 13 | parser.add_argument('--save_path', type=str, default=None, help='The path to save the results.') 14 | args = parser.parse_args() 15 | 16 | if args.save_path is None: 17 | args.save_path = os.path.basename(os.path.realpath(args.dir)) + '.csv' 18 | 19 | base_dir = args.dir 20 | prefix = 'test_squad2_' 21 | 22 | dat_l = [] 23 | datetime_parser = '%Y-%m-%d %H:%M:%S,%f' 24 | 25 | for folder in sorted(os.listdir(base_dir)): 26 | if folder.startswith(prefix): 27 | model_name = folder[len(prefix):] 28 | log_path_l = glob.glob(os.path.join(base_dir, folder, 'fintune*/finetune*.log')) 29 | param_path_l = sorted(glob.glob(os.path.join(base_dir, folder, 'fintune*/*.params'))) 30 | if len(param_path_l) == 0 or len(log_path_l) == 0: 31 | best_f1_threshold = math.nan 32 | best_exact_threshold = math.nan 33 | best_f1 = math.nan 34 | best_em = math.nan 35 | time_spent_in_hours = math.nan 36 | else: 37 | log_path = log_path_l[0] 38 | result_file = glob.glob(os.path.join(base_dir, folder, 'fintune*/best_results.json'))[0] 39 | with open(result_file, 'r') as in_f: 40 | result_dat = json.load(in_f) 41 | if 'best_f1_thresh' in result_dat: 42 | best_f1_threshold = result_dat['best_f1_thresh'] 43 | best_exact_threshold = result_dat['best_exact_thresh'] 44 | best_f1 = result_dat['best_f1'] 45 | best_em = result_dat['best_exact'] 46 | else: 47 | best_f1_threshold = math.nan 48 | best_exact_threshold = math.nan 49 | best_f1 = result_dat['f1'] 50 | best_em = result_dat['exact'] 51 | with open(log_path, 'r') as in_f: 52 | log_lines = in_f.readlines() 53 | start_time_str = ' '.join(log_lines[0].split()[0:2]) 54 | end_time_str = ' '.join(log_lines[-1].split()[0:2]) 55 | start_time = datetime.strptime(start_time_str, datetime_parser) 56 | end_time = datetime.strptime(end_time_str, datetime_parser) 57 | time_spent = end_time - start_time 58 | time_spent_in_hours = time_spent.total_seconds() / 3600 59 | dat_l.append({'name': model_name, 60 | 'best_f1': best_f1, 61 | 'best_em': best_em, 62 | 'best_f1_thresh': best_f1_threshold, 63 | 'best_em_thresh': best_exact_threshold, 64 | 'time_spent_in_hours': time_spent_in_hours}) 65 | df = pd.DataFrame(dat_l) 66 | print(df) 67 | print('Saving to {}'.format(args.save_path)) 68 | df.to_csv(args.save_path) 69 | -------------------------------------------------------------------------------- /tools/batch/question_answering/run_batch_squad.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | USE_HOROVOD=${1:-0} 6 | VERSION=${2:-2.0} 7 | LOG_PATH=${3:-submit_squad_v2.log} 8 | DTYPE=${4:-float32} 9 | SUBMIT_SCRIPT_PATH=$(dirname "$0")/../../../tools/batch/submit-job.py 10 | 11 | 12 | for MODEL_NAME in albert_base \ 13 | albert_large \ 14 | albert_xlarge \ 15 | albert_xxlarge \ 16 | electra_base \ 17 | electra_large \ 18 | electra_small \ 19 | roberta_large \ 20 | uncased_bert_base \ 21 | uncased_bert_large \ 22 | 
uncased_bert_wwm_large \ 23 | gluon_en_cased_bert_base_v1 \ 24 | mobilebert 25 | do 26 | python3 ${SUBMIT_SCRIPT_PATH} \ 27 | --region us-east-1 \ 28 | --source-ref master \ 29 | --job-type g4dn.12x \ 30 | --save-path temp \ 31 | --name test_squad2_${MODEL_NAME} \ 32 | --work-dir scripts/question_answering \ 33 | --remote https://github.com/dmlc/gluon-nlp/ \ 34 | --command "bash commands/run_squad2_${MODEL_NAME}.sh ${USE_HOROVOD} ${VERSION} ${DTYPE} | tee stdout.log" \ 35 | | perl -pe 's/Submitted job \[([0-9|a-z|_].+)\] to the job queue .+/$1/' \ 36 | | sed -e 's/ - / /g' >> ${LOG_PATH} 37 | done 38 | -------------------------------------------------------------------------------- /tools/batch/run_batch_conversion.sh: -------------------------------------------------------------------------------- 1 | for MODEL_NAME in bert albert electra mobilebert roberta xlmr bart 2 | do 3 | python3 submit-job.py \ 4 | --region us-east-1 \ 5 | --source-ref master \ 6 | --job-type c5n.4x \ 7 | --name convert_${MODEL_NAME} \ 8 | --work-dir scripts/conversion_toolkits \ 9 | --remote https://github.com/dmlc/gluon-nlp/ \ 10 | --command 'bash convert_'${MODEL_NAME}'.sh | tee stdout.log' >> log.info 11 | done 12 | -------------------------------------------------------------------------------- /tools/batch/sync_batch_result.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | LOG_PATH=$1 6 | SAVE_DIR_NAME=${2:-squad_2.0} 7 | 8 | while read -r job_name job_id; do 9 | aws s3 sync s3://gluon-nlp-dev/batch/${job_id}/temp ${SAVE_DIR_NAME}/${job_name} 10 | done < ${LOG_PATH} 11 | -------------------------------------------------------------------------------- /tools/docker/devel_entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /start_jupyter.sh 4 | 5 | exec "$@" 6 | -------------------------------------------------------------------------------- /tools/docker/gluon_nlp_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | date 4 | echo "Args: $@" 5 | env 6 | echo "jobId: $AWS_BATCH_JOB_ID" 7 | echo "jobQueue: $AWS_BATCH_JQ_NAME" 8 | echo "computeEnvironment: $AWS_BATCH_CE_NAME" 9 | 10 | SOURCE_REF=$1 11 | WORK_DIR=$2 12 | COMMAND=$3 13 | SAVED_OUTPUT=$4 14 | SAVE_PATH=$5 15 | REMOTE=$6 16 | DEVICE=${7:-gpu} 17 | 18 | if [ ! -z $REMOTE ]; then 19 | git remote set-url origin $REMOTE 20 | fi; 21 | 22 | git fetch origin $SOURCE_REF:working 23 | git checkout working 24 | 25 | if [ $DEVICE == "cpu" ]; then 26 | python3 -m pip uninstall --quiet mxnet -y 27 | python3 -m pip install -U --quiet --pre "mxnet>=2.0.0b20210121" -f https://dist.mxnet.io/python 28 | else 29 | python3 -m pip uninstall --quiet mxnet-cu102 -y 30 | python3 -m pip install -U --quiet --pre "mxnet-cu102>=2.0.0a" --user 31 | fi 32 | 33 | python3 -m pip install --quiet -e .[extras,dev] 34 | 35 | cd $WORK_DIR 36 | /bin/bash -o pipefail -c "$COMMAND" 37 | COMMAND_EXIT_CODE=$? 
38 | if [[ -f $SAVED_OUTPUT ]]; then 39 | aws s3 cp $SAVED_OUTPUT s3://gluon-nlp-dev/batch/$AWS_BATCH_JOB_ID/$SAVE_PATH --quiet; 40 | elif [[ -d $SAVED_OUTPUT ]]; then 41 | aws s3 cp --recursive $SAVED_OUTPUT s3://gluon-nlp-dev/batch/$AWS_BATCH_JOB_ID/$SAVE_PATH --quiet; 42 | fi; 43 | exit $COMMAND_EXIT_CODE 44 | -------------------------------------------------------------------------------- /tools/docker/install/install_horovod.sh: -------------------------------------------------------------------------------- 1 | set -euo pipefail 2 | 3 | # Install Horovod 4 | HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL HOROVOD_WITHOUT_GLOO=1 \ 5 | HOROVOD_WITH_MPI=1 HOROVOD_WITH_MXNET=1 HOROVOD_WITH_PYTORCH=1 \ 6 | HOROVOD_WITHOUT_TENSORFLOW=1 python3 -m pip install --no-cache-dir horovod==0.20.3 --user 7 | # Debug horovod by default 8 | echo NCCL_DEBUG=INFO >> /etc/nccl.conf 9 | -------------------------------------------------------------------------------- /tools/docker/install/install_jupyter_lab.sh: -------------------------------------------------------------------------------- 1 | set -euo pipefail 2 | 3 | # Install NodeJS + Tensorboard + TensorboardX 4 | 5 | curl -sL https://deb.nodesource.com/setup_14.x | bash - \ 6 | && apt-get install -y nodejs 7 | 8 | apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev 9 | 10 | python3 -m pip install --no-cache --upgrade \ 11 | soundfile==0.10.2 \ 12 | ipywidgets==7.5.1 \ 13 | jupyter_tensorboard==0.2.0 \ 14 | widgetsnbextension==3.5.1 \ 15 | tensorboard==2.1.1 \ 16 | tensorboardX==2.1 --user 17 | jupyter labextension install jupyterlab_tensorboard \ 18 | && jupyter nbextension enable --py widgetsnbextension \ 19 | && jupyter labextension install @jupyter-widgets/jupyterlab-manager 20 | 21 | # Revise default shell to /bin/bash 22 | jupyter notebook --generate-config \ 23 | && echo "c.NotebookApp.terminado_settings = { 'shell_command': ['/bin/bash'] }" >> /root/.jupyter/jupyter_notebook_config.py 24 | -------------------------------------------------------------------------------- /tools/docker/install/install_llvm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 
18 | 19 | set -e 20 | set -u 21 | set -o pipefail 22 | 23 | echo deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-9 main\ 24 | >> /etc/apt/sources.list.d/llvm.list 25 | echo deb-src http://apt.llvm.org/bionic/ llvm-toolchain-bionic-9 main\ 26 | >> /etc/apt/sources.list.d/llvm.list 27 | 28 | 29 | echo deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-10 main\ 30 | >> /etc/apt/sources.list.d/llvm.list 31 | echo deb-src http://apt.llvm.org/bionic/ llvm-toolchain-bionic-10 main\ 32 | >> /etc/apt/sources.list.d/llvm.list 33 | 34 | echo deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-11 main\ 35 | >> /etc/apt/sources.list.d/llvm.list 36 | echo deb-src http://apt.llvm.org/bionic/ llvm-toolchain-bionic-11 main\ 37 | >> /etc/apt/sources.list.d/llvm.list 38 | 39 | echo deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic main\ 40 | >> /etc/apt/sources.list.d/llvm.list 41 | echo deb-src http://apt.llvm.org/bionic/ llvm-toolchain-bionic main\ 42 | >> /etc/apt/sources.list.d/llvm.list 43 | 44 | wget -q -O - http://apt.llvm.org/llvm-snapshot.gpg.key|apt-key add - 45 | apt-get update && apt-get install -y llvm-9 llvm-10 llvm-11 clang-9 clang-10 clang-11 46 | -------------------------------------------------------------------------------- /tools/docker/install/install_openmpi.sh: -------------------------------------------------------------------------------- 1 | set -euo pipefail 2 | 3 | mkdir /tmp/openmpi \ 4 | && cd /tmp/openmpi \ 5 | && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ 6 | && tar zxf openmpi-4.0.1.tar.gz \ 7 | && cd openmpi-4.0.1 \ 8 | && ./configure --enable-orterun-prefix-by-default \ 9 | && make -j $(nproc) all \ 10 | && make install \ 11 | && ldconfig \ 12 | && rm -rf /tmp/openmpi 13 | 14 | # Create a wrapper for OpenMPI to allow running as root by default 15 | mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ 16 | && echo '#!/bin/bash' > /usr/local/bin/mpirun \ 17 | && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ 18 | && chmod a+x /usr/local/bin/mpirun 19 | 20 | echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ 21 | && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf 22 | -------------------------------------------------------------------------------- /tools/docker/install/install_python_packages.sh: -------------------------------------------------------------------------------- 1 | set -euo pipefail 2 | 3 | 4 | python3 -m pip --no-cache-dir install --upgrade \ 5 | pip \ 6 | setuptools \ 7 | wheel 8 | 9 | # python-dateutil==2.8.0 to satisfy botocore associated with latest awscli 10 | python3 -m pip install --no-cache --upgrade \ 11 | numpy==1.19.1 \ 12 | pandas==0.25.1 \ 13 | cython \ 14 | pytest \ 15 | pytest-cov \ 16 | Pillow \ 17 | requests==2.22.0 \ 18 | scikit-learn==0.20.4 \ 19 | scipy==1.2.2 \ 20 | urllib3==1.25.8 \ 21 | python-dateutil==2.8.0 \ 22 | sagemaker-experiments==0.* \ 23 | PyYAML==5.3.1 \ 24 | mpi4py==3.0.2 \ 25 | jupyterlab==2.2.4 \ 26 | contextvars \ 27 | cmake \ 28 | awscli --user 29 | -------------------------------------------------------------------------------- /tools/docker/install/install_tvm_cpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. 
See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | 19 | set -e 20 | set -u 21 | set -o pipefail 22 | 23 | cd ${WORKDIR} 24 | git clone https://github.com/apache/incubator-tvm tvm --recursive 25 | cd ${WORKDIR}/tvm 26 | # checkout a hash-tag 27 | git checkout bf862d4c4355eae4f18d89b3b6b98ed0a2c18e9c 28 | 29 | mkdir -p build 30 | cp cmake/config.cmake build 31 | echo set\(USE_LLVM llvm-config-10\) >> build/config.cmake 32 | echo set\(USE_GRAPH_EXECUTOR ON\) >> build/config.cmake 33 | echo set\(USE_BLAS openblas\) >> build/config.cmake 34 | 35 | cd build 36 | cmake .. -GNinja 37 | ninja 38 | 39 | # install python binding 40 | cd .. 41 | cd python 42 | python3 -m pip install -U -e . --user 43 | -------------------------------------------------------------------------------- /tools/docker/install/install_tvm_gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | 19 | set -e 20 | set -u 21 | set -o pipefail 22 | 23 | cd ${WORKDIR} 24 | git clone https://github.com/apache/incubator-tvm tvm --recursive 25 | cd ${WORKDIR}/tvm 26 | # checkout a hash-tag 27 | git checkout bf862d4c4355eae4f18d89b3b6b98ed0a2c18e9c 28 | 29 | 30 | mkdir -p build 31 | cp cmake/config.cmake build 32 | echo set\(USE_LLVM llvm-config-10\) >> build/config.cmake 33 | echo set\(USE_CUDA ON\) >> build/config.cmake 34 | echo set\(USE_CUDNN ON\) >> build/config.cmake 35 | echo set\(USE_CUBLAS ON\) >> build/config.cmake 36 | echo set\(USE_GRAPH_EXECUTOR ON\) >> build/config.cmake 37 | echo set\(USE_BLAS openblas\) >> build/config.cmake 38 | 39 | cd build 40 | cmake -GNinja -DCUDA_CUBLAS_LIBRARY=/usr/lib/x86_64-linux-gnu/libcublas.so .. 41 | ninja 42 | 43 | # install python binding 44 | cd .. 45 | cd python 46 | python3 -m pip install -U -e . 
--user 47 | -------------------------------------------------------------------------------- /tools/docker/install/install_ubuntu18.04_core.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | set -u 3 | set -o pipefail 4 | 5 | export DEBIAN_FRONTEND=noninteractive 6 | 7 | rm -rf /var/lib/apt/lists/* \ 8 | && apt-get clean \ 9 | && apt-get update \ 10 | && apt-get install -y --no-install-recommends \ 11 | software-properties-common \ 12 | build-essential \ 13 | ca-certificates \ 14 | curl \ 15 | emacs \ 16 | subversion \ 17 | locales \ 18 | cmake \ 19 | git \ 20 | libopencv-dev \ 21 | htop \ 22 | vim \ 23 | wget \ 24 | unzip \ 25 | less \ 26 | libopenblas-dev \ 27 | gpg-agent \ 28 | ninja-build \ 29 | openssh-client \ 30 | openssh-server \ 31 | python3-dev \ 32 | python3-pip \ 33 | python3-setuptools \ 34 | libxft-dev \ 35 | zlib1g-dev \ 36 | && apt-get clean \ 37 | && rm -rf /var/lib/apt/lists/* 38 | 39 | ln -s $(which python3) /usr/local/bin/python 40 | -------------------------------------------------------------------------------- /tools/docker/start_jupyter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Run Jupyter in foreground if $JUPYTER_FG is set 4 | if [[ "${JUPYTER_FG}" == "true" ]]; then 5 | jupyter-lab --allow-root --ip=0.0.0.0 --no-browser --NotebookApp.token='' 6 | exit 0 7 | else 8 | nohup jupyter-lab --allow-root --ip=0.0.0.0 --no-browser --NotebookApp.token='' > /dev/null 2>&1 & 9 | 10 | echo "Notebook server successfully started, a JupyterLab instance has been executed!" 11 | echo "Make local folders visible by volume mounting to /workspace/notebook" 12 | echo "To access visit http://localhost:8888 on your host machine." 13 | echo 'Ensure the following arguments to "docker run" are added to expose the server ports to your host machine: 14 | -p 8888:8888 -p 8787:8787 -p 8786:8786' 15 | fi 16 | -------------------------------------------------------------------------------- /tools/docker/ubuntu18.04-cpu.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 as base 2 | 3 | LABEL maintainer="GluonNLP Team" 4 | COPY install /install 5 | 6 | ENV PYTHONDONTWRITEBYTECODE=1 \ 7 | PYTHONUNBUFFERED=1 \ 8 | LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib" \ 9 | PYTHONIOENCODING=UTF-8 \ 10 | LANG=C.UTF-8 \ 11 | LC_ALL=C.UTF-8 12 | 13 | ENV WORKDIR=/workspace 14 | ENV SHELL=/bin/bash 15 | 16 | RUN mkdir -p ${WORKDIR} 17 | 18 | 19 | RUN bash /install/install_ubuntu18.04_core.sh 20 | 21 | # Install Open MPI 22 | RUN bash /install/install_openmpi.sh 23 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 24 | ENV PATH=/usr/local/openmpi/bin/:/usr/local/bin:/root/.local/bin:$PATH 25 | 26 | # Install LLVM 27 | RUN bash /install/install_llvm.sh 28 | 29 | # Install Python Packages 30 | RUN bash /install/install_python_packages.sh 31 | 32 | # Install TVM 33 | RUN bash /install/install_tvm_cpu.sh 34 | 35 | # Install MXNet 36 | RUN python3 -m pip install -U --pre "mxnet>=2.0.0a" --user 37 | 38 | # Install PyTorch 39 | RUN python3 -m pip install "torch==1.7.1+cpu" torchvision -f https://download.pytorch.org/whl/torch_stable.html 40 | 41 | # Install Jupyter Lab 42 | RUN bash /install/install_jupyter_lab.sh 43 | 44 | RUN mkdir -p ${WORKDIR}/data 45 | RUN mkdir -p /.init 46 | RUN cd ${WORKDIR} \ 47 | && git clone https://github.com/dmlc/gluon-nlp \ 48 | && cd gluon-nlp \ 49 | && git checkout master \ 50 | && python3 
-m pip install -U -e ."[extras,dev]" 51 | 52 | 53 | # Stage-CI 54 | FROM base as ci 55 | WORKDIR ${WORKDIR}/gluon-nlp 56 | ADD gluon_nlp_job.sh . 57 | RUN chmod +x gluon_nlp_job.sh 58 | 59 | 60 | # Stage-Devel 61 | FROM base as devel 62 | COPY start_jupyter.sh /start_jupyter.sh 63 | COPY devel_entrypoint.sh /devel_entrypoint.sh 64 | RUN chmod +x /devel_entrypoint.sh 65 | 66 | EXPOSE 8888 67 | EXPOSE 8787 68 | EXPOSE 8786 69 | 70 | WORKDIR ${WORKDIR} 71 | 72 | # Add Tini 73 | ARG TINI_VERSION=v0.19.0 74 | ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini 75 | RUN chmod +x /tini 76 | ENTRYPOINT [ "/tini", "--", "/devel_entrypoint.sh" ] 77 | CMD ["/bin/bash"] 78 | -------------------------------------------------------------------------------- /tools/docker/ubuntu18.04-gpu.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 as base 2 | 3 | LABEL maintainer="GluonNLP Team" 4 | COPY install /install 5 | 6 | ENV PYTHONDONTWRITEBYTECODE=1 \ 7 | PYTHONUNBUFFERED=1 \ 8 | LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib" \ 9 | PYTHONIOENCODING=UTF-8 \ 10 | LANG=C.UTF-8 \ 11 | LC_ALL=C.UTF-8 12 | 13 | ENV WORKDIR=/workspace 14 | ENV SHELL=/bin/bash 15 | 16 | RUN mkdir -p ${WORKDIR} 17 | 18 | RUN bash /install/install_ubuntu18.04_core.sh 19 | 20 | # Install Open MPI 21 | RUN bash /install/install_openmpi.sh 22 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 23 | ENV PATH=/usr/local/openmpi/bin/:/usr/local/bin:/root/.local/bin:/usr/bin:$PATH 24 | 25 | # Install LLVM 26 | RUN bash /install/install_llvm.sh 27 | 28 | # Install Python Packages 29 | RUN bash /install/install_python_packages.sh 30 | 31 | # Install TVM 32 | RUN bash /install/install_tvm_gpu.sh 33 | 34 | # Install MXNet 35 | RUN python3 -m pip install -U --pre "mxnet-cu102>=2.0.0a" --user 36 | 37 | # Install PyTorch 38 | RUN python3 -m pip install "torch==1.8.1+cu102" torchvision -f https://download.pytorch.org/whl/torch_stable.html 39 | 40 | # Install Horovod 41 | RUN bash /install/install_horovod.sh 42 | 43 | # Install Jupyter Lab 44 | RUN bash /install/install_jupyter_lab.sh 45 | 46 | RUN mkdir -p ${WORKDIR}/data 47 | RUN mkdir -p /.init 48 | RUN cd ${WORKDIR} \ 49 | && git clone https://github.com/dmlc/gluon-nlp \ 50 | && cd gluon-nlp \ 51 | && git checkout master \ 52 | && python3 -m pip install -U -e ."[extras,dev]" 53 | 54 | # Stage-CI 55 | FROM base as ci 56 | WORKDIR ${WORKDIR}/gluon-nlp 57 | ADD gluon_nlp_job.sh . 58 | RUN chmod +x gluon_nlp_job.sh 59 | 60 | # Stage-Devel 61 | FROM base as devel 62 | COPY start_jupyter.sh /start_jupyter.sh 63 | COPY devel_entrypoint.sh /devel_entrypoint.sh 64 | RUN chmod +x /devel_entrypoint.sh 65 | 66 | EXPOSE 8888 67 | EXPOSE 8787 68 | EXPOSE 8786 69 | 70 | WORKDIR ${WORKDIR} 71 | 72 | # Add Tini 73 | ARG TINI_VERSION=v0.19.0 74 | ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini 75 | RUN chmod +x /tini 76 | ENTRYPOINT [ "/tini", "--", "/devel_entrypoint.sh" ] 77 | CMD ["/bin/bash"] 78 | --------------------------------------------------------------------------------