├── .coveragerc
├── .flake8
├── .github
    ├── ISSUE_TEMPLATE
    │   ├── bug_report.md
    │   └── feature_request.md
    ├── PULL_REQUEST_TEMPLATE.md
    └── workflows
    │   ├── buildwebsite.yml
    │   ├── data-pipeline.yml
    │   ├── nightly-test.yml
    │   ├── unittests-gpu.yml
    │   └── unittests.yml
├── .gitignore
├── .pylintrc
├── .pytype.cfg
├── CODEOWNERS
├── CODE_OF_CONDUCT.md
├── LICENSE
├── Makefile
├── README.md
├── conftest.py
├── docs
    ├── .gitignore
    ├── .nojekyll
    ├── 404.rst
    ├── Doxyfile
    ├── Makefile
    ├── README.txt
    ├── _static
    │   ├── 404.jpg
    │   ├── custom.css
    │   ├── gluon-logo.svg
    │   ├── gluon.ico
    │   ├── google_analytics.js
    │   ├── hidebib.js
    │   └── install-options.js
    ├── api
    │   ├── attention.rst
    │   ├── data.rst
    │   ├── embedding.rst
    │   ├── index.rst
    │   ├── layers.rst
    │   ├── models.rst
    │   ├── operators.rst
    │   ├── sequence_sampler.rst
    │   └── utils.rst
    ├── conf.py
    ├── examples.rst
    ├── genindex.rst
    ├── index.rst
    ├── install.rst
    ├── install
    │   ├── install-include.rst
    │   └── install-more.rst
    ├── md2ipynb.py
    ├── model_zoo
    ├── tutorials
    │   ├── deep_learning_compiler
    │   │   ├── index.rst
    │   │   └── tvm_basic.md
    │   ├── index.rst
    │   ├── pretrained_models
    │   │   ├── index.rst
    │   │   ├── pretrained_t5_mlm.md
    │   │   └── pretraining_objectives.png
    │   ├── question_answering
    │   │   ├── index.rst
    │   │   ├── offsets_match.png
    │   │   ├── qa1.png
    │   │   ├── qa2.png
    │   │   ├── question_answering.md
    │   │   └── squad_utils.py
    │   ├── text_prediction
    │   │   ├── bert_illustration.png
    │   │   ├── index.rst
    │   │   ├── merge_input.png
    │   │   ├── text_prediction_part1.md
    │   │   └── text_prediction_part2.md
    │   ├── tokenization
    │   │   ├── index.rst
    │   │   ├── tokenization_part1.md
    │   │   ├── tokenization_part2.md
    │   │   └── tokenization_part3.md
    │   └── word_embedding
    │   │   ├── index.rst
    │   │   └── word_embedding.md
    └── website
    │   ├── configuration.rst
    │   ├── contribute.rst
    │   ├── git.rst
    │   ├── index.rst
    │   └── release.rst
├── pytest.ini
├── scripts
    ├── __init__.py
    ├── benchmarks
    │   ├── README.md
    │   ├── benchmark_gluonnlp.py
    │   ├── benchmark_gluonnlp.sh
    │   ├── benchmark_gluonnlp_fp16.sh
    │   ├── benchmark_gluonnlp_tvm.sh
    │   ├── benchmark_hf.py
    │   ├── benchmark_utils.py
    │   ├── requirements.txt
    │   └── run_backbone_benchmark.sh
    ├── classification
    │   ├── README.md
    │   ├── classification.py
    │   ├── classification_utils.py
    │   └── train_classification.py
    ├── conversion_toolkits
    │   ├── README.md
    │   ├── bert_base_config.json
    │   ├── bert_large_config.json
    │   ├── convert_albert.sh
    │   ├── convert_bart.sh
    │   ├── convert_bert.sh
    │   ├── convert_bert_torch.sh
    │   ├── convert_electra.py
    │   ├── convert_electra.sh
    │   ├── convert_fairseq_bart.py
    │   ├── convert_fairseq_roberta.py
    │   ├── convert_fairseq_xlmr.py
    │   ├── convert_gpt2.py
    │   ├── convert_gpt2.sh
    │   ├── convert_mobilebert.py
    │   ├── convert_mobilebert.sh
    │   ├── convert_mt5.py
    │   ├── convert_mt5.sh
    │   ├── convert_roberta.sh
    │   ├── convert_t5.py
    │   ├── convert_t5.sh
    │   ├── convert_tf_hub_model.py
    │   └── convert_xlmr.sh
    ├── datasets
    │   ├── README.md
    │   ├── __init__.py
    │   ├── __main__.py
    │   ├── general_nlp_benchmark
    │   │   ├── README.md
    │   │   ├── __init__.py
    │   │   ├── prepare_glue.py
    │   │   └── prepare_text_classification.py
    │   ├── language_modeling
    │   │   ├── README.md
    │   │   ├── __init__.py
    │   │   └── prepare_lm.py
    │   ├── machine_translation
    │   │   ├── README.md
    │   │   ├── __init__.py
    │   │   ├── prepare_wmt.py
    │   │   ├── wmt2014_ende.sh
    │   │   └── wmt2017_zhen.sh
    │   ├── music_generation
    │   │   ├── README.md
    │   │   ├── __init__.py
    │   │   └── prepare_music_midi.py
    │   ├── pretrain_corpus
    │   │   ├── README.md
    │   │   ├── __init__.py
    │   │   ├── prepare_bookcorpus.py
    │   │   ├── prepare_gutenberg.py
    │   │   ├── prepare_openwebtext.py
    │   │   └── prepare_wikipedia.py
    │   ├── question_answering
    │   │   ├── README.md
    │   │   ├── __init__.py
    │   │   ├── prepare_hotpotqa.py
    │   │   ├── prepare_naturalquestions.py
    │   │   ├── prepare_searchqa.py
    │   │   ├── prepare_squad.py
    │   │   └── prepare_triviaqa.py
    │   ├── update_download_stats.py
    │   └── url_checksums
    │   │   ├── bookcorpus.txt
    │   │   ├── glue.txt
    │   │   ├── gutenberg.txt
    │   │   ├── hotpotqa.txt
    │   │   ├── language_model.txt
    │   │   ├── mirror
    │   │       └── wmt.json
    │   │   ├── music_midi.txt
    │   │   ├── naturalquestions.txt
    │   │   ├── searchqa.txt
    │   │   ├── squad.txt
    │   │   ├── superglue.txt
    │   │   ├── text_classification.txt
    │   │   ├── triviaqa.txt
    │   │   ├── wikipedia.txt
    │   │   └── wmt.txt
    ├── generation
    │   ├── README.md
    │   ├── calculate_metrics.py
    │   ├── generate_unconditional_gpt2_samples.py
    │   └── interactive_conditional_gpt2_samples.py
    ├── index.rst
    ├── machine_translation
    │   ├── README.md
    │   ├── __init__.py
    │   ├── evaluate_epochs_wmt2014_ende.sh
    │   ├── evaluate_transformer.py
    │   ├── train_transformer.py
    │   ├── transformer_enc12_dec1.yml
    │   └── wmt2014_back_translation.sh
    ├── pretraining
    │   ├── README.md
    │   ├── bert
    │   │   ├── README.md
    │   │   ├── covert_bookcorpus_format.py
    │   │   ├── create_pretraining_data.py
    │   │   ├── pretraining_utils.py
    │   │   └── run_pretraining.py
    │   ├── convert_electra_pretrain_backbone.py
    │   ├── data_preprocessing.py
    │   ├── pretraining_utils.py
    │   ├── run_electra.py
    │   └── torch
    │   │   └── bert
    │   │       ├── README.md
    │   │       ├── prepare_quickthought.py
    │   │       └── run_pretraining.py
    ├── processing
    │   ├── README.md
    │   ├── __init__.py
    │   ├── __main__.py
    │   ├── apply_subword.py
    │   ├── clean_tok_corpus.py
    │   ├── learn_subword.py
    │   └── segment_sentences.py
    └── question_answering
    │   ├── README.md
    │   ├── albert_custom.yaml
    │   ├── commands
    │       ├── README.md
    │       ├── generate_commands.py
    │       ├── run_squad.template
    │       ├── run_squad2_albert_base.sh
    │       ├── run_squad2_albert_large.sh
    │       ├── run_squad2_albert_xlarge.sh
    │       ├── run_squad2_albert_xxlarge.sh
    │       ├── run_squad2_electra_base.sh
    │       ├── run_squad2_electra_large.sh
    │       ├── run_squad2_electra_small.sh
    │       ├── run_squad2_gluon_en_cased_bert_base_v1.sh
    │       ├── run_squad2_mobilebert.sh
    │       ├── run_squad2_roberta_large.sh
    │       ├── run_squad2_uncased_bert_base.sh
    │       ├── run_squad2_uncased_bert_large.sh
    │       └── run_squad2_uncased_bert_wwm_large.sh
    │   ├── custom_strategy.py
    │   ├── eval_utils.py
    │   ├── models.py
    │   ├── run_squad.py
    │   ├── run_squad_albert.py
    │   └── squad_utils.py
├── setup.py
├── src
    └── gluonnlp
    │   ├── __init__.py
    │   ├── attention_cell.py
    │   ├── base.py
    │   ├── cli
    │       ├── __init__.py
    │       ├── average_checkpoint.py
    │       ├── data
    │       └── process
    │   ├── data
    │       ├── __init__.py
    │       ├── batchify.py
    │       ├── filtering.py
    │       ├── loading.py
    │       ├── sampler.py
    │       ├── tokenizers
    │       │   ├── __init__.py
    │       │   ├── base.py
    │       │   ├── huggingface.py
    │       │   ├── jieba.py
    │       │   ├── moses.py
    │       │   ├── sentencepiece.py
    │       │   ├── spacy.py
    │       │   ├── subword_nmt.py
    │       │   ├── whitespace.py
    │       │   └── yttm.py
    │       └── vocab.py
    │   ├── embedding
    │       ├── __init__.py
    │       ├── _constants.py
    │       └── embed_loader.py
    │   ├── initializer.py
    │   ├── layers.py
    │   ├── loss.py
    │   ├── lr_scheduler.py
    │   ├── models
    │       ├── __init__.py
    │       ├── albert.py
    │       ├── bart.py
    │       ├── base.py
    │       ├── bert.py
    │       ├── electra.py
    │       ├── gpt2.py
    │       ├── mobilebert.py
    │       ├── model_zoo_checksums
    │       │   ├── albert.txt
    │       │   ├── bart.txt
    │       │   ├── bert.txt
    │       │   ├── electra.txt
    │       │   ├── gpt2.txt
    │       │   ├── mobilebert.txt
    │       │   ├── mt5.txt
    │       │   ├── roberta.txt
    │       │   ├── t5.txt
    │       │   └── xlmr.txt
    │       ├── mt5.py
    │       ├── roberta.py
    │       ├── t5.py
    │       ├── transformer.py
    │       ├── transformer_xl.py
    │       └── xlmr.py
    │   ├── op.py
    │   ├── sequence_sampler.py
    │   ├── third_party
    │       ├── __init__.py
    │       ├── sentencepiece_model_pb2.py
    │       └── sentencepiece_pb2.py
    │   ├── torch
    │       ├── __init__.py
    │       ├── attention_cell.py
    │       ├── clib
    │       │   ├── amp_C_frontend.cpp
    │       │   ├── compat.h
    │       │   ├── multi_tensor_apply.cuh
    │       │   ├── multi_tensor_l2norm_kernel.cu
    │       │   ├── multi_tensor_lans.cu
    │       │   └── type_shim.h
    │       ├── data
    │       │   ├── __init__.py
    │       │   └── batchify.py
    │       ├── layers.py
    │       ├── models
    │       │   ├── __init__.py
    │       │   ├── bert.py
    │       │   └── transformer.py
    │       ├── optimizers
    │       │   ├── __init__.py
    │       │   ├── fused_lans.py
    │       │   └── schedules.py
    │       └── utils.py
    │   └── utils
    │       ├── __init__.py
    │       ├── config.py
    │       ├── lazy_imports.py
    │       ├── misc.py
    │       ├── parameter.py
    │       ├── preprocessing.py
    │       ├── registry.py
    │       ├── shm.py
    │       ├── testing.py
    │       └── tvm_utils.py
├── tests
    ├── README.md
    ├── data_cli
    │   ├── test_glue.py
    │   └── test_wikipedia.py
    ├── process_cli
    │   ├── data
    │   │   ├── wmt19-test-de-en.de
    │   │   ├── wmt19-test-de-en.en
    │   │   └── wmt19-test-zh-en.zh.jieba
    │   ├── test_average_checkpoint.py
    │   └── test_learn_apply_subword.py
    ├── test_attention_cell.py
    ├── test_data_batchify.py
    ├── test_data_filtering.py
    ├── test_data_loading.py
    ├── test_data_sampler.py
    ├── test_data_tokenizers.py
    ├── test_data_vocab.py
    ├── test_embedding.py
    ├── test_gluon_block.py
    ├── test_initializer.py
    ├── test_layers.py
    ├── test_loss.py
    ├── test_models.py
    ├── test_models_albert.py
    ├── test_models_bart.py
    ├── test_models_bert.py
    ├── test_models_electra.py
    ├── test_models_gpt2.py
    ├── test_models_mobilebert.py
    ├── test_models_mt5.py
    ├── test_models_roberta.py
    ├── test_models_t5.py
    ├── test_models_transformer.py
    ├── test_models_transformer_xl.py
    ├── test_models_xlmr.py
    ├── test_op.py
    ├── test_pytest.py
    ├── test_sequence_sampler.py
    ├── test_utils_misc.py
    ├── test_utils_parameter.py
    ├── test_utils_preprocessing.py
    ├── test_utils_registry.py
    └── torch
    │   ├── test_attention_cell_torch.py
    │   ├── test_bert_torch.py
    │   └── test_layers_torch.py
└── tools
    ├── batch
        ├── README.md
        ├── backbone_benchmark
        │   └── run_batch_backbone_benchmark.sh
        ├── batch_states
        │   ├── compile_notebooks.sh
        │   ├── test.sh
        │   └── test_data_pipeline.sh
        ├── hello_world.py
        ├── question_answering
        │   ├── parse_squad_results.py
        │   └── run_batch_squad.sh
        ├── run_batch_conversion.sh
        ├── submit-job.py
        ├── sync_batch_result.sh
        └── wait-job.py
    ├── diagnose.py
    └── docker
        ├── README.md
        ├── devel_entrypoint.sh
        ├── gluon_nlp_job.sh
        ├── install
            ├── install_horovod.sh
            ├── install_jupyter_lab.sh
            ├── install_llvm.sh
            ├── install_openmpi.sh
            ├── install_python_packages.sh
            ├── install_tvm_cpu.sh
            ├── install_tvm_gpu.sh
            └── install_ubuntu18.04_core.sh
        ├── start_jupyter.sh
        ├── ubuntu18.04-cpu.Dockerfile
        └── ubuntu18.04-gpu.Dockerfile


/.coveragerc:
--------------------------------------------------------------------------------
 1 | # .coveragerc to control coverage.py
 2 | [run]
 3 | omit =
 4 |     tests/*
 5 |     scripts/*
 6 | concurrency =
 7 |     multiprocessing
 8 |     thread
 9 | 
10 | [report]
11 | ignore_errors = True
12 | 
13 | [html]
14 | directory = coverage_html_report
15 | 


--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 100
3 | max-complexity = 18
4 | exclude = tests,__init__.py
5 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a report to help us improve
 4 | title: ''
 5 | labels: 'bug'
 6 | assignees: ''
 7 | 
 8 | ---
 9 | ## Description
10 | (A clear and concise description of what the bug is.)
11 | 
12 | ### Error Message
13 | (Paste the complete error message, including stack trace.)
14 | 
15 | ## To Reproduce
16 | (If you developed your own code, please provide a short script that reproduces the error. For existing examples, please provide link.)
17 | 
18 | ### Steps to reproduce
19 | (Paste the commands you ran that produced the error.)
20 | 
21 | 1.
22 | 2.
23 | 
24 | ## What have you tried to solve it?
25 | 
26 | 1.
27 | 2.
28 | 
29 | ## Environment
30 | 
31 | We recommend using our script for collecting the diagnositc information. Run the following command and paste the outputs below:
32 | ```
33 | curl --retry 10 -s https://raw.githubusercontent.com/dmlc/gluon-nlp/master/tools/diagnose.py | python
34 | 
35 | # paste outputs here
36 | ```
37 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature request
 3 | about: Suggest an idea for this project
 4 | title: ''
 5 | labels: 'enhancement'
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | ## Description
11 | (A clear and concise description of what the feature is.)
12 | - If the proposal is about a new model, provide description of what the model is.
13 | - If the proposal is about an API, provide mock examples if possible.
14 | 
15 | ## References
16 | - list reference and related literature
17 | - list known implementations
18 | 


--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
 1 | ## Description ##
 2 | (Brief description on what this PR is about)
 3 | 
 4 | ## Checklist ##
 5 | ### Essentials ###
 6 | - [ ] PR's title starts with a category (e.g. [BUGFIX], [MODEL], [TUTORIAL], [FEATURE], [DOC], etc)
 7 | - [ ] Changes are complete (i.e. I finished coding on this PR)
 8 | - [ ] All changes have test coverage
 9 | - [ ] Code is well-documented
10 | 
11 | ### Changes ###
12 | - [ ] Feature1, tests, (and when applicable, API doc)
13 | - [ ] Feature2, tests, (and when applicable, API doc)
14 | 
15 | ## Comments ##
16 | - If this change is a backward incompatible change, why must this change be made.
17 | - Interesting edge cases to note here
18 | 
19 | cc @dmlc/gluon-nlp-team
20 | 


--------------------------------------------------------------------------------
/.github/workflows/data-pipeline.yml:
--------------------------------------------------------------------------------
 1 | name: data pipeline end-to-end
 2 | 
 3 | on:
 4 |   schedule:
 5 |     - cron: '00 18 * * *'  # At UTC 18:00 AM, everyday, use https://crontab.guru/
 6 | 
 7 | defaults:
 8 |   run:
 9 |     shell: bash
10 | 
11 | jobs:
12 |   unittest:
13 |     runs-on: ubuntu-latest
14 |     strategy:
15 |       fail-fast: false
16 |       matrix:
17 |         python-version: [ '3.7' ]
18 |     steps:
19 |       - name: Checkout repository
20 |         uses: actions/checkout@v2
21 | 
22 |       - name: Install Other Dependencies
23 |         run: |
24 |           python -m pip install --upgrade pip
25 |           python -m pip install boto3
26 | 
27 |       - name: Configure AWS Credentials
28 |         uses: aws-actions/configure-aws-credentials@v1
29 |         with:
30 |           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
31 |           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
32 |           aws-region: us-east-1
33 | 
34 |       - name: Test Data Pipeline on AWS Batch
35 |         run: |
36 |           python ./tools/batch/submit-job.py --region us-east-1 \
37 |                                              --job-type c5n.4x \
38 |                                              --source-ref ${{ github.ref }} \
39 |                                              --work-dir tools/batch/batch_states \
40 |                                              --remote https://github.com/${{ github.repository }} \
41 |                                              --command "bash test_data_pipeline.sh" --wait
42 | 


--------------------------------------------------------------------------------
/.github/workflows/nightly-test.yml:
--------------------------------------------------------------------------------
 1 | name: nightly test
 2 | 
 3 | on:
 4 |   schedule:
 5 |     - cron: '30 23 * * *'  # At UTC 23:30, everyday, use https://crontab.guru/
 6 | 
 7 | defaults:
 8 |   run:
 9 |     shell: bash
10 | 
11 | jobs:
12 |   unittest:
13 |     runs-on: ubuntu-latest
14 |     strategy:
15 |       fail-fast: false
16 |       matrix:
17 |         python-version: [ '3.7' ]
18 |     steps:
19 |       - name: Checkout repository
20 |         uses: actions/checkout@v2
21 | 
22 |       - name: Install Other Dependencies
23 |         run: |
24 |           python -m pip install --upgrade pip
25 |           python -m pip install boto3
26 | 
27 |       - name: Configure AWS Credentials
28 |         uses: aws-actions/configure-aws-credentials@v1
29 |         with:
30 |           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
31 |           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
32 |           aws-region: us-east-1
33 | 
34 |       - name: Test GluonNLP on MXNet nightly realse
35 |         run: |
36 |           echo "Start submitting job"
37 |           python ./tools/batch/submit-job.py --region us-east-1 \
38 |                                              --job-type g4dn.4x \
39 |                                              --name GluonNLP-Nightly-Test \
40 |                                              --source-ref ${{ github.ref }} \
41 |                                              --work-dir . \
42 |                                              --remote https://github.com/${{ github.repository }} \
43 |                                              --command "python3 -m pip install pytest-forked \
44 |                                                         && python3 -m pip install -U --pre 'mxnet-cu102>=2.0.0b20210418' -f https://dist.mxnet.io/python/cu102 \
45 |                                                         && python3 -m pytest --forked --durations=50 --device="cpu" --device="gpu" --runslow ./tests/" \
46 |                                              --wait | tee batch_job.log
47 | 
48 |       - name: Upload Cloud Watch Log
49 |         if: ${{ failure() || success() }}
50 |         uses: actions/upload-artifact@v2
51 |         with:
52 |           name: Test_Log
53 |           path: ./batch_job.log
54 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | env/
 12 | build/
 13 | develop-eggs/
 14 | dist/
 15 | downloads/
 16 | eggs/
 17 | .eggs/
 18 | lib/
 19 | lib64/
 20 | parts/
 21 | sdist/
 22 | var/
 23 | wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | 
 49 | # Translations
 50 | *.mo
 51 | *.pot
 52 | 
 53 | # Django stuff:
 54 | *.log
 55 | local_settings.py
 56 | 
 57 | # Flask stuff:
 58 | instance/
 59 | .webassets-cache
 60 | 
 61 | # Scrapy stuff:
 62 | .scrapy
 63 | 
 64 | # Sphinx documentation
 65 | docs/_build/
 66 | 
 67 | # PyBuilder
 68 | target/
 69 | 
 70 | # Jupyter Notebook
 71 | .ipynb_checkpoints
 72 | 
 73 | # pyenv
 74 | .python-version
 75 | 
 76 | # pycharm
 77 | .idea
 78 | 
 79 | # celery beat schedule file
 80 | celerybeat-schedule
 81 | 
 82 | # SageMath parsed files
 83 | *.sage.py
 84 | 
 85 | # dotenv
 86 | .env
 87 | 
 88 | # virtualenv
 89 | .venv
 90 | venv/
 91 | ENV/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 
106 | # test data
107 | tests/data/
108 | tests/data/embedding/
109 | tests/data/my_embed/
110 | tests/externaldata/
111 | .pytest_cache
112 | 
113 | # docs
114 | docs/html
115 | 
116 | # release
117 | scripts/*.zip
118 | docs/tutorials/*.zip
119 | docs/tutorials/*/*.ipynb
120 | 
121 | conda
122 | 
123 | # temp files
124 | *.swp
125 | 
126 | # vscode
127 | .vscode
128 | 
129 | # Mac
130 | .DS_Store
131 | 
132 | # license checker
133 | ci/rat/apache-rat/
134 | ci/rat/apache-rat.jar
135 | 


--------------------------------------------------------------------------------
/.pytype.cfg:
--------------------------------------------------------------------------------
1 | # NOTE: All relative paths are relative to the location of this file.
2 | [pytype]
3 | # Space-separated list of files or directories to process.
4 | inputs =
5 |     src/gluonnlp
6 | 
7 | # Python version (major.minor) of the target code.
8 | python_version = 3.6
9 | 


--------------------------------------------------------------------------------
/CODEOWNERS:
--------------------------------------------------------------------------------
 1 | # Watchers and contributors to DMLC GluonNLP repo directories/packages/files
 2 | # Please see documentation of use of CODEOWNERS file at
 3 | # https://help.github.com/articles/about-codeowners/ and
 4 | # https://github.com/blog/2392-introducing-code-owners
 5 | #
 6 | # Anybody can add themselves or a team as additional watcher or contributor
 7 | # to get notified about changes in a specific package.
 8 | # See https://help.github.com/articles/about-teams how to setup teams.
 9 | 
10 | 
11 | # Global owners
12 | *			@dmlc/gluon-nlp-committers @dmlc/gluon-nlp-reviewers
13 | 


--------------------------------------------------------------------------------
/docs/.gitignore:
--------------------------------------------------------------------------------
1 | doxygen
2 | _build
3 | gen_modules
4 | tutorials
5 | doctrees
6 | 


--------------------------------------------------------------------------------
/docs/.nojekyll:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/docs/.nojekyll


--------------------------------------------------------------------------------
/docs/404.rst:
--------------------------------------------------------------------------------
 1 | :orphan:
 2 | 
 3 | Page Not Found
 4 | --------------
 5 | 
 6 | You stumbled upon a page that's making us scratch our brains right now. Before any of us panics,
 7 | we'd like you to know that you are being redirected to a better known and cozy place, in just a few seconds.
 8 | 
 9 | .. image:: _static/404.jpg
10 |    :alt: Page Not Found
11 |    :width: 60%
12 |    :align: center
13 |    :target: ./index.html
14 | 
15 | .. raw:: html
16 | 
17 |    <script type="text/javascript">
18 |    window.onload = function() {
19 |       var seconds = 5;
20 |       var url = "http://gluon-nlp.mxnet.io";
21 |       setTimeout(function() {
22 |          window.location.href = url;
23 |       }, seconds*1000);
24 |    };
25 |    </script>
26 | 


--------------------------------------------------------------------------------
/docs/README.txt:
--------------------------------------------------------------------------------
1 | The documentation of GluonNLP is generated with recommonmark and sphinx.
2 | 
3 | - pip install sphinx>=1.5.5 sphinx-gallery sphinx_rtd_theme matplotlib Image recommonmark
4 | 
5 | For more details, refer to [website/configuration.rst](website/configuration.rst)
6 | 


--------------------------------------------------------------------------------
/docs/_static/404.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/docs/_static/404.jpg


--------------------------------------------------------------------------------
/docs/_static/custom.css:
--------------------------------------------------------------------------------
 1 | .Logos {
 2 |     display: inline;
 3 |     margin: 1em;
 4 |     max-width: 120px;
 5 | }
 6 | 
 7 | .install {
 8 |     max-width: 800px;
 9 | }
10 | .install .title {
11 |     display: inline-block;
12 |     min-width: 100px;
13 |     text-transform: uppercase;
14 |     font-size: 90%;
15 |     color: #555;
16 | }
17 | 
18 | .install .option {
19 |     margin: 5px;
20 | }
21 | 
22 | @media (max-width: 650px) {
23 |     .install .option, .install .title {
24 |         width: 90%;
25 |     }
26 | 
27 |     .install .title {
28 |         margin-top: 1em;
29 |     }
30 | }
31 | 


--------------------------------------------------------------------------------
/docs/_static/gluon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/docs/_static/gluon.ico


--------------------------------------------------------------------------------
/docs/_static/google_analytics.js:
--------------------------------------------------------------------------------
1 |   (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
2 |   (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
3 |   m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
4 |   })(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
5 | 
6 |   ga('create', 'UA-96378503-8', 'auto');
7 |   ga('send', 'pageview');
8 | 


--------------------------------------------------------------------------------
/docs/_static/hidebib.js:
--------------------------------------------------------------------------------
 1 | // adapted from: http://www.robots.ox.ac.uk/~vedaldi/assets/hidebib.js
 2 | function hideallbibs()
 3 | {
 4 |     var el = document.getElementsByTagName("div") ;
 5 |     for (var i = 0 ; i < el.length ; ++i) {
 6 |         if (el[i].className == "paper") {
 7 |             var bib = el[i].getElementsByTagName("pre") ;
 8 |             if (bib.length > 0) {
 9 |                 bib [0] .style.display = 'none' ;
10 |             }
11 |         }
12 |     }
13 | }
14 | 
15 | function togglebib(paperid)
16 | {
17 |     var paper = document.getElementById(paperid) ;
18 |     var bib = paper.getElementsByTagName('pre') ;
19 |     if (bib.length > 0) {
20 |         if (bib [0] .style.display == 'none') {
21 |             bib [0] .style.display = 'block' ;
22 |         } else {
23 |             bib [0] .style.display = 'none' ;
24 |         }
25 |     }
26 | }
27 | 
28 | function toggleblock(blockId)
29 | {
30 |    var block = document.getElementById(blockId);
31 |    if (block.style.display == 'none') {
32 |     block.style.display = 'block' ;
33 |    } else {
34 |     block.style.display = 'none' ;
35 |    }
36 | }
37 | 
38 | function hideblock(blockId)
39 | {
40 |    var block = document.getElementById(blockId);
41 |    block.style.display = 'none' ;
42 | }
43 | 


--------------------------------------------------------------------------------
/docs/_static/install-options.js:
--------------------------------------------------------------------------------
 1 | $(document).ready(function () {
 2 | 
 3 |     function label(lbl) {
 4 |         return $.trim(lbl.replace(/[ .]/g, '-').replace('+-', '').toLowerCase());
 5 |     }
 6 | 
 7 |     // a hack: macos doesn't support cuda, so disable all cuda options when it
 8 |     // is selected.
 9 |     function disableCuda() {
10 |         $('.install .option').each(function(){
11 |             if (label($(this).text()).indexOf("cuda") != -1) {
12 |                 $(this).addClass('disabled');
13 |             }
14 |         });
15 |     }
16 |     function enableCuda() {
17 |         $('.install .option').each(function(){
18 |             if (label($(this).text()).indexOf("cuda") != -1) {
19 |                 $(this).removeClass('disabled');
20 |             }
21 |         });
22 |     }
23 | 
24 |     // find the user os, and set the according option to active
25 |     function setActiveOSButton() {
26 |         var os = "linux"
27 |         var agent = window.navigator.userAgent.toLowerCase();
28 |         if (agent.indexOf("win") != -1) {
29 |             os = "windows"
30 |         } else if (agent.indexOf("mac") != -1) {
31 |             os = "macos"
32 |         }
33 |         if (os == "macos") {
34 |             disableCuda();
35 |         }
36 |         $('.install .option').each(function(){
37 |             if (label($(this).text()).indexOf(os) != -1) {
38 |                 $(this).addClass('active');
39 |             }
40 |         });
41 |     }
42 | 
43 |     setActiveOSButton();
44 | 
45 |     // apply theme
46 |     function setTheme() {
47 |         $('.opt-group .option').each(function(){
48 |             $(this).addClass('mdl-button mdl-js-button mdl-js-ripple-effect mdl-button--raised ');
49 |             $(this).attr('id', label($(this).text()));
50 |         });
51 |         $('.opt-group .active').each(function(){
52 |             $(this).addClass('mdl-button--colored');
53 |         });
54 |     }
55 |     setTheme();
56 | 
57 | 
58 |     // show the command according to the active options
59 |     function showCommand() {
60 |         $('.opt-group .option').each(function(){
61 |             $('.'+label($(this).text())).hide();
62 |             // console.log('disable '+label($(this).text()));
63 |         });
64 |         $('.opt-group .active').each(function(){
65 |             $('.'+label($(this).text())).show();
66 |             // console.log('enable '+label($(this).text()));
67 |         });
68 |     }
69 |     showCommand();
70 | 
71 |     function setOptions() {
72 |         var el = $(this);
73 |         el.siblings().removeClass('active');
74 |         el.siblings().removeClass('mdl-button--colored');
75 |         el.addClass('active');
76 |         el.addClass('mdl-button--colored');
77 |         // console.log('enable'+el.text())
78 |         // console.log('disable'+el.siblings().text())
79 |         console.log($('.install #macos').hasClass('active') )
80 |         if ($('.install #macos').hasClass('active') == true) {
81 |             disableCuda();
82 |         } else {
83 |             enableCuda();
84 |         }
85 |         showCommand();
86 |     }
87 | 
88 |     $('.opt-group').on('click', '.option', setOptions);
89 | 
90 | });
91 | 


--------------------------------------------------------------------------------
/docs/api/attention.rst:
--------------------------------------------------------------------------------
 1 | gluonnlp.attention_cell
 2 | =======================
 3 | 
 4 | GluonNLP Toolkit provides ways to implement the attention mechanism that is prevailing in NLP models.
 5 | 
 6 | .. currentmodule:: gluonnlp.attention_cell
 7 | 
 8 | Attention Mechanism
 9 | -------------------
10 | 
11 | .. automodule:: gluonnlp.attention_cell
12 |     :members:
13 |     :imported-members:
14 |     :special-members: __contains__, __getitem__, __setitem__
15 | 
16 | 


--------------------------------------------------------------------------------
/docs/api/data.rst:
--------------------------------------------------------------------------------
 1 | gluonnlp.data
 2 | =============
 3 | 
 4 | GluonNLP Toolkit provides tools for building efficient data pipelines for NLP tasks.
 5 | 
 6 | .. currentmodule:: gluonnlp.data
 7 | 
 8 | Tokenizers
 9 | ----------
10 | .. automodule:: gluonnlp.data.tokenizers
11 |     :members:
12 |     :imported-members:
13 |     :special-members: __contains__, __getitem__, __setitem__
14 | 
15 | Vocabulary
16 | ----------
17 | .. automodule:: gluonnlp.data.vocab
18 |     :members:
19 |     :imported-members:
20 |     :special-members: __contains__, __getitem__, __setitem__
21 | 
22 | Batchify Function
23 | -----------------
24 | .. automodule:: gluonnlp.data.batchify
25 |     :members:
26 | 
27 | Data Sampler
28 | ------------
29 | .. automodule:: gluonnlp.data.sampler
30 |     :members:
31 |     :imported-members:
32 | 
33 | Text Filtering
34 | --------------
35 | .. automodule:: gluonnlp.data.filtering
36 |     :members:
37 |     :imported-members:
38 | 
39 | Data Loading
40 | ------------
41 | .. automodule:: gluonnlp.data.loading
42 |     :members:
43 |     :imported-members:
44 | 


--------------------------------------------------------------------------------
/docs/api/embedding.rst:
--------------------------------------------------------------------------------
 1 | gluonnlp.embedding
 2 | ==================
 3 | 
 4 | GluonNLP Toolkit provides tools for working with embeddings.
 5 | 
 6 | .. currentmodule:: gluonnlp.embedding
 7 | 
 8 | This page describes the ``gluonnlp`` APIs for text embedding, such as loading
 9 | pre-trained embedding vectors for text tokens and storing them in the
10 | ``numpy.ndarray`` format.
11 | 
12 | 
13 | Pre-trained Embeddings
14 | ----------------------
15 | 
16 | .. currentmodule:: gluonnlp.embedding
17 | .. autosummary::
18 |     :nosignatures:
19 | 
20 |     list_sources
21 |     load_embeddings
22 |     get_fasttext_model
23 | 
24 | 
25 | API Reference
26 | -------------
27 | 
28 | .. automodule:: gluonnlp.embedding
29 |     :members:
30 |     :imported-members:
31 |     :special-members: __contains__, __getitem__, __setitem__
32 | 
33 | 
34 | 


--------------------------------------------------------------------------------
/docs/api/index.rst:
--------------------------------------------------------------------------------
 1 | API Documentation
 2 | =================
 3 | 
 4 | .. toctree::
 5 |    :maxdepth: 3
 6 | 
 7 |    data
 8 |    embedding
 9 |    models
10 |    attention
11 |    layers
12 |    operators
13 |    sequence_sampler
14 |    utils
15 | 


--------------------------------------------------------------------------------
/docs/api/layers.rst:
--------------------------------------------------------------------------------
 1 | gluonnlp.layers
 2 | ===============
 3 | 
 4 | GluonNLP Toolkit provides some common layers that can help you build NLP models.
 5 | 
 6 | .. currentmodule:: gluonnlp.layers
 7 | 
 8 | Layers
 9 | ------
10 | 
11 | .. automodule:: gluonnlp.layers
12 |     :members:
13 |     :imported-members:
14 |     :special-members: __contains__, __getitem__, __setitem__
15 | 
16 | 


--------------------------------------------------------------------------------
/docs/api/models.rst:
--------------------------------------------------------------------------------
 1 | gluonnlp.models
 2 | ===============
 3 | 
 4 | GluonNLP Toolkit supplies models for common NLP tasks with pre-trained weights. By default,
 5 | all requested pre-trained weights are downloaded from public repo and stored in `~/.gluonnlp/models/`.
 6 | 
 7 | .. currentmodule:: gluonnlp.models
 8 | .. autosummary::
 9 | 
10 | Models
11 | ------
12 | .. automodule:: gluonnlp.models
13 |     :members:
14 |     :no-inherited-members:
15 |     :imported-members:
16 |     :special-members: __contains__, __getitem__, __setitem__
17 | 


--------------------------------------------------------------------------------
/docs/api/operators.rst:
--------------------------------------------------------------------------------
 1 | gluonnlp.op
 2 | ===============
 3 | 
 4 | GluonNLP Toolkit provides some functions that can help you build NLP architectures and training pipelines.
 5 | 
 6 | .. currentmodule:: gluonnlp.op
 7 | 
 8 | Layers
 9 | ------
10 | 
11 | .. automodule:: gluonnlp.op
12 |     :members:
13 |     :imported-members:
14 |     :special-members: __contains__, __getitem__, __setitem__
15 | 
16 | 


--------------------------------------------------------------------------------
/docs/api/sequence_sampler.rst:
--------------------------------------------------------------------------------
 1 | gluonnlp.sequence_sampler
 2 | =========================
 3 | 
 4 | GluonNLP Toolkit provides ways to sample from a sequence generator.
 5 | 
 6 | .. currentmodule:: gluonnlp.sequence_sampler
 7 | 
 8 | Sequence Sampler
 9 | ----------------
10 | 
11 | .. automodule:: gluonnlp.sequence_sampler
12 |     :members:
13 |     :imported-members:
14 |     :special-members: __contains__, __getitem__, __setitem__
15 | 
16 | 


--------------------------------------------------------------------------------
/docs/api/utils.rst:
--------------------------------------------------------------------------------
 1 | gluonnlp.utils
 2 | ==============
 3 | 
 4 | GluonNLP Toolkit provides tools for easily setting up task specific loss.
 5 | 
 6 | .. currentmodule:: gluonnlp.utils
 7 | 
 8 | API Reference
 9 | -------------
10 | 
11 | .. automodule:: gluonnlp.utils
12 |     :members:
13 |     :imported-members:
14 |     :special-members: __contains__, __getitem__, __setitem__
15 | 


--------------------------------------------------------------------------------
/docs/examples.rst:
--------------------------------------------------------------------------------
 1 | Examples
 2 | --------
 3 | 
 4 | .. container:: cards
 5 | 
 6 |    .. card::
 7 |       :title: Benchmarking the Performance of NLP Backbones
 8 |       :link: model_zoo/benchmarks/index.html
 9 | 
10 |       Benchmarking the performance of NLP models.
11 |       We released the benchmarking script that compares different NLP packages.
12 | 
13 |    .. card::
14 |       :title: Conversion Scripts
15 |       :link: model_zoo/conversion_toolkits/index.html
16 | 
17 |       Converting NLP models from other frameworks to GluonNLP.
18 | 
19 |    .. card::
20 |       :title: Datasets
21 |       :link: model_zoo/datasets/index.html
22 | 
23 |       Example about the datasets supported by `nlp_data`
24 | 
25 |    .. card::
26 |       :title: Generation
27 |       :link: model_zoo/generation/index.html
28 | 
29 |       Example about how to generate from a pretrained GPT-2 model with GluonNLP.
30 |       We provided the generation script and tried to compare different sampling methods.
31 | 
32 |    .. card::
33 |       :title: Machine Translation
34 |       :link: model_zoo/machine_translation/index.html
35 | 
36 |       Train machine translation model with GluonNLP.
37 | 
38 |    .. card::
39 |       :title: Data Preprocessing Toolkit in GluonNLP
40 |       :link: model_zoo/processing/index.html
41 | 
42 |       Example about the data processing toolkit (`nlp_process`) offered in GluonNLP.
43 | 
44 |    .. card::
45 |       :title: Pretraining Model
46 |       :link: model_zoo/pretraining/index.html
47 | 
48 |       Examples about pretraining your own backbones.
49 | 
50 |    .. card::
51 |       :title: Question Answering Examples
52 |       :link: model_zoo/question_answering/index.html
53 | 
54 |       Run SQuAD 1.1 and 2.0 finetuning with GluonNLP. You will know how to run the models with
55 |       mixed-precision training (AMP) and Horovod.
56 | 
57 | 


--------------------------------------------------------------------------------
/docs/genindex.rst:
--------------------------------------------------------------------------------
1 | Index
2 | =====
3 | 


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
 1 | GluonNLP: NLP made easy
 2 | =======================
 3 | 
 4 | Get Started: A Quick Example
 5 | ----------------------------
 6 | 
 7 | Here is a quick example that downloads and creates a word embedding model and then
 8 | computes the cosine similarity between two words.
 9 | 
10 | (You can click the play button below to run this example.)
11 | 
12 | .. container:: demo
13 |    :name: frontpage-demo
14 | 
15 |    `Word Embedding <https://repl.it/@szha/gluon-nlp>`_
16 | 
17 | .. raw:: html
18 | 
19 |    <script type="text/javascript">
20 |    window.onload = function() {
21 |      var demo = document.createElement("IFRAME");
22 |      demo.src = "https://repl.it/@szha/gluon-nlp?lite=true";
23 |      demo.height = "400px";
24 |      demo.width = "100%";
25 |      demo.scrolling = "no";
26 |      demo.frameborder = "no";
27 |      demo.allowtransparency = true;
28 |      demo.allowfullscreen = true;
29 |      demo.seamless = true;
30 |      demo.sandbox = "allow-forms allow-pointer-lock allow-same-origin allow-scripts allow-modals";
31 |      demo_div = document.getElementById("frontpage-demo");
32 |      while (demo_div.firstChild) {
33 |        demo_div.removeChild(demo_div.firstChild);
34 |      }
35 |      demo_div.appendChild(demo);
36 |    }; // load demo last
37 |    </script>
38 | 
39 | 
40 | .. include:: examples.rst
41 | 
42 | And more in :doc:`tutorials <tutorials/index>`.
43 | 
44 | 
45 | .. include:: install.rst
46 | 
47 | 
48 | About GluonNLP
49 | --------------
50 | 
51 | .. hint::
52 | 
53 |    You can find out the doc for our master development branch `here <http://nlp.gluon.ai/master/index.html>`_.
54 | 
55 | GluonNLP provides implementations of the state-of-the-art (SOTA) deep learning
56 | models in NLP, and build blocks for text data pipelines and models.
57 | It is designed for engineers, researchers, and students to fast prototype
58 | research ideas and products based on these models. This toolkit offers five main features:
59 | 
60 | 1. Carefully designed APIs that greatly reduce the implementation complexity.
61 | 2. Pre-trained models for common NLP tasks.
62 | 3. Tutorials to help get started on new NLP tasks.
63 | 4. Community support.
64 | 
65 | This toolkit assumes that users have basic knowledge about deep learning and
66 | NLP. Otherwise, please refer to an introductory course such as
67 | `Dive into Deep Learning <https://www.d2l.ai/>`_ or
68 | `Stanford CS224n <http://web.stanford.edu/class/cs224n/>`_.
69 | If you are not familiar with Gluon, check out the `Gluon documentation
70 | <http://mxnet.apache.org/versions/master/tutorials/index.html#python-tutorials>`__.
71 | You may find the 60-min Gluon crash course linked from there especially helpful.
72 | 
73 | 
74 | .. toctree::
75 |    :hidden:
76 |    :maxdepth: 2
77 | 
78 |    tutorials/index
79 |    model_zoo/index
80 |    api/index
81 |    website/index
82 |    genindex
83 | 


--------------------------------------------------------------------------------
/docs/install.rst:
--------------------------------------------------------------------------------
 1 | Installation
 2 | ------------
 3 | 
 4 | .. Ignore prerequisites to make the index page concise, which will be shown at
 5 |    the install page
 6 | 
 7 | .. raw:: html
 8 | 
 9 |    <style>.admonition-prerequisite {display: none;}</style>
10 | 
11 | .. include:: install/install-include.rst
12 | 
13 | .. raw:: html
14 | 
15 |    <style>.disabled { display: none; }</style>
16 |    <script type="text/javascript" src='_static/install-options.js'></script>
17 | 
18 | Check :doc:`install/install-more` for more installation instructions and options.
19 | 


--------------------------------------------------------------------------------
/docs/install/install-more.rst:
--------------------------------------------------------------------------------
 1 | :orphan:
 2 | 
 3 | Installation
 4 | ------------
 5 | 
 6 | .. include:: install-include.rst
 7 | 
 8 | .. raw:: html
 9 | 
10 |    <style>.disabled { display: none; }</style>
11 |    <script type="text/javascript" src='../_static/install-options.js'></script>
12 | 
13 | 
14 | Next steps
15 | ----------
16 | 
17 | - Checkout Apache MXNet `Get Started <https://mxnet.apache.org/get_started>`_ for more options such as ARM devices and docker images.
18 | - `Verify your MXNet installation <https://mxnet.apache.org/get_started/validate_mxnet>`_
19 | - `Configure MXNet environment variables <https://mxnet.apache.org/api/faq/env_var>`_
20 | - For new users: MXNet `Crash Course <https://mxnet.apache.org/api/python/docs/tutorials/getting-started/crash-course/index.html>`_ and `other tutorials <https://mxnet.apache.org/api/python/docs/tutorials/index.html>`_.
21 | - For experienced users: `Packages & Modules <https://mxnet.apache.org/api/python/docs/tutorials/index.html#packages-modules>`_ and `Performance tips <https://mxnet.apache.org/api/python/docs/tutorials/index.html#performance>`_.
22 | - For advanced users: Apache MXNet `API <https://mxnet.apache.org/api>`_ and `GluonNLP API <../api/index.html>`_.
23 | 
24 | ..
25 |    TOOD: write a new directive `no-local-toc` for it
26 | 
27 | .. raw:: html
28 | 
29 |    <style>.localtoc { display: none; }</style>
30 | 


--------------------------------------------------------------------------------
/docs/md2ipynb.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import os
 3 | import time
 4 | 
 5 | import nbformat
 6 | import notedown
 7 | 
 8 | parser = argparse.ArgumentParser(description='Convert md file to ipynb files.')
 9 | parser.add_argument('input', help='input.md', type=str)
10 | parser.add_argument('-d', '--disable_compute',
11 |                     help='Disable computing python scripts', action="store_true")
12 | args = parser.parse_args()
13 | 
14 | # timeout for each notebook, in sec
15 | timeout = 90 * 60
16 | 
17 | # the files will be ignored for execution
18 | ignore_execution = []
19 | 
20 | # Change working directory to directory of input file
21 | input_dir, input_fn = os.path.split(args.input)
22 | if input_dir:
23 |     os.chdir(input_dir)
24 | 
25 | output_fn = '.'.join(input_fn.split('.')[:-1] + ['ipynb'])
26 | 
27 | reader = notedown.MarkdownReader()
28 | 
29 | # read
30 | with open(input_fn, encoding='utf-8', mode='r') as f:
31 |     notebook = reader.read(f)
32 | 
33 | if not any([i in input_fn for i in ignore_execution]):
34 |     tic = time.time()
35 |     if not args.disable_compute:
36 |         notedown.run(notebook, timeout)
37 |     print('=== Finished evaluation in %f sec' % (time.time() - tic))
38 | 
39 | # write
40 | # need to add language info to for syntax highlight
41 | notebook['metadata'].update({'language_info': {'name': 'ipython', 'version': 3}})
42 | 
43 | notebook_json = nbformat.writes(notebook)
44 | 
45 | with open(output_fn, encoding='utf-8', mode='w') as f:
46 |     f.write(notebook_json)
47 | 


--------------------------------------------------------------------------------
/docs/model_zoo:
--------------------------------------------------------------------------------
1 | ../scripts


--------------------------------------------------------------------------------
/docs/tutorials/deep_learning_compiler/index.rst:
--------------------------------------------------------------------------------
 1 | Compile NLP Models
 2 | ==================
 3 | 
 4 | .. container:: cards
 5 | 
 6 |    .. card::
 7 |       :title: Compile and accelerate NLP models via TVM
 8 |       :link: tvm_basic.html
 9 | 
10 |       Basic example of how to use TVM to compile backbone models in GluonNLP.
11 | 
12 | .. toctree::
13 |     :hidden:
14 |     :maxdepth: 2
15 | 
16 |     tvm_basic.ipynb
17 | 


--------------------------------------------------------------------------------
/docs/tutorials/index.rst:
--------------------------------------------------------------------------------
  1 | Tutorials
  2 | =========
  3 | 
  4 | Interested in getting started in a new NLP area? Here are some tutorials to help get started.
  5 | 
  6 | 
  7 | Embedding
  8 | -----------------------
  9 | 
 10 | .. container:: cards
 11 | 
 12 |    .. card::
 13 |       :title: Using Pre-trained Word Embeddings
 14 |       :link: word_embedding/word_embedding.html
 15 | 
 16 |       Basics on how to use word embedding with vocab in GluonNLP and apply it on word similarity and
 17 |       analogy problems.
 18 | 
 19 | 
 20 | Text Prediction
 21 | -----------------------
 22 | 
 23 | .. container:: cards
 24 | 
 25 |    .. card::
 26 |       :title: Text Prediction Part1
 27 |       :link: text_prediction/text_prediction_part1.html
 28 | 
 29 |       Load pretrained NLP backbones.
 30 | 
 31 |    .. card::
 32 |       :title: Text Prediction Part2
 33 |       :link: text_prediction/text_prediction_part2.html
 34 | 
 35 |       An example that finetunes MobileBERT for sentiment analysis and sentence similarity.
 36 | 
 37 | 
 38 | Question Answering
 39 | -----------------------
 40 | 
 41 | .. container:: cards
 42 | 
 43 |    .. card::
 44 |       :title: Question Answering with GluonNLP
 45 |       :link: question_answering/question_answering.html
 46 | 
 47 |       Learn how to build a model for Question Answering (QA) based on the backbone provided in GluonNLP.
 48 | 
 49 | 
 50 | Tokenization
 51 | -----------------------
 52 | 
 53 | .. container:: cards
 54 | 
 55 |    .. card::
 56 |       :title: Tokenization Part1
 57 |       :link: tokenization/tokenization_part1.html
 58 | 
 59 |       The basic usage tokenizers in GluonNLP.
 60 | 
 61 |    .. card::
 62 |       :title: Tokenization Part2
 63 |       :link: tokenization/tokenization_part2.html
 64 | 
 65 |       Try out different subword learning algorithms.
 66 | 
 67 | 
 68 | Using Pretrained Models
 69 | -----------------------
 70 | 
 71 | .. container:: cards
 72 | 
 73 |    .. card::
 74 |       :title: T5 for Masked Language Modeling
 75 |       :link: pretrained_models/pretrained_t5_mlm.html
 76 | 
 77 |       An example of using pretrained models in GluonNLP. 
 78 | 
 79 | 
 80 | Compiling NLP Models
 81 | --------------------
 82 | 
 83 | .. container:: cards
 84 | 
 85 |    .. card::
 86 |       :title: Compile and accelerate NLP models via TVM
 87 |       :link: deep_learning_compiler/tvm_basic.html
 88 | 
 89 |       Basic example of how to use TVM to compile backbone models in GluonNLP.
 90 | 
 91 | 
 92 | .. toctree::
 93 |    :hidden:
 94 |    :maxdepth: 2
 95 | 
 96 |    word_embedding/index
 97 |    text_prediction/index
 98 |    question_answering/index
 99 |    tokenization/index
100 |    pretrained_models/index
101 |    deep_learning_compiler/index
102 | 


--------------------------------------------------------------------------------
/docs/tutorials/pretrained_models/index.rst:
--------------------------------------------------------------------------------
 1 | Using Pretrained Models
 2 | =======================
 3 | 
 4 | .. container:: cards
 5 | 
 6 |    .. card::
 7 |       :title: T5 for Masked Language Modeling
 8 |       :link: pretrained_t5_mlm.html
 9 | 
10 |       Use a pretrained T5 for MLM with noise spans. 
11 | 
12 | .. toctree::
13 |     :hidden:
14 |     :maxdepth: 2
15 | 
16 |     pretrained_t5_mlm.ipynb
17 | 


--------------------------------------------------------------------------------
/docs/tutorials/pretrained_models/pretraining_objectives.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/docs/tutorials/pretrained_models/pretraining_objectives.png


--------------------------------------------------------------------------------
/docs/tutorials/question_answering/index.rst:
--------------------------------------------------------------------------------
 1 | Question Answering
 2 | =======================
 3 | 
 4 | .. container:: cards
 5 | 
 6 |    .. card::
 7 |       :title: Question Answering with GluonNLP
 8 |       :link: question_answering.html
 9 | 
10 |       Learn how to build a model for Question Answering (QA) based on the backbone provided in GluonNLP.
11 | 
12 | .. toctree::
13 |    :hidden:
14 |    :maxdepth: 2
15 | 
16 |    question_answering.ipynb
17 | 


--------------------------------------------------------------------------------
/docs/tutorials/question_answering/offsets_match.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/docs/tutorials/question_answering/offsets_match.png


--------------------------------------------------------------------------------
/docs/tutorials/question_answering/qa1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/docs/tutorials/question_answering/qa1.png


--------------------------------------------------------------------------------
/docs/tutorials/question_answering/qa2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/docs/tutorials/question_answering/qa2.png


--------------------------------------------------------------------------------
/docs/tutorials/text_prediction/bert_illustration.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/docs/tutorials/text_prediction/bert_illustration.png


--------------------------------------------------------------------------------
/docs/tutorials/text_prediction/index.rst:
--------------------------------------------------------------------------------
 1 | Text Prediction
 2 | =======================
 3 | 
 4 | .. container:: cards
 5 | 
 6 |    .. card::
 7 |       :title: Text Prediction Part1
 8 |       :link: text_prediction_part1.html
 9 | 
10 |       Load pretrained NLP backbones.
11 | 
12 |    .. card::
13 |       :title: Text Prediction Part2
14 |       :link: text_prediction_part2.html
15 | 
16 |       An example that finetunes MobileBERT for sentiment analysis and sentence similarity.
17 | 
18 | 
19 | .. toctree::
20 |    :hidden:
21 |    :maxdepth: 2
22 | 
23 |    text_prediction_part1.ipynb
24 |    text_prediction_part2.ipynb
25 | 


--------------------------------------------------------------------------------
/docs/tutorials/text_prediction/merge_input.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/docs/tutorials/text_prediction/merge_input.png


--------------------------------------------------------------------------------
/docs/tutorials/tokenization/index.rst:
--------------------------------------------------------------------------------
 1 | Tokenization
 2 | =======================
 3 | 
 4 | .. container:: cards
 5 | 
 6 |    .. card::
 7 |       :title: Tokenization Part1
 8 |       :link: tokenization_part1.html
 9 | 
10 |       The basic usage tokenizers in GluonNLP.
11 | 
12 | 
13 |    .. card::
14 |       :title: Tokenization Part2
15 |       :link: tokenization_part2.html
16 | 
17 |       Try out different subword learning algorithms.
18 | 
19 | 
20 |    .. card::
21 |       :title: Tokenization Part3
22 |       :link: tokenization_part3.html
23 | 
24 |       Tutorial that downloads wikipedia data and learn subword.
25 | 
26 | 
27 | .. toctree::
28 |    :hidden:
29 |    :maxdepth: 2
30 | 
31 |    tokenization_part1.ipynb
32 |    tokenization_part2.ipynb
33 |    tokenization_part3.ipynb
34 | 


--------------------------------------------------------------------------------
/docs/tutorials/tokenization/tokenization_part3.md:
--------------------------------------------------------------------------------
 1 | # Part3: Download Data from Wikipedia and Learn Subword
 2 | 
 3 | In this tutorial, we will download the Wikipedia classical Chinese dataset with `nlp_data` and learn a customized sentencepiece vocabulary.
 4 | 
 5 | ## Download Data
 6 | 
 7 | ```{.shell .input}
 8 | !nlp_data prepare_wikipedia --mode download+format --lang zh-classical --date latest --quiet -o wiki_zh_classical
 9 | ```
10 | 
11 | To save time, we will use the first 10000 sentences for training the subword model.
12 | 
13 | 
14 | ```{.shell .input}
15 | !head -10000 wiki_zh_classical/prepared_wikipedia/wikipedia-prepared-0000.txt > train_corpus.txt
16 | ```
17 | 
18 | ```{.shell .input}
19 | !nlp_process learn_subword --model spm --corpus train_corpus.txt --vocab-size 10000 \
20 |                            --disable-bos --disable-eos \
21 |                            --custom-special-tokens "cls_token=[CLS]" "sep_token=[SEP]" "mask_token=[MASK]"
22 | ```
23 | 
24 | The model are saved in "spm" folder.
25 | 
26 | ```{.shell .input}
27 | !ls spm
28 | ```
29 | 
30 | ## Build the Tokenizer with the Saved Model
31 | 
32 | 
33 | ```{.python .input}
34 | import gluonnlp
35 | import json
36 | from gluonnlp.data.tokenizers import SentencepieceTokenizer
37 | tokenizer = SentencepieceTokenizer(model_path='spm/spm.model', vocab="spm/spm.vocab")
38 | print(tokenizer)
39 | print()
40 | print('The first 10 tokens in the vocabulary:')
41 | print('--------------------------------------')
42 | print(tokenizer.vocab.all_tokens[:10])
43 | ```
44 | 
45 | You can use the tokenizer direclty.
46 | 
47 | 
48 | ```{.python .input}
49 | tokenizer.encode('賈夫人仙逝揚州城 ·')
50 | ```
51 | 
52 | 
53 | ```{.python .input}
54 | tokenizer.encode_with_offsets('賈夫人仙逝揚州城 ·')
55 | ```
56 | 
57 | ## Explore More Options
58 | 
59 | To explore more options, you may check the README.
60 | 
61 | 
62 | ```{.shell .input}
63 | !nlp_process learn_subword --help
64 | ```
65 | 


--------------------------------------------------------------------------------
/docs/tutorials/word_embedding/index.rst:
--------------------------------------------------------------------------------
 1 | Representation Learning
 2 | =======================
 3 | 
 4 | .. container:: cards
 5 | 
 6 |    .. card::
 7 |       :title: Using Pre-trained Word Embeddings
 8 |       :link: word_embedding.html
 9 | 
10 |       Basics on how to use word embedding with vocab in GluonNLP and apply it on word similarity and
11 |       analogy problems.
12 | 
13 | 
14 | .. toctree::
15 |    :hidden:
16 |    :maxdepth: 2
17 | 
18 |    word_embedding.ipynb
19 | 


--------------------------------------------------------------------------------
/docs/website/index.rst:
--------------------------------------------------------------------------------
 1 | Community
 2 | =========
 3 | 
 4 | .. card::
 5 |    :title: Community
 6 |    :is_head: true
 7 |    :link: https://www.apache.org/foundation/policies/conduct
 8 | 
 9 |    Welcome to GluonNLP community. We strive to foster a collaborative and welcoming community. We
10 |    expect all members to follow the `code of conduct <https://github.com/dmlc/gluon-nlp/tree/master/CODE_OF_CONDUCT.md>`__.
11 | 
12 | 
13 | .. container:: cards
14 | 
15 |    .. card::
16 |       :title: Github Issues
17 |       :link: https://github.com/dmlc/gluon-nlp/issues
18 | 
19 |       Feature requests, bug reports, design and roadmap discussion.
20 | 
21 |    .. card::
22 |       :title: GluonNLP Slack Channel
23 |       :link: https://apache-mxnet.slack.com/messages/CCCDM10V9
24 | 
25 |       #gluon-nlp Slack channel. Click the `sign-up link <https://join.slack.com/t/apache-mxnet/shared_invite/zt-5n577awn-iEQhjazdppqbAV~0K7_Vvg>`_ to register.
26 | 
27 | 
28 | Interested in contributing to GluonNLP? Check our contribution guide:
29 | 
30 | .. toctree::
31 |    :maxdepth: 3
32 | 
33 |    contribute
34 |    git
35 |    release
36 |    configuration


--------------------------------------------------------------------------------
/docs/website/release.rst:
--------------------------------------------------------------------------------
 1 | Release Checklist
 2 | =================
 3 | 
 4 | Below is the checklist for releasing a new minor version of GluonNLP:
 5 | 
 6 | - Creat a new release branch $major.$minor.x with commits from the master branch
 7 | - Bump the version in the master branch to $major.$minor+1.$patch.dev
 8 | - Bump the version in the release branch to $major.$minor.$patch
 9 | - Update the installation from source instruction in the release branch
10 | - Draft the release note, highlight important events/models/features, as well as breaking changes
11 | - Publish the release on Github, creating a tag $major.$minor.$patch
12 | - Check the content at http://gluon-nlp.mxnet.io/$major.$minor.x/index.html
13 | - Upload and refresh the default version website
14 | - Prepare pip package
15 | - Make annoucement (Twitter, etc)
16 | 


--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | markers =
3 |     seed: set the python, numpy and mxnet random seeds to a specified value for test reproducibility
4 |     serial: mark a test that requires more resources to run that are thus only suitable for serial run.
5 |     remote_required: mark a test that requires internet access.
6 |     gpu: mark a test that requires GPU.
7 |     integration: mark an integration test
8 |     skip_master: mark a test that is temporarily skipped for mxnet master validation.
9 | 


--------------------------------------------------------------------------------
/scripts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/scripts/__init__.py


--------------------------------------------------------------------------------
/scripts/benchmarks/README.md:
--------------------------------------------------------------------------------
 1 | # Benchmarking the Performance of NLP Backbones
 2 | 
 3 | We benchmark the latency and peak memory usage of a single training (forward + backward) and inference (forward-only) step 
 4 | of the NLP backbones.
 5 | For comparison, we also provide the numbers of the models in huggingface.
 6 | 
 7 | ## Backbones in HuggingFace
 8 | 
 9 | We use the [huggingface benchmark](https://github.com/huggingface/transformers/tree/master/examples/benchmarking) 
10 | to benchmark the training + inference speed of common workloads in NLP. 
11 | 
12 | ```bash
13 | python3 -m pip install -U -r requirements.txt
14 | python3 benchmark_hf.py
15 | ```
16 | 
17 | It will generate a list of csv files:
18 | 
19 | ```
20 | ├── pytorch_train_fp32.csv
21 | ├── pytorch_train_fp16.csv
22 | ├── pytorch_infer_fp32.csv
23 | ├── pytorch_infer_fp16.csv
24 | ├── pytorch_infer_fp32_ts.csv
25 | ```
26 | 
27 | ## GluonNLP Backbones based on MXNet-2.0
28 | 
29 | We profile three options: `NT` layout, `NT` layout with `TN` layout as the compute layout,
30 | and `TN` layout.
31 | 
32 | ```bash
33 | python3 -m pip install -U -r requirements.txt
34 | bash benchmark_gluonnlp.sh
35 | ```
36 | 
37 | It will generate csv files with `gluonnlp_` as the prefix
38 | ```
39 | ├── gluonnlp_train_fp32_NT_NT.csv
40 | ├── gluonnlp_train_fp32_NT_TN.csv
41 | ├── gluonnlp_train_fp32_TN_TN.csv
42 | ├── gluonnlp_infer_fp32_NT_NT_tvm0.csv
43 | ├── gluonnlp_infer_fp32_NT_TN_tvm0.csv
44 | ├── gluonnlp_infer_fp32_TN_TN_tvm0.csv
45 | ```
46 | 
47 | ## GluonNLP + TVM for Inference
48 | 
49 | Install TVM as described in https://tvm.apache.org/docs/install/index.html
50 | 
51 | ```bash
52 | bash benchmark_gluonnlp_tvm.sh
53 | ```
54 | 
55 | ```
56 | ├── gluonnlp_infer_fp32_NT_NT_tvm1.csv
57 | ├── gluonnlp_infer_fp32_NT_TN_tvm1.csv
58 | ├── gluonnlp_infer_fp32_TN_TN_tvm1.csv
59 | ```
60 | 
61 | ## Generate the Benchmark Report
62 | 


--------------------------------------------------------------------------------
/scripts/benchmarks/benchmark_gluonnlp.sh:
--------------------------------------------------------------------------------
 1 | for mode in train inference
 2 | do
 3 |   python3 benchmark_gluonnlp.py --layout NT --compute_layout NT --mode $mode
 4 | done
 5 | 
 6 | for mode in train inference
 7 | do
 8 |   python3 benchmark_gluonnlp.py --layout NT --compute_layout TN --mode $mode
 9 | done
10 | 
11 | for mode in train inference
12 | do
13 |   python3 benchmark_gluonnlp.py --layout TN --compute_layout TN --mode $mode
14 | done
15 | 


--------------------------------------------------------------------------------
/scripts/benchmarks/benchmark_gluonnlp_fp16.sh:
--------------------------------------------------------------------------------
 1 | for mode in train inference
 2 | do
 3 |   python3 benchmark_gluonnlp.py --layout NT --compute_layout NT --mode $mode --use_fp16
 4 | done
 5 | 
 6 | for mode in train inference
 7 | do
 8 |   python3 benchmark_gluonnlp.py --layout NT --compute_layout TN --mode $mode --use_fp16
 9 | done
10 | 
11 | for mode in train inference
12 | do
13 |   python3 benchmark_gluonnlp.py --layout TN --compute_layout TN --mode $mode --use_fp16
14 | done
15 | 


--------------------------------------------------------------------------------
/scripts/benchmarks/benchmark_gluonnlp_tvm.sh:
--------------------------------------------------------------------------------
1 | python3 benchmark_gluonnlp.py --layout NT --compute_layout NT --mode inference --use_tvm --instance_type g4
2 | python3 benchmark_gluonnlp.py --layout NT --compute_layout TN --mode inference --use_tvm --instance_type g4
3 | python3 benchmark_gluonnlp.py --layout TN --compute_layout TN --mode inference --use_tvm --instance_type g4
4 | 


--------------------------------------------------------------------------------
/scripts/benchmarks/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers
2 | py3nvml
3 | torch
4 | torchvision
5 | 


--------------------------------------------------------------------------------
/scripts/benchmarks/run_backbone_benchmark.sh:
--------------------------------------------------------------------------------
1 | python3 -m pip install -U -r requirements.txt
2 | python3 benchmark_hf.py
3 | bash benchmark_gluonnlp.sh
4 | bash benchmark_gluonnlp_fp16.sh
5 | 


--------------------------------------------------------------------------------
/scripts/classification/README.md:
--------------------------------------------------------------------------------
 1 | # finetune classification
 2 | ## prepare datasets
 3 | use nlp_data to prepare data at first.
 4 | ```bash
 5 | nlp_data prepare_glue --benchmark glue -t sst
 6 | ```
 7 | ##finetine scripts
 8 | Then run the scripts to finetune:
 9 | ```bash
10 | python train_classification.py \
11 |   --model_name google_en_uncased_bert_base \
12 |   --task_name cola \
13 |   --lr 2e-5\
14 |   --model_name google_en_cased_bert_base \
15 |   --batch_size 32 \
16 |   --do_train \
17 |   --do_eval \
18 |   --seed 7800 \
19 |   --epochs 10 \
20 |   --optimizer adamw \
21 |   --train_dir glue/cola/train.parquet \
22 |   --eval_dir glue/cola/dev.parquet \
23 |   --gpus 0
24 | ```
25 | alternatively, because some task are slow(like MNLI), you can use horovod to accelerate,
26 | ```bash
27 | horovodrun -np 4 -H localhost:4  python train_classification.py \
28 |   --comm_backend horovod \
29 |   --model_name google_en_uncased_bert_base \
30 |   --task_name mnli \
31 |   --lr 2e-4\
32 |   --batch_size 32 \
33 |   --do_train \
34 |   --do_eval \
35 |   --epochs 5 \
36 |   --log_interval 500 \
37 |   --warmup_ratio 0.1 \
38 |   --optimizer adamw \
39 |   --train_dir glue/mnli/train.parquet \
40 |   --eval_dir glue/mnli/dev_matched.parquet \
41 |   --gpus 0,1,2,3
42 | ```
43 | 
44 | ## some result
45 | here are some results with their hyperparameters
46 | 
47 | | task Name    | metirc | learning rate  | batch size | seed | epoch | result | tensorboard dev |
48 | |-----------|-------------|---------------|--------------|---------|-------|------|-----|
49 | |    SST    | Accuracy |  2e-5       | 32    | 7800 |  5 |  93.23 | https://tensorboard.dev/experiment/eKVI0DC6SEWBbHzS8ZphNg/|
50 | |    STS    |  Pearson Corr. | 2e-5       | 32    | 24 |  10 |  89.26 |  https://tensorboard.dev/experiment/kPOnlNeiQ4W5EmFlkqjC6A/|
51 | |    CoLA    | Matthew Corr.  | 2e-5       | 32    | 7800 |  10 |  59.23 |  https://tensorboard.dev/experiment/33euRGh9SrW3p15JWgILnw/ |
52 | |    RTE    |  Accuracy | 2e-5       | 32    | 1800 |  10 |  69.67 |  https://tensorboard.dev/experiment/XjTxr5anRrC1LMukLJJQ3g/|
53 | |    MRPC    | Accuracy/F1  | 3e-5       | 32    | 7800 |  5 |  85.38/87.31 |  https://tensorboard.dev/experiment/jEJFq2XXQ8SvCxt6eKIjwg/ |
54 | |    MNLI    |  Accuracy(m/mm) | 2e-5       | 48    | 7800 |  5 |  84.90/85.10 |  https://tensorboard.dev/experiment/CZQlOBedRQeTZwn5o5fbKQ/ |


--------------------------------------------------------------------------------
/scripts/classification/classification.py:
--------------------------------------------------------------------------------
 1 | import gluonnlp
 2 | import numpy as np
 3 | import mxnet as mx
 4 | import pandas as pd
 5 | import matplotlib.pyplot as plt
 6 | from gluonnlp.data.sampler import SplitSampler
 7 | from tqdm import tqdm
 8 | from mxnet.gluon import nn
 9 | from gluonnlp.models import get_backbone
10 | from gluonnlp.utils.parameter import clip_grad_global_norm
11 | from gluonnlp.utils.preprocessing import get_trimmed_lengths
12 | from gluonnlp.utils.misc import get_mxnet_visible_ctx, grouper, repeat
13 | from mxnet.gluon.data import batchify as bf
14 | from mxnet.gluon.data import DataLoader
15 | from mxnet.lr_scheduler import PolyScheduler
16 | from gluonnlp.utils import set_seed
17 | 
18 | class TextPredictionNet(nn.HybridBlock):
19 |     def __init__(self, backbone, output_size = 2):
20 |         super().__init__()
21 |         self.backbone = backbone
22 |         self.output_size = output_size
23 |         self.out_proj = nn.Dense(in_units=backbone.units,
24 |                                  units=self.output_size,
25 |                                  flatten=False)
26 | 
27 | 
28 |     def forward(self, data, token_types, valid_length):
29 |         _, pooled_out = self.backbone(data, token_types, valid_length)
30 |         out = self.out_proj(pooled_out)
31 |         return out
32 | 
33 |     def initialize_with_pretrained_backbone(self, backbone_params_path, ctx=None):
34 |         self.backbone.load_parameters(backbone_params_path, ctx=ctx)
35 |         self.out_proj.initialize(ctx=ctx)
36 | 
37 | 


--------------------------------------------------------------------------------
/scripts/conversion_toolkits/README.md:
--------------------------------------------------------------------------------
 1 | # Conversion Scripts
 2 | 
 3 | In GluonNLP, we provide shared scripts to convert the model checkpoints in other repositories to GluonNLP.  
 4 | 
 5 | At this stage, the model needs to be downloaded locally, and the converting scripts accepts only the file directory as the argument,
 6 | without the support of accepting the url. In addition, both the tensorflow fine-tuned models that
 7 | can be loaded in TF1 Hub modules and TF2 SavedModels are accepted, although the parameters of mask
 8 | language model are not provided in TF2 SavedModels in most cases, and
 9 | the differences of these parameters are not required to be tested after converting.
10 | 
11 | The testing step mentioned above are controlled by the flag `--test`, in which the maximum
12 | tolerance of 1e-3 between gluon model with converted weights and original tensorflow model.
13 | In addition, we can use GPU in all converting scripts by adding `--gpu 0`.
14 | 
15 | For RoBERTa XLM-R and BART model, we rely on the master version of [fairseq](https://github.com/pytorch/fairseq#requirements-and-installation) package locally as `pip install git+https://github.com/pytorch/fairseq.git@master`.
16 | 
17 | ## BERT
18 | Convert model from [BERT LIST](https://tfhub.dev/google/collections/bert/1).
19 | 
20 | You can use the script provided in [convert_bert.sh](convert_bert.sh).
21 | The following command give you a rough idea about the code.
22 | 
23 | ```bash
24 | bash convert_bert.sh
25 | ```
26 | 
27 | In the process, we downloaded the config file from the [official repo](https://github.com/google-research/bert#pre-trained-models), download the configuration file `bert_config.json`,
28 | and move it into `${case}_bert_${model}/assets/`.
29 | 
30 | ## ALBERT
31 | You can use the command described in
32 | ```bash
33 | bash convert_albert.sh
34 | ```
35 | 
36 | ## ELECTRA
37 | The TF Hub is not available for ELECTRA model currently.
38 | Thus, you will need to clone the [electra repository](https://github.com/ZheyuYe/electra)
39 | and download the checkpoint. The parameters are converted from local checkpoints.
40 | By running the following command, you can convert + verify the ELECTRA model with both the discriminator and the generator.
41 | 
42 | Notice: please set up the `--electra_path` with the cloned path if you'd like to directly use `convert_electra.py`.
43 | 
44 | ```bash
45 | bash convert_electra.sh
46 | ```
47 | 
48 | ## MobileBert
49 | ```bash
50 | bash convert_mobilebert.sh
51 | ```
52 | 
53 | ## RoBERTa
54 | ```bash
55 | bash convert_roberta.sh
56 | ```
57 | 
58 | ## XLM-R
59 | ```bash
60 | bash convert_xlmr.sh
61 | ```
62 | 
63 | ## BART
64 | ```bash
65 | bash convert_bart.sh
66 | ```
67 | 
68 | ## GPT-2
69 | ```bash
70 | bash convert_gpt2.sh
71 | ```
72 | 
73 | ## T5
74 | ```bash
75 | bash convert_t5.sh
76 | ```
77 | 


--------------------------------------------------------------------------------
/scripts/conversion_toolkits/bert_base_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "attention_probs_dropout_prob": 0.1,
 3 |   "hidden_act": "gelu",
 4 |   "hidden_dropout_prob": 0.1,
 5 |   "hidden_size": 768,
 6 |   "initializer_range": 0.02,
 7 |   "intermediate_size": 3072,
 8 |   "max_position_embeddings": 512,
 9 |   "num_attention_heads": 12,
10 |   "num_hidden_layers": 12,
11 |   "type_vocab_size": 2,
12 |   "vocab_size": 30522
13 | }
14 | 


--------------------------------------------------------------------------------
/scripts/conversion_toolkits/bert_large_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "attention_probs_dropout_prob": 0.1,
 3 |   "hidden_act": "gelu",
 4 |   "hidden_dropout_prob": 0.1,
 5 |   "hidden_size": 1024,
 6 |   "initializer_range": 0.02,
 7 |   "intermediate_size": 4096,
 8 |   "max_position_embeddings": 512,
 9 |   "num_attention_heads": 16,
10 |   "num_hidden_layers": 24,
11 |   "type_vocab_size": 2,
12 |   "vocab_size": 30522
13 | }
14 | 


--------------------------------------------------------------------------------
/scripts/conversion_toolkits/convert_albert.sh:
--------------------------------------------------------------------------------
 1 | set -ex
 2 | 
 3 | python3 -m pip install tensorflow==1.15 --upgrade --user
 4 | python3 -m pip install tensorflow_hub --upgrade --user
 5 | export TF_FORCE_GPU_ALLOW_GROWTH="true"
 6 | for model in base large xlarge xxlarge
 7 | do
 8 |     hub_directory="google_albert_${model}_v2"
 9 |     mkdir -p ${hub_directory}
10 |     wget "https://tfhub.dev/google/albert_${model}/3?tf-hub-format=compressed" -O "${hub_directory}.tar.gz"
11 |     tar -xvf ${hub_directory}.tar.gz --directory ${hub_directory}
12 |     python3 convert_tf_hub_model.py --tf_hub_model_path ${hub_directory} --model_type albert --test
13 | done
14 | 


--------------------------------------------------------------------------------
/scripts/conversion_toolkits/convert_bart.sh:
--------------------------------------------------------------------------------
1 | python3 -m pip install git+https://github.com/pytorch/fairseq.git@master --upgrade --user
2 | for model in base large
3 | do
4 |     mkdir bart_${model}
5 |     wget  "https://dl.fbaipublicfiles.com/fairseq/models/bart.${model}.tar.gz"
6 |     tar zxf bart.${model}.tar.gz --directory bart_${model}
7 |     python3 convert_fairseq_bart.py --fairseq_model_path bart_${model}/bart.${model} --test
8 | done
9 | 


--------------------------------------------------------------------------------
/scripts/conversion_toolkits/convert_bert.sh:
--------------------------------------------------------------------------------
 1 | set -ex
 2 | 
 3 | python3 -m pip install 'tensorflow<3' --upgrade --user
 4 | python3 -m pip install tensorflow_hub --upgrade --user
 5 | export TF_FORCE_GPU_ALLOW_GROWTH="true"
 6 | 
 7 | # Conversion for English Models
 8 | for model in base large
 9 | do
10 |     for case in cased uncased
11 |     do
12 |         hub_directory="google_en_${case}_bert_${model}"
13 |         mkdir -p ${hub_directory}
14 |         if [ ${model} == base ];then
15 |             url="https://tfhub.dev/google/bert_${case}_L-12_H-768_A-12/1?tf-hub-format=compressed"
16 |         else
17 |             url="https://tfhub.dev/google/bert_${case}_L-24_H-1024_A-16/1?tf-hub-format=compressed"
18 |         fi
19 |         wget ${url} -O "${hub_directory}.tar.gz"
20 |         tar -xvf ${hub_directory}.tar.gz --directory ${hub_directory}
21 |         cp bert_${model}_config.json ${hub_directory}/assets/
22 |         python3 convert_tf_hub_model.py --tf_hub_model_path ${hub_directory} --model_type bert --test
23 |     done
24 | done
25 | 
26 | # Conversion for Chinese Models
27 | url="https://tfhub.dev/tensorflow/bert_zh_L-12_H-768_A-12/2?tf-hub-format=compressed"
28 | hub_directory="google_zh_bert_base"
29 | mkdir -p ${hub_directory}
30 | wget ${url} -O "${hub_directory}.tar.gz"
31 | tar -xvf ${hub_directory}.tar.gz --directory ${hub_directory}
32 | cp bert_base_config.json ${hub_directory}/assets/
33 | python3 convert_tf_hub_model.py --tf_hub_model_path ${hub_directory} --model_type bert --test
34 | 
35 | # Conversion for Multi-lingual Models
36 | url="https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/2?tf-hub-format=compressed"
37 | hub_directory="google_multi_cased_bert_base"
38 | mkdir -p ${hub_directory}
39 | wget ${url} -O "${hub_directory}.tar.gz"
40 | tar -xvf ${hub_directory}.tar.gz --directory ${hub_directory}
41 | cp bert_base_config.json ${hub_directory}/assets/
42 | python3 convert_tf_hub_model.py --tf_hub_model_path ${hub_directory} --model_type bert --test
43 | 
44 | # Conversion for Whole-word-masking Models
45 | for case in cased uncased
46 | do
47 |     hub_directory="google_en_${case}_bert_wwm_large"
48 |     mkdir -p ${hub_directory}
49 |     url="https://tfhub.dev/tensorflow/bert_en_wwm_${case}_L-24_H-1024_A-16/2?tf-hub-format=compressed"
50 |     wget ${url} -O "${hub_directory}.tar.gz"
51 |     tar -xvf ${hub_directory}.tar.gz --directory ${hub_directory}
52 |     cp bert_large_config.json ${hub_directory}/assets/
53 |     python3 convert_tf_hub_model.py --tf_hub_model_path ${hub_directory} --model_type bert --test
54 | done
55 | 


--------------------------------------------------------------------------------
/scripts/conversion_toolkits/convert_bert_torch.sh:
--------------------------------------------------------------------------------
 1 | set -ex
 2 | 
 3 | python3 -m pip install 'tensorflow<3' --upgrade --user
 4 | python3 -m pip install tensorflow_hub --upgrade --user
 5 | export TF_FORCE_GPU_ALLOW_GROWTH="true"
 6 | 
 7 | # Conversion for English Models
 8 | for model in base large
 9 | do
10 |     for case in cased uncased
11 |         do
12 |             hub_directory="google_en_${case}_bert_${model}"
13 |             mkdir -p ${hub_directory}
14 |             if [ ${model} == base ];then
15 |                 url="https://tfhub.dev/google/bert_${case}_L-12_H-768_A-12/1?tf-hub-format=compressed"
16 |             else
17 |                 url="https://tfhub.dev/google/bert_${case}_L-24_H-1024_A-16/1?tf-hub-format=compressed"
18 |             fi
19 |             wget ${url} -O "${hub_directory}.tar.gz"
20 |             tar -xvf ${hub_directory}.tar.gz --directory ${hub_directory}
21 |             cp bert_${model}_config.json ${hub_directory}/assets/
22 |             python3 convert_tf_hub_model.py --tf_hub_model_path ${hub_directory} --model_type bert --test --torch
23 |         done
24 |     done
25 | 


--------------------------------------------------------------------------------
/scripts/conversion_toolkits/convert_electra.sh:
--------------------------------------------------------------------------------
 1 | python3 -m pip install tensorflow==1.15 --upgrade --user
 2 | export TF_FORCE_GPU_ALLOW_GROWTH="true"
 3 | git clone https://github.com/ZheyuYe/electra.git
 4 | cd electra
 5 | git checkout 923179410471f9e1820b3f0771c239e1752e4e18
 6 | cd ..
 7 | for model in small base large
 8 | do
 9 |     wget https://storage.googleapis.com/electra-data/electra_${model}.zip
10 |     unzip electra_${model}.zip
11 |     python3 convert_electra.py --tf_model_path electra_${model} --electra_path electra --model_size ${model} --test
12 | done
13 | 


--------------------------------------------------------------------------------
/scripts/conversion_toolkits/convert_gpt2.sh:
--------------------------------------------------------------------------------
1 | python3 -m pip install tensorflow==1.15 --upgrade --user
2 | git clone https://github.com/openai/gpt-2.git gpt_2
3 | for model in 124M 355M 774M 1558M
4 | do
5 |     python3 gpt_2/download_model.py ${model}
6 |     mkdir gpt2_${model}
7 |     CUDA_VISIBLE_DEVICES="" python3 convert_gpt2.py --tf_model_path models/${model} --save_dir gpt2_${model} --test
8 | done
9 | 


--------------------------------------------------------------------------------
/scripts/conversion_toolkits/convert_mobilebert.sh:
--------------------------------------------------------------------------------
 1 | python3 -m pip install tensorflow==1.15 --upgrade --user
 2 | export TF_FORCE_GPU_ALLOW_GROWTH="true"
 3 | svn checkout https://github.com/google-research/google-research/trunk/mobilebert
 4 | 
 5 | mkdir mobilebert_model
 6 | url='https://storage.googleapis.com/cloud-tpu-checkpoints/mobilebert/uncased_L-24_H-128_B-512_A-4_F-4_OPT.tar.gz'
 7 | wget ${url} -O "mobilebert.tar.gz"
 8 | tar -xvf mobilebert.tar.gz --directory mobilebert_model
 9 | python3 convert_mobilebert.py --tf_model_path mobilebert_model/mobilebert --mobilebert_dir mobilebert --test
10 | 


--------------------------------------------------------------------------------
/scripts/conversion_toolkits/convert_mt5.sh:
--------------------------------------------------------------------------------
1 | python3 -m pip install git+https://github.com/huggingface/transformers.git --upgrade
2 | for model in small base large xl xxl
3 | do
4 |     dest_dir="google_mt5_${model}"
5 |     mkdir ${dest_dir}
6 |     python3 convert_mt5.py "google/mt5-${model}" ${dest_dir} --test
7 | done
8 | 


--------------------------------------------------------------------------------
/scripts/conversion_toolkits/convert_roberta.sh:
--------------------------------------------------------------------------------
1 | python3 -m pip install git+https://github.com/pytorch/fairseq.git@master --upgrade --user
2 | for model in base large
3 | do
4 |     mkdir roberta_${model}
5 |     wget "https://dl.fbaipublicfiles.com/fairseq/models/roberta.${model}.tar.gz"
6 |     tar zxf roberta.${model}.tar.gz --directory roberta_${model}
7 |     python3 convert_fairseq_roberta.py --fairseq_model_path roberta_${model}/roberta.${model} --test
8 | done
9 | 


--------------------------------------------------------------------------------
/scripts/conversion_toolkits/convert_t5.sh:
--------------------------------------------------------------------------------
1 | python3 -m pip install git+https://github.com/huggingface/transformers.git --upgrade
2 | for model in small base large 3B 11B
3 | do
4 |     dest_dir="google_t5_${model}"
5 |     mkdir ${dest_dir}
6 |     python3 convert_t5.py "t5-${model,,}" ${dest_dir} --test
7 | done
8 | 


--------------------------------------------------------------------------------
/scripts/conversion_toolkits/convert_xlmr.sh:
--------------------------------------------------------------------------------
1 | python3 -m pip install fairseq==0.10.1 --upgrade --user
2 | for model in base large
3 | do
4 |     mkdir xlmr_${model}
5 |     wget "https://dl.fbaipublicfiles.com/fairseq/models/xlmr.${model}.tar.gz"
6 |     tar zxf xlmr.${model}.tar.gz --directory xlmr_${model}
7 |     python3 convert_fairseq_xlmr.py --fairseq_model_path xlmr_${model}/xlmr.${model} --model_size ${model} --test
8 | done
9 | 


--------------------------------------------------------------------------------
/scripts/datasets/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/scripts/datasets/__init__.py


--------------------------------------------------------------------------------
/scripts/datasets/__main__.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import importlib
 3 | import os
 4 | 
 5 | SUBCOMMAND_DICT = dict()
 6 | 
 7 | # Find all modules starting with `prepare_`
 8 | CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
 9 | for root, dirs, files in os.walk(CURR_DIR, topdown=False):
10 |     for name in files:
11 |         if name.startswith('prepare_') and name.endswith('.py'):
12 |             command = name[:-3]
13 |             path = os.path.join(root, name)
14 |             relpath = os.path.relpath(path, CURR_DIR)[:-3]
15 |             if relpath.startswith(os.sep):
16 |                 relpath = path[len(os.sep):]
17 |             subpackage = relpath.replace(os.sep, '.')
18 |             SUBCOMMAND_DICT[command] = 'gluonnlp.cli.data.' + subpackage
19 | 
20 | 
21 | def cli_main():
22 |     parser = argparse.ArgumentParser(
23 |         description='Build-in scripts for downloading and preparing the data in GluonNLP.',
24 |         prog='nlp_data', add_help=False)
25 |     parser.add_argument('command', type=str,
26 |                         choices=sorted(SUBCOMMAND_DICT.keys()) + ['help'],
27 |                         metavar='[subcommand]',
28 |                         help='The subcommand to use. '
29 |                              'Choices are {}.'.format(sorted(SUBCOMMAND_DICT.keys()) + ['help']))
30 |     args, other_args = parser.parse_known_args()
31 |     if args.command == 'help':
32 |         parser.print_help()
33 |     else:
34 |         mod = importlib.import_module(SUBCOMMAND_DICT[args.command])
35 |         parser = mod.get_parser()
36 |         sub_args = parser.parse_args(other_args)
37 |         mod.main(sub_args)
38 | 
39 | 
40 | if __name__ == '__main__':
41 |     cli_main()
42 | 


--------------------------------------------------------------------------------
/scripts/datasets/general_nlp_benchmark/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/scripts/datasets/general_nlp_benchmark/__init__.py


--------------------------------------------------------------------------------
/scripts/datasets/language_modeling/README.md:
--------------------------------------------------------------------------------
 1 | # Language Modeling Benchmark
 2 | 
 3 | Prepare the language modeling benchmarking datasets. 
 4 | In order to help reproduce the papers, we use 
 5 | the tokenized corpus as the training/validation/testing dataset.
 6 | 
 7 | ```bash
 8 | # WikiText-2
 9 | nlp_data prepare_lm --dataset wikitext2
10 | 
11 | # WikiText-103
12 | nlp_data prepare_lm --dataset wikitext103
13 | 
14 | # enwik8
15 | nlp_data prepare_lm --dataset enwik8
16 | 
17 | # Text-8
18 | nlp_data prepare_lm --dataset text8
19 | 
20 | # Google One-Billion-Word
21 | nlp_data prepare_lm --dataset gbw
22 | ```
23 | 
24 | Happy language modeling :)
25 | 


--------------------------------------------------------------------------------
/scripts/datasets/language_modeling/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/scripts/datasets/language_modeling/__init__.py


--------------------------------------------------------------------------------
/scripts/datasets/machine_translation/README.md:
--------------------------------------------------------------------------------
 1 | # Machine Translation
 2 | 
 3 | In machine translation, we train a model to map a sentence from the source language, e.g., English, 
 4 | to the target language, e.g., Chinese. Here, we provide scripts to download the common benchmark 
 5 | datasets for machine translation. The downloaded datasets are stored as a pair of corpus files, 
 6 | one for the source and the other for the target.  
 7 | 
 8 | ## WMT
 9 | You can use [prepare_wmt.py](prepare_wmt.py) to download and prepare the raw training corpus and 
10 | then use [clean_parallel_corpus.py](../../preprocess/clean_parallel_corpus.py) to clean and 
11 | filter the corpus. 
12 | 
13 | You may download the raw WMT2014 en-de  
14 | ```bash
15 | nlp_data prepare_wmt \
16 |         --dataset wmt2014 \
17 |         --lang-pair en-de \
18 |         --save-path wmt2014_en_de
19 | ```
20 | 
21 | By combining `nlp_data` and `nlp_process`, we provide the example for preparing the 
22 | WMT2014 en-de training dataset: [wmt2014_ende.sh](wmt2014_ende.sh). This involves three steps:
23 | - Downloading the raw text data
24 | - Clean and tokenize the data
25 | - Learn subword model and apply the learned subword model.
26 | 
27 | ```bash
28 | bash wmt2014_ende.sh yttm
29 | ```
30 | 
31 | We support the following subword learning algorithms:
32 | 
33 | ```bash
34 | # BPE from YouTokenToMe
35 | bash wmt2014_ende.sh yttm
36 | 
37 | # BPE from Huggingface
38 | bash wmt2014_ende.sh hf_bpe
39 | 
40 | # BPE from subword-nmt
41 | bash wmt2014_ende.sh subword_nmt
42 | 
43 | # Byte-level BPE
44 | bash wmt2014_ende.sh hf_bytebpe
45 | 
46 | # Sentencepiece
47 | bash wmt2014_ende.sh spm
48 | 
49 | # WordPiece
50 | bash wmt2014_ende.sh hf_wordpiece
51 | ```
52 | 
53 | 
54 | Apart from WMT2014 EN-DE, we also provided the script for preparing the training data for 
55 | WMT2017 ZH-EN task: 
56 | [wmt2017_zhen.sh](wmt2017_zhen.sh).
57 | 
58 | ### Monolingual Corpus
59 | In the WMT competition, there are additional monolingual corpus that helps you train NMT models. 
60 | You may download the raw monolingual corpus by adding `--mono` flag.
61 | 
62 | One example is to download the newscrawl monolingual corpus in German:
63 | 
64 | ```bash
65 | nlp_data prepare_wmt \
66 |         --mono \
67 |         --mono_lang de \
68 |         --dataset newscrawl \
69 |         --save-path wmt2014_mono
70 | ```   
71 | 
72 | 
73 | ### Directory Structure of Translation Dataset
74 | 
75 | The basic structure of a translation dataset is like the following:
76 | ```
77 | folder_name
78 | ├── train.raw.{src}
79 | ├── train.raw.{tgt}
80 | ├── train.tok.{src}
81 | ├── train.tok.{tgt}
82 | ├── train.tok.{subword_model}.{src}
83 | ├── train.tok.{subword_model}.{tgt}
84 | ├── ... 
85 | ├── ... Repeat for valid and test
86 | ├── ...
87 | ├── {subword_model}.model
88 | ├── {subword_model}.path
89 | ```
90 | 


--------------------------------------------------------------------------------
/scripts/datasets/machine_translation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/scripts/datasets/machine_translation/__init__.py


--------------------------------------------------------------------------------
/scripts/datasets/music_generation/README.md:
--------------------------------------------------------------------------------
 1 | # Music Generation
 2 | 
 3 | We provide datasets for training a music generation model. 
 4 | 
 5 | ## Maestro
 6 | 
 7 | See https://magenta.tensorflow.org/datasets/maestro for detailed introduction.
 8 | 
 9 | ```
10 | # Get V1 Dataset
11 | nlp_data prepare_music_midi --dataset maestro_v1
12 | 
13 | # Get V2 Dataset
14 | nlp_data prepare_music_midi --dataset maestro_v2
15 | ```
16 | 
17 | ## LakhMIDI
18 | 
19 | See https://colinraffel.com/projects/lmd/ for more details
20 | 
21 | ```
22 | # Get Lakh MIDI Full Dataset
23 | nlp_data prepare_music_midi --dataset lmd_full
24 | 
25 | # Get the subset of 45,129 files from LMD-full 
26 | # which have been matched to entries in the Million Song Datase
27 | nlp_data prepare_music_midi --dataset lmd_matched
28 | 
29 | # Get the aligned version of lmd_matched
30 | nlp_data prepare_music_midi --dataset lmd_aligned
31 | 
32 | # Get the clean midi data
33 | nlp_data prepare_music_midi --dataset clean_midi
34 | ```
35 | 
36 | ## Geocities
37 | 
38 | The Geocities collection of MIDI files. 
39 | See https://archive.org/details/archiveteam-geocities-midi-collection-2009 for more details.
40 | ```
41 | nlp_data prepare_music_midi --dataset geocities
42 | ```
43 | 


--------------------------------------------------------------------------------
/scripts/datasets/music_generation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/scripts/datasets/music_generation/__init__.py


--------------------------------------------------------------------------------
/scripts/datasets/pretrain_corpus/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/scripts/datasets/pretrain_corpus/__init__.py


--------------------------------------------------------------------------------
/scripts/datasets/question_answering/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/scripts/datasets/question_answering/__init__.py


--------------------------------------------------------------------------------
/scripts/datasets/question_answering/prepare_hotpotqa.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import argparse
 3 | from gluonnlp.utils.misc import download, load_checksum_stats
 4 | from gluonnlp.base import get_data_home_dir
 5 | 
 6 | _CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
 7 | _BASE_DATASET_PATH = os.path.join(get_data_home_dir(), 'hotpotqa')
 8 | _URL_FILE_STATS_PATH = os.path.join(_CURR_DIR, '..', 'url_checksums', 'hotpotqa.txt')
 9 | _URL_FILE_STATS = load_checksum_stats(_URL_FILE_STATS_PATH)
10 | 
11 | 
12 | _CITATIONS = """
13 | @inproceedings{yang2018hotpotqa,
14 |   title={{HotpotQA}: A Dataset for Diverse, Explainable Multi-hop Question Answering},
15 |   author={Yang, Zhilin and Qi, Peng and Zhang, Saizheng and Bengio, Yoshua and Cohen, William W. and Salakhutdinov, Ruslan and Manning, Christopher D.},
16 |   booktitle={Conference on Empirical Methods in Natural Language Processing ({EMNLP})},
17 |   year={2018}
18 | }
19 | 
20 | """
21 | 
22 | _URLS = {
23 |     'train': 'http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_train_v1.1.json',
24 |     'dev_fullwiki': 'http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_fullwiki_v1.json',
25 |     'dev_distractor': 'http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_distractor_v1.json',
26 |     'test_fullwiki': 'http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_test_fullwiki_v1.json',
27 | }
28 | 
29 | 
30 | def get_parser():
31 |     parser = argparse.ArgumentParser(description='Downloading the HotpotQA Dataset.')
32 |     parser.add_argument('--save-path', type=str, default='hotpotqa')
33 |     parser.add_argument('--cache-path', type=str, default=_BASE_DATASET_PATH,
34 |                         help='The path to download the dataset.')
35 |     parser.add_argument('--overwrite', action='store_true')
36 |     return parser
37 | 
38 | 
39 | def main(args):
40 |     if not os.path.exists(args.save_path):
41 |         os.makedirs(args.save_path)
42 |     for url in _URLS.values():
43 |         file_name = url[url.rfind('/') + 1:]
44 |         file_hash = _URL_FILE_STATS[url]
45 |         download(url, path=os.path.join(args.cache_path, file_name), sha1_hash=file_hash)
46 |         if not os.path.exists(os.path.join(args.save_path, file_name))\
47 |                 or (args.overwrite and args.save_path != args.cache_path):
48 |             os.symlink(os.path.join(args.cache_path, file_name),
49 |                        os.path.join(args.save_path, file_name))
50 | 
51 | 
52 | def cli_main():
53 |     parser = get_parser()
54 |     args = parser.parse_args()
55 |     main(args)
56 | 
57 | 
58 | if __name__ == '__main__':
59 |     cli_main()
60 | 


--------------------------------------------------------------------------------
/scripts/datasets/question_answering/prepare_searchqa.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import argparse
 3 | from gluonnlp.utils.misc import download, load_checksum_stats
 4 | from gluonnlp.base import get_data_home_dir, get_repo_url
 5 | 
 6 | _CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
 7 | _BASE_DATASET_PATH = os.path.join(get_data_home_dir(), 'searchqa')
 8 | _URL_FILE_STATS_PATH = os.path.join(_CURR_DIR, '..', 'url_checksums', 'searchqa.txt')
 9 | _URL_FILE_STATS = load_checksum_stats(_URL_FILE_STATS_PATH)
10 | 
11 | 
12 | _CITATIONS = """
13 | @article{dunn2017searchqa,
14 |   title={Searchqa: A new q\&a dataset augmented with context from a search engine},
15 |   author={Dunn, Matthew and Sagun, Levent and Higgins, Mike and Guney, V Ugur and Cirik, Volkan and Cho, Kyunghyun},
16 |   journal={arXiv preprint arXiv:1704.05179},
17 |   year={2017}
18 | }
19 | 
20 | """
21 | 
22 | _URLS = {
23 |     'train': get_repo_url() + 'datasets/question_answering/searchqa/train.txt',
24 |     'val': get_repo_url() + 'datasets/question_answering/searchqa/val.txt',
25 |     'test': get_repo_url() + 'datasets/question_answering/searchqa/test.txt'
26 | }
27 | 
28 | 
29 | def get_parser():
30 |     parser = argparse.ArgumentParser(description='Downloading the SearchQA Dataset.')
31 |     parser.add_argument('--save-path', type=str, default='searchqa')
32 |     parser.add_argument('--cache-path', type=str, default=_BASE_DATASET_PATH,
33 |                         help='The path to download the dataset.')
34 |     parser.add_argument('--overwrite', action='store_true')
35 |     return parser
36 | 
37 | 
38 | def main(args):
39 |     if not os.path.exists(args.save_path):
40 |         os.makedirs(args.save_path)
41 |     for url in _URLS.values():
42 |         file_name = url[url.rfind('/') + 1:]
43 |         file_hash = _URL_FILE_STATS[url]
44 |         download(url, path=os.path.join(args.cache_path, file_name), sha1_hash=file_hash)
45 |         if not os.path.exists(os.path.join(args.save_path, file_name))\
46 |                 or (args.overwrite and args.save_path != args.cache_path):
47 |             os.symlink(os.path.join(args.cache_path, file_name),
48 |                        os.path.join(args.save_path, file_name))
49 | 
50 | 
51 | def cli_main():
52 |     parser = get_parser()
53 |     args = parser.parse_args()
54 |     main(args)
55 | 
56 | 
57 | if __name__ == '__main__':
58 |     cli_main()
59 | 


--------------------------------------------------------------------------------
/scripts/datasets/question_answering/prepare_triviaqa.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import tarfile
 3 | import argparse
 4 | from gluonnlp.utils.misc import download, load_checksum_stats
 5 | from gluonnlp.base import get_data_home_dir
 6 | 
 7 | _CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
 8 | _BASE_DATASET_PATH = os.path.join(get_data_home_dir(), 'triviaqa')
 9 | _URL_FILE_STATS_PATH = os.path.join(_CURR_DIR, '..', 'url_checksums', 'triviaqa.txt')
10 | _URL_FILE_STATS = load_checksum_stats(_URL_FILE_STATS_PATH)
11 | 
12 | 
13 | _CITATIONS = """
14 | @InProceedings{JoshiTriviaQA2017,
15 |      author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. and Zettlemoyer, Luke},
16 |      title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},
17 |      booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics},
18 |      month = {July},
19 |      year = {2017},
20 |      address = {Vancouver, Canada},
21 |      publisher = {Association for Computational Linguistics},
22 | }
23 | 
24 | """
25 | 
26 | _URLS = {
27 |     'rc': 'https://nlp.cs.washington.edu/triviaqa/data/triviaqa-rc.tar.gz',
28 |     'unfiltered': 'https://nlp.cs.washington.edu/triviaqa/data/triviaqa-unfiltered.tar.gz'
29 | }
30 | 
31 | 
32 | def get_parser():
33 |     parser = argparse.ArgumentParser(description='Downloading the TriviaQA Dataset.')
34 |     parser.add_argument('--type', type=str, choices=['rc', 'unfiltered'], default='rc',
35 |                         help='type of the triviaqa dataset.')
36 |     parser.add_argument('--save-path', type=str, default='triviaqa')
37 |     parser.add_argument('--cache-path', type=str, default=_BASE_DATASET_PATH,
38 |                         help='The path to download the dataset.')
39 |     parser.add_argument('--overwrite', action='store_true')
40 |     return parser
41 | 
42 | 
43 | def main(args):
44 | 
45 |     def extract(tar_path, target_path):
46 |         try:
47 |             tar = tarfile.open(tar_path, "r:gz")
48 |             file_names = tar.getnames()
49 |             for file_name in file_names:
50 |                 tar.extract(file_name, target_path)
51 |             tar.close()
52 |         except Exception  as e:
53 |             print(e)
54 | 
55 |     tar_url = _URLS[args.type]
56 |     file_name = tar_url[tar_url.rfind('/') + 1:]
57 |     file_hash = _URL_FILE_STATS[tar_url]
58 |     download(tar_url, path=os.path.join(args.cache_path, file_name), sha1_hash=file_hash)
59 |     if not os.path.exists(args.save_path):
60 |         os.makedirs(args.save_path)
61 |     if not os.path.exists(os.path.join(args.save_path, file_name))\
62 |             or (args.overwrite and args.save_path != args.cache_path):
63 |         os.symlink(os.path.join(args.cache_path, file_name),
64 |                    os.path.join(args.save_path, file_name))
65 |     extract(os.path.join(args.save_path, file_name), args.save_path)
66 | 
67 | 
68 | def cli_main():
69 |     parser = get_parser()
70 |     args = parser.parse_args()
71 |     main(args)
72 | 
73 | 
74 | if __name__ == '__main__':
75 |     cli_main()
76 | 


--------------------------------------------------------------------------------
/scripts/datasets/url_checksums/bookcorpus.txt:
--------------------------------------------------------------------------------
1 | https://the-eye.eu/public/AI/pile_preliminary_components/books1.tar.gz 87ca37e83fd7ea58573a1630ebf9d1da9ee34a41 2404269430


--------------------------------------------------------------------------------
/scripts/datasets/url_checksums/glue.txt:
--------------------------------------------------------------------------------
 1 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/glue/cola.zip 19096246cd2a06d8fe2d13880d6cec61149f77c7 376971
 2 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/glue/sst.zip 44f5954391612a8b3d9d65f6d4a824e9ae8d19ce 7439277
 3 | https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt 716e0f67af962f08220b7e97d229b293077ef41f 1047044
 4 | https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc 506c7a1a5e0dd551ceec2f84070fa1a8c2bc4b41 6222
 5 | https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt 4265196c15cf75620b0b592b8b921f543bda7e6c 441275
 6 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/glue/qqp.zip d775bd543ee78e3f64892a43ada949daf93e003d 41696084
 7 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/glue/sts.zip cc66d8533052de6d7475ac56dfce300751e070a4 802872
 8 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/glue/mnli.zip c22c684daa5cc9fad949d09d10ecedf94a2ce053 312783507
 9 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/glue/snli.zip c60db4cc8820749e6af9f713f4d55109dd46e8c1 129820157
10 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/glue/qnli.zip 6700cb1d2536bf512314b01350f9ac382439218e 10627589
11 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/glue/rte.zip 2eb8630df898b7d8df14ca9130c1ac1cf79eb376 697150
12 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/glue/wnli.zip fc9834b5a8af4e1d8412e48bc38b477510a8c2d0 28999
13 | https://storage.googleapis.com/mtl-sentence-representations.appspot.com/tsvsWithoutLabels%2FAX.tsv?GoogleAccessId=firebase-adminsdk-0khhl@mtl-sentence-representations.iam.gserviceaccount.com&Expires=2498860800&Signature=DuQ2CSPt2Yfre0C%2BiISrVYrIFaZH1Lc7hBVZDD4ZyR7fZYOMNOUGpi8QxBmTNOrNPjR3z1cggo7WXFfrgECP6FBJSsURv8Ybrue8Ypt%2FTPxbuJ0Xc2FhDi%2BarnecCBFO77RSbfuz%2Bs95hRrYhTnByqu3U%2FYZPaj3tZt5QdfpH2IUROY8LiBXoXS46LE%2FgOQc%2FKN%2BA9SoscRDYsnxHfG0IjXGwHN%2Bf88q6hOmAxeNPx6moDulUF6XMUAaXCSFU%2BnRO2RDL9CapWxj%2BDl7syNyHhB7987hZ80B%2FwFkQ3MEs8auvt5XW1%2Bd4aCU7ytgM69r8JDCwibfhZxpaa4gd50QXQ%3D%3D c137a2020ab489011dc38fde9ee429f4e2c71257 222257
14 | https://www.dropbox.com/s/ju7d95ifb072q9f/diagnostic-full.tsv?dl=1 2f46c4b80fea8d3ea52a28e05467af3332fa65d9 265530
15 | 


--------------------------------------------------------------------------------
/scripts/datasets/url_checksums/gutenberg.txt:
--------------------------------------------------------------------------------
1 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/pretrain_corpus/Gutenberg.zip 91e842dc3671ed5a917b7ff6a60f5f87397780e2 461506225
2 | 


--------------------------------------------------------------------------------
/scripts/datasets/url_checksums/hotpotqa.txt:
--------------------------------------------------------------------------------
1 | http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_train_v1.1.json 08c42431c22984f362e94de0e635c7b858c3cff0 566426227
2 | http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_distractor_v1.json 825b6cfc34a61db41e82bbb14d978d5a834925f8 46320117
3 | http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_fullwiki_v1.json 96a41025612e8cb15989251102dc05efe9647eda 47454698
4 | http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_test_fullwiki_v1.json b30e4ff0d8b7bd808240e5609410f9c36279ef36 46213747
5 | 


--------------------------------------------------------------------------------
/scripts/datasets/url_checksums/language_model.txt:
--------------------------------------------------------------------------------
1 | https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip 3c914d17d80b1459be871a5039ac23e752a53cbe 4475746
2 | https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip 0aec09a7537b58d4bb65362fee27650eeaba625a 190229076
3 | http://mattmahoney.net/dc/enwik8.zip d856b1ccd937c51aeb9c342e47666fb8c38e7e72 36445475
4 | http://mattmahoney.net/dc/text8.zip 6c70299b93b7e1f927b42cd8f6ac1a31547c7a2e 31344016
5 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/language_modeling/1-billion-word-language-modeling-benchmark-r13output.tar.gz 4df859766482e12264a5a9d9fb7f0e276020447d 1792209805
6 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/language_modeling/1b_word_vocab.txt aa2322a3da82ef628011336c9b5c6059e4f56c3f 9507106
7 | 


--------------------------------------------------------------------------------
/scripts/datasets/url_checksums/music_midi.txt:
--------------------------------------------------------------------------------
1 | http://hog.ee.columbia.edu/craffel/lmd/lmd_full.tar.gz 330b3c67f24f9280f81e1f7ab12749087dd83f08 1768163879
2 | http://hog.ee.columbia.edu/craffel/lmd/lmd_matched.tar.gz 218b7c82ecb230a6679053e48e87714f0bd4836f 1407072670
3 | http://hog.ee.columbia.edu/craffel/lmd/lmd_aligned.tar.gz 9873e84dd5a531ba3623e0a24ce33a81681cba80 272169548
4 | http://hog.ee.columbia.edu/craffel/lmd/clean_midi.tar.gz ae47e29dfc18d7779d95697a6461d759504c7a1c 234283029
5 | https://storage.googleapis.com/magentadata/datasets/maestro/v1.0.0/maestro-v1.0.0-midi.zip e189d8a0b6769f3be576a036da840adafe489327 46579421
6 | https://storage.googleapis.com/magentadata/datasets/maestro/v2.0.0/maestro-v2.0.0-midi.zip 13808bf9503c72371d38e9705e93ce8623b21c01 59243107
7 | https://archive.org/download/archiveteam-geocities-midi-collection-2009/2009.GeoCities.MIDI.ArchiveTeam.zip 493880759c648dd96167a2f4d394421e6fa33874 437506993
8 | 


--------------------------------------------------------------------------------
/scripts/datasets/url_checksums/naturalquestions.txt:
--------------------------------------------------------------------------------
1 | s3://gluonnlp-numpy-data/NaturalQuestions/v1.0-simplified_simplified-nq-train.jsonl.gz 9ae896ea4b29370fe157aea61a088ffdc0fbda8f 4715820286
2 | s3://gluonnlp-numpy-data/NaturalQuestions/nq-dev-all.jsonl.gz b4cc081a2d065f84d630a1338dead7faad77eeff 1068038975
3 | 


--------------------------------------------------------------------------------
/scripts/datasets/url_checksums/searchqa.txt:
--------------------------------------------------------------------------------
1 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/question_answering/searchqa/train.txt c7e1eb8c34d0525547b91e18b3f8f4d855e35c16 1226681217
2 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/question_answering/searchqa/test.txt 08a928e0f8c129d5b3ca43bf46df117e38be0c27 332064988
3 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/question_answering/searchqa/val.txt c2f65d6b83c26188d5998ab96bc6a38c1a127fcc 170835902
4 | 


--------------------------------------------------------------------------------
/scripts/datasets/url_checksums/squad.txt:
--------------------------------------------------------------------------------
1 | https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json 1faea1252438a64f9718412a55036b786cfcc636 30288272
2 | https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json e1621aae0683b346ee9743bd5609266ba0cc34fc 4854279
3 | https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json ceb2acdea93b9d82ab1829c7b1e03bee9e302c99 42123633
4 | https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json 53ebaeb15bc5cab36645150f6f65d074348e2f3d 4370528
5 | 


--------------------------------------------------------------------------------
/scripts/datasets/url_checksums/superglue.txt:
--------------------------------------------------------------------------------
 1 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/superglue/cb.zip c16fa0a46f0f888d59767851c44d8db397896fe5 75482
 2 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/superglue/copa.zip ef110b215d7ff95a2fd2d0133f0959d324e9eec3 43986
 3 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/superglue/multirc.zip 05bfcb1da7ea06742266f24503342fc20b2ab88a 1116225
 4 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/superglue/rte.zip 66105efeccc3fc54f9c5539de4c2d393d5bb4d36 750920
 5 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/superglue/wic.zip 5b95487a3690abc718bc173ccd35bf084c43b10a 396213
 6 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/superglue/wsc.zip 829ec3dd532284281cc19bacf9cded6c11d3452d 32751
 7 | https://dl.fbaipublicfiles.com/glue/superglue/data/v2/AX-b.zip 8c8874dcace4942dd00cf9f76c2537ea0e2026eb 33950
 8 | https://dl.fbaipublicfiles.com/glue/superglue/data/v2/AX-g.zip 949909079262bc4f6fb66bd889707aa71218975f 10413
 9 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/superglue/boolq.zip 90bf152c8012869d326260709404ce5111a76b46 4118001
10 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/glue_superglue/superglue/record.zip af2825be511efa8fbc7813756e768efffb8fcc11 51757880
11 | 


--------------------------------------------------------------------------------
/scripts/datasets/url_checksums/text_classification.txt:
--------------------------------------------------------------------------------
1 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/ag_news_csv.tar.gz 00b73919ec0527118ca35d819029985c33ca4005 11784327
2 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/imdb.tar.gz af11c368141a0cec4d49563000a2a54f9afdc38d 35673480
3 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/dbpedia_csv.tar.gz f39ead1841501739a34a5bbb22d405677e3165f7 68341698
4 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/yelp_review_polarity_csv.tar.gz dd08ed616d28c633b1ff7a5e12d900426e5db779 166373322
5 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/yelp_review_full_csv.tar.gz d0a1011a88be15254054e94144c83e92a048e318 196146693
6 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/amazon_review_polarity_csv.tar.gz 9689538a9ee0630340da8aa456a0888cc6733919 688340758
7 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/text_classification/amazon_review_full_csv.tar.gz e85b2d264aa8d8d3cc4dbe08adba88c0db92ff5b 643695117
8 | 


--------------------------------------------------------------------------------
/scripts/datasets/url_checksums/triviaqa.txt:
--------------------------------------------------------------------------------
1 | https://nlp.cs.washington.edu/triviaqa/data/triviaqa-rc.tar.gz aa7d8c01d4a5e563caaeb648e8c1f506e353ebd6 2665779500
2 | https://nlp.cs.washington.edu/triviaqa/data/triviaqa-unfiltered.tar.gz 670ba904b286865e25bb67ebd31c25e7c74c18ae 632549060
3 | 


--------------------------------------------------------------------------------
/scripts/datasets/url_checksums/wikipedia.txt:
--------------------------------------------------------------------------------
1 | https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/pretrain_corpus/wikipedia-en-20200620.tar.gz    1e1d77c31622744aaa45ff5bfbfca397154d9186    5068070627
2 | 


--------------------------------------------------------------------------------
/scripts/index.rst:
--------------------------------------------------------------------------------
 1 | Examples
 2 | ========
 3 | 
 4 | .. container:: cards
 5 | 
 6 |    .. card::
 7 |       :title: Benchmarking the Performance of NLP Backbones
 8 |       :link: benchmarks/index.html
 9 | 
10 |       NLP Benchmark.
11 |    
12 |    .. card::
13 |       :title: Classification Scripts
14 |       :link: classification/index.html
15 | 
16 |       NLP Classification example.
17 |    
18 |    .. card::
19 |       :title: Conversion Scripts
20 |       :link: conversion_toolkits/index.html
21 | 
22 |       Converting NLP models from other frameworks to GluonNLP.
23 | 
24 |    .. card::
25 |       :title: Datasets
26 |       :link: datasets/index.html
27 | 
28 |       Datasets in GluonNLP.
29 | 
30 |    .. card::
31 |       :title: Generation
32 |       :link: generation/index.html
33 | 
34 |       Sequence generation with GPT-2
35 | 
36 |    .. card::
37 |       :title: Machine Translation
38 |       :link: machine_translation/index.html
39 | 
40 |       Machine Translation examples.
41 | 
42 |    .. card::
43 |       :title: Data Preprocessing Toolkit in GluonNLP
44 |       :link: processing/index.html
45 | 
46 |       Data preprocessing examples.
47 | 
48 |    .. card::
49 |       :title: Pretraining Model
50 |       :link: pretraining/index.html
51 | 
52 |       Pretraining examples.
53 | 
54 |    .. card::
55 |       :title: Question Answering Examples
56 |       :link: question_answering/index.html
57 | 
58 |       Question Answering Example.
59 | 
60 | .. toctree::
61 |    :hidden:
62 |    :maxdepth: 1
63 | 
64 | 
65 |    benchmarks/index
66 |    conversion_toolkits/index
67 |    datasets/index
68 |    classification/index
69 |    generation/index
70 |    machine_translation/index
71 |    pretraining/index
72 |    processing/index
73 |    question_answering/index
74 |    
75 | 


--------------------------------------------------------------------------------
/scripts/machine_translation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/scripts/machine_translation/__init__.py


--------------------------------------------------------------------------------
/scripts/machine_translation/evaluate_epochs_wmt2014_ende.sh:
--------------------------------------------------------------------------------
 1 | SAVE_DIR=$1
 2 | SUBWORD_ALGO=${2:-yttm}
 3 | EPOCH_BEGIN=${3:-30}
 4 | EPOCH_END=${4:-60}
 5 | STOCHASTIC=${5:-0}
 6 | LP_ALPHA=${6:-0.6}
 7 | LP_K=${7:-5}
 8 | BEAM_SIZE=${8:-4}
 9 | 
10 | 
11 | for epoch in $( seq ${EPOCH_BEGIN} ${EPOCH_END})
12 | do
13 |     for fold in dev test
14 |     do
15 |         python3 evaluate_transformer.py \
16 |           --param_path ${SAVE_DIR}/epoch${epoch}.params \
17 |           --src_lang en \
18 |           --tgt_lang de \
19 |           --cfg ${SAVE_DIR}/config.yml \
20 |           --src_tokenizer ${SUBWORD_ALGO} \
21 |           --tgt_tokenizer ${SUBWORD_ALGO} \
22 |           --src_subword_model_path wmt2014_ende/${SUBWORD_ALGO}.model \
23 |           --tgt_subword_model_path wmt2014_ende/${SUBWORD_ALGO}.model \
24 |           --src_vocab_path wmt2014_ende/${SUBWORD_ALGO}.vocab \
25 |           --tgt_vocab_path wmt2014_ende/${SUBWORD_ALGO}.vocab \
26 |           --src_corpus wmt2014_ende/${fold}.raw.en \
27 |           --tgt_corpus wmt2014_ende/${fold}.raw.de \
28 |           --lp_alpha ${LP_ALPHA} \
29 |           --lp_k ${LP_K} \
30 |           --beam-size ${BEAM_SIZE} \
31 |           --save_dir ${SAVE_DIR}/epoch${epoch}_evaluation_${fold}_alpha${LP_ALPHA}_K${LP_K}_beam${BEAM_SIZE} \
32 |           --fp16
33 |     done
34 | done
35 | 


--------------------------------------------------------------------------------
/scripts/machine_translation/transformer_enc12_dec1.yml:
--------------------------------------------------------------------------------
1 | MODEL:
2 |   dropout: 0.2
3 |   DECODER:
4 |     pre_norm: false
5 |     num_layers: 1
6 |   ENCODER:
7 |     pre_norm: false
8 |     num_layers: 12
9 | 


--------------------------------------------------------------------------------
/scripts/pretraining/bert/covert_bookcorpus_format.py:
--------------------------------------------------------------------------------
 1 | import glob
 2 | import os
 3 | 
 4 | class BookscorpusTextFormatting:
 5 |     def __init__(self, books_path, output_filename, recursive = False, interval = 500):
 6 |         self.books_path = books_path
 7 |         self.recursive = recursive
 8 |         self.output_filename = output_filename.split('.')
 9 |         self.interval = interval
10 | 
11 |     # This puts one book per line
12 | 
13 |     def merge(self):
14 |         count = 0
15 |         for filename in glob.glob(self.books_path + '/' + '*.txt', recursive=True):
16 |             if count == 0:
17 |                 ofile_name = '.'.join([self.output_filename[0]+'-'+str(count//500), self.output_filename[1]])
18 |                 ofile = open(ofile_name, mode='w', encoding='utf-8-sig', newline='\n')
19 |             elif count%self.interval == 0:
20 |                 print(count)
21 |                 ofile.close()
22 |                 ofile_name = '.'.join([self.output_filename[0]+'-'+str(count//500), self.output_filename[1]])
23 |                 ofile = open(ofile_name, mode='w', encoding='utf-8-sig', newline='\n')
24 |             file = open(filename, mode='r', encoding='utf-8-sig', newline='\n')
25 |             for line in file:
26 |                 if line.strip() != '':
27 |                     ofile.write(line.strip() + ' ')
28 |             ofile.write("\n\n")
29 |             count += 1
30 |         ofile.close()
31 | 
32 | data_dir = 'BookCorpus/books1/epubtxt/'
33 | output_name_format = 'BookCorpus/after_prepare/bookcorpus.txt'
34 | 
35 | FormatTool = BookscorpusTextFormatting(data_dir, output_name_format)
36 | FormatTool.merge()
37 | 
38 | 
39 | 


--------------------------------------------------------------------------------
/scripts/pretraining/convert_electra_pretrain_backbone.py:
--------------------------------------------------------------------------------
 1 | """Convert pre-trained model parameters from ElectraForPretrain to ElectraModel"""
 2 | 
 3 | import os
 4 | import argparse
 5 | import mxnet as mx
 6 | 
 7 | from pretraining_utils import get_electra_pretraining_model
 8 | 
 9 | 
10 | def parse_args():
11 |     parser = argparse.ArgumentParser(description=__doc__)
12 |     group = parser.add_mutually_exclusive_group(required=True)
13 |     group.add_argument('--model-name', type=str, default='google_electra_small',
14 |                        help='Name of the pretrained model.')
15 |     parser.add_argument('--params-file', type=str, required=True,
16 |                         help='Path to the pretrained parameter file.')
17 |     parser.add_argument('--out-file', type=str, default=None,
18 |                         help='Output file path.')
19 |     parser.add_argument('--generator_units_scale', type=float, default=None,
20 |                         help='The scale size of the generator units, same as used in pretraining.')
21 |     parser.add_argument('--generator_layers_scale', type=float, default=None,
22 |                         help='The scale size of the generator layer, same as used in pretraining.')
23 | 
24 |     args = parser.parse_args()
25 |     return args
26 | 
27 | 
28 | def convert_params(model_name, generator_units_scale, generator_layers_scale,
29 |                    params_path, out_path):
30 |     _, _, pretrain_model = get_electra_pretraining_model(model_name, [mx.cpu()],
31 |                                                          generator_units_scale=generator_units_scale,
32 |                                                          generator_layers_scale=generator_layers_scale,
33 |                                                          params_path=params_path)
34 |     backbone_model = pretrain_model.disc_backbone
35 |     backbone_model.save_parameters(out_path)
36 | 
37 | 
38 | if __name__ == '__main__':
39 |     args = parse_args()
40 |     out_path = args.out_file
41 |     if not out_path:
42 |         params_file = args.params_file
43 |         file_name_sep = os.path.basename(params_file).split(os.path.extsep)
44 |         file_name_sep.insert(-1, 'backbone')
45 |         out_path = os.path.join(
46 |             os.path.dirname(params_file),
47 |             os.path.extsep.join(file_name_sep))
48 |     convert_params(args.model_name, args.generator_units_scale, args.generator_layers_scale,
49 |                    args.params_file, out_path)
50 | 


--------------------------------------------------------------------------------
/scripts/pretraining/torch/bert/README.md:
--------------------------------------------------------------------------------
 1 | NOTE: GluonNLP uses `/dev/shm/gluonnlp` shared memory filesystem to share
 2 | datasets among multi-process workloads. At this time, `/dev/shm/gluonnlp` is not
 3 | cleaned up automatically after the workload completes and manual deletion is
 4 | needed to free up memory. Sometimes you may not want to delete
 5 | `/dev/shm/gluonnlp` after running a workload, as you intend to run a workload
 6 | based on same dataset later and it's useful to keep the dataset in shared
 7 | memory.
 8 | 
 9 | # BERT
10 | 
11 | -1. p4 instance preparation
12 | 
13 | ```bash
14 | sudo mkfs.btrfs /dev/nvme1n1 /dev/nvme2n1 /dev/nvme3n1 /dev/nvme4n1 /dev/nvme5n1 /dev/nvme6n1 /dev/nvme7n1 /dev/nvme8n1
15 | sudo mount /dev/nvme1n1 /mnt
16 | sudo chown ubuntu:ubuntu /mnt/
17 | ```
18 | 
19 | 1. Get the dataset
20 | 
21 | ```bash
22 | nlp_data prepare_bookcorpus --segment_sentences --segment_num_worker 16
23 | nlp_data prepare_wikipedia --mode download_prepared --segment_sentences --segment_num_worker 16
24 | find wikicorpus/one_sentence_per_line BookCorpus/one_sentence_per_line -type f > input_reference
25 | ```
26 | 
27 | 2. Prepare batches
28 | 
29 | ```bash
30 | python3 prepare_quickthought.py \
31 |     --input-reference input_reference
32 |     --output /mnt/out_quickthought_128 \
33 |     --model-name google_en_cased_bert_base \
34 |     --max-seq-length 128
35 | ```
36 | 
37 | 
38 | 1. Phase 1 training with sequence length 128
39 | 
40 | ```bash
41 | python3 -m torch.distributed.launch --nproc_per_node=8 run_pretraining.py \
42 |   --model_name google_en_cased_bert_base \
43 |   --lr 0.005 \
44 |   --batch_size 128 \
45 |   --num_accumulated 96 \
46 |   --num_dataloader_workers 4 \
47 |   --num_steps 3870 \
48 |   --input-files /mnt/out_quickthought_128/*feather \
49 |   --mmap-folder /mnt/gluonnlp_mmap \
50 |   --ckpt_dir /mnt/ckpt_dir \
51 |   --ckpt_interval 1000 2>&1| tee train.log;
52 | ```
53 | 
54 | 3. Phase 2 training with sequence length 512
55 | 
56 | TBD
57 | 
58 | Finally we obtain a folder of structure as followed,
59 | 
60 | ```
61 | coder_base
62 | ├── vocab-{short_hash}.json
63 | ├── model-{short_hash}.params
64 | ├── model-{short_hash}.yml
65 | ```
66 | 


--------------------------------------------------------------------------------
/scripts/processing/README.md:
--------------------------------------------------------------------------------
 1 | # Data Processing Toolkit in GluonNLP
 2 | We provide a bunch of data 
 3 | 
 4 | ## Clean and Tokenize a Parallel Corpus
 5 | 
 6 | To clean and tokenize a parallel corpus, use
 7 | ```
 8 | nlp_process clean_tok_para_corpus --help
 9 | ```
10 | 
11 | ## Learn a subword model
12 | 
13 | To learn a subword tokenizer, use
14 | ```
15 | nlp_process learn_subword --help
16 | ```
17 | 
18 | 
19 | ## Apply the learned subword model
20 | To apply the learned subword tokenizer, user
21 | ```
22 | nlp_process apply_subword --help
23 | ```
24 | 


--------------------------------------------------------------------------------
/scripts/processing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/scripts/processing/__init__.py


--------------------------------------------------------------------------------
/scripts/processing/__main__.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import textwrap
 3 | 
 4 | from . import (
 5 |     clean_tok_corpus,
 6 |     learn_subword,
 7 |     apply_subword
 8 | )
 9 | 
10 | 
11 | SUBCOMMANDS = ['clean_tok_para_corpus', 'clean_tok_mono_corpus',
12 |                'learn_subword', 'apply_subword', 'help']
13 | 
14 | 
15 | def cli_main():
16 |     parser = argparse.ArgumentParser(
17 |         description='Sharable data preprocessing utilities in GluonNLP.',
18 |         prog='nlp_process', add_help=False)
19 |     parser.add_argument('command', type=str,
20 |                         choices=SUBCOMMANDS,
21 |                         metavar='[subcommand]',
22 |                         help='The subcommand to use. '
23 |                              'Choices are {}.'.format(SUBCOMMANDS))
24 |     args, other_args = parser.parse_known_args()
25 |     if args.command == 'clean_tok_para_corpus':
26 |         parser = clean_tok_corpus.get_parser.para()
27 |         sub_args = parser.parse_args(other_args)
28 |         clean_tok_corpus.main_para(sub_args)
29 |     elif args.command == 'clean_tok_mono_corpus':
30 |         parser = clean_tok_corpus.get_parser.mono()
31 |         sub_args = parser.parse_args(other_args)
32 |         clean_tok_corpus.main_mono(sub_args)
33 |     elif args.command == 'learn_subword':
34 |         parser = learn_subword.get_parser()
35 |         sub_args = parser.parse_args(other_args)
36 |         learn_subword.main(sub_args)
37 |     elif args.command == 'apply_subword':
38 |         parser = apply_subword.get_parser()
39 |         sub_args = parser.parse_args(other_args)
40 |         apply_subword.main(sub_args)
41 |     elif args.command == 'help':
42 |         parser.print_help()
43 |     else:
44 |         parser.print_help()
45 | 
46 | 
47 | if __name__ == '__main__':
48 |     cli_main()
49 | 


--------------------------------------------------------------------------------
/scripts/question_answering/albert_custom.yaml:
--------------------------------------------------------------------------------
 1 | version: 1.0
 2 | 
 3 | model:
 4 |   name: albert_base_v2
 5 |   framework: mxnet
 6 | 
 7 | tuning:
 8 |   strategy:
 9 |     name: mycustom
10 |   accuracy_criterion:
11 |     relative:  0.02
12 |   exit_policy:
13 |     timeout: 0
14 |     max_trials: 1000
15 |   random_seed: 9527
16 | 


--------------------------------------------------------------------------------
/scripts/question_answering/commands/README.md:
--------------------------------------------------------------------------------
1 | # Commands For Training on SQuAD
2 | 
3 | All commands are generated by parsing the template in [run_squad.template](run_squad.template). 
4 | To generate all commands, use the following code.
5 | 
6 | ```bash
7 | python3 generate_commands.py
8 | ```
9 | 


--------------------------------------------------------------------------------
/scripts/question_answering/commands/run_squad.template:
--------------------------------------------------------------------------------
 1 | # Generated by "generate_commands.py"
 2 | 
 3 | USE_HOROVOD=${1:-0}  # Horovod flag. 0 --> not use horovod, 1 --> use horovod
 4 | VERSION=${2:-2.0}   # SQuAD Version
 5 | DTYPE=${3:-{{ dtype }}}   # Default training data type
 6 | MODEL_NAME={{ model_name }}
 7 | BATCH_SIZE={{ batch_size }}
 8 | NUM_ACCUMULATED={{ num_accumulated }}
 9 | EPOCHS={{ epochs }}
10 | LR={{ lr }}
11 | WARMUP_RATIO={{ warmup_ratio }}
12 | WD={{ wd }}
13 | MAX_SEQ_LENGTH={{ max_seq_length }}
14 | MAX_GRAD_NORM={{ max_grad_norm }}
15 | LAYERWISE_DECAY={{ layerwise_decay }}
16 | 
17 | # Prepare the Data
18 | nlp_data prepare_squad --version ${VERSION}
19 | 
20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py
21 | 
22 | # Run the script
23 | if [ ${USE_HOROVOD} -eq 0 ];
24 | then
25 |   RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3"
26 | else
27 |   RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod"
28 | fi
29 | ${RUN_COMMAND} \
30 |     --model_name ${MODEL_NAME} \
31 |     --data_dir squad \
32 |     --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \
33 |     --version ${VERSION} \
34 |     --do_eval \
35 |     --do_train \
36 |     --batch_size ${BATCH_SIZE} \
37 |     --num_accumulated ${NUM_ACCUMULATED} \
38 |     --layerwise_decay ${LAYERWISE_DECAY} \
39 |     --epochs ${EPOCHS} \
40 |     --lr ${LR} \
41 |     --warmup_ratio ${WARMUP_RATIO} \
42 |     --wd ${WD} \
43 |     --max_seq_length ${MAX_SEQ_LENGTH} \
44 |     --max_grad_norm ${MAX_GRAD_NORM} \
45 |     --dtype ${DTYPE} \
46 |     --overwrite_cache
47 | 


--------------------------------------------------------------------------------
/scripts/question_answering/commands/run_squad2_albert_base.sh:
--------------------------------------------------------------------------------
 1 | # Generated by "generate_commands.py"
 2 | 
 3 | USE_HOROVOD=${1:-0}  # Horovod flag. 0 --> not use horovod, 1 --> use horovod
 4 | VERSION=${2:-2.0}   # SQuAD Version
 5 | DTYPE=${3:-float32}   # Default training data type
 6 | MODEL_NAME=google_albert_base_v2
 7 | BATCH_SIZE=4
 8 | NUM_ACCUMULATED=3
 9 | EPOCHS=3
10 | LR=2e-05
11 | WARMUP_RATIO=0.1
12 | WD=0.01
13 | MAX_SEQ_LENGTH=512
14 | MAX_GRAD_NORM=1.0
15 | LAYERWISE_DECAY=-1
16 | 
17 | # Prepare the Data
18 | nlp_data prepare_squad --version ${VERSION}
19 | 
20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py
21 | 
22 | # Run the script
23 | if [ ${USE_HOROVOD} -eq 0 ];
24 | then
25 |   RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3"
26 | else
27 |   RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod"
28 | fi
29 | ${RUN_COMMAND} \
30 |     --model_name ${MODEL_NAME} \
31 |     --data_dir squad \
32 |     --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \
33 |     --version ${VERSION} \
34 |     --do_eval \
35 |     --do_train \
36 |     --batch_size ${BATCH_SIZE} \
37 |     --num_accumulated ${NUM_ACCUMULATED} \
38 |     --layerwise_decay ${LAYERWISE_DECAY} \
39 |     --epochs ${EPOCHS} \
40 |     --lr ${LR} \
41 |     --warmup_ratio ${WARMUP_RATIO} \
42 |     --wd ${WD} \
43 |     --max_seq_length ${MAX_SEQ_LENGTH} \
44 |     --max_grad_norm ${MAX_GRAD_NORM} \
45 |     --dtype ${DTYPE} \
46 |     --overwrite_cache
47 | 


--------------------------------------------------------------------------------
/scripts/question_answering/commands/run_squad2_albert_large.sh:
--------------------------------------------------------------------------------
 1 | # Generated by "generate_commands.py"
 2 | 
 3 | USE_HOROVOD=${1:-0}  # Horovod flag. 0 --> not use horovod, 1 --> use horovod
 4 | VERSION=${2:-2.0}   # SQuAD Version
 5 | DTYPE=${3:-float32}   # Default training data type
 6 | MODEL_NAME=google_albert_large_v2
 7 | BATCH_SIZE=3
 8 | NUM_ACCUMULATED=4
 9 | EPOCHS=3
10 | LR=2e-05
11 | WARMUP_RATIO=0.1
12 | WD=0.01
13 | MAX_SEQ_LENGTH=512
14 | MAX_GRAD_NORM=1.0
15 | LAYERWISE_DECAY=-1
16 | 
17 | # Prepare the Data
18 | nlp_data prepare_squad --version ${VERSION}
19 | 
20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py
21 | 
22 | # Run the script
23 | if [ ${USE_HOROVOD} -eq 0 ];
24 | then
25 |   RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3"
26 | else
27 |   RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod"
28 | fi
29 | ${RUN_COMMAND} \
30 |     --model_name ${MODEL_NAME} \
31 |     --data_dir squad \
32 |     --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \
33 |     --version ${VERSION} \
34 |     --do_eval \
35 |     --do_train \
36 |     --batch_size ${BATCH_SIZE} \
37 |     --num_accumulated ${NUM_ACCUMULATED} \
38 |     --layerwise_decay ${LAYERWISE_DECAY} \
39 |     --epochs ${EPOCHS} \
40 |     --lr ${LR} \
41 |     --warmup_ratio ${WARMUP_RATIO} \
42 |     --wd ${WD} \
43 |     --max_seq_length ${MAX_SEQ_LENGTH} \
44 |     --max_grad_norm ${MAX_GRAD_NORM} \
45 |     --dtype ${DTYPE} \
46 |     --overwrite_cache
47 | 


--------------------------------------------------------------------------------
/scripts/question_answering/commands/run_squad2_albert_xlarge.sh:
--------------------------------------------------------------------------------
 1 | # Generated by "generate_commands.py"
 2 | 
 3 | USE_HOROVOD=${1:-0}  # Horovod flag. 0 --> not use horovod, 1 --> use horovod
 4 | VERSION=${2:-2.0}   # SQuAD Version
 5 | DTYPE=${3:-float32}   # Default training data type
 6 | MODEL_NAME=google_albert_xlarge_v2
 7 | BATCH_SIZE=1
 8 | NUM_ACCUMULATED=12
 9 | EPOCHS=3
10 | LR=2e-05
11 | WARMUP_RATIO=0.1
12 | WD=0.01
13 | MAX_SEQ_LENGTH=512
14 | MAX_GRAD_NORM=0.1
15 | LAYERWISE_DECAY=-1
16 | 
17 | # Prepare the Data
18 | nlp_data prepare_squad --version ${VERSION}
19 | 
20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py
21 | 
22 | # Run the script
23 | if [ ${USE_HOROVOD} -eq 0 ];
24 | then
25 |   RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3"
26 | else
27 |   RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod"
28 | fi
29 | ${RUN_COMMAND} \
30 |     --model_name ${MODEL_NAME} \
31 |     --data_dir squad \
32 |     --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \
33 |     --version ${VERSION} \
34 |     --do_eval \
35 |     --do_train \
36 |     --batch_size ${BATCH_SIZE} \
37 |     --num_accumulated ${NUM_ACCUMULATED} \
38 |     --layerwise_decay ${LAYERWISE_DECAY} \
39 |     --epochs ${EPOCHS} \
40 |     --lr ${LR} \
41 |     --warmup_ratio ${WARMUP_RATIO} \
42 |     --wd ${WD} \
43 |     --max_seq_length ${MAX_SEQ_LENGTH} \
44 |     --max_grad_norm ${MAX_GRAD_NORM} \
45 |     --dtype ${DTYPE} \
46 |     --overwrite_cache
47 | 


--------------------------------------------------------------------------------
/scripts/question_answering/commands/run_squad2_albert_xxlarge.sh:
--------------------------------------------------------------------------------
 1 | # Generated by "generate_commands.py"
 2 | 
 3 | USE_HOROVOD=${1:-0}  # Horovod flag. 0 --> not use horovod, 1 --> use horovod
 4 | VERSION=${2:-2.0}   # SQuAD Version
 5 | DTYPE=${3:-float32}   # Default training data type
 6 | MODEL_NAME=google_albert_xxlarge_v2
 7 | BATCH_SIZE=1
 8 | NUM_ACCUMULATED=12
 9 | EPOCHS=3
10 | LR=2e-05
11 | WARMUP_RATIO=0.1
12 | WD=0.01
13 | MAX_SEQ_LENGTH=512
14 | MAX_GRAD_NORM=0.1
15 | LAYERWISE_DECAY=-1
16 | 
17 | # Prepare the Data
18 | nlp_data prepare_squad --version ${VERSION}
19 | 
20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py
21 | 
22 | # Run the script
23 | if [ ${USE_HOROVOD} -eq 0 ];
24 | then
25 |   RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3"
26 | else
27 |   RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod"
28 | fi
29 | ${RUN_COMMAND} \
30 |     --model_name ${MODEL_NAME} \
31 |     --data_dir squad \
32 |     --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \
33 |     --version ${VERSION} \
34 |     --do_eval \
35 |     --do_train \
36 |     --batch_size ${BATCH_SIZE} \
37 |     --num_accumulated ${NUM_ACCUMULATED} \
38 |     --layerwise_decay ${LAYERWISE_DECAY} \
39 |     --epochs ${EPOCHS} \
40 |     --lr ${LR} \
41 |     --warmup_ratio ${WARMUP_RATIO} \
42 |     --wd ${WD} \
43 |     --max_seq_length ${MAX_SEQ_LENGTH} \
44 |     --max_grad_norm ${MAX_GRAD_NORM} \
45 |     --dtype ${DTYPE} \
46 |     --overwrite_cache
47 | 


--------------------------------------------------------------------------------
/scripts/question_answering/commands/run_squad2_electra_base.sh:
--------------------------------------------------------------------------------
 1 | # Generated by "generate_commands.py"
 2 | 
 3 | USE_HOROVOD=${1:-0}  # Horovod flag. 0 --> not use horovod, 1 --> use horovod
 4 | VERSION=${2:-2.0}   # SQuAD Version
 5 | DTYPE=${3:-float32}   # Default training data type
 6 | MODEL_NAME=google_electra_base
 7 | BATCH_SIZE=8
 8 | NUM_ACCUMULATED=1
 9 | EPOCHS=2
10 | LR=0.0001
11 | WARMUP_RATIO=0.1
12 | WD=0
13 | MAX_SEQ_LENGTH=512
14 | MAX_GRAD_NORM=1.0
15 | LAYERWISE_DECAY=0.8
16 | 
17 | # Prepare the Data
18 | nlp_data prepare_squad --version ${VERSION}
19 | 
20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py
21 | 
22 | # Run the script
23 | if [ ${USE_HOROVOD} -eq 0 ];
24 | then
25 |   RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3"
26 | else
27 |   RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod"
28 | fi
29 | ${RUN_COMMAND} \
30 |     --model_name ${MODEL_NAME} \
31 |     --data_dir squad \
32 |     --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \
33 |     --version ${VERSION} \
34 |     --do_eval \
35 |     --do_train \
36 |     --batch_size ${BATCH_SIZE} \
37 |     --num_accumulated ${NUM_ACCUMULATED} \
38 |     --layerwise_decay ${LAYERWISE_DECAY} \
39 |     --epochs ${EPOCHS} \
40 |     --lr ${LR} \
41 |     --warmup_ratio ${WARMUP_RATIO} \
42 |     --wd ${WD} \
43 |     --max_seq_length ${MAX_SEQ_LENGTH} \
44 |     --max_grad_norm ${MAX_GRAD_NORM} \
45 |     --dtype ${DTYPE} \
46 |     --overwrite_cache
47 | 


--------------------------------------------------------------------------------
/scripts/question_answering/commands/run_squad2_electra_large.sh:
--------------------------------------------------------------------------------
 1 | # Generated by "generate_commands.py"
 2 | 
 3 | USE_HOROVOD=${1:-0}  # Horovod flag. 0 --> not use horovod, 1 --> use horovod
 4 | VERSION=${2:-2.0}   # SQuAD Version
 5 | DTYPE=${3:-float32}   # Default training data type
 6 | MODEL_NAME=google_electra_large
 7 | BATCH_SIZE=2
 8 | NUM_ACCUMULATED=4
 9 | EPOCHS=2
10 | LR=5e-05
11 | WARMUP_RATIO=0.1
12 | WD=0
13 | MAX_SEQ_LENGTH=512
14 | MAX_GRAD_NORM=1
15 | LAYERWISE_DECAY=0.9
16 | 
17 | # Prepare the Data
18 | nlp_data prepare_squad --version ${VERSION}
19 | 
20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py
21 | 
22 | # Run the script
23 | if [ ${USE_HOROVOD} -eq 0 ];
24 | then
25 |   RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3"
26 | else
27 |   RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod"
28 | fi
29 | ${RUN_COMMAND} \
30 |     --model_name ${MODEL_NAME} \
31 |     --data_dir squad \
32 |     --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \
33 |     --version ${VERSION} \
34 |     --do_eval \
35 |     --do_train \
36 |     --batch_size ${BATCH_SIZE} \
37 |     --num_accumulated ${NUM_ACCUMULATED} \
38 |     --layerwise_decay ${LAYERWISE_DECAY} \
39 |     --epochs ${EPOCHS} \
40 |     --lr ${LR} \
41 |     --warmup_ratio ${WARMUP_RATIO} \
42 |     --wd ${WD} \
43 |     --max_seq_length ${MAX_SEQ_LENGTH} \
44 |     --max_grad_norm ${MAX_GRAD_NORM} \
45 |     --dtype ${DTYPE} \
46 |     --overwrite_cache
47 | 


--------------------------------------------------------------------------------
/scripts/question_answering/commands/run_squad2_electra_small.sh:
--------------------------------------------------------------------------------
 1 | # Generated by "generate_commands.py"
 2 | 
 3 | USE_HOROVOD=${1:-0}  # Horovod flag. 0 --> not use horovod, 1 --> use horovod
 4 | VERSION=${2:-2.0}   # SQuAD Version
 5 | DTYPE=${3:-float32}   # Default training data type
 6 | MODEL_NAME=google_electra_small
 7 | BATCH_SIZE=8
 8 | NUM_ACCUMULATED=1
 9 | EPOCHS=2
10 | LR=0.0003
11 | WARMUP_RATIO=0.1
12 | WD=0
13 | MAX_SEQ_LENGTH=512
14 | MAX_GRAD_NORM=1.0
15 | LAYERWISE_DECAY=0.8
16 | 
17 | # Prepare the Data
18 | nlp_data prepare_squad --version ${VERSION}
19 | 
20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py
21 | 
22 | # Run the script
23 | if [ ${USE_HOROVOD} -eq 0 ];
24 | then
25 |   RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3"
26 | else
27 |   RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod"
28 | fi
29 | ${RUN_COMMAND} \
30 |     --model_name ${MODEL_NAME} \
31 |     --data_dir squad \
32 |     --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \
33 |     --version ${VERSION} \
34 |     --do_eval \
35 |     --do_train \
36 |     --batch_size ${BATCH_SIZE} \
37 |     --num_accumulated ${NUM_ACCUMULATED} \
38 |     --layerwise_decay ${LAYERWISE_DECAY} \
39 |     --epochs ${EPOCHS} \
40 |     --lr ${LR} \
41 |     --warmup_ratio ${WARMUP_RATIO} \
42 |     --wd ${WD} \
43 |     --max_seq_length ${MAX_SEQ_LENGTH} \
44 |     --max_grad_norm ${MAX_GRAD_NORM} \
45 |     --dtype ${DTYPE} \
46 |     --overwrite_cache
47 | 


--------------------------------------------------------------------------------
/scripts/question_answering/commands/run_squad2_gluon_en_cased_bert_base_v1.sh:
--------------------------------------------------------------------------------
 1 | # Generated by "generate_commands.py"
 2 | 
 3 | USE_HOROVOD=${1:-0}  # Horovod flag. 0 --> not use horovod, 1 --> use horovod
 4 | VERSION=${2:-2.0}   # SQuAD Version
 5 | DTYPE=${3:-float32}   # Default training data type
 6 | MODEL_NAME=gluon_en_cased_bert_base_v1
 7 | BATCH_SIZE=6
 8 | NUM_ACCUMULATED=2
 9 | EPOCHS=3
10 | LR=3e-05
11 | WARMUP_RATIO=0.1
12 | WD=0.01
13 | MAX_SEQ_LENGTH=512
14 | MAX_GRAD_NORM=1.0
15 | LAYERWISE_DECAY=-1
16 | 
17 | # Prepare the Data
18 | nlp_data prepare_squad --version ${VERSION}
19 | 
20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py
21 | 
22 | # Run the script
23 | if [ ${USE_HOROVOD} -eq 0 ];
24 | then
25 |   RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3"
26 | else
27 |   RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod"
28 | fi
29 | ${RUN_COMMAND} \
30 |     --model_name ${MODEL_NAME} \
31 |     --data_dir squad \
32 |     --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \
33 |     --version ${VERSION} \
34 |     --do_eval \
35 |     --do_train \
36 |     --batch_size ${BATCH_SIZE} \
37 |     --num_accumulated ${NUM_ACCUMULATED} \
38 |     --layerwise_decay ${LAYERWISE_DECAY} \
39 |     --epochs ${EPOCHS} \
40 |     --lr ${LR} \
41 |     --warmup_ratio ${WARMUP_RATIO} \
42 |     --wd ${WD} \
43 |     --max_seq_length ${MAX_SEQ_LENGTH} \
44 |     --max_grad_norm ${MAX_GRAD_NORM} \
45 |     --dtype ${DTYPE} \
46 |     --overwrite_cache
47 | 


--------------------------------------------------------------------------------
/scripts/question_answering/commands/run_squad2_mobilebert.sh:
--------------------------------------------------------------------------------
 1 | # Generated by "generate_commands.py"
 2 | 
 3 | USE_HOROVOD=${1:-0}  # Horovod flag. 0 --> not use horovod, 1 --> use horovod
 4 | VERSION=${2:-2.0}   # SQuAD Version
 5 | DTYPE=${3:-float32}   # Default training data type
 6 | MODEL_NAME=google_uncased_mobilebert
 7 | BATCH_SIZE=8
 8 | NUM_ACCUMULATED=1
 9 | EPOCHS=5
10 | LR=4e-05
11 | WARMUP_RATIO=0.1
12 | WD=0.01
13 | MAX_SEQ_LENGTH=384
14 | MAX_GRAD_NORM=1.0
15 | LAYERWISE_DECAY=-1
16 | 
17 | # Prepare the Data
18 | nlp_data prepare_squad --version ${VERSION}
19 | 
20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py
21 | 
22 | # Run the script
23 | if [ ${USE_HOROVOD} -eq 0 ];
24 | then
25 |   RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3"
26 | else
27 |   RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod"
28 | fi
29 | ${RUN_COMMAND} \
30 |     --model_name ${MODEL_NAME} \
31 |     --data_dir squad \
32 |     --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \
33 |     --version ${VERSION} \
34 |     --do_eval \
35 |     --do_train \
36 |     --batch_size ${BATCH_SIZE} \
37 |     --num_accumulated ${NUM_ACCUMULATED} \
38 |     --layerwise_decay ${LAYERWISE_DECAY} \
39 |     --epochs ${EPOCHS} \
40 |     --lr ${LR} \
41 |     --warmup_ratio ${WARMUP_RATIO} \
42 |     --wd ${WD} \
43 |     --max_seq_length ${MAX_SEQ_LENGTH} \
44 |     --max_grad_norm ${MAX_GRAD_NORM} \
45 |     --dtype ${DTYPE} \
46 |     --overwrite_cache
47 | 


--------------------------------------------------------------------------------
/scripts/question_answering/commands/run_squad2_roberta_large.sh:
--------------------------------------------------------------------------------
 1 | # Generated by "generate_commands.py"
 2 | 
 3 | USE_HOROVOD=${1:-0}  # Horovod flag. 0 --> not use horovod, 1 --> use horovod
 4 | VERSION=${2:-2.0}   # SQuAD Version
 5 | DTYPE=${3:-float32}   # Default training data type
 6 | MODEL_NAME=fairseq_roberta_large
 7 | BATCH_SIZE=2
 8 | NUM_ACCUMULATED=6
 9 | EPOCHS=3
10 | LR=3e-05
11 | WARMUP_RATIO=0.2
12 | WD=0.01
13 | MAX_SEQ_LENGTH=512
14 | MAX_GRAD_NORM=1.0
15 | LAYERWISE_DECAY=-1
16 | 
17 | # Prepare the Data
18 | nlp_data prepare_squad --version ${VERSION}
19 | 
20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py
21 | 
22 | # Run the script
23 | if [ ${USE_HOROVOD} -eq 0 ];
24 | then
25 |   RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3"
26 | else
27 |   RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod"
28 | fi
29 | ${RUN_COMMAND} \
30 |     --model_name ${MODEL_NAME} \
31 |     --data_dir squad \
32 |     --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \
33 |     --version ${VERSION} \
34 |     --do_eval \
35 |     --do_train \
36 |     --batch_size ${BATCH_SIZE} \
37 |     --num_accumulated ${NUM_ACCUMULATED} \
38 |     --layerwise_decay ${LAYERWISE_DECAY} \
39 |     --epochs ${EPOCHS} \
40 |     --lr ${LR} \
41 |     --warmup_ratio ${WARMUP_RATIO} \
42 |     --wd ${WD} \
43 |     --max_seq_length ${MAX_SEQ_LENGTH} \
44 |     --max_grad_norm ${MAX_GRAD_NORM} \
45 |     --dtype ${DTYPE} \
46 |     --overwrite_cache
47 | 


--------------------------------------------------------------------------------
/scripts/question_answering/commands/run_squad2_uncased_bert_base.sh:
--------------------------------------------------------------------------------
 1 | # Generated by "generate_commands.py"
 2 | 
 3 | USE_HOROVOD=${1:-0}  # Horovod flag. 0 --> not use horovod, 1 --> use horovod
 4 | VERSION=${2:-2.0}   # SQuAD Version
 5 | DTYPE=${3:-float32}   # Default training data type
 6 | MODEL_NAME=google_en_uncased_bert_base
 7 | BATCH_SIZE=6
 8 | NUM_ACCUMULATED=2
 9 | EPOCHS=3
10 | LR=3e-05
11 | WARMUP_RATIO=0.1
12 | WD=0.01
13 | MAX_SEQ_LENGTH=512
14 | MAX_GRAD_NORM=1.0
15 | LAYERWISE_DECAY=-1
16 | 
17 | # Prepare the Data
18 | nlp_data prepare_squad --version ${VERSION}
19 | 
20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py
21 | 
22 | # Run the script
23 | if [ ${USE_HOROVOD} -eq 0 ];
24 | then
25 |   RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3"
26 | else
27 |   RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod"
28 | fi
29 | ${RUN_COMMAND} \
30 |     --model_name ${MODEL_NAME} \
31 |     --data_dir squad \
32 |     --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \
33 |     --version ${VERSION} \
34 |     --do_eval \
35 |     --do_train \
36 |     --batch_size ${BATCH_SIZE} \
37 |     --num_accumulated ${NUM_ACCUMULATED} \
38 |     --layerwise_decay ${LAYERWISE_DECAY} \
39 |     --epochs ${EPOCHS} \
40 |     --lr ${LR} \
41 |     --warmup_ratio ${WARMUP_RATIO} \
42 |     --wd ${WD} \
43 |     --max_seq_length ${MAX_SEQ_LENGTH} \
44 |     --max_grad_norm ${MAX_GRAD_NORM} \
45 |     --dtype ${DTYPE} \
46 |     --overwrite_cache
47 | 


--------------------------------------------------------------------------------
/scripts/question_answering/commands/run_squad2_uncased_bert_large.sh:
--------------------------------------------------------------------------------
 1 | # Generated by "generate_commands.py"
 2 | 
 3 | USE_HOROVOD=${1:-0}  # Horovod flag. 0 --> not use horovod, 1 --> use horovod
 4 | VERSION=${2:-2.0}   # SQuAD Version
 5 | DTYPE=${3:-float32}   # Default training data type
 6 | MODEL_NAME=google_en_uncased_bert_large
 7 | BATCH_SIZE=2
 8 | NUM_ACCUMULATED=6
 9 | EPOCHS=3
10 | LR=3e-05
11 | WARMUP_RATIO=0.1
12 | WD=0.01
13 | MAX_SEQ_LENGTH=512
14 | MAX_GRAD_NORM=1.0
15 | LAYERWISE_DECAY=-1
16 | 
17 | # Prepare the Data
18 | nlp_data prepare_squad --version ${VERSION}
19 | 
20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py
21 | 
22 | # Run the script
23 | if [ ${USE_HOROVOD} -eq 0 ];
24 | then
25 |   RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3"
26 | else
27 |   RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod"
28 | fi
29 | ${RUN_COMMAND} \
30 |     --model_name ${MODEL_NAME} \
31 |     --data_dir squad \
32 |     --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \
33 |     --version ${VERSION} \
34 |     --do_eval \
35 |     --do_train \
36 |     --batch_size ${BATCH_SIZE} \
37 |     --num_accumulated ${NUM_ACCUMULATED} \
38 |     --layerwise_decay ${LAYERWISE_DECAY} \
39 |     --epochs ${EPOCHS} \
40 |     --lr ${LR} \
41 |     --warmup_ratio ${WARMUP_RATIO} \
42 |     --wd ${WD} \
43 |     --max_seq_length ${MAX_SEQ_LENGTH} \
44 |     --max_grad_norm ${MAX_GRAD_NORM} \
45 |     --dtype ${DTYPE} \
46 |     --overwrite_cache
47 | 


--------------------------------------------------------------------------------
/scripts/question_answering/commands/run_squad2_uncased_bert_wwm_large.sh:
--------------------------------------------------------------------------------
 1 | # Generated by "generate_commands.py"
 2 | 
 3 | USE_HOROVOD=${1:-0}  # Horovod flag. 0 --> not use horovod, 1 --> use horovod
 4 | VERSION=${2:-2.0}   # SQuAD Version
 5 | DTYPE=${3:-float32}   # Default training data type
 6 | MODEL_NAME=google_en_uncased_bert_wwm_large
 7 | BATCH_SIZE=3
 8 | NUM_ACCUMULATED=2
 9 | EPOCHS=2
10 | LR=3e-05
11 | WARMUP_RATIO=0.1
12 | WD=0.01
13 | MAX_SEQ_LENGTH=512
14 | MAX_GRAD_NORM=1.0
15 | LAYERWISE_DECAY=-1
16 | 
17 | # Prepare the Data
18 | nlp_data prepare_squad --version ${VERSION}
19 | 
20 | RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py
21 | 
22 | # Run the script
23 | if [ ${USE_HOROVOD} -eq 0 ];
24 | then
25 |   RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3"
26 | else
27 |   RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod"
28 | fi
29 | ${RUN_COMMAND} \
30 |     --model_name ${MODEL_NAME} \
31 |     --data_dir squad \
32 |     --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \
33 |     --version ${VERSION} \
34 |     --do_eval \
35 |     --do_train \
36 |     --batch_size ${BATCH_SIZE} \
37 |     --num_accumulated ${NUM_ACCUMULATED} \
38 |     --layerwise_decay ${LAYERWISE_DECAY} \
39 |     --epochs ${EPOCHS} \
40 |     --lr ${LR} \
41 |     --warmup_ratio ${WARMUP_RATIO} \
42 |     --wd ${WD} \
43 |     --max_seq_length ${MAX_SEQ_LENGTH} \
44 |     --max_grad_norm ${MAX_GRAD_NORM} \
45 |     --dtype ${DTYPE} \
46 |     --overwrite_cache
47 | 


--------------------------------------------------------------------------------
/src/gluonnlp/__init__.py:
--------------------------------------------------------------------------------
 1 | __version__ = '1.0.0.dev'
 2 | from . import base
 3 | from . import data
 4 | from . import models
 5 | from . import utils
 6 | from . import attention_cell
 7 | from . import initializer as init
 8 | from . import layers
 9 | from . import loss
10 | from . import lr_scheduler
11 | from . import op
12 | from . import torch
13 | from . import sequence_sampler
14 | from . import embedding
15 | 


--------------------------------------------------------------------------------
/src/gluonnlp/base.py:
--------------------------------------------------------------------------------
 1 | # Licensed to the Apache Software Foundation (ASF) under one
 2 | # or more contributor license agreements.  See the NOTICE file
 3 | # distributed with this work for additional information
 4 | # regarding copyright ownership.  The ASF licenses this file
 5 | # to you under the Apache License, Version 2.0 (the
 6 | # "License"); you may not use this file except in compliance
 7 | # with the License.  You may obtain a copy of the License at
 8 | #
 9 | #   http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied.  See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | 
18 | # pylint: disable=abstract-method
19 | """Helper functions."""
20 | 
21 | import os
22 | import numpy as np
23 | 
24 | __all__ = ['get_home_dir', 'get_data_home_dir']
25 | 
26 | INT_TYPES = (int, np.int32, np.int64)
27 | FLOAT_TYPES = (float, np.float16, np.float32, np.float64)
28 | 
29 | 
30 | def get_home_dir():
31 |     """Get home directory for storing datasets/models/pre-trained word embeddings"""
32 |     _home_dir = os.environ.get('GLUONNLP_HOME', os.path.join('~', '.gluonnlp'))
33 |     # expand ~ to actual path
34 |     _home_dir = os.path.expanduser(_home_dir)
35 |     return _home_dir
36 | 
37 | 
38 | def get_data_home_dir():
39 |     """Get home directory for storing the datasets"""
40 |     home_dir = get_home_dir()
41 |     return os.path.join(home_dir, 'datasets')
42 | 
43 | 
44 | def get_model_zoo_home_dir():
45 |     """Get the local directory for storing pretrained models"""
46 |     home_dir = get_home_dir()
47 |     return os.path.join(home_dir, 'models')
48 | 
49 | 
50 | def get_model_zoo_checksum_dir():
51 |     """Get the directory that stores the checksums of the artifacts in the model zoo """
52 |     curr_dir = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
53 |     check_sum_dir = os.path.join(curr_dir, 'models', 'model_zoo_checksums')
54 |     return check_sum_dir
55 | 
56 | 
57 | def get_repo_url():
58 |     """Return the base URL for Gluon dataset and model repository """
59 |     default_repo = 's3://gluonnlp-numpy-data'
60 |     repo_url = os.environ.get('GLUONNLP_REPO_URL', default_repo)
61 |     if repo_url[-1] != '/':
62 |         repo_url = repo_url + '/'
63 |     return repo_url
64 | 
65 | 
66 | def get_repo_model_zoo_url():
67 |     """Return the base URL for GluonNLP Model Zoo"""
68 |     repo_url = get_repo_url()
69 |     model_zoo_url = repo_url + 'models/'
70 |     return model_zoo_url
71 | 
72 | 
73 | def use_einsum_optimization():
74 |     """Whether to use einsum for attention. This will potentially accelerate the
75 |     attention cell
76 | 
77 |     Returns
78 |     -------
79 |     flag
80 |         The use einsum flag
81 | 
82 |     """
83 |     flag = os.environ.get('GLUONNLP_USE_EINSUM', False)
84 |     return flag
85 | 


--------------------------------------------------------------------------------
/src/gluonnlp/cli/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/src/gluonnlp/cli/__init__.py


--------------------------------------------------------------------------------
/src/gluonnlp/cli/average_checkpoint.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import mxnet as mx
 3 | import os
 4 | 
 5 | mx.npx.set_np()
 6 | 
 7 | 
 8 | def get_parser():
 9 |     parser = argparse.ArgumentParser(description='Script to average the checkpoints')
10 |     parser.add_argument('--checkpoints', type=str, required=True, nargs='+',
11 |                         help='checkpoint file paths, supports two format, '
12 |                         '--checkpoints folder/epoch*.params or --checkpoints folder/update*.param')
13 |     parser.add_argument('--ids', type=int, required=False, nargs='+',
14 |                         help='The IDs of the checkpoints.')
15 |     parser.add_argument('--begin', type=int, required=False,
16 |                         default=None,
17 |                         help='begin number of checkpoints')
18 |     parser.add_argument('--end', type=int, required=False,
19 |                         default=None,
20 |                         help='end number of checkpoints. '
21 |                              'We select the checkpoints with ID >= begin and <= end.')
22 |     parser.add_argument('--save-path', type=str, required=True, help='Path of the output file')
23 |     return parser
24 | 
25 | 
26 | def main(args):
27 |     if args.begin is not None or args.end is not None or args.ids is not None:
28 |         print(f'Before filtering, the checkpoints are {args.checkpoints}')
29 |         prefix = os.path.commonprefix(args.checkpoints)
30 |         postfix = os.path.commonprefix([ele[::-1] for ele in args.checkpoints])[::-1]
31 |         checkpoint_id_l = [int(ele[len(prefix):-len(postfix)]) for ele in args.checkpoints]
32 |         ckpt_paths = []
33 |         if args.ids is not None:
34 |             for ele in args.ids:
35 |                 assert ele in checkpoint_id_l
36 |                 ckpt_paths.append(f'{prefix}{ele}{postfix}')
37 |         else:
38 |             assert args.begin is not None and args.end is not None, \
39 |                 'Must specify both begin and end if you want to select a range!'
40 |             assert args.begin >= 0
41 |             assert args.end >= args.begin
42 |             for ele in checkpoint_id_l:
43 |                 if ele >= args.begin and ele <= args.end:
44 |                     ckpt_paths.append(f'{prefix}{ele}{postfix}')
45 |     else:
46 |         ckpt_paths = args.checkpoints
47 |     print(f'Load models from {ckpt_paths}')
48 |     print('Average the models and save it to {}'.format(args.save_path))
49 |     assert len(ckpt_paths) > 0, 'Cannot found checkpoints. You may need to check the inputs again.'
50 |     res = mx.npx.load(ckpt_paths[0])
51 |     keys = res.keys()
52 |     for ckpt_path in ckpt_paths[1:]:
53 |         ckpt = mx.npx.load(ckpt_path)
54 |         for key in keys:
55 |             res[key] += ckpt[key]
56 |     for key in keys:
57 |         res[key] /= len(ckpt_paths)
58 |     mx.npx.savez(args.save_path, **res)
59 | 
60 | 
61 | def cli_main():
62 |     parser = get_parser()
63 |     args = parser.parse_args()
64 |     main(args)
65 | 
66 | 
67 | if __name__ == '__main__':
68 |     cli_main()
69 | 


--------------------------------------------------------------------------------
/src/gluonnlp/cli/data:
--------------------------------------------------------------------------------
1 | ../../../scripts/datasets


--------------------------------------------------------------------------------
/src/gluonnlp/cli/process:
--------------------------------------------------------------------------------
1 | ../../../scripts/processing


--------------------------------------------------------------------------------
/src/gluonnlp/data/__init__.py:
--------------------------------------------------------------------------------
1 | from . import vocab
2 | from . import tokenizers
3 | from . import batchify
4 | from .vocab import *
5 | 
6 | __all__ = ['batchify', 'tokenizers'] + vocab.__all__
7 | 


--------------------------------------------------------------------------------
/src/gluonnlp/data/tokenizers/__init__.py:
--------------------------------------------------------------------------------
 1 | """Tokenizers"""
 2 | from .base import *
 3 | from .huggingface import *
 4 | from .jieba import *
 5 | from .moses import *
 6 | from .sentencepiece import *
 7 | from .spacy import *
 8 | from .subword_nmt import *
 9 | from .whitespace import *
10 | from .yttm import *
11 | 
12 | 
13 | __all__ = base.__all__ +\
14 |           huggingface.__all__ + \
15 |           jieba.__all__ + \
16 |           moses.__all__ + \
17 |           sentencepiece.__all__ + \
18 |           spacy.__all__ + \
19 |           subword_nmt.__all__ + \
20 |           whitespace.__all__ + \
21 |           yttm.__all__
22 | 


--------------------------------------------------------------------------------
/src/gluonnlp/embedding/__init__.py:
--------------------------------------------------------------------------------
 1 | # Licensed to the Apache Software Foundation (ASF) under one
 2 | # or more contributor license agreements.  See the NOTICE file
 3 | # distributed with this work for additional information
 4 | # regarding copyright ownership.  The ASF licenses this file
 5 | # to you under the Apache License, Version 2.0 (the
 6 | # "License"); you may not use this file except in compliance
 7 | # with the License.  You may obtain a copy of the License at
 8 | #
 9 | #   http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied.  See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | 
18 | # pylint: disable=wildcard-import
19 | """Word embeddings."""
20 | 
21 | from . import embed_loader
22 | from .embed_loader import *
23 | 
24 | __all__ = (embed_loader.__all__ )
25 | 


--------------------------------------------------------------------------------
/src/gluonnlp/loss.py:
--------------------------------------------------------------------------------
 1 | from mxnet.gluon import HybridBlock
 2 | from mxnet import npx
 3 | 
 4 | 
 5 | class LabelSmoothCrossEntropyLoss(HybridBlock):
 6 |     r"""Computes the softmax cross entropy loss with label-smoothing
 7 | 
 8 |     .. math::
 9 | 
10 |         \DeclareMathOperator{softmax}{softmax}
11 | 
12 |         lp = \log \softmax({pred})
13 | 
14 |         L = - [(1 - \alpha) \sum_{i=1}^N (lp_{i, {label}_i}) + \alpha \frac{1}{N} \sum_{j=1}^N (lp_{i, j})]
15 | 
16 |     To reduce complexity, we can implement it as
17 | 
18 |     .. math::
19 | 
20 |         L = -\sum_i (\frac{N \alpha - 1}{N-1} lp_{i, {label}_i} + \frac{1 - \alpha}{N - 1} \sum_j lp_{i, j})
21 | 
22 |     Parameters
23 |     ----------
24 |     num_labels
25 |         The number of possible labels. For example, in NLP, it can be the size of the vocabulary.
26 |     alpha
27 |         The uncertainty that will be injected to the labels. All the negative labels will be
28 |         treated with probability equals to \frac{\alpha} / {N}
29 |     from_logits
30 |         Whether input is a log probability (usually from log_softmax) instead of unnormalized numbers.
31 |     """
32 |     def __init__(self, num_labels: int, alpha: float = 0.1, from_logits: bool = False, **kwargs):
33 |         super().__init__(**kwargs)
34 |         self._num_labels = num_labels
35 |         self._alpha = alpha
36 |         self._from_logits = from_logits
37 | 
38 |     def forward(self, pred, label):
39 |         """
40 | 
41 |         Parameters
42 |         ----------
43 |         pred :
44 |             The predictions of the network. Shape (..., V)
45 |         label :
46 |             The labels. Shape (..., )
47 | 
48 |         Returns
49 |         -------
50 |         loss :
51 |             Shape (..., )
52 |         """
53 |         if not self._from_logits:
54 |             pred = npx.log_softmax(pred, axis=-1)
55 |         log_likelihood = npx.pick(pred, label, axis=-1)
56 |         all_scores = pred.sum(axis=-1)
57 |         loss = - (1 - self._alpha) * log_likelihood\
58 |                - self._alpha / float(self._num_labels) * all_scores
59 |         return loss
60 | 


--------------------------------------------------------------------------------
/src/gluonnlp/lr_scheduler.py:
--------------------------------------------------------------------------------
 1 | import math
 2 | from mxnet import lr_scheduler
 3 | 
 4 | 
 5 | class InverseSquareRootScheduler(lr_scheduler.LRScheduler):
 6 |     """ Reduce the learning rate according to a polynomial of given power.
 7 | 
 8 |     During warmup
 9 |         Increase the learning rate linearly from warmup_init_lr to base_lr,
10 |     After warmup
11 |         Decay the learning rate with
12 |             lr = base_lr * sqrt(warmup_steps) / sqrt(num_update)
13 | 
14 |     Parameters
15 |     ----------
16 |         warmup_steps
17 |             maximum number of updates before the decay reaches final learning rate.
18 |         base_lr
19 |             The final learning rate in the warm-up stage. The learning rate starts to decay after
20 |             the lr reaches warmup_end_lr
21 |         warmup_init_lr
22 |             The initial learning rate of the scheduler. The warm up starts at this point.
23 |     """
24 | 
25 |     def __init__(self, warmup_steps: int, base_lr: float = 1E-3, warmup_init_lr: float = 0.0):
26 |         super().__init__(
27 |             base_lr, warmup_steps, warmup_init_lr, 'linear')
28 |         self.base_lr = base_lr
29 |         self.warmup_steps = warmup_steps
30 | 
31 |     def __call__(self, num_update):
32 |         if num_update < self.warmup_steps:
33 |             return self.get_warmup_lr(num_update)
34 |         else:
35 |             return self.base_lr * math.sqrt(self.warmup_steps) / math.sqrt(num_update)
36 | 


--------------------------------------------------------------------------------
/src/gluonnlp/models/__init__.py:
--------------------------------------------------------------------------------
 1 | from .base import *
 2 | from .albert import *
 3 | from .bert import *
 4 | from .electra import *
 5 | from .gpt2 import *
 6 | from .mobilebert import *
 7 | from .roberta import *
 8 | from .transformer import *
 9 | from .transformer_xl import *
10 | from .xlmr import *
11 | from .bart import *
12 | from .t5 import *
13 | from .mt5 import *
14 | 
15 | __all__ = base.__all__ + \
16 |           albert.__all__ + \
17 |           bert.__all__ + \
18 |           electra.__all__ + \
19 |           gpt2.__all__ +\
20 |           mobilebert.__all__ + \
21 |           roberta.__all__ + \
22 |           transformer.__all__ + \
23 |           transformer_xl.__all__ + \
24 |           t5.__all__ + \
25 |           mt5.__all__
26 | 


--------------------------------------------------------------------------------
/src/gluonnlp/models/base.py:
--------------------------------------------------------------------------------
 1 | __all__ = ['list_backbone_names', 'get_backbone', 'BACKBONE_REGISTRY']
 2 | 
 3 | from typing import Tuple, List
 4 | from ..base import get_model_zoo_home_dir
 5 | from ..data.tokenizers import BaseTokenizer
 6 | from ..utils.registry import Registry
 7 | from mxnet.gluon import Block
 8 | 
 9 | BACKBONE_REGISTRY = Registry('Backbone Models')
10 | 
11 | 
12 | def list_backbone_names():
13 |     all_keys = []
14 |     for backbone_type in BACKBONE_REGISTRY.list_keys():
15 |         all_keys.extend(BACKBONE_REGISTRY.get(backbone_type)[-1]())
16 |     return all_keys
17 | 
18 | 
19 | def get_backbone(model_name: str,
20 |                  root: str = get_model_zoo_home_dir(),
21 |                  **kwargs) -> Tuple['Block', str, BaseTokenizer, str, List]:
22 |     """Get the backbone network
23 | 
24 |     Parameters
25 |     ----------
26 |     model_name
27 |         The name of the pretrained model
28 |     root
29 |         Downloaded directory of the model zoo
30 | 
31 |     Returns
32 |     -------
33 |     model_cls
34 |         The class to construct the backbone network
35 |     cfg
36 |         Path to the config file of the backbone
37 |     tokenizer
38 |         The tokenizer that is bound to the backbone model
39 |     backbone_param_path
40 |         The path to the pretrained backbone weights
41 |     others
42 |         The other items returned by the create function.
43 |          Will be wrapped into a list
44 | 
45 |     Examples
46 |     --------
47 | 
48 |     >>> from gluonnlp.models import get_backbone
49 |     >>> model_cls, cfg, tokenizer, backbone_param_path, _ = get_backbone('google_en_cased_bert_base')
50 |     >>> model = model_cls.from_cfg(cfg)
51 |     >>> model.load_parameters(backbone_param_path)
52 |     """
53 |     model_cls, local_create_fn = None, None
54 | 
55 |     for backbone_type in BACKBONE_REGISTRY.list_keys():
56 |         ele_model_cls, ele_local_create_fn, list_key_fn = BACKBONE_REGISTRY.get(backbone_type)
57 |         if model_name in list_key_fn():
58 |             model_cls = ele_model_cls
59 |             local_create_fn = ele_local_create_fn
60 |     if model_cls is None or local_create_fn is None:
61 |         raise KeyError('The backbone model "{}" is not found! '
62 |                        'Here are all available backbone models = {}'
63 |                        .format(model_name,
64 |                                list_backbone_names()))
65 |     cfg, tokenizer, local_params_path, *others = local_create_fn(model_name=model_name, root=root,
66 |                                                                  **kwargs)
67 |     return model_cls, cfg, tokenizer, local_params_path, others
68 | 


--------------------------------------------------------------------------------
/src/gluonnlp/models/model_zoo_checksums/albert.txt:
--------------------------------------------------------------------------------
 1 | google_albert_base_v2/model-125be477.params          125be477d1cecc6843245eafe46ca1dc5961ffb5   46736016
 2 | google_albert_base_v2/model-8767fdc9.yml             8767fdc9e1190606dc9aa17725438b4ae33704c4   436
 3 | google_albert_base_v2/model_mlm-fe20650e.params      fe20650e289fcd1a36c09d39e1d5cf5ffa64ba32   47251372
 4 | google_albert_base_v2/spm-65999e5d.model             65999e5d811d9dc77a93bd712c8cb28e3addd852   760289
 5 | google_albert_base_v2/vocab-2ee53ae7.json            2ee53ae76a9d8f478e67abc28a4cb9ec7444f090   372576
 6 | google_albert_large_v2/model-ad60bcd5.params         ad60bcd55cbba463c6e85062769fce846dd9fcf0   70737552
 7 | google_albert_large_v2/model-e2e9b974.yml            e2e9b9748ffe2b147cd92cbc8edba129ed9e98c1   388
 8 | google_albert_large_v2/model_mlm-6a5015ee.params     6a5015ee845f874c1201b5a954275a489e0ed10c   71383980
 9 | google_albert_large_v2/spm-65999e5d.model            65999e5d811d9dc77a93bd712c8cb28e3addd852   760289
10 | google_albert_large_v2/vocab-2ee53ae7.json           2ee53ae76a9d8f478e67abc28a4cb9ec7444f090   372576
11 | google_albert_xlarge_v2/model-4149c9e2.params        4149c9e2793dbd9352d27ab11d67f84b0763f4b2   234901136
12 | google_albert_xlarge_v2/model-8123bffd.yml           8123bffda684857ddac48ebeaaa18aba0e1503fb   437
13 | google_albert_xlarge_v2/model_mlm-ee184d38.params    ee184d389424bab1adf17cc1feb86c69ba0791ff   236071852
14 | google_albert_xlarge_v2/spm-65999e5d.model           65999e5d811d9dc77a93bd712c8cb28e3addd852   760289
15 | google_albert_xlarge_v2/vocab-2ee53ae7.json          2ee53ae76a9d8f478e67abc28a4cb9ec7444f090   372576
16 | google_albert_xxlarge_v2/model-5601a0ed.params       5601a0edddb11d324aecccca7f496ef09013481e   890384016
17 | google_albert_xxlarge_v2/model-07fbeebc.yml          07fbeebcdee60e2362040807d56c572ae7dd7f03   438
18 | google_albert_xxlarge_v2/model_mlm-d2e2b06f.params   d2e2b06f68668cab9c37dd60dca82f00e2e248ab   892603308
19 | google_albert_xxlarge_v2/spm-65999e5d.model          65999e5d811d9dc77a93bd712c8cb28e3addd852   760289
20 | google_albert_xxlarge_v2/vocab-2ee53ae7.json         2ee53ae76a9d8f478e67abc28a4cb9ec7444f090   372576
21 | 


--------------------------------------------------------------------------------
/src/gluonnlp/models/model_zoo_checksums/bart.txt:
--------------------------------------------------------------------------------
1 | fairseq_bart_base/model-8f4929b5.params   8f4929b54f2f77619885cea9f3bd7dba51a27f38    560560748
2 | fairseq_bart_base/gpt2-396d4d8e.merges    396d4d8ec90cb02f4d56e049e0e4add868bcd943    456318
3 | fairseq_bart_base/model-251bf089.yml      251bf08944d18cc29b59a4a854bdbccf601dabb5    754
4 | fairseq_bart_base/gpt2-f4dedacb.vocab     f4dedacb076b1df441c9c7398ed9acd3c19865f3    575079
5 | fairseq_bart_large/model-862277b1.params  862277b1489ed95140cb63279fbd0098ef2dea90    1625180962
6 | fairseq_bart_large/gpt2-396d4d8e.merges   396d4d8ec90cb02f4d56e049e0e4add868bcd943    456318
7 | fairseq_bart_large/model-a2932dea.yml     a2932deaf9737d95891755841fae3e388f3d698a    746
8 | fairseq_bart_large/gpt2-f1335494.vocab    f1335494f47917829e3b1d08e579ff2c3fe4fd60    558231
9 | 


--------------------------------------------------------------------------------
/src/gluonnlp/models/model_zoo_checksums/electra.txt:
--------------------------------------------------------------------------------
 1 | google_electra_small/vocab-e6d2b21d.json             e6d2b21d910ccb356aa18f27a1c7d70660edc058   323235
 2 | google_electra_small/model-2654c8b4.params           2654c8b4e240a5713078d2bd79582285c3f1b333   53945262
 3 | google_electra_small/gen_model-0c30d1c5.params       0c30d1c5678154937dee1d11bef8db6f43d4d767   54202512
 4 | google_electra_small/model-9ffb21c8.yml              9ffb21c8885bdb3e5f62c3f7a670d406167ec10c   472
 5 | google_electra_small/disc_model-137714b6.params      137714b6c7f327e642861a7380dd94c8b3dbf1ea   54211975
 6 | google_electra_base/vocab-e6d2b21d.json              e6d2b21d910ccb356aa18f27a1c7d70660edc058   323235
 7 | google_electra_base/model-31c235cc.params            31c235cc6da6f1872adffb31efe9318600b89ae5   435579680
 8 | google_electra_base/gen_model-253a62c9.params        253a62c9aa9de24d85e09a9ae62ef88501e53dff   134978192
 9 | google_electra_base/model-5b35ca0b.yml               5b35ca0b7f117978e372cfd8d98970d2d726e6c0   477
10 | google_electra_base/disc_model-514bd353.params       514bd353f9d42bc907bfa7e1175f4013b0147d7e   437947611
11 | google_electra_large/vocab-e6d2b21d.json             e6d2b21d910ccb356aa18f27a1c7d70660edc058   323235
12 | google_electra_large/model-9baf9ff5.params           9baf9ff55cee0195b7754aee7fcb3a1019c99f45   1336395080
13 | google_electra_large/gen_model-82c1b17b.params       82c1b17b4b5ac19700c272858b0b211437f72855   205211944
14 | google_electra_large/model-31b7dfdd.yml              31b7dfdd343bd2b2e43e200a735c83b0af1963f1   476
15 | google_electra_large/disc_model-5b820c02.params      5b820c026aa2ad779c1e9a41ff4ff1408fefacbf   1340602227
16 | gluon_electra_small_owt/vocab-e6d2b21d.json          e6d2b21d910ccb356aa18f27a1c7d70660edc058   323235
17 | gluon_electra_small_owt/model-e9636891.params        e9636891daae9f2940b2b3210cca3c34c3d8f21e   53748654
18 | gluon_electra_small_owt/model-6e276d98.yml           6e276d98360fbb7c379d28bac34a3ca2918a90ab   473
19 | gluon_electra_small_owt/gen_model-45a6fb67.params    45a6fb67e1e6cb65d22b80498f2152ce9780d579   33926624
20 | gluon_electra_small_owt/disc_model-87836017.params   878360174ac71c3fdc7071be7835bea532c09b8d   54015367
21 | 


--------------------------------------------------------------------------------
/src/gluonnlp/models/model_zoo_checksums/gpt2.txt:
--------------------------------------------------------------------------------
 1 | gpt2_124M/model_lm-99b90604.params  99b9060488b4542ccd045c28401da10a3158ca80    497771820
 2 | gpt2_124M/gpt2-396d4d8e.merges      396d4d8ec90cb02f4d56e049e0e4add868bcd943    456318
 3 | gpt2_124M/gpt2-9dc62091.vocab       9dc620913410d5ec1a988abf852891e1c9f0f649    558055
 4 | gpt2_124M/model-bfed311d.params     bfed311d5c980ba475f90ccf7f536d25c3b40386    497769466
 5 | gpt2_355M/model_lm-eed0e964.params  eed0e964f4222823a557acfee2c106f228ce0188    1419317644
 6 | gpt2_355M/gpt2-396d4d8e.merges      396d4d8ec90cb02f4d56e049e0e4add868bcd943    456318
 7 | gpt2_355M/gpt2-9dc62091.vocab       9dc620913410d5ec1a988abf852891e1c9f0f649    558055
 8 | gpt2_355M/model-81dee612.params     81dee612413733899f6e5fbbeac91da781805e1b    1419312986
 9 | gpt2_774M/model_lm-cfbfa641.params  cfbfa6419aaf1eae480fba5a1a7c8ea6096d43d6    3096157676
10 | gpt2_774M/gpt2-396d4d8e.merges      396d4d8ec90cb02f4d56e049e0e4add868bcd943    456318
11 | gpt2_774M/gpt2-9dc62091.vocab       9dc620913410d5ec1a988abf852891e1c9f0f649    558055
12 | gpt2_774M/model-9917e24e.params     9917e24e89c651793adea69042d6cceddfc7973c    3096150714
13 | gpt2_1558M/model_lm-c8489dcb.params c8489dcbdb0d39bc3eac6d1d62e0e3dace9faa8f    6230494540
14 | gpt2_1558M/gpt2-396d4d8e.merges     396d4d8ec90cb02f4d56e049e0e4add868bcd943    456318
15 | gpt2_1558M/gpt2-9dc62091.vocab      9dc620913410d5ec1a988abf852891e1c9f0f649    558055
16 | gpt2_1558M/model-af3dd713.params    af3dd71313b55b4be5f52bdd538c9db054c1e190    6230485274
17 | 


--------------------------------------------------------------------------------
/src/gluonnlp/models/model_zoo_checksums/mobilebert.txt:
--------------------------------------------------------------------------------
1 | google_uncased_mobilebert/model-1c33216b.yml             1c33216b256a76713e0906b7ceefb3b37d4d35a0   510
2 | google_uncased_mobilebert/vocab-e6d2b21d.json            e6d2b21d910ccb356aa18f27a1c7d70660edc058   323235
3 | google_uncased_mobilebert/model-c8346cf2.params          c8346cf2caf9cc422f081f03b50bc69945328894   98424130
4 | google_uncased_mobilebert/model_mlm-53948e82.params      53948e82d8ec091927af357387b36ade0e42b34c   146503986
5 | 


--------------------------------------------------------------------------------
/src/gluonnlp/models/model_zoo_checksums/mt5.txt:
--------------------------------------------------------------------------------
 1 | google_mt5_small/mt5-2730df74.vocab     2730df74056f29388cc4c8c912af6e97ac54bab2    4309802
 2 | google_mt5_small/model-23352279.yml     23352279d13971a536847aebe31b34c4a0b80dd8    242
 3 | google_mt5_small/model-b20e24d7.params  b20e24d75d097e9eea647f4b9a0dc53b956a9d1a    688633650
 4 | google_mt5_base/mt5-2730df74.vocab      2730df74056f29388cc4c8c912af6e97ac54bab2    4309802
 5 | google_mt5_base/model-da71d108.yml      da71d1084d75af5648e1b9247fecfa74e0361da0    244
 6 | google_mt5_base/model-91eaa894.params   91eaa89444e062e2fc3953b1184e15ccf5375385    1561555474
 7 | google_mt5_large/mt5-2730df74.vocab     2730df74056f29388cc4c8c912af6e97ac54bab2    4309802
 8 | google_mt5_large/model-1226608e.yml     1226608ec2c53cc6dcf2303a8f1b19c59f43cbfe    245
 9 | google_mt5_large/model-6b46e841.params  6b46e841e9b1b4c8ad97b071b316f9c52c2731e6    3894572546
10 | google_mt5_xl/mt5-2730df74.vocab        2730df74056f29388cc4c8c912af6e97ac54bab2    4309802
11 | google_mt5_xl/model-089b83a2.yml        089b83a2c893bd901fe26180f2fbfd2f52804ae0    245
12 | google_mt5_xl/model-7655ea81.params     7655ea81d4b7c9787dd1bfa902e96cdf9e124e3d    12922784462
13 | google_mt5_xxl/mt5-2730df74.vocab       2730df74056f29388cc4c8c912af6e97ac54bab2    4309802
14 | google_mt5_xxl/model-65e24812.yml       65e248120fbdcbaced58fb6f6c21f8143f9e97be    246
15 | google_mt5_xxl/model-2e9e44b9.params    2e9e44b9fc10d8a4c7133fa5e67ecadedfbfb692    47588620878
16 | 


--------------------------------------------------------------------------------
/src/gluonnlp/models/model_zoo_checksums/roberta.txt:
--------------------------------------------------------------------------------
 1 | fairseq_roberta_base/model-565d1db7.yml         565d1db71b0452fa2c28f155b8e9d90754f4f40a    401
 2 | fairseq_roberta_base/gpt2-396d4d8e.merges       396d4d8ec90cb02f4d56e049e0e4add868bcd943    456318
 3 | fairseq_roberta_base/gpt2-f1335494.vocab        f1335494f47917829e3b1d08e579ff2c3fe4fd60    558231
 4 | fairseq_roberta_base/model-09a1520a.params      09a1520adf652468c07e43a6ed28908418fa58a7    496222787
 5 | fairseq_roberta_base/model_mlm-29889e2b.params  29889e2b4ef20676fda117bb7b754e1693d0df25    498794868
 6 | fairseq_roberta_large/model-6b043b91.params     6b043b91a6a781a12ea643d0644d32300db38ec8    1417251819
 7 | fairseq_roberta_large/gpt2-396d4d8e.merges      396d4d8ec90cb02f4d56e049e0e4add868bcd943    456318
 8 | fairseq_roberta_large/model-6e66dc4a.yml        6e66dc4a450560a93aaf3d0ba9e0d447495d778a    402
 9 | fairseq_roberta_large/gpt2-f1335494.vocab       f1335494f47917829e3b1d08e579ff2c3fe4fd60    558231
10 | fairseq_roberta_large/model_mlm-119f38e1.params 119f38e1249bd28bea7dd2e90c09b8f4b879fa19    1421664140
11 | 


--------------------------------------------------------------------------------
/src/gluonnlp/models/model_zoo_checksums/t5.txt:
--------------------------------------------------------------------------------
 1 | google_t5_small/t5-5f05e7c5.vocab       5f05e7c57adf916bdba74912b0b37dea5c585988    791656
 2 | google_t5_small/model-3cc6e5f7.yml      3cc6e5f7c6ccc3e2ac174d899b1aed74d7de65e0    235
 3 | google_t5_small/model-e34f6fbd.params   e34f6fbda666c02f0ffd5e15fec02056d3e3014d    242141346
 4 | google_t5_base/t5-5f05e7c5.vocab        5f05e7c57adf916bdba74912b0b37dea5c585988    791656
 5 | google_t5_base/model-ca5cc26c.yml       ca5cc26c9dfe31295c97ef536b3f6f954ef1a447    237
 6 | google_t5_base/model-e1956ac9.params    e1956ac9670263b6803672bd0d7579f71d7494c6    891901274
 7 | google_t5_large/t5-5f05e7c5.vocab       5f05e7c57adf916bdba74912b0b37dea5c585988    791656
 8 | google_t5_large/model-01c5d9ae.yml      01c5d9ae5476b18c3516ebbe3a505b966982027d    238
 9 | google_t5_large/model-bf5fc813.params   bf5fc8138a04aa5f3bc495cacb010c873e59e909    2951363690
10 | google_t5_3B/t5-5f05e7c5.vocab          5f05e7c57adf916bdba74912b0b37dea5c585988    791656
11 | google_t5_3B/model-791f2e90.yml         791f2e90057fcccfa83bf8130034196d3550fb77    240
12 | google_t5_3B/model-48ba7250.params      48ba72501239c8d2d355282eebdebd0935556780    11407098198
13 | google_t5_11B/t5-5f05e7c5.vocab         5f05e7c57adf916bdba74912b0b37dea5c585988    791656
14 | google_t5_11B/model-2e50d93e.yml        2e50d93effc258aa75af162e9598be60ae13a83e    241
15 | google_t5_11B/model-1936031c.params     1936031c6db581ae866f41ec6d3c1c6de2049823    45229995126
16 | 


--------------------------------------------------------------------------------
/src/gluonnlp/models/model_zoo_checksums/xlmr.txt:
--------------------------------------------------------------------------------
1 | fairseq_xlmr_base/model-3fa134e9.params         3fa134e9a13e2329ffa7b8d39612695ed8397c9d      1109814851
2 | fairseq_xlmr_base/model-b893d178.yml            b893d178fa859fb6c708a08fc970b9980e047825      402
3 | fairseq_xlmr_base/model_mlm-86e37954.params     86e379542a6430cd988ff4b6a25966949afc241a      1113185880
4 | fairseq_xlmr_base/sentencepiece-18e17bae.model  18e17bae37be115135d4cf4ad9dfcc4f3b12cb80      5069075
5 | fairseq_xlmr_large/model-b62b074c.params        b62b074cdd41e682075e2407f842be6578696b26      2235374571
6 | fairseq_xlmr_large/model-01fc59fb.yml           01fc59fb3a805f09d2aa11369d5b57e0be931fdd      403
7 | fairseq_xlmr_large/model_mlm-887506c2.params    887506c20bda452cf13ef04390eaa57a55602a92      2240585840
8 | fairseq_xlmr_large/sentencepiece-18e17bae.model 18e17bae37be115135d4cf4ad9dfcc4f3b12cb80      5069075
9 | 


--------------------------------------------------------------------------------
/src/gluonnlp/third_party/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dmlc/gluon-nlp/14553a0c19790a3b03e8107471c987c7aa4f8faa/src/gluonnlp/third_party/__init__.py


--------------------------------------------------------------------------------
/src/gluonnlp/torch/__init__.py:
--------------------------------------------------------------------------------
1 | from . import attention_cell
2 | from . import data
3 | from . import layers
4 | from . import optimizers
5 | from . import models
6 | from . import utils
7 | 


--------------------------------------------------------------------------------
/src/gluonnlp/torch/clib/amp_C_frontend.cpp:
--------------------------------------------------------------------------------
 1 | #include <torch/extension.h>
 2 | 
 3 | void multi_tensor_lans_cuda(
 4 |   int chunk_size,
 5 |   at::Tensor noop_flag,
 6 |   std::vector<std::vector<at::Tensor>> tensor_lists,
 7 |   const float lr,
 8 |   const float beta1,
 9 |   const float beta2,
10 |   const float epsilon,
11 |   const int step,
12 |   const int bias_correction,
13 |   const float weight_decay,
14 |   const int grad_averaging,
15 |   const int mode,
16 |   const bool normalize_grad);
17 | 
18 | 
19 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
20 |   m.def("multi_tensor_lans", &multi_tensor_lans_cuda,
21 |         "Computes and apply update for LANS optimizer");
22 | }
23 | 


--------------------------------------------------------------------------------
/src/gluonnlp/torch/clib/compat.h:
--------------------------------------------------------------------------------
1 | #ifndef TORCH_CHECK
2 | #define TORCH_CHECK AT_CHECK
3 | #endif
4 | 
5 | #define DATA_PTR data_ptr
6 | 


--------------------------------------------------------------------------------
/src/gluonnlp/torch/data/__init__.py:
--------------------------------------------------------------------------------
1 | from . import batchify
2 | 


--------------------------------------------------------------------------------
/src/gluonnlp/torch/models/__init__.py:
--------------------------------------------------------------------------------
1 | from . import transformer
2 | from . import bert
3 | 


--------------------------------------------------------------------------------
/src/gluonnlp/torch/optimizers/__init__.py:
--------------------------------------------------------------------------------
1 | from . import schedules
2 | from . import fused_lans
3 | 
4 | from .fused_lans import FusedLANS
5 | 


--------------------------------------------------------------------------------
/src/gluonnlp/torch/optimizers/schedules.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2020, Amazon.  All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Optimization for BERT model."""
15 | 
16 | from torch.optim.lr_scheduler import LambdaLR
17 | 
18 | __all__ = ['get_warmup_linear_const_decay_poly_schedule']
19 | 
20 | 
21 | def get_warmup_linear_const_decay_poly_schedule(optimizer, total_steps, warmup_ratio=0.002,
22 |                                                 const_ratio=0., degree=1.0, last_epoch=-1):
23 |     """Create a schedule with a learning rate that decreases linearly from the
24 |     initial lr set in the optimizer to 0, after a warmup period during which it
25 |     increases linearly from 0 to the initial lr set in the optimizer and a
26 |     constant period.
27 | 
28 |     Args:
29 |         optimizer (:class:`~torch.optim.Optimizer`):
30 |             The optimizer for which to schedule the learning rate.
31 |         total_steps (:obj:`int`):
32 |             The total number of training steps.
33 |         warmup_ratio (:obj:`float`):
34 |             The number of steps for the warmup phase.
35 |         constant_ratio (:obj:`float`):
36 |             The total number of training steps.
37 |         last_epoch (:obj:`int`, `optional`, defaults to -1):
38 |             The index of the last epoch when resuming training.
39 | 
40 |     Return:
41 |         :obj:`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
42 | 
43 |     """
44 |     def lr_lambda(global_step: int):
45 |         x = global_step / total_steps
46 |         if warmup_ratio == 0.0:
47 |             return 1.0
48 |         elif x < warmup_ratio:
49 |             return x / warmup_ratio
50 |         elif x < warmup_ratio + const_ratio:
51 |             return 1.0
52 |         return ((1.0 - x) / (1.0 - warmup_ratio - const_ratio))**degree
53 | 
54 |     return LambdaLR(optimizer, lr_lambda, last_epoch)
55 | 


--------------------------------------------------------------------------------
/src/gluonnlp/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from . import config
2 | from . import shm
3 | from . import lazy_imports
4 | from . import preprocessing
5 | from . import registry
6 | from . import testing
7 | from .parameter import *
8 | from .misc import *
9 | 


--------------------------------------------------------------------------------
/src/gluonnlp/utils/config.py:
--------------------------------------------------------------------------------
 1 | import yacs.config
 2 | 
 3 | 
 4 | class CfgNode(yacs.config.CfgNode):
 5 |     def clone_merge(self, cfg_filename_or_other_cfg):
 6 |         """Create a new cfg by cloning and merging with the given cfg
 7 | 
 8 |         Parameters
 9 |         ----------
10 |         cfg_filename_or_other_cfg
11 | 
12 |         Returns
13 |         -------
14 | 
15 |         """
16 |         ret = self.clone()
17 |         if isinstance(cfg_filename_or_other_cfg, str):
18 |             ret.merge_from_file(cfg_filename_or_other_cfg)
19 |             return ret
20 |         elif isinstance(cfg_filename_or_other_cfg, CfgNode):
21 |             ret.merge_from_other_cfg(cfg_filename_or_other_cfg)
22 |             return ret
23 |         elif cfg_filename_or_other_cfg is None:
24 |             return ret
25 |         else:
26 |             raise TypeError('Type of config path is not supported!')
27 | 


--------------------------------------------------------------------------------
/src/gluonnlp/utils/shm.py:
--------------------------------------------------------------------------------
 1 | import pickle
 2 | import mmap
 3 | 
 4 | if pickle.HIGHEST_PROTOCOL < 5:
 5 |     del pickle
 6 |     import pickle5 as pickle
 7 | 
 8 | 
 9 | def serialize(path, tbl):
10 |     """Serialize tbl with out-of-band data to path for zero-copy shared memory usage.
11 | 
12 |     If the object to be serialized itself, or the objects it uses for data
13 |     storage (such as numpy arrays) implement the the pickle protocol version 5
14 |     pickle.PickleBuffer type in __reduce_ex__, then this function can store
15 |     these buffers out-of-band as files in `path` so that they subsequently be
16 |     re-used for zero-copy sharing accross processes.
17 | 
18 |     Parameters
19 |     ----------
20 |     path : pathlib.Path
21 |         Empty folder used to save serialized data. Usually a folder /dev/shm
22 |     tbl : object
23 |         Object to serialize. For example a PyArrow Table, a Pandas Dataframe or
24 |         any type that relies on NumPy to store the binary data.
25 | 
26 |     """
27 |     idx = 0
28 | 
29 |     def buffer_callback(buf):
30 |         nonlocal idx
31 |         with open(path / f'{idx}.bin', 'wb') as f:
32 |             f.write(buf)
33 |         idx += 1
34 | 
35 |     with open(path / 'meta.pkl', 'wb') as f:
36 |         pickle.dump(tbl, f, protocol=5, buffer_callback=buffer_callback)
37 | 
38 | 
39 | def load(path):
40 |     """Load serialized object with out-of-band data from path based on zero-copy shared memory.
41 | 
42 |     Parameters
43 |     ----------
44 |     path : pathlib.Path
45 |         Folder used to save serialized data with serialize(). Usually a folder /dev/shm
46 | 
47 |     """
48 |     num_buffers = len(list(path.iterdir())) - 1  # exclude meta.idx
49 |     buffers = []
50 |     for idx in range(num_buffers):
51 |         f = open(path / f'{idx}.bin', 'rb')
52 |         buffers.append(mmap.mmap(f.fileno(), 0, prot=mmap.PROT_READ))
53 |     with open(path / 'meta.pkl', 'rb') as f:
54 |         return pickle.load(f, buffers=buffers)
55 | 


--------------------------------------------------------------------------------
/src/gluonnlp/utils/tvm_utils.py:
--------------------------------------------------------------------------------
 1 | __all__ = ['get_ec2_tvm_flags', 'update_tvm_convert_map']
 2 | 
 3 | import tvm.relay.op as _op
 4 | import tvm.relay.expr as _expr
 5 | from typing import Dict
 6 | from tvm.relay.frontend.mxnet import _convert_map
 7 | from tvm.relay.frontend.common import infer_type as _infer_type
 8 | 
 9 | def get_ec2_tvm_flags() -> Dict[str, Dict]:
10 |     r"""Return the recommended flags for TVM compilation in AWS EC2 instances.
11 | 
12 |     Including C4, C5, G4, P3.
13 | 
14 |     For more details about AWS EC2 instances, refer to https://aws.amazon.com/ec2/instance-types/.
15 | 
16 |     Returns
17 |     -------
18 |     info_dict
19 |         A dictionary that contains the mapping between instance type and the
20 |         corresponding compilation flags.
21 |         Each element includes:
22 | 
23 |         - target
24 |             The compilation target
25 |         - use_gpu
26 |             Whether it's a GPU instance
27 |         - opt_level
28 |             The optimization level in compilation
29 |         - pass
30 |             Additional graph passes for further improvement.
31 |     """
32 |     instance_info = {
33 |         'g4': {'target': "cuda -model=t4 -libs=cublas,cudnn",
34 |                'use_gpu': True,
35 |                'opt_level': 3,
36 |                'required_pass': ["FastMath"]},
37 |         'c4': {'target': 'llvm -mcpu=core-avx2 -libs=cblas',
38 |                'use_gpu': False,
39 |                'opt_level': 3,
40 |                'required_pass': ["FastMath"]},
41 |         'c5': {'target': 'llvm -mcpu=skylake-avx512 -libs=cblas',
42 |                'use_gpu': False,
43 |                'opt_level': 3,
44 |                'required_pass': ["FastMath"]},
45 |         'p3': {'target': 'cuda -model=v100 -libs=cublas,cudnn',
46 |                'use_gpu': True,
47 |                'opt_level': 3,
48 |                'required_pass': ["FastMath"]}
49 |     }
50 |     return instance_info
51 | 
52 | 
53 | def update_tvm_convert_map() -> None:
54 |     """A Monkey Patch to update convert map in tvm/relay/frontend/mxnet.py"""
55 |     op = (('masked_softmax', _mx_masked_softmax),)
56 |     _convert_map.update({key: value for key, value in op})
57 | 
58 | 
59 | def _mx_masked_softmax(inputs, attrs):
60 |     assert len(inputs) == 1 or len(inputs) == 2
61 |     axis = attrs.get_int("axis")
62 |     temperature = attrs.get_float("temperature")
63 |     if len(inputs) == 1:
64 |         result = _op.nn.softmax(inputs[0] / _expr.const(temperature), axis=axis)
65 |     else:
66 |         neg = -1e18
67 |         att_score, mask = inputs
68 |         att_score_dtype = _infer_type(att_score).checked_type.dtype
69 |         if att_score_dtype == "float16":
70 |             neg = -1e4
71 |         temp = _op.where(mask, 
72 |                          att_score,
73 |                          _expr.const(neg))
74 |         result = _op.multiply(_op.nn.softmax(temp / _expr.const(temperature), axis=axis), mask.astype("float32"))
75 |     return result
76 | 


--------------------------------------------------------------------------------
/tests/README.md:
--------------------------------------------------------------------------------
 1 | # Unit Tests
 2 | 
 3 | To run the unittests, use the following command
 4 | 
 5 | ```bash
 6 | python3 -m pytest --forked --device="cpu" .
 7 | ```
 8 | 
 9 | To test for certain file, e.g., the `test_models_transformer.py`, use the following command
10 | 
11 | ```bash
12 | python3 -m pytest --forked --device="cpu" test_models_transformer.py
13 | ```
14 | 
15 | To test only for gpu device, use the following command
16 | 
17 | ```bash
18 | python3 -m pytest --forked --device="gpu" test_models_transformer.py
19 | ```
20 | 
21 | To test both for cpu and gpu device, use the following command
22 | 
23 | ```bash
24 | python3 -m pytest --forked --device="cpu" --device="gpu" test_models_transformer.py
25 | ```
26 | 
27 | In addition, to run all the tests, you should add the `--runslow` flag
28 | 
29 | ```bash
30 | python3 -m pytest --forked --device="gpu" --runslow test_models.py
31 | ```
32 | 
33 | Refer to the [official guide of pytest](https://docs.pytest.org/en/latest/) for more details.
34 | 
35 | # Naming Convention
36 | 
37 | The naming convention of the tests are `test_{module_name}.py`. 
38 | For example, the test of [models/transformer.py](../src/gluonnlp/models/transformer.py) will be in 
39 | `test_models_transformer.py`. The test of [models/__init__.py](../src/gluonnlp/models/__init__.py) 
40 | is `test_models.py`. 
41 | 
42 | Also, we include the scheduled testing scripts for `nlp_process` in [process_cli](process_cli), 
43 | and 'nlp_data' in [data_cli](data_cli).
44 | 
45 | 


--------------------------------------------------------------------------------
/tests/data_cli/test_glue.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import tempfile
 3 | import pandas as pd
 4 | from gluonnlp.cli.data.general_nlp_benchmark import prepare_glue
 5 | 
 6 | 
 7 | @pytest.mark.remote_required
 8 | @pytest.mark.parametrize('task', ["cola", "sst", "mrpc", "qqp", "sts", "mnli",
 9 |                                   "snli", "qnli", "rte", "wnli", "diagnostic"])
10 | def test_glue(task):
11 |     parser = prepare_glue.get_parser()
12 |     with tempfile.TemporaryDirectory() as root:
13 |         args = parser.parse_args(['--benchmark', 'glue',
14 |                                   '--tasks', task,
15 |                                   '--data_dir', root])
16 |         prepare_glue.main(args)
17 | 
18 | 
19 | @pytest.mark.remote_required
20 | @pytest.mark.parametrize('task', ["cb", "copa", "multirc", "rte", "wic", "wsc", "boolq", "record",
21 |                                   'broadcoverage-diagnostic', 'winogender-diagnostic'])
22 | def test_glue(task):
23 |     parser = prepare_glue.get_parser()
24 |     with tempfile.TemporaryDirectory() as root:
25 |         args = parser.parse_args(['--benchmark', 'superglue',
26 |                                   '--tasks', task,
27 |                                   '--data_dir', root])
28 |         prepare_glue.main(args)
29 | 


--------------------------------------------------------------------------------
/tests/data_cli/test_wikipedia.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import tempfile
 3 | from gluonnlp.cli.data.pretrain_corpus import prepare_wikipedia
 4 | 
 5 | 
 6 | @pytest.mark.remote_required
 7 | # Test for zh-classical (文言) + wuu (吴语), which are smaller compared with English
 8 | @pytest.mark.parametrize('lang', ['zh-classical', 'wuu'])
 9 | def test_download_format(lang):
10 |     parser = prepare_wikipedia.get_parser()
11 |     with tempfile.TemporaryDirectory() as root:
12 |         download_args = parser.parse_args(['--mode', 'download+format',
13 |                                            '--lang', lang,
14 |                                            '--date', 'latest', '-o', root])
15 |         prepare_wikipedia.main(download_args)
16 | 


--------------------------------------------------------------------------------
/tests/process_cli/test_average_checkpoint.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from gluonnlp.cli import average_checkpoint
 3 | from mxnet.gluon import nn
 4 | from numpy.testing import assert_allclose
 5 | 
 6 | _CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
 7 | 
 8 | def test_avg_ckpt():
 9 |     try:
10 |         average_checkpoint.cli_main()
11 |     except:
12 |         pass
13 |     num_ckpts = 5
14 |     model = nn.Dense(units=10, in_units=10)
15 |     model.initialize()
16 |     params = model.collect_params()
17 |     gd_avg = {}
18 |     for key in params.keys():
19 |         gd_avg[key] = params[key].data().asnumpy()
20 |     model.save_parameters(os.path.join(_CURR_DIR, 'update0.params'))
21 |     
22 |     for i in range(1, num_ckpts):
23 |         model.initialize(force_reinit=True)
24 |         params = model.collect_params()
25 |         for key in gd_avg.keys():
26 |             gd_avg[key] += params[key].data().asnumpy()
27 |         model.save_parameters(os.path.join(_CURR_DIR, 'update{}.params'.format(i)))
28 |     
29 |     for key in gd_avg.keys():
30 |         gd_avg[key] /= num_ckpts
31 | 
32 |     parser = average_checkpoint.get_parser()
33 |     args = parser.parse_args(['--checkpoints', None,
34 |                               '--begin', '0',
35 |                               '--end', str(num_ckpts-1),
36 |                               '--save-path', os.path.join(_CURR_DIR, 'avg.params')])
37 |     args.checkpoints = ['fake', 'ckpt']
38 |     try:
39 |         average_checkpoint.main(args)
40 |     except:
41 |         pass
42 |     args.checkpoints = [os.path.join(_CURR_DIR, 'update{}.params'.format(i)) \
43 |                         for i in range(0, num_ckpts)]
44 |     average_checkpoint.main(args)
45 |     
46 |     model.load_parameters(os.path.join(_CURR_DIR, 'avg.params'))
47 |     params = model.collect_params()
48 |     
49 |     for key in gd_avg.keys():
50 |         assert_allclose(gd_avg[key], params[key].data().asnumpy(), 1E-7, 1E-7)
51 | 


--------------------------------------------------------------------------------
/tests/test_data_filtering.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | from gluonnlp.data.filtering import ProfanityFilter, MosesNormalizer, LanguageIdentifier
 3 | import multiprocessing
 4 | 
 5 | 
 6 | def test_profanity_filter():
 7 |     profanity_filter = ProfanityFilter('en')
 8 |     filter_word = 'anal'
 9 |     unfilter_word = 'analysis'
10 |     for text in [' ' + filter_word, ' ' + filter_word + ' ',
11 |                  filter_word, filter_word + ' ' + unfilter_word]:
12 |         assert profanity_filter.match(text) is True
13 |     for text in [' ' + unfilter_word, unfilter_word, unfilter_word + ' ']:
14 |         assert profanity_filter.match(text) is False
15 | 
16 | 
17 | def test_sentence_normalizer():
18 |     normalizer = MosesNormalizer('en')
19 |     assert normalizer('    hello  world!!".\t\t\r') == ' hello world!!."  '
20 |     assert normalizer(
21 |         b'We therefore defend, and will continue to defend wherever necessary, our position of \xe2\x80\x98no diversion\xe2\x80\x99.\n'.decode('utf-8')) == \
22 |            "We therefore defend, and will continue to defend wherever necessary, our position of 'no diversion'. "
23 |     normalizer = MosesNormalizer('en', remove_non_printable_char=False)
24 |     assert normalizer('    hello  world!!".\t\t\r') == ' hello world!!."\t\t'
25 |     normalizer = MosesNormalizer('en', remove_non_printable_char=False, unicode_norm_form='NFKC')
26 |     assert normalizer('    hello  world!!"⁵.\t\t\r') == ' hello world!!"5.\t\t'
27 | 
28 | 
29 | @pytest.mark.parametrize('algo', ['fasttext', 'fasttext_compressed', 'langid'])
30 | def test_language_identifier(algo):
31 |     lang_id_model = LanguageIdentifier(algo=algo)
32 |     lang_label, score = lang_id_model('你好，世界')
33 |     assert lang_label == 'zh'
34 |     with multiprocessing.Pool(2) as pool:
35 |         out = pool.map(lang_id_model, ['你好，世界', 'Hello World'])
36 |     assert out[0][0] == 'zh'
37 |     assert out[1][0] == 'en'
38 | 


--------------------------------------------------------------------------------
/tests/test_embedding.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import collections
 3 | import os
 4 | import tempfile
 5 | import pytest
 6 | from gluonnlp.embedding import load_embeddings, get_fasttext_model
 7 | from gluonnlp.data import Vocab
 8 | 
 9 | def test_load_embeddings():
10 |     text_data = ['hello', 'world', 'hello', 'nice', 'world', 'hi', 'world', 'sadgood']
11 |     counter = collections.Counter(text_data)
12 |     vocab1 = Vocab(counter)
13 |     # load with vocab
14 |     matrix1 = load_embeddings(vocab1)
15 |     assert len(matrix1) == len(vocab1)
16 |     # load without vocab
17 |     matrix2, vocab2 = load_embeddings()
18 |     assert len(matrix2) == len(vocab2)
19 |     np.testing.assert_almost_equal(matrix1[vocab1["hello"]], matrix2[vocab2["hello"]])
20 | 
21 |     # test_unk_method
22 |     def simple(words):
23 |         return np.ones((len(words), 50))
24 |     matrix3 = load_embeddings(vocab1, unk_method=simple)
25 |     assert sum(matrix3[vocab1['sadgood']] == 1) == matrix3.shape[-1]
26 |     np.testing.assert_almost_equal(matrix3[vocab1["hello"]], matrix2[vocab2["hello"]])
27 | 
28 |     # load txt
29 |     with tempfile.TemporaryDirectory() as root:
30 |         path = os.path.join(root, "tmp.txt")
31 |         with open(path, "w") as f:
32 |             f.write("{} {}\n".format(matrix1.shape[0], matrix1.shape[1]))
33 |             for word, vec in zip(vocab1.all_tokens, matrix1):
34 |                 f.write(word + " ")
35 |                 f.write(" ".join([str(num) for num in vec.tolist()]))
36 |                 f.write("\n")
37 |         matrix4 = load_embeddings(vocab1, path)
38 |         np.testing.assert_almost_equal(matrix4, matrix1)
39 | 
40 | 
41 | @pytest.mark.slow
42 | @pytest.mark.remote_required
43 | def test_get_fasttext_model():
44 |     text_data = ['hello', 'world', 'hello', 'nice', 'world', 'hi', 'world']
45 |     counter = collections.Counter(text_data)
46 |     vocab1 = Vocab(counter)
47 |     matrix1 = load_embeddings(vocab1, 'wiki.en')
48 |     ft = get_fasttext_model('wiki.en')
49 |     np.testing.assert_almost_equal(matrix1[vocab1["hello"]], ft['hello'], decimal=4)
50 |     with pytest.raises(ValueError):
51 |         get_fasttext_model('wiki.multi.ar')
52 | 
53 | 


--------------------------------------------------------------------------------
/tests/test_gluon_block.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import mxnet as mx
 3 | from mxnet import nd, np, npx
 4 | from mxnet.test_utils import assert_allclose
 5 | from mxnet.gluon import HybridBlock, Constant
 6 | from mxnet.gluon.data import DataLoader
 7 | import itertools
 8 | mx.npx.set_np()
 9 | 
10 | 
11 | def test_const():
12 |     class Foo(HybridBlock):
13 |         def __init__(self):
14 |             super().__init__()
15 |             self.weight = Constant(np.ones((10, 10)))
16 | 
17 |         def forward(self, x, weight):
18 |             return x, weight.astype(np.float32)
19 | 
20 |     foo = Foo()
21 |     foo.hybridize()
22 |     foo.initialize()
23 | 
24 | 
25 | def test_scalar():
26 |     class Foo(HybridBlock):
27 |         def forward(self, x):
28 |             return x * x * 2
29 | 
30 |     foo = Foo()
31 |     foo.hybridize()
32 |     foo.initialize()
33 |     out = foo(mx.np.array(1.0))
34 |     assert_allclose(out.asnumpy(), np.array(2.0))
35 | 
36 | 
37 | def test_gluon_nonzero_hybridize():
38 |     class Foo(HybridBlock):
39 |         def __init__(self):
40 |             super().__init__()
41 | 
42 |         def forward(self, x):
43 |             dat = npx.nonzero(x)
44 |             return dat.sum() + dat
45 | 
46 |     foo = Foo()
47 |     foo.hybridize()
48 |     out = foo(mx.np.array([1, 0, 2, 0, 3, 0]))
49 |     out.wait_to_read()
50 |     out = foo(mx.np.array([0, 0, 0, 0, 0, 0]))
51 |     out.wait_to_read()
52 | 
53 | 
54 | @pytest.mark.xfail(reason='Expected to fail due to MXNet bug https://github.com/apache/'
55 |                           'incubator-mxnet/issues/19659')
56 | def test_gluon_boolean_mask():
57 |     class Foo(HybridBlock):
58 |         def forward(self, data, indices):
59 |             mask = indices < 3
60 |             data = npx.reshape(data, (-1, -2), reverse=True)
61 |             mask = np.reshape(mask, (-1,))
62 |             sel = nd.np._internal.boolean_mask(data, mask)
63 |             return sel
64 |     data = mx.np.random.normal(0, 1, (5, 5, 5, 5, 16))
65 |     indices = mx.np.random.randint(0, 5, (5, 5, 5, 5))
66 |     data.attach_grad()
67 |     indices.attach_grad()
68 |     foo = Foo()
69 |     foo.hybridize()
70 |     with mx.autograd.record():
71 |         out = foo(data, indices)
72 |         out.backward()
73 |     out.wait_to_read()
74 | 
75 | 
76 | def test_basic_dataloader():
77 |     def grouper(iterable, n, fillvalue=None):
78 |         """Collect data into fixed-length chunks or blocks"""
79 |         # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
80 |         args = [iter(iterable)] * n
81 |         return itertools.zip_longest(*args, fillvalue=fillvalue)
82 |     ctx_l = [mx.cpu(i) for i in range(8)]
83 |     dataset = [mx.np.ones((2,)) * i for i in range(1000)]
84 |     dataloader = DataLoader(dataset, 2, num_workers=4, prefetch=10)
85 | 
86 |     for i, data_l in enumerate(grouper(dataloader, len(ctx_l))):
87 |         for data, ctx in zip(data_l, ctx_l):
88 |             if data is None:
89 |                 continue
90 |             data = data.as_in_ctx(ctx)
91 |             mx.npx.waitall()
92 | 


--------------------------------------------------------------------------------
/tests/test_initializer.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | from gluonnlp import initializer
 3 | import mxnet as mx
 4 | from mxnet.gluon import nn
 5 | mx.npx.set_np()
 6 | 
 7 | 
 8 | def test_truncnorm_string_alias_works():
 9 |     try:
10 |         layer = nn.Dense(in_units=1, units=1, weight_initializer='truncnorm')
11 |         layer.initialize()
12 |     except RuntimeError:
13 |         pytest.fail('Layer couldn\'t be initialized')
14 | 
15 | 
16 | def test_truncnorm_all_values_inside_boundaries():
17 |     mean = 0
18 |     std = 0.01
19 |     layer = nn.Dense(in_units=1, units=1000)
20 |     layer.initialize(init=initializer.TruncNorm(mean, std))
21 |     assert (layer.weight.data() <= 2 * std).asnumpy().all()
22 |     assert (layer.weight.data() >= -2 * std).asnumpy().all()
23 | 
24 | 
25 | def test_truncnorm_generates_values_with_defined_mean_and_std():
26 |     from scipy import stats
27 | 
28 |     mean = 10
29 |     std = 5
30 |     layer = nn.Dense(in_units=1, units=100000)
31 |     layer.initialize(init=initializer.TruncNorm(mean, std))
32 |     samples = layer.weight.data().reshape((-1, )).asnumpy()
33 | 
34 |     p_value = stats.kstest(samples, 'truncnorm', args=(-2, 2, mean, std)).pvalue
35 |     assert p_value > 0.0001
36 | 


--------------------------------------------------------------------------------
/tests/test_loss.py:
--------------------------------------------------------------------------------
 1 | import mxnet as mx
 2 | import numpy as np
 3 | import pytest
 4 | from numpy.testing import assert_allclose
 5 | import scipy.special as sspecial
 6 | from gluonnlp.loss import LabelSmoothCrossEntropyLoss
 7 | mx.npx.set_np()
 8 | 
 9 | 
10 | @pytest.mark.parametrize('label_shape', [(5, 3), (3,), (2, 3, 2)])
11 | @pytest.mark.parametrize('alpha', [0.0, 0.1])
12 | @pytest.mark.parametrize('from_logits', [True, False])
13 | @pytest.mark.parametrize('hybridize', [True, False])
14 | def test_label_smoothing(label_shape, alpha, from_logits, hybridize):
15 |     def _np_label_smoothing(pred, labels, alpha, from_logits):
16 |         flatten_pred = pred.reshape((-1, pred.shape[-1]))
17 |         flatten_labels = labels.reshape((-1,))
18 |         smoothed_labels = np.full_like(flatten_pred,
19 |                                        fill_value=alpha / flatten_pred.shape[-1])
20 |         smoothed_labels[np.arange(flatten_pred.shape[0]), flatten_labels]\
21 |             = 1 - alpha + alpha / flatten_pred.shape[-1]
22 |         if not from_logits:
23 |             flatten_logits = np.log(sspecial.softmax(flatten_pred, axis=-1))
24 |         else:
25 |             flatten_logits = flatten_pred
26 |         # Calculate cross-entropy
27 |         loss = - (smoothed_labels * flatten_logits).sum(axis=-1)
28 |         return loss.reshape(labels.shape)
29 |     label_num = 5
30 |     loss = LabelSmoothCrossEntropyLoss(num_labels=label_num, alpha=alpha, from_logits=from_logits)
31 |     if hybridize:
32 |         loss.hybridize()
33 |     if from_logits:
34 |         pred = mx.np.random.uniform(-10, -1, label_shape + (label_num,))
35 |     else:
36 |         pred = mx.np.random.normal(0, 1, label_shape + (label_num,))
37 |     labels = mx.np.random.randint(0, label_num, label_shape)
38 |     out = loss(pred, labels)
39 |     np_out = _np_label_smoothing(pred.asnumpy(), labels.asnumpy(), alpha, from_logits)
40 |     assert_allclose(np_out, out.asnumpy(), 1E-4, 1E-4)
41 | 
42 | 


--------------------------------------------------------------------------------
/tests/test_models_mt5.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import tempfile
 3 | 
 4 | from gluonnlp.models.mt5 import (
 5 |     MT5Model, MT5Inference, mt5_cfg_reg, list_pretrained_mt5, get_pretrained_mt5
 6 | )
 7 | 
 8 | def test_list_pretrained_mt5(): 
 9 |     assert len(list_pretrained_mt5()) > 0
10 | 
11 | 
12 | @pytest.mark.parametrize('cfg_key', mt5_cfg_reg.list_keys())
13 | def test_mt5_model_and_inference(cfg_key, ctx): 
14 |     # since MT5Model, MT5Inference simply inherits the T5Model, T5Inference, 
15 |     # we just want to make sure the model can be properly loaded, and leave 
16 |     # the correctness tests to test_model_t5.py
17 |     with ctx: 
18 |         cfg = MT5Model.get_cfg(cfg_key)
19 |         if cfg_key != 'google_mt5_small': 
20 |             cfg.defrost()
21 |             cfg.MODEL.vocab_size = 256
22 |             cfg.MODEL.d_model = 128
23 |             cfg.MODEL.d_ff = 512
24 |             cfg.MODEL.num_layers = 2
25 |             cfg.MODEL.num_heads = 4
26 |             cfg.freeze()
27 |         mt5_model = MT5Model.from_cfg(cfg)
28 |         mt5_model.initialize()
29 |         mt5_model.hybridize()
30 |         if cfg_key == 'google_mt5_small': 
31 |             inference_model = MT5Inference(mt5_model)
32 |             inference_model.hybridize()
33 | 
34 | 
35 | def test_mt5_get_pretrained(ctx): 
36 |     with tempfile.TemporaryDirectory() as root, ctx: 
37 |         cfg, tokenizer, backbone_params_path, _ = get_pretrained_mt5('google_mt5_small')
38 |         # we exclude <extra_id>s in the comparison below by avoiding len(tokenizer.vocab)
39 |         assert cfg.MODEL.vocab_size >= len(tokenizer._sp_model)
40 |         mt5_model = MT5Model.from_cfg(cfg)
41 |         mt5_model.load_parameters(backbone_params_path)
42 |         mt5_model.hybridize()
43 |         mt5_inference_model = MT5Inference(mt5_model)
44 |         mt5_inference_model.hybridize()
45 | 


--------------------------------------------------------------------------------
/tests/test_models_xlmr.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import numpy as np
 3 | import mxnet as mx
 4 | import tempfile
 5 | from gluonnlp.models.xlmr import XLMRModel, \
 6 |     list_pretrained_xlmr, get_pretrained_xlmr
 7 | from gluonnlp.loss import LabelSmoothCrossEntropyLoss
 8 | 
 9 | mx.npx.set_np()
10 | 
11 | 
12 | def test_list_pretrained_xlmr():
13 |     assert len(list_pretrained_xlmr()) > 0
14 | 
15 | 
16 | # We choose to not test amp for XLMR because it's the same as RoBERTa.
17 | @pytest.mark.slow
18 | @pytest.mark.remote_required
19 | @pytest.mark.parametrize('model_name', list_pretrained_xlmr())
20 | def test_xlmr(model_name, ctx):
21 |     # test from pretrained
22 |     assert len(list_pretrained_xlmr()) > 0
23 |     with ctx:
24 |         with tempfile.TemporaryDirectory() as root:
25 |             cfg, tokenizer, params_path, mlm_params_path =\
26 |                 get_pretrained_xlmr(model_name, load_backbone=True, load_mlm=False, root=root)
27 |             assert cfg.MODEL.vocab_size == len(tokenizer.vocab)
28 |             # test backbone
29 |             xlmr_model = XLMRModel.from_cfg(cfg)
30 |             xlmr_model.load_parameters(params_path)
31 |             # pass the mlm model
32 | 
33 |         # test forward
34 |         batch_size = 1
35 |         seq_length = 4
36 |         vocab_size = len(tokenizer.vocab)
37 |         input_ids = mx.np.array(
38 |             np.random.randint(
39 |                 2,
40 |                 vocab_size,
41 |                 (batch_size, seq_length)
42 |             ),
43 |             dtype=np.int32
44 |         )
45 |         valid_length = mx.np.array(
46 |             np.random.randint(
47 |                 seq_length // 2,
48 |                 seq_length,
49 |                 (batch_size,)
50 |             ),
51 |             dtype=np.int32
52 |         )
53 |         contextual_embeddings, pooled_out = xlmr_model(input_ids, valid_length)
54 |         mx.npx.waitall()
55 |         # test backward
56 |         label_smooth_loss = LabelSmoothCrossEntropyLoss(num_labels=vocab_size)
57 |         with mx.autograd.record():
58 |             contextual_embeddings, pooled_out = xlmr_model(input_ids, valid_length)
59 |             loss = label_smooth_loss(contextual_embeddings, input_ids)
60 |             loss.backward()
61 |         mx.npx.waitall()
62 | 


--------------------------------------------------------------------------------
/tests/test_pytest.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | import pytest
 3 | import numpy as np
 4 | import mxnet as mx
 5 | 
 6 | 
 7 | @pytest.mark.seed(1)
 8 | def test_test():
 9 |     """Test that fixing a random seed works."""
10 |     py_rnd = random.randint(0, 100)
11 |     np_rnd = np.random.randint(0, 100)
12 |     mx_rnd = mx.nd.random_uniform(shape=(1, )).asscalar()
13 | 
14 |     random.seed(1)
15 |     mx.random.seed(1)
16 |     np.random.seed(1)
17 | 
18 |     assert py_rnd == random.randint(0, 100)
19 |     assert np_rnd == np.random.randint(0, 100)
20 |     assert mx_rnd == mx.nd.random_uniform(shape=(1, )).asscalar()
21 | 


--------------------------------------------------------------------------------
/tests/test_utils_preprocessing.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import numpy as np
 3 | from numpy.testing import assert_allclose
 4 | from gluonnlp.utils.preprocessing import get_trimmed_lengths, match_tokens_with_char_spans
 5 | 
 6 | 
 7 | def test_get_trimmed_lengths():
 8 |     for lengths, do_merge, max_length, gt_trimmed_lengths in\
 9 |             [([10, 5, 4, 8], False, 6, [6, 5, 4, 6]),
10 |              ([10, 5, 4, 8], True, 6, [2, 2, 1, 1]),
11 |              ([20], False, 30, [20]),
12 |              ([20], True, 30, [20]),
13 |              ([15, 20], False, 30, [15, 20]),
14 |              ([15, 20], True, 30, [15, 15])]:
15 |         trimmed_lengths = get_trimmed_lengths(lengths,
16 |                                               max_length=max_length,
17 |                                               do_merge=do_merge)
18 |         assert_allclose(trimmed_lengths, np.array(gt_trimmed_lengths))
19 | 
20 | 
21 | def test_match_tokens_with_char_spans():
22 |     token_offsets = np.array([(0, 1), (1, 2), (3, 4), (5, 6)])
23 |     spans = np.array([(0, 3), (4, 6)])
24 |     out = match_tokens_with_char_spans(token_offsets, spans)
25 |     assert_allclose(out, np.array([[0, 2],
26 |                                    [2, 3]]))
27 | 
28 |     token_offsets = np.array([(5, 10), (10, 20), (20, 25), (26, 30)])
29 |     spans = np.array([(0, 3), (4, 6), (10, 30),
30 |                       (22, 23), (15, 25),
31 |                       (10, 35), (36, 38)])
32 |     out = match_tokens_with_char_spans(token_offsets, spans)
33 |     assert_allclose(out, np.array([[0, 0],
34 |                                    [0, 0],
35 |                                    [1, 3],
36 |                                    [2, 2],
37 |                                    [1, 2],
38 |                                    [1, 3],
39 |                                    [3, 3]]))
40 | 


--------------------------------------------------------------------------------
/tests/test_utils_registry.py:
--------------------------------------------------------------------------------
 1 | from gluonnlp.utils.registry import Registry
 2 | 
 3 | 
 4 | def test_registry():
 5 |     MODEL_REGISTRY = Registry('MODEL')
 6 |     @MODEL_REGISTRY.register()
 7 |     class MyModel:
 8 |         def __init__(self, a, b):
 9 |             self.a = a
10 |             self.b = b
11 | 
12 |     @MODEL_REGISTRY.register()
13 |     def my_model():
14 |         return
15 | 
16 |     @MODEL_REGISTRY.register('test_class')
17 |     class MyModelWithNickName:
18 |         def __init__(self, a, b, c):
19 |             self.a = a
20 |             self.b = b
21 |             self.c = c
22 | 
23 |     @MODEL_REGISTRY.register('test_function')
24 |     def my_model_with_nick_name():
25 |         return
26 | 
27 |     class MyModel2:
28 |         pass
29 | 
30 |     MODEL_REGISTRY.register(MyModel2)
31 |     MODEL_REGISTRY.register('my_model2', MyModel2)
32 |     assert MODEL_REGISTRY.list_keys() ==\
33 |            ['MyModel', 'my_model', 'test_class', 'test_function', 'MyModel2', 'my_model2']
34 |     model = MODEL_REGISTRY.create('MyModel', 1, 2)
35 |     assert model.a == 1 and model.b == 2
36 |     model = MODEL_REGISTRY.create('MyModel', a=2, b=3)
37 |     assert model.a == 2 and model.b == 3
38 |     model = MODEL_REGISTRY.create_with_json('MyModel', '[4, 5]')
39 |     assert model.a == 4 and model.b == 5
40 |     model = MODEL_REGISTRY.create_with_json('test_class',
41 |                                             '{"a": 100, "b": 200, "c": 300}')
42 |     assert model.a == 100 and model.b == 200 and model.c == 300
43 |     assert MODEL_REGISTRY.get('test_class') == MyModelWithNickName
44 | 
45 | 
46 | 


--------------------------------------------------------------------------------
/tests/torch/test_layers_torch.py:
--------------------------------------------------------------------------------
 1 | import torch as th
 2 | import numpy as np
 3 | from gluonnlp.torch.layers import SinusoidalPositionalEmbedding
 4 | 
 5 | 
 6 | def test_sinusoidal_pos_embed():
 7 |     embed1 = SinusoidalPositionalEmbedding(128, learnable=False)
 8 |     embed2 = SinusoidalPositionalEmbedding(128, learnable=True)
 9 |     assert len([(name, param) for name, param in embed1.named_parameters()
10 |                 if param.requires_grad]) == 0
11 |     assert len([(name, param) for name, param in embed2.named_parameters()
12 |                 if param.requires_grad]) == 1
13 |     inputs = th.randint(0, 128, (8, 4))
14 |     np.testing.assert_allclose(embed1(inputs).detach().cpu().numpy(),
15 |                                embed2(inputs).detach().cpu().numpy(), 1E-3, 1E-3)
16 | 


--------------------------------------------------------------------------------
/tools/batch/backbone_benchmark/run_batch_backbone_benchmark.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -ex
 4 | INSTANCE_TYPE=${1:-g4dn.2x}
 5 | LOG_PATH=${2:-submit_backbone_benchmark.log}
 6 | SUBMIT_SCRIPT_PATH=$(dirname "$0")/../../../tools/batch/submit-job.py
 7 | 
 8 | python3 ${SUBMIT_SCRIPT_PATH} \
 9 |     --region us-east-1 \
10 |     --source-ref fix_benchmark3 \
11 |     --job-type ${INSTANCE_TYPE} \
12 |     --save-path temp \
13 |     --name test_backbone_benchmark_${INSTANCE_TYPE} \
14 |     --work-dir scripts/benchmarks \
15 |     --remote https://github.com/sxjscience/gluon-nlp/ \
16 |     --command "bash run_backbone_benchmark.sh 2>&1 | tee stdout.log" \
17 |     | perl -pe 's/Submitted job \[([0-9|a-z|_].+)\] to the job queue .+/$1/' \
18 |     | sed -e 's/ - / /g' >> ${LOG_PATH}
19 | 


--------------------------------------------------------------------------------
/tools/batch/batch_states/compile_notebooks.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Shell script for submitting AWS Batch jobs to compile notebooks
 3 | 
 4 | event=$1
 5 | ref=$2
 6 | 
 7 | FAIL=0
 8 | 
 9 | compile_notebook () {
10 |     local MDFILE=$1
11 |     DIR=$(dirname $MDFILE)
12 |     BASENAME=$(basename $MDFILE)
13 |     TARGETNAME=$(dirname $MDFILE)/${BASENAME%.md}.ipynb
14 |     LOGNAME=$(dirname $MDFILE)/${BASENAME%.md}.stdout.log
15 | 
16 |     echo Compiling $BASENAME ...
17 | 
18 |     python3 docs/md2ipynb.py ${MDFILE} &> $LOGNAME
19 | 
20 |     EXIT_CODE=$?
21 | 
22 |     if [ $EXIT_CODE -ne 0 ]; then
23 |         echo Compiling $BASENAME Failed, please download Notebook_Logs in build Artifacts for more details.
24 |     else
25 |         echo Compiling $BASENAME Succeeded
26 |     fi
27 |     exit $EXIT_CODE
28 | }
29 | 
30 | pids=()
31 | 
32 | for f in $(find docs/tutorials -type f -name '*.md' -print); do
33 |     compile_notebook "$f" &
34 |     pids+=($!)
35 | done;
36 | 
37 | for pid in "${pids[@]}"; do
38 |     wait "$pid" || let "FAIL+=1"
39 | done;
40 | 
41 | if [ "$FAIL" == "0" ]; then
42 |     echo Building Website
43 |     make docs_local
44 |     EXIT_CODE=$?
45 |     if [ $EXIT_CODE -ne 0 ]; then
46 |         echo Building Website Failed.
47 |         exit $EXIT_CODE
48 |     else
49 |         echo Building Website Succeeded.
50 |         if [ "$1" == "push" ]; then
51 |             echo "Uploading docs to s3://gluon-nlp/$2/"
52 |             aws s3 sync --delete ./docs/_build/html/ s3://gluon-nlp/$2/ --quiet --acl public-read
53 |         else
54 |             echo "Uploading docs to s3://gluon-nlp-staging/PR$1/$2/"
55 |             aws s3 sync --delete ./docs/_build/html/ s3://gluon-nlp-staging/PR$1/$2/ --quiet --acl public-read
56 |         fi
57 |     fi
58 | else
59 |     exit 1
60 | fi
61 | 


--------------------------------------------------------------------------------
/tools/batch/batch_states/test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Shell script for installing dependencies and running test on AWS Batch
 3 | set -ex
 4 | 
 5 | echo $PWD
 6 | SCRIPTPATH="$( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
 7 | REPODIR="$( readlink -f ${SCRIPTPATH}/../../../../gluon-nlp)"
 8 | 
 9 | python3 -m pip install --upgrade --user pytest pytest-cov contextvars
10 | python3 -m pytest --cov=$REPODIR --cov-config=$REPODIR/.coveragerc --cov-report=xml --durations=50 --device="gpu" --runslow $REPODIR/tests/
11 | 


--------------------------------------------------------------------------------
/tools/batch/batch_states/test_data_pipeline.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Shell script for testing the data preprocessing on AWS Batch
 3 | 
 4 | set -ex
 5 | export PYTHONIOENCODING=utf8
 6 | echo $PWD
 7 | 
 8 | for MODEL in spm yttm
 9 | do
10 |   bash ../../../scripts/datasets/machine_translation/wmt2014_ende.sh ${MODEL}
11 | done
12 | for MODEL in spm yttm
13 | do
14 |   bash ../../../scripts/datasets/machine_translation/wmt2017_zhen.sh ${MODEL}
15 | done
16 | 


--------------------------------------------------------------------------------
/tools/batch/hello_world.py:
--------------------------------------------------------------------------------
 1 | from gluonnlp.data.vocab import Vocab
 2 | import mxnet as mx
 3 | 
 4 | 
 5 | if __name__ == '__main__':
 6 |     vocab = Vocab(['Hello', 'World!'], unk_token=None)
 7 |     print(vocab)
 8 |     num_gpus = mx.context.num_gpus()
 9 |     print('Number of GPUS:', num_gpus)
10 | 
11 | 


--------------------------------------------------------------------------------
/tools/batch/question_answering/parse_squad_results.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import json
 3 | import pandas as pd
 4 | import glob
 5 | import math
 6 | import argparse
 7 | from datetime import datetime
 8 | 
 9 | parser = argparse.ArgumentParser(description='Parse SQuAD results generated by '
10 |                                              '"sync_batch_result.sh" to csv.')
11 | parser.add_argument('--dir', type=str, required=True,
12 |                     help='The basic directory to analyze the results.')
13 | parser.add_argument('--save_path', type=str, default=None, help='The path to save the results.')
14 | args = parser.parse_args()
15 | 
16 | if args.save_path is None:
17 |     args.save_path = os.path.basename(os.path.realpath(args.dir)) + '.csv'
18 | 
19 | base_dir = args.dir
20 | prefix = 'test_squad2_'
21 | 
22 | dat_l = []
23 | datetime_parser = '%Y-%m-%d %H:%M:%S,%f'
24 | 
25 | for folder in sorted(os.listdir(base_dir)):
26 |     if folder.startswith(prefix):
27 |         model_name = folder[len(prefix):]
28 |         log_path_l = glob.glob(os.path.join(base_dir, folder, 'fintune*/finetune*.log'))
29 |         param_path_l = sorted(glob.glob(os.path.join(base_dir, folder, 'fintune*/*.params')))
30 |         if len(param_path_l) == 0 or len(log_path_l) == 0:
31 |             best_f1_threshold = math.nan
32 |             best_exact_threshold = math.nan
33 |             best_f1 = math.nan
34 |             best_em = math.nan
35 |             time_spent_in_hours = math.nan
36 |         else:
37 |             log_path = log_path_l[0]
38 |             result_file = glob.glob(os.path.join(base_dir, folder, 'fintune*/best_results.json'))[0]
39 |             with open(result_file, 'r') as in_f:
40 |                 result_dat = json.load(in_f)
41 |             if 'best_f1_thresh' in result_dat:
42 |                 best_f1_threshold = result_dat['best_f1_thresh']
43 |                 best_exact_threshold = result_dat['best_exact_thresh']
44 |                 best_f1 = result_dat['best_f1']
45 |                 best_em = result_dat['best_exact']
46 |             else:
47 |                 best_f1_threshold = math.nan
48 |                 best_exact_threshold = math.nan
49 |                 best_f1 = result_dat['f1']
50 |                 best_em = result_dat['exact']
51 |             with open(log_path, 'r') as in_f:
52 |                 log_lines = in_f.readlines()
53 |                 start_time_str = ' '.join(log_lines[0].split()[0:2])
54 |                 end_time_str = ' '.join(log_lines[-1].split()[0:2])
55 |                 start_time = datetime.strptime(start_time_str, datetime_parser)
56 |                 end_time = datetime.strptime(end_time_str, datetime_parser)
57 |                 time_spent = end_time - start_time
58 |                 time_spent_in_hours = time_spent.total_seconds() / 3600
59 |         dat_l.append({'name': model_name,
60 |                       'best_f1': best_f1,
61 |                       'best_em': best_em,
62 |                       'best_f1_thresh': best_f1_threshold,
63 |                       'best_em_thresh': best_exact_threshold,
64 |                       'time_spent_in_hours': time_spent_in_hours})
65 | df = pd.DataFrame(dat_l)
66 | print(df)
67 | print('Saving to {}'.format(args.save_path))
68 | df.to_csv(args.save_path)
69 | 


--------------------------------------------------------------------------------
/tools/batch/question_answering/run_batch_squad.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -ex
 4 | 
 5 | USE_HOROVOD=${1:-0}
 6 | VERSION=${2:-2.0}
 7 | LOG_PATH=${3:-submit_squad_v2.log}
 8 | DTYPE=${4:-float32}
 9 | SUBMIT_SCRIPT_PATH=$(dirname "$0")/../../../tools/batch/submit-job.py
10 | 
11 | 
12 | for MODEL_NAME in albert_base \
13 |                   albert_large \
14 |                   albert_xlarge \
15 |                   albert_xxlarge \
16 |                   electra_base \
17 |                   electra_large \
18 |                   electra_small \
19 |                   roberta_large \
20 |                   uncased_bert_base \
21 |                   uncased_bert_large \
22 |                   uncased_bert_wwm_large \
23 |                   gluon_en_cased_bert_base_v1 \
24 |                   mobilebert
25 | do
26 |   python3 ${SUBMIT_SCRIPT_PATH} \
27 |       --region us-east-1 \
28 |       --source-ref master \
29 |       --job-type g4dn.12x \
30 |       --save-path temp \
31 |       --name test_squad2_${MODEL_NAME} \
32 |       --work-dir scripts/question_answering \
33 |       --remote https://github.com/dmlc/gluon-nlp/ \
34 |       --command "bash commands/run_squad2_${MODEL_NAME}.sh ${USE_HOROVOD} ${VERSION} ${DTYPE} | tee stdout.log" \
35 |       | perl -pe 's/Submitted job \[([0-9|a-z|_].+)\] to the job queue .+/$1/' \
36 |       | sed -e 's/ - / /g' >> ${LOG_PATH}
37 | done
38 | 


--------------------------------------------------------------------------------
/tools/batch/run_batch_conversion.sh:
--------------------------------------------------------------------------------
 1 | for MODEL_NAME in bert albert electra mobilebert roberta xlmr bart
 2 | do
 3 |   python3 submit-job.py \
 4 |       --region us-east-1 \
 5 |       --source-ref master \
 6 |       --job-type c5n.4x \
 7 |       --name convert_${MODEL_NAME} \
 8 |       --work-dir scripts/conversion_toolkits \
 9 |       --remote https://github.com/dmlc/gluon-nlp/ \
10 |       --command 'bash convert_'${MODEL_NAME}'.sh | tee stdout.log' >> log.info
11 | done
12 | 


--------------------------------------------------------------------------------
/tools/batch/sync_batch_result.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -ex
 4 | 
 5 | LOG_PATH=$1
 6 | SAVE_DIR_NAME=${2:-squad_2.0}
 7 | 
 8 | while read -r job_name job_id; do
 9 |     aws s3 sync s3://gluon-nlp-dev/batch/${job_id}/temp ${SAVE_DIR_NAME}/${job_name}
10 | done < ${LOG_PATH}
11 | 


--------------------------------------------------------------------------------
/tools/docker/devel_entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | source /start_jupyter.sh
4 | 
5 | exec "$@"
6 | 


--------------------------------------------------------------------------------
/tools/docker/gluon_nlp_job.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | date
 4 | echo "Args: $@"
 5 | env
 6 | echo "jobId: $AWS_BATCH_JOB_ID"
 7 | echo "jobQueue: $AWS_BATCH_JQ_NAME"
 8 | echo "computeEnvironment: $AWS_BATCH_CE_NAME"
 9 | 
10 | SOURCE_REF=$1
11 | WORK_DIR=$2
12 | COMMAND=$3
13 | SAVED_OUTPUT=$4
14 | SAVE_PATH=$5
15 | REMOTE=$6
16 | DEVICE=${7:-gpu}
17 | 
18 | if [ ! -z $REMOTE ]; then
19 |     git remote set-url origin $REMOTE
20 | fi;
21 | 
22 | git fetch origin $SOURCE_REF:working
23 | git checkout working
24 | 
25 | if [ $DEVICE == "cpu" ]; then
26 |   python3 -m pip uninstall --quiet mxnet -y
27 |   python3 -m pip install -U --quiet --pre "mxnet>=2.0.0b20210121" -f https://dist.mxnet.io/python
28 | else
29 |   python3 -m pip uninstall --quiet mxnet-cu102 -y
30 |   python3 -m pip install -U --quiet --pre "mxnet-cu102>=2.0.0a" --user
31 | fi
32 | 
33 | python3 -m pip install --quiet -e .[extras,dev]
34 | 
35 | cd $WORK_DIR
36 | /bin/bash -o pipefail -c "$COMMAND"
37 | COMMAND_EXIT_CODE=$?
38 | if [[ -f $SAVED_OUTPUT ]]; then
39 |   aws s3 cp $SAVED_OUTPUT s3://gluon-nlp-dev/batch/$AWS_BATCH_JOB_ID/$SAVE_PATH --quiet;
40 | elif [[ -d $SAVED_OUTPUT ]]; then
41 |   aws s3 cp --recursive $SAVED_OUTPUT s3://gluon-nlp-dev/batch/$AWS_BATCH_JOB_ID/$SAVE_PATH --quiet;
42 | fi;
43 | exit $COMMAND_EXIT_CODE
44 | 


--------------------------------------------------------------------------------
/tools/docker/install/install_horovod.sh:
--------------------------------------------------------------------------------
1 | set -euo pipefail
2 | 
3 | # Install Horovod
4 | HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL HOROVOD_WITHOUT_GLOO=1 \
5 | HOROVOD_WITH_MPI=1 HOROVOD_WITH_MXNET=1 HOROVOD_WITH_PYTORCH=1 \
6 | HOROVOD_WITHOUT_TENSORFLOW=1 python3 -m pip install --no-cache-dir horovod==0.20.3 --user
7 | # Debug horovod by default
8 | echo NCCL_DEBUG=INFO >> /etc/nccl.conf
9 | 


--------------------------------------------------------------------------------
/tools/docker/install/install_jupyter_lab.sh:
--------------------------------------------------------------------------------
 1 | set -euo pipefail
 2 | 
 3 | # Install NodeJS + Tensorboard + TensorboardX
 4 | 
 5 | curl -sL https://deb.nodesource.com/setup_14.x | bash - \
 6 |     && apt-get install -y nodejs
 7 | 
 8 | apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev
 9 | 
10 | python3 -m pip install --no-cache --upgrade \
11 |     soundfile==0.10.2 \
12 |     ipywidgets==7.5.1 \
13 |     jupyter_tensorboard==0.2.0 \
14 |     widgetsnbextension==3.5.1 \
15 |     tensorboard==2.1.1 \
16 |     tensorboardX==2.1 --user
17 | jupyter labextension install jupyterlab_tensorboard \
18 |    && jupyter nbextension enable --py widgetsnbextension \
19 |    && jupyter labextension install @jupyter-widgets/jupyterlab-manager
20 | 
21 | # Revise default shell to /bin/bash
22 | jupyter notebook --generate-config \
23 |   && echo "c.NotebookApp.terminado_settings = { 'shell_command': ['/bin/bash'] }" >> /root/.jupyter/jupyter_notebook_config.py
24 | 


--------------------------------------------------------------------------------
/tools/docker/install/install_llvm.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Licensed to the Apache Software Foundation (ASF) under one
 3 | # or more contributor license agreements.  See the NOTICE file
 4 | # distributed with this work for additional information
 5 | # regarding copyright ownership.  The ASF licenses this file
 6 | # to you under the Apache License, Version 2.0 (the
 7 | # "License"); you may not use this file except in compliance
 8 | # with the License.  You may obtain a copy of the License at
 9 | #
10 | #   http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing,
13 | # software distributed under the License is distributed on an
14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 | # KIND, either express or implied.  See the License for the
16 | # specific language governing permissions and limitations
17 | # under the License.
18 | 
19 | set -e
20 | set -u
21 | set -o pipefail
22 | 
23 | echo deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-9 main\
24 |      >> /etc/apt/sources.list.d/llvm.list
25 | echo deb-src http://apt.llvm.org/bionic/ llvm-toolchain-bionic-9 main\
26 |      >> /etc/apt/sources.list.d/llvm.list
27 | 
28 | 
29 | echo deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-10 main\
30 |      >> /etc/apt/sources.list.d/llvm.list
31 | echo deb-src http://apt.llvm.org/bionic/ llvm-toolchain-bionic-10 main\
32 |      >> /etc/apt/sources.list.d/llvm.list
33 | 
34 | echo deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-11 main\
35 |      >> /etc/apt/sources.list.d/llvm.list
36 | echo deb-src http://apt.llvm.org/bionic/ llvm-toolchain-bionic-11 main\
37 |      >> /etc/apt/sources.list.d/llvm.list
38 | 
39 | echo deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic main\
40 |      >> /etc/apt/sources.list.d/llvm.list
41 | echo deb-src http://apt.llvm.org/bionic/ llvm-toolchain-bionic main\
42 |      >> /etc/apt/sources.list.d/llvm.list
43 | 
44 | wget -q -O - http://apt.llvm.org/llvm-snapshot.gpg.key|apt-key add -
45 | apt-get update && apt-get install -y llvm-9 llvm-10 llvm-11 clang-9 clang-10 clang-11
46 | 


--------------------------------------------------------------------------------
/tools/docker/install/install_openmpi.sh:
--------------------------------------------------------------------------------
 1 | set -euo pipefail
 2 | 
 3 | mkdir /tmp/openmpi \
 4 |  && cd /tmp/openmpi \
 5 |  && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \
 6 |  && tar zxf openmpi-4.0.1.tar.gz \
 7 |  && cd openmpi-4.0.1 \
 8 |  && ./configure --enable-orterun-prefix-by-default \
 9 |  && make -j $(nproc) all \
10 |  && make install \
11 |  && ldconfig \
12 |  && rm -rf /tmp/openmpi
13 | 
14 | # Create a wrapper for OpenMPI to allow running as root by default
15 | mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \
16 |  && echo '#!/bin/bash' > /usr/local/bin/mpirun \
17 |  && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \
18 |  && chmod a+x /usr/local/bin/mpirun
19 | 
20 | echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \
21 |  && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf
22 | 


--------------------------------------------------------------------------------
/tools/docker/install/install_python_packages.sh:
--------------------------------------------------------------------------------
 1 | set -euo pipefail
 2 | 
 3 | 
 4 | python3 -m pip --no-cache-dir install --upgrade \
 5 |     pip \
 6 |     setuptools \
 7 |     wheel
 8 | 
 9 | # python-dateutil==2.8.0 to satisfy botocore associated with latest awscli
10 | python3 -m pip install --no-cache --upgrade \
11 |     numpy==1.19.1 \
12 |     pandas==0.25.1 \
13 |     cython \
14 |     pytest \
15 |     pytest-cov \
16 |     Pillow \
17 |     requests==2.22.0 \
18 |     scikit-learn==0.20.4 \
19 |     scipy==1.2.2 \
20 |     urllib3==1.25.8 \
21 |     python-dateutil==2.8.0 \
22 |     sagemaker-experiments==0.* \
23 |     PyYAML==5.3.1 \
24 |     mpi4py==3.0.2 \
25 |     jupyterlab==2.2.4 \
26 |     contextvars \
27 |     cmake \
28 |     awscli --user
29 | 


--------------------------------------------------------------------------------
/tools/docker/install/install_tvm_cpu.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Licensed to the Apache Software Foundation (ASF) under one
 3 | # or more contributor license agreements.  See the NOTICE file
 4 | # distributed with this work for additional information
 5 | # regarding copyright ownership.  The ASF licenses this file
 6 | # to you under the Apache License, Version 2.0 (the
 7 | # "License"); you may not use this file except in compliance
 8 | # with the License.  You may obtain a copy of the License at
 9 | #
10 | #   http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing,
13 | # software distributed under the License is distributed on an
14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 | # KIND, either express or implied.  See the License for the
16 | # specific language governing permissions and limitations
17 | # under the License.
18 | 
19 | set -e
20 | set -u
21 | set -o pipefail
22 | 
23 | cd ${WORKDIR}
24 | git clone https://github.com/apache/incubator-tvm tvm --recursive
25 | cd ${WORKDIR}/tvm
26 | # checkout a hash-tag
27 | git checkout bf862d4c4355eae4f18d89b3b6b98ed0a2c18e9c
28 | 
29 | mkdir -p build
30 | cp cmake/config.cmake build
31 | echo set\(USE_LLVM llvm-config-10\) >> build/config.cmake
32 | echo set\(USE_GRAPH_EXECUTOR ON\) >> build/config.cmake
33 | echo set\(USE_BLAS openblas\) >> build/config.cmake
34 | 
35 | cd build
36 | cmake .. -GNinja
37 | ninja
38 | 
39 | # install python binding
40 | cd ..
41 | cd python
42 | python3 -m pip install -U -e . --user
43 | 


--------------------------------------------------------------------------------
/tools/docker/install/install_tvm_gpu.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Licensed to the Apache Software Foundation (ASF) under one
 3 | # or more contributor license agreements.  See the NOTICE file
 4 | # distributed with this work for additional information
 5 | # regarding copyright ownership.  The ASF licenses this file
 6 | # to you under the Apache License, Version 2.0 (the
 7 | # "License"); you may not use this file except in compliance
 8 | # with the License.  You may obtain a copy of the License at
 9 | #
10 | #   http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing,
13 | # software distributed under the License is distributed on an
14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 | # KIND, either express or implied.  See the License for the
16 | # specific language governing permissions and limitations
17 | # under the License.
18 | 
19 | set -e
20 | set -u
21 | set -o pipefail
22 | 
23 | cd ${WORKDIR}
24 | git clone https://github.com/apache/incubator-tvm tvm --recursive
25 | cd ${WORKDIR}/tvm
26 | # checkout a hash-tag
27 | git checkout bf862d4c4355eae4f18d89b3b6b98ed0a2c18e9c
28 | 
29 | 
30 | mkdir -p build
31 | cp cmake/config.cmake build
32 | echo set\(USE_LLVM llvm-config-10\) >> build/config.cmake
33 | echo set\(USE_CUDA ON\) >> build/config.cmake
34 | echo set\(USE_CUDNN ON\) >> build/config.cmake
35 | echo set\(USE_CUBLAS ON\) >> build/config.cmake
36 | echo set\(USE_GRAPH_EXECUTOR ON\) >> build/config.cmake
37 | echo set\(USE_BLAS openblas\) >> build/config.cmake
38 | 
39 | cd build
40 | cmake -GNinja -DCUDA_CUBLAS_LIBRARY=/usr/lib/x86_64-linux-gnu/libcublas.so ..
41 | ninja
42 | 
43 | # install python binding
44 | cd ..
45 | cd python
46 | python3 -m pip install -U -e . --user
47 | 


--------------------------------------------------------------------------------
/tools/docker/install/install_ubuntu18.04_core.sh:
--------------------------------------------------------------------------------
 1 | set -e
 2 | set -u
 3 | set -o pipefail
 4 | 
 5 | export DEBIAN_FRONTEND=noninteractive
 6 | 
 7 | rm -rf /var/lib/apt/lists/* \
 8 |  && apt-get clean \
 9 |  && apt-get update \
10 |  && apt-get install -y --no-install-recommends \
11 |     software-properties-common \
12 |     build-essential \
13 |     ca-certificates \
14 |     curl \
15 |     emacs \
16 |     subversion \
17 |     locales \
18 |     cmake \
19 |     git \
20 |     libopencv-dev \
21 |     htop \
22 |     vim \
23 |     wget \
24 |     unzip \
25 |     less \
26 |     libopenblas-dev \
27 |     gpg-agent \
28 |     ninja-build \
29 |     openssh-client \
30 |     openssh-server \
31 |     python3-dev \
32 |     python3-pip \
33 |     python3-setuptools \
34 |     libxft-dev \
35 |     zlib1g-dev \
36 |  && apt-get clean \
37 |  && rm -rf /var/lib/apt/lists/*
38 | 
39 | ln -s $(which python3) /usr/local/bin/python
40 | 


--------------------------------------------------------------------------------
/tools/docker/start_jupyter.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Run Jupyter in foreground if $JUPYTER_FG is set
 4 | if [[ "${JUPYTER_FG}" == "true" ]]; then
 5 |    jupyter-lab --allow-root --ip=0.0.0.0 --no-browser --NotebookApp.token=''
 6 |    exit 0
 7 | else
 8 |    nohup jupyter-lab --allow-root --ip=0.0.0.0 --no-browser --NotebookApp.token='' > /dev/null 2>&1 &
 9 | 
10 |    echo "Notebook server successfully started, a JupyterLab instance has been executed!"
11 |    echo "Make local folders visible by volume mounting to /workspace/notebook"
12 |    echo "To access visit http://localhost:8888 on your host machine."
13 |    echo 'Ensure the following arguments to "docker run" are added to expose the server ports to your host machine:
14 |       -p 8888:8888 -p 8787:8787 -p 8786:8786'
15 | fi
16 | 


--------------------------------------------------------------------------------
/tools/docker/ubuntu18.04-cpu.Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM ubuntu:18.04 as base
 2 | 
 3 | LABEL maintainer="GluonNLP Team"
 4 | COPY install /install
 5 | 
 6 | ENV PYTHONDONTWRITEBYTECODE=1 \
 7 |     PYTHONUNBUFFERED=1 \
 8 |     LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib" \
 9 |     PYTHONIOENCODING=UTF-8 \
10 |     LANG=C.UTF-8 \
11 |     LC_ALL=C.UTF-8
12 | 
13 | ENV WORKDIR=/workspace
14 | ENV SHELL=/bin/bash
15 | 
16 | RUN mkdir -p ${WORKDIR}
17 | 
18 | 
19 | RUN bash /install/install_ubuntu18.04_core.sh
20 | 
21 | # Install Open MPI
22 | RUN bash /install/install_openmpi.sh
23 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH
24 | ENV PATH=/usr/local/openmpi/bin/:/usr/local/bin:/root/.local/bin:$PATH
25 | 
26 | # Install LLVM
27 | RUN bash /install/install_llvm.sh
28 | 
29 | # Install Python Packages
30 | RUN bash /install/install_python_packages.sh
31 | 
32 | # Install TVM
33 | RUN bash /install/install_tvm_cpu.sh
34 | 
35 | # Install MXNet
36 | RUN python3 -m pip install -U --pre "mxnet>=2.0.0a" --user
37 | 
38 | # Install PyTorch
39 | RUN python3 -m pip install "torch==1.7.1+cpu" torchvision -f https://download.pytorch.org/whl/torch_stable.html
40 | 
41 | # Install Jupyter Lab
42 | RUN bash /install/install_jupyter_lab.sh
43 | 
44 | RUN mkdir -p ${WORKDIR}/data
45 | RUN mkdir -p /.init
46 | RUN cd ${WORKDIR} \
47 |    && git clone https://github.com/dmlc/gluon-nlp \
48 |    && cd gluon-nlp \
49 |    && git checkout master \
50 |    && python3 -m pip install -U -e ."[extras,dev]"
51 | 
52 | 
53 | # Stage-CI
54 | FROM base as ci
55 | WORKDIR ${WORKDIR}/gluon-nlp
56 | ADD gluon_nlp_job.sh .
57 | RUN chmod +x gluon_nlp_job.sh
58 | 
59 | 
60 | # Stage-Devel
61 | FROM base as devel
62 | COPY start_jupyter.sh /start_jupyter.sh
63 | COPY devel_entrypoint.sh /devel_entrypoint.sh
64 | RUN chmod +x /devel_entrypoint.sh
65 | 
66 | EXPOSE 8888
67 | EXPOSE 8787
68 | EXPOSE 8786
69 | 
70 | WORKDIR ${WORKDIR}
71 | 
72 | # Add Tini
73 | ARG TINI_VERSION=v0.19.0
74 | ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
75 | RUN chmod +x /tini
76 | ENTRYPOINT [ "/tini", "--", "/devel_entrypoint.sh" ]
77 | CMD ["/bin/bash"]
78 | 


--------------------------------------------------------------------------------
/tools/docker/ubuntu18.04-gpu.Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 as base
 2 | 
 3 | LABEL maintainer="GluonNLP Team"
 4 | COPY install /install
 5 | 
 6 | ENV PYTHONDONTWRITEBYTECODE=1 \
 7 |     PYTHONUNBUFFERED=1 \
 8 |     LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib" \
 9 |     PYTHONIOENCODING=UTF-8 \
10 |     LANG=C.UTF-8 \
11 |     LC_ALL=C.UTF-8
12 | 
13 | ENV WORKDIR=/workspace
14 | ENV SHELL=/bin/bash
15 | 
16 | RUN mkdir -p ${WORKDIR}
17 | 
18 | RUN bash /install/install_ubuntu18.04_core.sh
19 | 
20 | # Install Open MPI
21 | RUN bash /install/install_openmpi.sh
22 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH
23 | ENV PATH=/usr/local/openmpi/bin/:/usr/local/bin:/root/.local/bin:/usr/bin:$PATH
24 | 
25 | # Install LLVM
26 | RUN bash /install/install_llvm.sh
27 | 
28 | # Install Python Packages
29 | RUN bash /install/install_python_packages.sh
30 | 
31 | # Install TVM
32 | RUN bash /install/install_tvm_gpu.sh
33 | 
34 | # Install MXNet
35 | RUN python3 -m pip install -U --pre "mxnet-cu102>=2.0.0a" --user
36 | 
37 | # Install PyTorch
38 | RUN python3 -m pip install "torch==1.8.1+cu102" torchvision -f https://download.pytorch.org/whl/torch_stable.html
39 | 
40 | # Install Horovod
41 | RUN bash /install/install_horovod.sh
42 | 
43 | # Install Jupyter Lab
44 | RUN bash /install/install_jupyter_lab.sh
45 | 
46 | RUN mkdir -p ${WORKDIR}/data
47 | RUN mkdir -p /.init
48 | RUN cd ${WORKDIR} \
49 |    && git clone https://github.com/dmlc/gluon-nlp \
50 |    && cd gluon-nlp \
51 |    && git checkout master \
52 |    && python3 -m pip install -U -e ."[extras,dev]"
53 | 
54 | # Stage-CI
55 | FROM base as ci
56 | WORKDIR ${WORKDIR}/gluon-nlp
57 | ADD gluon_nlp_job.sh .
58 | RUN chmod +x gluon_nlp_job.sh
59 | 
60 | # Stage-Devel
61 | FROM base as devel
62 | COPY start_jupyter.sh /start_jupyter.sh
63 | COPY devel_entrypoint.sh /devel_entrypoint.sh
64 | RUN chmod +x /devel_entrypoint.sh
65 | 
66 | EXPOSE 8888
67 | EXPOSE 8787
68 | EXPOSE 8786
69 | 
70 | WORKDIR ${WORKDIR}
71 | 
72 | # Add Tini
73 | ARG TINI_VERSION=v0.19.0
74 | ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
75 | RUN chmod +x /tini
76 | ENTRYPOINT [ "/tini", "--", "/devel_entrypoint.sh" ]
77 | CMD ["/bin/bash"]
78 | 


--------------------------------------------------------------------------------