├── .codecov.yml
├── .github
├── dependabot.yml
└── workflows
│   ├── create-release.yml
│   └── mlbench-core.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .pyup.yml
├── CHANGELOG.md
├── DEVELOPMENT.md
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.md
├── docs
├── Makefile
├── _static
│   └── css
│   │   └── custom.css
├── aggregation.rst
├── api.rst
├── clientapi.rst
├── conf.py
├── controlflow.rst
├── dataset.rst
├── evaluation.rst
├── examples.rst
├── gensidebar.py
├── index.rst
├── lr_scheduler.bib
├── lr_scheduler.rst
├── models.bib
├── models.rst
├── optim.bib
├── optim.rst
├── requirements.txt
└── utils.rst
├── mlbench_core
├── __init__.py
├── aggregation
│   ├── __init__.py
│   └── pytorch
│   │   ├── __init__.py
│   │   ├── aggregation.py
│   │   ├── centralized.py
│   │   └── decentralized.py
├── api.py
├── cli
│   ├── __init__.py
│   ├── aws_utils.py
│   ├── chartbuilder.py
│   ├── cli.py
│   ├── gcloud_utils.py
│   ├── kind_utils.py
│   └── utils.py
├── controlflow
│   ├── __init__.py
│   ├── pytorch
│   │   ├── __init__.py
│   │   ├── checkpoints_evaluation.py
│   │   ├── controlflow.py
│   │   ├── helpers.py
│   │   └── train_validation.py
│   └── tensorflow
│   │   ├── __init__.py
│   │   └── train_validation.py
├── dataset
│   ├── __init__.py
│   ├── imagerecognition
│   │   ├── __init__.py
│   │   ├── pytorch
│   │   │   ├── __init__.py
│   │   │   └── dataloader.py
│   │   └── tensorflow
│   │   │   ├── __init__.py
│   │   │   └── cifar10.py
│   ├── linearmodels
│   │   ├── __init__.py
│   │   └── pytorch
│   │   │   ├── __init__.py
│   │   │   └── dataloader.py
│   ├── nlp
│   │   ├── __init__.py
│   │   └── pytorch
│   │   │   ├── __init__.py
│   │   │   ├── wikitext2_dataset.py
│   │   │   ├── wmt16
│   │   │   ├── __init__.py
│   │   │   ├── preprocess
│   │   │   │   ├── download_dataset.sh
│   │   │   │   ├── filter_dataset.py
│   │   │   │   └── preprocess.py
│   │   │   ├── utils.py
│   │   │   ├── wmt16_config.py
│   │   │   └── wmt16_tokenizer.py
│   │   │   ├── wmt16_dataset.py
│   │   │   ├── wmt17
│   │   │   ├── __init__.py
│   │   │   ├── batching.py
│   │   │   ├── collate.py
│   │   │   ├── preprocess
│   │   │   │   ├── __init__.py
│   │   │   │   ├── indexed_dataset.py
│   │   │   │   ├── newstest2014.de
│   │   │   │   ├── newstest2014.en
│   │   │   │   ├── preprocess.py
│   │   │   │   ├── reference_dictionary.ende.txt
│   │   │   │   └── sub_tokenizer.py
│   │   │   └── wmt17_dictionary.py
│   │   │   └── wmt17_dataset.py
│   └── util
│   │   ├── __init__.py
│   │   ├── pytorch
│   │   ├── __init__.py
│   │   ├── libsvm.py
│   │   └── partition.py
│   │   └── tools.py
├── evaluation
│   ├── __init__.py
│   ├── goals.py
│   ├── pytorch
│   │   ├── __init__.py
│   │   ├── criterion.py
│   │   └── metrics.py
│   └── tensorflow
│   │   ├── __init__.py
│   │   ├── criterion.py
│   │   └── metrics.py
├── install_cuda_extensions.py
├── lr_scheduler
│   ├── __init__.py
│   ├── pytorch
│   │   ├── __init__.py
│   │   └── lr.py
│   └── tensorflow
│   │   ├── __init__.py
│   │   └── lr.py
├── models
│   ├── __init__.py
│   ├── pytorch
│   │   ├── __init__.py
│   │   ├── gnmt
│   │   │   ├── __init__.py
│   │   │   ├── attention.py
│   │   │   ├── attn_score
│   │   │   │   ├── attn_score_cuda.cpp
│   │   │   │   └── attn_score_cuda_kernel.cu
│   │   │   ├── decoder.py
│   │   │   ├── encoder.py
│   │   │   ├── models.py
│   │   │   ├── translator.py
│   │   │   └── utils.py
│   │   ├── language_models
│   │   │   ├── __init__.py
│   │   │   └── lstm.py
│   │   ├── layers
│   │   │   ├── __init__.py
│   │   │   └── dropout_layers.py
│   │   ├── linear_models.py
│   │   ├── resnet.py
│   │   ├── transformer
│   │   │   ├── __init__.py
│   │   │   ├── decoder.py
│   │   │   ├── encoder.py
│   │   │   ├── modules
│   │   │   │   ├── __init__.py
│   │   │   │   ├── embeddings.py
│   │   │   │   ├── layers.py
│   │   │   │   ├── multihead_attention.py
│   │   │   │   └── strided_batched_gemm
│   │   │   │   │   ├── strided_batched_gemm.cpp
│   │   │   │   │   └── strided_batched_gemm_cuda.cu
│   │   │   ├── sequence_generator.py
│   │   │   └── transformer.py
│   │   └── vgg.py
│   └── tensorflow
│   │   ├── __init__.py
│   │   └── resnet_model.py
├── optim
│   ├── __init__.py
│   └── pytorch
│   │   ├── __init__.py
│   │   ├── centralized.py
│   │   ├── decentralized.py
│   │   ├── fp_optimizers.py
│   │   └── optim.py
└── utils
│   ├── __init__.py
│   ├── log_metrics.py
│   ├── pytorch
│   ├── __init__.py
│   ├── checkpoint.py
│   ├── distributed.py
│   ├── helpers.py
│   ├── inference
│   │   ├── __init__.py
│   │   └── beam_search.py
│   ├── topology.py
│   └── utils.py
│   ├── task_args.py
│   ├── tensorflow
│   └── __init__.py
│   └── tracker.py
├── requirements.txt
├── setup.cfg
├── setup.py
├── tests
├── __init__.py
├── test_api.py
├── test_cli.py
├── test_gcloud_cli.py
├── test_python_optim.py
├── test_pytorch_controlflow.py
├── test_pytorch_helpers.py
├── test_pytorch_metrics.py
├── test_pytorch_models.py
├── test_pytorch_schedulers.py
├── test_pytorch_utils.py
└── test_utils.py
└── tox.ini
--------------------------------------------------------------------------------
/.codecov.yml:
--------------------------------------------------------------------------------
1 | coverage:
2 | status:
3 | patch:
4 | default:
5 | target: 1%
6 | project:
7 | default:
8 | threshold: 30%
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | # To get started with Dependabot version updates, you'll need to specify which
2 | # package ecosystems to update and where the package manifests are located.
3 | # Please see the documentation for all configuration options:
4 | # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
5 | 
6 | version: 2
7 | updates:
8 | - package-ecosystem: "pip" # See documentation for possible values
9 | directory: "/" # Location of package manifests
10 | schedule:
11 | interval: "monthly"
12 | 
13 | labels:
14 | - "dependencies"
15 | reviewers:
16 | - "ehoelzl"
17 | - "mmilenkoski"
18 | 
19 | - package-ecosystem: "pip" # See documentation for possible values
20 | directory: "/docs" # Location of package manifests
21 | schedule:
22 | interval: "monthly"
23 | 
24 | labels:
25 | - "dependencies"
26 | reviewers:
27 | - "ehoelzl"
28 | - "mmilenkoski"
29 | 
--------------------------------------------------------------------------------
/.github/workflows/create-release.yml:
--------------------------------------------------------------------------------
1 | # This is a basic workflow that is manually triggered
2 | 
3 | name: Create a new MLBench release
4 | 
5 | # Controls when the action will run. Workflow runs when manually triggered using the UI
6 | # or API.
7 | on:
8 | workflow_dispatch:
9 | # Inputs the workflow accepts.
10 | inputs:
11 | patch:
12 | # Friendly description to be shown in the UI instead of 'name'
13 | description: 'Patch [major].[minor].[patch]-[dev]'
14 | # Input has to be provided for the workflow to run
15 | required: true
16 | futureRelease:
17 | description: 'Future release number'
18 | required: true
19 | 
20 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel
21 | jobs:
22 | # This workflow contains a single job called "create-release"
23 | create-release:
24 | # The type of runner that the job will run on
25 | runs-on: ubuntu-latest
26 | 
27 | # Steps represent a sequence of tasks that will be executed as part of the job
28 | steps:
29 | # Create release branch
30 | - name: Create Release Branch
31 | uses: peterjgrainger/action-create-branch@v2.0.1
32 | env:
33 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
34 | with:
35 | branch: release/v${{ github.event.inputs.futureRelease }}
36 | # Checkout Branch
37 | - name: Checkout release branch
38 | uses: actions/checkout@v2.3.4
39 | env:
40 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
41 | with:
42 | ref: release/v${{ github.event.inputs.futureRelease }}
43 | # Bump version
44 | - name: Setup Python
45 | uses: actions/setup-python@v2
46 | with:
47 | python-version: 3.7
48 | - name: Bump version
49 | run: pip install bumpversion && bumpversion --allow-dirty --no-tag --no-commit ${{ github.event.inputs.patch }}
50 | # Generate changelog
51 | - name: Find Latest Tag
52 | # You may pin to the exact commit or the version.
53 | # uses: oprypin/find-latest-tag@cc85180adff5be91282940868529accfc5ab40a7
54 | uses: oprypin/find-latest-tag@v1.0.4
55 | with:
56 | repository: mlbench/mlbench-core
57 | id: previousTag
58 | - name: Generate Changelog using github-changelog-generator
59 | # You may pin to the exact commit or the version.
60 | # uses: faberNovel/github-changelog-generator-action@5fcc510347703c66014a0d54c2c6dfb6c1851eaa 61 | uses: faberNovel/github-changelog-generator-action@v1.0.0-alpha02 62 | with: 63 | options: -u mlbench -p mlbench-core -t ${{ secrets.GITHUB_TOKEN }} \ 64 | --release-branch release/v${{ github.event.inputs.futureRelease}} --future-release v${{ github.event.inputs.futureRelease }} \ 65 | --since-tag ${{ steps.previousTag.outputs.tag }} --base CHANGELOG.md 66 | - name: Commit Changes 67 | uses: stefanzweifel/git-auto-commit-action@v4.7.2 68 | with: 69 | commit_message: 'Bump version and update Changelog' 70 | 71 | -------------------------------------------------------------------------------- /.github/workflows/mlbench-core.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: mlbench-core 5 | 6 | on: [push] 7 | 8 | jobs: 9 | build: 10 | 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | python: [3.7] 15 | 16 | steps: 17 | - uses: actions/checkout@v2 18 | - name: Setup Python 19 | uses: actions/setup-python@v2 20 | with: 21 | python-version: ${{ matrix.python }} 22 | - name: Install Tox and any other packages 23 | run: pip install tox 24 | - name: Lint Check 25 | # Run tox using the version of Python in `PATH` 26 | run: TOXENV=lint python -m tox 27 | - name: tests 28 | run: TOXENV=py37 python -m tox 29 | - name: docs 30 | run: TOXENV=docs python -m tox 31 | - name: Upload coverage to Codecov 32 | uses: codecov/codecov-action@v1.0.15 33 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | *.inc 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # dotenv 85 | .env 86 | 87 | # virtualenv 88 | .venv 89 | venv/ 90 | ENV/ 91 | 92 | # Spyder project settings 93 | .spyderproject 94 | .spyproject 95 | 96 | # Rope project settings 97 | .ropeproject 98 | 99 | # mkdocs documentation 100 | /site 101 | 102 | # mypy 103 | .mypy_cache/ 104 | 105 | #vscode 106 | .vscode 107 | /docs/mlbench.rst 108 | /docs/modules.rst 109 | /docs/refimpls/ 110 | 111 | # helm 112 | **/charts/*.tgz 113 | myvalues.yaml 114 | 115 | setup_telepresence.sh -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/psf/black 3 | rev: stable 4 | hooks: 5 | - id: black 6 | language_version: python3.7 7 | 8 | - repo: https://github.com/asottile/seed-isort-config 9 | rev: v2.1.1 10 | hooks: 11 | - id: seed-isort-config 12 | 13 | - repo: https://github.com/timothycrosley/isort 14 | rev: 4.3.21 15 | hooks: 16 | - id: isort 17 | -------------------------------------------------------------------------------- /.pyup.yml: -------------------------------------------------------------------------------- 1 | # configure updates globally 2 | # default: all 3 | # allowed: all, insecure, False 4 | update: all 5 | 6 | # configure dependency pinning globally 7 | # default: True 8 | # allowed: True, False 9 | pin: True 10 | 11 | # set the default branch 12 | # default: empty, the default branch on GitHub 13 | branch: develop 14 | 15 | # update schedule 16 | # default: empty 17 | # allowed: "every day", "every week", .. 
18 | schedule: "every day"
19 | 
20 | # search for requirement files
21 | # default: True
22 | # allowed: True, False
23 | search: True
24 | 
25 | # Specify requirement files by hand, default is empty
26 | # default: empty
27 | # allowed: list
28 | # requirements:
29 | # - requirements/staging.txt:
30 | # # update all dependencies and pin them
31 | # update: all
32 | # pin: True
33 | # - requirements/dev.txt:
34 | # # don't update dependencies, use global 'pin' default
35 | # update: False
36 | # - requirements/prod.txt:
37 | # # update insecure only, pin all
38 | # update: insecure
39 | # pin: True
40 | 
41 | # add a label to pull requests, default is not set
42 | # requires private repo permissions, even on public repos
43 | # default: empty
44 | #label_prs: update
45 | 
46 | # assign users to pull requests, default is not set
47 | # requires private repo permissions, even on public repos
48 | # default: empty
49 | # assignees:
50 | # - carl
51 | # - carlsen
52 | 
53 | # configure the branch prefix the bot is using
54 | # default: pyup-
55 | # branch_prefix: pyup/
56 | 
57 | # set a global prefix for PRs
58 | # default: empty
59 | # pr_prefix: "Bug #12345"
60 | 
61 | # allow to close stale PRs
62 | # default: True
63 | # close_prs: True
--------------------------------------------------------------------------------
/DEVELOPMENT.md:
--------------------------------------------------------------------------------
1 | Developer Docs
2 | ==============
3 | 
4 | 
5 | Local Dev Setup
6 | ---------------
7 | - Clone the repo locally
8 | - Install dependencies with `pip install .[test]`
9 | - Setup pre-commit hooks using `pre-commit install`
10 | 
11 | Tests can be run using the `tox` or `pytest` commands.
12 | 
13 | Docs can be built using the `make docs` command.
14 | 
15 | 
16 | Making a release
17 | ----------------
18 | 
19 | Steps to make a release:
20 | 
21 | - Create a development branch based on current `develop`, named `release/vX.X.X` (e.g. `release/2.4.1`)
22 | - Use bumpversion to bump the version, e.g. `bumpversion --verbose --no-commit --no-tag minor` to bump the minor version (`major`, `minor`, `patch` and `dev` are supported)
23 | - Generate the new changelog (based on GitHub issues) like `github_changelog_generator -u mlbench -p mlbench-core -t <token> --release-branch release/2.4.1 --future-release 2.4.1 --base CHANGELOG.md` (use a valid `<token>`; the tool can be
24 | found here https://github.com/github-changelog-generator/github-changelog-generator)
25 | Convert the resulting Changelog.md file to *.rst with a tool like https://cloudconvert.com/md-to-rst . Use this to update the `changelog.rst` in the `mlbench-docs` repo.
26 | - Commit the changes and merge the `release/X.X.X` branch into both `master` and `develop`, then push with `git push`.
27 | - Create a tag of the master version using `git tag -m "Release X.X.X" vX.X.X` and push it with `git push --tags`
28 | - Build with `python setup.py sdist bdist_wheel` (delete `dist/` before building) and then upload to PyPI with `twine upload dist/*`
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include CHANGELOG.md
2 | include LICENSE
3 | 
4 | recursive-include tests *
5 | recursive-include * *.cpp
6 | recursive-include * *.cu
7 | recursive-exclude * __pycache__
8 | recursive-exclude * *.py[co]
9 | 
10 | recursive-include docs *.rst conf.py Makefile *.jpg *.png *.gif
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: clean clean-test clean-pyc clean-build docs help
2 | .DEFAULT_GOAL := help
3 | 
4 | define BROWSER_PYSCRIPT
5 | import os, webbrowser, sys
6 | 
7 | try:
8 | from urllib import pathname2url
9 | except:
10 | from urllib.request import pathname2url
11 | 
12 | webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1])))
13 | endef
14 | export BROWSER_PYSCRIPT
15 | 
16 | define PRINT_HELP_PYSCRIPT
17 | import re, sys
18 | 
19 | for line in sys.stdin:
20 | match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line)
21 | if match:
22 | target, help = match.groups()
23 | print("%-20s %s" % (target, help))
24 | endef
25 | export PRINT_HELP_PYSCRIPT
26 | 
27 | BROWSER := python -c "$$BROWSER_PYSCRIPT"
28 | 
29 | help:
30 | @python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST)
31 | 
32 | clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts
33 | 
34 | clean-build: ## remove build artifacts
35 | rm -fr build/
36 | rm -fr dist/
37 | rm -fr .eggs/
38 | find . -name '*.egg-info' -exec rm -fr {} +
39 | find . -name '*.egg' -exec rm -f {} +
40 | 
41 | clean-pyc: ## remove Python file artifacts
42 | find . -name '*.pyc' -exec rm -f {} +
43 | find . -name '*.pyo' -exec rm -f {} +
44 | find . -name '*~' -exec rm -f {} +
45 | find . -name '__pycache__' -exec rm -fr {} +
46 | 
47 | clean-test: ## remove test and coverage artifacts
48 | rm -fr .tox/
49 | rm -f .coverage
50 | rm -fr htmlcov/
51 | rm -fr .pytest_cache
52 | 
53 | lint: ## check style with black, sort imports
54 | black --check .
55 | isort --check-only .
56 | 
57 | test: ## run tests quickly with the default Python
58 | py.test
59 | 
60 | test-all: ## run tests on every Python version with tox
61 | tox
62 | 
63 | coverage: ## check code coverage quickly with the default Python
64 | coverage run --source mlbench_core -m pytest
65 | coverage report -m
66 | coverage html
67 | $(BROWSER) htmlcov/index.html
68 | 
69 | docs: ## generate Sphinx HTML documentation, including API docs
70 | $(MAKE) -C docs clean
71 | $(MAKE) -C docs html
72 | $(BROWSER) docs/_build/html/index.html
73 | 
74 | servedocs: docs ## compile the docs watching for changes
75 | watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D .
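The `help` target above is self-documenting: `PRINT_HELP_PYSCRIPT` scans every Makefile rule for a trailing `## description` comment and prints a table of targets. A minimal standalone sketch of the same parsing logic, runnable outside Make (the sample lines below are illustrative):

```python
import re

# Same pattern as PRINT_HELP_PYSCRIPT above; the Makefile doubles the
# trailing "$" to "$$" only to escape it from Make's own expansion.
pattern = re.compile(r"^([a-zA-Z_-]+):.*?## (.*)$")

sample_lines = [
    "clean-build: ## remove build artifacts",
    "docs: ## generate Sphinx HTML documentation, including API docs",
    "\trm -fr build/",  # recipe lines do not match and are skipped
]

for line in sample_lines:
    match = pattern.match(line)
    if match:
        target, description = match.groups()
        print("%-20s %s" % (target, description))
```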
76 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | mlbench-core: Distributed Machine Learning Benchmark Core Library 2 | ================================================================= 3 | 4 | ![Build Status](https://github.com/mlbench/mlbench-core/workflows/mlbench-core/badge.svg?branch=develop) 5 | [![Documentation Status](https://readthedocs.org/projects/mlbench-core/badge/?version=latest)](https://mlbench.readthedocs.io/projects/mlbench_core/en/latest/?badge=latest) 6 | [![codecov](https://codecov.io/gh/mlbench/mlbench-core/branch/develop/graph/badge.svg)](https://codecov.io/gh/mlbench/mlbench-core) 7 | 8 | 9 | MLBench is a Benchmarking Framework for Distributed Machine Learning algorithms. 10 | 11 | This repository contains the core Python library for MLBench which is used to share code between Benchmark implementations as well as for communication with the dashboard. 12 | 13 | For more information refer to the [MLBench Core Documentation](https://mlbench.readthedocs.io/projects/mlbench_core/en/stable/api.html) 14 | or the [Main Documentation](https://mlbench.readthedocs.io/) 15 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = . 8 | BUILDDIR = _build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/_static/css/custom.css: -------------------------------------------------------------------------------- 1 | .wy-menu-vertical li.toctree-l4.current li.toctree-l5>a { 2 | display: block; 3 | background: #bdbdbd; 4 | padding: .4045em 6.663em; 5 | } 6 | 7 | .wy-menu-vertical li.on a, .wy-menu-vertical li>a.current { 8 | color: #404040; 9 | padding: .4045em 1.618em; 10 | font-weight: bold; 11 | position: relative; 12 | background: #fcfcfc; 13 | border: none; 14 | padding-left: 1.618em -4px; 15 | } 16 | 17 | .wy-menu-vertical li.toctree-l3.current li.toctree-l4>ul { 18 | display: none; 19 | } 20 | 21 | .wy-menu-vertical li.toctree-l3.current li.toctree-l4.current>ul { 22 | display: block; 23 | } -------------------------------------------------------------------------------- /docs/aggregation.rst: -------------------------------------------------------------------------------- 1 | mlbench_core.aggregation 2 | ------------------------ 3 | 4 | .. autoapimodule:: mlbench_core.aggregation 5 | .. currentmodule:: mlbench_core.aggregation 6 | 7 | 8 | pytorch 9 | ~~~~~~~ 10 | 11 | .. autoapimodule:: mlbench_core.aggregation.pytorch 12 | .. currentmodule:: mlbench_core.aggregation.pytorch 13 | 14 | Aggregation 15 | +++++++++++ 16 | 17 | .. autoapimodule:: mlbench_core.aggregation.pytorch.aggregation 18 | .. currentmodule:: mlbench_core.aggregation.pytorch.aggregation 19 | 20 | .. 
autoapiclass:: Aggregation 21 | :members: 22 | :private-members: 23 | :undoc-members: 24 | 25 | Centralized (Synchronous) aggregation 26 | +++++++++++++++++++++++++++++++++++++ 27 | 28 | .. autoapimodule:: mlbench_core.aggregation.pytorch.centralized 29 | .. currentmodule:: mlbench_core.aggregation.pytorch.centralized 30 | 31 | All-Reduce 32 | '''''''''' 33 | 34 | .. autoapiclass:: AllReduceAggregation 35 | :show-inheritance: 36 | :private-members: 37 | 38 | All-Reduce Horovod 39 | '''''''''''''''''' 40 | 41 | .. autoapiclass:: AllReduceAggregationHVD 42 | :show-inheritance: 43 | :private-members: 44 | 45 | Sparsified Aggregation 46 | '''''''''''''''''''''' 47 | 48 | .. autoapiclass:: SparsifiedAggregation 49 | :show-inheritance: 50 | :private-members: 51 | 52 | Power Aggregation 53 | ''''''''''''''''' 54 | 55 | .. autoapiclass:: PowerAggregation 56 | :show-inheritance: 57 | :private-members: 58 | 59 | Decentralized (Asynchronous) aggregation 60 | ++++++++++++++++++++++++++++++++++++++++ 61 | 62 | .. autoapimodule:: mlbench_core.aggregation.pytorch.decentralized 63 | .. currentmodule:: mlbench_core.aggregation.pytorch.decentralized 64 | 65 | Decentralized Aggregation 66 | ''''''''''''''''''''''''' 67 | 68 | .. autoapiclass:: DecentralizedAggregation 69 | :show-inheritance: 70 | :private-members: 71 | 72 | 73 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | 2 | .. _mlbench_core_api: 3 | 4 | MLBench Core API 5 | ================ 6 | 7 | .. toctree:: 8 | :caption: Examples 9 | 10 | examples 11 | 12 | .. toctree:: 13 | :caption: Core API 14 | :maxdepth: 1 15 | 16 | aggregation 17 | clientapi 18 | controlflow 19 | dataset 20 | evaluation 21 | lr_scheduler 22 | models 23 | optim 24 | utils -------------------------------------------------------------------------------- /docs/clientapi.rst: -------------------------------------------------------------------------------- 1 | mlbench_core.api 2 | ---------------- 3 | 4 | .. autoapimodule:: mlbench_core.api 5 | .. currentmodule:: mlbench_core.api 6 | 7 | .. autoapidata:: MLBENCH_IMAGES 8 | 9 | .. autoapiclass:: ApiClient 10 | :members: 11 | -------------------------------------------------------------------------------- /docs/controlflow.rst: -------------------------------------------------------------------------------- 1 | mlbench_core.controlflow 2 | ------------------------ 3 | 4 | .. autoapimodule:: mlbench_core.controlflow 5 | .. currentmodule:: mlbench_core.controlflow 6 | 7 | pytorch 8 | ~~~~~~~ 9 | 10 | .. autoapimodule:: mlbench_core.controlflow.pytorch 11 | .. currentmodule:: mlbench_core.controlflow.pytorch 12 | 13 | Controlflow 14 | +++++++++++ 15 | 16 | .. autoapifunction:: validation_round 17 | 18 | .. autoapifunction:: record_train_batch_stats 19 | 20 | .. autoapifunction:: record_validation_stats 21 | 22 | CheckpointsEvaluationControlFlow 23 | ++++++++++++++++++++++++++++++++ 24 | 25 | .. autoapiclass:: CheckpointsEvaluationControlFlow 26 | :members: 27 | 28 | Helpers 29 | +++++++ 30 | 31 | .. autoapimodule:: mlbench_core.controlflow.pytorch.helpers 32 | .. currentmodule:: mlbench_core.controlflow.pytorch.helpers 33 | 34 | .. autoapifunction:: maybe_range 35 | .. autoapifunction:: convert_dtype 36 | .. autoapifunction:: prepare_batch 37 | .. autoapifunction:: iterate_dataloader 38 | 39 | 40 | tensorflow 41 | ~~~~~~~~~~ 42 | 43 | .. autoapimodule:: mlbench_core.controlflow.tensorflow 44 | .. 
currentmodule:: mlbench_core.controlflow.tensorflow 45 | 46 | 47 | TrainValidation 48 | +++++++++++++++ 49 | 50 | .. autoapiclass:: TrainValidation 51 | :members: 52 | 53 | .. autoapimethod:: __call__ 54 | -------------------------------------------------------------------------------- /docs/dataset.rst: -------------------------------------------------------------------------------- 1 | mlbench_core.dataset 2 | -------------------- 3 | 4 | .. autoapimodule:: mlbench_core.dataset 5 | .. currentmodule:: mlbench_core.dataset 6 | 7 | 8 | linearmodels 9 | ~~~~~~~~~~~~ 10 | .. autoapimodule:: mlbench_core.dataset.linearmodels 11 | .. currentmodule:: mlbench_core.dataset.linearmodels 12 | 13 | pytorch 14 | +++++++ 15 | 16 | .. autoapimodule:: mlbench_core.dataset.linearmodels.pytorch.dataloader 17 | .. currentmodule:: mlbench_core.dataset.linearmodels.pytorch.dataloader 18 | 19 | Epsilon Logistic Regression 20 | ''''''''''''''''''''''''''' 21 | 22 | .. autoapiclass:: LMDBDataset 23 | :members: 24 | 25 | .. autoapiclass:: LMDBPTClass 26 | :members: 27 | 28 | imagerecognition 29 | ~~~~~~~~~~~~~~~~ 30 | 31 | .. autoapimodule:: mlbench_core.dataset.imagerecognition 32 | .. currentmodule:: mlbench_core.dataset.imagerecognition 33 | 34 | pytorch 35 | +++++++ 36 | 37 | .. autoapimodule:: mlbench_core.dataset.imagerecognition.pytorch.dataloader 38 | .. currentmodule:: mlbench_core.dataset.imagerecognition.pytorch.dataloader 39 | 40 | CIFAR10V1 41 | ''''''''' 42 | 43 | .. autoapiclass:: CIFAR10V1 44 | :members: 45 | 46 | Imagenet 47 | '''''''' 48 | 49 | .. autoapiclass:: Imagenet 50 | :members: 51 | 52 | tensorflow 53 | ++++++++++ 54 | 55 | .. autoapimodule:: mlbench_core.dataset.imagerecognition.tensorflow 56 | .. currentmodule:: mlbench_core.dataset.imagerecognition.tensorflow 57 | 58 | DatasetCifar 59 | '''''''''''' 60 | 61 | .. autoapiclass:: DatasetCifar 62 | :members: 63 | 64 | NLP 65 | ~~~ 66 | 67 | .. autoapimodule:: mlbench_core.dataset.nlp 68 | .. currentmodule:: mlbench_core.dataset.nlp 69 | 70 | pytorch 71 | +++++++ 72 | 73 | .. autoapimodule:: mlbench_core.dataset.nlp.pytorch 74 | .. currentmodule:: mlbench_core.dataset.nlp.pytorch 75 | 76 | Translation WMT16 77 | ''''''''''''''''' 78 | 79 | .. autoapiclass:: WMT16Dataset 80 | :members: 81 | 82 | .. autoapimodule:: mlbench_core.dataset.nlp.pytorch.wmt16.wmt16_tokenizer 83 | :members: 84 | 85 | Translation WMT17 86 | ''''''''''''''''' 87 | 88 | .. autoapiclass:: WMT17Dataset 89 | :members: 90 | 91 | .. autoapimodule:: mlbench_core.dataset.nlp.pytorch.wmt17 92 | :members: 93 | 94 | Language Modeling WikiText2 95 | ''''''''''''''''''''''''''' 96 | 97 | .. autoapiclass:: BPTTWikiText2 98 | :members: 99 | 100 | -------------------------------------------------------------------------------- /docs/evaluation.rst: -------------------------------------------------------------------------------- 1 | mlbench_core.evaluation 2 | ----------------------- 3 | .. autoapimodule:: mlbench_core.evaluation 4 | .. currentmodule:: mlbench_core.evaluation 5 | 6 | pytorch 7 | ~~~~~~~ 8 | 9 | .. autoapimodule:: mlbench_core.evaluation.pytorch 10 | .. currentmodule:: mlbench_core.evaluation.pytorch 11 | 12 | criterion 13 | +++++++++ 14 | 15 | .. autoapimodule:: mlbench_core.evaluation.pytorch.criterion 16 | .. currentmodule:: mlbench_core.evaluation.pytorch.criterion 17 | 18 | 19 | BCELossRegularized 20 | '''''''''''''''''' 21 | 22 | .. autoapiclass:: BCELossRegularized 23 | :members: 24 | 25 | 26 | MSELossRegularized 27 | '''''''''''''''''' 28 | 29 | .. 
autoapiclass:: MSELossRegularized 30 | :members: 31 | 32 | .. autoapiclass:: LabelSmoothing 33 | :members: 34 | 35 | metrics 36 | +++++++ 37 | 38 | .. autoapimodule:: mlbench_core.evaluation.pytorch.metrics 39 | .. currentmodule:: mlbench_core.evaluation.pytorch.metrics 40 | 41 | 42 | TopKAccuracy 43 | '''''''''''' 44 | 45 | .. autoapiclass:: TopKAccuracy 46 | :members: 47 | 48 | .. autoapimethod:: __call__ 49 | 50 | tensorflow 51 | ~~~~~~~~~~ 52 | 53 | criterion 54 | +++++++++ 55 | 56 | .. autoapimodule:: mlbench_core.evaluation.tensorflow.criterion 57 | .. currentmodule:: mlbench_core.evaluation.tensorflow.criterion 58 | 59 | 60 | softmax_cross_entropy_with_logits_v2_l2_regularized 61 | ''''''''''''''''''''''''''''''''''''''''''''''''''' 62 | 63 | .. autoapifunction:: softmax_cross_entropy_with_logits_v2_l2_regularized 64 | 65 | metrics 66 | +++++++ 67 | 68 | .. autoapimodule:: mlbench_core.evaluation.tensorflow.metrics 69 | .. currentmodule:: mlbench_core.evaluation.tensorflow.metrics 70 | 71 | topk_accuracy 72 | ''''''''''''' 73 | 74 | .. autoapifunction:: topk_accuracy_with_logits 75 | -------------------------------------------------------------------------------- /docs/examples.rst: -------------------------------------------------------------------------------- 1 | Examples 2 | -------- -------------------------------------------------------------------------------- /docs/gensidebar.py: -------------------------------------------------------------------------------- 1 | # 2 | # This file generates the sidebar/toctree for all RobotPy projects and should 3 | # be copied to each project when it is updated 4 | # 5 | 6 | import os 7 | 8 | 9 | def write_if_changed(fname, contents): 10 | 11 | try: 12 | with open(fname, "r") as fp: 13 | old_contents = fp.read() 14 | except: 15 | old_contents = "" 16 | 17 | if old_contents != contents: 18 | with open(fname, "w") as fp: 19 | fp.write(contents) 20 | 21 | 22 | def generate_sidebar(conf, conf_api): 23 | 24 | # determine 'latest' or 'stable' 25 | # if not conf.do_gen: 26 | do_gen = os.environ.get("SIDEBAR", None) == "1" or conf["on_rtd"] 27 | version = conf["rtd_version"] 28 | 29 | lines = ["", ".. DO NOT MODIFY! THIS PAGE IS AUTOGENERATED!", ""] 30 | 31 | def toctree(name): 32 | lines.extend( 33 | [".. 
toctree::", " :caption: %s" % name, " :maxdepth: 2", ""] 34 | ) 35 | 36 | def endl(): 37 | lines.append("") 38 | 39 | def write(desc, link): 40 | if conf_api == "mlbench": 41 | args = desc, link 42 | elif not do_gen: 43 | return 44 | else: 45 | args = ( 46 | desc, 47 | "https://mlbench.readthedocs.io/en/%s/%s.html" % (version, link), 48 | ) 49 | 50 | lines.append(" %s <%s>" % args) 51 | 52 | def write_api(project, desc): 53 | if project != conf_api: 54 | if do_gen: 55 | args = desc, project, version 56 | lines.append( 57 | " %s API " 58 | % args 59 | ) 60 | else: 61 | lines.append(" %s API " % desc) 62 | 63 | def write_ref(project, desc): 64 | if project != conf_api: 65 | if do_gen: 66 | args = desc, project, version 67 | lines.append( 68 | " %s " 69 | % args 70 | ) 71 | else: 72 | lines.append(" %s " % desc) 73 | 74 | # 75 | # Specify the sidebar contents here 76 | # 77 | 78 | toctree("MLBench") 79 | write("Benchmarks", "benchmark-tasks") 80 | write("Prerequisites", "prerequisites") 81 | write("Installation", "installation") 82 | write("Component Overview", "overview") 83 | write("Tutorials", "tutorials") 84 | endl() 85 | 86 | toctree("Components") 87 | write_ref("mlbench_helm", "Helm Chart") 88 | write_ref("mlbench_dashboard", "Dashboard") 89 | write_ref("mlbench_benchmarks", "Benchmark Implementations") 90 | write_api("mlbench_core", "Core") 91 | endl() 92 | 93 | toctree("Additional Info") 94 | write("Developer Guide", "devguide") 95 | write("Contributing", "contributing") 96 | write("Changelog", "changelog") 97 | endl() 98 | 99 | write_if_changed("_sidebar.rst.inc", "\n".join(lines)) 100 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | MLBench Core Documentation 2 | ================================ 3 | 4 | .. 
include:: _sidebar.rst.inc 5 | 6 | 7 | Indices and tables 8 | ================== 9 | 10 | * :ref:`genindex` 11 | * :ref:`modindex` 12 | * :ref:`search` -------------------------------------------------------------------------------- /docs/lr_scheduler.bib: -------------------------------------------------------------------------------- 1 | @article{ginsburg2018large, 2 | title={Large Batch Training of Convolutional Networks with Layer-wise Adaptive Rate Scaling}, 3 | author={Ginsburg, Boris and Gitman, Igor and You, Yang}, 4 | year={2018}, 5 | journal={Open Review} 6 | } 7 | 8 | @inproceedings{smith2017cyclical, 9 | title={Cyclical learning rates for training neural networks}, 10 | author={Smith, Leslie N}, 11 | booktitle={Applications of Computer Vision (WACV), 2017 IEEE Winter Conference on}, 12 | pages={464--472}, 13 | year={2017}, 14 | organization={IEEE} 15 | } 16 | 17 | @article{goyal2017accurate, 18 | title={Accurate, large minibatch SGD: training imagenet in 1 hour}, 19 | author={Goyal, Priya and Doll{\'a}r, Piotr and Girshick, Ross and Noordhuis, Pieter and Wesolowski, Lukasz and Kyrola, Aapo and Tulloch, Andrew and Jia, Yangqing and He, Kaiming}, 20 | journal={arXiv preprint arXiv:1706.02677}, 21 | year={2017} 22 | } 23 | 24 | @article{smith2017super, 25 | title={Super-Convergence: Very Fast Training of Residual Networks Using Large Learning Rates}, 26 | author={Smith, Leslie N and Topin, Nicholay}, 27 | journal={arXiv preprint arXiv:1708.07120}, 28 | year={2017} 29 | } -------------------------------------------------------------------------------- /docs/lr_scheduler.rst: -------------------------------------------------------------------------------- 1 | mlbench_core.lr_scheduler 2 | ------------------------- 3 | .. autoapimodule:: mlbench_core.lr_scheduler 4 | .. currentmodule:: mlbench_core.lr_scheduler 5 | 6 | pytorch 7 | ~~~~~~~ 8 | 9 | .. autoapimodule:: mlbench_core.lr_scheduler.pytorch.lr 10 | .. currentmodule:: mlbench_core.lr_scheduler.pytorch.lr 11 | 12 | LRLinearWarmUp 13 | ++++++++++++++ 14 | 15 | .. autoapiclass:: LRLinearWarmUp 16 | :members: 17 | 18 | MultiStepLRLinearWarmUp 19 | +++++++++++++++++++++++ 20 | 21 | .. autoapiclass:: MultiStepLRLinearWarmUp 22 | :members: 23 | 24 | ReduceLROnPlateauWithWarmup 25 | +++++++++++++++++++++++++++ 26 | 27 | .. autoapiclass:: ReduceLROnPlateauWithWarmup 28 | :members: 29 | 30 | SparsifiedSGDLR 31 | +++++++++++++++ 32 | 33 | .. autoapiclass:: SparsifiedSGDLR 34 | :members: 35 | 36 | TimeDecayLR 37 | +++++++++++ 38 | 39 | .. autoapiclass:: TimeDecayLR 40 | :members: 41 | 42 | SQRTTimeDecayLR 43 | +++++++++++++++ 44 | 45 | .. autoapiclass:: SQRTTimeDecayLR 46 | :members: 47 | 48 | ExponentialWarmupMultiStepLR 49 | ++++++++++++++++++++++++++++ 50 | 51 | .. autoapiclass:: ExponentialWarmupMultiStepLR 52 | :members: 53 | 54 | SQRTTimeDecayLRWithWarmup 55 | +++++++++++++++++++++++++ 56 | 57 | .. autoapiclass:: SQRTTimeDecayLRWithWarmup 58 | :members: 59 | 60 | tensorflow 61 | ~~~~~~~~~~ 62 | 63 | .. autoapimodule:: mlbench_core.lr_scheduler.tensorflow 64 | .. currentmodule:: mlbench_core.lr_scheduler.tensorflow 65 | 66 | manual_stepping 67 | +++++++++++++++ 68 | 69 | .. autoapifunction:: manual_stepping 70 | 71 | 72 | .. rubric:: References 73 | 74 | 75 | .. 
bibliography:: lr_scheduler.bib
76 | :cited:
--------------------------------------------------------------------------------
/docs/models.bib:
--------------------------------------------------------------------------------
1 | @inproceedings{he2016deep,
2 | title={Deep residual learning for image recognition},
3 | author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
4 | booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
5 | pages={770--778},
6 | year={2016}
7 | }
8 | 
9 | @inproceedings{he2016identity,
10 | title={Identity mappings in deep residual networks},
11 | author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
12 | booktitle={European conference on computer vision},
13 | pages={630--645},
14 | year={2016},
15 | organization={Springer}
16 | }
17 | 
18 | @incollection{NIPS2017_7181,
19 | title = {Attention is All you Need},
20 | author = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, \L ukasz and Polosukhin, Illia},
21 | booktitle = {Advances in Neural Information Processing Systems 30},
22 | editor = {I. Guyon and U. V. Luxburg and S. Bengio and H. Wallach and R. Fergus and S. Vishwanathan and R. Garnett},
23 | pages = {5998--6008},
24 | year = {2017},
25 | publisher = {Curran Associates, Inc.},
26 | url = {http://papers.nips.cc/paper/7181-attention-is-all-you-need.pdf}
27 | }
--------------------------------------------------------------------------------
/docs/models.rst:
--------------------------------------------------------------------------------
1 | mlbench_core.models
2 | -------------------
3 | .. autoapimodule:: mlbench_core.models
4 | .. currentmodule:: mlbench_core.models
5 | 
6 | pytorch
7 | ~~~~~~~
8 | 
9 | Since `Kuang Liu` has already implemented many classical
10 | neural network models, we use their implementation directly for:
11 | 
12 | - VGG
13 | 
14 | .. autoapimodule:: mlbench_core.models.pytorch
15 | .. currentmodule:: mlbench_core.models.pytorch
16 | 
17 | 
18 | linear_models
19 | +++++++++++++
20 | 
21 | .. autoapimodule:: mlbench_core.models.pytorch.linear_models
22 | .. currentmodule:: mlbench_core.models.pytorch.linear_models
23 | 
24 | 
25 | LogisticRegression
26 | ''''''''''''''''''
27 | 
28 | .. autoapiclass:: LogisticRegression
29 | :members:
30 | 
31 | LinearRegression
32 | ''''''''''''''''''
33 | 
34 | .. autoapiclass:: LinearRegression
35 | :members:
36 | 
37 | 
38 | resnet
39 | ++++++
40 | .. autoapimodule:: mlbench_core.models.pytorch.resnet
41 | .. currentmodule:: mlbench_core.models.pytorch.resnet
42 | 
43 | ResNetCIFAR
44 | '''''''''''
45 | 
46 | .. autoapiclass:: ResNetCIFAR
47 | :members:
48 | 
49 | 
50 | RNN
51 | +++
52 | 
53 | 
54 | Google Neural Machine Translation
55 | '''''''''''''''''''''''''''''''''
56 | .. autoapimodule:: mlbench_core.models.pytorch.gnmt
57 | .. currentmodule:: mlbench_core.models.pytorch.gnmt
58 | 
59 | Model
60 | =====
61 | 
62 | .. autoapiclass:: GNMT
63 | :members: encode, decode, generate, forward
64 | 
65 | BahdanauAttention
66 | =================
67 | 
68 | .. autoapiclass:: BahdanauAttention
69 | :members:
70 | 
71 | Encoder
72 | =======
73 | .. autoapimodule:: mlbench_core.models.pytorch.gnmt.encoder
74 | .. currentmodule:: mlbench_core.models.pytorch.gnmt.encoder
75 | 
76 | .. autoapiclass:: ResidualRecurrentEncoder
77 | :members:
78 | 
79 | Decoder
80 | =======
81 | .. autoapimodule:: mlbench_core.models.pytorch.gnmt.decoder
82 | ..
currentmodule:: mlbench_core.models.pytorch.gnmt.decoder 83 | 84 | .. autoapiclass:: RecurrentAttention 85 | :members: 86 | 87 | .. autoapiclass:: Classifier 88 | :members: 89 | 90 | .. autoapiclass:: ResidualRecurrentDecoder 91 | :members: 92 | 93 | Transformer Model for Translation 94 | ''''''''''''''''''''''''''''''''' 95 | .. autoapimodule:: mlbench_core.models.pytorch.transformer 96 | .. currentmodule:: mlbench_core.models.pytorch.transformer 97 | 98 | Model 99 | ===== 100 | 101 | .. autoapiclass:: TransformerModel 102 | :members: forward 103 | 104 | Encoder 105 | ======= 106 | .. autoapimodule:: mlbench_core.models.pytorch.transformer.encoder 107 | .. currentmodule:: mlbench_core.models.pytorch.transformer.encoder 108 | 109 | .. autoapiclass:: TransformerEncoder 110 | :members: forward 111 | 112 | Decoder 113 | ======= 114 | .. autoapimodule:: mlbench_core.models.pytorch.transformer.decoder 115 | .. currentmodule:: mlbench_core.models.pytorch.transformer.decoder 116 | 117 | .. autoapiclass:: TransformerDecoder 118 | :members: forward 119 | 120 | Layers 121 | ====== 122 | 123 | .. autoapimodule:: mlbench_core.models.pytorch.transformer.modules 124 | .. currentmodule:: mlbench_core.models.pytorch.transformer.modules 125 | 126 | .. autoapiclass:: TransformerEncoderLayer 127 | :members: forward 128 | 129 | .. autoapiclass:: TransformerDecoderLayer 130 | :members: forward 131 | 132 | SequenceGenerator 133 | ================= 134 | 135 | .. autoapimodule:: mlbench_core.models.pytorch.transformer.sequence_generator 136 | .. currentmodule:: mlbench_core.models.pytorch.transformer.sequence_generator 137 | 138 | .. autoapiclass:: SequenceGenerator 139 | :members: 140 | 141 | 142 | .. rubric:: References 143 | 144 | .. bibliography:: models.bib 145 | :cited: 146 | 147 | 148 | NLP 149 | +++ 150 | .. autoapimodule:: mlbench_core.models.pytorch.nlp 151 | .. currentmodule:: mlbench_core.models.pytorch.nlp 152 | 153 | LSTM Language Model 154 | ''''''''''''''''''' 155 | 156 | .. autoapiclass:: RNNLM 157 | :members: 158 | 159 | 160 | tensorflow 161 | ~~~~~~~~~~ 162 | 163 | .. autoapimodule:: mlbench_core.models.tensorflow 164 | .. currentmodule:: mlbench_core.models.tensorflow 165 | 166 | resnet 167 | ++++++ 168 | 169 | .. autoapimodule:: mlbench_core.models.tensorflow.resnet_model 170 | .. currentmodule:: mlbench_core.models.tensorflow.resnet_model 171 | 172 | 173 | .. autoapifunction:: fixed_padding 174 | 175 | .. autoapifunction:: conv2d_fixed_padding 176 | 177 | .. autoapifunction:: block_layer 178 | 179 | .. autoapifunction:: batch_norm 180 | 181 | 182 | Model 183 | ''''' 184 | 185 | .. autoapiclass:: Model 186 | :members: 187 | 188 | 189 | Cifar10Model 190 | '''''''''''' 191 | 192 | .. autoapiclass:: Cifar10Model 193 | :members: 194 | 195 | 196 | -------------------------------------------------------------------------------- /docs/optim.bib: -------------------------------------------------------------------------------- 1 | 2 | @inproceedings{adam_convergence, 3 | title={On the Convergence of Adam and Beyond}, 4 | author={Sashank J. Reddi and Satyen Kale and Sanjiv Kumar}, 5 | booktitle={International Conference on Learning Representations}, 6 | year={2018}, 7 | url={https://openreview.net/forum?id=ryQu7f-RZ}, 8 | } -------------------------------------------------------------------------------- /docs/optim.rst: -------------------------------------------------------------------------------- 1 | mlbench_core.optim 2 | ------------------ 3 | 4 | .. autoapimodule:: mlbench_core.optim 5 | .. 
currentmodule:: mlbench_core.optim 6 | 7 | 8 | pytorch 9 | ~~~~~~~ 10 | .. autoapimodule:: mlbench_core.optim.pytorch 11 | .. currentmodule:: mlbench_core.optim.pytorch 12 | 13 | 14 | Optimizers 15 | ++++++++++ 16 | 17 | The optimizers in this module are not distributed. Their purpose is to implement logic that 18 | can be inherited by distributed optimizers. 19 | 20 | .. autoapimodule:: mlbench_core.optim.pytorch.optim 21 | .. currentmodule:: mlbench_core.optim.pytorch.optim 22 | 23 | 24 | SparsifiedSGD 25 | ''''''''''''' 26 | 27 | .. autoapiclass:: SparsifiedSGD 28 | :members: 29 | 30 | SignSGD 31 | ''''''''''''' 32 | 33 | .. autoapiclass:: SignSGD 34 | :members: 35 | 36 | Centralized (Synchronous) Optimizers 37 | ++++++++++++++++++++++++++++++++++++ 38 | 39 | The optimizers in this module are all distributed and synchronous: workers advance in a synchronous manner. All workers 40 | communicate with each other using `all_reduce` or `all_gather` operations. 41 | 42 | .. autoapimodule:: mlbench_core.optim.pytorch.centralized 43 | .. currentmodule:: mlbench_core.optim.pytorch.centralized 44 | 45 | Generic Centralized Optimizer 46 | +++++++++++++++++++++++++++++ 47 | 48 | .. autoapiclass:: GenericCentralizedOptimizer 49 | :members: 50 | 51 | CentralizedSGD 52 | '''''''''''''' 53 | 54 | .. autoapiclass:: CentralizedSGD 55 | :show-inheritance: 56 | :members: 57 | 58 | CentralizedAdam 59 | ''''''''''''''' 60 | 61 | .. autoapiclass:: CentralizedAdam 62 | :show-inheritance: 63 | :members: 64 | 65 | CustomCentralizedOptimizer 66 | '''''''''''''''''''''''''' 67 | 68 | .. autoapiclass:: CustomCentralizedOptimizer 69 | :show-inheritance: 70 | :members: 71 | 72 | CentralizedSparsifiedSGD 73 | '''''''''''''''''''''''' 74 | 75 | .. autoapiclass:: CentralizedSparsifiedSGD 76 | :members: 77 | 78 | PowerSGD 79 | '''''''' 80 | 81 | .. autoapiclass:: PowerSGD 82 | :members: 83 | 84 | Decentralized (Asynchronous) Optimizers 85 | +++++++++++++++++++++++++++++++++++++++ 86 | 87 | The optimizers in this module are all distributed and asynchronous: workers advance independently from each other, 88 | and communication patterns follow an arbitrary graph. 89 | 90 | .. autoapimodule:: mlbench_core.optim.pytorch.decentralized 91 | .. currentmodule:: mlbench_core.optim.pytorch.decentralized 92 | 93 | DecentralizedSGD 94 | '''''''''''''''' 95 | 96 | .. autoapiclass:: DecentralizedSGD 97 | :members: 98 | 99 | 100 | .. rubric:: References 101 | 102 | .. bibliography:: optim.bib 103 | :cited: 104 | 105 | Mixed Precision Optimizers 106 | ++++++++++++++++++++++++++ 107 | 108 | .. autoapimodule:: mlbench_core.optim.pytorch.fp_optimizers 109 | .. currentmodule:: mlbench_core.optim.pytorch.fp_optimizers 110 | 111 | FP16Optimizer 112 | ''''''''''''' 113 | 114 | .. autoapiclass:: FP16Optimizer 115 | :members: 116 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | Sphinx==4.0.2 2 | sphinx-rtd-theme==0.5.2 3 | sphinxcontrib-napoleon==0.7 4 | sphinxcontrib-bibtex==2.3.0 5 | sphinx-autoapi==1.8.1 -------------------------------------------------------------------------------- /docs/utils.rst: -------------------------------------------------------------------------------- 1 | mlbench_core.utils 2 | ------------------ 3 | .. autoapimodule:: mlbench_core.utils 4 | .. currentmodule:: mlbench_core.utils 5 | 6 | pytorch 7 | ~~~~~~~ 8 | 9 | .. autoapimodule:: mlbench_core.utils.pytorch 10 | .. 
currentmodule:: mlbench_core.utils.pytorch 11 | 12 | 13 | FCGraph 14 | ''''''' 15 | 16 | .. autoapiclass:: FCGraph 17 | :members: 18 | 19 | initialize_backends 20 | ''''''''''''''''''' 21 | 22 | .. autoapifunction:: initialize_backends 23 | 24 | 25 | Checkpointer 26 | '''''''''''' 27 | 28 | .. autoapimodule:: mlbench_core.utils.pytorch.checkpoint 29 | .. currentmodule:: mlbench_core.utils.pytorch.checkpoint 30 | 31 | .. autoapiclass:: Checkpointer 32 | 33 | helpers 34 | ''''''' 35 | 36 | .. autoapimodule:: mlbench_core.utils.pytorch.helpers 37 | .. currentmodule:: mlbench_core.utils.pytorch.helpers 38 | 39 | .. autoapifunction:: config_logging 40 | 41 | .. autoapifunction:: config_pytorch 42 | 43 | .. autoapifunction:: config_path 44 | 45 | utils 46 | ''''' 47 | 48 | .. autoapimodule:: mlbench_core.utils.pytorch.utils 49 | .. currentmodule:: mlbench_core.utils.pytorch.utils 50 | 51 | .. autoapifunction:: pack_tensors 52 | 53 | .. autoapifunction:: unpack_tensors 54 | 55 | .. autoapifunction:: orthogonalize 56 | 57 | 58 | 59 | 60 | Inference 61 | ''''''''' 62 | 63 | .. autoapimodule:: mlbench_core.utils.pytorch.inference 64 | .. currentmodule:: mlbench_core.utils.pytorch.inference 65 | 66 | Translator 67 | ++++++++++ 68 | 69 | .. autoapiclass:: Translator 70 | :members: 71 | 72 | BeamSearch 73 | ++++++++++ 74 | 75 | .. autoapimodule:: mlbench_core.utils.pytorch.inference.beam_search 76 | :members: 77 | 78 | 79 | tensorflow 80 | ~~~~~~~~~~ 81 | 82 | .. autoapimodule:: mlbench_core.utils.tensorflow 83 | .. currentmodule:: mlbench_core.utils.tensorflow 84 | 85 | 86 | -------------------------------------------------------------------------------- /mlbench_core/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Top-level package for mlbench_core.""" 4 | 5 | __version__ = "3.0.0-dev23" 6 | 7 | from . import api, controlflow, dataset, evaluation, lr_scheduler, models, optim, utils 8 | -------------------------------------------------------------------------------- /mlbench_core/aggregation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlbench/mlbench-core/4fd3c7e6f1a5be69e52383ab2eb64cad257218c2/mlbench_core/aggregation/__init__.py -------------------------------------------------------------------------------- /mlbench_core/aggregation/pytorch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlbench/mlbench-core/4fd3c7e6f1a5be69e52383ab2eb64cad257218c2/mlbench_core/aggregation/pytorch/__init__.py -------------------------------------------------------------------------------- /mlbench_core/aggregation/pytorch/aggregation.py: -------------------------------------------------------------------------------- 1 | from mlbench_core.utils.pytorch.utils import pack_tensors, unpack_tensors 2 | 3 | 4 | class Aggregation(object): 5 | """Aggregate updates / models from different processes. 6 | 7 | Args: 8 | use_cuda (bool): Whether to use CUDA tensors for communication 9 | """ 10 | 11 | def __init__(self, use_cuda=False): 12 | self.use_cuda = use_cuda 13 | 14 | def _agg(self, data, op, denom=None): 15 | """Aggregate data using `op` operation. 16 | 17 | Args: 18 | data (:obj:`torch.Tensor`): A Tensor to be aggregated. 19 | op (str): Aggregation methods like `avg`, `sum`, `min`, `max`, etc. 
20 | denom (:obj:`torch.Tensor`, optional): Custom denominator to average by
21 | Use with op == `custom_avg`. (default: `None`)
22 | 
23 | Returns:
24 | :obj:`torch.Tensor`: An aggregated tensor.
25 | """
26 | raise NotImplementedError
27 | 
28 | def _agg_weights_by_model(self, model, op, denom=None):
29 | """Aggregate model weights, all layers at once
30 | 
31 | Args:
32 | model (:obj:`torch.nn.Module`): Model whose weights are to be aggregated.
33 | op (str): Aggregation method. Should be in `ALLREDUCE_AGGREGATION_OPS`
34 | denom (:obj:`torch.Tensor`, optional): Custom denominator to average by
35 | Use with op == `custom_avg`. (default: `None`)
36 | """
37 | # Pack all layers
38 | packed, indices, sizes = pack_tensors(
39 | [t for t in model.parameters()], use_cuda=self.use_cuda
40 | )
41 | aggregated = self._agg(packed, op=op, denom=denom)
42 | 
43 | tensors = unpack_tensors(aggregated, indices, sizes)
44 | # Unpack
45 | for i, param in enumerate(model.parameters()):
46 | param.data = tensors[i]
47 | 
48 | def _agg_gradients_by_model(self, model, op, denom=None):
49 | """Aggregate model gradients, all layers at once
50 | 
51 | Args:
52 | model (:obj:`torch.nn.Module`): Model whose gradients are to be aggregated.
53 | op (str): Aggregation method. Should be in `ALLREDUCE_AGGREGATION_OPS`
54 | denom (:obj:`torch.Tensor`, optional): Custom denominator to average by
55 | Use with op == `custom_avg`. (default: `None`)
56 | """
57 | # Pack all layers
58 | packed, indices, sizes = pack_tensors(
59 | [t.grad for t in model.parameters()], use_cuda=self.use_cuda
60 | )
61 | aggregated = self._agg(packed, op=op, denom=denom)
62 | 
63 | # Unpack
64 | tensors = unpack_tensors(aggregated, indices, sizes)
65 | for i, param in enumerate(model.parameters()):
66 | param.grad.data = tensors[i]
67 | 
68 | def _agg_weights_by_layer(self, model, op, denom=None):
69 | """Aggregate model weights, each layer individually
70 | 
71 | Args:
72 | model (:obj:`torch.nn.Module`): Model whose weights are to be aggregated.
73 | op (str): Aggregation method. Should be in `ALLREDUCE_AGGREGATION_OPS`
74 | denom (:obj:`torch.Tensor`, optional): Custom denominator to average by
75 | Use with op == `custom_avg`. (default: `None`)
76 | """
77 | # Aggregate layer by layer
78 | for param in model.parameters():
79 | grad = self._agg(param.data, op=op, denom=denom)
80 | param.data = grad
81 | 
82 | def _agg_gradients_by_layer(self, model, op, denom=None):
83 | """Aggregate model gradients, each layer individually
84 | 
85 | Args:
86 | model (:obj:`torch.nn.Module`): Model whose gradients are to be aggregated.
87 | op (str): Aggregation method. Should be in `ALLREDUCE_AGGREGATION_OPS`
88 | denom (:obj:`torch.Tensor`, optional): Custom denominator to average by
89 | Use with op == `custom_avg`.
(default: `None`)
90 | """
91 | # Aggregate layer by layer
92 | for param in model.parameters():
93 | grad = self._agg(param.grad.data, op=op, denom=denom)
94 | param.grad.data = grad
95 | 
96 | def agg_model(self, by_layer=False):
97 | if by_layer:
98 | return self._agg_weights_by_layer
99 | else:
100 | return self._agg_weights_by_model
101 | 
102 | def agg_grad(self, by_layer=False):
103 | if by_layer:
104 | return self._agg_gradients_by_layer
105 | else:
106 | return self._agg_gradients_by_model
107 | 
--------------------------------------------------------------------------------
/mlbench_core/aggregation/pytorch/decentralized.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.distributed as dist
3 | 
4 | from mlbench_core.aggregation.pytorch.aggregation import Aggregation
5 | 
6 | 
7 | class DecentralizedAggregation(Aggregation):
8 | """Aggregate updates in a decentralized manner."""
9 | 
10 | def __init__(self, rank, neighbors, use_cuda=False):
11 | """
12 | Args:
13 | rank (int): Rank of the current process
14 | neighbors (list): A list of ranks of its neighbors.
15 | """
16 | assert rank not in neighbors
17 | self.rank = rank
18 | self.neighbors = neighbors
19 | super(DecentralizedAggregation, self).__init__(use_cuda=use_cuda)
20 | 
21 | def _agg(self, data, op, denom=None):
22 | """Aggregate data using `op` operation.
23 | 
24 | Args:
25 | data (:obj:`torch.Tensor`): A Tensor to be aggregated.
26 | op (str): Aggregation methods like `avg`, `sum`, `min`, `max`, etc.
27 | denom (:obj:`torch.Tensor`, optional): Unused here; kept for interface compatibility.
28 | Returns:
29 | :obj:`torch.Tensor`: An aggregated tensor.
30 | """
31 | # Create some tensors to host the values from neighborhood.
32 | local_data = {i: torch.zeros_like(data) for i in self.neighbors}
33 | local_data[self.rank] = data
34 | 
35 | reqs = []
36 | for node in self.neighbors:
37 | reqs.append(dist.isend(tensor=local_data[self.rank], dst=node))
38 | reqs.append(dist.irecv(tensor=local_data[node], src=node))
39 | 
40 | for req in reqs:
41 | req.wait()
42 | 
43 | # Aggregate local_data
44 | if op == "avg_world":
45 | output = sum(local_data.values()) / (len(self.neighbors) + 1)
46 | else:
47 | raise NotImplementedError("op {} is not supported yet.".format(op))
48 | 
49 | return output
50 | 
--------------------------------------------------------------------------------
/mlbench_core/cli/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | """Commandline Interface for mlbench_core."""
4 | 
5 | from .cli import cli_group
6 | 
7 | __all__ = ["cli_group"]
8 | 
--------------------------------------------------------------------------------
/mlbench_core/cli/chartbuilder.py:
--------------------------------------------------------------------------------
1 | import os
2 | import subprocess
3 | import tempfile
4 | 
5 | from git import Repo
6 | from supermutes.dot import dotify
7 | 
8 | DEFAULT_GIT_BRANCH = "develop"
9 | 
10 | 
11 | def git_clone(repo_url, branch="master", path=""):
12 | """Clones the repo to a temporary dir whose path is determined by the platform"""
13 | 
14 | _tmp_dir = tempfile.mkdtemp(prefix="mlbench-")
15 | repo = Repo.clone_from(repo_url, _tmp_dir, branch=branch)
16 | 
17 | return os.path.join(_tmp_dir, path)
18 | 
19 | 
20 | class ChartBuilder:
21 | """Class that allows building helm charts either from a repository or a local folder
22 | 
23 | Args:
24 | chart (dict): Dictionary describing the location.
            Should look like:
25 |             ```
26 |             {
27 |                 "name": [chart_name],
28 |                 "source": {
29 |                     "type": ["git" or "directory"],
30 |                     "location": [repo_url or directory path],
31 |                     "reference": [optional, to select the branch]
32 |                 }
33 |             }
34 |             ```
35 |     """
36 | 
37 |     def __init__(self, chart):
38 |         self.chart = dotify(chart)
39 |         self.source_directory = self.source_clone()
40 | 
41 |     def source_clone(self):
42 |         """
43 |         Clones the chart's source.
44 |         Supported source types are `git` repositories (which can also
45 |         handle git:// local paths) and local `directory` paths.
46 |         """
47 | 
48 |         subpath = self.chart.source.get("subpath", "")
49 | 
50 |         if "name" not in self.chart:
51 |             raise ValueError("Please specify a name for the chart")
52 | 
53 |         if "type" not in self.chart.source:
54 |             raise ValueError("Need source type for chart {}".format(self.chart.name))
55 | 
56 |         if self.chart.source.type == "git":
57 |             if "reference" not in self.chart.source:
58 |                 self.chart.source.reference = DEFAULT_GIT_BRANCH
59 |             if "path" not in self.chart.source:
60 |                 self.chart.source.path = ""
61 |             self._source_tmp_dir = git_clone(
62 |                 self.chart.source.location,
63 |                 self.chart.source.reference,
64 |                 self.chart.source.path,
65 |             )
66 |         elif self.chart.source.type == "directory":
67 |             self._source_tmp_dir = self.chart.source.location
68 | 
69 |         else:
70 |             raise ValueError(
71 |                 "Unknown source type {} for chart {}".format(
72 |                     self.chart.source.type, self.chart.name
73 |                 )
74 |             )
75 | 
76 |         return os.path.join(self._source_tmp_dir, subpath)
77 | 
78 |     def _get_values_string(self, vals, parent=None):
79 |         """Given a dictionary of values, recursively returns the arguments to pass to `helm template`.
80 | 
81 |         For example: {"key1": "value1", "key2": {"key3": "value3"}}
82 |         gives ["--set", "key1=value1", "--set", "key2.key3=value3"]
83 | 
84 |         Args:
85 |             vals (dict): Dictionary of values
86 |             parent (str, optional): The parent key
87 | 
88 |         Returns:
89 |             (list[str]): The command list
90 |         """
91 |         values = []
92 |         for k, v in vals.items():
93 |             if isinstance(v, dict):
94 |                 values += self._get_values_string(v, k)
95 |             else:
96 |                 key = "{}={}".format(k, v)
97 |                 if parent is not None:
98 |                     key = "{}.{}".format(parent, key)
99 | 
100 |                 values += ["--set", key]
101 |         return values
102 | 
103 |     def get_chart(self, release_name, values):
104 |         """Executes the command `helm template {args}` to generate the chart
105 |         and saves the yaml to a temporary directory
106 | 
107 |         Args:
108 |             release_name (str): Release name
109 |             values (dict): Values to overwrite
110 | 
111 |         Returns:
112 |             (str): Path of the generated template
113 |         """
114 |         values_options = self._get_values_string(values)
115 |         output = subprocess.check_output(
116 |             ["helm", "template", release_name, self.source_directory] + values_options
117 |         )
118 | 
119 |         if self.chart.source.type == "git":
120 |             subpath = self.chart.source.get("subpath", "")
121 |             template_path = os.path.join(
122 |                 self._source_tmp_dir, subpath, "mlbench_template.yaml"
123 |             )
124 |         else:
125 |             template_path = os.path.join(tempfile.mkdtemp(), "template.yaml")
126 | 
127 |         with open(template_path, "wb") as f:
128 |             f.write(output)
129 |         return template_path
130 | 
-------------------------------------------------------------------------------- /mlbench_core/cli/utils.py: --------------------------------------------------------------------------------
1 | import subprocess
2 | from time import sleep
3 | 
4 | from kubernetes import client
5 | 
6 | from mlbench_core.cli.chartbuilder import ChartBuilder
7 | 
8 | 
9 | def _get_master_pod(release_name, pods):
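# A quick sanity check of the `--set` flattening implemented by
# ChartBuilder._get_values_string above (hypothetical REPL session):
#
#   >>> builder._get_values_string({"limits": {"workers": 2, "gpu": 0}})
#   ['--set', 'limits.workers=2', '--set', 'limits.gpu=0']
#
# i.e. nested dictionary keys become dotted helm `--set` paths.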
10 | """Given a release name and a list of pods, returns the master pod of the release 11 | 12 | Args: 13 | release_name (str): Release name 14 | pods (:obj:`V1PodList`): List of pods 15 | 16 | Returns: 17 | (:obj:`Pod`, optional): The master pod 18 | """ 19 | 20 | master_pod_name = "{}-mlbench-master-".format(release_name) 21 | for pod in pods.items: 22 | if master_pod_name in pod.metadata.name: 23 | return pod 24 | 25 | return None 26 | 27 | 28 | def _wait_for_deployment(release_name): 29 | """Given a release name, waits for the master pod to be running 30 | 31 | Args: 32 | release_name (str): Release name 33 | 34 | Raises: 35 | ValueError: If the master pod is not running 36 | """ 37 | kube_api = client.CoreV1Api() 38 | pods = kube_api.list_namespaced_pod(namespace="default") 39 | master_pod = _get_master_pod(release_name, pods) 40 | while master_pod is None or master_pod.status.phase == "Pending": 41 | pods = kube_api.list_namespaced_pod(namespace="default") 42 | master_pod = _get_master_pod(release_name, pods) 43 | sleep(1) 44 | if master_pod is None or master_pod.status.phase in ["Failed", "Unknown"]: 45 | raise ValueError("Could not deploy chart") 46 | 47 | 48 | def deploy_chart( 49 | num_workers, 50 | num_gpus, 51 | num_cpus, 52 | release_name, 53 | custom_value, 54 | kube_context, 55 | custom_chart=None, 56 | ): 57 | """Deploys the mlbench-helm chart given its values 58 | 59 | Args: 60 | num_workers (int): Number of worker nodes (excluding master) 61 | num_gpus (int): Number of GPUs per node 62 | num_cpus (int): Number of CPUs per node 63 | release_name (str): Release name 64 | custom_value (str): Custom values for chart 65 | kube_context (str): Current kube-context (must be saved in kubeconfig) 66 | custom_chart (dict, optional): Custom chart to use (e.g. local chart) 67 | """ 68 | sleep(5) 69 | 70 | # install chart 71 | chart = ChartBuilder( 72 | { 73 | "name": "mlbench-helm", 74 | "source": { 75 | "type": "git", 76 | "location": "https://github.com/mlbench/mlbench-helm", 77 | }, 78 | } 79 | if custom_chart is None 80 | else custom_chart 81 | ) 82 | 83 | values = {"limits": {"workers": num_workers, "gpu": num_gpus, "cpu": num_cpus}} 84 | if custom_value: 85 | # merge custom values with values 86 | for cv in custom_value: 87 | key, v = cv.split("=", 1) 88 | 89 | current = values 90 | key_path = key.split(".") 91 | 92 | for k in key_path[:-1]: 93 | if k not in current: 94 | current[k] = {} 95 | 96 | current = current[k] 97 | 98 | current[key_path[-1]] = v 99 | 100 | chart_path = chart.get_chart(release_name, values) 101 | 102 | output = subprocess.check_output( 103 | [ 104 | "kubectl", 105 | "apply", 106 | "--validate=false", 107 | "--context={}".format(kube_context), 108 | "-f", 109 | chart_path, 110 | ] 111 | ) 112 | sleep(1) 113 | 114 | _wait_for_deployment(release_name) 115 | -------------------------------------------------------------------------------- /mlbench_core/controlflow/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | import torch 3 | 4 | from . import pytorch 5 | except ImportError: 6 | pass 7 | 8 | try: 9 | import tensorflow 10 | 11 | from . 
import tensorflow
12 | except ImportError:
13 |     pass
14 | 
-------------------------------------------------------------------------------- /mlbench_core/controlflow/pytorch/__init__.py: --------------------------------------------------------------------------------
1 | from .checkpoints_evaluation import CheckpointsEvaluationControlFlow
2 | from .controlflow import (
3 |     compute_train_batch_metrics,
4 |     record_train_batch_stats,
5 |     record_validation_stats,
6 |     validation_round,
7 | )
8 | from .helpers import prepare_batch
9 | 
10 | __all__ = [
11 |     "CheckpointsEvaluationControlFlow",
12 |     "compute_train_batch_metrics",
13 |     "record_validation_stats",
14 |     "record_train_batch_stats",
15 |     "validation_round",
16 |     "prepare_batch",
17 | ]
18 | 
-------------------------------------------------------------------------------- /mlbench_core/controlflow/pytorch/checkpoints_evaluation.py: --------------------------------------------------------------------------------
1 | """Evaluate training/validation set using models in checkpoints"""
2 | import logging
3 | 
4 | import torch
5 | 
6 | from mlbench_core.aggregation.pytorch.centralized import AllReduceAggregation
7 | from mlbench_core.controlflow.pytorch.helpers import iterate_dataloader
8 | from mlbench_core.utils.pytorch.distributed import global_average
9 | 
10 | logger = logging.getLogger("mlbench")
11 | 
12 | 
13 | class CheckpointsEvaluationControlFlow(object):
14 |     """Evaluate models on training / validation dataset.
15 | 
16 |     Args:
17 |         ckpt_dir (str): Path to checkpoints.
18 |         rank (int): The rank of the current process
19 |         world_size (int): The total number of workers
20 |         checkpointer (:obj:`Checkpointer`): Used to load checkpoints.
21 |         model (:obj:`torch.nn.Module`): The model to evaluate.
22 |         epochs (int): Number of epochs to evaluate.
23 |         loss_function (:obj:`torch.nn.modules.loss._Loss`): loss function.
24 |         metrics (:obj:`list` of :obj:`mlbench_core.evaluation.pytorch.*`): metrics like TopKAccuracy.
25 |         use_cuda (bool): Whether to train on GPU or not. Default: `False`
26 |         dtype (str): The datatype to use for the dataloader data
27 |         max_batch_per_epoch (int): Maximum number of batches per epoch. Whole dataset
28 |             is used if not specified. Default: `None`
29 |     """
30 | 
31 |     def __init__(
32 |         self,
33 |         ckpt_dir,
34 |         rank,
35 |         world_size,
36 |         checkpointer,
37 |         model,
38 |         epochs,
39 |         loss_function,
40 |         metrics,
41 |         use_cuda=False,
42 |         dtype=None,
43 |         max_batch_per_epoch=None,
44 |     ):
45 |         self.ckpt_dir = ckpt_dir
46 |         self.rank = rank
47 |         self.checkpointer = checkpointer
48 |         self.model = model
49 |         self.epochs = epochs
50 |         self.loss_function = loss_function
51 |         self.metrics = metrics
52 |         self.dtype = dtype
53 |         self.max_batch_per_epoch = max_batch_per_epoch
54 |         self.use_cuda = use_cuda
55 | 
56 |         self.model_agg_fn = AllReduceAggregation(world_size=world_size).agg_model()
57 | 
58 |         self._check_checkpoints()
59 | 
60 |     def _check_checkpoints(self):
61 |         for epoch in range(self.epochs):
62 |             self.checkpointer.checkpoint_exists(self.ckpt_dir, self.rank, epoch)
63 | 
64 |     def _load_model(self, epoch):
65 |         # Load epoch-rank model
66 |         model = self.checkpointer.load_model_by_epoch(
67 |             self.ckpt_dir, self.rank, epoch, self.model
68 |         )
69 | 
70 |         # aggregate models
71 |         self.model_agg_fn(model, op="avg_world")
72 |         return model
73 | 
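    # Typical wiring (sketch; `checkpointer`, `model` and `metrics` are
    # placeholders provided by the benchmark task):
    #
    #   flow = CheckpointsEvaluationControlFlow(
    #       ckpt_dir="/checkpoints", rank=rank, world_size=world_size,
    #       checkpointer=checkpointer, model=model, epochs=10,
    #       loss_function=torch.nn.CrossEntropyLoss(), metrics=metrics,
    #       use_cuda=False, dtype="fp32",
    #   )
    #   train_stats = flow.evaluate_by_epochs(train_loader)
    #   val_stats = flow.evaluate_by_epochs(val_loader)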
74 |     def evaluate_by_epochs(self, dataloader):
75 |         """Evaluate dataset using the averaged models.
76 | 
77 |         In each epoch, each process loads models and averages them. The
78 |         averaged model is used to evaluate the train / validation dataset.
79 | 
80 |         Args:
81 |             dataloader (:obj:`torch.utils.data.DataLoader`): The dataset to be evaluated.
82 | 
83 |         Returns:
84 |             list: list of stats of models in each epoch.
85 |         """
86 |         stats_list = []
87 |         for epoch in range(self.epochs):
88 |             # Same model for all workers.
89 |             model = self._load_model(epoch)
90 |             model.eval()
91 | 
92 |             stats = {"epoch": epoch, "count": 0, "total_loss": 0}
93 |             for metric in self.metrics:
94 |                 stats["total_" + metric.name] = 0
95 | 
96 |             data_iter = iterate_dataloader(
97 |                 dataloader, self.dtype, self.max_batch_per_epoch, self.use_cuda
98 |             )
99 | 
100 |             with torch.no_grad():
101 |                 for i, (data, target) in enumerate(data_iter):
102 |                     output = model(data)
103 | 
104 |                     # Compute loss and metrics.
105 |                     count = len(target)
106 |                     stats["count"] += count
107 |                     stats["total_loss"] += self.loss_function(output, target) * count
108 |                     for metric in self.metrics:
109 |                         stats["total_" + metric.name] += metric(output, target) * count
110 | 
111 |                     logger.info(
112 |                         "E{:4}B{:4}: total loss={:10.3e}".format(
113 |                             epoch, i, stats["total_loss"] / stats["count"]
114 |                         )
115 |                     )
116 | 
117 |             # Keep globally averaged loss / metrics, etc.
118 |             stats["loss"] = global_average(stats["total_loss"], stats["count"]).item()
119 |             for metric in self.metrics:
120 |                 stats[metric.name] = global_average(
121 |                     stats["total_" + metric.name], stats["count"]
122 |                 ).item()
123 |                 del stats["total_" + metric.name]
124 |             del stats["count"], stats["total_loss"]
125 | 
126 |             stats_list.append(stats)
127 |         return stats_list
128 | 
-------------------------------------------------------------------------------- /mlbench_core/controlflow/pytorch/helpers.py: --------------------------------------------------------------------------------
1 | import itertools
2 | 
3 | 
4 | def maybe_range(maximum):
5 |     """Map an integer or None to an integer iterator starting from 0 with stride 1.
6 | 
7 |     If the maximum number of batches per epoch is limited, returns a finite
8 |     iterator. Otherwise, returns an iterator of infinite length.
9 | 
10 |     Args:
11 |         maximum (int | None): Maximum number of steps in the iterator.
12 |             If `None`, returns an iterator of infinite length.
13 | 
14 |     Returns:
15 |         (iterator)
16 |     """
17 |     if maximum is None:
18 |         counter = itertools.count(0)
19 |     else:
20 |         counter = range(maximum)
21 |     return counter
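# Behaviour sketch:
#
#   >>> list(zip(maybe_range(3), "abcdef"))
#   [(0, 'a'), (1, 'b'), (2, 'c')]   # capped at 3 steps
#   >>> list(zip(maybe_range(None), "abc"))
#   [(0, 'a'), (1, 'b'), (2, 'c')]   # bounded only by the data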
22 | 
23 | 
24 | def convert_dtype(dtype, obj):
25 |     """Converts a given tensor or module to a given dtype
26 | 
27 |     Args:
28 |         dtype (str): One of `fp32` or `fp64`
29 |         obj (:obj:`torch.Tensor` | :obj:`torch.nn.Module`): Module or tensor to convert
30 | 
31 |     Returns:
32 |         (:obj:`torch.Tensor` | :obj:`torch.nn.Module`): Converted tensor or module
33 |     """
34 |     # The object should be a ``module`` or a ``tensor``
35 |     if dtype == "fp32":
36 |         return obj.float()
37 |     elif dtype == "fp64":
38 |         return obj.double()
39 |     else:
40 |         raise NotImplementedError("dtype {} not supported.".format(dtype))
41 | 
42 | 
43 | def prepare_batch(data, target, dtype, transform_target_dtype=False, use_cuda=False):
44 |     """Prepares a batch for training by changing the type and sending to cuda
45 |     if necessary
46 | 
47 |     Args:
48 |         data (:obj:`torch.Tensor`): The input tensor
49 |         target (:obj:`torch.Tensor`): The target tensor
50 |         dtype (str): One of `fp32` or `fp64`, data type to transform input and/or target
51 |         transform_target_dtype (bool): Transform target to `dtype` too
52 |         use_cuda (bool): Send tensors to GPU
53 | 
54 |     Returns:
55 |         (:obj:`torch.Tensor`, :obj:`torch.Tensor`): Input and target tensors
56 |     """
57 |     data = convert_dtype(dtype, data)
58 |     if transform_target_dtype:
59 |         target = convert_dtype(dtype, target)
60 | 
61 |     if use_cuda:
62 |         data, target = data.cuda(), target.cuda()
63 | 
64 |     return data, target
65 | 
66 | 
67 | def iterate_dataloader(
68 |     dataloader,
69 |     dtype,
70 |     max_batch_per_epoch=None,
71 |     use_cuda=False,
72 |     transform_target_type=False,
73 | ):
74 |     """Returns an iterator over the given loader.
75 |     Can be used to limit the number of batches, convert input and target dtypes
76 |     and send tensors to the GPU
77 | 
78 |     Args:
79 |         dataloader (:obj:`torch.utils.data.DataLoader`): The loader
80 |         dtype (str): Type to convert to (`fp32` or `fp64`)
81 |         max_batch_per_epoch (int | None): Maximum number of batches
82 |         use_cuda (bool): Send tensors to GPU
83 |         transform_target_type (bool): Transform target dtype as well
84 | 
85 |     Returns:
86 |         (iterator): An iterator over the data
87 |     """
88 |     for _, (data, target) in zip(maybe_range(max_batch_per_epoch), dataloader):
89 |         data, target = prepare_batch(
90 |             data=data,
91 |             target=target,
92 |             dtype=dtype,
93 |             transform_target_dtype=transform_target_type,
94 |             use_cuda=use_cuda,
95 |         )
96 | 
97 |         yield data, target
98 | 
-------------------------------------------------------------------------------- /mlbench_core/controlflow/tensorflow/__init__.py: --------------------------------------------------------------------------------
1 | from .train_validation import TrainValidation
2 | 
3 | __all__ = ["TrainValidation"]
4 | 
-------------------------------------------------------------------------------- /mlbench_core/dataset/__init__.py: --------------------------------------------------------------------------------
1 | from . import imagerecognition, linearmodels, nlp, util
2 | 
-------------------------------------------------------------------------------- /mlbench_core/dataset/imagerecognition/__init__.py: --------------------------------------------------------------------------------
1 | try:
2 |     import torch
3 | 
4 |     from . import pytorch
5 | except ImportError:
6 |     pass
7 | 
8 | try:
9 |     import tensorflow
10 | 
11 |     from .
import tensorflow 12 | except ImportError: 13 | pass 14 | -------------------------------------------------------------------------------- /mlbench_core/dataset/imagerecognition/pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | from .dataloader import CIFAR10V1, Imagenet 2 | -------------------------------------------------------------------------------- /mlbench_core/dataset/imagerecognition/pytorch/dataloader.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import torchvision.datasets as datasets 5 | import torchvision.transforms as transforms 6 | 7 | _logger = logging.getLogger("mlbench") 8 | 9 | 10 | class CIFAR10V1(datasets.CIFAR10): 11 | """CIFAR10V1 Dataset. 12 | 13 | Loads CIFAR10V1 images with mean and std-dev normalisation. 14 | Performs random crop and random horizontal flip on train and 15 | only normalisation on val. 16 | Based on `torchvision.datasets.CIFAR10` and `Pytorch CIFAR 10 Example`_. 17 | 18 | Args: 19 | root (str): Root folder for the dataset 20 | train (bool): Whether to get the train or validation set (default=True) 21 | download (bool): Whether to download the dataset if it's not present 22 | 23 | .. _Pytorch CIFAR 10 Example: 24 | https://github.com/kuangliu/pytorch-cifar/blob/master/main.py 25 | """ 26 | 27 | def __init__(self, root, train=True, download=False): 28 | cifar10_stats = { 29 | "mean": (0.4914, 0.4822, 0.4465), 30 | "std": (0.2023, 0.1994, 0.2010), 31 | } 32 | 33 | if train: 34 | transform = transforms.Compose( 35 | [ 36 | transforms.RandomHorizontalFlip(), 37 | transforms.RandomCrop(32, padding=4), 38 | transforms.ToTensor(), 39 | transforms.Normalize(cifar10_stats["mean"], cifar10_stats["std"]), 40 | ] 41 | ) 42 | else: 43 | transform = transforms.Compose( 44 | [ 45 | transforms.ToTensor(), 46 | transforms.Normalize(cifar10_stats["mean"], cifar10_stats["std"]), 47 | ] 48 | ) 49 | super(CIFAR10V1, self).__init__( 50 | root=root, train=train, transform=transform, download=download 51 | ) 52 | 53 | 54 | class Imagenet(datasets.ImageFolder): 55 | """Imagenet (ILSVRC2017) Dataset. 56 | 57 | Loads Imagenet images with mean and std-dev normalisation. 58 | Performs random crop and random horizontal flip on train and 59 | resize + center crop on val. 
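    Example (sketch; the root path and batch size are illustrative)::

        train_set = Imagenet("/data/imagenet", train=True)
        loader = torch.utils.data.DataLoader(train_set, batch_size=256, shuffle=True)
        images, labels = next(iter(loader))  # images: [256, 3, 224, 224], normalised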
60 |     Based on `torchvision.datasets.ImageFolder`
61 | 
62 |     Args:
63 |         root (str): Root folder of the Imagenet dataset (without `train/` or `val/`)
64 |         train (bool): Whether to get the train or validation set (default=True)
65 |     """
66 | 
67 |     def __init__(self, root, train=True):
68 |         self.train = train
69 | 
70 |         imagenet_stats = {"mean": [0.485, 0.456, 0.406], "std": [0.229, 0.224, 0.225]}
71 | 
72 |         if train:
73 |             transform = transforms.Compose(
74 |                 [
75 |                     transforms.RandomResizedCrop(224),
76 |                     transforms.RandomHorizontalFlip(),
77 |                     transforms.ToTensor(),
78 |                     transforms.Normalize(imagenet_stats["mean"], imagenet_stats["std"]),
79 |                 ]
80 |             )
81 |             self.root = os.path.join(root, "train")
82 |         else:
83 |             transform = transforms.Compose(
84 |                 [
85 |                     transforms.Resize(256),
86 |                     transforms.CenterCrop(224),
87 |                     transforms.ToTensor(),
88 |                     transforms.Normalize(imagenet_stats["mean"], imagenet_stats["std"]),
89 |                 ]
90 |             )
91 |             self.root = os.path.join(root, "val")
92 | 
93 |         super().__init__(self.root, transform)
94 | 
-------------------------------------------------------------------------------- /mlbench_core/dataset/imagerecognition/tensorflow/__init__.py: --------------------------------------------------------------------------------
1 | from .cifar10 import DatasetCifar
2 | 
3 | __all__ = ["DatasetCifar"]
4 | 
-------------------------------------------------------------------------------- /mlbench_core/dataset/linearmodels/__init__.py: --------------------------------------------------------------------------------
1 | try:
2 |     import torch
3 | 
4 |     from . import pytorch
5 | except ImportError:
6 |     pass
7 | 
-------------------------------------------------------------------------------- /mlbench_core/dataset/linearmodels/pytorch/__init__.py: --------------------------------------------------------------------------------
1 | from .dataloader import LMDBDataset
2 | 
3 | __all__ = ["LMDBDataset"]
4 | 
-------------------------------------------------------------------------------- /mlbench_core/dataset/nlp/__init__.py: --------------------------------------------------------------------------------
1 | try:
2 |     import torch
3 | 
4 |     from . import pytorch
5 | except ImportError:
6 |     pass
7 | 
-------------------------------------------------------------------------------- /mlbench_core/dataset/nlp/pytorch/__init__.py: --------------------------------------------------------------------------------
1 | from mlbench_core.dataset.nlp.pytorch.wmt16.utils import build_collate_fn
2 | 
3 | from .wikitext2_dataset import Wikitext2Dataset
4 | from .wmt16_dataset import WMT16Dataset
5 | from .wmt17.batching import get_batches
6 | from .wmt17_dataset import WMT17Dataset
7 | 
-------------------------------------------------------------------------------- /mlbench_core/dataset/nlp/pytorch/wmt16/__init__.py: --------------------------------------------------------------------------------
1 | from .
import wmt16_config 2 | from .utils import * 3 | from .wmt16_tokenizer import WMT16Tokenizer 4 | -------------------------------------------------------------------------------- /mlbench_core/dataset/nlp/pytorch/wmt16/preprocess/filter_dataset.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from collections import Counter 3 | 4 | 5 | def parse_args(): 6 | parser = argparse.ArgumentParser(description="Clean dataset") 7 | parser.add_argument("-f1", "--file1", help="file1") 8 | parser.add_argument("-f2", "--file2", help="file2") 9 | return parser.parse_args() 10 | 11 | 12 | def save_output(fname, data): 13 | with open(fname, "w") as f: 14 | f.writelines(data) 15 | 16 | 17 | def main(): 18 | """ 19 | Discards all pairs of sentences which can't be decoded by latin-1 encoder. 20 | 21 | It aims to filter out sentences with rare unicode glyphs and pairs which 22 | are most likely not valid English-German sentences. 23 | 24 | Examples of discarded sentences: 25 | 26 | ✿★★★Hommage au king de la pop ★★★✿ ✿★★★Que son âme repos... 27 | 28 | Для их осуществления нам, прежде всего, необходимо преодолеть 29 | возражения рыночных фундаменталистов, которые хотят ликвидировать или 30 | уменьшить роль МВФ. 31 | 32 | practised as a scientist in various medical departments of the ⇗Medical 33 | University of Hanover , the ⇗University of Ulm , and the ⇗RWTH Aachen 34 | (rheumatology, pharmacology, physiology, pathology, microbiology, 35 | immunology and electron-microscopy). 36 | 37 | The same shift】 and press 【】 【alt out with a smaller diameter 38 | circle. 39 | 40 | Brought to you by ABMSUBS ♥leira(Coordinator/Translator) 41 | ♥chibichan93(Timer/Typesetter) ♥ja... 42 | 43 | Some examples: &0u - ☺ &0U - ☻ &tel - ☏ &PI - ¶ &SU - ☼ &cH- - ♥ &M2=♫ 44 | &sn - ﺵ SGML maps SGML to unicode. 
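    The core check, as a standalone predicate (a sketch of the logic used in
    main() below)::

        def is_latin1_pair(line1, line2):
            try:
                line1.encode("latin1")
                line2.encode("latin1")
                return True
            except UnicodeEncodeError:
                return False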
45 | """ 46 | args = parse_args() 47 | 48 | c = Counter() 49 | skipped = 0 50 | valid = 0 51 | data1 = [] 52 | data2 = [] 53 | 54 | with open(args.file1) as f1, open(args.file2) as f2: 55 | for idx, lines in enumerate(zip(f1, f2)): 56 | line1, line2 = lines 57 | if idx % 100000 == 1: 58 | print("Processed {} lines".format(idx)) 59 | try: 60 | line1.encode("latin1") 61 | line2.encode("latin1") 62 | except UnicodeEncodeError: 63 | skipped += 1 64 | else: 65 | data1.append(line1) 66 | data2.append(line2) 67 | valid += 1 68 | c.update(line1) 69 | 70 | ratio = valid / (skipped + valid) 71 | print("Skipped: {}, Valid: {}, Valid ratio {}".format(skipped, valid, ratio)) 72 | print("Character frequency:", c) 73 | 74 | save_output(args.file1, data1) 75 | save_output(args.file2, data2) 76 | 77 | 78 | if __name__ == "__main__": 79 | main() 80 | -------------------------------------------------------------------------------- /mlbench_core/dataset/nlp/pytorch/wmt16/preprocess/preprocess.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | 4 | import torch 5 | 6 | from mlbench_core.dataset.nlp.pytorch.wmt16 import wmt16_config 7 | from mlbench_core.dataset.nlp.pytorch.wmt16_dataset import WMT16Dataset 8 | 9 | 10 | def parse_args(): 11 | parser = argparse.ArgumentParser( 12 | description="GNMT prepare data", 13 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 14 | ) 15 | 16 | parser.add_argument( 17 | "--dataset-dir", 18 | default="data/wmt16_de_en", 19 | help="path to the directory with training/test data", 20 | ) 21 | parser.add_argument( 22 | "--max-size", 23 | default=None, 24 | type=int, 25 | help="use at most MAX_SIZE elements from training \ 26 | dataset (useful for benchmarking), by default \ 27 | uses entire dataset", 28 | ) 29 | 30 | parser.add_argument( 31 | "--math", 32 | default="fp16", 33 | choices=["fp32", "fp16"], 34 | help="arithmetic type", 35 | ) 36 | 37 | parser.add_argument( 38 | "--max-length-train", 39 | default=75, 40 | type=int, 41 | help="maximum sequence length for training \ 42 | (including special BOS and EOS tokens)", 43 | ) 44 | parser.add_argument( 45 | "--min-length-train", 46 | default=0, 47 | type=int, 48 | help="minimum sequence length for training \ 49 | (including special BOS and EOS tokens)", 50 | ) 51 | 52 | parser.add_argument( 53 | "--num-workers", default=2, type=int, help="Number of workers for loader" 54 | ) 55 | parser.add_argument( 56 | "--batch-size", default=1024, type=int, help="Batch size for loader" 57 | ) 58 | args = parser.parse_args() 59 | return args 60 | 61 | 62 | def build_collate_fn(max_seq_len): 63 | def collate_seq(seq): 64 | lengths = torch.tensor([len(s) for s in seq]) 65 | batch_length = max_seq_len 66 | 67 | shape = (len(seq), batch_length) 68 | seq_tensor = torch.full(shape, wmt16_config.PAD, dtype=torch.int64) 69 | 70 | for i, s in enumerate(seq): 71 | end_seq = lengths[i] 72 | seq_tensor[i, :end_seq].copy_(s[:end_seq]) 73 | 74 | return seq_tensor, lengths 75 | 76 | def parallel_collate(seqs): 77 | src_seqs, tgt_seqs = zip(*seqs) 78 | return tuple([collate_seq(s) for s in [src_seqs, tgt_seqs]]) 79 | 80 | return parallel_collate 81 | 82 | 83 | def main(): 84 | args = parse_args() 85 | 86 | print(f"Run arguments: {args}") 87 | 88 | train_data = WMT16Dataset( 89 | args.dataset_dir, 90 | lang=("en", "de"), 91 | math_precision=args.math, 92 | download=False, 93 | train=True, 94 | lazy=True, 95 | min_len=args.min_length_train, 96 | max_len=args.max_length_train, 
97 |         sort=False,
98 |         max_size=args.max_size,
99 |     )
100 | 
101 |     print("Total train points to pre-process: {}".format(len(train_data)))
102 |     collate_fn = build_collate_fn(max_seq_len=args.max_length_train)
103 | 
104 |     train_data.write_as_preprocessed(
105 |         collate_fn,
106 |         args.min_length_train,
107 |         args.max_length_train,
108 |         num_workers=args.num_workers,
109 |         batch_size=args.batch_size,
110 |     )
111 | 
112 | 
113 | if __name__ == "__main__":
114 |     main()
115 | 
-------------------------------------------------------------------------------- /mlbench_core/dataset/nlp/pytorch/wmt16/wmt16_config.py: --------------------------------------------------------------------------------
1 | """Configuration for WMT16 dataset"""
2 | PAD_TOKEN = "<pad>"
3 | UNK_TOKEN = "<unk>"
4 | BOS_TOKEN = "<s>"
5 | EOS_TOKEN = "<\s>"
6 | 
7 | PAD, UNK, BOS, EOS = 0, 1, 2, 3
8 | BPE_CODES = "bpe.32000"
9 | VOCAB_FNAME = "vocab.bpe.32000"
10 | 
11 | TRAIN_FNAME = "train.tok.clean.bpe.32000"
12 | VAL_FNAME = "newstest2014.tok.bpe.32000"
13 | 
14 | EXTS = (".en", ".de")
15 | 
-------------------------------------------------------------------------------- /mlbench_core/dataset/nlp/pytorch/wmt16/wmt16_tokenizer.py: --------------------------------------------------------------------------------
1 | import os
2 | from collections import defaultdict
3 | from functools import partial
4 | 
5 | import torch
6 | 
7 | from mlbench_core.dataset.nlp.pytorch.wmt16 import wmt16_config
8 | 
9 | 
10 | def _pad_vocabulary(vocab, math):
11 |     """
12 |     Pads vocabulary to a multiple of `pad` tokens (e.g. a 32317-entry vocabulary is padded to 32320 when `pad` is 8).
13 | 
14 |     Args:
15 |         vocab (list): list with vocabulary
16 |         math (str): Math precision, either `fp16` or `fp32`
17 | 
18 |     Returns:
19 |         list: padded vocabulary
20 |     """
21 |     if math == "fp16":
22 |         pad = 8
23 |     elif math == "fp32":
24 |         pad = 1
25 |     else:
26 |         raise NotImplementedError()
27 | 
28 |     vocab_size = len(vocab)
29 |     padded_vocab_size = (vocab_size + pad - 1) // pad * pad
30 |     for i in range(0, padded_vocab_size - vocab_size):
31 |         token = f"madeupword{i:04d}"
32 |         vocab.append(token)
33 |     assert len(vocab) % pad == 0
34 |     return vocab
35 | 
36 | 
37 | class WMT16Tokenizer:
38 |     """Tokenizer Class for WMT16 that uses the whole vocabulary
39 | 
40 |     Args:
41 |         base_dir (str): Base directory for files
42 |         math_precision (str): Math precision
43 |         separator (str): BPE separator token (default: `@@`)
44 |     """
45 | 
46 |     def __init__(
47 |         self,
48 |         base_dir,
49 |         math_precision=None,
50 |         separator="@@",
51 |     ):
52 |         self.separator = separator
53 | 
54 |         vocab = [
55 |             wmt16_config.PAD_TOKEN,
56 |             wmt16_config.UNK_TOKEN,
57 |             wmt16_config.BOS_TOKEN,
58 |             wmt16_config.EOS_TOKEN,
59 |         ]
60 |         vocab_fname = os.path.join(base_dir, wmt16_config.VOCAB_FNAME)
61 | 
62 |         with open(vocab_fname, encoding="utf-8") as vfile:
63 |             for line in vfile:
64 |                 vocab.append(line.strip())
65 | 
66 |         vocab = _pad_vocabulary(vocab, math_precision)
67 |         self.vocab_size = len(vocab)
68 | 
69 |         self.tok2idx = defaultdict(partial(int, wmt16_config.UNK))
70 |         for idx, token in enumerate(vocab):
71 |             self.tok2idx[token] = idx
72 | 
73 |         self.idx2tok = {}
74 |         for key, value in self.tok2idx.items():
75 |             self.idx2tok[value] = key
76 | 
77 |     def segment(self, line):
78 |         """
79 |         Tokenizes a single sentence and adds special BOS and EOS tokens.
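        Example (sketch; assumes the words are in the vocabulary)::

            tok = WMT16Tokenizer("data/wmt16_de_en", math_precision="fp32")
            ids = tok.segment("ein Haus")   # [BOS, id('ein'), id('Haus'), EOS]
            tok.detokenize(ids)             # -> 'ein Haus'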
80 | 81 | :param line: sentence 82 | 83 | returns: list representing tokenized sentence 84 | """ 85 | line = line.strip().split() 86 | entry = [self.tok2idx[i] for i in line] 87 | entry = [wmt16_config.BOS] + entry + [wmt16_config.EOS] 88 | return entry 89 | 90 | def detokenize(self, inputs, delim=" "): 91 | """ 92 | Detokenizes single sentence and removes token separator characters. 93 | 94 | :param inputs: sequence of tokens 95 | :param delim: tokenization delimiter 96 | 97 | returns: string representing detokenized sentence 98 | """ 99 | detok = delim.join([self.idx2tok[idx] for idx in inputs]) 100 | detok = detok.replace(self.separator + " ", "") 101 | detok = detok.replace(self.separator, "") 102 | 103 | detok = detok.replace(wmt16_config.BOS_TOKEN, "") 104 | detok = detok.replace(wmt16_config.EOS_TOKEN, "") 105 | detok = detok.replace(wmt16_config.PAD_TOKEN, "") 106 | detok = detok.strip() 107 | return detok 108 | -------------------------------------------------------------------------------- /mlbench_core/dataset/nlp/pytorch/wmt17/__init__.py: -------------------------------------------------------------------------------- 1 | from .collate import collate_batch 2 | from .wmt17_dictionary import Dictionary 3 | -------------------------------------------------------------------------------- /mlbench_core/dataset/nlp/pytorch/wmt17/collate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def _collate_tokens( 5 | values, 6 | pad_idx, 7 | eos_idx, 8 | left_pad, 9 | move_eos_to_beginning=False, 10 | n_seq_per_batch_multiple=8, 11 | seq_len_multiple=1, 12 | ): 13 | """Convert a list of 1d tensors into a padded 2d tensor. 14 | 15 | Args: 16 | values (list[torch.Tensor]): A list of tensors 17 | pad_idx (int): Padding symbol index 18 | eos_idx (int): EOS symbol index 19 | left_pad (bool): left- or right-padding (true: left, false: right) 20 | move_eos_to_beginning (bool): Reverse order of sequence of tokens (true: reverse, false: original) 21 | n_seq_per_batch_multiple (int): The number of sequences per batch to round down to 22 | seq_len_multiple (int): The number of tokens per sequence to round up to 23 | 24 | Returns: 25 | (:obj:`torch.Tensor`): The tensor of collated and padded tokens 26 | """ 27 | size_of_seq_dim = max(v.size(0) for v in values) # Unpadded size 28 | n_seq_in_batch = len(values) 29 | 30 | if n_seq_per_batch_multiple % seq_len_multiple == 0: 31 | n_seq_multiple = n_seq_per_batch_multiple / seq_len_multiple 32 | else: 33 | n_seq_multiple = n_seq_per_batch_multiple 34 | 35 | if n_seq_in_batch < n_seq_multiple or n_seq_in_batch % n_seq_multiple > 0: 36 | seq_len_multiple = n_seq_per_batch_multiple 37 | 38 | size_of_seq_dim = ( 39 | (size_of_seq_dim + seq_len_multiple - 1) // seq_len_multiple * seq_len_multiple 40 | ) # Padded seq len, rounded up to next multiple 41 | 42 | padded_2d_tensor = values[0].new(len(values), size_of_seq_dim).fill_(pad_idx) 43 | 44 | def copy_tensor(src, dst): 45 | assert dst.numel() == src.numel() 46 | 47 | if move_eos_to_beginning: 48 | assert src[-1] == eos_idx 49 | dst[0] = eos_idx 50 | dst[1:] = src[:-1] 51 | else: 52 | dst.copy_(src) 53 | 54 | if left_pad: 55 | for idx, val in enumerate(values): 56 | copy_tensor(val, padded_2d_tensor[idx][size_of_seq_dim - len(val) :]) 57 | else: 58 | for idx, val in enumerate(values): 59 | copy_tensor(val, padded_2d_tensor[idx][: len(val)]) 60 | 61 | return padded_2d_tensor 62 | 63 | 64 | def collate_batch( 65 | samples, 66 | pad_idx, 67 | 
eos_idx,
68 |     left_pad_source=True,
69 |     left_pad_target=False,
70 |     bsz_mult=8,
71 |     seq_len_multiple=1,
72 | ):
73 |     """Collate a list of samples into a batch
74 | 
75 |     Args:
76 |         samples (list[dict]): Samples to collate
77 |         pad_idx (int): Padding symbol index
78 |         eos_idx (int): EOS symbol index
79 |         left_pad_source (bool): Pad sources on the left
80 |         left_pad_target (bool): Pad targets on the left
81 |         bsz_mult (int): Batch size multiple
82 |         seq_len_multiple (int): Sequence length multiple
83 | 
84 |     Returns:
85 |         (dict): Containing keys `id` (list of indices), `ntokens` (total num tokens), `net_input` and `target`
86 | 
87 |     """
88 |     if len(samples) == 0:
89 |         return {}
90 | 
91 |     def merge(key, left_pad, move_eos_to_beginning=False):
92 |         return _collate_tokens(
93 |             [s[key] for s in samples],
94 |             pad_idx,
95 |             eos_idx,
96 |             left_pad,
97 |             move_eos_to_beginning,
98 |             bsz_mult,
99 |             seq_len_multiple,
100 |         )
101 | 
102 |     id = torch.LongTensor([s["id"] for s in samples])
103 |     src_tokens = merge("source", left_pad=left_pad_source)
104 |     # source lengths (used downstream to sort by descending source length)
105 |     src_lengths = torch.LongTensor([s["source"].numel() for s in samples])
106 | 
107 |     prev_output_tokens = None
108 |     target = None
109 |     if samples[0].get("target", None) is not None:
110 |         target = merge("target", left_pad=left_pad_target)
111 |         # we create a shifted version of targets for feeding the
112 |         # previous output token(s) into the next decoder step
113 |         prev_output_tokens = merge(
114 |             "target",
115 |             left_pad=left_pad_target,
116 |             move_eos_to_beginning=True,
117 |         )
118 |         ntokens = sum(len(s["target"]) for s in samples)
119 |     else:
120 |         ntokens = sum(len(s["source"]) for s in samples)
121 | 
122 |     return {
123 |         "id": id,
124 |         "ntokens": ntokens,
125 |         "net_input": {
126 |             "src_tokens": src_tokens,
127 |             "src_lengths": src_lengths,
128 |             "prev_output_tokens": prev_output_tokens,
129 |         },
130 |         "target": target,
131 |     }
132 | 
-------------------------------------------------------------------------------- /mlbench_core/dataset/nlp/pytorch/wmt17/preprocess/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlbench/mlbench-core/4fd3c7e6f1a5be69e52383ab2eb64cad257218c2/mlbench_core/dataset/nlp/pytorch/wmt17/preprocess/__init__.py
-------------------------------------------------------------------------------- /mlbench_core/dataset/nlp/pytorch/wmt17/preprocess/indexed_dataset.py: --------------------------------------------------------------------------------
1 | import struct
2 | 
3 | import numpy as np
4 | 
5 | dtypes = {
6 |     1: np.uint8,
7 |     2: np.int8,
8 |     3: np.int16,
9 |     4: np.int32,
10 |     5: np.int64,
11 |     6: np.float,
12 |     7: np.double,
13 | }
14 | 
15 | 
16 | def write_longs(f, a):
17 |     f.write(np.array(a, dtype=np.int64))
18 | 
19 | 
20 | def code(dtype):
21 |     for k in dtypes.keys():
22 |         if dtypes[k] == dtype:
23 |             return k
24 | 
25 | 
26 | class IndexedDatasetBuilder(object):
27 |     element_sizes = {
28 |         np.uint8: 1,
29 |         np.int8: 1,
30 |         np.int16: 2,
31 |         np.int32: 4,
32 |         np.int64: 8,
33 |         np.float: 4,
34 |         np.double: 8,
35 |     }
36 | 
37 |     def __init__(self, out_file, dtype=np.int32):
38 |         self.out_file = open(out_file, "wb")
39 |         self.dtype = dtype
40 |         self.data_offsets = [0]
41 |         self.dim_offsets = [0]
42 |         self.sizes = []
43 |         self.element_size = self.element_sizes[self.dtype]
44 | 
45 |     def add_item(self, tensor):
46 |         bytes = self.out_file.write(np.array(tensor.numpy(), dtype=self.dtype))
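        # The write above returns a byte count; the offsets below are kept in
        # *elements* (bytes / element_size), so data_offsets[i]:data_offsets[i+1]
        # spans item i in the flat binary file, while sizes and dim_offsets
        # record each item's shape.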
47 |         self.data_offsets.append(self.data_offsets[-1] + bytes / self.element_size)
48 |         for s in tensor.size():
49 |             self.sizes.append(s)
50 |         self.dim_offsets.append(self.dim_offsets[-1] + len(tensor.size()))
51 | 
52 |     def finalize(self, index_file):
53 |         self.out_file.close()
54 |         index = open(index_file, "wb")
55 |         index.write(b"TNTIDX\x00\x00")
56 |         index.write(struct.pack("<Q", 1))
57 |         index.write(struct.pack("<QQ", code(self.dtype), self.element_size))
58 |         index.write(struct.pack("<QQ", len(self.data_offsets) - 1, len(self.sizes)))
59 |         write_longs(index, self.dim_offsets)
60 |         write_longs(index, self.data_offsets)
61 |         write_longs(index, self.sizes)
62 |         index.close()
63 | 
-------------------------------------------------------------------------------- /mlbench_core/dataset/nlp/pytorch/wmt17/wmt17_dictionary.py: --------------------------------------------------------------------------------
1 | import torch
2 | 
3 | 
4 | class Dictionary(object):
5 |     """A mapping from symbols to consecutive integers.
6 | 
7 |     Indexing starts at 1 for consistency with the original Lua
8 |     implementation; index 0 is taken by a reserved placeholder symbol.
9 |     """
10 | 
11 |     def __init__(self):
12 |         self.symbols = []
13 |         self.count = []
14 |         self.indices = {}
15 |         # NOTE: the token string below is assumed (fairseq's reserved
16 |         # index-0 symbol); the original literal was lost to markup
17 |         # stripping in this listing, as were other "<...>" tokens.
18 |         # pad and eos are expected at indices 1 and 2 once the
19 |         # reference dictionary file has been loaded.
20 |         self.add_symbol("<Lua heritage>")
21 |         self.pad_index = 1
22 |         self.eos_index = 2
23 |         self.nspecial = 3
24 | 
25 |     def __eq__(self, other):
26 |         return self.indices == other.indices
27 | 
28 |     def __getitem__(self, idx):
29 |         if idx < len(self.symbols):
30 |             return self.symbols[idx]
31 |         else:
32 |             assert idx < len(self.symbols)
33 | 
34 |     def __len__(self):
35 |         """Returns the number of symbols in the dictionary"""
36 |         return len(self.symbols)
37 | 
38 |     def index(self, sym):
39 |         """Returns the index of the specified symbol"""
40 |         if sym in self.indices:
41 |             return self.indices[sym]
42 |         else:
43 |             assert sym in self.indices
44 | 
45 |     def string(self, tensor, bpe_symbol=None):
46 |         """Helper for converting a tensor of token indices to a string.
47 | 
48 |         Can optionally remove BPE symbols or escape words.
49 |         """
50 |         if torch.is_tensor(tensor) and tensor.dim() == 2:
51 |             return "\n".join(self.string(t) for t in tensor)
52 | 
53 |         def token_string(i):
54 |             return self[i]
55 | 
56 |         sent = " ".join(token_string(i) for i in tensor if i != self.eos())
57 |         if bpe_symbol is not None:
58 |             sent = (sent + " ").replace(bpe_symbol, "").rstrip()
59 | 
60 |         return sent
61 | 
62 |     def add_symbol(self, word, n=1):
63 |         """Adds a word to the dictionary"""
64 |         if word in self.indices:
65 |             idx = self.indices[word]
66 |             self.count[idx] = self.count[idx] + n
67 |             return idx
68 |         else:
69 |             idx = len(self.symbols)
70 |             self.indices[word] = idx
71 |             self.symbols.append(word)
72 |             self.count.append(n)
73 |             return idx
74 | 
75 |     def update(self, new_dict):
76 |         """Updates counts from new dictionary."""
77 |         for word in new_dict.symbols:
78 |             idx2 = new_dict.indices[word]
79 |             if word in self.indices:
80 |                 idx = self.indices[word]
81 |                 self.count[idx] = self.count[idx] + new_dict.count[idx2]
82 |             else:
83 |                 idx = len(self.symbols)
84 |                 self.indices[word] = idx
85 |                 self.symbols.append(word)
86 |                 self.count.append(new_dict.count[idx2])
87 | 
88 |     def pad(self):
89 |         """Helper to get index of pad symbol"""
90 |         return self.pad_index
91 | 
92 |     def eos(self):
93 |         """Helper to get index of end-of-sentence symbol"""
94 |         return self.eos_index
95 | 
96 |     @classmethod
97 |     def load(cls, f, ignore_utf_errors=False):
98 |         """Loads the dictionary from a text file with the format:
99 | 
100 |         ```
101 |         <symbol0>
102 |         <symbol1>
103 |         ...
104 | ``` 105 | 106 | Args: 107 | f (str): Dictionary file name 108 | ignore_utf_errors (bool): Ignore UTF-8 related errors 109 | """ 110 | if isinstance(f, str): 111 | try: 112 | if not ignore_utf_errors: 113 | with open(f, "r", encoding="utf-8") as fd: 114 | return cls.load(fd) 115 | else: 116 | with open(f, "r", encoding="utf-8", errors="ignore") as fd: 117 | return cls.load(fd) 118 | 119 | except FileNotFoundError as fnfe: 120 | raise fnfe 121 | 122 | except Exception: 123 | raise Exception( 124 | "Incorrect encoding detected in {}, please rebuild the dataset".format( 125 | f 126 | ) 127 | ) 128 | 129 | d = cls() 130 | for line in f.readlines(): 131 | word = line.strip()[1:-1] 132 | count = 1 133 | d.indices[word] = len(d.symbols) 134 | d.symbols.append(word) 135 | d.count.append(count) 136 | 137 | n_pad_tokens_on_end = 33712 - len(d.symbols) 138 | 139 | for i in range(n_pad_tokens_on_end): 140 | pad_str = "" 141 | d.indices[pad_str] = len(d.symbols) 142 | d.symbols.append(pad_str) 143 | d.count.append(1) 144 | 145 | return d 146 | -------------------------------------------------------------------------------- /mlbench_core/dataset/util/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | import torch 3 | 4 | from . import pytorch 5 | except ImportError: 6 | pass 7 | 8 | try: 9 | import tensorflow 10 | 11 | from . import tensorflow 12 | except ImportError: 13 | pass 14 | -------------------------------------------------------------------------------- /mlbench_core/dataset/util/pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | from .partition import ( 2 | DataPartitioner, 3 | Partition, 4 | Partitioner, 5 | partition_dataset_by_rank, 6 | ) 7 | -------------------------------------------------------------------------------- /mlbench_core/dataset/util/pytorch/partition.py: -------------------------------------------------------------------------------- 1 | r"""Partition PyTorch datasets.""" 2 | # -*- coding: utf-8 -*- 3 | import logging 4 | import random 5 | 6 | import numpy as np 7 | import torch 8 | import torch.distributed as dist 9 | 10 | from mlbench_core.utils.pytorch.distributed import get_backend_tensor 11 | 12 | _logger = logging.getLogger("mlbench") 13 | 14 | 15 | class Partition(object): 16 | """Dataset-like object, but only access a subset of it. 17 | 18 | Wraps a dataset, only exposing the entries selected by the `indices` 19 | parameter. 20 | 21 | Args: 22 | data (:obj:`list` of data entries): The data to partition over 23 | indices (:obj:`list` of :obj:`int`): indices of entries to use 24 | """ 25 | 26 | def __init__(self, data, indices): 27 | self.data = data 28 | self.indices = indices 29 | 30 | def __len__(self): 31 | return len(self.indices) 32 | 33 | def __getitem__(self, index): 34 | data_idx = self.indices[index] 35 | return self.data[data_idx] 36 | 37 | def __getattr__(self, item): 38 | return self.data.__getattribute__(item) 39 | 40 | 41 | class Partitioner(object): 42 | """Use a partition of dataset.""" 43 | 44 | def consistent_indices(self, rank, indices, shuffle): 45 | r"""synchronize indices among workers.""" 46 | if rank == 0 and shuffle: 47 | random.shuffle(indices) 48 | 49 | # broadcast. 50 | indices = get_backend_tensor(torch.IntTensor(indices)) 51 | 52 | dist.broadcast(indices, src=0) 53 | 54 | return indices.tolist() 55 | 56 | 57 | class DataPartitioner(Partitioner): 58 | """Partitions a dataset into different sized chunks. 
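    Example (sketch; requires an initialized `torch.distributed` process
    group, since the shuffled indices are broadcast from rank 0)::

        partitioner = DataPartitioner(dataset, rank, shuffle=True, sizes=[0.25] * 4)
        local_split = partitioner.use(rank)  # this worker's quarter of the data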
59 | 60 | Used for train:test:validation split. 61 | 62 | Args: 63 | data (:obj:`list` of data entries): The data to partition over 64 | rank (int): The rank of the current node 65 | shuffle (bool): Whether to shuffle entries or not 66 | sizes (:obj:`list` of :obj:`float`): The relative sizes of the 67 | splits. Should sum up to 1.0. (Default = [0.7, 0.2, 0.1]) 68 | """ 69 | 70 | def __init__(self, data, rank, shuffle, sizes=[0.7, 0.2, 0.1]): 71 | # prepare info. 72 | self.data = data 73 | self.data_size = len(self.data) 74 | self.partitions = [] 75 | 76 | # get shuffled/unshuffled data. 77 | indices = [x for x in range(0, self.data_size)] 78 | indices = self.consistent_indices(rank, indices, shuffle) 79 | 80 | # partition indices. 81 | sizes = np.cumsum(sizes) 82 | from_index = 0 83 | for ind, _ in enumerate(sizes): 84 | to_index = int(sizes[ind] * self.data_size) 85 | self.partitions.append(indices[from_index:to_index]) 86 | from_index = to_index 87 | 88 | def use(self, partition_ind): 89 | """Return a partition of data. 90 | 91 | Args: 92 | partition_ind (int): The index of the partition to get 93 | """ 94 | return Partition(self.data, self.partitions[partition_ind]) 95 | 96 | 97 | def partition_dataset_by_rank( 98 | dataset, rank, world_size, distribution="uniform", shuffle=True 99 | ): 100 | r"""Given a dataset, partition it by a distribution and each rank takes part of data. 101 | 102 | Args: 103 | dataset (:obj:`torch.utils.data.Dataset`): The dataset 104 | rank (int): The rank of the current worker 105 | world_size (int): The total number of workers 106 | distribution (str): The sampling distribution to use. Default: `uniform` 107 | shuffle (bool): Whether to shuffle the dataset before partitioning. Default: `True` 108 | """ 109 | if distribution != "uniform": 110 | raise NotImplementedError( 111 | "Distribution {} not implemented.".format(distribution) 112 | ) 113 | 114 | partition_sizes = [1.0 / world_size for _ in range(world_size)] 115 | partition = DataPartitioner(dataset, rank, shuffle, partition_sizes) 116 | partitioned_data = partition.use(rank) 117 | _logger.debug("Partition dataset use {}-th.".format(rank)) 118 | return partitioned_data 119 | -------------------------------------------------------------------------------- /mlbench_core/dataset/util/tools.py: -------------------------------------------------------------------------------- 1 | import bz2 2 | import os 3 | import sys 4 | import tarfile 5 | import zipfile 6 | from urllib.request import urlretrieve 7 | 8 | 9 | def progress_download(url, dest): 10 | """Downloads a file from `url` to `dest` and shows progress 11 | 12 | Args: 13 | url (src): Url to retrieve file from 14 | dest (src): Destination file 15 | """ 16 | 17 | def _progress(count, block_size, total_size): 18 | percentage = float(count * block_size) / float(total_size) * 100.0 19 | if percentage % 25 == 0: 20 | sys.stdout.write( 21 | "\r>> Downloading %s %.1f%%" % (os.path.basename(dest), percentage) 22 | ) 23 | sys.stdout.flush() 24 | 25 | urlretrieve(url, dest, _progress) 26 | print("\nDownloaded {} to {}\n".format(url, dest)) 27 | 28 | 29 | def extract_bz2_file(source, dest, delete=True): 30 | """Extracts a bz2 archive 31 | 32 | Args: 33 | source (str): Source file (must have .bz2 extension) 34 | dest (str): Destination file 35 | delete (bool): Delete compressed file after decompression 36 | 37 | """ 38 | assert source.endswith(".bz2"), "Extracting non bz2 archive" 39 | 40 | if os.path.isfile(dest): 41 | print("File {} already extracted to 
{}".format(source, dest)) 42 | return 43 | with open(dest, "wb") as d, open(source, "rb") as s: 44 | decompressor = bz2.BZ2Decompressor() 45 | for data in iter(lambda: s.read(1000 * 1024), b""): 46 | d.write(decompressor.decompress(data)) 47 | 48 | if delete: 49 | os.remove(source) 50 | 51 | 52 | def compress_to_bz2_file(source, delete=True): 53 | """Extracts a bz2 archive 54 | 55 | Args: 56 | source (str): Source file to compress 57 | delete (bool): Delete un-compressed file 58 | """ 59 | 60 | dest = source + ".bz2" 61 | with open(source, "rb") as s, open(dest, "wb") as d: 62 | compressor = bz2.BZ2Compressor() 63 | for data in iter(lambda: s.read(1000 * 1024), b""): 64 | d.write(compressor.compress(data)) 65 | 66 | if delete: 67 | os.remove(source) 68 | 69 | 70 | def maybe_download_and_extract_bz2(root, file_name, data_url): 71 | """Downloads file from given URL and extracts if bz2 72 | 73 | Args: 74 | root (str): The root directory 75 | file_name (str): File name to download to 76 | data_url (str): Url of data 77 | """ 78 | if not os.path.exists(root): 79 | os.makedirs(root) 80 | 81 | file_path = os.path.join(root, file_name) 82 | file_basename = os.path.splitext(file_name)[0] 83 | extracted_fpath = os.path.join(root, file_basename) 84 | 85 | if os.path.isfile(extracted_fpath): 86 | return extracted_fpath 87 | 88 | # Download file if not present 89 | if len([x for x in os.listdir(root) if x == file_name]) == 0: 90 | progress_download(data_url, file_path) 91 | 92 | # Extract downloaded file if compressed 93 | if file_name.endswith(".bz2"): 94 | # Extract file 95 | extract_bz2_file(file_path, extracted_fpath, delete=True) 96 | file_path = extracted_fpath 97 | return file_path 98 | 99 | 100 | def maybe_download_and_extract_tar_gz(root, file_name, data_url): 101 | """Downloads file from given URL and extracts if compressed as tar.gz 102 | 103 | Args: 104 | root (str): The root directory 105 | file_name (str): File name to download to 106 | data_url (str): Url of data 107 | """ 108 | if not os.path.exists(root): 109 | os.makedirs(root) 110 | 111 | file_path = os.path.join(root, file_name) 112 | 113 | # Download file if not present 114 | if len([x for x in os.listdir(root) if x == file_name]) == 0: 115 | progress_download(data_url, file_path) 116 | 117 | if file_name.endswith(".tar.gz"): 118 | with tarfile.open(file_path, "r:gz") as tar: 119 | dirs = [member for member in tar.getmembers()] 120 | tar.extractall(path=root, members=dirs) 121 | 122 | 123 | def maybe_download_and_extract_zip(root, file_name, data_url): 124 | """Downloads file from given URL and extracts if compressed as zip 125 | 126 | Args: 127 | root (str): The root directory 128 | file_name (str): File name to download to 129 | data_url (str): Url of data 130 | """ 131 | if not os.path.exists(root): 132 | os.makedirs(root) 133 | 134 | file_path = os.path.join(root, file_name) 135 | 136 | # Download file if not present 137 | if len([x for x in os.listdir(root) if x == file_name]) == 0: 138 | progress_download(data_url, file_path) 139 | 140 | if file_name.endswith(".zip"): 141 | with zipfile.ZipFile(file_path, "r") as zip: 142 | zip.extractall(root) 143 | -------------------------------------------------------------------------------- /mlbench_core/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | import torch 3 | 4 | from . import pytorch 5 | except ImportError: 6 | pass 7 | 8 | try: 9 | import tensorflow 10 | 11 | from . 
-------------------------------------------------------------------------------- /mlbench_core/evaluation/__init__.py: --------------------------------------------------------------------------------
1 | try:
2 |     import torch
3 | 
4 |     from . import pytorch
5 | except ImportError:
6 |     pass
7 | 
8 | try:
9 |     import tensorflow
10 | 
11 |     from . import tensorflow
12 | except ImportError:
13 |     pass
14 | 
-------------------------------------------------------------------------------- /mlbench_core/evaluation/goals.py: --------------------------------------------------------------------------------
1 | def _add_detailed_times(result, tracker):
2 |     compute_time = tracker.get_total_compute_time()
3 | 
4 |     if compute_time:
5 |         result += ", Compute: {} seconds".format(compute_time)
6 | 
7 |     communication_time = tracker.get_total_communication_time()
8 | 
9 |     if communication_time:
10 |         result += ", Communication: {} seconds".format(communication_time)
11 | 
12 |     metrics_time = tracker.get_total_metrics_time()
13 | 
14 |     if metrics_time:
15 |         result += ", Metrics Computation: {} seconds".format(metrics_time)
16 | 
17 |     preprocess_time = tracker.get_total_preprocess_time()
18 | 
19 |     if preprocess_time:
20 |         result += ", Pre-processing: {} seconds".format(preprocess_time)
21 |     return result
22 | 
23 | 
24 | def time_to_accuracy_goal(threshold):
25 |     def _time_to_accuracy_goal(metric_name, value, tracker):
26 |         if metric_name != "val_global_Prec@1":
27 |             return None
28 |         if value >= threshold:
29 |             duration = tracker.get_total_train_time()
30 | 
31 |             result = (
32 |                 "{0:02d}% Top 1 Validation Accuracy reached in {1:.3f} "
33 |                 "seconds".format(threshold, duration)
34 |             )
35 | 
36 |             result = _add_detailed_times(result, tracker)
37 | 
38 |             return result
39 | 
40 |         return None
41 | 
42 |     return _time_to_accuracy_goal
43 | 
44 | 
45 | def task1_time_to_accuracy_goal():
46 |     """Accuracy over Time target for benchmark task 1: Image classification
47 | 
48 |     Target is 80% accuracy
49 | 
50 |     Return:
51 |         func: time_to_accuracy_goal with threshold = 80
52 |     """
53 |     return time_to_accuracy_goal(80)
54 | 
55 | 
56 | def task1_time_to_accuracy_light_goal():
57 |     """Accuracy over Time target for benchmark task 1: Image classification
58 |     (Light)
59 | 
60 |     Light target is 70% accuracy
61 | 
62 |     Return:
63 |         func: time_to_accuracy_goal with threshold = 70
64 |     """
65 |     return time_to_accuracy_goal(70)
66 | 
67 | 
68 | def task2_time_to_accuracy_goal():
69 |     """Time to accuracy goal for benchmark task 2: Linear binary classifier
70 | 
71 |     Target is an accuracy of 89%
72 | 
73 |     Return:
74 |         func: time_to_accuracy_goal with threshold = 89
75 |     """
76 |     return time_to_accuracy_goal(89)
77 | 
78 | 
79 | def task2_time_to_accuracy_light_goal():
80 |     """Time to accuracy goal for benchmark task 2: Linear binary classifier (Light)
81 | 
82 |     Target is an accuracy of 80%
83 | 
84 |     Return:
85 |         func: time_to_accuracy_goal with threshold = 80
86 |     """
87 |     return time_to_accuracy_goal(80)
88 | 
89 | 
90 | def task3_time_to_perplexity_goal(threshold=70):
91 |     """Time to perplexity goal for benchmark task 3: Language Modeling"""
92 | 
93 |     def _time_to_perplexity_goal(metric_name, value, tracker):
94 |         if metric_name != "val_global_Perplexity":
95 |             return None
96 | 
97 |         if value <= threshold:
98 |             duration = tracker.get_total_train_time()
99 |             result = "Validation Perplexity of {0} reached in {1:.3f} seconds".format(
100 |                 threshold, duration
101 |             )
102 | 
103 |             result = _add_detailed_times(result, tracker)
104 | 
105 |             return result
106 |         return None
107 | 
108 |     return _time_to_perplexity_goal
109 | 
110 | 
111 | def task4_time_to_bleu_goal(threshold=24):
112 |     """Time to BLEU-score goal for benchmark task 4: GNMT machine translation"""
113 | 
114 |     def _time_to_bleu_goal(metric_name, value, tracker):
115 |         if metric_name != "val_global_BLEU-Score":
116 |             return None
117 | 
118 |         if value >= threshold:
119 |             duration = tracker.get_total_train_time()
120 |             result = "Validation BLEU-Score of {0} reached in {1:.3f} seconds".format(
121 |                 threshold, duration
122 |             )
123 | 
124 |             result = _add_detailed_times(result, tracker)
125 | 
126 |             return result
127 | 
128 |         return None
129 | 
130 |     return _time_to_bleu_goal
131 | 
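A goal is just a callable that the benchmark tracker invokes with every validation metric; a usage sketch (`tracker` is a placeholder for the tracker object from `mlbench_core/utils/tracker.py`):

```python
goal = task4_time_to_bleu_goal(threshold=24)
message = goal("val_global_BLEU-Score", 24.3, tracker)
# -> "Validation BLEU-Score of 24 reached in ... seconds, Compute: ..." once the
#    threshold is met, and None for non-BLEU metrics or values below it.
```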
-------------------------------------------------------------------------------- /mlbench_core/evaluation/pytorch/__init__.py: --------------------------------------------------------------------------------
1 | from . import criterion, metrics
2 | 
3 | __all__ = ["criterion", "metrics"]
4 | 
-------------------------------------------------------------------------------- /mlbench_core/evaluation/tensorflow/__init__.py: --------------------------------------------------------------------------------
1 | from . import criterion
2 | 
3 | __all__ = ["criterion"]
4 | 
-------------------------------------------------------------------------------- /mlbench_core/evaluation/tensorflow/criterion.py: --------------------------------------------------------------------------------
1 | r"""Define loss functions."""
2 | 
3 | import tensorflow as tf
4 | 
5 | 
6 | def softmax_cross_entropy_with_logits_v2_l2_regularized(
7 |     logits, labels, l2, loss_filter_fn
8 | ):
9 |     """Return an op for computing cross entropy with weight decay.
10 | 
11 |     The `labels` are assumed to be one-hot encoded. The loss filter function excludes some
12 |     tensors from computing weight decay.
13 | 
14 |     Args:
15 |         logits (:obj:`tf.Tensor`): input logits tensor.
16 |         labels (:obj:`tf.Tensor`): input one-hot encoded tensor.
17 |         l2 (:obj:`float`): size of weight decay
18 |         loss_filter_fn (:obj:`callable`): filter function.
19 | 
20 |     Returns:
21 |         :obj:`tf.Tensor`: a scalar tensor
22 |     """
23 |     labels = tf.cast(labels, tf.int32)
24 |     with tf.name_scope("loss"):
25 |         cross_entropy = tf.reduce_mean(
26 |             tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=labels)
27 |         )
28 | 
29 |         loss = cross_entropy + l2 * tf.add_n(
30 |             [
31 |                 tf.nn.l2_loss(v)
32 |                 for v in tf.trainable_variables()
33 |                 if loss_filter_fn(v.name)
34 |             ]
35 |         )
36 |     return loss
37 | 
-------------------------------------------------------------------------------- /mlbench_core/evaluation/tensorflow/metrics.py: --------------------------------------------------------------------------------
1 | r"""Define tensorflow metrics."""
2 | 
3 | import tensorflow as tf
4 | 
5 | 
6 | class TopKAccuracy(object):
7 |     """Compute the top-k accuracy of logits.
8 | 
9 |     Args:
10 |         logits (:obj:`tf.Tensor`): input tensor
11 |         labels (:obj:`tf.Tensor`): input one-hot encoded tensor.
12 |         topk (:obj:`int`, optional): Defaults to 1. top k accuracy.
13 |     """
14 | 
15 |     def __init__(self, logits, labels, topk=1):
16 |         labels = tf.cast(labels, tf.int32)
17 |         true_classes = tf.argmax(labels, axis=1)
18 | 
19 |         # predicted classes
20 |         pred_probs = tf.nn.softmax(logits, name="softmax_tensor")
21 |         pred_classes = tf.argmax(pred_probs, axis=1)
22 | 
23 |         # get metrics.
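        # (sketch) e.g. logits [[2., 1.], [0., 3.]] with one-hot labels
        # [[1, 0], [0, 1]] give pred_classes == true_classes for both rows,
        # so the op below evaluates to Prec@1 == 100.0.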
24 | with tf.name_scope("metrics"): 25 | if topk == 1: 26 | self.name = "Prec@1" 27 | self.metric_op = ( 28 | tf.reduce_mean( 29 | tf.cast(tf.equal(true_classes, pred_classes), tf.float32) 30 | ) 31 | * 100.0 32 | ) 33 | else: 34 | topk_op = tf.nn.in_top_k( 35 | predictions=pred_probs, targets=true_classes, k=topk 36 | ) 37 | self.name = "Prec@" + str(topk) 38 | self.metric_op = tf.reduce_mean(tf.cast(topk_op, tf.float32)) * 100.0 39 | 40 | 41 | def topk_accuracy_with_logits(logits, labels, k=1): 42 | """Compute the top-k accuracy of logits. 43 | 44 | Args: 45 | logits (:obj:`tf.Tensor`): input tensor 46 | labels (:obj:`tf.Tensor`): input one-hot encoded tensor. 47 | k (:obj:`int`, optional): Defaults to 1. top k accuracy. 48 | 49 | Returns: 50 | dict: the metric name and a scalar accuracy tensor (a percentage, between 0 and 100). 51 | """ 52 | m = TopKAccuracy(logits=logits, labels=labels, topk=k) 53 | return {"name": m.name, "value": m.metric_op} 54 | -------------------------------------------------------------------------------- /mlbench_core/install_cuda_extensions.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from setuptools import setup 4 | 5 | import mlbench_core 6 | 7 | base_dir = os.path.dirname(mlbench_core.__file__) 8 | ext_modules = [] 9 | try: 10 | from torch.utils.cpp_extension import CUDA_HOME, BuildExtension, CUDAExtension 11 | 12 | dir_1 = os.path.join( 13 | base_dir, "models/pytorch/transformer/modules/strided_batched_gemm" 14 | ) 15 | 16 | dir_2 = os.path.join(base_dir, "models/pytorch/gnmt/attn_score") 17 | strided_batched_gemm = CUDAExtension( 18 | name="mlbench_core.models.pytorch.transformer.modules.strided_batched_gemm", 19 | sources=[ 20 | os.path.join(dir_1, "strided_batched_gemm.cpp"), 21 | os.path.join(dir_1, "strided_batched_gemm_cuda.cu"), 22 | ], 23 | extra_compile_args={ 24 | "cxx": [ 25 | "-O2", 26 | ], 27 | "nvcc": [ 28 | "-I/app/cutlass/", 29 | "--gpu-architecture=compute_70", 30 | "--gpu-code=sm_70", 31 | "-O3", 32 | "-U__CUDA_NO_HALF_OPERATORS__", 33 | "-U__CUDA_NO_HALF_CONVERSIONS__", 34 | ], 35 | }, 36 | ) 37 | 38 | attn_score = CUDAExtension( 39 | name="mlbench_core.models.pytorch.gnmt.attn_score", 40 | sources=[ 41 | os.path.join(dir_2, "attn_score_cuda.cpp"), 42 | os.path.join(dir_2, "attn_score_cuda_kernel.cu"), 43 | ], 44 | extra_compile_args={ 45 | "cxx": [ 46 | "-O2", 47 | ], 48 | "nvcc": [ 49 | "--gpu-architecture=sm_70", 50 | ], 51 | }, 52 | ) 53 | ext_modules.append(strided_batched_gemm) 54 | ext_modules.append(attn_score) 55 | cmdclass = {"build_ext": BuildExtension} 56 | 57 | setup(ext_modules=ext_modules, cmdclass=cmdclass) 58 | 59 | except (ImportError, OSError) as e: 60 | raise ValueError("Cannot install extensions because CUDA was not found") from e 61 | -------------------------------------------------------------------------------- /mlbench_core/lr_scheduler/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | import torch 3 | 4 | from . import pytorch 5 | except ImportError: 6 | pass 7 | 8 | 9 | try: 10 | import tensorflow 11 | 12 | from . import tensorflow 13 | except ImportError: 14 | pass 15 | -------------------------------------------------------------------------------- /mlbench_core/lr_scheduler/pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | """Scheduling Learning Rates.
2 | """ 3 | 4 | from .lr import ( 5 | ExponentialWarmupMultiStepLR, 6 | LRLinearWarmUp, 7 | MultiStepLRLinearWarmUp, 8 | ReduceLROnPlateauWithWarmup, 9 | SparsifiedSGDLR, 10 | SQRTTimeDecayLR, 11 | SQRTTimeDecayLRWithWarmup, 12 | TimeDecayLR, 13 | ) 14 | -------------------------------------------------------------------------------- /mlbench_core/lr_scheduler/tensorflow/__init__.py: -------------------------------------------------------------------------------- 1 | from .lr import manual_stepping 2 | -------------------------------------------------------------------------------- /mlbench_core/lr_scheduler/tensorflow/lr.py: -------------------------------------------------------------------------------- 1 | r"""Learning rate scheduling in tensorflow. 2 | 3 | The manual_stepping function is taken from : 4 | 5 | https://github.com/tensorflow/models/blob/master/research/object_detection/utils/learning_schedules.py 6 | """ 7 | 8 | import tensorflow as tf 9 | 10 | 11 | def manual_stepping(global_step, boundaries, rates, warmup=False): 12 | """Manually stepped learning rate schedule. 13 | 14 | This function provides fine grained control over learning rates. One must 15 | specify a sequence of learning rates as well as a set of integer steps 16 | at which the current learning rate must transition to the next. For example, 17 | if boundaries = [5, 10] and rates = [.1, .01, .001], then the learning 18 | rate returned by this function is .1 for global_step=0,...,4, .01 for 19 | global_step=5...9, and .001 for global_step=10 and onward. 20 | 21 | Args: 22 | global_step (:obj:`tf.Tensor`): int64 (scalar) tensor representing global step. 23 | boundaries (list): a list of global steps at which to switch learning 24 | rates (list): a list of (float) learning rates corresponding to intervals between 25 | the boundaries. The length of this list must be exactly len(boundaries) + 1. 26 | warmup (bool, optional): Defaults to False. Whether to linearly interpolate learning 27 | rate for steps in [0, boundaries[0]]. 28 | 29 | Raises: 30 | ValueError: boundaries is a strictly increasing list of positive integers 31 | ValueError: len(rates) == len(boundaries) + 1 32 | ValueError: boundaries[0] != 0 33 | 34 | Returns: 35 | :obj:`tf.Tensor`: a (scalar) float tensor representing learning rate 36 | """ 37 | 38 | if any([b < 0 for b in boundaries]) or any( 39 | [not isinstance(b, int) for b in boundaries] 40 | ): 41 | raise ValueError("boundaries must be a list of positive integers") 42 | if any([bnext <= b for bnext, b in zip(boundaries[1:], boundaries[:-1])]): 43 | raise ValueError("Entries in boundaries must be strictly increasing.") 44 | if any([not isinstance(r, float) for r in rates]): 45 | raise ValueError("Learning rates must be floats") 46 | if len(rates) != len(boundaries) + 1: 47 | raise ValueError( 48 | "Number of provided learning rates must exceed " 49 | "number of boundary points by exactly 1." 
50 | ) 51 | 52 | if boundaries and boundaries[0] == 0: 53 | raise ValueError("First step cannot be zero.") 54 | 55 | if warmup and boundaries: 56 | slope = (rates[1] - rates[0]) * 1.0 / boundaries[0] 57 | warmup_steps = list(range(boundaries[0])) 58 | warmup_rates = [rates[0] + slope * step for step in warmup_steps] 59 | boundaries = warmup_steps + boundaries 60 | rates = warmup_rates + rates[1:] 61 | else: 62 | boundaries = [0] + boundaries 63 | num_boundaries = len(boundaries) 64 | rate_index = tf.reduce_max( 65 | tf.where( 66 | tf.greater_equal(global_step, boundaries), 67 | list(range(num_boundaries)), 68 | [0] * num_boundaries, 69 | ) 70 | ) 71 | return tf.reduce_sum( 72 | rates * tf.one_hot(rate_index, depth=num_boundaries), name="learning_rate" 73 | ) 74 | -------------------------------------------------------------------------------- /mlbench_core/models/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | import torch 3 | 4 | from . import pytorch 5 | except ImportError: 6 | pass 7 | 8 | 9 | try: 10 | import tensorflow 11 | 12 | from . import tensorflow 13 | except ImportError: 14 | pass 15 | -------------------------------------------------------------------------------- /mlbench_core/models/pytorch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlbench/mlbench-core/4fd3c7e6f1a5be69e52383ab2eb64cad257218c2/mlbench_core/models/pytorch/__init__.py -------------------------------------------------------------------------------- /mlbench_core/models/pytorch/gnmt/__init__.py: -------------------------------------------------------------------------------- 1 | from .models import GNMT 2 | from .translator import Translator 3 | -------------------------------------------------------------------------------- /mlbench_core/models/pytorch/gnmt/attn_score/attn_score_cuda.cpp: -------------------------------------------------------------------------------- 1 | #include <torch/extension.h> 2 | 3 | #include <vector> 4 | 5 | // CUDA forward declarations 6 | 7 | at::Tensor attn_score_forward_cuda( 8 | const at::Tensor &attn_query, 9 | const at::Tensor &attn_keys, 10 | const at::Tensor &bias, 11 | const at::Tensor &linear_attn); 12 | 13 | std::vector<at::Tensor> attn_score_backward_cuda( 14 | const at::Tensor &grad_output, 15 | const at::Tensor &attn_query, 16 | const at::Tensor &attn_keys, 17 | const at::Tensor &bias, 18 | const at::Tensor &linear_attn); 19 | 20 | // C++ interface 21 | 22 | #define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor") 23 | #define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") 24 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) 25 | 26 | at::Tensor attn_score_forward( 27 | const at::Tensor &attn_query, 28 | const at::Tensor &attn_keys, 29 | const at::Tensor &bias, 30 | const at::Tensor &linear_attn) { 31 | CHECK_INPUT(attn_query); 32 | CHECK_INPUT(attn_keys); 33 | CHECK_INPUT(bias); 34 | CHECK_INPUT(linear_attn); 35 | 36 | return attn_score_forward_cuda(attn_query, attn_keys, bias, linear_attn); 37 | } 38 | 39 | std::vector<at::Tensor> attn_score_backward( 40 | const at::Tensor &grad_output, 41 | const at::Tensor &attn_query, 42 | const at::Tensor &attn_keys, 43 | const at::Tensor &bias, 44 | const at::Tensor &linear_attn) { 45 | CHECK_INPUT(grad_output); 46 | CHECK_INPUT(attn_query); 47 | CHECK_INPUT(attn_keys); 48 | CHECK_INPUT(bias); 49 | CHECK_INPUT(linear_attn); 50 | 51 | return attn_score_backward_cuda(grad_output,
attn_query, attn_keys, bias, linear_attn); 52 | } 53 | 54 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 55 | m.def("forward", &attn_score_forward, "Attention score calculation forward (CUDA)"); 56 | m.def("backward", &attn_score_backward, "Attention score calculation backward (CUDA)"); 57 | } 58 | -------------------------------------------------------------------------------- /mlbench_core/models/pytorch/gnmt/encoder.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 3 | 4 | import mlbench_core.dataset.nlp.pytorch.wmt16.wmt16_config as config 5 | from mlbench_core.models.pytorch.gnmt.utils import init_lstm_ 6 | 7 | 8 | class ResidualRecurrentEncoder(nn.Module): 9 | """ 10 | Encoder with Embedding, LSTM layers, residual connections and optional 11 | dropout. 12 | 13 | The first LSTM layer is bidirectional and uses variable sequence length 14 | API, the remaining (num_layers-1) layers are unidirectional. Residual 15 | connections are enabled after third LSTM layer, dropout is applied on 16 | inputs to LSTM layers. 17 | 18 | Args: 19 | vocab_size: size of vocabulary 20 | hidden_size: hidden size for LSTM layers 21 | num_layers: number of LSTM layers, 1st layer is bidirectional 22 | dropout: probability of dropout (on input to LSTM layers) 23 | embedder: instance of nn.Embedding, if None constructor will 24 | create new embedding layer 25 | init_weight: range for the uniform initializer 26 | """ 27 | 28 | def __init__( 29 | self, 30 | vocab_size, 31 | hidden_size=1024, 32 | num_layers=4, 33 | dropout=0.2, 34 | embedder=None, 35 | init_weight=0.1, 36 | ): 37 | super(ResidualRecurrentEncoder, self).__init__() 38 | self.rnn_layers = nn.ModuleList() 39 | # 1st LSTM layer, bidirectional 40 | self.rnn_layers.append( 41 | nn.LSTM( 42 | hidden_size, 43 | hidden_size, 44 | num_layers=1, 45 | bias=True, 46 | batch_first=False, 47 | bidirectional=True, 48 | ) 49 | ) 50 | 51 | # 2nd LSTM layer, with 2x larger input_size 52 | self.rnn_layers.append( 53 | nn.LSTM( 54 | (2 * hidden_size), 55 | hidden_size, 56 | num_layers=1, 57 | bias=True, 58 | batch_first=False, 59 | ) 60 | ) 61 | 62 | # Remaining LSTM layers 63 | for _ in range(num_layers - 2): 64 | self.rnn_layers.append( 65 | nn.LSTM( 66 | hidden_size, 67 | hidden_size, 68 | num_layers=1, 69 | bias=True, 70 | batch_first=False, 71 | ) 72 | ) 73 | 74 | for lstm in self.rnn_layers: 75 | init_lstm_(lstm, init_weight) 76 | 77 | self.dropout = nn.Dropout(p=dropout) 78 | 79 | if embedder is not None: 80 | self.embedder = embedder 81 | else: 82 | self.embedder = nn.Embedding( 83 | vocab_size, hidden_size, padding_idx=config.PAD 84 | ) 85 | nn.init.uniform_(self.embedder.weight.data, -init_weight, init_weight) 86 | 87 | def forward(self, inputs, lengths): 88 | """ 89 | Execute the encoder. 
90 | 91 | Args: 92 | inputs: tensor with indices from the vocabulary 93 | lengths: vector with sequence lengths (excluding padding) 94 | 95 | Returns: 96 | tensor with encoded sequences 97 | 98 | """ 99 | x = self.embedder(inputs) 100 | 101 | # bidirectional layer 102 | x = self.dropout(x) 103 | x = pack_padded_sequence(x, lengths.cpu(), batch_first=False) 104 | x, _ = self.rnn_layers[0](x) 105 | x, _ = pad_packed_sequence(x, batch_first=False) 106 | 107 | # 1st unidirectional layer 108 | x = self.dropout(x) 109 | x, _ = self.rnn_layers[1](x) 110 | 111 | # the rest of unidirectional layers, 112 | # with residual connections starting from 3rd layer 113 | for i in range(2, len(self.rnn_layers)): 114 | residual = x 115 | x = self.dropout(x) 116 | x, _ = self.rnn_layers[i](x) 117 | x = x + residual 118 | 119 | return x 120 | -------------------------------------------------------------------------------- /mlbench_core/models/pytorch/gnmt/models.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from torch.nn.functional import log_softmax 3 | 4 | from mlbench_core.dataset.nlp.pytorch.wmt16 import wmt16_config 5 | from mlbench_core.models.pytorch.gnmt.decoder import ResidualRecurrentDecoder 6 | from mlbench_core.models.pytorch.gnmt.encoder import ResidualRecurrentEncoder 7 | 8 | 9 | class Seq2Seq(nn.Module): 10 | """ 11 | Generic Seq2Seq module, with an encoder and a decoder. 12 | Args: 13 | encoder (Encoder): Model encoder 14 | decoder (Decoder): Model decoder 15 | """ 16 | 17 | def __init__(self, encoder=None, decoder=None): 18 | super(Seq2Seq, self).__init__() 19 | self.encoder = encoder 20 | self.decoder = decoder 21 | 22 | def encode(self, inputs, lengths): 23 | """ 24 | Applies the encoder to inputs with a given input sequence lengths. 25 | 26 | Args: 27 | inputs (torch.tensor): tensor with inputs (seq_len, batch) 28 | lengths: vector with sequence lengths (excluding padding) 29 | 30 | Returns: 31 | torch.tensor 32 | """ 33 | return self.encoder(inputs, lengths) 34 | 35 | def decode(self, inputs, context, inference=False): 36 | """ 37 | Applies the decoder to inputs, given the context from the encoder. 38 | 39 | Args: 40 | inputs (torch.tensor): tensor with inputs (seq_len, batch) 41 | context: context from the encoder 42 | inference: if True inference mode, if False training mode 43 | 44 | Returns: 45 | torch.tensor 46 | """ 47 | return self.decoder(inputs, context, inference) 48 | 49 | def generate(self, inputs, context, beam_size): 50 | """ 51 | Autoregressive generator, works with SequenceGenerator class. 52 | Executes decoder (in inference mode), applies log_softmax and topK for 53 | inference with beam search decoding. 
54 | 55 | Args: 56 | inputs: tensor with inputs to the decoder 57 | context: context from the encoder 58 | beam_size: beam size for the generator 59 | 60 | Returns: 61 | (words, logprobs, scores, new_context) 62 | words: indices of topK tokens 63 | logprobs: log probabilities of topK tokens 64 | scores: scores from the attention module (for coverage penalty) 65 | new_context: new decoder context, includes new hidden states for 66 | decoder RNN cells 67 | """ 68 | logits, scores, new_context = self.decode(inputs, context, True) 69 | logprobs = log_softmax(logits, dim=-1) 70 | logprobs, words = logprobs.topk(beam_size, dim=-1) 71 | return words, logprobs, scores, new_context 72 | 73 | 74 | class GNMT(Seq2Seq): 75 | """ 76 | GNMT v2 model 77 | 78 | Args: 79 | vocab_size (int): size of vocabulary (number of tokens) 80 | hidden_size (int): internal hidden size of the model 81 | num_layers (int): number of layers, applies to both encoder and 82 | decoder 83 | dropout (float): probability of dropout (in encoder and decoder) 84 | share_embedding (bool): if True embeddings are shared between 85 | encoder and decoder 86 | 87 | """ 88 | 89 | def __init__( 90 | self, 91 | vocab_size, 92 | hidden_size=1024, 93 | num_layers=4, 94 | dropout=0.2, 95 | share_embedding=True, 96 | fusion=True, 97 | ): 98 | super(GNMT, self).__init__() 99 | 100 | if share_embedding: 101 | embedder = nn.Embedding( 102 | vocab_size, hidden_size, padding_idx=wmt16_config.PAD 103 | ) 104 | nn.init.uniform_(embedder.weight.data, -0.1, 0.1) 105 | else: 106 | embedder = None 107 | 108 | self.encoder = ResidualRecurrentEncoder( 109 | vocab_size, hidden_size, num_layers, dropout, embedder 110 | ) 111 | 112 | self.decoder = ResidualRecurrentDecoder( 113 | vocab_size, hidden_size, num_layers, dropout, embedder, fusion=fusion 114 | ) 115 | 116 | def forward(self, input_encoder, input_enc_len, input_decoder): 117 | context = self.encode(input_encoder, input_enc_len) 118 | context = (context, input_enc_len, None) 119 | output, _, _ = self.decode(input_decoder, context) 120 | 121 | return output 122 | -------------------------------------------------------------------------------- /mlbench_core/models/pytorch/gnmt/translator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from mosestokenizer import MosesDetokenizer 3 | 4 | import mlbench_core.dataset.nlp.pytorch.wmt16.wmt16_config as wmt16_config 5 | from mlbench_core.utils.pytorch.inference.beam_search import SequenceGenerator 6 | 7 | 8 | class Translator: 9 | """ 10 | Translator that outputs translated sentences from a GNMT model by using a SequenceGenerator 11 | 12 | Args: 13 | model (:obj:`torch.nn.Module`): Model to use 14 | trg_tokenizer (:obj:`mlbench_core.dataset.nlp.pytorch.wmt16.WMT16Tokenizer`): The target tokenizer 15 | """ 16 | 17 | def __init__( 18 | self, 19 | model, 20 | trg_tokenizer, 21 | trg_lang="de", 22 | beam_size=5, 23 | len_norm_factor=0.6, 24 | len_norm_const=5.0, 25 | cov_penalty_factor=0.1, 26 | max_seq_len=150, 27 | ): 28 | 29 | self.model = model 30 | self.tokenizer = trg_tokenizer 31 | self.insert_target_start = [wmt16_config.BOS] 32 | self.insert_src_start = [wmt16_config.BOS] 33 | self.insert_src_end = [wmt16_config.EOS] 34 | self.beam_size = beam_size 35 | self.trg_lang = trg_lang 36 | 37 | self.generator = SequenceGenerator( 38 | model=self.model, 39 | beam_size=beam_size, 40 | max_seq_len=max_seq_len, 41 | len_norm_factor=len_norm_factor, 42 | 
len_norm_const=len_norm_const, 43 | cov_penalty_factor=cov_penalty_factor, 44 | ) 45 | 46 | def get_detokenized_target(self, trg, batch_size): 47 | targets = [] 48 | with MosesDetokenizer(self.trg_lang) as detok: 49 | for i in range(batch_size): 50 | t = self.tokenizer.detokenize(trg[:, i].tolist()) 51 | t = detok(t.split()) 52 | targets.append(t) 53 | 54 | return targets 55 | 56 | def translate(self, src, trg): 57 | """Given source and target tokenized tensors, outputs the 58 | non-tokenized translation from the model, as well as the non-tokenized target 59 | 60 | Args: 61 | src (tuple): Tokenized source tensor and sentence lengths 62 | trg (tuple): Tokenized target tensor and sentence lengths 63 | 64 | Returns: 65 | (list, list): The detokenized model translations and the detokenized targets 66 | """ 67 | src, src_len = src 68 | trg, trg_len = trg 69 | device = next(self.model.parameters()).device 70 | 71 | batch_size = src.shape[1] 72 | 73 | bos = [self.insert_target_start] * (batch_size * self.beam_size) 74 | bos = torch.tensor(bos, dtype=torch.int64, device=device).view(1, -1) 75 | 76 | if self.beam_size == 1: 77 | generator = self.generator.greedy_search 78 | else: 79 | generator = self.generator.beam_search 80 | 81 | with torch.no_grad(): 82 | context = self.model.encode(src, src_len) 83 | context = [context, src_len, None] 84 | preds, lengths, counter = generator(batch_size, bos, context) 85 | 86 | preds = preds.cpu() 87 | targets = self.get_detokenized_target(trg, batch_size) 88 | 89 | output = [] 90 | with MosesDetokenizer(self.trg_lang) as detokenizer: 91 | for pred in preds: 92 | pred = pred.tolist() 93 | detok = self.tokenizer.detokenize(pred) 94 | detok = detokenizer(detok.split()) 95 | output.append(detok) 96 | 97 | return output, targets 98 | -------------------------------------------------------------------------------- /mlbench_core/models/pytorch/gnmt/utils.py: -------------------------------------------------------------------------------- 1 | from torch.nn import init 2 | 3 | 4 | def init_lstm_(lstm, init_weight=0.1): 5 | """ 6 | Initializes weights of LSTM layer. 7 | Weights and biases are initialized with uniform(-init_weight, init_weight) 8 | distribution. 9 | 10 | Args: 11 | lstm (torch.nn.LSTM): LSTM layer to initialize 12 | init_weight (float): range for the uniform initializer 13 | 14 | """ 15 | # Initialize hidden-hidden weights 16 | init.uniform_(lstm.weight_hh_l0.data, -init_weight, init_weight) 17 | # Initialize input-hidden weights: 18 | init.uniform_(lstm.weight_ih_l0.data, -init_weight, init_weight) 19 | 20 | # Initialize bias. PyTorch LSTM has two biases, one for input-hidden GEMM 21 | # and the other for hidden-hidden GEMM. Here input-hidden bias is 22 | # initialized with uniform distribution and hidden-hidden bias is 23 | # initialized with zeros.
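# A hedged usage sketch (comments only; `nn` below refers to torch.nn, which
# this module does not import itself):
#
#   lstm = nn.LSTM(input_size=8, hidden_size=8, bidirectional=True)
#   init_lstm_(lstm, init_weight=0.1)
#   # lstm.bias_hh_l0 is now all zeros, and lstm.weight_ih_l0 lies in [-0.1, 0.1].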
24 | init.uniform_(lstm.bias_ih_l0.data, -init_weight, init_weight) 25 | init.zeros_(lstm.bias_hh_l0.data) 26 | 27 | if lstm.bidirectional: 28 | init.uniform_(lstm.weight_hh_l0_reverse.data, -init_weight, init_weight) 29 | init.uniform_(lstm.weight_ih_l0_reverse.data, -init_weight, init_weight) 30 | 31 | init.uniform_(lstm.bias_ih_l0_reverse.data, -init_weight, init_weight) 32 | init.zeros_(lstm.bias_hh_l0_reverse.data) 33 | -------------------------------------------------------------------------------- /mlbench_core/models/pytorch/language_models/__init__.py: -------------------------------------------------------------------------------- 1 | from .lstm import LSTMLanguageModel 2 | -------------------------------------------------------------------------------- /mlbench_core/models/pytorch/language_models/lstm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from mlbench_core.models.pytorch.layers import ( 5 | LockedDropout, 6 | WeightDrop, 7 | embedded_dropout, 8 | ) 9 | 10 | 11 | class LSTMLanguageModel(nn.Module): 12 | """Container module with an encoder, a recurrent module, and a decoder. 13 | 14 | Model taken from https://github.com/salesforce/awd-lstm-lm 15 | 16 | Args: 17 | ntoken (int): Number of tokens in vocabulary 18 | ninp (int): Embedding size (LSTM input size) 19 | nhid (int): Number of hidden LSTM units per layer 20 | nlayers (int): Number of LSTM layers 21 | dropout (float): Output dropout rate (LockedDropout). Default 0.5 22 | dropouth (float): LSTM output dropout rate (between each layer except for last). Default 0.5 23 | dropouti (float): Input dropout to LSTM layers. Default 0.5 24 | dropoute (float): Embedding dropout. Default 0.1 25 | wdrop (float): Weight dropout for LSTM layers. Default 0 26 | tie_weights (bool): If True, encoder and decoder weights are tied. Default False 27 | 28 | """ 29 | 30 | def __init__( 31 | self, 32 | ntoken, 33 | ninp, 34 | nhid, 35 | nlayers, 36 | dropout=0.5, 37 | dropouth=0.5, 38 | dropouti=0.5, 39 | dropoute=0.1, 40 | wdrop=0, 41 | tie_weights=False, 42 | ): 43 | super(LSTMLanguageModel, self).__init__() 44 | self.lockdroph = LockedDropout(p=dropouth) 45 | self.lockdropi = LockedDropout(p=dropouti) 46 | self.lockdrop = LockedDropout(p=dropout) 47 | self.encoder = nn.Embedding(ntoken, ninp) 48 | 49 | self.rnns = [ 50 | torch.nn.LSTM( 51 | ninp if l == 0 else nhid, 52 | nhid if l != nlayers - 1 else (ninp if tie_weights else nhid), 53 | 1, 54 | dropout=0, 55 | ) 56 | for l in range(nlayers) 57 | ] 58 | if wdrop: 59 | self.rnns = [ 60 | WeightDrop(rnn, ["weight_hh_l0"], dropout=wdrop) for rnn in self.rnns 61 | ] 62 | print(self.rnns) 63 | self.rnns = torch.nn.ModuleList(self.rnns) 64 | self.decoder = nn.Linear(nhid, ntoken) 65 | 66 | # Optionally tie weights as in: 67 | # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016) 68 | # https://arxiv.org/abs/1608.05859 69 | # and 70 | # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 
2016) 71 | # https://arxiv.org/abs/1611.01462 72 | if tie_weights: 73 | # if nhid != ninp: 74 | # raise ValueError('When using the tied flag, nhid must be equal to emsize') 75 | self.decoder.weight = self.encoder.weight 76 | 77 | self.init_weights() 78 | 79 | self.ntoken = ntoken 80 | self.ninp = ninp 81 | self.nhid = nhid 82 | self.nlayers = nlayers 83 | self.dropoute = dropoute 84 | self.tie_weights = tie_weights 85 | 86 | def init_weights(self): 87 | initrange = 0.1 88 | self.encoder.weight.data.uniform_(-initrange, initrange) 89 | self.decoder.bias.data.fill_(0) 90 | self.decoder.weight.data.uniform_(-initrange, initrange) 91 | 92 | def forward(self, input, hidden, return_h=False): 93 | # Embedded Dropout 94 | emb = embedded_dropout( 95 | self.encoder, input, dropout=self.dropoute if self.training else 0 96 | ) 97 | # LSTM input dropout 98 | emb = self.lockdropi(emb) 99 | 100 | # Manual feeding of LSTM layers 101 | raw_output = emb 102 | new_hidden = [] 103 | raw_outputs = [] 104 | outputs = [] 105 | # Iterate on all LSTM layers 106 | for l, rnn in enumerate(self.rnns): 107 | # Compute output and hidden state 108 | raw_output, new_h = rnn(raw_output, hidden[l]) 109 | new_hidden.append(new_h) 110 | raw_outputs.append(raw_output) 111 | # Apply LockDrop if not last layer 112 | if l != self.nlayers - 1: 113 | raw_output = self.lockdroph(raw_output) 114 | outputs.append(raw_output) 115 | hidden = new_hidden 116 | 117 | # Output dropout 118 | output = self.lockdrop(raw_output) 119 | outputs.append(output) 120 | 121 | result = self.decoder( 122 | output.view(output.size(0) * output.size(1), output.size(2)) 123 | ) 124 | if return_h: 125 | return result, hidden, raw_outputs, outputs 126 | return result, hidden 127 | 128 | def init_hidden(self, bsz): 129 | weight = next(self.parameters()).data 130 | return [ 131 | ( 132 | weight.new( 133 | 1, 134 | bsz, 135 | self.nhid 136 | if l != self.nlayers - 1 137 | else (self.ninp if self.tie_weights else self.nhid), 138 | ).zero_(), 139 | weight.new( 140 | 1, 141 | bsz, 142 | self.nhid 143 | if l != self.nlayers - 1 144 | else (self.ninp if self.tie_weights else self.nhid), 145 | ).zero_(), 146 | ) 147 | for l in range(self.nlayers) 148 | ] 149 | -------------------------------------------------------------------------------- /mlbench_core/models/pytorch/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .dropout_layers import LockedDropout, WeightDrop, embedded_dropout 2 | -------------------------------------------------------------------------------- /mlbench_core/models/pytorch/layers/dropout_layers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn import functional as F 4 | 5 | """Following classes were taken and adapted from https://github.com/salesforce/awd-lstm-lm""" 6 | 7 | 8 | class LockedDropout(nn.Module): 9 | """LockedDropout applies the same dropout mask to every time step. 10 | 11 | Args: 12 | p (float): Probability of an element in the dropout mask to be zeroed. 13 | """ 14 | 15 | def __init__(self, p=0.5): 16 | self.p = p 17 | super().__init__() 18 | 19 | def forward(self, x): 20 | """ 21 | Args: 22 | x (:class:`torch.FloatTensor` [sequence length, batch size, rnn hidden size]): Input to 23 | apply dropout to.
24 | """ 25 | if not self.training or not self.p: 26 | return x 27 | x = x.clone() 28 | mask = x.new_empty(1, x.size(1), x.size(2), requires_grad=False).bernoulli_( 29 | 1 - self.p 30 | ) 31 | mask = mask.div_(1 - self.p) 32 | mask = mask.expand_as(x) 33 | return x * mask 34 | 35 | def __repr__(self): 36 | return self.__class__.__name__ + "(" + "p=" + str(self.p) + ")" 37 | 38 | 39 | def embedded_dropout(embed, words, dropout=0.1, scale=None): 40 | """Applies a mask dropout to the embedding layer 41 | 42 | Args: 43 | embed (:obj:`torch.nn.Embedding`): Embedding layer to use 44 | words (:obj:`torch.Tensor`): Word inputs (tokenized) 45 | dropout (float): Dropout rate (Default 0.1) 46 | scale (float, optional): Scale factor for embedding weights 47 | 48 | Returns: 49 | (:obj:`torch.Tensor`) Output of Embedding after applying dropout mask to weights 50 | """ 51 | if dropout: 52 | mask = embed.weight.data.new().resize_((embed.weight.size(0), 1)).bernoulli_( 53 | 1 - dropout 54 | ).expand_as(embed.weight) / (1 - dropout) 55 | masked_embed_weight = mask * embed.weight 56 | else: 57 | masked_embed_weight = embed.weight 58 | if scale: 59 | masked_embed_weight = scale.expand_as(masked_embed_weight) * masked_embed_weight 60 | 61 | padding_idx = embed.padding_idx 62 | if padding_idx is None: 63 | padding_idx = -1 64 | 65 | X = F.embedding( 66 | words, 67 | masked_embed_weight, 68 | padding_idx, 69 | embed.max_norm, 70 | embed.norm_type, 71 | embed.scale_grad_by_freq, 72 | embed.sparse, 73 | ) 74 | return X 75 | 76 | 77 | class WeightDrop(torch.nn.Module): 78 | """Weight Dropout layer. Wraps another module and patches the forward method to apply dropout to module weights. 79 | 80 | Args: 81 | module (:obj:`torch.nn.Module`): Module to wrap 82 | weights (listr[str]): Weights to apply dropout to 83 | dropout (float): Dropout rate (Default 0) 84 | 85 | """ 86 | 87 | def __init__(self, module, weights, dropout=0): 88 | super(WeightDrop, self).__init__() 89 | self.module = module 90 | self.weights = weights 91 | self.dropout = dropout 92 | self._setup() 93 | 94 | def _setup(self): 95 | """Sets up new weights for the module""" 96 | for name_w in self.weights: 97 | print("Applying weight drop of {} to {}".format(self.dropout, name_w)) 98 | # Make space for new weights 99 | w = getattr(self.module, name_w) 100 | del self.module._parameters[name_w] 101 | # Register raw weights 102 | self.module.register_parameter(name_w + "_raw", nn.Parameter(w.data)) 103 | 104 | def _setweights(self): 105 | """Sets dropped out weights""" 106 | for name_w in self.weights: 107 | # Get raw weights and apply dropout 108 | raw_w = getattr(self.module, name_w + "_raw") 109 | w = F.dropout(raw_w, p=self.dropout, training=self.training) 110 | 111 | # This is because we may call this function in non-training mode first and so, as self.training=False, w is 112 | # a nn.Parameter and thus self.module.weight remains a Parameter of self.module when we don't want it to. 
113 | if name_w in self.module._parameters: 114 | del self.module._parameters[name_w] 115 | # Set dropped out weights 116 | setattr(self.module, name_w, w) 117 | 118 | def forward(self, *args): 119 | """Forward patch""" 120 | self._setweights() 121 | self.module.flatten_parameters() 122 | return self.module.forward(*args) 123 | -------------------------------------------------------------------------------- /mlbench_core/models/pytorch/linear_models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class LogisticRegression(torch.nn.Module): 5 | """ 6 | Logistic regression implementation 7 | 8 | Args: 9 | n_features (int): Number of features 10 | """ 11 | 12 | def __init__(self, n_features): 13 | super(LogisticRegression, self).__init__() 14 | 15 | self.linear = torch.nn.Linear(n_features, 1, bias=False) 16 | 17 | def forward(self, x): 18 | y_pred = torch.sigmoid(self.linear(x)) 19 | return y_pred 20 | 21 | 22 | class LinearRegression(torch.nn.Module): 23 | """ 24 | Ridge regression implementation 25 | 26 | Args: 27 | n_features (int): Number of features 28 | """ 29 | 30 | def __init__(self, n_features): 31 | super(LinearRegression, self).__init__() 32 | self.linear = torch.nn.Linear(n_features, 1, bias=False) 33 | 34 | def forward(self, x): 35 | return self.linear(x) 36 | -------------------------------------------------------------------------------- /mlbench_core/models/pytorch/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | from .sequence_generator import SequenceGenerator 2 | from .transformer import TransformerModel 3 | -------------------------------------------------------------------------------- /mlbench_core/models/pytorch/transformer/decoder.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | from torch import nn 6 | 7 | from mlbench_core.models.pytorch.transformer.modules import ( 8 | PositionalEmbedding, 9 | TransformerDecoderLayer, 10 | ) 11 | 12 | 13 | class TransformerDecoder(nn.Module): 14 | """ 15 | Transformer decoder consisting of *args.decoder_layers* layers. Each layer 16 | is a :class:`TransformerDecoderLayer`. 17 | 18 | Args: 19 | args: Arguments of model. All arguments should be accessible via `__getattribute__` method 20 | dictionary (:obj:`mlbench_core.dataset.nlp.pytorch.wmt17.Dictionary`): decoding dictionary 21 | embed_tokens (torch.nn.Embedding): output embedding 22 | no_encoder_attn (bool, optional): whether to attend to encoder outputs 23 | (default: False). 24 | left_pad (bool): Pad targets to the left (`True`) or right (`False`). 
Default: `False` 25 | """ 26 | 27 | def __init__( 28 | self, args, dictionary, embed_tokens, no_encoder_attn=False, left_pad=False 29 | ): 30 | super().__init__() 31 | self.dictionary = dictionary 32 | self.dropout = args.dropout 33 | self.share_input_output_embed = args.share_decoder_input_output_embed 34 | 35 | embed_dim = embed_tokens.embedding_dim 36 | padding_idx = embed_tokens.padding_idx 37 | self.max_target_positions = args.max_target_positions 38 | 39 | self.embed_tokens = embed_tokens 40 | self.embed_scale = math.sqrt(embed_dim) 41 | self.embed_positions = ( 42 | PositionalEmbedding( 43 | args.max_target_positions, 44 | embed_dim, 45 | padding_idx, 46 | left_pad=left_pad, 47 | learned=args.decoder_learned_pos, 48 | ) 49 | if not args.no_token_positional_embeddings 50 | else None 51 | ) 52 | 53 | self.layers = nn.ModuleList( 54 | [ 55 | TransformerDecoderLayer(args, no_encoder_attn) 56 | for _ in range(args.decoder_layers) 57 | ] 58 | ) 59 | 60 | if not self.share_input_output_embed: 61 | self.embed_out = nn.Parameter(torch.Tensor(len(dictionary), embed_dim)) 62 | nn.init.normal_(self.embed_out, mean=0, std=embed_dim ** -0.5) 63 | self.normalize = args.decoder_normalize_before 64 | 65 | if self.normalize: 66 | self.layer_norm = nn.LayerNorm(embed_dim) 67 | 68 | def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None): 69 | # embed positions 70 | positions = ( 71 | self.embed_positions( 72 | prev_output_tokens, 73 | incremental_state=incremental_state, 74 | ) 75 | if self.embed_positions is not None 76 | else None 77 | ) 78 | 79 | if incremental_state is not None: 80 | prev_output_tokens = prev_output_tokens[:, -1:] 81 | if positions is not None: 82 | positions = positions[:, -1:] 83 | 84 | # embed tokens and positions 85 | x = self.embed_scale * self.embed_tokens(prev_output_tokens) 86 | if positions is not None: 87 | x += positions 88 | x = F.dropout(x, p=self.dropout, training=self.training) 89 | 90 | # B x T x C -> T x B x C 91 | x = x.transpose(0, 1) 92 | 93 | if x.size(1) == 1: 94 | if x.is_contiguous(): 95 | x = x.view(x.size(0), x.size(1), x.size(2)) 96 | else: 97 | x = x.contiguous() 98 | else: 99 | x = x.contiguous() 100 | 101 | attn = None 102 | 103 | # decoder layers 104 | for layer in self.layers: 105 | x, attn = layer( 106 | x, 107 | encoder_out["encoder_out"] if encoder_out is not None else None, 108 | encoder_out["encoder_padding_mask"] 109 | if encoder_out is not None 110 | else None, 111 | incremental_state, 112 | ) 113 | 114 | if self.normalize: 115 | x = self.layer_norm(x) 116 | 117 | # T x B x C -> B x T x C 118 | x = x.transpose(0, 1) 119 | # project back to size of vocabulary 120 | if self.share_input_output_embed: 121 | x = F.linear(x, self.embed_tokens.weight) 122 | else: 123 | x = F.linear(x, self.embed_out) 124 | 125 | return x, attn 126 | 127 | def max_positions(self): 128 | """Maximum output length supported by the decoder.""" 129 | if self.embed_positions is None: 130 | return self.max_target_positions 131 | return min(self.max_target_positions, self.embed_positions.max_positions()) 132 | 133 | def reorder_incremental_state(self, incremental_state, new_order): 134 | """Reorder incremental state. 135 | 136 | This should be called when the order of the input has changed from the 137 | previous time step. A typical use case is beam search, where the input 138 | order changes between time steps based on the selection of beams. 
139 | """ 140 | 141 | def apply_reorder_incremental_state(module): 142 | if module != self and hasattr(module, "reorder_incremental_state"): 143 | module.reorder_incremental_state( 144 | incremental_state, 145 | new_order, 146 | ) 147 | 148 | self.apply(apply_reorder_incremental_state) 149 | -------------------------------------------------------------------------------- /mlbench_core/models/pytorch/transformer/encoder.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | from torch import nn 6 | 7 | from mlbench_core.models.pytorch.transformer.modules import ( 8 | PositionalEmbedding, 9 | TransformerEncoderLayer, 10 | ) 11 | 12 | 13 | class TransformerEncoder(nn.Module): 14 | """ 15 | Transformer encoder consisting of *args.encoder_layers* layers. Each layer 16 | is a :class:`TransformerEncoderLayer`. 17 | 18 | Args: 19 | args: Arguments of model. All arguments should be accessible via `__getattribute__` method 20 | dictionary (:obj:`mlbench_core.dataset.nlp.pytorch.wmt17.Dictionary`): encoding dictionary 21 | embed_tokens (torch.nn.Embedding): input embedding 22 | left_pad (bool): Pad sources to the left (`True`) or right (`False`). Default: `True` 23 | """ 24 | 25 | def __init__(self, args, dictionary, embed_tokens, left_pad=True): 26 | super().__init__() 27 | self.dictionary = dictionary 28 | self.dropout = args.dropout 29 | 30 | embed_dim = embed_tokens.embedding_dim 31 | self.padding_idx = embed_tokens.padding_idx 32 | self.max_source_positions = args.max_source_positions 33 | 34 | self.softmax_type = args.softmax_type 35 | 36 | self.embed_tokens = embed_tokens 37 | self.embed_scale = math.sqrt(embed_dim) 38 | self.embed_positions = ( 39 | PositionalEmbedding( 40 | args.max_source_positions, 41 | embed_dim, 42 | self.padding_idx, 43 | left_pad=left_pad, 44 | learned=args.encoder_learned_pos, 45 | ) 46 | if not args.no_token_positional_embeddings 47 | else None 48 | ) 49 | 50 | self.layers = nn.ModuleList( 51 | [TransformerEncoderLayer(args) for i in range(args.encoder_layers)] 52 | ) 53 | 54 | self.normalize = args.encoder_normalize_before 55 | if self.normalize: 56 | self.layer_norm = nn.LayerNorm(embed_dim) 57 | 58 | def forward(self, src_tokens): 59 | """Forward function of encoder 60 | 61 | Args: 62 | src_tokens (:obj:`torch.Tensor`): Source tokens 63 | 64 | Returns: 65 | (dict): {`encoder:out` (:obj:`torch.Tensor`), `encoder_padding_mask` (:obj:`torch.Tensor`)} 66 | """ 67 | # embed tokens and positions 68 | x = self.embed_scale * self.embed_tokens(src_tokens) 69 | 70 | if self.embed_positions is not None: 71 | x += self.embed_positions(src_tokens) 72 | x = F.dropout(x, p=self.dropout, training=self.training) 73 | 74 | # B x T x C -> T x B x C 75 | x = x.transpose(0, 1) 76 | 77 | if x.size(1) == 1: 78 | if x.is_contiguous(): 79 | x = x.view(x.size(0), x.size(1), x.size(2)) 80 | else: 81 | x = x.contiguous() 82 | else: 83 | x = x.contiguous() 84 | 85 | # compute padding mask 86 | encoder_padding_mask = src_tokens.eq(self.padding_idx) 87 | if not encoder_padding_mask.any(): 88 | encoder_padding_mask = None 89 | if (self.softmax_type == "fast_fill") and (encoder_padding_mask is not None): 90 | encoder_padding_mask = torch.zeros_like( 91 | encoder_padding_mask, dtype=x.dtype 92 | ).masked_fill_(encoder_padding_mask, float("-inf")) 93 | 94 | # encoder layers 95 | for layer in self.layers: 96 | x = layer(x, encoder_padding_mask) 97 | 98 | if self.normalize: 99 | x = 
self.layer_norm(x) 100 | 101 | return { 102 | "encoder_out": x, # T x B x C 103 | "encoder_padding_mask": encoder_padding_mask, # B x T 104 | } 105 | 106 | def reorder_encoder_out(self, encoder_out, new_order): 107 | if encoder_out["encoder_out"] is not None: 108 | encoder_out["encoder_out"] = encoder_out["encoder_out"].index_select( 109 | 1, new_order 110 | ) 111 | if encoder_out["encoder_padding_mask"] is not None: 112 | encoder_out["encoder_padding_mask"] = encoder_out[ 113 | "encoder_padding_mask" 114 | ].index_select(0, new_order) 115 | return encoder_out 116 | 117 | def max_positions(self): 118 | """Maximum input length supported by the encoder.""" 119 | if self.embed_positions is None: 120 | return self.max_source_positions 121 | return min(self.max_source_positions, self.embed_positions.max_positions()) 122 | -------------------------------------------------------------------------------- /mlbench_core/models/pytorch/transformer/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .embeddings import PositionalEmbedding, build_embedding 2 | from .layers import TransformerDecoderLayer, TransformerEncoderLayer 3 | 4 | __all__ = [ 5 | "PositionalEmbedding", 6 | "build_embedding", 7 | "TransformerDecoderLayer", 8 | "TransformerEncoderLayer", 9 | ] 10 | -------------------------------------------------------------------------------- /mlbench_core/models/pytorch/transformer/modules/strided_batched_gemm/strided_batched_gemm.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // you may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License.
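// A hedged usage sketch (illustrative only; assumes the extension was built by
// install_cuda_extensions.py under the name
// mlbench_core.models.pytorch.transformer.modules.strided_batched_gemm).
// Judging from the signature below, the op follows torch.baddbmm-style
// semantics, beta * in_result + alpha * (batch1 @ batch2), on half-precision
// CUDA tensors:
//
//   import torch
//   from mlbench_core.models.pytorch.transformer.modules import strided_batched_gemm as sbg
//   a = torch.randn(4, 8, 16, dtype=torch.half, device="cuda")
//   b = torch.randn(4, 16, 32, dtype=torch.half, device="cuda")
//   out = torch.zeros(4, 8, 32, dtype=torch.half, device="cuda")
//   res = sbg.strided_batched_gemm(0.0, out, 1.0, a, b)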
13 | 14 | #include <torch/extension.h> 15 | #include <vector> 16 | 17 | at::Tensor strided_batched_gemm_cuda( 18 | float beta, 19 | at::Tensor in_result, 20 | float alpha, 21 | at::Tensor batch1, 22 | at::Tensor batch2); 23 | 24 | // C++ interface 25 | 26 | #define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor") 27 | #define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") 28 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) 29 | 30 | at::Tensor strided_batched_gemm( 31 | float beta, 32 | at::Tensor in_result, 33 | float alpha, 34 | at::Tensor batch1, 35 | at::Tensor batch2) { 36 | //CHECK_INPUT(in_result); 37 | //CHECK_INPUT(batch1); 38 | //CHECK_INPUT(batch2); 39 | 40 | AT_ASSERTM(in_result.dim() == 3, "expected 3D tensor"); 41 | AT_ASSERTM(batch1.dim() == 3, "expected 3D tensor"); 42 | AT_ASSERTM(batch2.dim() == 3, "expected 3D tensor"); 43 | 44 | AT_ASSERTM(in_result.size(0) == batch1.size(0), "equal number of batches expected"); 45 | AT_ASSERTM(in_result.size(0) == batch2.size(0), "equal number of batches expected"); 46 | 47 | AT_ASSERTM(in_result.size(1) == batch1.size(1), "wrong matrix size"); 48 | AT_ASSERTM(in_result.size(2) == batch2.size(2), "wrong matrix size"); 49 | AT_ASSERTM(batch1.size(2) == batch2.size(1), "wrong matrix size"); 50 | 51 | AT_ASSERTM(batch1.type().scalarType() == at::ScalarType::Half, "Only HALF is supported"); 52 | AT_ASSERTM(batch2.type().scalarType() == at::ScalarType::Half, "Only HALF is supported"); 53 | AT_ASSERTM(in_result.type().scalarType() == at::ScalarType::Half, "Only HALF is supported"); 54 | 55 | return strided_batched_gemm_cuda(beta, in_result, alpha, batch1, batch2); 56 | } 57 | 58 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 59 | m.def("strided_batched_gemm", &strided_batched_gemm, "Special strided batched gemm."); 60 | } 61 | 62 | -------------------------------------------------------------------------------- /mlbench_core/models/pytorch/transformer/transformer.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from mlbench_core.models.pytorch.transformer.decoder import TransformerDecoder 4 | from mlbench_core.models.pytorch.transformer.encoder import TransformerEncoder 5 | from mlbench_core.models.pytorch.transformer.modules import build_embedding 6 | 7 | DEFAULT_MAX_SOURCE_POSITIONS = 256 8 | DEFAULT_MAX_TARGET_POSITIONS = 256 9 | 10 | 11 | class TransformerModel(nn.Module): 12 | """Transformer model 13 | 14 | This model uses MultiHeadAttention as described in 15 | :cite:`NIPS2017_7181` 16 | 17 | Args: 18 | args: Arguments of model.
All arguments should be accessible via `__getattribute__` method 19 | src_dict (:obj:`mlbench_core.dataset.nlp.pytorch.wmt17.Dictionary`): Source dictionary 20 | trg_dict (:obj:`mlbench_core.dataset.nlp.pytorch.wmt17.Dictionary`): Target dictionary 21 | """ 22 | 23 | def __init__(self, args, src_dict, trg_dict): 24 | super().__init__() 25 | self._is_generation_fast = False 26 | if not hasattr(args, "max_source_positions"): 27 | args.max_source_positions = DEFAULT_MAX_SOURCE_POSITIONS 28 | if not hasattr(args, "max_target_positions"): 29 | args.max_target_positions = DEFAULT_MAX_TARGET_POSITIONS 30 | 31 | # Define embedding layer 32 | if args.share_all_embeddings: 33 | if src_dict != trg_dict: 34 | raise ValueError("share_all_embeddings requires a joined dictionary") 35 | if args.encoder_embed_dim != args.decoder_embed_dim: 36 | raise ValueError( 37 | "share_all_embeddings requires encoder_embed_dim to match decoder_embed_dim" 38 | ) 39 | if args.decoder_embed_path and ( 40 | args.decoder_embed_path != args.encoder_embed_path 41 | ): 42 | raise ValueError( 43 | "share_all_embeddings not compatible with decoder_embed_path" 44 | ) 45 | encoder_embed_tokens = build_embedding( 46 | src_dict, args.encoder_embed_dim, args.encoder_embed_path 47 | ) 48 | decoder_embed_tokens = encoder_embed_tokens 49 | args.share_decoder_input_output_embed = True 50 | else: 51 | encoder_embed_tokens = build_embedding( 52 | src_dict, args.encoder_embed_dim, args.encoder_embed_path 53 | ) 54 | decoder_embed_tokens = build_embedding( 55 | trg_dict, args.decoder_embed_dim, args.decoder_embed_path 56 | ) 57 | self.encoder = TransformerEncoder(args, src_dict, encoder_embed_tokens) 58 | self.decoder = TransformerDecoder(args, trg_dict, decoder_embed_tokens) 59 | 60 | def forward( 61 | self, 62 | src_tokens, 63 | src_lengths, 64 | prev_output_tokens, 65 | ): 66 | """ 67 | Run the forward pass of the transformer model. 68 | 69 | Args: 70 | src_tokens (:obj:`torch.Tensor`): Source tokens 71 | src_lengths (:obj:`torch.Tensor`): Source sentence lengths 72 | prev_output_tokens (:obj:`torch.Tensor`): Previous output tokens 73 | 74 | Returns: 75 | (:obj:`torch.Tensor`, Optional[:obj:`torch.Tensor`]): 76 | The model output, and attention weights if needed 77 | """ 78 | encoder_out = self.encoder(src_tokens) 79 | decoder_out = self.decoder(prev_output_tokens, encoder_out=encoder_out) 80 | return decoder_out 81 | 82 | def max_positions(self): 83 | """Maximum length supported by the model.""" 84 | return self.encoder.max_positions(), self.decoder.max_positions() 85 | 86 | def max_decoder_positions(self): 87 | """Maximum length supported by the decoder. 88 | 89 | Returns: 90 | (int) 91 | """ 92 | return self.decoder.max_positions() 93 | -------------------------------------------------------------------------------- /mlbench_core/models/pytorch/vgg.py: -------------------------------------------------------------------------------- 1 | """VGG11/13/16/19 in Pytorch. 
2 | 3 | From https://github.com/kuangliu/pytorch-cifar.""" 4 | import torch 5 | import torch.nn as nn 6 | 7 | cfg = { 8 | "VGG11": [64, "M", 128, "M", 256, 256, "M", 512, 512, "M", 512, 512, "M"], 9 | "VGG13": [64, 64, "M", 128, 128, "M", 256, 256, "M", 512, 512, "M", 512, 512, "M"], 10 | "VGG16": [ 11 | 64, 12 | 64, 13 | "M", 14 | 128, 15 | 128, 16 | "M", 17 | 256, 18 | 256, 19 | 256, 20 | "M", 21 | 512, 22 | 512, 23 | 512, 24 | "M", 25 | 512, 26 | 512, 27 | 512, 28 | "M", 29 | ], 30 | "VGG19": [ 31 | 64, 32 | 64, 33 | "M", 34 | 128, 35 | 128, 36 | "M", 37 | 256, 38 | 256, 39 | 256, 40 | 256, 41 | "M", 42 | 512, 43 | 512, 44 | 512, 45 | 512, 46 | "M", 47 | 512, 48 | 512, 49 | 512, 50 | 512, 51 | "M", 52 | ], 53 | } 54 | 55 | 56 | class VGG(nn.Module): 57 | def __init__(self, vgg_name): 58 | super(VGG, self).__init__() 59 | self.features = self._make_layers(cfg[vgg_name]) 60 | self.classifier = nn.Linear(512, 10) 61 | 62 | def forward(self, x): 63 | out = self.features(x) 64 | out = out.view(out.size(0), -1) 65 | out = self.classifier(out) 66 | return out 67 | 68 | def _make_layers(self, cfg): 69 | layers = [] 70 | in_channels = 3 71 | for x in cfg: 72 | if x == "M": 73 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 74 | else: 75 | layers += [ 76 | nn.Conv2d(in_channels, x, kernel_size=3, padding=1), 77 | nn.BatchNorm2d(x), 78 | nn.ReLU(inplace=True), 79 | ] 80 | in_channels = x 81 | layers += [nn.AvgPool2d(kernel_size=1, stride=1)] 82 | return nn.Sequential(*layers) 83 | 84 | 85 | def test(): 86 | net = VGG("VGG11") 87 | x = torch.randn(2, 3, 32, 32) 88 | y = net(x) 89 | print(y.size()) 90 | -------------------------------------------------------------------------------- /mlbench_core/models/tensorflow/__init__.py: -------------------------------------------------------------------------------- 1 | from .resnet_model import Cifar10Model 2 | 3 | __all__ = ["Cifar10Model"] 4 | -------------------------------------------------------------------------------- /mlbench_core/optim/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | import torch 3 | 4 | from . import pytorch 5 | except ImportError: 6 | pass 7 | -------------------------------------------------------------------------------- /mlbench_core/optim/pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | from .centralized import * 2 | from .decentralized import * 3 | from .optim import * 4 | 5 | optimizers = { 6 | "sign_sgd": SignSGD, 7 | "sparsified_sgd": SparsifiedSGD, 8 | "centralized_sparsified_sgd": CentralizedSparsifiedSGD, 9 | "centralized_sgd": CentralizedSGD, 10 | "centralized_adam": CentralizedAdam, 11 | "power_sgd": PowerSGD, 12 | "decentralized_sgd": DecentralizedSGD, 13 | } 14 | 15 | 16 | def get_optimizer(optimizer, **kwargs): 17 | """Returns an object of the class specified with the argument `optimizer`. 18 | 19 | Args: 20 | optimizer (str): name of the optimizer 21 | **kwargs (dict, optional): additional optimizer-specific parameters. For the list of supported parameters 22 | for each optimizer, please look at its documentation. 
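Example (a hedged sketch; ``model`` is a placeholder torch.nn.Module, and the keyword arguments shown are the ones documented for DecentralizedSGD in decentralized.py): opt = get_optimizer("decentralized_sgd", rank=1, neighbors=[0, 2], model=model, lr=0.1)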
23 | """ 24 | return optimizers[optimizer](**kwargs) 25 | -------------------------------------------------------------------------------- /mlbench_core/optim/pytorch/decentralized.py: -------------------------------------------------------------------------------- 1 | from torch.optim import SGD 2 | from torch.optim.optimizer import required 3 | 4 | from mlbench_core.aggregation.pytorch.decentralized import DecentralizedAggregation 5 | 6 | 7 | class DecentralizedSGD(SGD): 8 | r"""Implements decentralized stochastic gradient descent (optionally with momentum). 9 | 10 | Args: 11 | rank (int): rank of current process in the network 12 | neighbors (list): list of ranks of the neighbors of current process 13 | model (:obj:`nn.Module`): model which contains parameters for SGD 14 | lr (float): learning rate 15 | momentum (float, optional): momentum factor (default: 0) 16 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0) 17 | dampening (float, optional): dampening for momentum (default: 0) 18 | nesterov (bool, optional): enables Nesterov momentum (default: False) 19 | average_world (bool): Whether to average models on the world_size (default: `True`) 20 | use_cuda (bool): Whether to use cuda tensors for aggregation 21 | by_layer (bool): Aggregate by layer instead of all layers at once 22 | """ 23 | 24 | def __init__( 25 | self, 26 | rank=None, 27 | neighbors=None, 28 | model=None, 29 | lr=required, 30 | momentum=0, 31 | dampening=0, 32 | weight_decay=0, 33 | nesterov=False, 34 | average_world=True, 35 | use_cuda=False, 36 | by_layer=False, 37 | ): 38 | if rank is None: 39 | raise ValueError('"rank" not set for optimizer') 40 | if not neighbors: 41 | raise ValueError('"neighbors" not set for optimizer') 42 | if model is None: 43 | raise ValueError('"model" not set for optimizer') 44 | super(DecentralizedSGD, self).__init__( 45 | model.parameters(), lr, momentum, dampening, weight_decay, nesterov 46 | ) 47 | 48 | if average_world: 49 | self.agg_mode = "avg_world" 50 | else: 51 | raise NotImplementedError("Only average model is supported right now.") 52 | 53 | self.model = model 54 | self.agg = DecentralizedAggregation( 55 | rank, neighbors, use_cuda=use_cuda 56 | ).agg_model(by_layer=by_layer) 57 | 58 | def step(self, closure=None, tracker=None): 59 | """Aggregates the gradients and performs a single optimization step. 60 | 61 | Args: 62 | closure (callable, optional): A closure that reevaluates the model 63 | and returns the loss. 64 | tracker (:obj:`mlbench_core.utils.Tracker`, optional): The current tracker 65 | """ 66 | loss = super(DecentralizedSGD, self).step(closure=closure) 67 | if tracker: 68 | tracker.record_batch_opt_step() 69 | # Averaging the model after updating the gradient separately. 70 | self.agg(self.model, self.agg_mode) 71 | if tracker: 72 | tracker.record_batch_agg() 73 | return loss 74 | -------------------------------------------------------------------------------- /mlbench_core/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .log_metrics import LogMetrics 2 | from .tracker import AverageMeter, Tracker 3 | 4 | try: 5 | import torch 6 | 7 | from . import pytorch 8 | except ImportError: 9 | pass 10 | 11 | 12 | try: 13 | import tensorflow 14 | 15 | from . 
import tensorflow 16 | except ImportError: 17 | pass 18 | -------------------------------------------------------------------------------- /mlbench_core/utils/log_metrics.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from mlbench_core.api import ApiClient 4 | 5 | 6 | class LogMetrics(object): 7 | """Use to write metric values to the Dashboard API and to Trackers 8 | 9 | Caches API client for performance reasons 10 | """ 11 | 12 | in_cluster = os.getenv("KUBERNETES_SERVICE_HOST") is not None 13 | 14 | if in_cluster: 15 | api = ApiClient() 16 | 17 | @staticmethod 18 | def log(run_id, rank, epoch, metric_name, value): 19 | """Logs metrics to the Metrics API 20 | 21 | Currently only logs inside of a cluster 22 | 23 | Args: 24 | run_id (str): The id of the run in the dashboard 25 | rank (int): Rank of the current worker node 26 | epoch (float): The current epoch (fractional) 27 | metric_name (str): The name of the metric 28 | value (float / int / str): The metric value to write 29 | """ 30 | 31 | if not LogMetrics.in_cluster: 32 | return 33 | 34 | metric_name = "{} @ {}".format(metric_name, rank) 35 | 36 | LogMetrics.api.post_metric( 37 | run_id, 38 | metric_name, 39 | value, 40 | metadata="{{rank: {}, epoch:{}}}".format(rank, epoch), 41 | ) 42 | -------------------------------------------------------------------------------- /mlbench_core/utils/pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | from contextlib import contextmanager 3 | 4 | import torch 5 | import torch.distributed as dist 6 | 7 | from .helpers import config_logging, config_path, config_pytorch 8 | from .topology import FCGraph 9 | 10 | __all__ = ["initialize_backends", "FCGraph"] 11 | 12 | 13 | @contextmanager 14 | def initialize_backends( 15 | comm_backend="mpi", 16 | hosts=None, 17 | rank=-1, 18 | logging_level="INFO", 19 | logging_file="/mlbench.log", 20 | use_cuda=False, 21 | seed=None, 22 | cudnn_deterministic=False, 23 | ckpt_run_dir="/checkpoints", 24 | delete_existing_ckpts=False, 25 | ): 26 | """Initializes the backends. 27 | 28 | Sets up logging, sets up pytorch and configures paths 29 | correctly. 30 | 31 | Args: 32 | comm_backend (str): Distributed communication backend ("mpi", "gloo" or "nccl") 33 | hosts (str): Comma-separated list of worker hostnames 34 | rank (int): Rank of the current worker 35 | logging_level (str): Log level 36 | logging_file (str): Log file location 37 | use_cuda (bool): Use CUDA acceleration 38 | seed (int | None): Random seed to use 39 | cudnn_deterministic (bool): Set `cudnn.deterministic=True` 40 | ckpt_run_dir (str): Checkpoint directory 41 | delete_existing_ckpts (bool): Delete any existing checkpoints 42 | 43 | Yields: 44 | (int, int, :obj:`FCGraph`): The rank, world size, and network graph
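Example (a hedged sketch; host names are illustrative): with initialize_backends(comm_backend="gloo", hosts="node-0,node-1", rank=0) as (rank, world_size, graph): ... # training code runs inside the context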
36 | """ 37 | 38 | if not (hasattr(dist, "_initialized") and dist._initialized): 39 | 40 | if comm_backend in [dist.Backend.GLOO, dist.Backend.NCCL]: 41 | 42 | if comm_backend == dist.Backend.NCCL: 43 | assert ( 44 | torch.cuda.is_available() 45 | ), "Invalid use of NCCL backend without CUDA support available" 46 | 47 | hosts = hosts.split(",") 48 | os.environ["MASTER_ADDR"] = hosts[0] 49 | os.environ["MASTER_PORT"] = "29500" 50 | os.environ["RANK"] = str(rank) 51 | os.environ["WORLD_SIZE"] = str(len(hosts)) 52 | 53 | dist.init_process_group(comm_backend) 54 | 55 | config_logging(logging_level, logging_file) 56 | 57 | rank, world_size, graph = config_pytorch(use_cuda, seed, cudnn_deterministic) 58 | 59 | config_path(ckpt_run_dir, delete_existing_ckpts) 60 | 61 | yield rank, world_size, graph 62 | 63 | dist.destroy_process_group() 64 | -------------------------------------------------------------------------------- /mlbench_core/utils/pytorch/distributed.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as dist 3 | 4 | 5 | def global_average(sum, count): 6 | def helper(array): 7 | array = get_backend_tensor(torch.Tensor(array)) 8 | 9 | dist.all_reduce(array, op=dist.ReduceOp.SUM) 10 | return array[0] / array[1] 11 | 12 | avg = helper([sum, count]) 13 | return avg 14 | 15 | 16 | def get_backend_tensor(tensor): 17 | if dist.is_initialized() and dist.get_backend() == dist.Backend.NCCL: 18 | return tensor.cuda() 19 | return tensor 20 | -------------------------------------------------------------------------------- /mlbench_core/utils/pytorch/helpers.py: -------------------------------------------------------------------------------- 1 | r"""Helper functions.""" 2 | 3 | import logging 4 | import os 5 | import random 6 | import shutil 7 | import socket 8 | 9 | import numpy as np 10 | import torch 11 | from torch import distributed as dist 12 | 13 | from mlbench_core.utils.pytorch.topology import FCGraph 14 | 15 | 16 | def config_logging(logging_level="INFO", logging_file="/mlbench.log"): 17 | """Setup logging modules. 18 | A stream handler and file handler are added to default logger `mlbench`. 19 | 20 | Args: 21 | logging_level (str): Log level 22 | logging_file (str): Log file 23 | 24 | """ 25 | 26 | class RankFilter(logging.Filter): 27 | def filter(self, record): 28 | record.rank = dist.get_rank() 29 | return True 30 | 31 | logger = logging.getLogger("mlbench") 32 | if len(logger.handlers) >= 2: 33 | return 34 | 35 | logger.setLevel(logging_level) 36 | logger.addFilter(RankFilter()) 37 | 38 | formatter = logging.Formatter( 39 | "%(asctime)s %(name)s %(rank)2s %(levelname)s: %(message)s", "%Y-%m-%d %H:%M:%S" 40 | ) 41 | 42 | ch = logging.StreamHandler() 43 | ch.setLevel(logging_level) 44 | ch.setFormatter(formatter) 45 | logger.addHandler(ch) 46 | 47 | fh = logging.FileHandler(logging_file) 48 | fh.setLevel(logging_level) 49 | fh.setFormatter(formatter) 50 | logger.addHandler(fh) 51 | 52 | 53 | def config_pytorch(use_cuda=False, seed=None, cudnn_deterministic=False): 54 | """Config pytorch packages. 55 | 56 | Fix random number for packages and initialize distributed environment for pytorch. 57 | Setup cuda environment for pytorch. 
--------------------------------------------------------------------------------
/mlbench_core/utils/pytorch/distributed.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.distributed as dist
3 | 
4 | 
5 | def global_average(sum, count):
6 |     def helper(array):
7 |         array = get_backend_tensor(torch.Tensor(array))
8 | 
9 |         dist.all_reduce(array, op=dist.ReduceOp.SUM)
10 |         return array[0] / array[1]
11 | 
12 |     avg = helper([sum, count])
13 |     return avg
14 | 
15 | 
16 | def get_backend_tensor(tensor):
17 |     if dist.is_initialized() and dist.get_backend() == dist.Backend.NCCL:
18 |         return tensor.cuda()
19 |     return tensor
20 | 
--------------------------------------------------------------------------------
/mlbench_core/utils/pytorch/helpers.py:
--------------------------------------------------------------------------------
1 | r"""Helper functions."""
2 | 
3 | import logging
4 | import os
5 | import random
6 | import shutil
7 | import socket
8 | 
9 | import numpy as np
10 | import torch
11 | from torch import distributed as dist
12 | 
13 | from mlbench_core.utils.pytorch.topology import FCGraph
14 | 
15 | 
16 | def config_logging(logging_level="INFO", logging_file="/mlbench.log"):
17 |     """Sets up the logging modules.
18 |     A stream handler and a file handler are added to the default logger `mlbench`.
19 | 
20 |     Args:
21 |         logging_level (str): Log level
22 |         logging_file (str): Log file
23 | 
24 |     """
25 | 
26 |     class RankFilter(logging.Filter):
27 |         def filter(self, record):
28 |             record.rank = dist.get_rank()
29 |             return True
30 | 
31 |     logger = logging.getLogger("mlbench")
32 |     if len(logger.handlers) >= 2:
33 |         return
34 | 
35 |     logger.setLevel(logging_level)
36 |     logger.addFilter(RankFilter())
37 | 
38 |     formatter = logging.Formatter(
39 |         "%(asctime)s %(name)s %(rank)2s %(levelname)s: %(message)s", "%Y-%m-%d %H:%M:%S"
40 |     )
41 | 
42 |     ch = logging.StreamHandler()
43 |     ch.setLevel(logging_level)
44 |     ch.setFormatter(formatter)
45 |     logger.addHandler(ch)
46 | 
47 |     fh = logging.FileHandler(logging_file)
48 |     fh.setLevel(logging_level)
49 |     fh.setFormatter(formatter)
50 |     logger.addHandler(fh)
51 | 
52 | 
53 | def config_pytorch(use_cuda=False, seed=None, cudnn_deterministic=False):
54 |     """Configures PyTorch.
55 | 
56 |     Fixes the random seeds for all relevant packages and sets up the
57 |     distributed and CUDA environments for PyTorch.
58 | 
59 |     Args:
60 |         use_cuda (bool): Use CUDA acceleration
61 |         seed (int | None): Random seed to use
62 |         cudnn_deterministic (bool): Set `cudnn.deterministic=True`
63 | 
64 |     Returns:
65 |         (int, int, :obj:`FCGraph`): The rank, world size, and network graph
66 |     """
67 |     # Setting `cudnn.deterministic = True` will turn on the
68 |     # CUDNN deterministic setting, which can slow down training considerably.
69 |     # Unexpected behavior may also be observed when restarting from a checkpoint.
70 |     # See: https://github.com/pytorch/examples/blob/master/imagenet/main.py
71 |     if cudnn_deterministic:
72 |         # cudnn.deterministic = True
73 |         print(
74 |             "You have chosen to seed training. "
75 |             "This will turn on the CUDNN deterministic setting, "
76 |             "which can slow down your training considerably! "
77 |             "You may see unexpected behavior when restarting "
78 |             "from checkpoints."
79 |         )
80 | 
81 |     if seed:
82 |         torch.manual_seed(seed)
83 |         torch.cuda.manual_seed_all(seed)
84 |         np.random.seed(seed)
85 |         random.seed(seed)
86 |         os.environ["PYTHONHASHSEED"] = str(seed)
87 | 
88 |     # Define the graph used for the computation.
89 |     if use_cuda:
90 |         assert torch.cuda.is_available()
91 | 
92 |     rank = dist.get_rank()
93 |     world_size = dist.get_world_size()
94 |     backend = dist.get_backend() if dist.is_initialized() else None
95 |     graph = FCGraph(rank, world_size, use_cuda)
96 | 
97 |     # Enable the cudnn accelerator if we are using CUDA.
98 |     if use_cuda:
99 |         graph.assigned_gpu_id()
100 |         torch.backends.cudnn.enabled = True
101 |         torch.backends.cudnn.benchmark = False
102 | 
103 |         if cudnn_deterministic:
104 |             torch.backends.cudnn.deterministic = True
105 | 
106 |         if torch.backends.cudnn.version() is None:
107 |             print("CUDNN not found on device.")
108 | 
109 |     print(
110 |         "World size={}, Rank={}, hostname={}, backend={}, cuda_available={}, cuda_device={}".format(
111 |             world_size,
112 |             rank,
113 |             socket.gethostname(),
114 |             backend,
115 |             torch.cuda.is_available(),
116 |             torch.cuda.current_device(),
117 |         )
118 |     )
119 | 
120 |     return rank, world_size, graph
121 | 
122 | 
123 | def config_path(ckpt_run_dir, delete_existing_ckpts=False):
124 |     """Configures the checkpoint path used during the experiments."""
125 |     if delete_existing_ckpts:
126 |         print("Removing previous checkpoint directory: {}".format(ckpt_run_dir))
127 |         shutil.rmtree(ckpt_run_dir, ignore_errors=True)
128 |     os.makedirs(ckpt_run_dir, exist_ok=True)
129 | 
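A brief sketch of the helpers above (the paths are hypothetical; config_logging assumes an initialized process group, since the log format includes the worker's rank at emit time):

    from mlbench_core.utils.pytorch.helpers import config_logging, config_path

    config_logging(logging_level="DEBUG", logging_file="/tmp/mlbench.log")
    config_path("/tmp/checkpoints", delete_existing_ckpts=True)  # wipe and recreate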
--------------------------------------------------------------------------------
/mlbench_core/utils/pytorch/inference/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlbench/mlbench-core/4fd3c7e6f1a5be69e52383ab2eb64cad257218c2/mlbench_core/utils/pytorch/inference/__init__.py
--------------------------------------------------------------------------------
/mlbench_core/utils/pytorch/topology.py:
--------------------------------------------------------------------------------
1 | import socket
2 | 
3 | import torch
4 | import torch.distributed as dist
5 | 
6 | from mlbench_core.utils.pytorch.distributed import get_backend_tensor
7 | 
8 | 
9 | def _ranks_on_same_node(rank, world_size):
10 |     hostname = socket.gethostname()
11 |     hostname_length = get_backend_tensor(torch.IntTensor([len(hostname)]))
12 | 
13 |     dist.all_reduce(hostname_length, op=dist.ReduceOp.MAX)
14 |     max_hostname_length = hostname_length.item()
15 | 
16 |     encoding = [ord(c) for c in hostname]
17 |     encoding += [-1 for c in range(max_hostname_length - len(hostname))]
18 |     encoding = get_backend_tensor(torch.IntTensor(encoding))
19 | 
20 |     all_encodings = [
21 |         get_backend_tensor(torch.IntTensor([0] * max_hostname_length))
22 |         for _ in range(world_size)
23 |     ]
24 |     dist.all_gather(all_encodings, encoding)
25 | 
26 |     if dist.get_backend() == dist.Backend.NCCL:
27 |         all_encodings = [ec.cpu() for ec in all_encodings]
28 | 
29 |     all_encodings = [ec.numpy().tolist() for ec in all_encodings]
30 | 
31 |     ranks = []
32 |     for i in range(world_size):
33 |         if all_encodings[rank] == all_encodings[i]:
34 |             ranks.append(i)
35 |     return ranks
36 | 
37 | 
38 | class FCGraph(object):
39 |     """Fully-Connected Network Graph
40 | 
41 |     Args:
42 |         rank (int): Rank of the current process
43 |         world_size (int): Total number of processes
44 |         use_cuda (bool): Whether to use CUDA tensors (default: `False`)
45 |     """
46 | 
47 |     def __init__(self, rank, world_size, use_cuda=False):
48 |         self.rank = rank
49 |         self.world_size = world_size
50 |         self.use_cuda = use_cuda
51 | 
52 |     @property
53 |     def current_device_name(self):
54 |         return "cuda:{}".format(torch.cuda.current_device()) if self.use_cuda else "cpu"
55 | 
56 |     @property
57 |     def current_device(self):
58 |         # `current_device_name` is a property, so it must not be called.
59 |         return torch.device(self.current_device_name)
60 | 
61 |     def assigned_gpu_id(self):
62 |         num_gpus_on_device = torch.cuda.device_count()
63 |         ranks = _ranks_on_same_node(self.rank, self.world_size)
64 |         assigned_id = ranks.index(self.rank) % num_gpus_on_device
65 |         torch.cuda.set_device(assigned_id)
66 | 
67 |     def __str__(self):
68 |         return "{}".format(self.current_device_name)
69 | 
70 |     def __repr__(self):
71 |         return self.__str__()
72 | 
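A worked illustration of assigned_gpu_id with hypothetical numbers: each rank takes its position among the node-local ranks modulo the GPU count, so colocated ranks spread evenly over the node's GPUs:

    # Hypothetical: ranks 0-3 share one node that has 2 visible GPUs.
    ranks = [0, 1, 2, 3]
    num_gpus_on_device = 2
    assigned = [ranks.index(r) % num_gpus_on_device for r in ranks]
    assert assigned == [0, 1, 0, 1]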
--------------------------------------------------------------------------------
/mlbench_core/utils/pytorch/utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | 
3 | 
4 | @torch.jit.script
5 | def orthogonalize(matrix, eps=torch.FloatTensor([1e-16])):
6 |     """Function used to orthogonalize a matrix.
7 | 
8 |     Args:
9 |         matrix (torch.Tensor): Matrix to orthogonalize
10 |         eps (torch.FloatTensor): Used to avoid division by zero (default: 1e-16)
11 |     """
12 |     n, m = matrix.shape
13 |     for i in range(m):
14 |         # Normalize the i'th column
15 |         col = matrix[:, i : i + 1]
16 |         col /= torch.sqrt(torch.sum(col ** 2)) + eps
17 |         # Project it on the rest and remove it
18 |         if i + 1 < m:
19 |             rest = matrix[:, i + 1 :]
20 |             rest -= torch.sum(col * rest, dim=0) * col
21 | 
22 | 
23 | def pack_tensors(tensors, use_cuda=False):
24 |     """
25 |     Packs a list of tensors into one 1-dimensional tensor.
26 | 
27 |     Args:
28 |         tensors (list[torch.Tensor]): The tensors to pack
29 |         use_cuda (bool): Whether the resulting tensor should be on cuda
30 | 
31 |     Returns:
32 |         (torch.Tensor, list[int], list[(int, int)]):
33 |             The flattened tensors, the list of start indices of each packed tensor,
34 |             and the original shape of each tensor.
35 | 
36 |             Those values are used to then unpack the tensor
37 |     """
38 |     indices = [0]
39 |     for tensor in tensors:
40 |         new_end = indices[-1] + tensor.nelement()
41 |         indices.append(new_end)
42 | 
43 |     tensor_sizes = [t.size() for t in tensors]
44 | 
45 |     vec = torch.empty(
46 |         indices[-1],
47 |         device=tensors[0].device if tensors[0].is_cuda and use_cuda else "cpu",
48 |         dtype=tensors[0].dtype,
49 |     )
50 | 
51 |     for tensor, start_idx, end_idx in zip(tensors, indices[:-1], indices[1:]):
52 |         vec[start_idx:end_idx] = tensor.data.view(-1)
53 | 
54 |     return vec, indices, tensor_sizes
55 | 
56 | 
57 | def unpack_tensors(aggregated, indices, sizes):
58 |     """
59 |     Unpacks a 1-dimensional tensor into a list of tensors
60 | 
61 |     Args:
62 |         aggregated (torch.Tensor): The 1-dimensional tensor
63 |         indices (list[int]): The start index of each tensor
64 |         sizes (list[(int, int)]): The size of each resulting tensor
65 | 
66 |     Returns:
67 |         list[torch.Tensor]: The unpacked tensors
68 |     """
69 |     start_index = indices[:-1]
70 |     end_index = indices[1:]
71 | 
72 |     tensors = []
73 |     for i, (start, end) in enumerate(zip(start_index, end_index)):
74 |         tensors.append(aggregated[start:end].view(sizes[i]))
75 | 
76 |     return tensors
77 | 
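A hedged round-trip sketch for pack_tensors/unpack_tensors (the gradient shapes are hypothetical; in practice a single fused all-reduce over the flat buffer would sit between the two calls):

    import torch

    from mlbench_core.utils.pytorch.utils import pack_tensors, unpack_tensors

    grads = [torch.rand(2, 2), torch.rand(3, 1)]
    vec, indices, sizes = pack_tensors(grads)       # one flat buffer
    # ... a single dist.all_reduce(vec) could happen here ...
    restored = unpack_tensors(vec, indices, sizes)  # views with the original shapes
    assert all((a == b).all() for a, b in zip(grads, restored))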
type=str, default="mpi", help="PyTorch distributed backend" 78 | ) 79 | parser.add_argument("--hosts", type=str, help="The list of hosts") 80 | 81 | args = parser.parse_args() 82 | 83 | dataset_dir = os.path.join(args.root_dataset, "torch", "wmt17") 84 | ckpt_run_dir = os.path.join(args.root_checkpoint, uid) 85 | output_dir = os.path.join(args.root_output, uid) 86 | os.makedirs(dataset_dir, exist_ok=True) 87 | os.makedirs(ckpt_run_dir, exist_ok=True) 88 | os.makedirs(output_dir, exist_ok=True) 89 | 90 | return dataset_dir, ckpt_run_dir, output_dir, args 91 | -------------------------------------------------------------------------------- /mlbench_core/utils/tensorflow/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize environment for pytorch.""" 2 | 3 | import tensorflow as tf 4 | 5 | 6 | def _init_cleanup(config): 7 | r"""Cleanup legacy files like logs, output.""" 8 | print("=> Initial cleanup") 9 | 10 | 11 | def _init_log(config): 12 | print("=> Initialize log") 13 | 14 | 15 | def _init_tensorflow(config): 16 | print("=> Initialize TensorFlow") 17 | 18 | 19 | def initialize_backends(config): 20 | """Initializes the backends. 21 | 22 | Sets up logging, sets up tensorflow and configures paths 23 | correctly. 24 | 25 | Args: 26 | config (:obj:`types.SimpleNamespace`): a global object containing all of the config. 27 | 28 | Returns: 29 | (:obj:`types.SimpleNamespace`): a global object containing all of the config. 30 | """ 31 | _init_cleanup(config) 32 | 33 | _init_log(config) 34 | 35 | _init_tensorflow(config) 36 | return config 37 | 38 | 39 | def default_session_config( 40 | tf_allow_soft_placement, tf_log_device_placement, tf_gpu_mem 41 | ): 42 | """Initialize session configuration.""" 43 | session_conf = tf.ConfigProto( 44 | allow_soft_placement=tf_allow_soft_placement, 45 | log_device_placement=tf_log_device_placement, 46 | ) 47 | 48 | session_conf.gpu_options.allow_growth = False # True 49 | session_conf.gpu_options.per_process_gpu_memory_fraction = tf_gpu_mem 50 | return session_conf 51 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | . -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 3.0.0-dev23 3 | parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-dev(?P[0-9]+))? 
4 | serialize = 5 | {major}.{minor}.{patch}-dev{dev} 6 | {major}.{minor}.{patch} 7 | commit = False 8 | tag = False 9 | 10 | [bumpversion:file:mlbench_core/__init__.py] 11 | search = __version__ = "{current_version}" 12 | replace = __version__ = "{new_version}" 13 | 14 | [bumpversion:file:setup.py] 15 | search = version="{current_version}" 16 | replace = version="{new_version}" 17 | 18 | [flake8] 19 | exclude = docs 20 | 21 | [aliases] 22 | test = pytest 23 | 24 | [tool:pytest] 25 | collect_ignore = ['setup.py'] 26 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """The setup script.""" 5 | 6 | from setuptools import find_packages, setup 7 | 8 | with open("README.md") as readme_file: 9 | readme = readme_file.read() 10 | 11 | with open("CHANGELOG.md") as history_file: 12 | history = history_file.read() 13 | 14 | # Common libraries 15 | requirements = [ 16 | "appdirs==1.4.4", 17 | "boto3==1.17.74", 18 | "Click>=6.0", 19 | "deprecation>=2.0.6", 20 | "dill==0.3.4", 21 | "docker==5.0.0", 22 | "GitPython==3.1.17", 23 | "google-api-python-client==1.12.8", 24 | "google-auth==1.32.1", 25 | "google-cloud==0.34.0", 26 | "google-cloud-container==2.5.0", 27 | "grpcio==1.34.0", 28 | "kubernetes==12.0.1", 29 | "lmdb==1.2.1", 30 | "matplotlib==3.4.2", 31 | "numpy==1.20.3", 32 | "oauth2client==4.1.3", 33 | "sklearn==0.0", 34 | "supermutes==0.2.5", 35 | "tabulate>=0.8.5", 36 | "tensorpack==0.11", 37 | ] 38 | 39 | # Libraries used by torch 40 | torch_reqs = [ 41 | "sacrebleu==1.5.1", 42 | "torch==1.9.0", 43 | "torchvision==0.10.0", 44 | ] 45 | 46 | tensorflow_reqs = [ 47 | "tensorflow==1.13.2", 48 | ] 49 | 50 | setup_requirements = [ 51 | "pytest-runner", 52 | ] 53 | 54 | lint_requirements = [ 55 | "black==21.5b2", 56 | "isort==5.6.4", 57 | ] 58 | 59 | test_requirements = ( 60 | [ 61 | "codecov==2.1.9", 62 | "coverage==5.5", 63 | "freezegun==1.0.0", 64 | "pre-commit", 65 | "pytest>=3", 66 | "pytest-cov==2.10.1", 67 | "pytest-mock==3.3.1", 68 | "wcwidth==0.2.5", 69 | ] 70 | + lint_requirements 71 | + torch_reqs 72 | + tensorflow_reqs 73 | ) 74 | 75 | dev_requirements = torch_reqs + tensorflow_reqs + lint_requirements + test_requirements 76 | extras = { 77 | "test": test_requirements, 78 | "lint": lint_requirements, 79 | "torch": torch_reqs, 80 | "tensorflow": tensorflow_reqs, 81 | "dev": dev_requirements, 82 | } 83 | 84 | setup( 85 | author="Ralf Grubenmann", 86 | author_email="ralf.grubenmann@epfl.ch", 87 | classifiers=[ 88 | "Development Status :: 2 - Pre-Alpha", 89 | "Intended Audience :: Developers", 90 | "License :: OSI Approved :: Apache Software License", 91 | "Natural Language :: English", 92 | "Programming Language :: Python :: 3.4", 93 | "Programming Language :: Python :: 3.5", 94 | "Programming Language :: Python :: 3.6", 95 | "Programming Language :: Python :: 3.7", 96 | ], 97 | description="A public and reproducible collection of reference implementations and benchmark suite for distributed machine learning systems.", 98 | entry_points={ 99 | "console_scripts": [ 100 | "mlbench=mlbench_core.cli:cli_group", 101 | ], 102 | }, 103 | install_requires=requirements, 104 | license="Apache Software License 2.0", 105 | long_description=readme + "\n\n" + history, 106 | include_package_data=True, 107 | keywords="mlbench", 108 | name="mlbench_core", 109 | packages=find_packages(), 110 | setup_requires=setup_requirements, 
111 | test_suite="tests", 112 | tests_require=test_requirements, 113 | extras_require=extras, 114 | url="https://github.com/mlbench/mlbench_core", 115 | version="3.0.0-dev23", 116 | zip_safe=False, 117 | ) 118 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlbench/mlbench-core/4fd3c7e6f1a5be69e52383ab2eb64cad257218c2/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_python_optim.py: -------------------------------------------------------------------------------- 1 | """Tests for `mlbench_core.optim.pytorch` package.""" 2 | import pytest 3 | import torch 4 | import torch.distributed as dist 5 | from torch.nn.modules import Linear, MSELoss 6 | from torch.optim import SGD 7 | 8 | from mlbench_core.optim.pytorch.centralized import ( 9 | CentralizedAdam, 10 | CentralizedSGD, 11 | CentralizedSparsifiedSGD, 12 | CustomCentralizedOptimizer, 13 | GenericCentralizedOptimizer, 14 | PowerSGD, 15 | ) 16 | from mlbench_core.optim.pytorch.optim import SignSGD, SparsifiedSGD 17 | 18 | 19 | def test_SparsifiedSGD(): 20 | model = Linear(2, 1) 21 | opt = SparsifiedSGD(model.parameters(), lr=1) 22 | 23 | input_data = torch.Tensor([[1, 2], [3, 4]]) 24 | target = torch.Tensor([[1], [2]]) 25 | 26 | opt.zero_grad() 27 | output = model(input_data) 28 | loss = MSELoss()(output, target) 29 | loss.backward() 30 | opt.step() 31 | 32 | 33 | def test_SignSGD(): 34 | model = Linear(2, 1) 35 | opt = SignSGD(model.parameters(), lr=1) 36 | 37 | input_data = torch.Tensor([[1, 2], [3, 4]]) 38 | target = torch.Tensor([[1], [2]]) 39 | 40 | opt.zero_grad() 41 | output = model(input_data) 42 | loss = MSELoss()(output, target) 43 | loss.backward() 44 | opt.step() 45 | 46 | 47 | def test_GenericCentralizedOptimizer(): 48 | model = Linear(2, 1) 49 | opt = SGD(model.parameters(), lr=1) 50 | c_opt = GenericCentralizedOptimizer(world_size=1, model=model) 51 | c_opt.optimizer = opt 52 | 53 | input_data = torch.Tensor([[1, 2], [3, 4]]) 54 | target = torch.Tensor([[1], [2]]) 55 | 56 | c_opt.zero_grad() 57 | output = model(input_data) 58 | loss = MSELoss()(output, target) 59 | loss.backward() 60 | opt.step() 61 | 62 | 63 | def test_CentralizedSparsifiedSGD(mocker): 64 | dist.init_process_group( 65 | "gloo", world_size=1, init_method="file:///tmp/somefile", rank=0 66 | ) 67 | model = Linear(2, 1, bias=False) 68 | opt = CentralizedSparsifiedSGD(model.parameters(), lr=10, sparse_grad_size=1) 69 | 70 | input_data = torch.Tensor([[1, 2], [3, 4]]) 71 | target = torch.Tensor([[1, 2], [2, 3]]) 72 | 73 | opt.zero_grad() 74 | output = model(input_data) 75 | loss = MSELoss()(output, target) 76 | loss.backward() 77 | opt.step() 78 | dist.destroy_process_group() 79 | 80 | 81 | def test_CentralizedSGD(): 82 | model = Linear(2, 1) 83 | opt = CentralizedSGD(world_size=1, model=model, lr=1) 84 | 85 | input_data = torch.Tensor([[1, 2], [3, 4]]) 86 | target = torch.Tensor([[1], [2]]) 87 | 88 | opt.zero_grad() 89 | output = model(input_data) 90 | loss = MSELoss()(output, target) 91 | loss.backward() 92 | opt.step() 93 | 94 | 95 | def test_CentralizedAdam(): 96 | model = Linear(2, 1) 97 | opt = CentralizedAdam(world_size=1, model=model, lr=1) 98 | 99 | input_data = torch.Tensor([[1, 2], [3, 4]]) 100 | target = torch.Tensor([[1], [2]]) 101 | 102 | opt.zero_grad() 103 | output = model(input_data) 104 | loss = MSELoss()(output, target) 
105 | loss.backward() 106 | opt.step() 107 | 108 | 109 | def test_PowerSGD(): 110 | dist.init_process_group( 111 | "gloo", world_size=1, init_method="file:///tmp/somefile", rank=0 112 | ) 113 | model = Linear(2, 1) 114 | opt = PowerSGD(world_size=1, model=model, lr=1) 115 | 116 | input_data = torch.Tensor([[1, 2], [3, 4]]) 117 | target = torch.Tensor([[1], [2]]) 118 | 119 | opt.zero_grad() 120 | output = model(input_data) 121 | loss = MSELoss()(output, target) 122 | loss.backward() 123 | opt.step() 124 | dist.destroy_process_group() 125 | 126 | 127 | def test_CustomCentralizedOptimizer(): 128 | 129 | model = Linear(2, 1) 130 | opt = SGD(params=model.parameters(), lr=1) 131 | c_opt = CustomCentralizedOptimizer( 132 | world_size=1, model=model, optimizer=opt, average_world=True 133 | ) 134 | 135 | input_data = torch.Tensor([[1, 2], [3, 4]]) 136 | target = torch.Tensor([[1], [2]]) 137 | 138 | c_opt.zero_grad() 139 | output = model(input_data) 140 | loss = MSELoss()(output, target) 141 | loss.backward() 142 | c_opt.step() 143 | -------------------------------------------------------------------------------- /tests/test_pytorch_controlflow.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Tests for `mlbench_core.controlflow.pytorch` package.""" 5 | import itertools 6 | import random 7 | 8 | import pytest 9 | import torch 10 | import torch.nn as nn 11 | import torch.optim as optim 12 | from torch.utils.data import DataLoader 13 | 14 | from mlbench_core.controlflow.pytorch.controlflow import ( 15 | compute_train_batch_metrics, 16 | record_train_batch_stats, 17 | validation_round, 18 | ) 19 | from mlbench_core.controlflow.pytorch.helpers import ( 20 | convert_dtype, 21 | iterate_dataloader, 22 | maybe_range, 23 | ) 24 | from mlbench_core.evaluation.pytorch.metrics import TopKAccuracy 25 | 26 | 27 | @pytest.fixture 28 | def model(): 29 | return nn.Linear(1, 2) 30 | 31 | 32 | @pytest.fixture 33 | def optimizer(model): 34 | return optim.SGD(model.parameters(), lr=0.1) 35 | 36 | 37 | @pytest.fixture 38 | def loss_function(): 39 | return nn.CrossEntropyLoss() 40 | 41 | 42 | @pytest.fixture 43 | def metrics(): 44 | return [TopKAccuracy(topk=1)] 45 | 46 | 47 | def _create_random_sets(): 48 | train_set = [random.random() for _ in range(100)] 49 | train_set = [ 50 | ( 51 | torch.FloatTensor([n * 50 - 25]), 52 | 1 if (n > 0.5) != (random.random() < 0.1) else 0, 53 | ) 54 | for n in train_set 55 | ] 56 | 57 | test_set = [random.random() for _ in range(10)] 58 | test_set = [ 59 | ( 60 | torch.FloatTensor([n * 50 - 25]), 61 | 1 if (n > 0.5) != (random.random() < 0.1) else 0, 62 | ) 63 | for n in test_set 64 | ] 65 | 66 | return train_set, test_set 67 | 68 | 69 | def test_compute_train_metrics(mocker, model, optimizer, loss_function, metrics): 70 | mocker.patch("mlbench_core.utils.pytorch.distributed.dist") 71 | mocker.patch("mlbench_core.utils.tracker.LogMetrics") 72 | 73 | batch_size = 2 74 | 75 | train_set, test_set = _create_random_sets() 76 | train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True) 77 | 78 | for i, (data, target) in enumerate(train_loader): 79 | optimizer.zero_grad() 80 | output = model(data) 81 | loss = loss_function(output, target) 82 | 83 | metric_values = compute_train_batch_metrics(output, target, metrics) 84 | metric_values = [(k, v) for k, v in metric_values.items() if k.name == "Prec@1"] 85 | assert len(metric_values) == 1 86 | 87 | metric, value = metric_values[0] 
88 | 89 | assert value == metrics[0](output, target) 90 | 91 | 92 | def test_validation_round(mocker, model, optimizer, loss_function, metrics): 93 | mocker.patch("mlbench_core.utils.pytorch.distributed.dist") 94 | mocker.patch("mlbench_core.utils.tracker.LogMetrics") 95 | 96 | batch_size = 2 97 | 98 | train_set, test_set = _create_random_sets() 99 | train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True) 100 | test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False) 101 | 102 | for data, target in train_loader: 103 | optimizer.zero_grad() 104 | output = model(data) 105 | loss = loss_function(output, target) 106 | 107 | loss.backward() 108 | optimizer.step() 109 | 110 | metric_values, loss_values = validation_round( 111 | test_loader, 112 | model=model, 113 | loss_function=loss_function, 114 | metrics=metrics, 115 | dtype="fp32", 116 | ) 117 | 118 | assert "Prec@1" in [m.name for m in metric_values] 119 | 120 | 121 | def test_maybe_range(): 122 | r = maybe_range(10) 123 | 124 | assert len(r) == 10 125 | assert r == range(10) 126 | 127 | r = maybe_range(None) 128 | 129 | assert isinstance(r, itertools.count) 130 | assert next(r) == 0 131 | assert next(r) == 1 132 | 133 | 134 | def test_convert_dtype(): 135 | t = torch.IntTensor([0]) 136 | 137 | tt = convert_dtype("fp32", t) 138 | 139 | assert tt.dtype == torch.float32 140 | 141 | tt2 = convert_dtype("fp64", t) 142 | 143 | assert tt2.dtype == torch.float64 144 | 145 | with pytest.raises(NotImplementedError): 146 | tt3 = convert_dtype("int", t) 147 | 148 | 149 | def test_iterate_dataloader(mocker): 150 | dataloader = [ 151 | (torch.IntTensor([0]), torch.IntTensor([1])), 152 | (torch.IntTensor([2]), torch.IntTensor([3])), 153 | ] 154 | 155 | it = iterate_dataloader( 156 | dataloader, "fp32", max_batch_per_epoch=2, transform_target_type=True 157 | ) 158 | 159 | first = next(it) 160 | 161 | assert first[0].dtype == torch.float32 162 | assert first[1].dtype == torch.float32 163 | assert first[0].data.item() == 0.0 164 | assert first[1].item() == 1.0 165 | 166 | second = next(it) 167 | 168 | assert second[0].dtype == torch.float32 169 | assert second[1].dtype == torch.float32 170 | assert second[0].data.item() == 2.0 171 | assert second[1].item() == 3.0 172 | -------------------------------------------------------------------------------- /tests/test_pytorch_helpers.py: -------------------------------------------------------------------------------- 1 | """Tests for `mlbench_core.utils.pytorch.helpers` package.""" 2 | 3 | from mlbench_core.utils.pytorch.helpers import config_path, config_pytorch 4 | 5 | 6 | def test_config_pytorch(mocker): 7 | mocker.patch("torch.distributed.get_rank", return_value=1) 8 | mocker.patch("torch.distributed.get_world_size", return_value=1) 9 | mocker.patch("mlbench_core.utils.pytorch.helpers.FCGraph") 10 | 11 | rank, world_size, graph = config_pytorch( 12 | use_cuda=False, seed=42, cudnn_deterministic=True 13 | ) 14 | 15 | assert rank == 1 16 | assert world_size == 1 17 | assert graph is not None 18 | 19 | 20 | def test_config_path(mocker): 21 | sh = mocker.patch("shutil.rmtree") 22 | osmk = mocker.patch("os.makedirs") 23 | 24 | config_path("/tmp/checkpoints", delete_existing_ckpts=False) 25 | 26 | osmk.assert_called_once_with("/tmp/checkpoints", exist_ok=True) 27 | assert sh.call_count == 0 28 | 29 | config_path("/tmp/checkpoints", delete_existing_ckpts=True) 30 | 31 | assert sh.call_count == 1 32 | assert osmk.call_count == 2 33 | 
-------------------------------------------------------------------------------- /tests/test_pytorch_metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from mlbench_core.evaluation.pytorch.metrics import ( 5 | BLEUScore, 6 | DiceCoefficient, 7 | F1Score, 8 | Perplexity, 9 | TopKAccuracy, 10 | ) 11 | 12 | 13 | def test_f1_score(): 14 | output = torch.tensor([1, 1, 1, 1, 1]).reshape(5, 1) 15 | target = torch.tensor([0, 0, 0, 0, 0]).reshape(5, 1) 16 | 17 | f1 = F1Score() 18 | score = f1(output, target) 19 | 20 | assert score.item() == 0 21 | 22 | output = torch.tensor([1, 1, 1, 0, 1]).reshape(5, 1) 23 | target = torch.tensor([1, 0, 1, 1, 0]).reshape(5, 1) 24 | 25 | precision = 2 / (2 + 2) 26 | recall = 2 / (2 + 1) 27 | 28 | score = f1(output, target) 29 | expected_score = 2 * (precision * recall) / (precision + recall) 30 | np.testing.assert_almost_equal(score.item(), expected_score) 31 | 32 | 33 | def test_top1_accuracy(): 34 | output_1 = torch.tensor([[0, 1], [0, 1], [1, 0], [0, 1], [1, 0]]).reshape(5, 2) 35 | output_2 = torch.tensor([1, 1, 0, 1, 0]).reshape(5, 1) 36 | target = torch.tensor([0, 1, 0, 0, 1]).reshape(5, 1) 37 | 38 | acc = TopKAccuracy(topk=1) 39 | expected_score = (2 / 5) * 100 40 | 41 | actual_score_1 = acc(output_1, target) 42 | actual_score_2 = acc(output_2, target) 43 | 44 | assert actual_score_1 == expected_score 45 | assert actual_score_2 == expected_score 46 | 47 | 48 | def test_top3_accuracy(): 49 | output_1 = torch.tensor( 50 | [ 51 | [0.2, 0.2, 0.3, 0.1], 52 | [0.15, 0.2, 0.05, 0.6], 53 | [0.25, 0.3, 0.15, 0.3], 54 | [0.3, 0.1, 0.2, 0.2], 55 | [0.15, 0.15, 0.2, 0.5], 56 | ] 57 | ).reshape(5, 4) 58 | target = torch.tensor([3, 1, 0, 2, 1]).reshape(5, 1) 59 | 60 | acc = TopKAccuracy(topk=3) 61 | expected_score = (3 / 5) * 100 62 | 63 | actual_score_1 = acc(output_1, target) 64 | 65 | assert actual_score_1 == expected_score 66 | 67 | 68 | def test_perplexity(): 69 | target = torch.randint(high=1000, size=(100, 1)) 70 | outputs = torch.randn((100, 1000, 1)) 71 | 72 | true_ppl = torch.exp(torch.nn.functional.cross_entropy(outputs, target)) 73 | ppl = Perplexity() 74 | ppl_score = ppl(outputs, target) 75 | 76 | assert ppl_score == true_ppl 77 | 78 | 79 | def test_dice_coefficient(): 80 | target = torch.Tensor([1, 1, 1, 0, 0, 1]).view(-1, 1) 81 | output = torch.Tensor([0.2, 0.6, 0.1, 0.15, 0.1, 0.8]).view(-1, 1) 82 | 83 | dice = DiceCoefficient() 84 | 85 | loss = dice(output, target).item() 86 | 87 | assert round(loss, 1) == 0.6 88 | 89 | 90 | def test_raw_bleu_score(): 91 | outputs = ["the quick yellow fox jumps over the active dog"] 92 | target = ["the quick brown fox jumps over the lazy dog"] 93 | 94 | bl = BLEUScore(use_raw=True) 95 | score = bl(outputs, target) 96 | 97 | assert round(score.item(), 1) == 36.9 98 | -------------------------------------------------------------------------------- /tests/test_pytorch_models.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | from mlbench_core.models.pytorch.linear_models import * 5 | from mlbench_core.models.pytorch.resnet import * 6 | 7 | 8 | def test_resnet18(): 9 | resnet = resnet18_bkj(1000) 10 | 11 | inp = torch.rand(2, 3, 32, 32) 12 | 13 | outp = resnet(inp) 14 | 15 | assert outp is not None 16 | assert outp.shape[0] == 2 17 | assert outp.shape[1] == 1000 18 | 19 | resnet = resnet18_bkj(500) 20 | 21 | inp = torch.rand(3, 3, 32, 32) 22 | 23 | outp = 
resnet(inp) 24 | 25 | assert outp is not None 26 | assert outp.shape[0] == 3 27 | assert outp.shape[1] == 500 28 | 29 | 30 | def test_resnet20(): 31 | resnet = get_resnet_model("resnet20", 1, "fp32") 32 | 33 | inp = torch.rand(2, 3, 32, 32) 34 | 35 | outp = resnet(inp) 36 | 37 | assert outp is not None 38 | assert outp.shape[0] == 2 39 | assert outp.shape[1] == 10 40 | 41 | resnet = get_resnet_model("resnet20", 1, "fp32") 42 | 43 | inp = torch.rand(3, 3, 32, 32) 44 | 45 | outp = resnet(inp) 46 | 47 | assert outp is not None 48 | assert outp.shape[0] == 3 49 | assert outp.shape[1] == 10 50 | 51 | 52 | def test_resnet20v2(): 53 | resnet = get_resnet_model("resnet20", 2, "fp32") 54 | 55 | inp = torch.rand(2, 3, 32, 32) 56 | 57 | outp = resnet(inp) 58 | 59 | assert outp is not None 60 | assert outp.shape[0] == 2 61 | assert outp.shape[1] == 10 62 | 63 | resnet = get_resnet_model("resnet20", 2, "fp32") 64 | 65 | inp = torch.rand(3, 3, 32, 32) 66 | 67 | outp = resnet(inp) 68 | 69 | assert outp is not None 70 | assert outp.shape[0] == 3 71 | assert outp.shape[1] == 10 72 | 73 | 74 | def test_linear_regression(): 75 | lr = LinearRegression(10) # Linear regression with 10 features 76 | inp = torch.rand(100, 10) 77 | 78 | output = lr(inp) 79 | assert output is not None 80 | assert output.shape[0] == 100 81 | assert output.shape[1] == 1 82 | 83 | 84 | def test_logistic_regression(): 85 | log = LogisticRegression(10) 86 | 87 | inp = torch.rand(100, 10) 88 | 89 | output = log(inp) 90 | assert output is not None 91 | assert output.shape[0] == 100 92 | assert output.shape[1] == 1 93 | -------------------------------------------------------------------------------- /tests/test_pytorch_schedulers.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import pytest 4 | import torch 5 | 6 | from mlbench_core.lr_scheduler.pytorch.lr import ( 7 | LRLinearWarmUp, 8 | MultiStepLRLinearWarmUp, 9 | SQRTTimeDecayLR, 10 | TimeDecayLR, 11 | ) 12 | 13 | 14 | def test_linear_warmup_1(): 15 | """Tests Linear Warmup LR""" 16 | init_lr = 0 17 | scaled_lr = 10 18 | warmup_duration = 5 19 | params = torch.nn.Parameter(torch.Tensor([1, 2, 3])) 20 | opt = torch.optim.SGD([params], lr=scaled_lr) 21 | 22 | scheduler = LRLinearWarmUp( 23 | optimizer=opt, 24 | init_lr=init_lr, 25 | scaled_lr=scaled_lr, 26 | warmup_duration=warmup_duration, 27 | ) 28 | 29 | lrs = [0, 2, 4, 6, 8, 10, 10] 30 | for i in range(7): 31 | last_lr = scheduler.get_last_lr()[0] 32 | assert last_lr == lrs[i] 33 | scheduler.step() 34 | 35 | 36 | def test_linear_warmup_2(): 37 | """Tests Linear Warmup LR""" 38 | init_lr = 10 39 | scaled_lr = 10 40 | warmup_duration = 5 41 | params = torch.nn.Parameter(torch.Tensor([1, 2, 3])) 42 | opt = torch.optim.SGD([params], lr=scaled_lr) 43 | 44 | scheduler = LRLinearWarmUp( 45 | optimizer=opt, 46 | init_lr=init_lr, 47 | scaled_lr=scaled_lr, 48 | warmup_duration=warmup_duration, 49 | ) 50 | 51 | for i in range(7): 52 | last_lr = scheduler.get_last_lr()[0] 53 | assert last_lr == scaled_lr 54 | scheduler.step() 55 | 56 | 57 | def test_multi_step_lr(): 58 | """Tests Multi step LR without warmup""" 59 | scaled_lr = 10 60 | params = torch.nn.Parameter(torch.Tensor([1, 2, 3])) 61 | opt = torch.optim.SGD([params], lr=scaled_lr) 62 | 63 | scheduler = MultiStepLRLinearWarmUp( 64 | optimizer=opt, scaled_lr=scaled_lr, gamma=0.5, milestones=[2, 3] 65 | ) 66 | 67 | lrs = [10, 10, 5, 2.5] 68 | for i in range(4): 69 | last_lr = scheduler.get_last_lr()[0] 70 | assert last_lr == lrs[i] 
71 | scheduler.step() 72 | 73 | 74 | def test_multi_step_lin_warmup(): 75 | """Tests Multistep LR with linear warmup""" 76 | init_lr = 0 77 | scaled_lr = 10 78 | warmup_duration = 5 79 | params = torch.nn.Parameter(torch.Tensor([1, 2, 3])) 80 | opt = torch.optim.SGD([params], lr=scaled_lr) 81 | 82 | scheduler = MultiStepLRLinearWarmUp( 83 | optimizer=opt, 84 | warmup_init_lr=init_lr, 85 | scaled_lr=scaled_lr, 86 | warmup_duration=warmup_duration, 87 | gamma=0.5, 88 | milestones=[7, 8], 89 | ) 90 | 91 | lrs = [0, 2, 4, 6, 8, 10, 10, 5, 2.5] 92 | for i in range(9): 93 | last_lr = scheduler.get_last_lr()[0] 94 | assert last_lr == lrs[i] 95 | scheduler.step() 96 | 97 | 98 | def test_time_decay_lr(): 99 | """Tests Time Decay LR""" 100 | lr = 10 101 | beta = 1 102 | params = torch.nn.Parameter(torch.Tensor([1, 2, 3])) 103 | opt = torch.optim.SGD([params], lr=lr) 104 | 105 | scheduler = TimeDecayLR(optimizer=opt, beta=beta) 106 | 107 | for i in range(10): 108 | true_lr = lr / (i + beta) 109 | last_lr = scheduler.get_last_lr()[0] 110 | assert last_lr == pytest.approx(true_lr) 111 | scheduler.step() 112 | 113 | 114 | def test_sqrt_time_decay_lr(): 115 | """Tests SQRT Time Decay LR""" 116 | lr = 10 117 | params = torch.nn.Parameter(torch.Tensor([1, 2, 3])) 118 | opt = torch.optim.SGD([params], lr=lr) 119 | 120 | scheduler = SQRTTimeDecayLR(optimizer=opt) 121 | 122 | for i in range(10): 123 | true_lr = lr / math.sqrt(max(1, i)) 124 | last_lr = scheduler.get_last_lr()[0] 125 | assert last_lr == pytest.approx(true_lr) 126 | scheduler.step() 127 | -------------------------------------------------------------------------------- /tests/test_pytorch_utils.py: -------------------------------------------------------------------------------- 1 | """Tests for `mlbench_core.utils.pytorch.utils` package.""" 2 | import torch 3 | 4 | from mlbench_core.utils.pytorch.utils import orthogonalize, pack_tensors, unpack_tensors 5 | 6 | 7 | def test_orthogonalize(): 8 | m = torch.rand(2, 2) 9 | identity = torch.eye(2) 10 | 11 | orthogonalize(m) 12 | 13 | # check if m'*m = I 14 | assert torch.allclose(torch.matmul(m.t(), m), identity, atol=1e-04) 15 | 16 | 17 | def test_pack_tensors(): 18 | tensors = [torch.rand(2, 2), torch.rand(2, 2)] 19 | 20 | flattened = [y for x in tensors for y in x.view(-1)] 21 | 22 | vec, indices, sizes = pack_tensors(tensors) 23 | 24 | assert vec.tolist() == flattened 25 | assert indices == [0, 4, 8] 26 | assert sizes == [(2, 2), (2, 2)] 27 | 28 | 29 | def test_unpack_tensors(): 30 | tensors = [torch.rand(2, 2), torch.rand(2, 2)] 31 | vec, indices, sizes = pack_tensors(tensors) 32 | 33 | unpacked = unpack_tensors(vec, indices, sizes) 34 | 35 | assert all((x == y).all() for x, y in zip(tensors, unpacked)) 36 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | """Tests for `mlbench_core.utils` package.""" 2 | 3 | import datetime 4 | 5 | from freezegun import freeze_time 6 | 7 | from mlbench_core.evaluation.goals import task1_time_to_accuracy_light_goal 8 | from mlbench_core.evaluation.pytorch.metrics import TopKAccuracy 9 | from mlbench_core.utils import LogMetrics, Tracker 10 | 11 | 12 | def test_tracker(): 13 | tracker = Tracker([TopKAccuracy(5)], 1, 0) 14 | 15 | assert tracker is not None 16 | 17 | 18 | def test_tracker_goal(mocker): 19 | patched = mocker.patch("mlbench_core.utils.tracker.LogMetrics") 20 | 21 | metric = TopKAccuracy(1) 22 | tracker = 
Tracker([metric], 1, 0, task1_time_to_accuracy_light_goal()) 23 | 24 | tracker.start() 25 | 26 | assert tracker.start_time is not None 27 | 28 | tracker.train() 29 | 30 | tracker.record_stat("global_Prec@1", 69, log_to_api=True) 31 | tracker.batch_end() 32 | 33 | assert not tracker.goal_reached 34 | 35 | tracker.record_stat("global_Prec@1", 70, log_to_api=True) 36 | tracker.batch_end() 37 | 38 | assert not tracker.goal_reached 39 | 40 | tracker.validation() 41 | 42 | tracker.record_stat("global_Prec@1", 69, log_to_api=True) 43 | tracker.batch_end() 44 | 45 | assert not tracker.goal_reached 46 | 47 | tracker.record_stat("global_Prec@1", 70, log_to_api=True) 48 | 49 | assert tracker.goal_reached 50 | 51 | 52 | def _do_batch(tracker, frozen): 53 | tracker.batch_start() 54 | frozen.tick(delta=datetime.timedelta(seconds=0.5)) 55 | tracker.record_batch_load() 56 | frozen.tick(delta=datetime.timedelta(seconds=0.5)) 57 | tracker.record_batch_init() 58 | frozen.tick(delta=datetime.timedelta(seconds=0.5)) 59 | tracker.record_batch_fwd_pass() 60 | frozen.tick(delta=datetime.timedelta(seconds=0.5)) 61 | tracker.record_batch_comp_loss() 62 | frozen.tick(delta=datetime.timedelta(seconds=0.5)) 63 | tracker.record_batch_backprop() 64 | frozen.tick(delta=datetime.timedelta(seconds=0.5)) 65 | tracker.record_batch_agg() 66 | frozen.tick(delta=datetime.timedelta(seconds=0.5)) 67 | tracker.record_batch_opt_step() 68 | frozen.tick(delta=datetime.timedelta(seconds=0.5)) 69 | tracker.record_batch_comp_metrics() 70 | frozen.tick(delta=datetime.timedelta(seconds=0.5)) 71 | tracker.batch_end() 72 | 73 | 74 | def test_tracker_goal_times(mocker): 75 | patched = mocker.patch("mlbench_core.utils.tracker.LogMetrics") 76 | 77 | metric = TopKAccuracy(1) 78 | tracker = Tracker([metric], 1, 0, task1_time_to_accuracy_light_goal()) 79 | 80 | tracker.start() 81 | 82 | assert tracker.start_time is not None 83 | 84 | tracker.train() 85 | 86 | with freeze_time(datetime.datetime.now()) as frozen: 87 | _do_batch(tracker, frozen) 88 | 89 | assert abs(tracker.get_total_preprocess_time() - 0.5) < 0.01 90 | assert abs(tracker.get_total_communication_time() - 0.5) < 0.01 91 | assert abs(tracker.get_total_compute_time() - 2.0) < 0.01 92 | assert abs(tracker.get_total_metrics_time() - 0.5) < 0.01 93 | 94 | _do_batch(tracker, frozen) 95 | 96 | assert abs(tracker.get_total_preprocess_time() - 1.0) < 0.01 97 | assert abs(tracker.get_total_communication_time() - 1.0) < 0.01 98 | assert abs(tracker.get_total_compute_time() - 4.0) < 0.01 99 | assert abs(tracker.get_total_metrics_time() - 1.0) < 0.01 100 | 101 | tracker.validation() 102 | tracker.record_stat("global_Prec@1", 70, log_to_api=True) 103 | 104 | assert tracker.goal_reached 105 | assert any(filter(lambda c: c[1][3] == "TaskResult", patched.method_calls)) 106 | 107 | 108 | def test_LogMetrics(mocker): 109 | mocker.patch("mlbench_core.api.ApiClient") 110 | 111 | LogMetrics.log("1", 1, 1, "loss", 123) 112 | 113 | mocker.patch.dict("os.environ", {"MLBENCH_IN_DOCKER": "True"}) 114 | 115 | LogMetrics.log("1", 1, 1, "loss", 123) 116 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py37, lint, docs 3 | 4 | [default] 5 | basepython = python3.7 6 | 7 | deps = 8 | .[test] 9 | 10 | setenv = 11 | PYTHONPATH = {toxinidir} 12 | 13 | [testenv] 14 | description = run tests 15 | 16 | basepython = 17 | py37: python3.7 18 | 19 | pypy3: pypy3 20 | 21 | deps = 
22 | {[default]deps} 23 | 24 | setenv = 25 | {[default]setenv} 26 | 27 | passenv = CI TRAVIS TRAVIS_* 28 | 29 | commands = 30 | pytest --cov=./mlbench_core/ 31 | codecov 32 | 33 | 34 | [testenv:docs] 35 | basepython=python 36 | changedir={toxinidir}/docs 37 | deps= 38 | -rdocs/requirements.txt 39 | commands= 40 | sphinx-build -W -b html -d _build/doctrees . _build/html 41 | 42 | 43 | [testenv:lint] 44 | 45 | description = run Black (linter) and isort (import sorter) 46 | 47 | basepython = {[default]basepython} 48 | 49 | skip_install = True 50 | 51 | deps = 52 | .[lint] 53 | 54 | setenv = 55 | BLACK_LINT_ARGS=--check 56 | 57 | commands = 58 | black {env:BLACK_LINT_ARGS:} . 59 | isort --check-only . 60 | 61 | [tool:isort] 62 | ; black's default line length 63 | line_length = 88 64 | multi_line_output = 3 65 | include_trailing_comma = True 66 | known_first_party = mlbench_core 67 | known_third_party =PIL,appdirs,boto3,botocore,click,cv2,deprecation,dill,docker,docutils,freezegun,gensidebar,google,kubernetes,lmdb,matplotlib,mosestokenizer,numpy,pyhelm,pytest,requests,setuptools,six,sklearn,sphinx,tabulate,tensorflow,tensorpack,torch,torchtext,torchvision,tqdm,urllib3,yaml 68 | --------------------------------------------------------------------------------