├── .codecov.yml
├── .github
├── dependabot.yml
└── workflows
│   ├── create-release.yml
│   └── mlbench-core.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .pyup.yml
├── CHANGELOG.md
├── DEVELOPMENT.md
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.md
├── docs
├── Makefile
├── _static
│   └── css
│   │   └── custom.css
├── aggregation.rst
├── api.rst
├── clientapi.rst
├── conf.py
├── controlflow.rst
├── dataset.rst
├── evaluation.rst
├── examples.rst
├── gensidebar.py
├── index.rst
├── lr_scheduler.bib
├── lr_scheduler.rst
├── models.bib
├── models.rst
├── optim.bib
├── optim.rst
├── requirements.txt
└── utils.rst
├── mlbench_core
├── __init__.py
├── aggregation
│   ├── __init__.py
│   └── pytorch
│   │   ├── __init__.py
│   │   ├── aggregation.py
│   │   ├── centralized.py
│   │   └── decentralized.py
├── api.py
├── cli
│   ├── __init__.py
│   ├── aws_utils.py
│   ├── chartbuilder.py
│   ├── cli.py
│   ├── gcloud_utils.py
│   ├── kind_utils.py
│   └── utils.py
├── controlflow
│   ├── __init__.py
│   ├── pytorch
│   │   ├── __init__.py
│   │   ├── checkpoints_evaluation.py
│   │   ├── controlflow.py
│   │   ├── helpers.py
│   │   └── train_validation.py
│   └── tensorflow
│   │   ├── __init__.py
│   │   └── train_validation.py
├── dataset
│   ├── __init__.py
│   ├── imagerecognition
│   │   ├── __init__.py
│   │   ├── pytorch
│   │   │   ├── __init__.py
│   │   │   └── dataloader.py
│   │   └── tensorflow
│   │   │   ├── __init__.py
│   │   │   └── cifar10.py
│   ├── linearmodels
│   │   ├── __init__.py
│   │   └── pytorch
│   │   │   ├── __init__.py
│   │   │   └── dataloader.py
│   ├── nlp
│   │   ├── __init__.py
│   │   └── pytorch
│   │   │   ├── __init__.py
│   │   │   ├── wikitext2_dataset.py
│   │   │   ├── wmt16
│   │   │   ├── __init__.py
│   │   │   ├── preprocess
│   │   │   │   ├── download_dataset.sh
│   │   │   │   ├── filter_dataset.py
│   │   │   │   └── preprocess.py
│   │   │   ├── utils.py
│   │   │   ├── wmt16_config.py
│   │   │   └── wmt16_tokenizer.py
│   │   │   ├── wmt16_dataset.py
│   │   │   ├── wmt17
│   │   │   ├── __init__.py
│   │   │   ├── batching.py
│   │   │   ├── collate.py
│   │   │   ├── preprocess
│   │   │   │   ├── __init__.py
│   │   │   │   ├── indexed_dataset.py
│   │   │   │   ├── newstest2014.de
│   │   │   │   ├── newstest2014.en
│   │   │   │   ├── preprocess.py
│   │   │   │   ├── reference_dictionary.ende.txt
│   │   │   │   └── sub_tokenizer.py
│   │   │   └── wmt17_dictionary.py
│   │   │   └── wmt17_dataset.py
│   └── util
│   │   ├── __init__.py
│   │   ├── pytorch
│   │   ├── __init__.py
│   │   ├── libsvm.py
│   │   └── partition.py
│   │   └── tools.py
├── evaluation
│   ├── __init__.py
│   ├── goals.py
│   ├── pytorch
│   │   ├── __init__.py
│   │   ├── criterion.py
│   │   └── metrics.py
│   └── tensorflow
│   │   ├── __init__.py
│   │   ├── criterion.py
│   │   └── metrics.py
├── install_cuda_extensions.py
├── lr_scheduler
│   ├── __init__.py
│   ├── pytorch
│   │   ├── __init__.py
│   │   └── lr.py
│   └── tensorflow
│   │   ├── __init__.py
│   │   └── lr.py
├── models
│   ├── __init__.py
│   ├── pytorch
│   │   ├── __init__.py
│   │   ├── gnmt
│   │   │   ├── __init__.py
│   │   │   ├── attention.py
│   │   │   ├── attn_score
│   │   │   │   ├── attn_score_cuda.cpp
│   │   │   │   └── attn_score_cuda_kernel.cu
│   │   │   ├── decoder.py
│   │   │   ├── encoder.py
│   │   │   ├── models.py
│   │   │   ├── translator.py
│   │   │   └── utils.py
│   │   ├── language_models
│   │   │   ├── __init__.py
│   │   │   └── lstm.py
│   │   ├── layers
│   │   │   ├── __init__.py
│   │   │   └── dropout_layers.py
│   │   ├── linear_models.py
│   │   ├── resnet.py
│   │   ├── transformer
│   │   │   ├── __init__.py
│   │   │   ├── decoder.py
│   │   │   ├── encoder.py
│   │   │   ├── modules
│   │   │   │   ├── __init__.py
│   │   │   │   ├── embeddings.py
│   │   │   │   ├── layers.py
│   │   │   │   ├── multihead_attention.py
│   │   │   │   └── strided_batched_gemm
│   │   │   │   │   ├── strided_batched_gemm.cpp
│   │   │   │   │   └── strided_batched_gemm_cuda.cu
│   │   │   ├── sequence_generator.py
│   │   │   └── transformer.py
│   │   └── vgg.py
│   └── tensorflow
│   │   ├── __init__.py
│   │   └── resnet_model.py
├── optim
│   ├── __init__.py
│   └── pytorch
│   │   ├── __init__.py
│   │   ├── centralized.py
│   │   ├── decentralized.py
│   │   ├── fp_optimizers.py
│   │   └── optim.py
└── utils
│   ├── __init__.py
│   ├── log_metrics.py
│   ├── pytorch
│   ├── __init__.py
│   ├── checkpoint.py
│   ├── distributed.py
│   ├── helpers.py
│   ├── inference
│   │   ├── __init__.py
│   │   └── beam_search.py
│   ├── topology.py
│   └── utils.py
│   ├── task_args.py
│   ├── tensorflow
│   └── __init__.py
│   └── tracker.py
├── requirements.txt
├── setup.cfg
├── setup.py
├── tests
├── __init__.py
├── test_api.py
├── test_cli.py
├── test_gcloud_cli.py
├── test_python_optim.py
├── test_pytorch_controlflow.py
├── test_pytorch_helpers.py
├── test_pytorch_metrics.py
├── test_pytorch_models.py
├── test_pytorch_schedulers.py
├── test_pytorch_utils.py
└── test_utils.py
└── tox.ini
--------------------------------------------------------------------------------
/.codecov.yml:
--------------------------------------------------------------------------------
1 | coverage:
2 | status:
3 | patch:
4 | default:
5 | target: 1%
6 | project:
7 | default:
8 | threshold: 30%
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | # To get started with Dependabot version updates, you'll need to specify which
2 | # package ecosystems to update and where the package manifests are located.
3 | # Please see the documentation for all configuration options:
4 | # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
5 | 
6 | version: 2
7 | updates:
8 | - package-ecosystem: "pip" # See documentation for possible values
9 | directory: "/" # Location of package manifests
10 | schedule:
11 | interval: "monthly"
12 | 
13 | labels:
14 | - "dependencies"
15 | reviewers:
16 | - "ehoelzl"
17 | - "mmilenkoski"
18 | 
19 | - package-ecosystem: "pip" # See documentation for possible values
20 | directory: "/docs" # Location of package manifests
21 | schedule:
22 | interval: "monthly"
23 | 
24 | labels:
25 | - "dependencies"
26 | reviewers:
27 | - "ehoelzl"
28 | - "mmilenkoski"
29 | 
--------------------------------------------------------------------------------
/.github/workflows/create-release.yml:
--------------------------------------------------------------------------------
1 | # This is a basic workflow that is manually triggered
2 | 
3 | name: Create a new MLBench release
4 | 
5 | # Controls when the action will run. Workflow runs when manually triggered using the UI
6 | # or API.
7 | on:
8 | workflow_dispatch:
9 | # Inputs the workflow accepts.
10 | inputs:
11 | patch:
12 | # Friendly description to be shown in the UI instead of 'name'
13 | description: 'Patch [major].[minor].[patch]-[dev]'
14 | # Input has to be provided for the workflow to run
15 | required: true
16 | futureRelease:
17 | description: 'Future release number'
18 | required: true
19 | 
20 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel
21 | jobs:
22 | # This workflow contains a single job called "create-release"
23 | create-release:
24 | # The type of runner that the job will run on
25 | runs-on: ubuntu-latest
26 | 
27 | # Steps represent a sequence of tasks that will be executed as part of the job
28 | steps:
29 | # Create release branch
30 | - name: Create Release Branch
31 | uses: peterjgrainger/action-create-branch@v2.0.1
32 | env:
33 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
34 | with:
35 | branch: release/v${{ github.event.inputs.futureRelease }}
36 | # Checkout Branch
37 | - name: Checkout release branch
38 | uses: actions/checkout@v2.3.4
39 | env:
40 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
41 | with:
42 | ref: release/v${{ github.event.inputs.futureRelease }}
43 | # Bump version
44 | - name: Setup Python
45 | uses: actions/setup-python@v2
46 | with:
47 | python-version: 3.7
48 | - name: Bump version
49 | run: pip install bumpversion && bumpversion --allow-dirty --no-tag --no-commit ${{ github.event.inputs.patch }}
50 | # Generate changelog
51 | - name: Find Latest Tag
52 | # You may pin to the exact commit or the version.
53 | # uses: oprypin/find-latest-tag@cc85180adff5be91282940868529accfc5ab40a7
54 | uses: oprypin/find-latest-tag@v1.0.4
55 | with:
56 | repository: mlbench/mlbench-core
57 | id: previousTag
58 | - name: Generate Changelog using github-changelog-generator
59 | # You may pin to the exact commit or the version.
60 | # uses: faberNovel/github-changelog-generator-action@5fcc510347703c66014a0d54c2c6dfb6c1851eaa 61 | uses: faberNovel/github-changelog-generator-action@v1.0.0-alpha02 62 | with: 63 | options: -u mlbench -p mlbench-core -t ${{ secrets.GITHUB_TOKEN }} \ 64 | --release-branch release/v${{ github.event.inputs.futureRelease}} --future-release v${{ github.event.inputs.futureRelease }} \ 65 | --since-tag ${{ steps.previousTag.outputs.tag }} --base CHANGELOG.md 66 | - name: Commit Changes 67 | uses: stefanzweifel/git-auto-commit-action@v4.7.2 68 | with: 69 | commit_message: 'Bump version and update Changelog' 70 | 71 | -------------------------------------------------------------------------------- /.github/workflows/mlbench-core.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: mlbench-core 5 | 6 | on: [push] 7 | 8 | jobs: 9 | build: 10 | 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | python: [3.7] 15 | 16 | steps: 17 | - uses: actions/checkout@v2 18 | - name: Setup Python 19 | uses: actions/setup-python@v2 20 | with: 21 | python-version: ${{ matrix.python }} 22 | - name: Install Tox and any other packages 23 | run: pip install tox 24 | - name: Lint Check 25 | # Run tox using the version of Python in `PATH` 26 | run: TOXENV=lint python -m tox 27 | - name: tests 28 | run: TOXENV=py37 python -m tox 29 | - name: docs 30 | run: TOXENV=docs python -m tox 31 | - name: Upload coverage to Codecov 32 | uses: codecov/codecov-action@v1.0.15 33 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | *.inc 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # dotenv 85 | .env 86 | 87 | # virtualenv 88 | .venv 89 | venv/ 90 | ENV/ 91 | 92 | # Spyder project settings 93 | .spyderproject 94 | .spyproject 95 | 96 | # Rope project settings 97 | .ropeproject 98 | 99 | # mkdocs documentation 100 | /site 101 | 102 | # mypy 103 | .mypy_cache/ 104 | 105 | #vscode 106 | .vscode 107 | /docs/mlbench.rst 108 | /docs/modules.rst 109 | /docs/refimpls/ 110 | 111 | # helm 112 | **/charts/*.tgz 113 | myvalues.yaml 114 | 115 | setup_telepresence.sh -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/psf/black 3 | rev: stable 4 | hooks: 5 | - id: black 6 | language_version: python3.7 7 | 8 | - repo: https://github.com/asottile/seed-isort-config 9 | rev: v2.1.1 10 | hooks: 11 | - id: seed-isort-config 12 | 13 | - repo: https://github.com/timothycrosley/isort 14 | rev: 4.3.21 15 | hooks: 16 | - id: isort 17 | -------------------------------------------------------------------------------- /.pyup.yml: -------------------------------------------------------------------------------- 1 | # configure updates globally 2 | # default: all 3 | # allowed: all, insecure, False 4 | update: all 5 | 6 | # configure dependency pinning globally 7 | # default: True 8 | # allowed: True, False 9 | pin: True 10 | 11 | # set the default branch 12 | # default: empty, the default branch on GitHub 13 | branch: develop 14 | 15 | # update schedule 16 | # default: empty 17 | # allowed: "every day", "every week", .. 
18 | schedule: "every day"
19 | 
20 | # search for requirement files
21 | # default: True
22 | # allowed: True, False
23 | search: True
24 | 
25 | # Specify requirement files by hand, default is empty
26 | # default: empty
27 | # allowed: list
28 | # requirements:
29 | # - requirements/staging.txt:
30 | # # update all dependencies and pin them
31 | # update: all
32 | # pin: True
33 | # - requirements/dev.txt:
34 | # # don't update dependencies, use global 'pin' default
35 | # update: False
36 | # - requirements/prod.txt:
37 | # # update insecure only, pin all
38 | # update: insecure
39 | # pin: True
40 | 
41 | # add a label to pull requests, default is not set
42 | # requires private repo permissions, even on public repos
43 | # default: empty
44 | #label_prs: update
45 | 
46 | # assign users to pull requests, default is not set
47 | # requires private repo permissions, even on public repos
48 | # default: empty
49 | # assignees:
50 | # - carl
51 | # - carlsen
52 | 
53 | # configure the branch prefix the bot is using
54 | # default: pyup-
55 | # branch_prefix: pyup/
56 | 
57 | # set a global prefix for PRs
58 | # default: empty
59 | # pr_prefix: "Bug #12345"
60 | 
61 | # allow to close stale PRs
62 | # default: True
63 | # close_prs: True
--------------------------------------------------------------------------------
/DEVELOPMENT.md:
--------------------------------------------------------------------------------
1 | Developer Docs
2 | ==============
3 | 
4 | 
5 | Local Dev Setup
6 | ---------------
7 | - Clone the repo locally
8 | - Install dependencies with `pip install .[test]`
9 | - Setup pre-commit hooks using `pre-commit install`
10 | 
11 | Tests can be run using the `tox` or `pytest` commands.
12 | 
13 | Docs can be built using the `make docs` command.
14 | 
15 | 
16 | Making a release
17 | ----------------
18 | 
19 | Steps to make a release:
20 | 
21 | - Create a development branch based on current `develop`, named `release/vX.X.X` (e.g. `release/2.4.1`)
22 | - Use bumpversion to bump the version, e.g. `bumpversion --verbose --no-commit --no-tag minor` to bump the minor version (`major`, `minor`, `patch` and `dev` are supported)
23 | - Generate the new changelog (based on GitHub issues) like `github_changelog_generator -u mlbench -p mlbench-core -t <token> --release-branch release/2.4.1 --future-release 2.4.1 --base CHANGELOG.md` (use a valid `<token>`; the tool can be
24 | found here https://github.com/github-changelog-generator/github-changelog-generator)
25 | Convert the resulting Changelog.md file to *.rst with a tool like https://cloudconvert.com/md-to-rst . Use this to update the `changelog.rst` in the `mlbench-docs` repo.
26 | - Commit the changes and merge the `release/X.X.X` branch into both `master` and `develop`, then push with `git push`.
27 | - Create a tag of the master version using `git tag -m "Release X.X.X" vX.X.X` and push it with `git push --tags`
28 | - Build with `python setup.py sdist bdist_wheel` (delete `dist/` before building) and then upload to PyPI with `twine upload dist/*`
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include CHANGELOG.md
2 | include LICENSE
3 | 
4 | recursive-include tests *
5 | recursive-include * *.cpp
6 | recursive-include * *.cu
7 | recursive-exclude * __pycache__
8 | recursive-exclude * *.py[co]
9 | 
10 | recursive-include docs *.rst conf.py Makefile *.jpg *.png *.gif
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: clean clean-test clean-pyc clean-build docs help
2 | .DEFAULT_GOAL := help
3 | 
4 | define BROWSER_PYSCRIPT
5 | import os, webbrowser, sys
6 | 
7 | try:
8 | from urllib import pathname2url
9 | except:
10 | from urllib.request import pathname2url
11 | 
12 | webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1])))
13 | endef
14 | export BROWSER_PYSCRIPT
15 | 
16 | define PRINT_HELP_PYSCRIPT
17 | import re, sys
18 | 
19 | for line in sys.stdin:
20 | match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line)
21 | if match:
22 | target, help = match.groups()
23 | print("%-20s %s" % (target, help))
24 | endef
25 | export PRINT_HELP_PYSCRIPT
26 | 
27 | BROWSER := python -c "$$BROWSER_PYSCRIPT"
28 | 
29 | help:
30 | @python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST)
31 | 
32 | clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts
33 | 
34 | clean-build: ## remove build artifacts
35 | rm -fr build/
36 | rm -fr dist/
37 | rm -fr .eggs/
38 | find . -name '*.egg-info' -exec rm -fr {} +
39 | find . -name '*.egg' -exec rm -f {} +
40 | 
41 | clean-pyc: ## remove Python file artifacts
42 | find . -name '*.pyc' -exec rm -f {} +
43 | find . -name '*.pyo' -exec rm -f {} +
44 | find . -name '*~' -exec rm -f {} +
45 | find . -name '__pycache__' -exec rm -fr {} +
46 | 
47 | clean-test: ## remove test and coverage artifacts
48 | rm -fr .tox/
49 | rm -f .coverage
50 | rm -fr htmlcov/
51 | rm -fr .pytest_cache
52 | 
53 | lint: ## check style with black, sort imports
54 | black --check .
55 | isort --check-only .
56 | 
57 | test: ## run tests quickly with the default Python
58 | py.test
59 | 
60 | test-all: ## run tests on every Python version with tox
61 | tox
62 | 
63 | coverage: ## check code coverage quickly with the default Python
64 | coverage run --source mlbench_core -m pytest
65 | coverage report -m
66 | coverage html
67 | $(BROWSER) htmlcov/index.html
68 | 
69 | docs: ## generate Sphinx HTML documentation, including API docs
70 | $(MAKE) -C docs clean
71 | $(MAKE) -C docs html
72 | $(BROWSER) docs/_build/html/index.html
73 | 
74 | servedocs: docs ## compile the docs watching for changes
75 | watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D .
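The `help` target above is self-documenting: `PRINT_HELP_PYSCRIPT` scans every Makefile rule for a trailing `## description` comment and prints a table of targets. A minimal standalone sketch of the same parsing logic, runnable outside Make (the sample lines below are illustrative):

```python
import re

# Same pattern as PRINT_HELP_PYSCRIPT above; the Makefile doubles the
# trailing "$" to "$$" only to escape it from Make's own expansion.
pattern = re.compile(r"^([a-zA-Z_-]+):.*?## (.*)$")

sample_lines = [
    "clean-build: ## remove build artifacts",
    "docs: ## generate Sphinx HTML documentation, including API docs",
    "\trm -fr build/",  # recipe lines do not match and are skipped
]

for line in sample_lines:
    match = pattern.match(line)
    if match:
        target, description = match.groups()
        print("%-20s %s" % (target, description))
```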
76 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | mlbench-core: Distributed Machine Learning Benchmark Core Library 2 | ================================================================= 3 | 4 | ![Build Status](https://github.com/mlbench/mlbench-core/workflows/mlbench-core/badge.svg?branch=develop) 5 | [![Documentation Status](https://readthedocs.org/projects/mlbench-core/badge/?version=latest)](https://mlbench.readthedocs.io/projects/mlbench_core/en/latest/?badge=latest) 6 | [![codecov](https://codecov.io/gh/mlbench/mlbench-core/branch/develop/graph/badge.svg)](https://codecov.io/gh/mlbench/mlbench-core) 7 | 8 | 9 | MLBench is a Benchmarking Framework for Distributed Machine Learning algorithms. 10 | 11 | This repository contains the core Python library for MLBench which is used to share code between Benchmark implementations as well as for communication with the dashboard. 12 | 13 | For more information refer to the [MLBench Core Documentation](https://mlbench.readthedocs.io/projects/mlbench_core/en/stable/api.html) 14 | or the [Main Documentation](https://mlbench.readthedocs.io/) 15 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = . 8 | BUILDDIR = _build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/_static/css/custom.css: -------------------------------------------------------------------------------- 1 | .wy-menu-vertical li.toctree-l4.current li.toctree-l5>a { 2 | display: block; 3 | background: #bdbdbd; 4 | padding: .4045em 6.663em; 5 | } 6 | 7 | .wy-menu-vertical li.on a, .wy-menu-vertical li>a.current { 8 | color: #404040; 9 | padding: .4045em 1.618em; 10 | font-weight: bold; 11 | position: relative; 12 | background: #fcfcfc; 13 | border: none; 14 | padding-left: 1.618em -4px; 15 | } 16 | 17 | .wy-menu-vertical li.toctree-l3.current li.toctree-l4>ul { 18 | display: none; 19 | } 20 | 21 | .wy-menu-vertical li.toctree-l3.current li.toctree-l4.current>ul { 22 | display: block; 23 | } -------------------------------------------------------------------------------- /docs/aggregation.rst: -------------------------------------------------------------------------------- 1 | mlbench_core.aggregation 2 | ------------------------ 3 | 4 | .. autoapimodule:: mlbench_core.aggregation 5 | .. currentmodule:: mlbench_core.aggregation 6 | 7 | 8 | pytorch 9 | ~~~~~~~ 10 | 11 | .. autoapimodule:: mlbench_core.aggregation.pytorch 12 | .. currentmodule:: mlbench_core.aggregation.pytorch 13 | 14 | Aggregation 15 | +++++++++++ 16 | 17 | .. autoapimodule:: mlbench_core.aggregation.pytorch.aggregation 18 | .. currentmodule:: mlbench_core.aggregation.pytorch.aggregation 19 | 20 | .. 
autoapiclass:: Aggregation 21 | :members: 22 | :private-members: 23 | :undoc-members: 24 | 25 | Centralized (Synchronous) aggregation 26 | +++++++++++++++++++++++++++++++++++++ 27 | 28 | .. autoapimodule:: mlbench_core.aggregation.pytorch.centralized 29 | .. currentmodule:: mlbench_core.aggregation.pytorch.centralized 30 | 31 | All-Reduce 32 | '''''''''' 33 | 34 | .. autoapiclass:: AllReduceAggregation 35 | :show-inheritance: 36 | :private-members: 37 | 38 | All-Reduce Horovod 39 | '''''''''''''''''' 40 | 41 | .. autoapiclass:: AllReduceAggregationHVD 42 | :show-inheritance: 43 | :private-members: 44 | 45 | Sparsified Aggregation 46 | '''''''''''''''''''''' 47 | 48 | .. autoapiclass:: SparsifiedAggregation 49 | :show-inheritance: 50 | :private-members: 51 | 52 | Power Aggregation 53 | ''''''''''''''''' 54 | 55 | .. autoapiclass:: PowerAggregation 56 | :show-inheritance: 57 | :private-members: 58 | 59 | Decentralized (Asynchronous) aggregation 60 | ++++++++++++++++++++++++++++++++++++++++ 61 | 62 | .. autoapimodule:: mlbench_core.aggregation.pytorch.decentralized 63 | .. currentmodule:: mlbench_core.aggregation.pytorch.decentralized 64 | 65 | Decentralized Aggregation 66 | ''''''''''''''''''''''''' 67 | 68 | .. autoapiclass:: DecentralizedAggregation 69 | :show-inheritance: 70 | :private-members: 71 | 72 | 73 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | 2 | .. _mlbench_core_api: 3 | 4 | MLBench Core API 5 | ================ 6 | 7 | .. toctree:: 8 | :caption: Examples 9 | 10 | examples 11 | 12 | .. toctree:: 13 | :caption: Core API 14 | :maxdepth: 1 15 | 16 | aggregation 17 | clientapi 18 | controlflow 19 | dataset 20 | evaluation 21 | lr_scheduler 22 | models 23 | optim 24 | utils -------------------------------------------------------------------------------- /docs/clientapi.rst: -------------------------------------------------------------------------------- 1 | mlbench_core.api 2 | ---------------- 3 | 4 | .. autoapimodule:: mlbench_core.api 5 | .. currentmodule:: mlbench_core.api 6 | 7 | .. autoapidata:: MLBENCH_IMAGES 8 | 9 | .. autoapiclass:: ApiClient 10 | :members: 11 | -------------------------------------------------------------------------------- /docs/controlflow.rst: -------------------------------------------------------------------------------- 1 | mlbench_core.controlflow 2 | ------------------------ 3 | 4 | .. autoapimodule:: mlbench_core.controlflow 5 | .. currentmodule:: mlbench_core.controlflow 6 | 7 | pytorch 8 | ~~~~~~~ 9 | 10 | .. autoapimodule:: mlbench_core.controlflow.pytorch 11 | .. currentmodule:: mlbench_core.controlflow.pytorch 12 | 13 | Controlflow 14 | +++++++++++ 15 | 16 | .. autoapifunction:: validation_round 17 | 18 | .. autoapifunction:: record_train_batch_stats 19 | 20 | .. autoapifunction:: record_validation_stats 21 | 22 | CheckpointsEvaluationControlFlow 23 | ++++++++++++++++++++++++++++++++ 24 | 25 | .. autoapiclass:: CheckpointsEvaluationControlFlow 26 | :members: 27 | 28 | Helpers 29 | +++++++ 30 | 31 | .. autoapimodule:: mlbench_core.controlflow.pytorch.helpers 32 | .. currentmodule:: mlbench_core.controlflow.pytorch.helpers 33 | 34 | .. autoapifunction:: maybe_range 35 | .. autoapifunction:: convert_dtype 36 | .. autoapifunction:: prepare_batch 37 | .. autoapifunction:: iterate_dataloader 38 | 39 | 40 | tensorflow 41 | ~~~~~~~~~~ 42 | 43 | .. autoapimodule:: mlbench_core.controlflow.tensorflow 44 | .. 
currentmodule:: mlbench_core.controlflow.tensorflow 45 | 46 | 47 | TrainValidation 48 | +++++++++++++++ 49 | 50 | .. autoapiclass:: TrainValidation 51 | :members: 52 | 53 | .. autoapimethod:: __call__ 54 | -------------------------------------------------------------------------------- /docs/dataset.rst: -------------------------------------------------------------------------------- 1 | mlbench_core.dataset 2 | -------------------- 3 | 4 | .. autoapimodule:: mlbench_core.dataset 5 | .. currentmodule:: mlbench_core.dataset 6 | 7 | 8 | linearmodels 9 | ~~~~~~~~~~~~ 10 | .. autoapimodule:: mlbench_core.dataset.linearmodels 11 | .. currentmodule:: mlbench_core.dataset.linearmodels 12 | 13 | pytorch 14 | +++++++ 15 | 16 | .. autoapimodule:: mlbench_core.dataset.linearmodels.pytorch.dataloader 17 | .. currentmodule:: mlbench_core.dataset.linearmodels.pytorch.dataloader 18 | 19 | Epsilon Logistic Regression 20 | ''''''''''''''''''''''''''' 21 | 22 | .. autoapiclass:: LMDBDataset 23 | :members: 24 | 25 | .. autoapiclass:: LMDBPTClass 26 | :members: 27 | 28 | imagerecognition 29 | ~~~~~~~~~~~~~~~~ 30 | 31 | .. autoapimodule:: mlbench_core.dataset.imagerecognition 32 | .. currentmodule:: mlbench_core.dataset.imagerecognition 33 | 34 | pytorch 35 | +++++++ 36 | 37 | .. autoapimodule:: mlbench_core.dataset.imagerecognition.pytorch.dataloader 38 | .. currentmodule:: mlbench_core.dataset.imagerecognition.pytorch.dataloader 39 | 40 | CIFAR10V1 41 | ''''''''' 42 | 43 | .. autoapiclass:: CIFAR10V1 44 | :members: 45 | 46 | Imagenet 47 | '''''''' 48 | 49 | .. autoapiclass:: Imagenet 50 | :members: 51 | 52 | tensorflow 53 | ++++++++++ 54 | 55 | .. autoapimodule:: mlbench_core.dataset.imagerecognition.tensorflow 56 | .. currentmodule:: mlbench_core.dataset.imagerecognition.tensorflow 57 | 58 | DatasetCifar 59 | '''''''''''' 60 | 61 | .. autoapiclass:: DatasetCifar 62 | :members: 63 | 64 | NLP 65 | ~~~ 66 | 67 | .. autoapimodule:: mlbench_core.dataset.nlp 68 | .. currentmodule:: mlbench_core.dataset.nlp 69 | 70 | pytorch 71 | +++++++ 72 | 73 | .. autoapimodule:: mlbench_core.dataset.nlp.pytorch 74 | .. currentmodule:: mlbench_core.dataset.nlp.pytorch 75 | 76 | Translation WMT16 77 | ''''''''''''''''' 78 | 79 | .. autoapiclass:: WMT16Dataset 80 | :members: 81 | 82 | .. autoapimodule:: mlbench_core.dataset.nlp.pytorch.wmt16.wmt16_tokenizer 83 | :members: 84 | 85 | Translation WMT17 86 | ''''''''''''''''' 87 | 88 | .. autoapiclass:: WMT17Dataset 89 | :members: 90 | 91 | .. autoapimodule:: mlbench_core.dataset.nlp.pytorch.wmt17 92 | :members: 93 | 94 | Language Modeling WikiText2 95 | ''''''''''''''''''''''''''' 96 | 97 | .. autoapiclass:: BPTTWikiText2 98 | :members: 99 | 100 | -------------------------------------------------------------------------------- /docs/evaluation.rst: -------------------------------------------------------------------------------- 1 | mlbench_core.evaluation 2 | ----------------------- 3 | .. autoapimodule:: mlbench_core.evaluation 4 | .. currentmodule:: mlbench_core.evaluation 5 | 6 | pytorch 7 | ~~~~~~~ 8 | 9 | .. autoapimodule:: mlbench_core.evaluation.pytorch 10 | .. currentmodule:: mlbench_core.evaluation.pytorch 11 | 12 | criterion 13 | +++++++++ 14 | 15 | .. autoapimodule:: mlbench_core.evaluation.pytorch.criterion 16 | .. currentmodule:: mlbench_core.evaluation.pytorch.criterion 17 | 18 | 19 | BCELossRegularized 20 | '''''''''''''''''' 21 | 22 | .. autoapiclass:: BCELossRegularized 23 | :members: 24 | 25 | 26 | MSELossRegularized 27 | '''''''''''''''''' 28 | 29 | .. 
autoapiclass:: MSELossRegularized 30 | :members: 31 | 32 | .. autoapiclass:: LabelSmoothing 33 | :members: 34 | 35 | metrics 36 | +++++++ 37 | 38 | .. autoapimodule:: mlbench_core.evaluation.pytorch.metrics 39 | .. currentmodule:: mlbench_core.evaluation.pytorch.metrics 40 | 41 | 42 | TopKAccuracy 43 | '''''''''''' 44 | 45 | .. autoapiclass:: TopKAccuracy 46 | :members: 47 | 48 | .. autoapimethod:: __call__ 49 | 50 | tensorflow 51 | ~~~~~~~~~~ 52 | 53 | criterion 54 | +++++++++ 55 | 56 | .. autoapimodule:: mlbench_core.evaluation.tensorflow.criterion 57 | .. currentmodule:: mlbench_core.evaluation.tensorflow.criterion 58 | 59 | 60 | softmax_cross_entropy_with_logits_v2_l2_regularized 61 | ''''''''''''''''''''''''''''''''''''''''''''''''''' 62 | 63 | .. autoapifunction:: softmax_cross_entropy_with_logits_v2_l2_regularized 64 | 65 | metrics 66 | +++++++ 67 | 68 | .. autoapimodule:: mlbench_core.evaluation.tensorflow.metrics 69 | .. currentmodule:: mlbench_core.evaluation.tensorflow.metrics 70 | 71 | topk_accuracy 72 | ''''''''''''' 73 | 74 | .. autoapifunction:: topk_accuracy_with_logits 75 | -------------------------------------------------------------------------------- /docs/examples.rst: -------------------------------------------------------------------------------- 1 | Examples 2 | -------- -------------------------------------------------------------------------------- /docs/gensidebar.py: -------------------------------------------------------------------------------- 1 | # 2 | # This file generates the sidebar/toctree for all RobotPy projects and should 3 | # be copied to each project when it is updated 4 | # 5 | 6 | import os 7 | 8 | 9 | def write_if_changed(fname, contents): 10 | 11 | try: 12 | with open(fname, "r") as fp: 13 | old_contents = fp.read() 14 | except: 15 | old_contents = "" 16 | 17 | if old_contents != contents: 18 | with open(fname, "w") as fp: 19 | fp.write(contents) 20 | 21 | 22 | def generate_sidebar(conf, conf_api): 23 | 24 | # determine 'latest' or 'stable' 25 | # if not conf.do_gen: 26 | do_gen = os.environ.get("SIDEBAR", None) == "1" or conf["on_rtd"] 27 | version = conf["rtd_version"] 28 | 29 | lines = ["", ".. DO NOT MODIFY! THIS PAGE IS AUTOGENERATED!", ""] 30 | 31 | def toctree(name): 32 | lines.extend( 33 | [".. 
toctree::", " :caption: %s" % name, " :maxdepth: 2", ""] 34 | ) 35 | 36 | def endl(): 37 | lines.append("") 38 | 39 | def write(desc, link): 40 | if conf_api == "mlbench": 41 | args = desc, link 42 | elif not do_gen: 43 | return 44 | else: 45 | args = ( 46 | desc, 47 | "https://mlbench.readthedocs.io/en/%s/%s.html" % (version, link), 48 | ) 49 | 50 | lines.append(" %s <%s>" % args) 51 | 52 | def write_api(project, desc): 53 | if project != conf_api: 54 | if do_gen: 55 | args = desc, project, version 56 | lines.append( 57 | " %s API " 58 | % args 59 | ) 60 | else: 61 | lines.append(" %s API " % desc) 62 | 63 | def write_ref(project, desc): 64 | if project != conf_api: 65 | if do_gen: 66 | args = desc, project, version 67 | lines.append( 68 | " %s " 69 | % args 70 | ) 71 | else: 72 | lines.append(" %s " % desc) 73 | 74 | # 75 | # Specify the sidebar contents here 76 | # 77 | 78 | toctree("MLBench") 79 | write("Benchmarks", "benchmark-tasks") 80 | write("Prerequisites", "prerequisites") 81 | write("Installation", "installation") 82 | write("Component Overview", "overview") 83 | write("Tutorials", "tutorials") 84 | endl() 85 | 86 | toctree("Components") 87 | write_ref("mlbench_helm", "Helm Chart") 88 | write_ref("mlbench_dashboard", "Dashboard") 89 | write_ref("mlbench_benchmarks", "Benchmark Implementations") 90 | write_api("mlbench_core", "Core") 91 | endl() 92 | 93 | toctree("Additional Info") 94 | write("Developer Guide", "devguide") 95 | write("Contributing", "contributing") 96 | write("Changelog", "changelog") 97 | endl() 98 | 99 | write_if_changed("_sidebar.rst.inc", "\n".join(lines)) 100 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | MLBench Core Documentation 2 | ================================ 3 | 4 | .. 
include:: _sidebar.rst.inc 5 | 6 | 7 | Indices and tables 8 | ================== 9 | 10 | * :ref:`genindex` 11 | * :ref:`modindex` 12 | * :ref:`search` -------------------------------------------------------------------------------- /docs/lr_scheduler.bib: -------------------------------------------------------------------------------- 1 | @article{ginsburg2018large, 2 | title={Large Batch Training of Convolutional Networks with Layer-wise Adaptive Rate Scaling}, 3 | author={Ginsburg, Boris and Gitman, Igor and You, Yang}, 4 | year={2018}, 5 | journal={Open Review} 6 | } 7 | 8 | @inproceedings{smith2017cyclical, 9 | title={Cyclical learning rates for training neural networks}, 10 | author={Smith, Leslie N}, 11 | booktitle={Applications of Computer Vision (WACV), 2017 IEEE Winter Conference on}, 12 | pages={464--472}, 13 | year={2017}, 14 | organization={IEEE} 15 | } 16 | 17 | @article{goyal2017accurate, 18 | title={Accurate, large minibatch SGD: training imagenet in 1 hour}, 19 | author={Goyal, Priya and Doll{\'a}r, Piotr and Girshick, Ross and Noordhuis, Pieter and Wesolowski, Lukasz and Kyrola, Aapo and Tulloch, Andrew and Jia, Yangqing and He, Kaiming}, 20 | journal={arXiv preprint arXiv:1706.02677}, 21 | year={2017} 22 | } 23 | 24 | @article{smith2017super, 25 | title={Super-Convergence: Very Fast Training of Residual Networks Using Large Learning Rates}, 26 | author={Smith, Leslie N and Topin, Nicholay}, 27 | journal={arXiv preprint arXiv:1708.07120}, 28 | year={2017} 29 | } -------------------------------------------------------------------------------- /docs/lr_scheduler.rst: -------------------------------------------------------------------------------- 1 | mlbench_core.lr_scheduler 2 | ------------------------- 3 | .. autoapimodule:: mlbench_core.lr_scheduler 4 | .. currentmodule:: mlbench_core.lr_scheduler 5 | 6 | pytorch 7 | ~~~~~~~ 8 | 9 | .. autoapimodule:: mlbench_core.lr_scheduler.pytorch.lr 10 | .. currentmodule:: mlbench_core.lr_scheduler.pytorch.lr 11 | 12 | LRLinearWarmUp 13 | ++++++++++++++ 14 | 15 | .. autoapiclass:: LRLinearWarmUp 16 | :members: 17 | 18 | MultiStepLRLinearWarmUp 19 | +++++++++++++++++++++++ 20 | 21 | .. autoapiclass:: MultiStepLRLinearWarmUp 22 | :members: 23 | 24 | ReduceLROnPlateauWithWarmup 25 | +++++++++++++++++++++++++++ 26 | 27 | .. autoapiclass:: ReduceLROnPlateauWithWarmup 28 | :members: 29 | 30 | SparsifiedSGDLR 31 | +++++++++++++++ 32 | 33 | .. autoapiclass:: SparsifiedSGDLR 34 | :members: 35 | 36 | TimeDecayLR 37 | +++++++++++ 38 | 39 | .. autoapiclass:: TimeDecayLR 40 | :members: 41 | 42 | SQRTTimeDecayLR 43 | +++++++++++++++ 44 | 45 | .. autoapiclass:: SQRTTimeDecayLR 46 | :members: 47 | 48 | ExponentialWarmupMultiStepLR 49 | ++++++++++++++++++++++++++++ 50 | 51 | .. autoapiclass:: ExponentialWarmupMultiStepLR 52 | :members: 53 | 54 | SQRTTimeDecayLRWithWarmup 55 | +++++++++++++++++++++++++ 56 | 57 | .. autoapiclass:: SQRTTimeDecayLRWithWarmup 58 | :members: 59 | 60 | tensorflow 61 | ~~~~~~~~~~ 62 | 63 | .. autoapimodule:: mlbench_core.lr_scheduler.tensorflow 64 | .. currentmodule:: mlbench_core.lr_scheduler.tensorflow 65 | 66 | manual_stepping 67 | +++++++++++++++ 68 | 69 | .. autoapifunction:: manual_stepping 70 | 71 | 72 | .. rubric:: References 73 | 74 | 75 | .. 
bibliography:: lr_scheduler.bib
76 | :cited:
--------------------------------------------------------------------------------
/docs/models.bib:
--------------------------------------------------------------------------------
1 | @inproceedings{he2016deep,
2 | title={Deep residual learning for image recognition},
3 | author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
4 | booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
5 | pages={770--778},
6 | year={2016}
7 | }
8 | 
9 | @inproceedings{he2016identity,
10 | title={Identity mappings in deep residual networks},
11 | author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
12 | booktitle={European conference on computer vision},
13 | pages={630--645},
14 | year={2016},
15 | organization={Springer}
16 | }
17 | 
18 | @incollection{NIPS2017_7181,
19 | title = {Attention is All you Need},
20 | author = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, \L ukasz and Polosukhin, Illia},
21 | booktitle = {Advances in Neural Information Processing Systems 30},
22 | editor = {I. Guyon and U. V. Luxburg and S. Bengio and H. Wallach and R. Fergus and S. Vishwanathan and R. Garnett},
23 | pages = {5998--6008},
24 | year = {2017},
25 | publisher = {Curran Associates, Inc.},
26 | url = {http://papers.nips.cc/paper/7181-attention-is-all-you-need.pdf}
27 | }
--------------------------------------------------------------------------------
/docs/models.rst:
--------------------------------------------------------------------------------
1 | mlbench_core.models
2 | -------------------
3 | .. autoapimodule:: mlbench_core.models
4 | .. currentmodule:: mlbench_core.models
5 | 
6 | pytorch
7 | ~~~~~~~
8 | 
9 | Since `Kuang Liu` has already implemented many classical
10 | neural network models, we use their implementation directly for:
11 | 
12 | - VGG
13 | 
14 | .. autoapimodule:: mlbench_core.models.pytorch
15 | .. currentmodule:: mlbench_core.models.pytorch
16 | 
17 | 
18 | linear_models
19 | +++++++++++++
20 | 
21 | .. autoapimodule:: mlbench_core.models.pytorch.linear_models
22 | .. currentmodule:: mlbench_core.models.pytorch.linear_models
23 | 
24 | 
25 | LogisticRegression
26 | ''''''''''''''''''
27 | 
28 | .. autoapiclass:: LogisticRegression
29 | :members:
30 | 
31 | LinearRegression
32 | ''''''''''''''''''
33 | 
34 | .. autoapiclass:: LinearRegression
35 | :members:
36 | 
37 | 
38 | resnet
39 | ++++++
40 | .. autoapimodule:: mlbench_core.models.pytorch.resnet
41 | .. currentmodule:: mlbench_core.models.pytorch.resnet
42 | 
43 | ResNetCIFAR
44 | '''''''''''
45 | 
46 | .. autoapiclass:: ResNetCIFAR
47 | :members:
48 | 
49 | 
50 | RNN
51 | +++
52 | 
53 | 
54 | Google Neural Machine Translation
55 | '''''''''''''''''''''''''''''''''
56 | .. autoapimodule:: mlbench_core.models.pytorch.gnmt
57 | .. currentmodule:: mlbench_core.models.pytorch.gnmt
58 | 
59 | Model
60 | =====
61 | 
62 | .. autoapiclass:: GNMT
63 | :members: encode, decode, generate, forward
64 | 
65 | BahdanauAttention
66 | =================
67 | 
68 | .. autoapiclass:: BahdanauAttention
69 | :members:
70 | 
71 | Encoder
72 | =======
73 | .. autoapimodule:: mlbench_core.models.pytorch.gnmt.encoder
74 | .. currentmodule:: mlbench_core.models.pytorch.gnmt.encoder
75 | 
76 | .. autoapiclass:: ResidualRecurrentEncoder
77 | :members:
78 | 
79 | Decoder
80 | =======
81 | .. autoapimodule:: mlbench_core.models.pytorch.gnmt.decoder
82 | ..
currentmodule:: mlbench_core.models.pytorch.gnmt.decoder 83 | 84 | .. autoapiclass:: RecurrentAttention 85 | :members: 86 | 87 | .. autoapiclass:: Classifier 88 | :members: 89 | 90 | .. autoapiclass:: ResidualRecurrentDecoder 91 | :members: 92 | 93 | Transformer Model for Translation 94 | ''''''''''''''''''''''''''''''''' 95 | .. autoapimodule:: mlbench_core.models.pytorch.transformer 96 | .. currentmodule:: mlbench_core.models.pytorch.transformer 97 | 98 | Model 99 | ===== 100 | 101 | .. autoapiclass:: TransformerModel 102 | :members: forward 103 | 104 | Encoder 105 | ======= 106 | .. autoapimodule:: mlbench_core.models.pytorch.transformer.encoder 107 | .. currentmodule:: mlbench_core.models.pytorch.transformer.encoder 108 | 109 | .. autoapiclass:: TransformerEncoder 110 | :members: forward 111 | 112 | Decoder 113 | ======= 114 | .. autoapimodule:: mlbench_core.models.pytorch.transformer.decoder 115 | .. currentmodule:: mlbench_core.models.pytorch.transformer.decoder 116 | 117 | .. autoapiclass:: TransformerDecoder 118 | :members: forward 119 | 120 | Layers 121 | ====== 122 | 123 | .. autoapimodule:: mlbench_core.models.pytorch.transformer.modules 124 | .. currentmodule:: mlbench_core.models.pytorch.transformer.modules 125 | 126 | .. autoapiclass:: TransformerEncoderLayer 127 | :members: forward 128 | 129 | .. autoapiclass:: TransformerDecoderLayer 130 | :members: forward 131 | 132 | SequenceGenerator 133 | ================= 134 | 135 | .. autoapimodule:: mlbench_core.models.pytorch.transformer.sequence_generator 136 | .. currentmodule:: mlbench_core.models.pytorch.transformer.sequence_generator 137 | 138 | .. autoapiclass:: SequenceGenerator 139 | :members: 140 | 141 | 142 | .. rubric:: References 143 | 144 | .. bibliography:: models.bib 145 | :cited: 146 | 147 | 148 | NLP 149 | +++ 150 | .. autoapimodule:: mlbench_core.models.pytorch.nlp 151 | .. currentmodule:: mlbench_core.models.pytorch.nlp 152 | 153 | LSTM Language Model 154 | ''''''''''''''''''' 155 | 156 | .. autoapiclass:: RNNLM 157 | :members: 158 | 159 | 160 | tensorflow 161 | ~~~~~~~~~~ 162 | 163 | .. autoapimodule:: mlbench_core.models.tensorflow 164 | .. currentmodule:: mlbench_core.models.tensorflow 165 | 166 | resnet 167 | ++++++ 168 | 169 | .. autoapimodule:: mlbench_core.models.tensorflow.resnet_model 170 | .. currentmodule:: mlbench_core.models.tensorflow.resnet_model 171 | 172 | 173 | .. autoapifunction:: fixed_padding 174 | 175 | .. autoapifunction:: conv2d_fixed_padding 176 | 177 | .. autoapifunction:: block_layer 178 | 179 | .. autoapifunction:: batch_norm 180 | 181 | 182 | Model 183 | ''''' 184 | 185 | .. autoapiclass:: Model 186 | :members: 187 | 188 | 189 | Cifar10Model 190 | '''''''''''' 191 | 192 | .. autoapiclass:: Cifar10Model 193 | :members: 194 | 195 | 196 | -------------------------------------------------------------------------------- /docs/optim.bib: -------------------------------------------------------------------------------- 1 | 2 | @inproceedings{adam_convergence, 3 | title={On the Convergence of Adam and Beyond}, 4 | author={Sashank J. Reddi and Satyen Kale and Sanjiv Kumar}, 5 | booktitle={International Conference on Learning Representations}, 6 | year={2018}, 7 | url={https://openreview.net/forum?id=ryQu7f-RZ}, 8 | } -------------------------------------------------------------------------------- /docs/optim.rst: -------------------------------------------------------------------------------- 1 | mlbench_core.optim 2 | ------------------ 3 | 4 | .. autoapimodule:: mlbench_core.optim 5 | .. 
currentmodule:: mlbench_core.optim 6 | 7 | 8 | pytorch 9 | ~~~~~~~ 10 | .. autoapimodule:: mlbench_core.optim.pytorch 11 | .. currentmodule:: mlbench_core.optim.pytorch 12 | 13 | 14 | Optimizers 15 | ++++++++++ 16 | 17 | The optimizers in this module are not distributed. Their purpose is to implement logic that 18 | can be inherited by distributed optimizers. 19 | 20 | .. autoapimodule:: mlbench_core.optim.pytorch.optim 21 | .. currentmodule:: mlbench_core.optim.pytorch.optim 22 | 23 | 24 | SparsifiedSGD 25 | ''''''''''''' 26 | 27 | .. autoapiclass:: SparsifiedSGD 28 | :members: 29 | 30 | SignSGD 31 | ''''''''''''' 32 | 33 | .. autoapiclass:: SignSGD 34 | :members: 35 | 36 | Centralized (Synchronous) Optimizers 37 | ++++++++++++++++++++++++++++++++++++ 38 | 39 | The optimizers in this module are all distributed and synchronous: workers advance in a synchronous manner. All workers 40 | communicate with each other using `all_reduce` or `all_gather` operations. 41 | 42 | .. autoapimodule:: mlbench_core.optim.pytorch.centralized 43 | .. currentmodule:: mlbench_core.optim.pytorch.centralized 44 | 45 | Generic Centralized Optimizer 46 | +++++++++++++++++++++++++++++ 47 | 48 | .. autoapiclass:: GenericCentralizedOptimizer 49 | :members: 50 | 51 | CentralizedSGD 52 | '''''''''''''' 53 | 54 | .. autoapiclass:: CentralizedSGD 55 | :show-inheritance: 56 | :members: 57 | 58 | CentralizedAdam 59 | ''''''''''''''' 60 | 61 | .. autoapiclass:: CentralizedAdam 62 | :show-inheritance: 63 | :members: 64 | 65 | CustomCentralizedOptimizer 66 | '''''''''''''''''''''''''' 67 | 68 | .. autoapiclass:: CustomCentralizedOptimizer 69 | :show-inheritance: 70 | :members: 71 | 72 | CentralizedSparsifiedSGD 73 | '''''''''''''''''''''''' 74 | 75 | .. autoapiclass:: CentralizedSparsifiedSGD 76 | :members: 77 | 78 | PowerSGD 79 | '''''''' 80 | 81 | .. autoapiclass:: PowerSGD 82 | :members: 83 | 84 | Decentralized (Asynchronous) Optimizers 85 | +++++++++++++++++++++++++++++++++++++++ 86 | 87 | The optimizers in this module are all distributed and asynchronous: workers advance independently from each other, 88 | and communication patterns follow an arbitrary graph. 89 | 90 | .. autoapimodule:: mlbench_core.optim.pytorch.decentralized 91 | .. currentmodule:: mlbench_core.optim.pytorch.decentralized 92 | 93 | DecentralizedSGD 94 | '''''''''''''''' 95 | 96 | .. autoapiclass:: DecentralizedSGD 97 | :members: 98 | 99 | 100 | .. rubric:: References 101 | 102 | .. bibliography:: optim.bib 103 | :cited: 104 | 105 | Mixed Precision Optimizers 106 | ++++++++++++++++++++++++++ 107 | 108 | .. autoapimodule:: mlbench_core.optim.pytorch.fp_optimizers 109 | .. currentmodule:: mlbench_core.optim.pytorch.fp_optimizers 110 | 111 | FP16Optimizer 112 | ''''''''''''' 113 | 114 | .. autoapiclass:: FP16Optimizer 115 | :members: 116 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | Sphinx==4.0.2 2 | sphinx-rtd-theme==0.5.2 3 | sphinxcontrib-napoleon==0.7 4 | sphinxcontrib-bibtex==2.3.0 5 | sphinx-autoapi==1.8.1 -------------------------------------------------------------------------------- /docs/utils.rst: -------------------------------------------------------------------------------- 1 | mlbench_core.utils 2 | ------------------ 3 | .. autoapimodule:: mlbench_core.utils 4 | .. currentmodule:: mlbench_core.utils 5 | 6 | pytorch 7 | ~~~~~~~ 8 | 9 | .. autoapimodule:: mlbench_core.utils.pytorch 10 | .. 
currentmodule:: mlbench_core.utils.pytorch 11 | 12 | 13 | FCGraph 14 | ''''''' 15 | 16 | .. autoapiclass:: FCGraph 17 | :members: 18 | 19 | initialize_backends 20 | ''''''''''''''''''' 21 | 22 | .. autoapifunction:: initialize_backends 23 | 24 | 25 | Checkpointer 26 | '''''''''''' 27 | 28 | .. autoapimodule:: mlbench_core.utils.pytorch.checkpoint 29 | .. currentmodule:: mlbench_core.utils.pytorch.checkpoint 30 | 31 | .. autoapiclass:: Checkpointer 32 | 33 | helpers 34 | ''''''' 35 | 36 | .. autoapimodule:: mlbench_core.utils.pytorch.helpers 37 | .. currentmodule:: mlbench_core.utils.pytorch.helpers 38 | 39 | .. autoapifunction:: config_logging 40 | 41 | .. autoapifunction:: config_pytorch 42 | 43 | .. autoapifunction:: config_path 44 | 45 | utils 46 | ''''' 47 | 48 | .. autoapimodule:: mlbench_core.utils.pytorch.utils 49 | .. currentmodule:: mlbench_core.utils.pytorch.utils 50 | 51 | .. autoapifunction:: pack_tensors 52 | 53 | .. autoapifunction:: unpack_tensors 54 | 55 | .. autoapifunction:: orthogonalize 56 | 57 | 58 | 59 | 60 | Inference 61 | ''''''''' 62 | 63 | .. autoapimodule:: mlbench_core.utils.pytorch.inference 64 | .. currentmodule:: mlbench_core.utils.pytorch.inference 65 | 66 | Translator 67 | ++++++++++ 68 | 69 | .. autoapiclass:: Translator 70 | :members: 71 | 72 | BeamSearch 73 | ++++++++++ 74 | 75 | .. autoapimodule:: mlbench_core.utils.pytorch.inference.beam_search 76 | :members: 77 | 78 | 79 | tensorflow 80 | ~~~~~~~~~~ 81 | 82 | .. autoapimodule:: mlbench_core.utils.tensorflow 83 | .. currentmodule:: mlbench_core.utils.tensorflow 84 | 85 | 86 | -------------------------------------------------------------------------------- /mlbench_core/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Top-level package for mlbench_core.""" 4 | 5 | __version__ = "3.0.0-dev23" 6 | 7 | from . import api, controlflow, dataset, evaluation, lr_scheduler, models, optim, utils 8 | -------------------------------------------------------------------------------- /mlbench_core/aggregation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlbench/mlbench-core/4fd3c7e6f1a5be69e52383ab2eb64cad257218c2/mlbench_core/aggregation/__init__.py -------------------------------------------------------------------------------- /mlbench_core/aggregation/pytorch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlbench/mlbench-core/4fd3c7e6f1a5be69e52383ab2eb64cad257218c2/mlbench_core/aggregation/pytorch/__init__.py -------------------------------------------------------------------------------- /mlbench_core/aggregation/pytorch/aggregation.py: -------------------------------------------------------------------------------- 1 | from mlbench_core.utils.pytorch.utils import pack_tensors, unpack_tensors 2 | 3 | 4 | class Aggregation(object): 5 | """Aggregate updates / models from different processes. 6 | 7 | Args: 8 | use_cuda (bool): Whether to use CUDA tensors for communication 9 | """ 10 | 11 | def __init__(self, use_cuda=False): 12 | self.use_cuda = use_cuda 13 | 14 | def _agg(self, data, op, denom=None): 15 | """Aggregate data using `op` operation. 16 | 17 | Args: 18 | data (:obj:`torch.Tensor`): A Tensor to be aggregated. 19 | op (str): Aggregation methods like `avg`, `sum`, `min`, `max`, etc. 
20 | denom (:obj:`torch.Tensor`, optional): Custom denominator to average by
21 | Use with op == `custom_avg`. (default: `None`)
22 | 
23 | Returns:
24 | :obj:`torch.Tensor`: An aggregated tensor.
25 | """
26 | raise NotImplementedError
27 | 
28 | def _agg_weights_by_model(self, model, op, denom=None):
29 | """Aggregate model weights, all layers at once
30 | 
31 | Args:
32 | model (:obj:`torch.nn.Module`): Model whose weights are to be aggregated.
33 | op (str): Aggregation method. Should be in `ALLREDUCE_AGGREGATION_OPS`
34 | denom (:obj:`torch.Tensor`, optional): Custom denominator to average by
35 | Use with op == `custom_avg`. (default: `None`)
36 | """
37 | # Pack all layers
38 | packed, indices, sizes = pack_tensors(
39 | [t for t in model.parameters()], use_cuda=self.use_cuda
40 | )
41 | aggregated = self._agg(packed, op=op, denom=denom)
42 | 
43 | tensors = unpack_tensors(aggregated, indices, sizes)
44 | # Unpack
45 | for i, param in enumerate(model.parameters()):
46 | param.data = tensors[i]
47 | 
48 | def _agg_gradients_by_model(self, model, op, denom=None):
49 | """Aggregate model gradients, all layers at once
50 | 
51 | Args:
52 | model (:obj:`torch.nn.Module`): Model whose gradients are to be aggregated.
53 | op (str): Aggregation method. Should be in `ALLREDUCE_AGGREGATION_OPS`
54 | denom (:obj:`torch.Tensor`, optional): Custom denominator to average by
55 | Use with op == `custom_avg`. (default: `None`)
56 | """
57 | # Pack all layers
58 | packed, indices, sizes = pack_tensors(
59 | [t.grad for t in model.parameters()], use_cuda=self.use_cuda
60 | )
61 | aggregated = self._agg(packed, op=op, denom=denom)
62 | 
63 | # Unpack
64 | tensors = unpack_tensors(aggregated, indices, sizes)
65 | for i, param in enumerate(model.parameters()):
66 | param.grad.data = tensors[i]
67 | 
68 | def _agg_weights_by_layer(self, model, op, denom=None):
69 | """Aggregate model weights, each layer individually
70 | 
71 | Args:
72 | model (:obj:`torch.nn.Module`): Model whose weights are to be aggregated.
73 | op (str): Aggregation method. Should be in `ALLREDUCE_AGGREGATION_OPS`
74 | denom (:obj:`torch.Tensor`, optional): Custom denominator to average by
75 | Use with op == `custom_avg`. (default: `None`)
76 | """
77 | # Aggregate layer by layer
78 | for param in model.parameters():
79 | grad = self._agg(param.data, op=op, denom=denom)
80 | param.data = grad
81 | 
82 | def _agg_gradients_by_layer(self, model, op, denom=None):
83 | """Aggregate model gradients, each layer individually
84 | 
85 | Args:
86 | model (:obj:`torch.nn.Module`): Model whose gradients are to be aggregated.
87 | op (str): Aggregation method. Should be in `ALLREDUCE_AGGREGATION_OPS`
88 | denom (:obj:`torch.Tensor`, optional): Custom denominator to average by
89 | Use with op == `custom_avg`.
(default: `None`)
90 | """
91 | # Aggregate layer by layer
92 | for param in model.parameters():
93 | grad = self._agg(param.grad.data, op=op, denom=denom)
94 | param.grad.data = grad
95 | 
96 | def agg_model(self, by_layer=False):
97 | if by_layer:
98 | return self._agg_weights_by_layer
99 | else:
100 | return self._agg_weights_by_model
101 | 
102 | def agg_grad(self, by_layer=False):
103 | if by_layer:
104 | return self._agg_gradients_by_layer
105 | else:
106 | return self._agg_gradients_by_model
107 | 
--------------------------------------------------------------------------------
/mlbench_core/aggregation/pytorch/decentralized.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.distributed as dist
3 | 
4 | from mlbench_core.aggregation.pytorch.aggregation import Aggregation
5 | 
6 | 
7 | class DecentralizedAggregation(Aggregation):
8 | """Aggregate updates in a decentralized manner."""
9 | 
10 | def __init__(self, rank, neighbors, use_cuda=False):
11 | """
12 | Args:
13 | rank (int): Rank of the current process
14 | neighbors (list): A list of ranks of its neighbors.
15 | """
16 | assert rank not in neighbors
17 | self.rank = rank
18 | self.neighbors = neighbors
19 | super(DecentralizedAggregation, self).__init__(use_cuda=use_cuda)
20 | 
21 | def _agg(self, data, op, denom=None):
22 | """Aggregate data using `op` operation.
23 | 
24 | Args:
25 | data (:obj:`torch.Tensor`): A Tensor to be aggregated.
26 | op (str): Aggregation methods like `avg`, `sum`, `min`, `max`, etc.
27 | denom (:obj:`torch.Tensor`, optional): Unused here; kept for interface compatibility.
28 | Returns:
29 | :obj:`torch.Tensor`: An aggregated tensor.
30 | """
31 | # Create some tensors to host the values from neighborhood.
32 | local_data = {i: torch.zeros_like(data) for i in self.neighbors}
33 | local_data[self.rank] = data
34 | 
35 | reqs = []
36 | for node in self.neighbors:
37 | reqs.append(dist.isend(tensor=local_data[self.rank], dst=node))
38 | reqs.append(dist.irecv(tensor=local_data[node], src=node))
39 | 
40 | for req in reqs:
41 | req.wait()
42 | 
43 | # Aggregate local_data
44 | if op == "avg_world":
45 | output = sum(local_data.values()) / (len(self.neighbors) + 1)
46 | else:
47 | raise NotImplementedError("op {} is not supported yet.".format(op))
48 | 
49 | return output
50 | 
--------------------------------------------------------------------------------
/mlbench_core/cli/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | """Commandline Interface for mlbench_core."""
4 | 
5 | from .cli import cli_group
6 | 
7 | __all__ = ["cli_group"]
8 | 
--------------------------------------------------------------------------------
/mlbench_core/cli/chartbuilder.py:
--------------------------------------------------------------------------------
1 | import os
2 | import subprocess
3 | import tempfile
4 | 
5 | from git import Repo
6 | from supermutes.dot import dotify
7 | 
8 | DEFAULT_GIT_BRANCH = "develop"
9 | 
10 | 
11 | def git_clone(repo_url, branch="master", path=""):
12 | """Clones the repo to a temporary dir whose path is determined by the platform"""
13 | 
14 | _tmp_dir = tempfile.mkdtemp(prefix="mlbench-")
15 | repo = Repo.clone_from(repo_url, _tmp_dir, branch=branch)
16 | 
17 | return os.path.join(_tmp_dir, path)
18 | 
19 | 
20 | class ChartBuilder:
21 | """Class that allows building helm charts either from a repository or a local folder
22 | 
23 | Args:
24 | chart (dict): Dictionary describing the location.
            Should look like:
25 |             ```
26 |             {
27 |                 "name": [chart_name],
28 |                 "source": {
29 |                     "type": ["git" or "directory"],
30 |                     "location": [repo_url or directory path],
31 |                     "reference": [optional, to select the branch]
32 |                 }
33 |             }
34 |             ```
35 |     """
36 | 
37 |     def __init__(self, chart):
38 |         self.chart = dotify(chart)
39 |         self.source_directory = self.source_clone()
40 | 
41 |     def source_clone(self):
42 |         """
43 |         Clones the chart's source.
44 |         Supported source types are `git` repositories (which can also
45 |         handle git:// local paths) and local `directory` paths.
46 |         """
47 | 
48 |         subpath = self.chart.source.get("subpath", "")
49 | 
50 |         if "name" not in self.chart:
51 |             raise ValueError("Please specify a name for the chart")
52 | 
53 |         if "type" not in self.chart.source:
54 |             raise ValueError("Need source type for chart {}".format(self.chart.name))
55 | 
56 |         if self.chart.source.type == "git":
57 |             if "reference" not in self.chart.source:
58 |                 self.chart.source.reference = DEFAULT_GIT_BRANCH
59 |             if "path" not in self.chart.source:
60 |                 self.chart.source.path = ""
61 |             self._source_tmp_dir = git_clone(
62 |                 self.chart.source.location,
63 |                 self.chart.source.reference,
64 |                 self.chart.source.path,
65 |             )
66 |         elif self.chart.source.type == "directory":
67 |             self._source_tmp_dir = self.chart.source.location
68 | 
69 |         else:
70 |             raise ValueError(
71 |                 "Unknown source type {} for chart {}".format(
72 |                     self.chart.source.type, self.chart.name
73 |                 )
74 |             )
75 | 
76 |         return os.path.join(self._source_tmp_dir, subpath)
77 | 
78 |     def _get_values_string(self, vals, parent=None):
79 |         """Given a dictionary of values, recursively returns the arguments to pass to `helm template`.
80 | 
81 |         For example: {"key1": "value1", "key2": {"key3": "value3"}}
82 |         gives ["--set", "key1=value1", "--set", "key2.key3=value3"]
83 | 
84 |         Args:
85 |             vals (dict): Dictionary of values
86 |             parent (str, optional): The parent key
87 | 
88 |         Returns:
89 |             (list[str]): The command list
90 |         """
91 |         values = []
92 |         for k, v in vals.items():
93 |             if isinstance(v, dict):
94 |                 values += self._get_values_string(v, k)
95 |             else:
96 |                 key = "{}={}".format(k, v)
97 |                 if parent is not None:
98 |                     key = "{}.{}".format(parent, key)
99 | 
100 |                 values += ["--set", key]
101 |         return values
102 | 
103 |     def get_chart(self, release_name, values):
104 |         """Executes the command `helm template {args}` to generate the chart
105 |         and saves the yaml to a temporary directory
106 | 
107 |         Args:
108 |             release_name (str): Release name
109 |             values (dict): Values to overwrite
110 | 
111 |         Returns:
112 |             (str): Path of the generated template
113 |         """
114 |         values_options = self._get_values_string(values)
115 |         output = subprocess.check_output(
116 |             ["helm", "template", release_name, self.source_directory] + values_options
117 |         )
118 | 
119 |         if self.chart.source.type == "git":
120 |             subpath = self.chart.source.get("subpath", "")
121 |             template_path = os.path.join(
122 |                 self._source_tmp_dir, subpath, "mlbench_template.yaml"
123 |             )
124 |         else:
125 |             template_path = os.path.join(tempfile.mkdtemp(), "template.yaml")
126 | 
127 |         with open(template_path, "wb") as f:
128 |             f.write(output)
129 |         return template_path
130 | 
-------------------------------------------------------------------------------- /mlbench_core/cli/utils.py: --------------------------------------------------------------------------------
1 | import subprocess
2 | from time import sleep
3 | 
4 | from kubernetes import client
5 | 
6 | from mlbench_core.cli.chartbuilder import ChartBuilder
7 | 
8 | 
9 | def _get_master_pod(release_name, pods):
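# A quick sanity check of the `--set` flattening implemented by
# ChartBuilder._get_values_string above (hypothetical REPL session):
#
#   >>> builder._get_values_string({"limits": {"workers": 2, "gpu": 0}})
#   ['--set', 'limits.workers=2', '--set', 'limits.gpu=0']
#
# i.e. nested dictionary keys become dotted helm `--set` paths.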
10 | """Given a release name and a list of pods, returns the master pod of the release 11 | 12 | Args: 13 | release_name (str): Release name 14 | pods (:obj:`V1PodList`): List of pods 15 | 16 | Returns: 17 | (:obj:`Pod`, optional): The master pod 18 | """ 19 | 20 | master_pod_name = "{}-mlbench-master-".format(release_name) 21 | for pod in pods.items: 22 | if master_pod_name in pod.metadata.name: 23 | return pod 24 | 25 | return None 26 | 27 | 28 | def _wait_for_deployment(release_name): 29 | """Given a release name, waits for the master pod to be running 30 | 31 | Args: 32 | release_name (str): Release name 33 | 34 | Raises: 35 | ValueError: If the master pod is not running 36 | """ 37 | kube_api = client.CoreV1Api() 38 | pods = kube_api.list_namespaced_pod(namespace="default") 39 | master_pod = _get_master_pod(release_name, pods) 40 | while master_pod is None or master_pod.status.phase == "Pending": 41 | pods = kube_api.list_namespaced_pod(namespace="default") 42 | master_pod = _get_master_pod(release_name, pods) 43 | sleep(1) 44 | if master_pod is None or master_pod.status.phase in ["Failed", "Unknown"]: 45 | raise ValueError("Could not deploy chart") 46 | 47 | 48 | def deploy_chart( 49 | num_workers, 50 | num_gpus, 51 | num_cpus, 52 | release_name, 53 | custom_value, 54 | kube_context, 55 | custom_chart=None, 56 | ): 57 | """Deploys the mlbench-helm chart given its values 58 | 59 | Args: 60 | num_workers (int): Number of worker nodes (excluding master) 61 | num_gpus (int): Number of GPUs per node 62 | num_cpus (int): Number of CPUs per node 63 | release_name (str): Release name 64 | custom_value (str): Custom values for chart 65 | kube_context (str): Current kube-context (must be saved in kubeconfig) 66 | custom_chart (dict, optional): Custom chart to use (e.g. local chart) 67 | """ 68 | sleep(5) 69 | 70 | # install chart 71 | chart = ChartBuilder( 72 | { 73 | "name": "mlbench-helm", 74 | "source": { 75 | "type": "git", 76 | "location": "https://github.com/mlbench/mlbench-helm", 77 | }, 78 | } 79 | if custom_chart is None 80 | else custom_chart 81 | ) 82 | 83 | values = {"limits": {"workers": num_workers, "gpu": num_gpus, "cpu": num_cpus}} 84 | if custom_value: 85 | # merge custom values with values 86 | for cv in custom_value: 87 | key, v = cv.split("=", 1) 88 | 89 | current = values 90 | key_path = key.split(".") 91 | 92 | for k in key_path[:-1]: 93 | if k not in current: 94 | current[k] = {} 95 | 96 | current = current[k] 97 | 98 | current[key_path[-1]] = v 99 | 100 | chart_path = chart.get_chart(release_name, values) 101 | 102 | output = subprocess.check_output( 103 | [ 104 | "kubectl", 105 | "apply", 106 | "--validate=false", 107 | "--context={}".format(kube_context), 108 | "-f", 109 | chart_path, 110 | ] 111 | ) 112 | sleep(1) 113 | 114 | _wait_for_deployment(release_name) 115 | -------------------------------------------------------------------------------- /mlbench_core/controlflow/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | import torch 3 | 4 | from . import pytorch 5 | except ImportError: 6 | pass 7 | 8 | try: 9 | import tensorflow 10 | 11 | from . 
import tensorflow
12 | except ImportError:
13 |     pass
14 | 
-------------------------------------------------------------------------------- /mlbench_core/controlflow/pytorch/__init__.py: --------------------------------------------------------------------------------
1 | from .checkpoints_evaluation import CheckpointsEvaluationControlFlow
2 | from .controlflow import (
3 |     compute_train_batch_metrics,
4 |     record_train_batch_stats,
5 |     record_validation_stats,
6 |     validation_round,
7 | )
8 | from .helpers import prepare_batch
9 | 
10 | __all__ = [
11 |     "CheckpointsEvaluationControlFlow",
12 |     "compute_train_batch_metrics",
13 |     "record_validation_stats",
14 |     "record_train_batch_stats",
15 |     "validation_round",
16 |     "prepare_batch",
17 | ]
18 | 
-------------------------------------------------------------------------------- /mlbench_core/controlflow/pytorch/checkpoints_evaluation.py: --------------------------------------------------------------------------------
1 | """Evaluate training/validation set using models in checkpoints"""
2 | import logging
3 | 
4 | import torch
5 | 
6 | from mlbench_core.aggregation.pytorch.centralized import AllReduceAggregation
7 | from mlbench_core.controlflow.pytorch.helpers import iterate_dataloader
8 | from mlbench_core.utils.pytorch.distributed import global_average
9 | 
10 | logger = logging.getLogger("mlbench")
11 | 
12 | 
13 | class CheckpointsEvaluationControlFlow(object):
14 |     """Evaluate models on training / validation dataset.
15 | 
16 |     Args:
17 |         ckpt_dir (str): Path to checkpoints.
18 |         rank (int): The rank of the current process
19 |         world_size (int): The total number of workers
20 |         checkpointer (:obj:`Checkpointer`): Used to load checkpoints.
21 |         model (:obj:`torch.nn.Module`): The model to evaluate.
22 |         epochs (int): Number of epochs to evaluate.
23 |         loss_function (:obj:`torch.nn.modules.loss._Loss`): loss function.
24 |         metrics (:obj:`list` of :obj:`mlbench_core.evaluation.pytorch.*`): metrics like TopKAccuracy.
25 |         use_cuda (bool): Whether to train on GPU or not. Default: `False`
26 |         dtype (str): The datatype to use for the dataloader data
27 |         max_batch_per_epoch (int): Maximum number of batches per epoch. Whole dataset
28 |             is used if not specified. Default: `None`
29 |     """
30 | 
31 |     def __init__(
32 |         self,
33 |         ckpt_dir,
34 |         rank,
35 |         world_size,
36 |         checkpointer,
37 |         model,
38 |         epochs,
39 |         loss_function,
40 |         metrics,
41 |         use_cuda=False,
42 |         dtype=None,
43 |         max_batch_per_epoch=None,
44 |     ):
45 |         self.ckpt_dir = ckpt_dir
46 |         self.rank = rank
47 |         self.checkpointer = checkpointer
48 |         self.model = model
49 |         self.epochs = epochs
50 |         self.loss_function = loss_function
51 |         self.metrics = metrics
52 |         self.dtype = dtype
53 |         self.max_batch_per_epoch = max_batch_per_epoch
54 |         self.use_cuda = use_cuda
55 | 
56 |         self.model_agg_fn = AllReduceAggregation(world_size=world_size).agg_model()
57 | 
58 |         self._check_checkpoints()
59 | 
60 |     def _check_checkpoints(self):
61 |         for epoch in range(self.epochs):
62 |             self.checkpointer.checkpoint_exists(self.ckpt_dir, self.rank, epoch)
63 | 
64 |     def _load_model(self, epoch):
65 |         # Load epoch-rank model
66 |         model = self.checkpointer.load_model_by_epoch(
67 |             self.ckpt_dir, self.rank, epoch, self.model
68 |         )
69 | 
70 |         # aggregate models
71 |         self.model_agg_fn(model, op="avg_world")
72 |         return model
73 | 
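    # Typical wiring (sketch; `checkpointer`, `model` and `metrics` are
    # placeholders provided by the benchmark task):
    #
    #   flow = CheckpointsEvaluationControlFlow(
    #       ckpt_dir="/checkpoints", rank=rank, world_size=world_size,
    #       checkpointer=checkpointer, model=model, epochs=10,
    #       loss_function=torch.nn.CrossEntropyLoss(), metrics=metrics,
    #       use_cuda=False, dtype="fp32",
    #   )
    #   train_stats = flow.evaluate_by_epochs(train_loader)
    #   val_stats = flow.evaluate_by_epochs(val_loader)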
74 |     def evaluate_by_epochs(self, dataloader):
75 |         """Evaluate dataset using the averaged models.
76 | 
77 |         In each epoch, each process loads models and averages them. The
78 |         averaged model is used to evaluate the train / validation dataset.
79 | 
80 |         Args:
81 |             dataloader (:obj:`torch.utils.data.DataLoader`): The dataset to be evaluated.
82 | 
83 |         Returns:
84 |             list: list of stats of models in each epoch.
85 |         """
86 |         stats_list = []
87 |         for epoch in range(self.epochs):
88 |             # Same model for all workers.
89 |             model = self._load_model(epoch)
90 |             model.eval()
91 | 
92 |             stats = {"epoch": epoch, "count": 0, "total_loss": 0}
93 |             for metric in self.metrics:
94 |                 stats["total_" + metric.name] = 0
95 | 
96 |             data_iter = iterate_dataloader(
97 |                 dataloader, self.dtype, self.max_batch_per_epoch, self.use_cuda
98 |             )
99 | 
100 |             with torch.no_grad():
101 |                 for i, (data, target) in enumerate(data_iter):
102 |                     output = model(data)
103 | 
104 |                     # Compute loss and metrics.
105 |                     count = len(target)
106 |                     stats["count"] += count
107 |                     stats["total_loss"] += self.loss_function(output, target) * count
108 |                     for metric in self.metrics:
109 |                         stats["total_" + metric.name] += metric(output, target) * count
110 | 
111 |                     logger.info(
112 |                         "E{:4}B{:4}: total loss={:10.3e}".format(
113 |                             epoch, i, stats["total_loss"] / stats["count"]
114 |                         )
115 |                     )
116 | 
117 |             # Keep globally averaged loss / metrics, etc.
118 |             stats["loss"] = global_average(stats["total_loss"], stats["count"]).item()
119 |             for metric in self.metrics:
120 |                 stats[metric.name] = global_average(
121 |                     stats["total_" + metric.name], stats["count"]
122 |                 ).item()
123 |                 del stats["total_" + metric.name]
124 |             del stats["count"], stats["total_loss"]
125 | 
126 |             stats_list.append(stats)
127 |         return stats_list
128 | 
-------------------------------------------------------------------------------- /mlbench_core/controlflow/pytorch/helpers.py: --------------------------------------------------------------------------------
1 | import itertools
2 | 
3 | 
4 | def maybe_range(maximum):
5 |     """Map an integer or None to an integer iterator starting from 0 with stride 1.
6 | 
7 |     If the maximum number of batches per epoch is limited, returns a finite
8 |     iterator. Otherwise, returns an iterator of infinite length.
9 | 
10 |     Args:
11 |         maximum (int | None): Maximum number of steps in the iterator.
12 |             If `None`, returns an iterator of infinite length.
13 | 
14 |     Returns:
15 |         (iterator)
16 |     """
17 |     if maximum is None:
18 |         counter = itertools.count(0)
19 |     else:
20 |         counter = range(maximum)
21 |     return counter
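# Behaviour sketch:
#
#   >>> list(zip(maybe_range(3), "abcdef"))
#   [(0, 'a'), (1, 'b'), (2, 'c')]   # capped at 3 steps
#   >>> list(zip(maybe_range(None), "abc"))
#   [(0, 'a'), (1, 'b'), (2, 'c')]   # bounded only by the data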
22 | 
23 | 
24 | def convert_dtype(dtype, obj):
25 |     """Converts a given tensor or module to a given dtype
26 | 
27 |     Args:
28 |         dtype (str): One of `fp32` or `fp64`
29 |         obj (:obj:`torch.Tensor` | :obj:`torch.nn.Module`): Module or tensor to convert
30 | 
31 |     Returns:
32 |         (:obj:`torch.Tensor` | :obj:`torch.nn.Module`): Converted tensor or module
33 |     """
34 |     # The object should be a ``module`` or a ``tensor``
35 |     if dtype == "fp32":
36 |         return obj.float()
37 |     elif dtype == "fp64":
38 |         return obj.double()
39 |     else:
40 |         raise NotImplementedError("dtype {} not supported.".format(dtype))
41 | 
42 | 
43 | def prepare_batch(data, target, dtype, transform_target_dtype=False, use_cuda=False):
44 |     """Prepares a batch for training by changing the type and sending to cuda
45 |     if necessary
46 | 
47 |     Args:
48 |         data (:obj:`torch.Tensor`): The input tensor
49 |         target (:obj:`torch.Tensor`): The target tensor
50 |         dtype (str): One of `fp32` or `fp64`, data type to transform input and/or target
51 |         transform_target_dtype (bool): Transform target to `dtype` too
52 |         use_cuda (bool): Send tensors to GPU
53 | 
54 |     Returns:
55 |         (:obj:`torch.Tensor`, :obj:`torch.Tensor`): Input and target tensors
56 |     """
57 |     data = convert_dtype(dtype, data)
58 |     if transform_target_dtype:
59 |         target = convert_dtype(dtype, target)
60 | 
61 |     if use_cuda:
62 |         data, target = data.cuda(), target.cuda()
63 | 
64 |     return data, target
65 | 
66 | 
67 | def iterate_dataloader(
68 |     dataloader,
69 |     dtype,
70 |     max_batch_per_epoch=None,
71 |     use_cuda=False,
72 |     transform_target_type=False,
73 | ):
74 |     """Returns an iterator over the given loader.
75 |     Can be used to limit the number of batches, convert input and target dtypes
76 |     and send tensors to the GPU
77 | 
78 |     Args:
79 |         dataloader (:obj:`torch.utils.data.DataLoader`): The loader
80 |         dtype (str): Type to convert to (`fp32` or `fp64`)
81 |         max_batch_per_epoch (int | None): Maximum number of batches
82 |         use_cuda (bool): Send tensors to GPU
83 |         transform_target_type (bool): Transform target dtype as well
84 | 
85 |     Returns:
86 |         (iterator): An iterator over the data
87 |     """
88 |     for _, (data, target) in zip(maybe_range(max_batch_per_epoch), dataloader):
89 |         data, target = prepare_batch(
90 |             data=data,
91 |             target=target,
92 |             dtype=dtype,
93 |             transform_target_dtype=transform_target_type,
94 |             use_cuda=use_cuda,
95 |         )
96 | 
97 |         yield data, target
98 | 
-------------------------------------------------------------------------------- /mlbench_core/controlflow/tensorflow/__init__.py: --------------------------------------------------------------------------------
1 | from .train_validation import TrainValidation
2 | 
3 | __all__ = ["TrainValidation"]
4 | 
-------------------------------------------------------------------------------- /mlbench_core/dataset/__init__.py: --------------------------------------------------------------------------------
1 | from . import imagerecognition, linearmodels, nlp, util
2 | 
-------------------------------------------------------------------------------- /mlbench_core/dataset/imagerecognition/__init__.py: --------------------------------------------------------------------------------
1 | try:
2 |     import torch
3 | 
4 |     from . import pytorch
5 | except ImportError:
6 |     pass
7 | 
8 | try:
9 |     import tensorflow
10 | 
11 |     from .
import tensorflow 12 | except ImportError: 13 | pass 14 | -------------------------------------------------------------------------------- /mlbench_core/dataset/imagerecognition/pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | from .dataloader import CIFAR10V1, Imagenet 2 | -------------------------------------------------------------------------------- /mlbench_core/dataset/imagerecognition/pytorch/dataloader.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import torchvision.datasets as datasets 5 | import torchvision.transforms as transforms 6 | 7 | _logger = logging.getLogger("mlbench") 8 | 9 | 10 | class CIFAR10V1(datasets.CIFAR10): 11 | """CIFAR10V1 Dataset. 12 | 13 | Loads CIFAR10V1 images with mean and std-dev normalisation. 14 | Performs random crop and random horizontal flip on train and 15 | only normalisation on val. 16 | Based on `torchvision.datasets.CIFAR10` and `Pytorch CIFAR 10 Example`_. 17 | 18 | Args: 19 | root (str): Root folder for the dataset 20 | train (bool): Whether to get the train or validation set (default=True) 21 | download (bool): Whether to download the dataset if it's not present 22 | 23 | .. _Pytorch CIFAR 10 Example: 24 | https://github.com/kuangliu/pytorch-cifar/blob/master/main.py 25 | """ 26 | 27 | def __init__(self, root, train=True, download=False): 28 | cifar10_stats = { 29 | "mean": (0.4914, 0.4822, 0.4465), 30 | "std": (0.2023, 0.1994, 0.2010), 31 | } 32 | 33 | if train: 34 | transform = transforms.Compose( 35 | [ 36 | transforms.RandomHorizontalFlip(), 37 | transforms.RandomCrop(32, padding=4), 38 | transforms.ToTensor(), 39 | transforms.Normalize(cifar10_stats["mean"], cifar10_stats["std"]), 40 | ] 41 | ) 42 | else: 43 | transform = transforms.Compose( 44 | [ 45 | transforms.ToTensor(), 46 | transforms.Normalize(cifar10_stats["mean"], cifar10_stats["std"]), 47 | ] 48 | ) 49 | super(CIFAR10V1, self).__init__( 50 | root=root, train=train, transform=transform, download=download 51 | ) 52 | 53 | 54 | class Imagenet(datasets.ImageFolder): 55 | """Imagenet (ILSVRC2017) Dataset. 56 | 57 | Loads Imagenet images with mean and std-dev normalisation. 58 | Performs random crop and random horizontal flip on train and 59 | resize + center crop on val. 
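    Example (sketch; the root path and batch size are illustrative)::

        train_set = Imagenet("/data/imagenet", train=True)
        loader = torch.utils.data.DataLoader(train_set, batch_size=256, shuffle=True)
        images, labels = next(iter(loader))  # images: [256, 3, 224, 224], normalised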
60 |     Based on `torchvision.datasets.ImageFolder`
61 | 
62 |     Args:
63 |         root (str): Root folder of the Imagenet dataset (without `train/` or `val/`)
64 |         train (bool): Whether to get the train or validation set (default=True)
65 |     """
66 | 
67 |     def __init__(self, root, train=True):
68 |         self.train = train
69 | 
70 |         imagenet_stats = {"mean": [0.485, 0.456, 0.406], "std": [0.229, 0.224, 0.225]}
71 | 
72 |         if train:
73 |             transform = transforms.Compose(
74 |                 [
75 |                     transforms.RandomResizedCrop(224),
76 |                     transforms.RandomHorizontalFlip(),
77 |                     transforms.ToTensor(),
78 |                     transforms.Normalize(imagenet_stats["mean"], imagenet_stats["std"]),
79 |                 ]
80 |             )
81 |             self.root = os.path.join(root, "train")
82 |         else:
83 |             transform = transforms.Compose(
84 |                 [
85 |                     transforms.Resize(256),
86 |                     transforms.CenterCrop(224),
87 |                     transforms.ToTensor(),
88 |                     transforms.Normalize(imagenet_stats["mean"], imagenet_stats["std"]),
89 |                 ]
90 |             )
91 |             self.root = os.path.join(root, "val")
92 | 
93 |         super().__init__(self.root, transform)
94 | 
-------------------------------------------------------------------------------- /mlbench_core/dataset/imagerecognition/tensorflow/__init__.py: --------------------------------------------------------------------------------
1 | from .cifar10 import DatasetCifar
2 | 
3 | __all__ = ["DatasetCifar"]
4 | 
-------------------------------------------------------------------------------- /mlbench_core/dataset/linearmodels/__init__.py: --------------------------------------------------------------------------------
1 | try:
2 |     import torch
3 | 
4 |     from . import pytorch
5 | except ImportError:
6 |     pass
7 | 
-------------------------------------------------------------------------------- /mlbench_core/dataset/linearmodels/pytorch/__init__.py: --------------------------------------------------------------------------------
1 | from .dataloader import LMDBDataset
2 | 
3 | __all__ = ["LMDBDataset"]
4 | 
-------------------------------------------------------------------------------- /mlbench_core/dataset/nlp/__init__.py: --------------------------------------------------------------------------------
1 | try:
2 |     import torch
3 | 
4 |     from . import pytorch
5 | except ImportError:
6 |     pass
7 | 
-------------------------------------------------------------------------------- /mlbench_core/dataset/nlp/pytorch/__init__.py: --------------------------------------------------------------------------------
1 | from mlbench_core.dataset.nlp.pytorch.wmt16.utils import build_collate_fn
2 | 
3 | from .wikitext2_dataset import Wikitext2Dataset
4 | from .wmt16_dataset import WMT16Dataset
5 | from .wmt17.batching import get_batches
6 | from .wmt17_dataset import WMT17Dataset
7 | 
-------------------------------------------------------------------------------- /mlbench_core/dataset/nlp/pytorch/wmt16/__init__.py: --------------------------------------------------------------------------------
1 | from .
import wmt16_config 2 | from .utils import * 3 | from .wmt16_tokenizer import WMT16Tokenizer 4 | -------------------------------------------------------------------------------- /mlbench_core/dataset/nlp/pytorch/wmt16/preprocess/filter_dataset.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from collections import Counter 3 | 4 | 5 | def parse_args(): 6 | parser = argparse.ArgumentParser(description="Clean dataset") 7 | parser.add_argument("-f1", "--file1", help="file1") 8 | parser.add_argument("-f2", "--file2", help="file2") 9 | return parser.parse_args() 10 | 11 | 12 | def save_output(fname, data): 13 | with open(fname, "w") as f: 14 | f.writelines(data) 15 | 16 | 17 | def main(): 18 | """ 19 | Discards all pairs of sentences which can't be decoded by latin-1 encoder. 20 | 21 | It aims to filter out sentences with rare unicode glyphs and pairs which 22 | are most likely not valid English-German sentences. 23 | 24 | Examples of discarded sentences: 25 | 26 | ✿★★★Hommage au king de la pop ★★★✿ ✿★★★Que son âme repos... 27 | 28 | Для их осуществления нам, прежде всего, необходимо преодолеть 29 | возражения рыночных фундаменталистов, которые хотят ликвидировать или 30 | уменьшить роль МВФ. 31 | 32 | practised as a scientist in various medical departments of the ⇗Medical 33 | University of Hanover , the ⇗University of Ulm , and the ⇗RWTH Aachen 34 | (rheumatology, pharmacology, physiology, pathology, microbiology, 35 | immunology and electron-microscopy). 36 | 37 | The same shift】 and press 【】 【alt out with a smaller diameter 38 | circle. 39 | 40 | Brought to you by ABMSUBS ♥leira(Coordinator/Translator) 41 | ♥chibichan93(Timer/Typesetter) ♥ja... 42 | 43 | Some examples: &0u - ☺ &0U - ☻ &tel - ☏ &PI - ¶ &SU - ☼ &cH- - ♥ &M2=♫ 44 | &sn - ﺵ SGML maps SGML to unicode. 
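    The core check, as a standalone predicate (a sketch of the logic used in
    main() below)::

        def is_latin1_pair(line1, line2):
            try:
                line1.encode("latin1")
                line2.encode("latin1")
                return True
            except UnicodeEncodeError:
                return False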
45 | """ 46 | args = parse_args() 47 | 48 | c = Counter() 49 | skipped = 0 50 | valid = 0 51 | data1 = [] 52 | data2 = [] 53 | 54 | with open(args.file1) as f1, open(args.file2) as f2: 55 | for idx, lines in enumerate(zip(f1, f2)): 56 | line1, line2 = lines 57 | if idx % 100000 == 1: 58 | print("Processed {} lines".format(idx)) 59 | try: 60 | line1.encode("latin1") 61 | line2.encode("latin1") 62 | except UnicodeEncodeError: 63 | skipped += 1 64 | else: 65 | data1.append(line1) 66 | data2.append(line2) 67 | valid += 1 68 | c.update(line1) 69 | 70 | ratio = valid / (skipped + valid) 71 | print("Skipped: {}, Valid: {}, Valid ratio {}".format(skipped, valid, ratio)) 72 | print("Character frequency:", c) 73 | 74 | save_output(args.file1, data1) 75 | save_output(args.file2, data2) 76 | 77 | 78 | if __name__ == "__main__": 79 | main() 80 | -------------------------------------------------------------------------------- /mlbench_core/dataset/nlp/pytorch/wmt16/preprocess/preprocess.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | 4 | import torch 5 | 6 | from mlbench_core.dataset.nlp.pytorch.wmt16 import wmt16_config 7 | from mlbench_core.dataset.nlp.pytorch.wmt16_dataset import WMT16Dataset 8 | 9 | 10 | def parse_args(): 11 | parser = argparse.ArgumentParser( 12 | description="GNMT prepare data", 13 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 14 | ) 15 | 16 | parser.add_argument( 17 | "--dataset-dir", 18 | default="data/wmt16_de_en", 19 | help="path to the directory with training/test data", 20 | ) 21 | parser.add_argument( 22 | "--max-size", 23 | default=None, 24 | type=int, 25 | help="use at most MAX_SIZE elements from training \ 26 | dataset (useful for benchmarking), by default \ 27 | uses entire dataset", 28 | ) 29 | 30 | parser.add_argument( 31 | "--math", 32 | default="fp16", 33 | choices=["fp32", "fp16"], 34 | help="arithmetic type", 35 | ) 36 | 37 | parser.add_argument( 38 | "--max-length-train", 39 | default=75, 40 | type=int, 41 | help="maximum sequence length for training \ 42 | (including special BOS and EOS tokens)", 43 | ) 44 | parser.add_argument( 45 | "--min-length-train", 46 | default=0, 47 | type=int, 48 | help="minimum sequence length for training \ 49 | (including special BOS and EOS tokens)", 50 | ) 51 | 52 | parser.add_argument( 53 | "--num-workers", default=2, type=int, help="Number of workers for loader" 54 | ) 55 | parser.add_argument( 56 | "--batch-size", default=1024, type=int, help="Batch size for loader" 57 | ) 58 | args = parser.parse_args() 59 | return args 60 | 61 | 62 | def build_collate_fn(max_seq_len): 63 | def collate_seq(seq): 64 | lengths = torch.tensor([len(s) for s in seq]) 65 | batch_length = max_seq_len 66 | 67 | shape = (len(seq), batch_length) 68 | seq_tensor = torch.full(shape, wmt16_config.PAD, dtype=torch.int64) 69 | 70 | for i, s in enumerate(seq): 71 | end_seq = lengths[i] 72 | seq_tensor[i, :end_seq].copy_(s[:end_seq]) 73 | 74 | return seq_tensor, lengths 75 | 76 | def parallel_collate(seqs): 77 | src_seqs, tgt_seqs = zip(*seqs) 78 | return tuple([collate_seq(s) for s in [src_seqs, tgt_seqs]]) 79 | 80 | return parallel_collate 81 | 82 | 83 | def main(): 84 | args = parse_args() 85 | 86 | print(f"Run arguments: {args}") 87 | 88 | train_data = WMT16Dataset( 89 | args.dataset_dir, 90 | lang=("en", "de"), 91 | math_precision=args.math, 92 | download=False, 93 | train=True, 94 | lazy=True, 95 | min_len=args.min_length_train, 96 | max_len=args.max_length_train, 
97 |         sort=False,
98 |         max_size=args.max_size,
99 |     )
100 | 
101 |     print("Total train points to pre-process: {}".format(len(train_data)))
102 |     collate_fn = build_collate_fn(max_seq_len=args.max_length_train)
103 | 
104 |     train_data.write_as_preprocessed(
105 |         collate_fn,
106 |         args.min_length_train,
107 |         args.max_length_train,
108 |         num_workers=args.num_workers,
109 |         batch_size=args.batch_size,
110 |     )
111 | 
112 | 
113 | if __name__ == "__main__":
114 |     main()
115 | 
-------------------------------------------------------------------------------- /mlbench_core/dataset/nlp/pytorch/wmt16/wmt16_config.py: --------------------------------------------------------------------------------
1 | """Configuration for WMT16 dataset"""
2 | PAD_TOKEN = "<pad>"
3 | UNK_TOKEN = "<unk>"
4 | BOS_TOKEN = "<s>"
5 | EOS_TOKEN = "<\s>"
6 | 
7 | PAD, UNK, BOS, EOS = 0, 1, 2, 3
8 | BPE_CODES = "bpe.32000"
9 | VOCAB_FNAME = "vocab.bpe.32000"
10 | 
11 | TRAIN_FNAME = "train.tok.clean.bpe.32000"
12 | VAL_FNAME = "newstest2014.tok.bpe.32000"
13 | 
14 | EXTS = (".en", ".de")
15 | 
-------------------------------------------------------------------------------- /mlbench_core/dataset/nlp/pytorch/wmt16/wmt16_tokenizer.py: --------------------------------------------------------------------------------
1 | import os
2 | from collections import defaultdict
3 | from functools import partial
4 | 
5 | import torch
6 | 
7 | from mlbench_core.dataset.nlp.pytorch.wmt16 import wmt16_config
8 | 
9 | 
10 | def _pad_vocabulary(vocab, math):
11 |     """
12 |     Pads vocabulary to a multiple of `pad` tokens (e.g. a 32317-entry vocabulary is padded to 32320 when `pad` is 8).
13 | 
14 |     Args:
15 |         vocab (list): list with vocabulary
16 |         math (str): Math precision, either `fp16` or `fp32`
17 | 
18 |     Returns:
19 |         list: padded vocabulary
20 |     """
21 |     if math == "fp16":
22 |         pad = 8
23 |     elif math == "fp32":
24 |         pad = 1
25 |     else:
26 |         raise NotImplementedError()
27 | 
28 |     vocab_size = len(vocab)
29 |     padded_vocab_size = (vocab_size + pad - 1) // pad * pad
30 |     for i in range(0, padded_vocab_size - vocab_size):
31 |         token = f"madeupword{i:04d}"
32 |         vocab.append(token)
33 |     assert len(vocab) % pad == 0
34 |     return vocab
35 | 
36 | 
37 | class WMT16Tokenizer:
38 |     """Tokenizer Class for WMT16 that uses the whole vocabulary
39 | 
40 |     Args:
41 |         base_dir (str): Base directory for files
42 |         math_precision (str): Math precision
43 |         separator (str): BPE separator token (default: `@@`)
44 |     """
45 | 
46 |     def __init__(
47 |         self,
48 |         base_dir,
49 |         math_precision=None,
50 |         separator="@@",
51 |     ):
52 |         self.separator = separator
53 | 
54 |         vocab = [
55 |             wmt16_config.PAD_TOKEN,
56 |             wmt16_config.UNK_TOKEN,
57 |             wmt16_config.BOS_TOKEN,
58 |             wmt16_config.EOS_TOKEN,
59 |         ]
60 |         vocab_fname = os.path.join(base_dir, wmt16_config.VOCAB_FNAME)
61 | 
62 |         with open(vocab_fname, encoding="utf-8") as vfile:
63 |             for line in vfile:
64 |                 vocab.append(line.strip())
65 | 
66 |         vocab = _pad_vocabulary(vocab, math_precision)
67 |         self.vocab_size = len(vocab)
68 | 
69 |         self.tok2idx = defaultdict(partial(int, wmt16_config.UNK))
70 |         for idx, token in enumerate(vocab):
71 |             self.tok2idx[token] = idx
72 | 
73 |         self.idx2tok = {}
74 |         for key, value in self.tok2idx.items():
75 |             self.idx2tok[value] = key
76 | 
77 |     def segment(self, line):
78 |         """
79 |         Tokenizes a single sentence and adds special BOS and EOS tokens.
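        Example (sketch; assumes the words are in the vocabulary)::

            tok = WMT16Tokenizer("data/wmt16_de_en", math_precision="fp32")
            ids = tok.segment("ein Haus")   # [BOS, id('ein'), id('Haus'), EOS]
            tok.detokenize(ids)             # -> 'ein Haus'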
80 | 81 | :param line: sentence 82 | 83 | returns: list representing tokenized sentence 84 | """ 85 | line = line.strip().split() 86 | entry = [self.tok2idx[i] for i in line] 87 | entry = [wmt16_config.BOS] + entry + [wmt16_config.EOS] 88 | return entry 89 | 90 | def detokenize(self, inputs, delim=" "): 91 | """ 92 | Detokenizes single sentence and removes token separator characters. 93 | 94 | :param inputs: sequence of tokens 95 | :param delim: tokenization delimiter 96 | 97 | returns: string representing detokenized sentence 98 | """ 99 | detok = delim.join([self.idx2tok[idx] for idx in inputs]) 100 | detok = detok.replace(self.separator + " ", "") 101 | detok = detok.replace(self.separator, "") 102 | 103 | detok = detok.replace(wmt16_config.BOS_TOKEN, "") 104 | detok = detok.replace(wmt16_config.EOS_TOKEN, "") 105 | detok = detok.replace(wmt16_config.PAD_TOKEN, "") 106 | detok = detok.strip() 107 | return detok 108 | -------------------------------------------------------------------------------- /mlbench_core/dataset/nlp/pytorch/wmt17/__init__.py: -------------------------------------------------------------------------------- 1 | from .collate import collate_batch 2 | from .wmt17_dictionary import Dictionary 3 | -------------------------------------------------------------------------------- /mlbench_core/dataset/nlp/pytorch/wmt17/collate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def _collate_tokens( 5 | values, 6 | pad_idx, 7 | eos_idx, 8 | left_pad, 9 | move_eos_to_beginning=False, 10 | n_seq_per_batch_multiple=8, 11 | seq_len_multiple=1, 12 | ): 13 | """Convert a list of 1d tensors into a padded 2d tensor. 14 | 15 | Args: 16 | values (list[torch.Tensor]): A list of tensors 17 | pad_idx (int): Padding symbol index 18 | eos_idx (int): EOS symbol index 19 | left_pad (bool): left- or right-padding (true: left, false: right) 20 | move_eos_to_beginning (bool): Reverse order of sequence of tokens (true: reverse, false: original) 21 | n_seq_per_batch_multiple (int): The number of sequences per batch to round down to 22 | seq_len_multiple (int): The number of tokens per sequence to round up to 23 | 24 | Returns: 25 | (:obj:`torch.Tensor`): The tensor of collated and padded tokens 26 | """ 27 | size_of_seq_dim = max(v.size(0) for v in values) # Unpadded size 28 | n_seq_in_batch = len(values) 29 | 30 | if n_seq_per_batch_multiple % seq_len_multiple == 0: 31 | n_seq_multiple = n_seq_per_batch_multiple / seq_len_multiple 32 | else: 33 | n_seq_multiple = n_seq_per_batch_multiple 34 | 35 | if n_seq_in_batch < n_seq_multiple or n_seq_in_batch % n_seq_multiple > 0: 36 | seq_len_multiple = n_seq_per_batch_multiple 37 | 38 | size_of_seq_dim = ( 39 | (size_of_seq_dim + seq_len_multiple - 1) // seq_len_multiple * seq_len_multiple 40 | ) # Padded seq len, rounded up to next multiple 41 | 42 | padded_2d_tensor = values[0].new(len(values), size_of_seq_dim).fill_(pad_idx) 43 | 44 | def copy_tensor(src, dst): 45 | assert dst.numel() == src.numel() 46 | 47 | if move_eos_to_beginning: 48 | assert src[-1] == eos_idx 49 | dst[0] = eos_idx 50 | dst[1:] = src[:-1] 51 | else: 52 | dst.copy_(src) 53 | 54 | if left_pad: 55 | for idx, val in enumerate(values): 56 | copy_tensor(val, padded_2d_tensor[idx][size_of_seq_dim - len(val) :]) 57 | else: 58 | for idx, val in enumerate(values): 59 | copy_tensor(val, padded_2d_tensor[idx][: len(val)]) 60 | 61 | return padded_2d_tensor 62 | 63 | 64 | def collate_batch( 65 | samples, 66 | pad_idx, 67 | 
eos_idx,
68 |     left_pad_source=True,
69 |     left_pad_target=False,
70 |     bsz_mult=8,
71 |     seq_len_multiple=1,
72 | ):
73 |     """Collate a list of samples into a batch
74 | 
75 |     Args:
76 |         samples (list[dict]): Samples to collate
77 |         pad_idx (int): Padding symbol index
78 |         eos_idx (int): EOS symbol index
79 |         left_pad_source (bool): Pad sources on the left
80 |         left_pad_target (bool): Pad targets on the left
81 |         bsz_mult (int): Batch size multiple
82 |         seq_len_multiple (int): Sequence length multiple
83 | 
84 |     Returns:
85 |         (dict): Containing keys `id` (list of indices), `ntokens` (total num tokens), `net_input` and `target`
86 | 
87 |     """
88 |     if len(samples) == 0:
89 |         return {}
90 | 
91 |     def merge(key, left_pad, move_eos_to_beginning=False):
92 |         return _collate_tokens(
93 |             [s[key] for s in samples],
94 |             pad_idx,
95 |             eos_idx,
96 |             left_pad,
97 |             move_eos_to_beginning,
98 |             bsz_mult,
99 |             seq_len_multiple,
100 |         )
101 | 
102 |     id = torch.LongTensor([s["id"] for s in samples])
103 |     src_tokens = merge("source", left_pad=left_pad_source)
104 |     # source lengths (used downstream to sort by descending source length)
105 |     src_lengths = torch.LongTensor([s["source"].numel() for s in samples])
106 | 
107 |     prev_output_tokens = None
108 |     target = None
109 |     if samples[0].get("target", None) is not None:
110 |         target = merge("target", left_pad=left_pad_target)
111 |         # we create a shifted version of targets for feeding the
112 |         # previous output token(s) into the next decoder step
113 |         prev_output_tokens = merge(
114 |             "target",
115 |             left_pad=left_pad_target,
116 |             move_eos_to_beginning=True,
117 |         )
118 |         ntokens = sum(len(s["target"]) for s in samples)
119 |     else:
120 |         ntokens = sum(len(s["source"]) for s in samples)
121 | 
122 |     return {
123 |         "id": id,
124 |         "ntokens": ntokens,
125 |         "net_input": {
126 |             "src_tokens": src_tokens,
127 |             "src_lengths": src_lengths,
128 |             "prev_output_tokens": prev_output_tokens,
129 |         },
130 |         "target": target,
131 |     }
132 | 
-------------------------------------------------------------------------------- /mlbench_core/dataset/nlp/pytorch/wmt17/preprocess/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlbench/mlbench-core/4fd3c7e6f1a5be69e52383ab2eb64cad257218c2/mlbench_core/dataset/nlp/pytorch/wmt17/preprocess/__init__.py
-------------------------------------------------------------------------------- /mlbench_core/dataset/nlp/pytorch/wmt17/preprocess/indexed_dataset.py: --------------------------------------------------------------------------------
1 | import struct
2 | 
3 | import numpy as np
4 | 
5 | dtypes = {
6 |     1: np.uint8,
7 |     2: np.int8,
8 |     3: np.int16,
9 |     4: np.int32,
10 |     5: np.int64,
11 |     6: np.float,
12 |     7: np.double,
13 | }
14 | 
15 | 
16 | def write_longs(f, a):
17 |     f.write(np.array(a, dtype=np.int64))
18 | 
19 | 
20 | def code(dtype):
21 |     for k in dtypes.keys():
22 |         if dtypes[k] == dtype:
23 |             return k
24 | 
25 | 
26 | class IndexedDatasetBuilder(object):
27 |     element_sizes = {
28 |         np.uint8: 1,
29 |         np.int8: 1,
30 |         np.int16: 2,
31 |         np.int32: 4,
32 |         np.int64: 8,
33 |         np.float: 4,
34 |         np.double: 8,
35 |     }
36 | 
37 |     def __init__(self, out_file, dtype=np.int32):
38 |         self.out_file = open(out_file, "wb")
39 |         self.dtype = dtype
40 |         self.data_offsets = [0]
41 |         self.dim_offsets = [0]
42 |         self.sizes = []
43 |         self.element_size = self.element_sizes[self.dtype]
44 | 
45 |     def add_item(self, tensor):
46 |         bytes = self.out_file.write(np.array(tensor.numpy(), dtype=self.dtype))
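        # The write above returns a byte count; the offsets below are kept in
        # *elements* (bytes / element_size), so data_offsets[i]:data_offsets[i+1]
        # spans item i in the flat binary file, while sizes and dim_offsets
        # record each item's shape.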
47 |         self.data_offsets.append(self.data_offsets[-1] + bytes / self.element_size)
48 |         for s in tensor.size():
49 |             self.sizes.append(s)
50 |         self.dim_offsets.append(self.dim_offsets[-1] + len(tensor.size()))
51 | 
52 |     def finalize(self, index_file):
53 |         self.out_file.close()
54 |         index = open(index_file, "wb")
55 |         index.write(b"TNTIDX\x00\x00")
56 |         index.write(struct.pack("<Q", 1))
57 |         index.write(struct.pack("<QQ", code(self.dtype), self.element_size))
58 |         index.write(struct.pack("<QQ", len(self.data_offsets) - 1, len(self.sizes)))
59 |         write_longs(index, self.dim_offsets)
60 |         write_longs(index, self.data_offsets)
61 |         write_longs(index, self.sizes)
62 |         index.close()
63 | 
-------------------------------------------------------------------------------- /mlbench_core/dataset/nlp/pytorch/wmt17/wmt17_dictionary.py: --------------------------------------------------------------------------------
1 | import torch
2 | 
3 | 
4 | class Dictionary(object):
5 |     """A mapping from symbols to consecutive integers.
6 | 
7 |     Indexing starts at 1 for consistency with the original Lua
8 |     implementation; index 0 is taken by a reserved placeholder symbol.
9 |     """
10 | 
11 |     def __init__(self):
12 |         self.symbols = []
13 |         self.count = []
14 |         self.indices = {}
15 |         # NOTE: the token string below is assumed (fairseq's reserved
16 |         # index-0 symbol); the original literal was lost to markup
17 |         # stripping in this listing, as were other "<...>" tokens.
18 |         # pad and eos are expected at indices 1 and 2 once the
19 |         # reference dictionary file has been loaded.
20 |         self.add_symbol("<Lua heritage>")
21 |         self.pad_index = 1
22 |         self.eos_index = 2
23 |         self.nspecial = 3
24 | 
25 |     def __eq__(self, other):
26 |         return self.indices == other.indices
27 | 
28 |     def __getitem__(self, idx):
29 |         if idx < len(self.symbols):
30 |             return self.symbols[idx]
31 |         else:
32 |             assert idx < len(self.symbols)
33 | 
34 |     def __len__(self):
35 |         """Returns the number of symbols in the dictionary"""
36 |         return len(self.symbols)
37 | 
38 |     def index(self, sym):
39 |         """Returns the index of the specified symbol"""
40 |         if sym in self.indices:
41 |             return self.indices[sym]
42 |         else:
43 |             assert sym in self.indices
44 | 
45 |     def string(self, tensor, bpe_symbol=None):
46 |         """Helper for converting a tensor of token indices to a string.
47 | 
48 |         Can optionally remove BPE symbols or escape words.
49 |         """
50 |         if torch.is_tensor(tensor) and tensor.dim() == 2:
51 |             return "\n".join(self.string(t) for t in tensor)
52 | 
53 |         def token_string(i):
54 |             return self[i]
55 | 
56 |         sent = " ".join(token_string(i) for i in tensor if i != self.eos())
57 |         if bpe_symbol is not None:
58 |             sent = (sent + " ").replace(bpe_symbol, "").rstrip()
59 | 
60 |         return sent
61 | 
62 |     def add_symbol(self, word, n=1):
63 |         """Adds a word to the dictionary"""
64 |         if word in self.indices:
65 |             idx = self.indices[word]
66 |             self.count[idx] = self.count[idx] + n
67 |             return idx
68 |         else:
69 |             idx = len(self.symbols)
70 |             self.indices[word] = idx
71 |             self.symbols.append(word)
72 |             self.count.append(n)
73 |             return idx
74 | 
75 |     def update(self, new_dict):
76 |         """Updates counts from new dictionary."""
77 |         for word in new_dict.symbols:
78 |             idx2 = new_dict.indices[word]
79 |             if word in self.indices:
80 |                 idx = self.indices[word]
81 |                 self.count[idx] = self.count[idx] + new_dict.count[idx2]
82 |             else:
83 |                 idx = len(self.symbols)
84 |                 self.indices[word] = idx
85 |                 self.symbols.append(word)
86 |                 self.count.append(new_dict.count[idx2])
87 | 
88 |     def pad(self):
89 |         """Helper to get index of pad symbol"""
90 |         return self.pad_index
91 | 
92 |     def eos(self):
93 |         """Helper to get index of end-of-sentence symbol"""
94 |         return self.eos_index
95 | 
96 |     @classmethod
97 |     def load(cls, f, ignore_utf_errors=False):
98 |         """Loads the dictionary from a text file with the format:
99 | 
100 |         ```
101 |         <symbol0>
102 |         <symbol1>
103 |         ...
104 | ``` 105 | 106 | Args: 107 | f (str): Dictionary file name 108 | ignore_utf_errors (bool): Ignore UTF-8 related errors 109 | """ 110 | if isinstance(f, str): 111 | try: 112 | if not ignore_utf_errors: 113 | with open(f, "r", encoding="utf-8") as fd: 114 | return cls.load(fd) 115 | else: 116 | with open(f, "r", encoding="utf-8", errors="ignore") as fd: 117 | return cls.load(fd) 118 | 119 | except FileNotFoundError as fnfe: 120 | raise fnfe 121 | 122 | except Exception: 123 | raise Exception( 124 | "Incorrect encoding detected in {}, please rebuild the dataset".format( 125 | f 126 | ) 127 | ) 128 | 129 | d = cls() 130 | for line in f.readlines(): 131 | word = line.strip()[1:-1] 132 | count = 1 133 | d.indices[word] = len(d.symbols) 134 | d.symbols.append(word) 135 | d.count.append(count) 136 | 137 | n_pad_tokens_on_end = 33712 - len(d.symbols) 138 | 139 | for i in range(n_pad_tokens_on_end): 140 | pad_str = "" 141 | d.indices[pad_str] = len(d.symbols) 142 | d.symbols.append(pad_str) 143 | d.count.append(1) 144 | 145 | return d 146 | -------------------------------------------------------------------------------- /mlbench_core/dataset/util/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | import torch 3 | 4 | from . import pytorch 5 | except ImportError: 6 | pass 7 | 8 | try: 9 | import tensorflow 10 | 11 | from . import tensorflow 12 | except ImportError: 13 | pass 14 | -------------------------------------------------------------------------------- /mlbench_core/dataset/util/pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | from .partition import ( 2 | DataPartitioner, 3 | Partition, 4 | Partitioner, 5 | partition_dataset_by_rank, 6 | ) 7 | -------------------------------------------------------------------------------- /mlbench_core/dataset/util/pytorch/partition.py: -------------------------------------------------------------------------------- 1 | r"""Partition PyTorch datasets.""" 2 | # -*- coding: utf-8 -*- 3 | import logging 4 | import random 5 | 6 | import numpy as np 7 | import torch 8 | import torch.distributed as dist 9 | 10 | from mlbench_core.utils.pytorch.distributed import get_backend_tensor 11 | 12 | _logger = logging.getLogger("mlbench") 13 | 14 | 15 | class Partition(object): 16 | """Dataset-like object, but only access a subset of it. 17 | 18 | Wraps a dataset, only exposing the entries selected by the `indices` 19 | parameter. 20 | 21 | Args: 22 | data (:obj:`list` of data entries): The data to partition over 23 | indices (:obj:`list` of :obj:`int`): indices of entries to use 24 | """ 25 | 26 | def __init__(self, data, indices): 27 | self.data = data 28 | self.indices = indices 29 | 30 | def __len__(self): 31 | return len(self.indices) 32 | 33 | def __getitem__(self, index): 34 | data_idx = self.indices[index] 35 | return self.data[data_idx] 36 | 37 | def __getattr__(self, item): 38 | return self.data.__getattribute__(item) 39 | 40 | 41 | class Partitioner(object): 42 | """Use a partition of dataset.""" 43 | 44 | def consistent_indices(self, rank, indices, shuffle): 45 | r"""synchronize indices among workers.""" 46 | if rank == 0 and shuffle: 47 | random.shuffle(indices) 48 | 49 | # broadcast. 50 | indices = get_backend_tensor(torch.IntTensor(indices)) 51 | 52 | dist.broadcast(indices, src=0) 53 | 54 | return indices.tolist() 55 | 56 | 57 | class DataPartitioner(Partitioner): 58 | """Partitions a dataset into different sized chunks. 
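    Example (sketch; requires an initialized `torch.distributed` process
    group, since the shuffled indices are broadcast from rank 0)::

        partitioner = DataPartitioner(dataset, rank, shuffle=True, sizes=[0.25] * 4)
        local_split = partitioner.use(rank)  # this worker's quarter of the data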
59 | 60 | Used for train:test:validation split. 61 | 62 | Args: 63 | data (:obj:`list` of data entries): The data to partition over 64 | rank (int): The rank of the current node 65 | shuffle (bool): Whether to shuffle entries or not 66 | sizes (:obj:`list` of :obj:`float`): The relative sizes of the 67 | splits. Should sum up to 1.0. (Default = [0.7, 0.2, 0.1]) 68 | """ 69 | 70 | def __init__(self, data, rank, shuffle, sizes=[0.7, 0.2, 0.1]): 71 | # prepare info. 72 | self.data = data 73 | self.data_size = len(self.data) 74 | self.partitions = [] 75 | 76 | # get shuffled/unshuffled data. 77 | indices = [x for x in range(0, self.data_size)] 78 | indices = self.consistent_indices(rank, indices, shuffle) 79 | 80 | # partition indices. 81 | sizes = np.cumsum(sizes) 82 | from_index = 0 83 | for ind, _ in enumerate(sizes): 84 | to_index = int(sizes[ind] * self.data_size) 85 | self.partitions.append(indices[from_index:to_index]) 86 | from_index = to_index 87 | 88 | def use(self, partition_ind): 89 | """Return a partition of data. 90 | 91 | Args: 92 | partition_ind (int): The index of the partition to get 93 | """ 94 | return Partition(self.data, self.partitions[partition_ind]) 95 | 96 | 97 | def partition_dataset_by_rank( 98 | dataset, rank, world_size, distribution="uniform", shuffle=True 99 | ): 100 | r"""Given a dataset, partition it by a distribution and each rank takes part of data. 101 | 102 | Args: 103 | dataset (:obj:`torch.utils.data.Dataset`): The dataset 104 | rank (int): The rank of the current worker 105 | world_size (int): The total number of workers 106 | distribution (str): The sampling distribution to use. Default: `uniform` 107 | shuffle (bool): Whether to shuffle the dataset before partitioning. Default: `True` 108 | """ 109 | if distribution != "uniform": 110 | raise NotImplementedError( 111 | "Distribution {} not implemented.".format(distribution) 112 | ) 113 | 114 | partition_sizes = [1.0 / world_size for _ in range(world_size)] 115 | partition = DataPartitioner(dataset, rank, shuffle, partition_sizes) 116 | partitioned_data = partition.use(rank) 117 | _logger.debug("Partition dataset use {}-th.".format(rank)) 118 | return partitioned_data 119 | -------------------------------------------------------------------------------- /mlbench_core/dataset/util/tools.py: -------------------------------------------------------------------------------- 1 | import bz2 2 | import os 3 | import sys 4 | import tarfile 5 | import zipfile 6 | from urllib.request import urlretrieve 7 | 8 | 9 | def progress_download(url, dest): 10 | """Downloads a file from `url` to `dest` and shows progress 11 | 12 | Args: 13 | url (src): Url to retrieve file from 14 | dest (src): Destination file 15 | """ 16 | 17 | def _progress(count, block_size, total_size): 18 | percentage = float(count * block_size) / float(total_size) * 100.0 19 | if percentage % 25 == 0: 20 | sys.stdout.write( 21 | "\r>> Downloading %s %.1f%%" % (os.path.basename(dest), percentage) 22 | ) 23 | sys.stdout.flush() 24 | 25 | urlretrieve(url, dest, _progress) 26 | print("\nDownloaded {} to {}\n".format(url, dest)) 27 | 28 | 29 | def extract_bz2_file(source, dest, delete=True): 30 | """Extracts a bz2 archive 31 | 32 | Args: 33 | source (str): Source file (must have .bz2 extension) 34 | dest (str): Destination file 35 | delete (bool): Delete compressed file after decompression 36 | 37 | """ 38 | assert source.endswith(".bz2"), "Extracting non bz2 archive" 39 | 40 | if os.path.isfile(dest): 41 | print("File {} already extracted to 
{}".format(source, dest)) 42 | return 43 | with open(dest, "wb") as d, open(source, "rb") as s: 44 | decompressor = bz2.BZ2Decompressor() 45 | for data in iter(lambda: s.read(1000 * 1024), b""): 46 | d.write(decompressor.decompress(data)) 47 | 48 | if delete: 49 | os.remove(source) 50 | 51 | 52 | def compress_to_bz2_file(source, delete=True): 53 | """Extracts a bz2 archive 54 | 55 | Args: 56 | source (str): Source file to compress 57 | delete (bool): Delete un-compressed file 58 | """ 59 | 60 | dest = source + ".bz2" 61 | with open(source, "rb") as s, open(dest, "wb") as d: 62 | compressor = bz2.BZ2Compressor() 63 | for data in iter(lambda: s.read(1000 * 1024), b""): 64 | d.write(compressor.compress(data)) 65 | 66 | if delete: 67 | os.remove(source) 68 | 69 | 70 | def maybe_download_and_extract_bz2(root, file_name, data_url): 71 | """Downloads file from given URL and extracts if bz2 72 | 73 | Args: 74 | root (str): The root directory 75 | file_name (str): File name to download to 76 | data_url (str): Url of data 77 | """ 78 | if not os.path.exists(root): 79 | os.makedirs(root) 80 | 81 | file_path = os.path.join(root, file_name) 82 | file_basename = os.path.splitext(file_name)[0] 83 | extracted_fpath = os.path.join(root, file_basename) 84 | 85 | if os.path.isfile(extracted_fpath): 86 | return extracted_fpath 87 | 88 | # Download file if not present 89 | if len([x for x in os.listdir(root) if x == file_name]) == 0: 90 | progress_download(data_url, file_path) 91 | 92 | # Extract downloaded file if compressed 93 | if file_name.endswith(".bz2"): 94 | # Extract file 95 | extract_bz2_file(file_path, extracted_fpath, delete=True) 96 | file_path = extracted_fpath 97 | return file_path 98 | 99 | 100 | def maybe_download_and_extract_tar_gz(root, file_name, data_url): 101 | """Downloads file from given URL and extracts if compressed as tar.gz 102 | 103 | Args: 104 | root (str): The root directory 105 | file_name (str): File name to download to 106 | data_url (str): Url of data 107 | """ 108 | if not os.path.exists(root): 109 | os.makedirs(root) 110 | 111 | file_path = os.path.join(root, file_name) 112 | 113 | # Download file if not present 114 | if len([x for x in os.listdir(root) if x == file_name]) == 0: 115 | progress_download(data_url, file_path) 116 | 117 | if file_name.endswith(".tar.gz"): 118 | with tarfile.open(file_path, "r:gz") as tar: 119 | dirs = [member for member in tar.getmembers()] 120 | tar.extractall(path=root, members=dirs) 121 | 122 | 123 | def maybe_download_and_extract_zip(root, file_name, data_url): 124 | """Downloads file from given URL and extracts if compressed as zip 125 | 126 | Args: 127 | root (str): The root directory 128 | file_name (str): File name to download to 129 | data_url (str): Url of data 130 | """ 131 | if not os.path.exists(root): 132 | os.makedirs(root) 133 | 134 | file_path = os.path.join(root, file_name) 135 | 136 | # Download file if not present 137 | if len([x for x in os.listdir(root) if x == file_name]) == 0: 138 | progress_download(data_url, file_path) 139 | 140 | if file_name.endswith(".zip"): 141 | with zipfile.ZipFile(file_path, "r") as zip: 142 | zip.extractall(root) 143 | -------------------------------------------------------------------------------- /mlbench_core/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | import torch 3 | 4 | from . import pytorch 5 | except ImportError: 6 | pass 7 | 8 | try: 9 | import tensorflow 10 | 11 | from . 
-------------------------------------------------------------------------------- /mlbench_core/evaluation/__init__.py: --------------------------------------------------------------------------------
1 | try:
2 |     import torch
3 | 
4 |     from . import pytorch
5 | except ImportError:
6 |     pass
7 | 
8 | try:
9 |     import tensorflow
10 | 
11 |     from . import tensorflow
12 | except ImportError:
13 |     pass
14 | 
-------------------------------------------------------------------------------- /mlbench_core/evaluation/goals.py: --------------------------------------------------------------------------------
1 | def _add_detailed_times(result, tracker):
2 |     compute_time = tracker.get_total_compute_time()
3 | 
4 |     if compute_time:
5 |         result += ", Compute: {} seconds".format(compute_time)
6 | 
7 |     communication_time = tracker.get_total_communication_time()
8 | 
9 |     if communication_time:
10 |         result += ", Communication: {} seconds".format(communication_time)
11 | 
12 |     metrics_time = tracker.get_total_metrics_time()
13 | 
14 |     if metrics_time:
15 |         result += ", Metrics Computation: {} seconds".format(metrics_time)
16 | 
17 |     preprocess_time = tracker.get_total_preprocess_time()
18 | 
19 |     if preprocess_time:
20 |         result += ", Pre-processing: {} seconds".format(preprocess_time)
21 |     return result
22 | 
23 | 
24 | def time_to_accuracy_goal(threshold):
25 |     def _time_to_accuracy_goal(metric_name, value, tracker):
26 |         if metric_name != "val_global_Prec@1":
27 |             return None
28 |         if value >= threshold:
29 |             duration = tracker.get_total_train_time()
30 | 
31 |             result = (
32 |                 "{0:02d}% Top 1 Validation Accuracy reached in {1:.3f} "
33 |                 "seconds".format(threshold, duration)
34 |             )
35 | 
36 |             result = _add_detailed_times(result, tracker)
37 | 
38 |             return result
39 | 
40 |         return None
41 | 
42 |     return _time_to_accuracy_goal
43 | 
44 | 
45 | def task1_time_to_accuracy_goal():
46 |     """Accuracy over Time target for benchmark task 1: Image classification
47 | 
48 |     Target is 80% accuracy
49 | 
50 |     Return:
51 |         func: time_to_accuracy_goal with threshold = 80
52 |     """
53 |     return time_to_accuracy_goal(80)
54 | 
55 | 
56 | def task1_time_to_accuracy_light_goal():
57 |     """Accuracy over Time target for benchmark task 1: Image classification
58 |     (Light)
59 | 
60 |     Light target is 70% accuracy
61 | 
62 |     Return:
63 |         func: time_to_accuracy_goal with threshold = 70
64 |     """
65 |     return time_to_accuracy_goal(70)
66 | 
67 | 
68 | def task2_time_to_accuracy_goal():
69 |     """Time to accuracy goal for benchmark task 2: Linear binary classifier
70 | 
71 |     Target is an accuracy of 89%
72 | 
73 |     Return:
74 |         func: time_to_accuracy_goal with threshold = 89
75 |     """
76 |     return time_to_accuracy_goal(89)
77 | 
78 | 
79 | def task2_time_to_accuracy_light_goal():
80 |     """Time to accuracy goal for benchmark task 2: Linear binary classifier (Light)
81 | 
82 |     Target is an accuracy of 80%
83 | 
84 |     Return:
85 |         func: time_to_accuracy_goal with threshold = 80
86 |     """
87 |     return time_to_accuracy_goal(80)
88 | 
89 | 
90 | def task3_time_to_perplexity_goal(threshold=70):
91 |     """Time to perplexity goal for benchmark task 3: Language Modeling"""
92 | 
93 |     def _time_to_perplexity_goal(metric_name, value, tracker):
94 |         if metric_name != "val_global_Perplexity":
95 |             return None
96 | 
97 |         if value <= threshold:
98 |             duration = tracker.get_total_train_time()
99 |             result = "Validation Perplexity of {0} reached in {1:.3f} seconds".format(
100 |                 threshold, duration
101 |             )
102 | 
103 |             result = _add_detailed_times(result, tracker)
104 | 
105 |             return result
106 |         return None
107 | 
108 |     return _time_to_perplexity_goal
109 | 
110 | 
111 | def task4_time_to_bleu_goal(threshold=24):
112 |     """Time to BLEU-score goal for benchmark task 4: GNMT machine translation"""
113 | 
114 |     def _time_to_bleu_goal(metric_name, value, tracker):
115 |         if metric_name != "val_global_BLEU-Score":
116 |             return None
117 | 
118 |         if value >= threshold:
119 |             duration = tracker.get_total_train_time()
120 |             result = "Validation BLEU-Score of {0} reached in {1:.3f} seconds".format(
121 |                 threshold, duration
122 |             )
123 | 
124 |             result = _add_detailed_times(result, tracker)
125 | 
126 |             return result
127 | 
128 |         return None
129 | 
130 |     return _time_to_bleu_goal
131 | 
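A goal is just a callable that the benchmark tracker invokes with every validation metric; a usage sketch (`tracker` is a placeholder for the tracker object from `mlbench_core/utils/tracker.py`):

```python
goal = task4_time_to_bleu_goal(threshold=24)
message = goal("val_global_BLEU-Score", 24.3, tracker)
# -> "Validation BLEU-Score of 24 reached in ... seconds, Compute: ..." once the
#    threshold is met, and None for non-BLEU metrics or values below it.
```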
-------------------------------------------------------------------------------- /mlbench_core/evaluation/pytorch/__init__.py: --------------------------------------------------------------------------------
1 | from . import criterion, metrics
2 | 
3 | __all__ = ["criterion", "metrics"]
4 | 
-------------------------------------------------------------------------------- /mlbench_core/evaluation/tensorflow/__init__.py: --------------------------------------------------------------------------------
1 | from . import criterion
2 | 
3 | __all__ = ["criterion"]
4 | 
-------------------------------------------------------------------------------- /mlbench_core/evaluation/tensorflow/criterion.py: --------------------------------------------------------------------------------
1 | r"""Define loss functions."""
2 | 
3 | import tensorflow as tf
4 | 
5 | 
6 | def softmax_cross_entropy_with_logits_v2_l2_regularized(
7 |     logits, labels, l2, loss_filter_fn
8 | ):
9 |     """Return an op for computing cross entropy with weight decay.
10 | 
11 |     The `labels` are assumed to be one-hot encoded. The loss filter function excludes some
12 |     tensors from computing weight decay.
13 | 
14 |     Args:
15 |         logits (:obj:`tf.Tensor`): input logits tensor.
16 |         labels (:obj:`tf.Tensor`): input one-hot encoded tensor.
17 |         l2 (:obj:`float`): size of weight decay
18 |         loss_filter_fn (:obj:`callable`): filter function.
19 | 
20 |     Returns:
21 |         :obj:`tf.Tensor`: a scalar tensor
22 |     """
23 |     labels = tf.cast(labels, tf.int32)
24 |     with tf.name_scope("loss"):
25 |         cross_entropy = tf.reduce_mean(
26 |             tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=labels)
27 |         )
28 | 
29 |         loss = cross_entropy + l2 * tf.add_n(
30 |             [
31 |                 tf.nn.l2_loss(v)
32 |                 for v in tf.trainable_variables()
33 |                 if loss_filter_fn(v.name)
34 |             ]
35 |         )
36 |     return loss
37 | 
-------------------------------------------------------------------------------- /mlbench_core/evaluation/tensorflow/metrics.py: --------------------------------------------------------------------------------
1 | r"""Define tensorflow metrics."""
2 | 
3 | import tensorflow as tf
4 | 
5 | 
6 | class TopKAccuracy(object):
7 |     """Compute the top-k accuracy of logits.
8 | 
9 |     Args:
10 |         logits (:obj:`tf.Tensor`): input tensor
11 |         labels (:obj:`tf.Tensor`): input one-hot encoded tensor.
12 |         topk (:obj:`int`, optional): Defaults to 1. top k accuracy.
13 |     """
14 | 
15 |     def __init__(self, logits, labels, topk=1):
16 |         labels = tf.cast(labels, tf.int32)
17 |         true_classes = tf.argmax(labels, axis=1)
18 | 
19 |         # predicted classes
20 |         pred_probs = tf.nn.softmax(logits, name="softmax_tensor")
21 |         pred_classes = tf.argmax(pred_probs, axis=1)
22 | 
23 |         # get metrics.
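        # (sketch) e.g. logits [[2., 1.], [0., 3.]] with one-hot labels
        # [[1, 0], [0, 1]] give pred_classes == true_classes for both rows,
        # so the op below evaluates to Prec@1 == 100.0.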
24 | with tf.name_scope("metrics"): 25 | if topk == 1: 26 | self.name = "Prec@1" 27 | self.metric_op = ( 28 | tf.reduce_mean( 29 | tf.cast(tf.equal(true_classes, pred_classes), tf.float32) 30 | ) 31 | * 100.0 32 | ) 33 | else: 34 | topk_op = tf.nn.in_top_k( 35 | predictions=pred_probs, targets=true_classes, k=topk 36 | ) 37 | self.name = "Prec@" + str(topk) 38 | self.metric_op = tf.reduce_mean(tf.cast(topk_op, tf.float32)) * 100.0 39 | 40 | 41 | def topk_accuracy_with_logits(logits, labels, k=1): 42 | """Compute the top-k accuracy of logits. 43 | 44 | Args: 45 | logits (:obj:`tf.Tensor`): input tensor 46 | labels (:obj:`tf.Tensor`): input one-hot encoded tensor. 47 | k (:obj:`int`, optional): Defaults to 1. top k accuracy. 48 | 49 | Returns: 50 | dict: the metric name and a scalar accuracy tensor (a percentage, between 0 and 100). 51 | """ 52 | m = TopKAccuracy(logits=logits, labels=labels, topk=k) 53 | return {"name": m.name, "value": m.metric_op} 54 | -------------------------------------------------------------------------------- /mlbench_core/install_cuda_extensions.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from setuptools import setup 4 | 5 | import mlbench_core 6 | 7 | base_dir = os.path.dirname(mlbench_core.__file__) 8 | ext_modules = [] 9 | try: 10 | from torch.utils.cpp_extension import CUDA_HOME, BuildExtension, CUDAExtension 11 | 12 | dir_1 = os.path.join( 13 | base_dir, "models/pytorch/transformer/modules/strided_batched_gemm" 14 | ) 15 | 16 | dir_2 = os.path.join(base_dir, "models/pytorch/gnmt/attn_score") 17 | strided_batched_gemm = CUDAExtension( 18 | name="mlbench_core.models.pytorch.transformer.modules.strided_batched_gemm", 19 | sources=[ 20 | os.path.join(dir_1, "strided_batched_gemm.cpp"), 21 | os.path.join(dir_1, "strided_batched_gemm_cuda.cu"), 22 | ], 23 | extra_compile_args={ 24 | "cxx": [ 25 | "-O2", 26 | ], 27 | "nvcc": [ 28 | "-I/app/cutlass/", 29 | "--gpu-architecture=compute_70", 30 | "--gpu-code=sm_70", 31 | "-O3", 32 | "-U__CUDA_NO_HALF_OPERATORS__", 33 | "-U__CUDA_NO_HALF_CONVERSIONS__", 34 | ], 35 | }, 36 | ) 37 | 38 | attn_score = CUDAExtension( 39 | name="mlbench_core.models.pytorch.gnmt.attn_score", 40 | sources=[ 41 | os.path.join(dir_2, "attn_score_cuda.cpp"), 42 | os.path.join(dir_2, "attn_score_cuda_kernel.cu"), 43 | ], 44 | extra_compile_args={ 45 | "cxx": [ 46 | "-O2", 47 | ], 48 | "nvcc": [ 49 | "--gpu-architecture=sm_70", 50 | ], 51 | }, 52 | ) 53 | ext_modules.append(strided_batched_gemm) 54 | ext_modules.append(attn_score) 55 | cmdclass = {"build_ext": BuildExtension} 56 | 57 | setup(ext_modules=ext_modules, cmdclass=cmdclass) 58 | 59 | except (ImportError, OSError) as e: 60 | raise ValueError("Cannot install extensions because CUDA was not found") from e 61 | -------------------------------------------------------------------------------- /mlbench_core/lr_scheduler/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | import torch 3 | 4 | from . import pytorch 5 | except ImportError: 6 | pass 7 | 8 | 9 | try: 10 | import tensorflow 11 | 12 | from . import tensorflow 13 | except ImportError: 14 | pass 15 | -------------------------------------------------------------------------------- /mlbench_core/lr_scheduler/pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | """Scheduling Learning Rates.
2 | """ 3 | 4 | from .lr import ( 5 | ExponentialWarmupMultiStepLR, 6 | LRLinearWarmUp, 7 | MultiStepLRLinearWarmUp, 8 | ReduceLROnPlateauWithWarmup, 9 | SparsifiedSGDLR, 10 | SQRTTimeDecayLR, 11 | SQRTTimeDecayLRWithWarmup, 12 | TimeDecayLR, 13 | ) 14 | -------------------------------------------------------------------------------- /mlbench_core/lr_scheduler/tensorflow/__init__.py: -------------------------------------------------------------------------------- 1 | from .lr import manual_stepping 2 | -------------------------------------------------------------------------------- /mlbench_core/lr_scheduler/tensorflow/lr.py: -------------------------------------------------------------------------------- 1 | r"""Learning rate scheduling in tensorflow. 2 | 3 | The manual_stepping function is taken from : 4 | 5 | https://github.com/tensorflow/models/blob/master/research/object_detection/utils/learning_schedules.py 6 | """ 7 | 8 | import tensorflow as tf 9 | 10 | 11 | def manual_stepping(global_step, boundaries, rates, warmup=False): 12 | """Manually stepped learning rate schedule. 13 | 14 | This function provides fine grained control over learning rates. One must 15 | specify a sequence of learning rates as well as a set of integer steps 16 | at which the current learning rate must transition to the next. For example, 17 | if boundaries = [5, 10] and rates = [.1, .01, .001], then the learning 18 | rate returned by this function is .1 for global_step=0,...,4, .01 for 19 | global_step=5...9, and .001 for global_step=10 and onward. 20 | 21 | Args: 22 | global_step (:obj:`tf.Tensor`): int64 (scalar) tensor representing global step. 23 | boundaries (list): a list of global steps at which to switch learning 24 | rates (list): a list of (float) learning rates corresponding to intervals between 25 | the boundaries. The length of this list must be exactly len(boundaries) + 1. 26 | warmup (bool, optional): Defaults to False. Whether to linearly interpolate learning 27 | rate for steps in [0, boundaries[0]]. 28 | 29 | Raises: 30 | ValueError: boundaries is a strictly increasing list of positive integers 31 | ValueError: len(rates) == len(boundaries) + 1 32 | ValueError: boundaries[0] != 0 33 | 34 | Returns: 35 | :obj:`tf.Tensor`: a (scalar) float tensor representing learning rate 36 | """ 37 | 38 | if any([b < 0 for b in boundaries]) or any( 39 | [not isinstance(b, int) for b in boundaries] 40 | ): 41 | raise ValueError("boundaries must be a list of positive integers") 42 | if any([bnext <= b for bnext, b in zip(boundaries[1:], boundaries[:-1])]): 43 | raise ValueError("Entries in boundaries must be strictly increasing.") 44 | if any([not isinstance(r, float) for r in rates]): 45 | raise ValueError("Learning rates must be floats") 46 | if len(rates) != len(boundaries) + 1: 47 | raise ValueError( 48 | "Number of provided learning rates must exceed " 49 | "number of boundary points by exactly 1." 
50 | ) 51 | 52 | if boundaries and boundaries[0] == 0: 53 | raise ValueError("First step cannot be zero.") 54 | 55 | if warmup and boundaries: 56 | slope = (rates[1] - rates[0]) * 1.0 / boundaries[0] 57 | warmup_steps = list(range(boundaries[0])) 58 | warmup_rates = [rates[0] + slope * step for step in warmup_steps] 59 | boundaries = warmup_steps + boundaries 60 | rates = warmup_rates + rates[1:] 61 | else: 62 | boundaries = [0] + boundaries 63 | num_boundaries = len(boundaries) 64 | rate_index = tf.reduce_max( 65 | tf.where( 66 | tf.greater_equal(global_step, boundaries), 67 | list(range(num_boundaries)), 68 | [0] * num_boundaries, 69 | ) 70 | ) 71 | return tf.reduce_sum( 72 | rates * tf.one_hot(rate_index, depth=num_boundaries), name="learning_rate" 73 | ) 74 | -------------------------------------------------------------------------------- /mlbench_core/models/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | import torch 3 | 4 | from . import pytorch 5 | except ImportError: 6 | pass 7 | 8 | 9 | try: 10 | import tensorflow 11 | 12 | from . import tensorflow 13 | except ImportError: 14 | pass 15 | -------------------------------------------------------------------------------- /mlbench_core/models/pytorch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlbench/mlbench-core/4fd3c7e6f1a5be69e52383ab2eb64cad257218c2/mlbench_core/models/pytorch/__init__.py -------------------------------------------------------------------------------- /mlbench_core/models/pytorch/gnmt/__init__.py: -------------------------------------------------------------------------------- 1 | from .models import GNMT 2 | from .translator import Translator 3 | -------------------------------------------------------------------------------- /mlbench_core/models/pytorch/gnmt/attn_score/attn_score_cuda.cpp: -------------------------------------------------------------------------------- 1 | #include <torch/extension.h> 2 | 3 | #include <vector> 4 | 5 | // CUDA forward declarations 6 | 7 | at::Tensor attn_score_forward_cuda( 8 | const at::Tensor &attn_query, 9 | const at::Tensor &attn_keys, 10 | const at::Tensor &bias, 11 | const at::Tensor &linear_attn); 12 | 13 | std::vector<at::Tensor> attn_score_backward_cuda( 14 | const at::Tensor &grad_output, 15 | const at::Tensor &attn_query, 16 | const at::Tensor &attn_keys, 17 | const at::Tensor &bias, 18 | const at::Tensor &linear_attn); 19 | 20 | // C++ interface 21 | 22 | #define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor") 23 | #define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") 24 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) 25 | 26 | at::Tensor attn_score_forward( 27 | const at::Tensor &attn_query, 28 | const at::Tensor &attn_keys, 29 | const at::Tensor &bias, 30 | const at::Tensor &linear_attn) { 31 | CHECK_INPUT(attn_query); 32 | CHECK_INPUT(attn_keys); 33 | CHECK_INPUT(bias); 34 | CHECK_INPUT(linear_attn); 35 | 36 | return attn_score_forward_cuda(attn_query, attn_keys, bias, linear_attn); 37 | } 38 | 39 | std::vector<at::Tensor> attn_score_backward( 40 | const at::Tensor &grad_output, 41 | const at::Tensor &attn_query, 42 | const at::Tensor &attn_keys, 43 | const at::Tensor &bias, 44 | const at::Tensor &linear_attn) { 45 | CHECK_INPUT(grad_output); 46 | CHECK_INPUT(attn_query); 47 | CHECK_INPUT(attn_keys); 48 | CHECK_INPUT(bias); 49 | CHECK_INPUT(linear_attn); 50 | 51 | return attn_score_backward_cuda(grad_output,
attn_query, attn_keys, bias, linear_attn); 52 | } 53 | 54 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 55 | m.def("forward", &attn_score_forward, "Attention score calculation forward (CUDA)"); 56 | m.def("backward", &attn_score_backward, "Attention score calculation backward (CUDA)"); 57 | } 58 | -------------------------------------------------------------------------------- /mlbench_core/models/pytorch/gnmt/encoder.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 3 | 4 | import mlbench_core.dataset.nlp.pytorch.wmt16.wmt16_config as config 5 | from mlbench_core.models.pytorch.gnmt.utils import init_lstm_ 6 | 7 | 8 | class ResidualRecurrentEncoder(nn.Module): 9 | """ 10 | Encoder with Embedding, LSTM layers, residual connections and optional 11 | dropout. 12 | 13 | The first LSTM layer is bidirectional and uses variable sequence length 14 | API, the remaining (num_layers-1) layers are unidirectional. Residual 15 | connections are enabled after third LSTM layer, dropout is applied on 16 | inputs to LSTM layers. 17 | 18 | Args: 19 | vocab_size: size of vocabulary 20 | hidden_size: hidden size for LSTM layers 21 | num_layers: number of LSTM layers, 1st layer is bidirectional 22 | dropout: probability of dropout (on input to LSTM layers) 23 | embedder: instance of nn.Embedding, if None constructor will 24 | create new embedding layer 25 | init_weight: range for the uniform initializer 26 | """ 27 | 28 | def __init__( 29 | self, 30 | vocab_size, 31 | hidden_size=1024, 32 | num_layers=4, 33 | dropout=0.2, 34 | embedder=None, 35 | init_weight=0.1, 36 | ): 37 | super(ResidualRecurrentEncoder, self).__init__() 38 | self.rnn_layers = nn.ModuleList() 39 | # 1st LSTM layer, bidirectional 40 | self.rnn_layers.append( 41 | nn.LSTM( 42 | hidden_size, 43 | hidden_size, 44 | num_layers=1, 45 | bias=True, 46 | batch_first=False, 47 | bidirectional=True, 48 | ) 49 | ) 50 | 51 | # 2nd LSTM layer, with 2x larger input_size 52 | self.rnn_layers.append( 53 | nn.LSTM( 54 | (2 * hidden_size), 55 | hidden_size, 56 | num_layers=1, 57 | bias=True, 58 | batch_first=False, 59 | ) 60 | ) 61 | 62 | # Remaining LSTM layers 63 | for _ in range(num_layers - 2): 64 | self.rnn_layers.append( 65 | nn.LSTM( 66 | hidden_size, 67 | hidden_size, 68 | num_layers=1, 69 | bias=True, 70 | batch_first=False, 71 | ) 72 | ) 73 | 74 | for lstm in self.rnn_layers: 75 | init_lstm_(lstm, init_weight) 76 | 77 | self.dropout = nn.Dropout(p=dropout) 78 | 79 | if embedder is not None: 80 | self.embedder = embedder 81 | else: 82 | self.embedder = nn.Embedding( 83 | vocab_size, hidden_size, padding_idx=config.PAD 84 | ) 85 | nn.init.uniform_(self.embedder.weight.data, -init_weight, init_weight) 86 | 87 | def forward(self, inputs, lengths): 88 | """ 89 | Execute the encoder. 
90 | 91 | Args: 92 | inputs: tensor with indices from the vocabulary 93 | lengths: vector with sequence lengths (excluding padding) 94 | 95 | Returns: 96 | tensor with encoded sequences 97 | 98 | """ 99 | x = self.embedder(inputs) 100 | 101 | # bidirectional layer 102 | x = self.dropout(x) 103 | x = pack_padded_sequence(x, lengths.cpu(), batch_first=False) 104 | x, _ = self.rnn_layers[0](x) 105 | x, _ = pad_packed_sequence(x, batch_first=False) 106 | 107 | # 1st unidirectional layer 108 | x = self.dropout(x) 109 | x, _ = self.rnn_layers[1](x) 110 | 111 | # the rest of unidirectional layers, 112 | # with residual connections starting from 3rd layer 113 | for i in range(2, len(self.rnn_layers)): 114 | residual = x 115 | x = self.dropout(x) 116 | x, _ = self.rnn_layers[i](x) 117 | x = x + residual 118 | 119 | return x 120 | -------------------------------------------------------------------------------- /mlbench_core/models/pytorch/gnmt/models.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from torch.nn.functional import log_softmax 3 | 4 | from mlbench_core.dataset.nlp.pytorch.wmt16 import wmt16_config 5 | from mlbench_core.models.pytorch.gnmt.decoder import ResidualRecurrentDecoder 6 | from mlbench_core.models.pytorch.gnmt.encoder import ResidualRecurrentEncoder 7 | 8 | 9 | class Seq2Seq(nn.Module): 10 | """ 11 | Generic Seq2Seq module, with an encoder and a decoder. 12 | Args: 13 | encoder (Encoder): Model encoder 14 | decoder (Decoder): Model decoder 15 | """ 16 | 17 | def __init__(self, encoder=None, decoder=None): 18 | super(Seq2Seq, self).__init__() 19 | self.encoder = encoder 20 | self.decoder = decoder 21 | 22 | def encode(self, inputs, lengths): 23 | """ 24 | Applies the encoder to inputs with a given input sequence lengths. 25 | 26 | Args: 27 | inputs (torch.tensor): tensor with inputs (seq_len, batch) 28 | lengths: vector with sequence lengths (excluding padding) 29 | 30 | Returns: 31 | torch.tensor 32 | """ 33 | return self.encoder(inputs, lengths) 34 | 35 | def decode(self, inputs, context, inference=False): 36 | """ 37 | Applies the decoder to inputs, given the context from the encoder. 38 | 39 | Args: 40 | inputs (torch.tensor): tensor with inputs (seq_len, batch) 41 | context: context from the encoder 42 | inference: if True inference mode, if False training mode 43 | 44 | Returns: 45 | torch.tensor 46 | """ 47 | return self.decoder(inputs, context, inference) 48 | 49 | def generate(self, inputs, context, beam_size): 50 | """ 51 | Autoregressive generator, works with SequenceGenerator class. 52 | Executes decoder (in inference mode), applies log_softmax and topK for 53 | inference with beam search decoding. 
54 | 55 | Args: 56 | inputs: tensor with inputs to the decoder 57 | context: context from the encoder 58 | beam_size: beam size for the generator 59 | 60 | Returns: 61 | (words, logprobs, scores, new_context) 62 | words: indices of topK tokens 63 | logprobs: log probabilities of topK tokens 64 | scores: scores from the attention module (for coverage penalty) 65 | new_context: new decoder context, includes new hidden states for 66 | decoder RNN cells 67 | """ 68 | logits, scores, new_context = self.decode(inputs, context, True) 69 | logprobs = log_softmax(logits, dim=-1) 70 | logprobs, words = logprobs.topk(beam_size, dim=-1) 71 | return words, logprobs, scores, new_context 72 | 73 | 74 | class GNMT(Seq2Seq): 75 | """ 76 | GNMT v2 model 77 | 78 | Args: 79 | vocab_size (int): size of vocabulary (number of tokens) 80 | hidden_size (int): internal hidden size of the model 81 | num_layers (int): number of layers, applies to both encoder and 82 | decoder 83 | dropout (float): probability of dropout (in encoder and decoder) 84 | share_embedding (bool): if True embeddings are shared between 85 | encoder and decoder 86 | 87 | """ 88 | 89 | def __init__( 90 | self, 91 | vocab_size, 92 | hidden_size=1024, 93 | num_layers=4, 94 | dropout=0.2, 95 | share_embedding=True, 96 | fusion=True, 97 | ): 98 | super(GNMT, self).__init__() 99 | 100 | if share_embedding: 101 | embedder = nn.Embedding( 102 | vocab_size, hidden_size, padding_idx=wmt16_config.PAD 103 | ) 104 | nn.init.uniform_(embedder.weight.data, -0.1, 0.1) 105 | else: 106 | embedder = None 107 | 108 | self.encoder = ResidualRecurrentEncoder( 109 | vocab_size, hidden_size, num_layers, dropout, embedder 110 | ) 111 | 112 | self.decoder = ResidualRecurrentDecoder( 113 | vocab_size, hidden_size, num_layers, dropout, embedder, fusion=fusion 114 | ) 115 | 116 | def forward(self, input_encoder, input_enc_len, input_decoder): 117 | context = self.encode(input_encoder, input_enc_len) 118 | context = (context, input_enc_len, None) 119 | output, _, _ = self.decode(input_decoder, context) 120 | 121 | return output 122 | -------------------------------------------------------------------------------- /mlbench_core/models/pytorch/gnmt/translator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from mosestokenizer import MosesDetokenizer 3 | 4 | import mlbench_core.dataset.nlp.pytorch.wmt16.wmt16_config as wmt16_config 5 | from mlbench_core.utils.pytorch.inference.beam_search import SequenceGenerator 6 | 7 | 8 | class Translator: 9 | """ 10 | Translator that outputs translated sentences from a GNMT model by using a SequenceGenerator 11 | 12 | Args: 13 | model (:obj:`torch.nn.Module`): Model to use 14 | trg_tokenizer (:obj:`mlbench_core.dataset.nlp.pytorch.wmt16.WMT16Tokenizer`): The target tokenizer 15 | """ 16 | 17 | def __init__( 18 | self, 19 | model, 20 | trg_tokenizer, 21 | trg_lang="de", 22 | beam_size=5, 23 | len_norm_factor=0.6, 24 | len_norm_const=5.0, 25 | cov_penalty_factor=0.1, 26 | max_seq_len=150, 27 | ): 28 | 29 | self.model = model 30 | self.tokenizer = trg_tokenizer 31 | self.insert_target_start = [wmt16_config.BOS] 32 | self.insert_src_start = [wmt16_config.BOS] 33 | self.insert_src_end = [wmt16_config.EOS] 34 | self.beam_size = beam_size 35 | self.trg_lang = trg_lang 36 | 37 | self.generator = SequenceGenerator( 38 | model=self.model, 39 | beam_size=beam_size, 40 | max_seq_len=max_seq_len, 41 | len_norm_factor=len_norm_factor, 42 | 
len_norm_const=len_norm_const, 43 | cov_penalty_factor=cov_penalty_factor, 44 | ) 45 | 46 | def get_detokenized_target(self, trg, batch_size): 47 | targets = [] 48 | with MosesDetokenizer(self.trg_lang) as detok: 49 | for i in range(batch_size): 50 | t = self.tokenizer.detokenize(trg[:, i].tolist()) 51 | t = detok(t.split()) 52 | targets.append(t) 53 | 54 | return targets 55 | 56 | def translate(self, src, trg): 57 | """Given source and target tokenized tensors, outputs the 58 | non-tokenized translation from the model, as well as the non-tokenized target 59 | 60 | Args: 61 | src (tuple): Tokenized source tensor and sentence lengths 62 | trg (tuple): Tokenized target tensor and sentence lengths 63 | 64 | Returns: 65 | (list, list): The detokenized model translations and the detokenized targets 66 | """ 67 | src, src_len = src 68 | trg, trg_len = trg 69 | device = next(self.model.parameters()).device 70 | 71 | batch_size = src.shape[1] 72 | 73 | bos = [self.insert_target_start] * (batch_size * self.beam_size) 74 | bos = torch.tensor(bos, dtype=torch.int64, device=device).view(1, -1) 75 | 76 | if self.beam_size == 1: 77 | generator = self.generator.greedy_search 78 | else: 79 | generator = self.generator.beam_search 80 | 81 | with torch.no_grad(): 82 | context = self.model.encode(src, src_len) 83 | context = [context, src_len, None] 84 | preds, lengths, counter = generator(batch_size, bos, context) 85 | 86 | preds = preds.cpu() 87 | targets = self.get_detokenized_target(trg, batch_size) 88 | 89 | output = [] 90 | with MosesDetokenizer(self.trg_lang) as detokenizer: 91 | for pred in preds: 92 | pred = pred.tolist() 93 | detok = self.tokenizer.detokenize(pred) 94 | detok = detokenizer(detok.split()) 95 | output.append(detok) 96 | 97 | return output, targets 98 | -------------------------------------------------------------------------------- /mlbench_core/models/pytorch/gnmt/utils.py: -------------------------------------------------------------------------------- 1 | from torch.nn import init 2 | 3 | 4 | def init_lstm_(lstm, init_weight=0.1): 5 | """ 6 | Initializes weights of LSTM layer. 7 | Weights and biases are initialized with uniform(-init_weight, init_weight) 8 | distribution. 9 | 10 | Args: 11 | lstm (torch.nn.LSTM): LSTM layer to initialize 12 | init_weight (float): range for the uniform initializer 13 | 14 | """ 15 | # Initialize hidden-hidden weights 16 | init.uniform_(lstm.weight_hh_l0.data, -init_weight, init_weight) 17 | # Initialize input-hidden weights: 18 | init.uniform_(lstm.weight_ih_l0.data, -init_weight, init_weight) 19 | 20 | # Initialize bias. PyTorch LSTM has two biases, one for input-hidden GEMM 21 | # and the other for hidden-hidden GEMM. Here input-hidden bias is 22 | # initialized with uniform distribution and hidden-hidden bias is 23 | # initialized with zeros.
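# A hedged usage sketch (comments only; `nn` below refers to torch.nn, which
# this module does not import itself):
#
#   lstm = nn.LSTM(input_size=8, hidden_size=8, bidirectional=True)
#   init_lstm_(lstm, init_weight=0.1)
#   # lstm.bias_hh_l0 is now all zeros, and lstm.weight_ih_l0 lies in [-0.1, 0.1].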
24 | init.uniform_(lstm.bias_ih_l0.data, -init_weight, init_weight) 25 | init.zeros_(lstm.bias_hh_l0.data) 26 | 27 | if lstm.bidirectional: 28 | init.uniform_(lstm.weight_hh_l0_reverse.data, -init_weight, init_weight) 29 | init.uniform_(lstm.weight_ih_l0_reverse.data, -init_weight, init_weight) 30 | 31 | init.uniform_(lstm.bias_ih_l0_reverse.data, -init_weight, init_weight) 32 | init.zeros_(lstm.bias_hh_l0_reverse.data) 33 | -------------------------------------------------------------------------------- /mlbench_core/models/pytorch/language_models/__init__.py: -------------------------------------------------------------------------------- 1 | from .lstm import LSTMLanguageModel 2 | -------------------------------------------------------------------------------- /mlbench_core/models/pytorch/language_models/lstm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from mlbench_core.models.pytorch.layers import ( 5 | LockedDropout, 6 | WeightDrop, 7 | embedded_dropout, 8 | ) 9 | 10 | 11 | class LSTMLanguageModel(nn.Module): 12 | """Container module with an encoder, a recurrent module, and a decoder. 13 | 14 | Model taken from https://github.com/salesforce/awd-lstm-lm 15 | 16 | Args: 17 | ntoken (int): Number of tokens in vocabulary 18 | ninp (int): Embedding size (LSTM input size) 19 | nhid (int): Number of hidden LSTM units per layer 20 | nlayers (int): Number of LSTM layers 21 | dropout (float): Output dropout rate (LockedDropout). Default 0.5 22 | dropouth (float): LSTM output dropout rate (between each layer except for last). Default 0.5 23 | dropouti (float): Input dropout to LSTM layers. Default 0.5 24 | dropoute (float): Embedding dropout. Default 0.1 25 | wdrop (float): Weight dropout for LSTM layers. Default 0 26 | tie_weights (bool): If True, encoder and decoder weights are tied. Default False 27 | 28 | """ 29 | 30 | def __init__( 31 | self, 32 | ntoken, 33 | ninp, 34 | nhid, 35 | nlayers, 36 | dropout=0.5, 37 | dropouth=0.5, 38 | dropouti=0.5, 39 | dropoute=0.1, 40 | wdrop=0, 41 | tie_weights=False, 42 | ): 43 | super(LSTMLanguageModel, self).__init__() 44 | self.lockdroph = LockedDropout(p=dropouth) 45 | self.lockdropi = LockedDropout(p=dropouti) 46 | self.lockdrop = LockedDropout(p=dropout) 47 | self.encoder = nn.Embedding(ntoken, ninp) 48 | 49 | self.rnns = [ 50 | torch.nn.LSTM( 51 | ninp if l == 0 else nhid, 52 | nhid if l != nlayers - 1 else (ninp if tie_weights else nhid), 53 | 1, 54 | dropout=0, 55 | ) 56 | for l in range(nlayers) 57 | ] 58 | if wdrop: 59 | self.rnns = [ 60 | WeightDrop(rnn, ["weight_hh_l0"], dropout=wdrop) for rnn in self.rnns 61 | ] 62 | print(self.rnns) 63 | self.rnns = torch.nn.ModuleList(self.rnns) 64 | self.decoder = nn.Linear(nhid, ntoken) 65 | 66 | # Optionally tie weights as in: 67 | # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016) 68 | # https://arxiv.org/abs/1608.05859 69 | # and 70 | # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 
2016) 71 | # https://arxiv.org/abs/1611.01462 72 | if tie_weights: 73 | # if nhid != ninp: 74 | # raise ValueError('When using the tied flag, nhid must be equal to emsize') 75 | self.decoder.weight = self.encoder.weight 76 | 77 | self.init_weights() 78 | 79 | self.ntoken = ntoken 80 | self.ninp = ninp 81 | self.nhid = nhid 82 | self.nlayers = nlayers 83 | self.dropoute = dropoute 84 | self.tie_weights = tie_weights 85 | 86 | def init_weights(self): 87 | initrange = 0.1 88 | self.encoder.weight.data.uniform_(-initrange, initrange) 89 | self.decoder.bias.data.fill_(0) 90 | self.decoder.weight.data.uniform_(-initrange, initrange) 91 | 92 | def forward(self, input, hidden, return_h=False): 93 | # Embedded Dropout 94 | emb = embedded_dropout( 95 | self.encoder, input, dropout=self.dropoute if self.training else 0 96 | ) 97 | # LSTM input dropout 98 | emb = self.lockdropi(emb) 99 | 100 | # Manual feeding of LSTM layers 101 | raw_output = emb 102 | new_hidden = [] 103 | raw_outputs = [] 104 | outputs = [] 105 | # Iterate on all LSTM layers 106 | for l, rnn in enumerate(self.rnns): 107 | # Compute output and hidden state 108 | raw_output, new_h = rnn(raw_output, hidden[l]) 109 | new_hidden.append(new_h) 110 | raw_outputs.append(raw_output) 111 | # Apply LockDrop if not last layer 112 | if l != self.nlayers - 1: 113 | raw_output = self.lockdroph(raw_output) 114 | outputs.append(raw_output) 115 | hidden = new_hidden 116 | 117 | # Output dropout 118 | output = self.lockdrop(raw_output) 119 | outputs.append(output) 120 | 121 | result = self.decoder( 122 | output.view(output.size(0) * output.size(1), output.size(2)) 123 | ) 124 | if return_h: 125 | return result, hidden, raw_outputs, outputs 126 | return result, hidden 127 | 128 | def init_hidden(self, bsz): 129 | weight = next(self.parameters()).data 130 | return [ 131 | ( 132 | weight.new( 133 | 1, 134 | bsz, 135 | self.nhid 136 | if l != self.nlayers - 1 137 | else (self.ninp if self.tie_weights else self.nhid), 138 | ).zero_(), 139 | weight.new( 140 | 1, 141 | bsz, 142 | self.nhid 143 | if l != self.nlayers - 1 144 | else (self.ninp if self.tie_weights else self.nhid), 145 | ).zero_(), 146 | ) 147 | for l in range(self.nlayers) 148 | ] 149 | -------------------------------------------------------------------------------- /mlbench_core/models/pytorch/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .dropout_layers import LockedDropout, WeightDrop, embedded_dropout 2 | -------------------------------------------------------------------------------- /mlbench_core/models/pytorch/layers/dropout_layers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn import functional as F 4 | 5 | """Following classes were taken and adapted from https://github.com/salesforce/awd-lstm-lm""" 6 | 7 | 8 | class LockedDropout(nn.Module): 9 | """LockedDropout applies the same dropout mask to every time step. 10 | 11 | Args: 12 | p (float): Probability of an element in the dropout mask to be zeroed. 13 | """ 14 | 15 | def __init__(self, p=0.5): 16 | self.p = p 17 | super().__init__() 18 | 19 | def forward(self, x): 20 | """ 21 | Args: 22 | x (:class:`torch.FloatTensor` [sequence length, batch size, rnn hidden size]): Input to 23 | apply dropout to.
24 | """ 25 | if not self.training or not self.p: 26 | return x 27 | x = x.clone() 28 | mask = x.new_empty(1, x.size(1), x.size(2), requires_grad=False).bernoulli_( 29 | 1 - self.p 30 | ) 31 | mask = mask.div_(1 - self.p) 32 | mask = mask.expand_as(x) 33 | return x * mask 34 | 35 | def __repr__(self): 36 | return self.__class__.__name__ + "(" + "p=" + str(self.p) + ")" 37 | 38 | 39 | def embedded_dropout(embed, words, dropout=0.1, scale=None): 40 | """Applies a mask dropout to the embedding layer 41 | 42 | Args: 43 | embed (:obj:`torch.nn.Embedding`): Embedding layer to use 44 | words (:obj:`torch.Tensor`): Word inputs (tokenized) 45 | dropout (float): Dropout rate (Default 0.1) 46 | scale (float, optional): Scale factor for embedding weights 47 | 48 | Returns: 49 | (:obj:`torch.Tensor`) Output of Embedding after applying dropout mask to weights 50 | """ 51 | if dropout: 52 | mask = embed.weight.data.new().resize_((embed.weight.size(0), 1)).bernoulli_( 53 | 1 - dropout 54 | ).expand_as(embed.weight) / (1 - dropout) 55 | masked_embed_weight = mask * embed.weight 56 | else: 57 | masked_embed_weight = embed.weight 58 | if scale: 59 | masked_embed_weight = scale.expand_as(masked_embed_weight) * masked_embed_weight 60 | 61 | padding_idx = embed.padding_idx 62 | if padding_idx is None: 63 | padding_idx = -1 64 | 65 | X = F.embedding( 66 | words, 67 | masked_embed_weight, 68 | padding_idx, 69 | embed.max_norm, 70 | embed.norm_type, 71 | embed.scale_grad_by_freq, 72 | embed.sparse, 73 | ) 74 | return X 75 | 76 | 77 | class WeightDrop(torch.nn.Module): 78 | """Weight Dropout layer. Wraps another module and patches the forward method to apply dropout to module weights. 79 | 80 | Args: 81 | module (:obj:`torch.nn.Module`): Module to wrap 82 | weights (listr[str]): Weights to apply dropout to 83 | dropout (float): Dropout rate (Default 0) 84 | 85 | """ 86 | 87 | def __init__(self, module, weights, dropout=0): 88 | super(WeightDrop, self).__init__() 89 | self.module = module 90 | self.weights = weights 91 | self.dropout = dropout 92 | self._setup() 93 | 94 | def _setup(self): 95 | """Sets up new weights for the module""" 96 | for name_w in self.weights: 97 | print("Applying weight drop of {} to {}".format(self.dropout, name_w)) 98 | # Make space for new weights 99 | w = getattr(self.module, name_w) 100 | del self.module._parameters[name_w] 101 | # Register raw weights 102 | self.module.register_parameter(name_w + "_raw", nn.Parameter(w.data)) 103 | 104 | def _setweights(self): 105 | """Sets dropped out weights""" 106 | for name_w in self.weights: 107 | # Get raw weights and apply dropout 108 | raw_w = getattr(self.module, name_w + "_raw") 109 | w = F.dropout(raw_w, p=self.dropout, training=self.training) 110 | 111 | # This is because we may call this function in non-training mode first and so, as self.training=False, w is 112 | # a nn.Parameter and thus self.module.weight remains a Parameter of self.module when we don't want it to. 
113 | if name_w in self.module._parameters: 114 | del self.module._parameters[name_w] 115 | # Set dropped out weights 116 | setattr(self.module, name_w, w) 117 | 118 | def forward(self, *args): 119 | """Forward patch""" 120 | self._setweights() 121 | self.module.flatten_parameters() 122 | return self.module.forward(*args) 123 | -------------------------------------------------------------------------------- /mlbench_core/models/pytorch/linear_models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class LogisticRegression(torch.nn.Module): 5 | """ 6 | Logistic regression implementation 7 | 8 | Args: 9 | n_features (int): Number of features 10 | """ 11 | 12 | def __init__(self, n_features): 13 | super(LogisticRegression, self).__init__() 14 | 15 | self.linear = torch.nn.Linear(n_features, 1, bias=False) 16 | 17 | def forward(self, x): 18 | y_pred = torch.sigmoid(self.linear(x)) 19 | return y_pred 20 | 21 | 22 | class LinearRegression(torch.nn.Module): 23 | """ 24 | Ridge regression implementation 25 | 26 | Args: 27 | n_features (int): Number of features 28 | """ 29 | 30 | def __init__(self, n_features): 31 | super(LinearRegression, self).__init__() 32 | self.linear = torch.nn.Linear(n_features, 1, bias=False) 33 | 34 | def forward(self, x): 35 | return self.linear(x) 36 | -------------------------------------------------------------------------------- /mlbench_core/models/pytorch/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | from .sequence_generator import SequenceGenerator 2 | from .transformer import TransformerModel 3 | -------------------------------------------------------------------------------- /mlbench_core/models/pytorch/transformer/decoder.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | from torch import nn 6 | 7 | from mlbench_core.models.pytorch.transformer.modules import ( 8 | PositionalEmbedding, 9 | TransformerDecoderLayer, 10 | ) 11 | 12 | 13 | class TransformerDecoder(nn.Module): 14 | """ 15 | Transformer decoder consisting of *args.decoder_layers* layers. Each layer 16 | is a :class:`TransformerDecoderLayer`. 17 | 18 | Args: 19 | args: Arguments of model. All arguments should be accessible via `__getattribute__` method 20 | dictionary (:obj:`mlbench_core.dataset.nlp.pytorch.wmt17.Dictionary`): decoding dictionary 21 | embed_tokens (torch.nn.Embedding): output embedding 22 | no_encoder_attn (bool, optional): whether to attend to encoder outputs 23 | (default: False). 24 | left_pad (bool): Pad targets to the left (`True`) or right (`False`). 
Default: `False` 25 | """ 26 | 27 | def __init__( 28 | self, args, dictionary, embed_tokens, no_encoder_attn=False, left_pad=False 29 | ): 30 | super().__init__() 31 | self.dictionary = dictionary 32 | self.dropout = args.dropout 33 | self.share_input_output_embed = args.share_decoder_input_output_embed 34 | 35 | embed_dim = embed_tokens.embedding_dim 36 | padding_idx = embed_tokens.padding_idx 37 | self.max_target_positions = args.max_target_positions 38 | 39 | self.embed_tokens = embed_tokens 40 | self.embed_scale = math.sqrt(embed_dim) 41 | self.embed_positions = ( 42 | PositionalEmbedding( 43 | args.max_target_positions, 44 | embed_dim, 45 | padding_idx, 46 | left_pad=left_pad, 47 | learned=args.decoder_learned_pos, 48 | ) 49 | if not args.no_token_positional_embeddings 50 | else None 51 | ) 52 | 53 | self.layers = nn.ModuleList( 54 | [ 55 | TransformerDecoderLayer(args, no_encoder_attn) 56 | for _ in range(args.decoder_layers) 57 | ] 58 | ) 59 | 60 | if not self.share_input_output_embed: 61 | self.embed_out = nn.Parameter(torch.Tensor(len(dictionary), embed_dim)) 62 | nn.init.normal_(self.embed_out, mean=0, std=embed_dim ** -0.5) 63 | self.normalize = args.decoder_normalize_before 64 | 65 | if self.normalize: 66 | self.layer_norm = nn.LayerNorm(embed_dim) 67 | 68 | def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None): 69 | # embed positions 70 | positions = ( 71 | self.embed_positions( 72 | prev_output_tokens, 73 | incremental_state=incremental_state, 74 | ) 75 | if self.embed_positions is not None 76 | else None 77 | ) 78 | 79 | if incremental_state is not None: 80 | prev_output_tokens = prev_output_tokens[:, -1:] 81 | if positions is not None: 82 | positions = positions[:, -1:] 83 | 84 | # embed tokens and positions 85 | x = self.embed_scale * self.embed_tokens(prev_output_tokens) 86 | if positions is not None: 87 | x += positions 88 | x = F.dropout(x, p=self.dropout, training=self.training) 89 | 90 | # B x T x C -> T x B x C 91 | x = x.transpose(0, 1) 92 | 93 | if x.size(1) == 1: 94 | if x.is_contiguous(): 95 | x = x.view(x.size(0), x.size(1), x.size(2)) 96 | else: 97 | x = x.contiguous() 98 | else: 99 | x = x.contiguous() 100 | 101 | attn = None 102 | 103 | # decoder layers 104 | for layer in self.layers: 105 | x, attn = layer( 106 | x, 107 | encoder_out["encoder_out"] if encoder_out is not None else None, 108 | encoder_out["encoder_padding_mask"] 109 | if encoder_out is not None 110 | else None, 111 | incremental_state, 112 | ) 113 | 114 | if self.normalize: 115 | x = self.layer_norm(x) 116 | 117 | # T x B x C -> B x T x C 118 | x = x.transpose(0, 1) 119 | # project back to size of vocabulary 120 | if self.share_input_output_embed: 121 | x = F.linear(x, self.embed_tokens.weight) 122 | else: 123 | x = F.linear(x, self.embed_out) 124 | 125 | return x, attn 126 | 127 | def max_positions(self): 128 | """Maximum output length supported by the decoder.""" 129 | if self.embed_positions is None: 130 | return self.max_target_positions 131 | return min(self.max_target_positions, self.embed_positions.max_positions()) 132 | 133 | def reorder_incremental_state(self, incremental_state, new_order): 134 | """Reorder incremental state. 135 | 136 | This should be called when the order of the input has changed from the 137 | previous time step. A typical use case is beam search, where the input 138 | order changes between time steps based on the selection of beams. 
139 | """ 140 | 141 | def apply_reorder_incremental_state(module): 142 | if module != self and hasattr(module, "reorder_incremental_state"): 143 | module.reorder_incremental_state( 144 | incremental_state, 145 | new_order, 146 | ) 147 | 148 | self.apply(apply_reorder_incremental_state) 149 | -------------------------------------------------------------------------------- /mlbench_core/models/pytorch/transformer/encoder.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | from torch import nn 6 | 7 | from mlbench_core.models.pytorch.transformer.modules import ( 8 | PositionalEmbedding, 9 | TransformerEncoderLayer, 10 | ) 11 | 12 | 13 | class TransformerEncoder(nn.Module): 14 | """ 15 | Transformer encoder consisting of *args.encoder_layers* layers. Each layer 16 | is a :class:`TransformerEncoderLayer`. 17 | 18 | Args: 19 | args: Arguments of model. All arguments should be accessible via `__getattribute__` method 20 | dictionary (:obj:`mlbench_core.dataset.nlp.pytorch.wmt17.Dictionary`): encoding dictionary 21 | embed_tokens (torch.nn.Embedding): input embedding 22 | left_pad (bool): Pad sources to the left (`True`) or right (`False`). Default: `True` 23 | """ 24 | 25 | def __init__(self, args, dictionary, embed_tokens, left_pad=True): 26 | super().__init__() 27 | self.dictionary = dictionary 28 | self.dropout = args.dropout 29 | 30 | embed_dim = embed_tokens.embedding_dim 31 | self.padding_idx = embed_tokens.padding_idx 32 | self.max_source_positions = args.max_source_positions 33 | 34 | self.softmax_type = args.softmax_type 35 | 36 | self.embed_tokens = embed_tokens 37 | self.embed_scale = math.sqrt(embed_dim) 38 | self.embed_positions = ( 39 | PositionalEmbedding( 40 | args.max_source_positions, 41 | embed_dim, 42 | self.padding_idx, 43 | left_pad=left_pad, 44 | learned=args.encoder_learned_pos, 45 | ) 46 | if not args.no_token_positional_embeddings 47 | else None 48 | ) 49 | 50 | self.layers = nn.ModuleList( 51 | [TransformerEncoderLayer(args) for i in range(args.encoder_layers)] 52 | ) 53 | 54 | self.normalize = args.encoder_normalize_before 55 | if self.normalize: 56 | self.layer_norm = nn.LayerNorm(embed_dim) 57 | 58 | def forward(self, src_tokens): 59 | """Forward function of encoder 60 | 61 | Args: 62 | src_tokens (:obj:`torch.Tensor`): Source tokens 63 | 64 | Returns: 65 | (dict): {`encoder:out` (:obj:`torch.Tensor`), `encoder_padding_mask` (:obj:`torch.Tensor`)} 66 | """ 67 | # embed tokens and positions 68 | x = self.embed_scale * self.embed_tokens(src_tokens) 69 | 70 | if self.embed_positions is not None: 71 | x += self.embed_positions(src_tokens) 72 | x = F.dropout(x, p=self.dropout, training=self.training) 73 | 74 | # B x T x C -> T x B x C 75 | x = x.transpose(0, 1) 76 | 77 | if x.size(1) == 1: 78 | if x.is_contiguous(): 79 | x = x.view(x.size(0), x.size(1), x.size(2)) 80 | else: 81 | x = x.contiguous() 82 | else: 83 | x = x.contiguous() 84 | 85 | # compute padding mask 86 | encoder_padding_mask = src_tokens.eq(self.padding_idx) 87 | if not encoder_padding_mask.any(): 88 | encoder_padding_mask = None 89 | if (self.softmax_type == "fast_fill") and (encoder_padding_mask is not None): 90 | encoder_padding_mask = torch.zeros_like( 91 | encoder_padding_mask, dtype=x.dtype 92 | ).masked_fill_(encoder_padding_mask, float("-inf")) 93 | 94 | # encoder layers 95 | for layer in self.layers: 96 | x = layer(x, encoder_padding_mask) 97 | 98 | if self.normalize: 99 | x = 
self.layer_norm(x) 100 | 101 | return { 102 | "encoder_out": x, # T x B x C 103 | "encoder_padding_mask": encoder_padding_mask, # B x T 104 | } 105 | 106 | def reorder_encoder_out(self, encoder_out, new_order): 107 | if encoder_out["encoder_out"] is not None: 108 | encoder_out["encoder_out"] = encoder_out["encoder_out"].index_select( 109 | 1, new_order 110 | ) 111 | if encoder_out["encoder_padding_mask"] is not None: 112 | encoder_out["encoder_padding_mask"] = encoder_out[ 113 | "encoder_padding_mask" 114 | ].index_select(0, new_order) 115 | return encoder_out 116 | 117 | def max_positions(self): 118 | """Maximum input length supported by the encoder.""" 119 | if self.embed_positions is None: 120 | return self.max_source_positions 121 | return min(self.max_source_positions, self.embed_positions.max_positions()) 122 | -------------------------------------------------------------------------------- /mlbench_core/models/pytorch/transformer/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .embeddings import PositionalEmbedding, build_embedding 2 | from .layers import TransformerDecoderLayer, TransformerEncoderLayer 3 | 4 | __all__ = [ 5 | "PositionalEmbedding", 6 | "build_embedding", 7 | "TransformerDecoderLayer", 8 | "TransformerEncoderLayer", 9 | ] 10 | -------------------------------------------------------------------------------- /mlbench_core/models/pytorch/transformer/modules/strided_batched_gemm/strided_batched_gemm.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | // Licensed under the Apache License, Version 2.0 (the "License"); 3 | // you may not use this file except in compliance with the License. 4 | // You may obtain a copy of the License at 5 | // 6 | // http://www.apache.org/licenses/LICENSE-2.0 7 | // 8 | // Unless required by applicable law or agreed to in writing, software 9 | // distributed under the License is distributed on an "AS IS" BASIS, 10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | // See the License for the specific language governing permissions and 12 | // limitations under the License.
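// A hedged usage sketch (illustrative only; assumes the extension was built by
// install_cuda_extensions.py under the name
// mlbench_core.models.pytorch.transformer.modules.strided_batched_gemm).
// Judging from the signature below, the op follows torch.baddbmm-style
// semantics, beta * in_result + alpha * (batch1 @ batch2), on half-precision
// CUDA tensors:
//
//   import torch
//   from mlbench_core.models.pytorch.transformer.modules import strided_batched_gemm as sbg
//   a = torch.randn(4, 8, 16, dtype=torch.half, device="cuda")
//   b = torch.randn(4, 16, 32, dtype=torch.half, device="cuda")
//   out = torch.zeros(4, 8, 32, dtype=torch.half, device="cuda")
//   res = sbg.strided_batched_gemm(0.0, out, 1.0, a, b)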
13 | 14 | #include <torch/extension.h> 15 | #include <vector> 16 | 17 | at::Tensor strided_batched_gemm_cuda( 18 | float beta, 19 | at::Tensor in_result, 20 | float alpha, 21 | at::Tensor batch1, 22 | at::Tensor batch2); 23 | 24 | // C++ interface 25 | 26 | #define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor") 27 | #define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") 28 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) 29 | 30 | at::Tensor strided_batched_gemm( 31 | float beta, 32 | at::Tensor in_result, 33 | float alpha, 34 | at::Tensor batch1, 35 | at::Tensor batch2) { 36 | //CHECK_INPUT(in_result); 37 | //CHECK_INPUT(batch1); 38 | //CHECK_INPUT(batch2); 39 | 40 | AT_ASSERTM(in_result.dim() == 3, "expected 3D tensor"); 41 | AT_ASSERTM(batch1.dim() == 3, "expected 3D tensor"); 42 | AT_ASSERTM(batch2.dim() == 3, "expected 3D tensor"); 43 | 44 | AT_ASSERTM(in_result.size(0) == batch1.size(0), "equal number of batches expected"); 45 | AT_ASSERTM(in_result.size(0) == batch2.size(0), "equal number of batches expected"); 46 | 47 | AT_ASSERTM(in_result.size(1) == batch1.size(1), "wrong matrix size"); 48 | AT_ASSERTM(in_result.size(2) == batch2.size(2), "wrong matrix size"); 49 | AT_ASSERTM(batch1.size(2) == batch2.size(1), "wrong matrix size"); 50 | 51 | AT_ASSERTM(batch1.type().scalarType() == at::ScalarType::Half, "Only HALF is supported"); 52 | AT_ASSERTM(batch2.type().scalarType() == at::ScalarType::Half, "Only HALF is supported"); 53 | AT_ASSERTM(in_result.type().scalarType() == at::ScalarType::Half, "Only HALF is supported"); 54 | 55 | return strided_batched_gemm_cuda(beta, in_result, alpha, batch1, batch2); 56 | } 57 | 58 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 59 | m.def("strided_batched_gemm", &strided_batched_gemm, "Special strided batched gemm."); 60 | } 61 | 62 | -------------------------------------------------------------------------------- /mlbench_core/models/pytorch/transformer/transformer.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from mlbench_core.models.pytorch.transformer.decoder import TransformerDecoder 4 | from mlbench_core.models.pytorch.transformer.encoder import TransformerEncoder 5 | from mlbench_core.models.pytorch.transformer.modules import build_embedding 6 | 7 | DEFAULT_MAX_SOURCE_POSITIONS = 256 8 | DEFAULT_MAX_TARGET_POSITIONS = 256 9 | 10 | 11 | class TransformerModel(nn.Module): 12 | """Transformer model 13 | 14 | This model uses MultiHeadAttention as described in 15 | :cite:`NIPS2017_7181` 16 | 17 | Args: 18 | args: Arguments of model.
All arguments should be accessible via `__getattribute__` method 19 | src_dict (:obj:`mlbench_core.dataset.nlp.pytorch.wmt17.Dictionary`): Source dictionary 20 | trg_dict (:obj:`mlbench_core.dataset.nlp.pytorch.wmt17.Dictionary`): Target dictionary 21 | """ 22 | 23 | def __init__(self, args, src_dict, trg_dict): 24 | super().__init__() 25 | self._is_generation_fast = False 26 | if not hasattr(args, "max_source_positions"): 27 | args.max_source_positions = DEFAULT_MAX_SOURCE_POSITIONS 28 | if not hasattr(args, "max_target_positions"): 29 | args.max_target_positions = DEFAULT_MAX_TARGET_POSITIONS 30 | 31 | # Define embedding layer 32 | if args.share_all_embeddings: 33 | if src_dict != trg_dict: 34 | raise ValueError("share_all_embeddings requires a joined dictionary") 35 | if args.encoder_embed_dim != args.decoder_embed_dim: 36 | raise ValueError( 37 | "share_all_embeddings requires encoder_embed_dim to match decoder_embed_dim" 38 | ) 39 | if args.decoder_embed_path and ( 40 | args.decoder_embed_path != args.encoder_embed_path 41 | ): 42 | raise ValueError( 43 | "share_all_embeddings not compatible with decoder_embed_path" 44 | ) 45 | encoder_embed_tokens = build_embedding( 46 | src_dict, args.encoder_embed_dim, args.encoder_embed_path 47 | ) 48 | decoder_embed_tokens = encoder_embed_tokens 49 | args.share_decoder_input_output_embed = True 50 | else: 51 | encoder_embed_tokens = build_embedding( 52 | src_dict, args.encoder_embed_dim, args.encoder_embed_path 53 | ) 54 | decoder_embed_tokens = build_embedding( 55 | trg_dict, args.decoder_embed_dim, args.decoder_embed_path 56 | ) 57 | self.encoder = TransformerEncoder(args, src_dict, encoder_embed_tokens) 58 | self.decoder = TransformerDecoder(args, trg_dict, decoder_embed_tokens) 59 | 60 | def forward( 61 | self, 62 | src_tokens, 63 | src_lengths, 64 | prev_output_tokens, 65 | ): 66 | """ 67 | Run the forward pass of the transformer model. 68 | 69 | Args: 70 | src_tokens (:obj:`torch.Tensor`): Source tokens 71 | src_lengths (:obj:`torch.Tensor`): Source sentence lengths 72 | prev_output_tokens (:obj:`torch.Tensor`): Previous output tokens 73 | 74 | Returns: 75 | (:obj:`torch.Tensor`, Optional[:obj:`torch.Tensor`]): 76 | The model output, and attention weights if needed 77 | """ 78 | encoder_out = self.encoder(src_tokens) 79 | decoder_out = self.decoder(prev_output_tokens, encoder_out=encoder_out) 80 | return decoder_out 81 | 82 | def max_positions(self): 83 | """Maximum length supported by the model.""" 84 | return self.encoder.max_positions(), self.decoder.max_positions() 85 | 86 | def max_decoder_positions(self): 87 | """Maximum length supported by the decoder. 88 | 89 | Returns: 90 | (int) 91 | """ 92 | return self.decoder.max_positions() 93 | -------------------------------------------------------------------------------- /mlbench_core/models/pytorch/vgg.py: -------------------------------------------------------------------------------- 1 | """VGG11/13/16/19 in Pytorch. 
2 | 3 | From https://github.com/kuangliu/pytorch-cifar.""" 4 | import torch 5 | import torch.nn as nn 6 | 7 | cfg = { 8 | "VGG11": [64, "M", 128, "M", 256, 256, "M", 512, 512, "M", 512, 512, "M"], 9 | "VGG13": [64, 64, "M", 128, 128, "M", 256, 256, "M", 512, 512, "M", 512, 512, "M"], 10 | "VGG16": [ 11 | 64, 12 | 64, 13 | "M", 14 | 128, 15 | 128, 16 | "M", 17 | 256, 18 | 256, 19 | 256, 20 | "M", 21 | 512, 22 | 512, 23 | 512, 24 | "M", 25 | 512, 26 | 512, 27 | 512, 28 | "M", 29 | ], 30 | "VGG19": [ 31 | 64, 32 | 64, 33 | "M", 34 | 128, 35 | 128, 36 | "M", 37 | 256, 38 | 256, 39 | 256, 40 | 256, 41 | "M", 42 | 512, 43 | 512, 44 | 512, 45 | 512, 46 | "M", 47 | 512, 48 | 512, 49 | 512, 50 | 512, 51 | "M", 52 | ], 53 | } 54 | 55 | 56 | class VGG(nn.Module): 57 | def __init__(self, vgg_name): 58 | super(VGG, self).__init__() 59 | self.features = self._make_layers(cfg[vgg_name]) 60 | self.classifier = nn.Linear(512, 10) 61 | 62 | def forward(self, x): 63 | out = self.features(x) 64 | out = out.view(out.size(0), -1) 65 | out = self.classifier(out) 66 | return out 67 | 68 | def _make_layers(self, cfg): 69 | layers = [] 70 | in_channels = 3 71 | for x in cfg: 72 | if x == "M": 73 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 74 | else: 75 | layers += [ 76 | nn.Conv2d(in_channels, x, kernel_size=3, padding=1), 77 | nn.BatchNorm2d(x), 78 | nn.ReLU(inplace=True), 79 | ] 80 | in_channels = x 81 | layers += [nn.AvgPool2d(kernel_size=1, stride=1)] 82 | return nn.Sequential(*layers) 83 | 84 | 85 | def test(): 86 | net = VGG("VGG11") 87 | x = torch.randn(2, 3, 32, 32) 88 | y = net(x) 89 | print(y.size()) 90 | -------------------------------------------------------------------------------- /mlbench_core/models/tensorflow/__init__.py: -------------------------------------------------------------------------------- 1 | from .resnet_model import Cifar10Model 2 | 3 | __all__ = ["Cifar10Model"] 4 | -------------------------------------------------------------------------------- /mlbench_core/optim/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | import torch 3 | 4 | from . import pytorch 5 | except ImportError: 6 | pass 7 | -------------------------------------------------------------------------------- /mlbench_core/optim/pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | from .centralized import * 2 | from .decentralized import * 3 | from .optim import * 4 | 5 | optimizers = { 6 | "sign_sgd": SignSGD, 7 | "sparsified_sgd": SparsifiedSGD, 8 | "centralized_sparsified_sgd": CentralizedSparsifiedSGD, 9 | "centralized_sgd": CentralizedSGD, 10 | "centralized_adam": CentralizedAdam, 11 | "power_sgd": PowerSGD, 12 | "decentralized_sgd": DecentralizedSGD, 13 | } 14 | 15 | 16 | def get_optimizer(optimizer, **kwargs): 17 | """Returns an object of the class specified with the argument `optimizer`. 18 | 19 | Args: 20 | optimizer (str): name of the optimizer 21 | **kwargs (dict, optional): additional optimizer-specific parameters. For the list of supported parameters 22 | for each optimizer, please look at its documentation. 
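Example (a hedged sketch; ``model`` is a placeholder torch.nn.Module, and the keyword arguments shown are the ones documented for DecentralizedSGD in decentralized.py): opt = get_optimizer("decentralized_sgd", rank=1, neighbors=[0, 2], model=model, lr=0.1)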
23 | """ 24 | return optimizers[optimizer](**kwargs) 25 | -------------------------------------------------------------------------------- /mlbench_core/optim/pytorch/decentralized.py: -------------------------------------------------------------------------------- 1 | from torch.optim import SGD 2 | from torch.optim.optimizer import required 3 | 4 | from mlbench_core.aggregation.pytorch.decentralized import DecentralizedAggregation 5 | 6 | 7 | class DecentralizedSGD(SGD): 8 | r"""Implements decentralized stochastic gradient descent (optionally with momentum). 9 | 10 | Args: 11 | rank (int): rank of current process in the network 12 | neighbors (list): list of ranks of the neighbors of current process 13 | model (:obj:`nn.Module`): model which contains parameters for SGD 14 | lr (float): learning rate 15 | momentum (float, optional): momentum factor (default: 0) 16 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0) 17 | dampening (float, optional): dampening for momentum (default: 0) 18 | nesterov (bool, optional): enables Nesterov momentum (default: False) 19 | average_world (bool): Whether to average models on the world_size (default: `True`) 20 | use_cuda (bool): Whether to use cuda tensors for aggregation 21 | by_layer (bool): Aggregate by layer instead of all layers at once 22 | """ 23 | 24 | def __init__( 25 | self, 26 | rank=None, 27 | neighbors=None, 28 | model=None, 29 | lr=required, 30 | momentum=0, 31 | dampening=0, 32 | weight_decay=0, 33 | nesterov=False, 34 | average_world=True, 35 | use_cuda=False, 36 | by_layer=False, 37 | ): 38 | if rank is None: 39 | raise ValueError('"rank" not set for optimizer') 40 | if not neighbors: 41 | raise ValueError('"neighbors" not set for optimizer') 42 | if model is None: 43 | raise ValueError('"model" not set for optimizer') 44 | super(DecentralizedSGD, self).__init__( 45 | model.parameters(), lr, momentum, dampening, weight_decay, nesterov 46 | ) 47 | 48 | if average_world: 49 | self.agg_mode = "avg_world" 50 | else: 51 | raise NotImplementedError("Only average model is supported right now.") 52 | 53 | self.model = model 54 | self.agg = DecentralizedAggregation( 55 | rank, neighbors, use_cuda=use_cuda 56 | ).agg_model(by_layer=by_layer) 57 | 58 | def step(self, closure=None, tracker=None): 59 | """Aggregates the gradients and performs a single optimization step. 60 | 61 | Args: 62 | closure (callable, optional): A closure that reevaluates the model 63 | and returns the loss. 64 | tracker (:obj:`mlbench_core.utils.Tracker`, optional): The current tracker 65 | """ 66 | loss = super(DecentralizedSGD, self).step(closure=closure) 67 | if tracker: 68 | tracker.record_batch_opt_step() 69 | # Averaging the model after updating the gradient separately. 70 | self.agg(self.model, self.agg_mode) 71 | if tracker: 72 | tracker.record_batch_agg() 73 | return loss 74 | -------------------------------------------------------------------------------- /mlbench_core/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .log_metrics import LogMetrics 2 | from .tracker import AverageMeter, Tracker 3 | 4 | try: 5 | import torch 6 | 7 | from . import pytorch 8 | except ImportError: 9 | pass 10 | 11 | 12 | try: 13 | import tensorflow 14 | 15 | from . 
import tensorflow 16 | except ImportError: 17 | pass 18 | -------------------------------------------------------------------------------- /mlbench_core/utils/log_metrics.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from mlbench_core.api import ApiClient 4 | 5 | 6 | class LogMetrics(object): 7 | """Use to write metric values to the Dashboard API and to Trackers 8 | 9 | Caches API client for performance reasons 10 | """ 11 | 12 | in_cluster = os.getenv("KUBERNETES_SERVICE_HOST") is not None 13 | 14 | if in_cluster: 15 | api = ApiClient() 16 | 17 | @staticmethod 18 | def log(run_id, rank, epoch, metric_name, value): 19 | """Logs metrics to the Metrics API 20 | 21 | Currently only logs inside of a cluster 22 | 23 | Args: 24 | run_id (str): The id of the run in the dashboard 25 | rank (int): Rank of the current worker node 26 | epoch (float): The current epoch (fractional) 27 | metric_name (str): The name of the metric 28 | value (float / int / str): The metric value to write 29 | """ 30 | 31 | if not LogMetrics.in_cluster: 32 | return 33 | 34 | metric_name = "{} @ {}".format(metric_name, rank) 35 | 36 | LogMetrics.api.post_metric( 37 | run_id, 38 | metric_name, 39 | value, 40 | metadata="{{rank: {}, epoch:{}}}".format(rank, epoch), 41 | ) 42 | -------------------------------------------------------------------------------- /mlbench_core/utils/pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | from contextlib import contextmanager 3 | 4 | import torch 5 | import torch.distributed as dist 6 | 7 | from .helpers import config_logging, config_path, config_pytorch 8 | from .topology import FCGraph 9 | 10 | __all__ = ["initialize_backends", "FCGraph"] 11 | 12 | 13 | @contextmanager 14 | def initialize_backends( 15 | comm_backend="mpi", 16 | hosts=None, 17 | rank=-1, 18 | logging_level="INFO", 19 | logging_file="/mlbench.log", 20 | use_cuda=False, 21 | seed=None, 22 | cudnn_deterministic=False, 23 | ckpt_run_dir="/checkpoints", 24 | delete_existing_ckpts=False, 25 | ): 26 | """Initializes the backends. 27 | 28 | Sets up logging, sets up pytorch and configures paths 29 | correctly. 30 | 31 | Args: 32 | comm_backend (str): Distributed communication backend ("mpi", "gloo" or "nccl") 33 | hosts (str): Comma-separated list of worker hostnames 34 | rank (int): Rank of the current worker 35 | logging_level (str): Log level 36 | logging_file (str): Log file location 37 | use_cuda (bool): Use CUDA acceleration 38 | seed (int | None): Random seed to use 39 | cudnn_deterministic (bool): Set `cudnn.deterministic=True` 40 | ckpt_run_dir (str): Checkpoint directory 41 | delete_existing_ckpts (bool): Delete any existing checkpoints 42 | 43 | Yields: 44 | (int, int, :obj:`FCGraph`): The rank, world size, and network graph
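Example (a hedged sketch; host names are illustrative): with initialize_backends(comm_backend="gloo", hosts="node-0,node-1", rank=0) as (rank, world_size, graph): ... # training code runs inside the context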
36 | """ 37 | 38 | if not (hasattr(dist, "_initialized") and dist._initialized): 39 | 40 | if comm_backend in [dist.Backend.GLOO, dist.Backend.NCCL]: 41 | 42 | if comm_backend == dist.Backend.NCCL: 43 | assert ( 44 | torch.cuda.is_available() 45 | ), "Invalid use of NCCL backend without CUDA support available" 46 | 47 | hosts = hosts.split(",") 48 | os.environ["MASTER_ADDR"] = hosts[0] 49 | os.environ["MASTER_PORT"] = "29500" 50 | os.environ["RANK"] = str(rank) 51 | os.environ["WORLD_SIZE"] = str(len(hosts)) 52 | 53 | dist.init_process_group(comm_backend) 54 | 55 | config_logging(logging_level, logging_file) 56 | 57 | rank, world_size, graph = config_pytorch(use_cuda, seed, cudnn_deterministic) 58 | 59 | config_path(ckpt_run_dir, delete_existing_ckpts) 60 | 61 | yield rank, world_size, graph 62 | 63 | dist.destroy_process_group() 64 | -------------------------------------------------------------------------------- /mlbench_core/utils/pytorch/distributed.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as dist 3 | 4 | 5 | def global_average(sum, count): 6 | def helper(array): 7 | array = get_backend_tensor(torch.Tensor(array)) 8 | 9 | dist.all_reduce(array, op=dist.ReduceOp.SUM) 10 | return array[0] / array[1] 11 | 12 | avg = helper([sum, count]) 13 | return avg 14 | 15 | 16 | def get_backend_tensor(tensor): 17 | if dist.is_initialized() and dist.get_backend() == dist.Backend.NCCL: 18 | return tensor.cuda() 19 | return tensor 20 | -------------------------------------------------------------------------------- /mlbench_core/utils/pytorch/helpers.py: -------------------------------------------------------------------------------- 1 | r"""Helper functions.""" 2 | 3 | import logging 4 | import os 5 | import random 6 | import shutil 7 | import socket 8 | 9 | import numpy as np 10 | import torch 11 | from torch import distributed as dist 12 | 13 | from mlbench_core.utils.pytorch.topology import FCGraph 14 | 15 | 16 | def config_logging(logging_level="INFO", logging_file="/mlbench.log"): 17 | """Setup logging modules. 18 | A stream handler and file handler are added to default logger `mlbench`. 19 | 20 | Args: 21 | logging_level (str): Log level 22 | logging_file (str): Log file 23 | 24 | """ 25 | 26 | class RankFilter(logging.Filter): 27 | def filter(self, record): 28 | record.rank = dist.get_rank() 29 | return True 30 | 31 | logger = logging.getLogger("mlbench") 32 | if len(logger.handlers) >= 2: 33 | return 34 | 35 | logger.setLevel(logging_level) 36 | logger.addFilter(RankFilter()) 37 | 38 | formatter = logging.Formatter( 39 | "%(asctime)s %(name)s %(rank)2s %(levelname)s: %(message)s", "%Y-%m-%d %H:%M:%S" 40 | ) 41 | 42 | ch = logging.StreamHandler() 43 | ch.setLevel(logging_level) 44 | ch.setFormatter(formatter) 45 | logger.addHandler(ch) 46 | 47 | fh = logging.FileHandler(logging_file) 48 | fh.setLevel(logging_level) 49 | fh.setFormatter(formatter) 50 | logger.addHandler(fh) 51 | 52 | 53 | def config_pytorch(use_cuda=False, seed=None, cudnn_deterministic=False): 54 | """Config pytorch packages. 55 | 56 | Fix random number for packages and initialize distributed environment for pytorch. 57 | Setup cuda environment for pytorch. 
--------------------------------------------------------------------------------
/mlbench_core/utils/pytorch/distributed.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.distributed as dist
3 | 
4 | 
5 | def global_average(sum, count):
6 |     def helper(array):
7 |         array = get_backend_tensor(torch.Tensor(array))
8 | 
9 |         dist.all_reduce(array, op=dist.ReduceOp.SUM)
10 |         return array[0] / array[1]
11 | 
12 |     avg = helper([sum, count])
13 |     return avg
14 | 
15 | 
16 | def get_backend_tensor(tensor):
17 |     if dist.is_initialized() and dist.get_backend() == dist.Backend.NCCL:
18 |         return tensor.cuda()
19 |     return tensor
20 | 
--------------------------------------------------------------------------------
/mlbench_core/utils/pytorch/helpers.py:
--------------------------------------------------------------------------------
1 | r"""Helper functions."""
2 | 
3 | import logging
4 | import os
5 | import random
6 | import shutil
7 | import socket
8 | 
9 | import numpy as np
10 | import torch
11 | from torch import distributed as dist
12 | 
13 | from mlbench_core.utils.pytorch.topology import FCGraph
14 | 
15 | 
16 | def config_logging(logging_level="INFO", logging_file="/mlbench.log"):
17 |     """Sets up the logging modules.
18 |     A stream handler and a file handler are added to the default logger `mlbench`.
19 | 
20 |     Args:
21 |         logging_level (str): Log level
22 |         logging_file (str): Log file
23 | 
24 |     """
25 | 
26 |     class RankFilter(logging.Filter):
27 |         def filter(self, record):
28 |             record.rank = dist.get_rank()
29 |             return True
30 | 
31 |     logger = logging.getLogger("mlbench")
32 |     if len(logger.handlers) >= 2:
33 |         return
34 | 
35 |     logger.setLevel(logging_level)
36 |     logger.addFilter(RankFilter())
37 | 
38 |     formatter = logging.Formatter(
39 |         "%(asctime)s %(name)s %(rank)2s %(levelname)s: %(message)s", "%Y-%m-%d %H:%M:%S"
40 |     )
41 | 
42 |     ch = logging.StreamHandler()
43 |     ch.setLevel(logging_level)
44 |     ch.setFormatter(formatter)
45 |     logger.addHandler(ch)
46 | 
47 |     fh = logging.FileHandler(logging_file)
48 |     fh.setLevel(logging_level)
49 |     fh.setFormatter(formatter)
50 |     logger.addHandler(fh)
51 | 
52 | 
53 | def config_pytorch(use_cuda=False, seed=None, cudnn_deterministic=False):
54 |     """Configures PyTorch.
55 | 
56 |     Fixes the random seeds for all relevant packages and sets up the
57 |     distributed and CUDA environments for PyTorch.
58 | 
59 |     Args:
60 |         use_cuda (bool): Use CUDA acceleration
61 |         seed (int | None): Random seed to use
62 |         cudnn_deterministic (bool): Set `cudnn.deterministic=True`
63 | 
64 |     Returns:
65 |         (int, int, :obj:`FCGraph`): The rank, world size, and network graph
66 |     """
67 |     # Setting `cudnn.deterministic = True` will turn on the
68 |     # CUDNN deterministic setting, which can slow down training considerably.
69 |     # Unexpected behavior may also be observed when restarting from a checkpoint.
70 |     # See: https://github.com/pytorch/examples/blob/master/imagenet/main.py
71 |     if cudnn_deterministic:
72 |         # cudnn.deterministic = True
73 |         print(
74 |             "You have chosen to seed training. "
75 |             "This will turn on the CUDNN deterministic setting, "
76 |             "which can slow down your training considerably! "
77 |             "You may see unexpected behavior when restarting "
78 |             "from checkpoints."
79 |         )
80 | 
81 |     if seed:
82 |         torch.manual_seed(seed)
83 |         torch.cuda.manual_seed_all(seed)
84 |         np.random.seed(seed)
85 |         random.seed(seed)
86 |         os.environ["PYTHONHASHSEED"] = str(seed)
87 | 
88 |     # Define the graph used for the computation.
89 |     if use_cuda:
90 |         assert torch.cuda.is_available()
91 | 
92 |     rank = dist.get_rank()
93 |     world_size = dist.get_world_size()
94 |     backend = dist.get_backend() if dist.is_initialized() else None
95 |     graph = FCGraph(rank, world_size, use_cuda)
96 | 
97 |     # Enable the cudnn accelerator if we are using CUDA.
98 |     if use_cuda:
99 |         graph.assigned_gpu_id()
100 |         torch.backends.cudnn.enabled = True
101 |         torch.backends.cudnn.benchmark = False
102 | 
103 |         if cudnn_deterministic:
104 |             torch.backends.cudnn.deterministic = True
105 | 
106 |         if torch.backends.cudnn.version() is None:
107 |             print("CUDNN not found on device.")
108 | 
109 |     print(
110 |         "World size={}, Rank={}, hostname={}, backend={}, cuda_available={}, cuda_device={}".format(
111 |             world_size,
112 |             rank,
113 |             socket.gethostname(),
114 |             backend,
115 |             torch.cuda.is_available(),
116 |             torch.cuda.current_device(),
117 |         )
118 |     )
119 | 
120 |     return rank, world_size, graph
121 | 
122 | 
123 | def config_path(ckpt_run_dir, delete_existing_ckpts=False):
124 |     """Configures the checkpoint path used during the experiments."""
125 |     if delete_existing_ckpts:
126 |         print("Removing previous checkpoint directory: {}".format(ckpt_run_dir))
127 |         shutil.rmtree(ckpt_run_dir, ignore_errors=True)
128 |     os.makedirs(ckpt_run_dir, exist_ok=True)
129 | 
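A brief sketch of the helpers above (the paths are hypothetical; config_logging assumes an initialized process group, since the log format includes the worker's rank at emit time):

    from mlbench_core.utils.pytorch.helpers import config_logging, config_path

    config_logging(logging_level="DEBUG", logging_file="/tmp/mlbench.log")
    config_path("/tmp/checkpoints", delete_existing_ckpts=True)  # wipe and recreate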
--------------------------------------------------------------------------------
/mlbench_core/utils/pytorch/inference/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlbench/mlbench-core/4fd3c7e6f1a5be69e52383ab2eb64cad257218c2/mlbench_core/utils/pytorch/inference/__init__.py
--------------------------------------------------------------------------------
/mlbench_core/utils/pytorch/topology.py:
--------------------------------------------------------------------------------
1 | import socket
2 | 
3 | import torch
4 | import torch.distributed as dist
5 | 
6 | from mlbench_core.utils.pytorch.distributed import get_backend_tensor
7 | 
8 | 
9 | def _ranks_on_same_node(rank, world_size):
10 |     hostname = socket.gethostname()
11 |     hostname_length = get_backend_tensor(torch.IntTensor([len(hostname)]))
12 | 
13 |     dist.all_reduce(hostname_length, op=dist.ReduceOp.MAX)
14 |     max_hostname_length = hostname_length.item()
15 | 
16 |     encoding = [ord(c) for c in hostname]
17 |     encoding += [-1 for c in range(max_hostname_length - len(hostname))]
18 |     encoding = get_backend_tensor(torch.IntTensor(encoding))
19 | 
20 |     all_encodings = [
21 |         get_backend_tensor(torch.IntTensor([0] * max_hostname_length))
22 |         for _ in range(world_size)
23 |     ]
24 |     dist.all_gather(all_encodings, encoding)
25 | 
26 |     if dist.get_backend() == dist.Backend.NCCL:
27 |         all_encodings = [ec.cpu() for ec in all_encodings]
28 | 
29 |     all_encodings = [ec.numpy().tolist() for ec in all_encodings]
30 | 
31 |     ranks = []
32 |     for i in range(world_size):
33 |         if all_encodings[rank] == all_encodings[i]:
34 |             ranks.append(i)
35 |     return ranks
36 | 
37 | 
38 | class FCGraph(object):
39 |     """Fully-Connected Network Graph
40 | 
41 |     Args:
42 |         rank (int): Rank of the current process
43 |         world_size (int): Total number of processes
44 |         use_cuda (bool): Whether to use CUDA tensors (default: `False`)
45 |     """
46 | 
47 |     def __init__(self, rank, world_size, use_cuda=False):
48 |         self.rank = rank
49 |         self.world_size = world_size
50 |         self.use_cuda = use_cuda
51 | 
52 |     @property
53 |     def current_device_name(self):
54 |         return "cuda:{}".format(torch.cuda.current_device()) if self.use_cuda else "cpu"
55 | 
56 |     @property
57 |     def current_device(self):
58 |         # `current_device_name` is a property, so it must not be called.
59 |         return torch.device(self.current_device_name)
60 | 
61 |     def assigned_gpu_id(self):
62 |         num_gpus_on_device = torch.cuda.device_count()
63 |         ranks = _ranks_on_same_node(self.rank, self.world_size)
64 |         assigned_id = ranks.index(self.rank) % num_gpus_on_device
65 |         torch.cuda.set_device(assigned_id)
66 | 
67 |     def __str__(self):
68 |         return "{}".format(self.current_device_name)
69 | 
70 |     def __repr__(self):
71 |         return self.__str__()
72 | 
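A worked illustration of assigned_gpu_id with hypothetical numbers: each rank takes its position among the node-local ranks modulo the GPU count, so colocated ranks spread evenly over the node's GPUs:

    # Hypothetical: ranks 0-3 share one node that has 2 visible GPUs.
    ranks = [0, 1, 2, 3]
    num_gpus_on_device = 2
    assigned = [ranks.index(r) % num_gpus_on_device for r in ranks]
    assert assigned == [0, 1, 0, 1]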
--------------------------------------------------------------------------------
/mlbench_core/utils/pytorch/utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | 
3 | 
4 | @torch.jit.script
5 | def orthogonalize(matrix, eps=torch.FloatTensor([1e-16])):
6 |     """Function used to orthogonalize a matrix.
7 | 
8 |     Args:
9 |         matrix (torch.Tensor): Matrix to orthogonalize
10 |         eps (torch.FloatTensor): Used to avoid division by zero (default: 1e-16)
11 |     """
12 |     n, m = matrix.shape
13 |     for i in range(m):
14 |         # Normalize the i'th column
15 |         col = matrix[:, i : i + 1]
16 |         col /= torch.sqrt(torch.sum(col ** 2)) + eps
17 |         # Project it on the rest and remove it
18 |         if i + 1 < m:
19 |             rest = matrix[:, i + 1 :]
20 |             rest -= torch.sum(col * rest, dim=0) * col
21 | 
22 | 
23 | def pack_tensors(tensors, use_cuda=False):
24 |     """
25 |     Packs a list of tensors into one 1-dimensional tensor.
26 | 
27 |     Args:
28 |         tensors (list[torch.Tensor]): The tensors to pack
29 |         use_cuda (bool): Whether the resulting tensor should be on cuda
30 | 
31 |     Returns:
32 |         (torch.Tensor, list[int], list[(int, int)]):
33 |             The flattened tensors, the list of start indices of each packed tensor,
34 |             and the original shape of each tensor.
35 | 
36 |             Those values are used to then unpack the tensor
37 |     """
38 |     indices = [0]
39 |     for tensor in tensors:
40 |         new_end = indices[-1] + tensor.nelement()
41 |         indices.append(new_end)
42 | 
43 |     tensor_sizes = [t.size() for t in tensors]
44 | 
45 |     vec = torch.empty(
46 |         indices[-1],
47 |         device=tensors[0].device if tensors[0].is_cuda and use_cuda else "cpu",
48 |         dtype=tensors[0].dtype,
49 |     )
50 | 
51 |     for tensor, start_idx, end_idx in zip(tensors, indices[:-1], indices[1:]):
52 |         vec[start_idx:end_idx] = tensor.data.view(-1)
53 | 
54 |     return vec, indices, tensor_sizes
55 | 
56 | 
57 | def unpack_tensors(aggregated, indices, sizes):
58 |     """
59 |     Unpacks a 1-dimensional tensor into a list of tensors
60 | 
61 |     Args:
62 |         aggregated (torch.Tensor): The 1-dimensional tensor
63 |         indices (list[int]): The start index of each tensor
64 |         sizes (list[(int, int)]): The size of each resulting tensor
65 | 
66 |     Returns:
67 |         list[torch.Tensor]: The unpacked tensors
68 |     """
69 |     start_index = indices[:-1]
70 |     end_index = indices[1:]
71 | 
72 |     tensors = []
73 |     for i, (start, end) in enumerate(zip(start_index, end_index)):
74 |         tensors.append(aggregated[start:end].view(sizes[i]))
75 | 
76 |     return tensors
77 | 
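A hedged round-trip sketch for pack_tensors/unpack_tensors (the gradient shapes are hypothetical; in practice a single fused all-reduce over the flat buffer would sit between the two calls):

    import torch

    from mlbench_core.utils.pytorch.utils import pack_tensors, unpack_tensors

    grads = [torch.rand(2, 2), torch.rand(3, 1)]
    vec, indices, sizes = pack_tensors(grads)       # one flat buffer
    # ... a single dist.all_reduce(vec) could happen here ...
    restored = unpack_tensors(vec, indices, sizes)  # views with the original shapes
    assert all((a == b).all() for a, b in zip(grads, restored))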
type=str, default="mpi", help="PyTorch distributed backend" 78 | ) 79 | parser.add_argument("--hosts", type=str, help="The list of hosts") 80 | 81 | args = parser.parse_args() 82 | 83 | dataset_dir = os.path.join(args.root_dataset, "torch", "wmt17") 84 | ckpt_run_dir = os.path.join(args.root_checkpoint, uid) 85 | output_dir = os.path.join(args.root_output, uid) 86 | os.makedirs(dataset_dir, exist_ok=True) 87 | os.makedirs(ckpt_run_dir, exist_ok=True) 88 | os.makedirs(output_dir, exist_ok=True) 89 | 90 | return dataset_dir, ckpt_run_dir, output_dir, args 91 | -------------------------------------------------------------------------------- /mlbench_core/utils/tensorflow/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialize environment for pytorch.""" 2 | 3 | import tensorflow as tf 4 | 5 | 6 | def _init_cleanup(config): 7 | r"""Cleanup legacy files like logs, output.""" 8 | print("=> Initial cleanup") 9 | 10 | 11 | def _init_log(config): 12 | print("=> Initialize log") 13 | 14 | 15 | def _init_tensorflow(config): 16 | print("=> Initialize TensorFlow") 17 | 18 | 19 | def initialize_backends(config): 20 | """Initializes the backends. 21 | 22 | Sets up logging, sets up tensorflow and configures paths 23 | correctly. 24 | 25 | Args: 26 | config (:obj:`types.SimpleNamespace`): a global object containing all of the config. 27 | 28 | Returns: 29 | (:obj:`types.SimpleNamespace`): a global object containing all of the config. 30 | """ 31 | _init_cleanup(config) 32 | 33 | _init_log(config) 34 | 35 | _init_tensorflow(config) 36 | return config 37 | 38 | 39 | def default_session_config( 40 | tf_allow_soft_placement, tf_log_device_placement, tf_gpu_mem 41 | ): 42 | """Initialize session configuration.""" 43 | session_conf = tf.ConfigProto( 44 | allow_soft_placement=tf_allow_soft_placement, 45 | log_device_placement=tf_log_device_placement, 46 | ) 47 | 48 | session_conf.gpu_options.allow_growth = False # True 49 | session_conf.gpu_options.per_process_gpu_memory_fraction = tf_gpu_mem 50 | return session_conf 51 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | . -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 3.0.0-dev23 3 | parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-dev(?P[0-9]+))? 
4 | serialize = 5 | {major}.{minor}.{patch}-dev{dev} 6 | {major}.{minor}.{patch} 7 | commit = False 8 | tag = False 9 | 10 | [bumpversion:file:mlbench_core/__init__.py] 11 | search = __version__ = "{current_version}" 12 | replace = __version__ = "{new_version}" 13 | 14 | [bumpversion:file:setup.py] 15 | search = version="{current_version}" 16 | replace = version="{new_version}" 17 | 18 | [flake8] 19 | exclude = docs 20 | 21 | [aliases] 22 | test = pytest 23 | 24 | [tool:pytest] 25 | collect_ignore = ['setup.py'] 26 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """The setup script.""" 5 | 6 | from setuptools import find_packages, setup 7 | 8 | with open("README.md") as readme_file: 9 | readme = readme_file.read() 10 | 11 | with open("CHANGELOG.md") as history_file: 12 | history = history_file.read() 13 | 14 | # Common libraries 15 | requirements = [ 16 | "appdirs==1.4.4", 17 | "boto3==1.17.74", 18 | "Click>=6.0", 19 | "deprecation>=2.0.6", 20 | "dill==0.3.4", 21 | "docker==5.0.0", 22 | "GitPython==3.1.17", 23 | "google-api-python-client==1.12.8", 24 | "google-auth==1.32.1", 25 | "google-cloud==0.34.0", 26 | "google-cloud-container==2.5.0", 27 | "grpcio==1.34.0", 28 | "kubernetes==12.0.1", 29 | "lmdb==1.2.1", 30 | "matplotlib==3.4.2", 31 | "numpy==1.20.3", 32 | "oauth2client==4.1.3", 33 | "sklearn==0.0", 34 | "supermutes==0.2.5", 35 | "tabulate>=0.8.5", 36 | "tensorpack==0.11", 37 | ] 38 | 39 | # Libraries used by torch 40 | torch_reqs = [ 41 | "sacrebleu==1.5.1", 42 | "torch==1.9.0", 43 | "torchvision==0.10.0", 44 | ] 45 | 46 | tensorflow_reqs = [ 47 | "tensorflow==1.13.2", 48 | ] 49 | 50 | setup_requirements = [ 51 | "pytest-runner", 52 | ] 53 | 54 | lint_requirements = [ 55 | "black==21.5b2", 56 | "isort==5.6.4", 57 | ] 58 | 59 | test_requirements = ( 60 | [ 61 | "codecov==2.1.9", 62 | "coverage==5.5", 63 | "freezegun==1.0.0", 64 | "pre-commit", 65 | "pytest>=3", 66 | "pytest-cov==2.10.1", 67 | "pytest-mock==3.3.1", 68 | "wcwidth==0.2.5", 69 | ] 70 | + lint_requirements 71 | + torch_reqs 72 | + tensorflow_reqs 73 | ) 74 | 75 | dev_requirements = torch_reqs + tensorflow_reqs + lint_requirements + test_requirements 76 | extras = { 77 | "test": test_requirements, 78 | "lint": lint_requirements, 79 | "torch": torch_reqs, 80 | "tensorflow": tensorflow_reqs, 81 | "dev": dev_requirements, 82 | } 83 | 84 | setup( 85 | author="Ralf Grubenmann", 86 | author_email="ralf.grubenmann@epfl.ch", 87 | classifiers=[ 88 | "Development Status :: 2 - Pre-Alpha", 89 | "Intended Audience :: Developers", 90 | "License :: OSI Approved :: Apache Software License", 91 | "Natural Language :: English", 92 | "Programming Language :: Python :: 3.4", 93 | "Programming Language :: Python :: 3.5", 94 | "Programming Language :: Python :: 3.6", 95 | "Programming Language :: Python :: 3.7", 96 | ], 97 | description="A public and reproducible collection of reference implementations and benchmark suite for distributed machine learning systems.", 98 | entry_points={ 99 | "console_scripts": [ 100 | "mlbench=mlbench_core.cli:cli_group", 101 | ], 102 | }, 103 | install_requires=requirements, 104 | license="Apache Software License 2.0", 105 | long_description=readme + "\n\n" + history, 106 | include_package_data=True, 107 | keywords="mlbench", 108 | name="mlbench_core", 109 | packages=find_packages(), 110 | setup_requires=setup_requirements, 
111 | test_suite="tests", 112 | tests_require=test_requirements, 113 | extras_require=extras, 114 | url="https://github.com/mlbench/mlbench_core", 115 | version="3.0.0-dev23", 116 | zip_safe=False, 117 | ) 118 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlbench/mlbench-core/4fd3c7e6f1a5be69e52383ab2eb64cad257218c2/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_python_optim.py: -------------------------------------------------------------------------------- 1 | """Tests for `mlbench_core.optim.pytorch` package.""" 2 | import pytest 3 | import torch 4 | import torch.distributed as dist 5 | from torch.nn.modules import Linear, MSELoss 6 | from torch.optim import SGD 7 | 8 | from mlbench_core.optim.pytorch.centralized import ( 9 | CentralizedAdam, 10 | CentralizedSGD, 11 | CentralizedSparsifiedSGD, 12 | CustomCentralizedOptimizer, 13 | GenericCentralizedOptimizer, 14 | PowerSGD, 15 | ) 16 | from mlbench_core.optim.pytorch.optim import SignSGD, SparsifiedSGD 17 | 18 | 19 | def test_SparsifiedSGD(): 20 | model = Linear(2, 1) 21 | opt = SparsifiedSGD(model.parameters(), lr=1) 22 | 23 | input_data = torch.Tensor([[1, 2], [3, 4]]) 24 | target = torch.Tensor([[1], [2]]) 25 | 26 | opt.zero_grad() 27 | output = model(input_data) 28 | loss = MSELoss()(output, target) 29 | loss.backward() 30 | opt.step() 31 | 32 | 33 | def test_SignSGD(): 34 | model = Linear(2, 1) 35 | opt = SignSGD(model.parameters(), lr=1) 36 | 37 | input_data = torch.Tensor([[1, 2], [3, 4]]) 38 | target = torch.Tensor([[1], [2]]) 39 | 40 | opt.zero_grad() 41 | output = model(input_data) 42 | loss = MSELoss()(output, target) 43 | loss.backward() 44 | opt.step() 45 | 46 | 47 | def test_GenericCentralizedOptimizer(): 48 | model = Linear(2, 1) 49 | opt = SGD(model.parameters(), lr=1) 50 | c_opt = GenericCentralizedOptimizer(world_size=1, model=model) 51 | c_opt.optimizer = opt 52 | 53 | input_data = torch.Tensor([[1, 2], [3, 4]]) 54 | target = torch.Tensor([[1], [2]]) 55 | 56 | c_opt.zero_grad() 57 | output = model(input_data) 58 | loss = MSELoss()(output, target) 59 | loss.backward() 60 | opt.step() 61 | 62 | 63 | def test_CentralizedSparsifiedSGD(mocker): 64 | dist.init_process_group( 65 | "gloo", world_size=1, init_method="file:///tmp/somefile", rank=0 66 | ) 67 | model = Linear(2, 1, bias=False) 68 | opt = CentralizedSparsifiedSGD(model.parameters(), lr=10, sparse_grad_size=1) 69 | 70 | input_data = torch.Tensor([[1, 2], [3, 4]]) 71 | target = torch.Tensor([[1, 2], [2, 3]]) 72 | 73 | opt.zero_grad() 74 | output = model(input_data) 75 | loss = MSELoss()(output, target) 76 | loss.backward() 77 | opt.step() 78 | dist.destroy_process_group() 79 | 80 | 81 | def test_CentralizedSGD(): 82 | model = Linear(2, 1) 83 | opt = CentralizedSGD(world_size=1, model=model, lr=1) 84 | 85 | input_data = torch.Tensor([[1, 2], [3, 4]]) 86 | target = torch.Tensor([[1], [2]]) 87 | 88 | opt.zero_grad() 89 | output = model(input_data) 90 | loss = MSELoss()(output, target) 91 | loss.backward() 92 | opt.step() 93 | 94 | 95 | def test_CentralizedAdam(): 96 | model = Linear(2, 1) 97 | opt = CentralizedAdam(world_size=1, model=model, lr=1) 98 | 99 | input_data = torch.Tensor([[1, 2], [3, 4]]) 100 | target = torch.Tensor([[1], [2]]) 101 | 102 | opt.zero_grad() 103 | output = model(input_data) 104 | loss = MSELoss()(output, target) 
105 | loss.backward() 106 | opt.step() 107 | 108 | 109 | def test_PowerSGD(): 110 | dist.init_process_group( 111 | "gloo", world_size=1, init_method="file:///tmp/somefile", rank=0 112 | ) 113 | model = Linear(2, 1) 114 | opt = PowerSGD(world_size=1, model=model, lr=1) 115 | 116 | input_data = torch.Tensor([[1, 2], [3, 4]]) 117 | target = torch.Tensor([[1], [2]]) 118 | 119 | opt.zero_grad() 120 | output = model(input_data) 121 | loss = MSELoss()(output, target) 122 | loss.backward() 123 | opt.step() 124 | dist.destroy_process_group() 125 | 126 | 127 | def test_CustomCentralizedOptimizer(): 128 | 129 | model = Linear(2, 1) 130 | opt = SGD(params=model.parameters(), lr=1) 131 | c_opt = CustomCentralizedOptimizer( 132 | world_size=1, model=model, optimizer=opt, average_world=True 133 | ) 134 | 135 | input_data = torch.Tensor([[1, 2], [3, 4]]) 136 | target = torch.Tensor([[1], [2]]) 137 | 138 | c_opt.zero_grad() 139 | output = model(input_data) 140 | loss = MSELoss()(output, target) 141 | loss.backward() 142 | c_opt.step() 143 | -------------------------------------------------------------------------------- /tests/test_pytorch_controlflow.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Tests for `mlbench_core.controlflow.pytorch` package.""" 5 | import itertools 6 | import random 7 | 8 | import pytest 9 | import torch 10 | import torch.nn as nn 11 | import torch.optim as optim 12 | from torch.utils.data import DataLoader 13 | 14 | from mlbench_core.controlflow.pytorch.controlflow import ( 15 | compute_train_batch_metrics, 16 | record_train_batch_stats, 17 | validation_round, 18 | ) 19 | from mlbench_core.controlflow.pytorch.helpers import ( 20 | convert_dtype, 21 | iterate_dataloader, 22 | maybe_range, 23 | ) 24 | from mlbench_core.evaluation.pytorch.metrics import TopKAccuracy 25 | 26 | 27 | @pytest.fixture 28 | def model(): 29 | return nn.Linear(1, 2) 30 | 31 | 32 | @pytest.fixture 33 | def optimizer(model): 34 | return optim.SGD(model.parameters(), lr=0.1) 35 | 36 | 37 | @pytest.fixture 38 | def loss_function(): 39 | return nn.CrossEntropyLoss() 40 | 41 | 42 | @pytest.fixture 43 | def metrics(): 44 | return [TopKAccuracy(topk=1)] 45 | 46 | 47 | def _create_random_sets(): 48 | train_set = [random.random() for _ in range(100)] 49 | train_set = [ 50 | ( 51 | torch.FloatTensor([n * 50 - 25]), 52 | 1 if (n > 0.5) != (random.random() < 0.1) else 0, 53 | ) 54 | for n in train_set 55 | ] 56 | 57 | test_set = [random.random() for _ in range(10)] 58 | test_set = [ 59 | ( 60 | torch.FloatTensor([n * 50 - 25]), 61 | 1 if (n > 0.5) != (random.random() < 0.1) else 0, 62 | ) 63 | for n in test_set 64 | ] 65 | 66 | return train_set, test_set 67 | 68 | 69 | def test_compute_train_metrics(mocker, model, optimizer, loss_function, metrics): 70 | mocker.patch("mlbench_core.utils.pytorch.distributed.dist") 71 | mocker.patch("mlbench_core.utils.tracker.LogMetrics") 72 | 73 | batch_size = 2 74 | 75 | train_set, test_set = _create_random_sets() 76 | train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True) 77 | 78 | for i, (data, target) in enumerate(train_loader): 79 | optimizer.zero_grad() 80 | output = model(data) 81 | loss = loss_function(output, target) 82 | 83 | metric_values = compute_train_batch_metrics(output, target, metrics) 84 | metric_values = [(k, v) for k, v in metric_values.items() if k.name == "Prec@1"] 85 | assert len(metric_values) == 1 86 | 87 | metric, value = metric_values[0] 
88 | 89 | assert value == metrics[0](output, target) 90 | 91 | 92 | def test_validation_round(mocker, model, optimizer, loss_function, metrics): 93 | mocker.patch("mlbench_core.utils.pytorch.distributed.dist") 94 | mocker.patch("mlbench_core.utils.tracker.LogMetrics") 95 | 96 | batch_size = 2 97 | 98 | train_set, test_set = _create_random_sets() 99 | train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True) 100 | test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False) 101 | 102 | for data, target in train_loader: 103 | optimizer.zero_grad() 104 | output = model(data) 105 | loss = loss_function(output, target) 106 | 107 | loss.backward() 108 | optimizer.step() 109 | 110 | metric_values, loss_values = validation_round( 111 | test_loader, 112 | model=model, 113 | loss_function=loss_function, 114 | metrics=metrics, 115 | dtype="fp32", 116 | ) 117 | 118 | assert "Prec@1" in [m.name for m in metric_values] 119 | 120 | 121 | def test_maybe_range(): 122 | r = maybe_range(10) 123 | 124 | assert len(r) == 10 125 | assert r == range(10) 126 | 127 | r = maybe_range(None) 128 | 129 | assert isinstance(r, itertools.count) 130 | assert next(r) == 0 131 | assert next(r) == 1 132 | 133 | 134 | def test_convert_dtype(): 135 | t = torch.IntTensor([0]) 136 | 137 | tt = convert_dtype("fp32", t) 138 | 139 | assert tt.dtype == torch.float32 140 | 141 | tt2 = convert_dtype("fp64", t) 142 | 143 | assert tt2.dtype == torch.float64 144 | 145 | with pytest.raises(NotImplementedError): 146 | tt3 = convert_dtype("int", t) 147 | 148 | 149 | def test_iterate_dataloader(mocker): 150 | dataloader = [ 151 | (torch.IntTensor([0]), torch.IntTensor([1])), 152 | (torch.IntTensor([2]), torch.IntTensor([3])), 153 | ] 154 | 155 | it = iterate_dataloader( 156 | dataloader, "fp32", max_batch_per_epoch=2, transform_target_type=True 157 | ) 158 | 159 | first = next(it) 160 | 161 | assert first[0].dtype == torch.float32 162 | assert first[1].dtype == torch.float32 163 | assert first[0].data.item() == 0.0 164 | assert first[1].item() == 1.0 165 | 166 | second = next(it) 167 | 168 | assert second[0].dtype == torch.float32 169 | assert second[1].dtype == torch.float32 170 | assert second[0].data.item() == 2.0 171 | assert second[1].item() == 3.0 172 | -------------------------------------------------------------------------------- /tests/test_pytorch_helpers.py: -------------------------------------------------------------------------------- 1 | """Tests for `mlbench_core.utils.pytorch.helpers` package.""" 2 | 3 | from mlbench_core.utils.pytorch.helpers import config_path, config_pytorch 4 | 5 | 6 | def test_config_pytorch(mocker): 7 | mocker.patch("torch.distributed.get_rank", return_value=1) 8 | mocker.patch("torch.distributed.get_world_size", return_value=1) 9 | mocker.patch("mlbench_core.utils.pytorch.helpers.FCGraph") 10 | 11 | rank, world_size, graph = config_pytorch( 12 | use_cuda=False, seed=42, cudnn_deterministic=True 13 | ) 14 | 15 | assert rank == 1 16 | assert world_size == 1 17 | assert graph is not None 18 | 19 | 20 | def test_config_path(mocker): 21 | sh = mocker.patch("shutil.rmtree") 22 | osmk = mocker.patch("os.makedirs") 23 | 24 | config_path("/tmp/checkpoints", delete_existing_ckpts=False) 25 | 26 | osmk.assert_called_once_with("/tmp/checkpoints", exist_ok=True) 27 | assert sh.call_count == 0 28 | 29 | config_path("/tmp/checkpoints", delete_existing_ckpts=True) 30 | 31 | assert sh.call_count == 1 32 | assert osmk.call_count == 2 33 | 
-------------------------------------------------------------------------------- /tests/test_pytorch_metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from mlbench_core.evaluation.pytorch.metrics import ( 5 | BLEUScore, 6 | DiceCoefficient, 7 | F1Score, 8 | Perplexity, 9 | TopKAccuracy, 10 | ) 11 | 12 | 13 | def test_f1_score(): 14 | output = torch.tensor([1, 1, 1, 1, 1]).reshape(5, 1) 15 | target = torch.tensor([0, 0, 0, 0, 0]).reshape(5, 1) 16 | 17 | f1 = F1Score() 18 | score = f1(output, target) 19 | 20 | assert score.item() == 0 21 | 22 | output = torch.tensor([1, 1, 1, 0, 1]).reshape(5, 1) 23 | target = torch.tensor([1, 0, 1, 1, 0]).reshape(5, 1) 24 | 25 | precision = 2 / (2 + 2) 26 | recall = 2 / (2 + 1) 27 | 28 | score = f1(output, target) 29 | expected_score = 2 * (precision * recall) / (precision + recall) 30 | np.testing.assert_almost_equal(score.item(), expected_score) 31 | 32 | 33 | def test_top1_accuracy(): 34 | output_1 = torch.tensor([[0, 1], [0, 1], [1, 0], [0, 1], [1, 0]]).reshape(5, 2) 35 | output_2 = torch.tensor([1, 1, 0, 1, 0]).reshape(5, 1) 36 | target = torch.tensor([0, 1, 0, 0, 1]).reshape(5, 1) 37 | 38 | acc = TopKAccuracy(topk=1) 39 | expected_score = (2 / 5) * 100 40 | 41 | actual_score_1 = acc(output_1, target) 42 | actual_score_2 = acc(output_2, target) 43 | 44 | assert actual_score_1 == expected_score 45 | assert actual_score_2 == expected_score 46 | 47 | 48 | def test_top3_accuracy(): 49 | output_1 = torch.tensor( 50 | [ 51 | [0.2, 0.2, 0.3, 0.1], 52 | [0.15, 0.2, 0.05, 0.6], 53 | [0.25, 0.3, 0.15, 0.3], 54 | [0.3, 0.1, 0.2, 0.2], 55 | [0.15, 0.15, 0.2, 0.5], 56 | ] 57 | ).reshape(5, 4) 58 | target = torch.tensor([3, 1, 0, 2, 1]).reshape(5, 1) 59 | 60 | acc = TopKAccuracy(topk=3) 61 | expected_score = (3 / 5) * 100 62 | 63 | actual_score_1 = acc(output_1, target) 64 | 65 | assert actual_score_1 == expected_score 66 | 67 | 68 | def test_perplexity(): 69 | target = torch.randint(high=1000, size=(100, 1)) 70 | outputs = torch.randn((100, 1000, 1)) 71 | 72 | true_ppl = torch.exp(torch.nn.functional.cross_entropy(outputs, target)) 73 | ppl = Perplexity() 74 | ppl_score = ppl(outputs, target) 75 | 76 | assert ppl_score == true_ppl 77 | 78 | 79 | def test_dice_coefficient(): 80 | target = torch.Tensor([1, 1, 1, 0, 0, 1]).view(-1, 1) 81 | output = torch.Tensor([0.2, 0.6, 0.1, 0.15, 0.1, 0.8]).view(-1, 1) 82 | 83 | dice = DiceCoefficient() 84 | 85 | loss = dice(output, target).item() 86 | 87 | assert round(loss, 1) == 0.6 88 | 89 | 90 | def test_raw_bleu_score(): 91 | outputs = ["the quick yellow fox jumps over the active dog"] 92 | target = ["the quick brown fox jumps over the lazy dog"] 93 | 94 | bl = BLEUScore(use_raw=True) 95 | score = bl(outputs, target) 96 | 97 | assert round(score.item(), 1) == 36.9 98 | -------------------------------------------------------------------------------- /tests/test_pytorch_models.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | from mlbench_core.models.pytorch.linear_models import * 5 | from mlbench_core.models.pytorch.resnet import * 6 | 7 | 8 | def test_resnet18(): 9 | resnet = resnet18_bkj(1000) 10 | 11 | inp = torch.rand(2, 3, 32, 32) 12 | 13 | outp = resnet(inp) 14 | 15 | assert outp is not None 16 | assert outp.shape[0] == 2 17 | assert outp.shape[1] == 1000 18 | 19 | resnet = resnet18_bkj(500) 20 | 21 | inp = torch.rand(3, 3, 32, 32) 22 | 23 | outp = 
resnet(inp) 24 | 25 | assert outp is not None 26 | assert outp.shape[0] == 3 27 | assert outp.shape[1] == 500 28 | 29 | 30 | def test_resnet20(): 31 | resnet = get_resnet_model("resnet20", 1, "fp32") 32 | 33 | inp = torch.rand(2, 3, 32, 32) 34 | 35 | outp = resnet(inp) 36 | 37 | assert outp is not None 38 | assert outp.shape[0] == 2 39 | assert outp.shape[1] == 10 40 | 41 | resnet = get_resnet_model("resnet20", 1, "fp32") 42 | 43 | inp = torch.rand(3, 3, 32, 32) 44 | 45 | outp = resnet(inp) 46 | 47 | assert outp is not None 48 | assert outp.shape[0] == 3 49 | assert outp.shape[1] == 10 50 | 51 | 52 | def test_resnet20v2(): 53 | resnet = get_resnet_model("resnet20", 2, "fp32") 54 | 55 | inp = torch.rand(2, 3, 32, 32) 56 | 57 | outp = resnet(inp) 58 | 59 | assert outp is not None 60 | assert outp.shape[0] == 2 61 | assert outp.shape[1] == 10 62 | 63 | resnet = get_resnet_model("resnet20", 2, "fp32") 64 | 65 | inp = torch.rand(3, 3, 32, 32) 66 | 67 | outp = resnet(inp) 68 | 69 | assert outp is not None 70 | assert outp.shape[0] == 3 71 | assert outp.shape[1] == 10 72 | 73 | 74 | def test_linear_regression(): 75 | lr = LinearRegression(10) # Linear regression with 10 features 76 | inp = torch.rand(100, 10) 77 | 78 | output = lr(inp) 79 | assert output is not None 80 | assert output.shape[0] == 100 81 | assert output.shape[1] == 1 82 | 83 | 84 | def test_logistic_regression(): 85 | log = LogisticRegression(10) 86 | 87 | inp = torch.rand(100, 10) 88 | 89 | output = log(inp) 90 | assert output is not None 91 | assert output.shape[0] == 100 92 | assert output.shape[1] == 1 93 | -------------------------------------------------------------------------------- /tests/test_pytorch_schedulers.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import pytest 4 | import torch 5 | 6 | from mlbench_core.lr_scheduler.pytorch.lr import ( 7 | LRLinearWarmUp, 8 | MultiStepLRLinearWarmUp, 9 | SQRTTimeDecayLR, 10 | TimeDecayLR, 11 | ) 12 | 13 | 14 | def test_linear_warmup_1(): 15 | """Tests Linear Warmup LR""" 16 | init_lr = 0 17 | scaled_lr = 10 18 | warmup_duration = 5 19 | params = torch.nn.Parameter(torch.Tensor([1, 2, 3])) 20 | opt = torch.optim.SGD([params], lr=scaled_lr) 21 | 22 | scheduler = LRLinearWarmUp( 23 | optimizer=opt, 24 | init_lr=init_lr, 25 | scaled_lr=scaled_lr, 26 | warmup_duration=warmup_duration, 27 | ) 28 | 29 | lrs = [0, 2, 4, 6, 8, 10, 10] 30 | for i in range(7): 31 | last_lr = scheduler.get_last_lr()[0] 32 | assert last_lr == lrs[i] 33 | scheduler.step() 34 | 35 | 36 | def test_linear_warmup_2(): 37 | """Tests Linear Warmup LR""" 38 | init_lr = 10 39 | scaled_lr = 10 40 | warmup_duration = 5 41 | params = torch.nn.Parameter(torch.Tensor([1, 2, 3])) 42 | opt = torch.optim.SGD([params], lr=scaled_lr) 43 | 44 | scheduler = LRLinearWarmUp( 45 | optimizer=opt, 46 | init_lr=init_lr, 47 | scaled_lr=scaled_lr, 48 | warmup_duration=warmup_duration, 49 | ) 50 | 51 | for i in range(7): 52 | last_lr = scheduler.get_last_lr()[0] 53 | assert last_lr == scaled_lr 54 | scheduler.step() 55 | 56 | 57 | def test_multi_step_lr(): 58 | """Tests Multi step LR without warmup""" 59 | scaled_lr = 10 60 | params = torch.nn.Parameter(torch.Tensor([1, 2, 3])) 61 | opt = torch.optim.SGD([params], lr=scaled_lr) 62 | 63 | scheduler = MultiStepLRLinearWarmUp( 64 | optimizer=opt, scaled_lr=scaled_lr, gamma=0.5, milestones=[2, 3] 65 | ) 66 | 67 | lrs = [10, 10, 5, 2.5] 68 | for i in range(4): 69 | last_lr = scheduler.get_last_lr()[0] 70 | assert last_lr == lrs[i] 
71 | scheduler.step() 72 | 73 | 74 | def test_multi_step_lin_warmup(): 75 | """Tests Multistep LR with linear warmup""" 76 | init_lr = 0 77 | scaled_lr = 10 78 | warmup_duration = 5 79 | params = torch.nn.Parameter(torch.Tensor([1, 2, 3])) 80 | opt = torch.optim.SGD([params], lr=scaled_lr) 81 | 82 | scheduler = MultiStepLRLinearWarmUp( 83 | optimizer=opt, 84 | warmup_init_lr=init_lr, 85 | scaled_lr=scaled_lr, 86 | warmup_duration=warmup_duration, 87 | gamma=0.5, 88 | milestones=[7, 8], 89 | ) 90 | 91 | lrs = [0, 2, 4, 6, 8, 10, 10, 5, 2.5] 92 | for i in range(9): 93 | last_lr = scheduler.get_last_lr()[0] 94 | assert last_lr == lrs[i] 95 | scheduler.step() 96 | 97 | 98 | def test_time_decay_lr(): 99 | """Tests Time Decay LR""" 100 | lr = 10 101 | beta = 1 102 | params = torch.nn.Parameter(torch.Tensor([1, 2, 3])) 103 | opt = torch.optim.SGD([params], lr=lr) 104 | 105 | scheduler = TimeDecayLR(optimizer=opt, beta=beta) 106 | 107 | for i in range(10): 108 | true_lr = lr / (i + beta) 109 | last_lr = scheduler.get_last_lr()[0] 110 | assert last_lr == pytest.approx(true_lr) 111 | scheduler.step() 112 | 113 | 114 | def test_sqrt_time_decay_lr(): 115 | """Tests SQRT Time Decay LR""" 116 | lr = 10 117 | params = torch.nn.Parameter(torch.Tensor([1, 2, 3])) 118 | opt = torch.optim.SGD([params], lr=lr) 119 | 120 | scheduler = SQRTTimeDecayLR(optimizer=opt) 121 | 122 | for i in range(10): 123 | true_lr = lr / math.sqrt(max(1, i)) 124 | last_lr = scheduler.get_last_lr()[0] 125 | assert last_lr == pytest.approx(true_lr) 126 | scheduler.step() 127 | -------------------------------------------------------------------------------- /tests/test_pytorch_utils.py: -------------------------------------------------------------------------------- 1 | """Tests for `mlbench_core.utils.pytorch.utils` package.""" 2 | import torch 3 | 4 | from mlbench_core.utils.pytorch.utils import orthogonalize, pack_tensors, unpack_tensors 5 | 6 | 7 | def test_orthogonalize(): 8 | m = torch.rand(2, 2) 9 | identity = torch.eye(2) 10 | 11 | orthogonalize(m) 12 | 13 | # check if m'*m = I 14 | assert torch.allclose(torch.matmul(m.t(), m), identity, atol=1e-04) 15 | 16 | 17 | def test_pack_tensors(): 18 | tensors = [torch.rand(2, 2), torch.rand(2, 2)] 19 | 20 | flattened = [y for x in tensors for y in x.view(-1)] 21 | 22 | vec, indices, sizes = pack_tensors(tensors) 23 | 24 | assert vec.tolist() == flattened 25 | assert indices == [0, 4, 8] 26 | assert sizes == [(2, 2), (2, 2)] 27 | 28 | 29 | def test_unpack_tensors(): 30 | tensors = [torch.rand(2, 2), torch.rand(2, 2)] 31 | vec, indices, sizes = pack_tensors(tensors) 32 | 33 | unpacked = unpack_tensors(vec, indices, sizes) 34 | 35 | assert all((x == y).all() for x, y in zip(tensors, unpacked)) 36 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | """Tests for `mlbench_core.utils` package.""" 2 | 3 | import datetime 4 | 5 | from freezegun import freeze_time 6 | 7 | from mlbench_core.evaluation.goals import task1_time_to_accuracy_light_goal 8 | from mlbench_core.evaluation.pytorch.metrics import TopKAccuracy 9 | from mlbench_core.utils import LogMetrics, Tracker 10 | 11 | 12 | def test_tracker(): 13 | tracker = Tracker([TopKAccuracy(5)], 1, 0) 14 | 15 | assert tracker is not None 16 | 17 | 18 | def test_tracker_goal(mocker): 19 | patched = mocker.patch("mlbench_core.utils.tracker.LogMetrics") 20 | 21 | metric = TopKAccuracy(1) 22 | tracker = 
Tracker([metric], 1, 0, task1_time_to_accuracy_light_goal()) 23 | 24 | tracker.start() 25 | 26 | assert tracker.start_time is not None 27 | 28 | tracker.train() 29 | 30 | tracker.record_stat("global_Prec@1", 69, log_to_api=True) 31 | tracker.batch_end() 32 | 33 | assert not tracker.goal_reached 34 | 35 | tracker.record_stat("global_Prec@1", 70, log_to_api=True) 36 | tracker.batch_end() 37 | 38 | assert not tracker.goal_reached 39 | 40 | tracker.validation() 41 | 42 | tracker.record_stat("global_Prec@1", 69, log_to_api=True) 43 | tracker.batch_end() 44 | 45 | assert not tracker.goal_reached 46 | 47 | tracker.record_stat("global_Prec@1", 70, log_to_api=True) 48 | 49 | assert tracker.goal_reached 50 | 51 | 52 | def _do_batch(tracker, frozen): 53 | tracker.batch_start() 54 | frozen.tick(delta=datetime.timedelta(seconds=0.5)) 55 | tracker.record_batch_load() 56 | frozen.tick(delta=datetime.timedelta(seconds=0.5)) 57 | tracker.record_batch_init() 58 | frozen.tick(delta=datetime.timedelta(seconds=0.5)) 59 | tracker.record_batch_fwd_pass() 60 | frozen.tick(delta=datetime.timedelta(seconds=0.5)) 61 | tracker.record_batch_comp_loss() 62 | frozen.tick(delta=datetime.timedelta(seconds=0.5)) 63 | tracker.record_batch_backprop() 64 | frozen.tick(delta=datetime.timedelta(seconds=0.5)) 65 | tracker.record_batch_agg() 66 | frozen.tick(delta=datetime.timedelta(seconds=0.5)) 67 | tracker.record_batch_opt_step() 68 | frozen.tick(delta=datetime.timedelta(seconds=0.5)) 69 | tracker.record_batch_comp_metrics() 70 | frozen.tick(delta=datetime.timedelta(seconds=0.5)) 71 | tracker.batch_end() 72 | 73 | 74 | def test_tracker_goal_times(mocker): 75 | patched = mocker.patch("mlbench_core.utils.tracker.LogMetrics") 76 | 77 | metric = TopKAccuracy(1) 78 | tracker = Tracker([metric], 1, 0, task1_time_to_accuracy_light_goal()) 79 | 80 | tracker.start() 81 | 82 | assert tracker.start_time is not None 83 | 84 | tracker.train() 85 | 86 | with freeze_time(datetime.datetime.now()) as frozen: 87 | _do_batch(tracker, frozen) 88 | 89 | assert abs(tracker.get_total_preprocess_time() - 0.5) < 0.01 90 | assert abs(tracker.get_total_communication_time() - 0.5) < 0.01 91 | assert abs(tracker.get_total_compute_time() - 2.0) < 0.01 92 | assert abs(tracker.get_total_metrics_time() - 0.5) < 0.01 93 | 94 | _do_batch(tracker, frozen) 95 | 96 | assert abs(tracker.get_total_preprocess_time() - 1.0) < 0.01 97 | assert abs(tracker.get_total_communication_time() - 1.0) < 0.01 98 | assert abs(tracker.get_total_compute_time() - 4.0) < 0.01 99 | assert abs(tracker.get_total_metrics_time() - 1.0) < 0.01 100 | 101 | tracker.validation() 102 | tracker.record_stat("global_Prec@1", 70, log_to_api=True) 103 | 104 | assert tracker.goal_reached 105 | assert any(filter(lambda c: c[1][3] == "TaskResult", patched.method_calls)) 106 | 107 | 108 | def test_LogMetrics(mocker): 109 | mocker.patch("mlbench_core.api.ApiClient") 110 | 111 | LogMetrics.log("1", 1, 1, "loss", 123) 112 | 113 | mocker.patch.dict("os.environ", {"MLBENCH_IN_DOCKER": "True"}) 114 | 115 | LogMetrics.log("1", 1, 1, "loss", 123) 116 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py37, lint, docs 3 | 4 | [default] 5 | basepython = python3.7 6 | 7 | deps = 8 | .[test] 9 | 10 | setenv = 11 | PYTHONPATH = {toxinidir} 12 | 13 | [testenv] 14 | description = run tests 15 | 16 | basepython = 17 | py37: python3.7 18 | 19 | pypy3: pypy3 20 | 21 | deps = 
22 | {[default]deps} 23 | 24 | setenv = 25 | {[default]setenv} 26 | 27 | passenv = CI TRAVIS TRAVIS_* 28 | 29 | commands = 30 | pytest --cov=./mlbench_core/ 31 | codecov 32 | 33 | 34 | [testenv:docs] 35 | basepython=python 36 | changedir={toxinidir}/docs 37 | deps= 38 | -rdocs/requirements.txt 39 | commands= 40 | sphinx-build -W -b html -d _build/doctrees . _build/html 41 | 42 | 43 | [testenv:lint] 44 | 45 | description = run Black (linter) and isort (import sorter) 46 | 47 | basepython = {[default]basepython} 48 | 49 | skip_install = True 50 | 51 | deps = 52 | .[lint] 53 | 54 | setenv = 55 | BLACK_LINT_ARGS=--check 56 | 57 | commands = 58 | black {env:BLACK_LINT_ARGS:} . 59 | isort --check-only . 60 | 61 | [tool:isort] 62 | ; black's default line length 63 | line_length = 88 64 | multi_line_output = 3 65 | include_trailing_comma = True 66 | known_first_party = mlbench_core 67 | known_third_party =PIL,appdirs,boto3,botocore,click,cv2,deprecation,dill,docker,docutils,freezegun,gensidebar,google,kubernetes,lmdb,matplotlib,mosestokenizer,numpy,pyhelm,pytest,requests,setuptools,six,sklearn,sphinx,tabulate,tensorflow,tensorpack,torch,torchtext,torchvision,tqdm,urllib3,yaml 68 | --------------------------------------------------------------------------------