├── .coveragerc ├── .github └── workflows │ ├── canary.yml │ ├── deploy-website.yml │ ├── python-publish.yml │ └── test.yml ├── .gitignore ├── .pylintrc ├── CODE_OF_CONDUCT.md ├── Dockerfile ├── LICENSE ├── README.md ├── SECURITY.md ├── SUPPORT.md ├── VERSION_NOTES.md ├── examples ├── cifar_image_classification │ ├── .images │ │ └── tensorboard_screenshot.jpg │ ├── CIFAR.ipynb │ ├── README.md │ ├── cifar.py │ ├── config.yaml │ └── requirements.txt ├── cnndailymail_text_summarization │ ├── ORT_README.md │ ├── azureml │ │ ├── Dockerfile │ │ └── submit_ortds.py │ ├── config-ortds.yaml │ ├── config-prod.yaml │ ├── config.yaml │ ├── data.py │ ├── deepspeed_methods │ │ ├── __init__.py │ │ ├── deepspeedConfig.json │ │ ├── deepspeed_trainer.py │ │ ├── deepspeed_trainer_backend.py │ │ └── deepspeed_utils.py │ ├── images │ │ └── tensorboard_screenshot_bart.jpg │ ├── infer.py │ ├── model_ortds.py │ ├── readme.md │ ├── requirements.txt │ ├── train.py │ └── train_ortds.py ├── covid19_text_classification │ ├── azureml │ │ ├── dockerfile │ │ └── submit.py │ ├── config.yaml │ ├── data.py │ ├── readme.md │ ├── requirements.txt │ └── train.py ├── germ_text_ner │ ├── GermEvalAML.ipynb │ ├── config_germ.yaml │ ├── readme.md │ ├── requirements.txt │ ├── test.py │ ├── train_germ │ │ └── train.tsv │ └── val_germ │ │ └── dev.tsv ├── glue_text_benchmark │ ├── Dockerfile │ ├── README.md │ ├── configs-roberta-base │ │ ├── cola.yaml │ │ ├── mnli.yaml │ │ ├── mrpc.yaml │ │ ├── qnli.yaml │ │ ├── qqp.yaml │ │ ├── rte.yaml │ │ ├── sst2.yaml │ │ └── stsb.yaml │ ├── images │ │ └── tensorboard_screenshot.jpg │ ├── logs_roberta_base │ │ └── rte │ │ │ └── from_pretrained │ │ │ └── events.out.tfevents.1623336412.krishan-vm.20548.0 │ ├── requirements.txt │ └── src │ │ ├── data.py │ │ ├── infer.py │ │ └── train.py ├── readme.md └── snli_benchmark │ ├── SNLI.ipynb │ ├── configs-bert-base │ └── snli.yaml │ └── src │ ├── data.py │ └── train.py ├── pymarlin ├── __init__.py ├── core │ ├── __init__.py │ ├── data_interface.py │ ├── module_interface.py │ ├── trainer.py │ └── trainer_backend.py ├── plugins │ ├── __init__.py │ ├── base.py │ ├── hf_ner │ │ ├── __init__.py │ │ ├── config_germ.yaml │ │ ├── data_classes.py │ │ ├── implementation.py │ │ ├── module_classes.py │ │ └── sequence_labelling_metrics.py │ ├── hf_seq2seq │ │ ├── __init__.py │ │ ├── data_classes.py │ │ ├── implementation.py │ │ ├── metric_utils.py │ │ └── module_classes.py │ ├── hf_seq_classification │ │ ├── __init__.py │ │ ├── config.yaml │ │ ├── data_classes.py │ │ ├── implementation.py │ │ ├── metric_utils.py │ │ └── module_classes.py │ ├── hfdistill_utils.py │ └── plugin_module_interface.py └── utils │ ├── __init__.py │ ├── checkpointer │ ├── __init__.py │ └── checkpoint_utils.py │ ├── config_parser │ ├── __init__.py │ └── custom_arg_parser.py │ ├── differential_privacy.py │ ├── distributed.py │ ├── fabrics.py │ ├── logger │ ├── __init__.py │ └── logging_utils.py │ ├── misc │ ├── __init__.py │ └── misc_utils.py │ ├── stats │ ├── __init__.py │ └── basic_stats.py │ └── writer │ ├── __init__.py │ ├── aml.py │ ├── base.py │ ├── stdout.py │ └── tensorboard.py ├── pyproject.toml ├── pytest.ini ├── setup.py ├── tests ├── core │ ├── test_data_interface.py │ ├── test_trainer.py │ └── test_trainer_backend.py ├── test_sanity.py └── utils │ ├── config.yaml │ ├── corrupt_files │ └── config.yaml │ ├── test_checkpointer.py │ ├── test_config_parser.py │ └── test_stats.py └── website ├── .gitignore ├── README.md ├── UML ├── diagrams │ ├── out │ │ ├── classes.png │ │ ├── classes.svg 
│ │ ├── classification_data_processing.png │ │ ├── classification_data_processing.svg │ │ ├── classification_train.png │ │ ├── classification_train.svg │ │ ├── classifier.png │ │ ├── classifier.svg │ │ ├── training_lifecycle.png │ │ └── training_lifecycle.svg │ └── src │ │ ├── classes.pu │ │ ├── classifier.pu │ │ └── train_manager_sequence.pu ├── make.bat ├── plantuml.jar └── readme.md ├── babel.config.js ├── docs ├── contributing.md ├── examples │ ├── checkpointing.md │ ├── cifar.md │ ├── classification.md │ ├── datamodule-example.md │ ├── distillation.md │ ├── glue-tasks.md │ ├── images │ │ ├── cifar.png │ │ ├── tb.jpg │ │ └── tensorboard_screenshot_bart.jpg │ └── summarization.md ├── getting-started.md ├── installation.md ├── marlin-in-pictures.md ├── plugins │ ├── hf_ner.md │ ├── hf_seq_classification.md │ └── images │ │ ├── hfner │ │ └── ner_dataset_mod.png │ │ └── hfseqclass │ │ ├── loss.jpg │ │ ├── loss.png │ │ ├── lr.jpg │ │ ├── lr.png │ │ ├── train_metrics.jpg │ │ └── train_metrics.png └── utils │ ├── images │ └── tb_example.jpg │ └── stats.md ├── docusaurus.config.js ├── package.json ├── pydoc-markdown.yml ├── sidebars.js ├── src ├── components │ ├── HomepageFeatures.js │ └── HomepageFeatures.module.css ├── css │ └── custom.css └── pages │ ├── index.js │ ├── index.module.css │ └── markdown-page.md ├── static ├── .nojekyll └── img │ ├── docusaurus.png │ ├── favicon.ico │ ├── logo.svg │ ├── tutorial │ ├── docsVersionDropdown.png │ └── localeDropdown.png │ ├── undraw_docusaurus_mountain.svg │ ├── undraw_docusaurus_react.svg │ └── undraw_docusaurus_tree.svg └── yarn.lock /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = 3 | # omit everything under plugins for now 4 | pymarlin/plugins/* 5 | 6 | [report] 7 | # Regexes for lines to exclude from consideration 8 | exclude_lines = 9 | # exclude abstract functions that will most likely never get run anyway 10 | pass -------------------------------------------------------------------------------- /.github/workflows/canary.yml: -------------------------------------------------------------------------------- 1 | name: azureml canary 2 | 3 | on: 4 | workflow_dispatch: 5 | schedule: 6 | - cron: '5 8 * * 0' # runs once a week at 08:05 on day 0 (Sunday) 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: check out repo 13 | uses: actions/checkout@v2 14 | - name: setup python 15 | uses: actions/setup-python@v2 16 | with: 17 | python-version: "3.8" 18 | - name: requirements 19 | run: pip install azureml-sdk>=1.20.0 20 | - name: azure login 21 | uses: azure/login@v1 22 | with: 23 | creds: ${{secrets.AZURE_CREDENTIALS}} 24 | - name: release canary 25 | run: | 26 | cd examples/covid19_text_classification/azureml/ 27 | python submit.py --backend ddp-amp --process_count 2 --wait \ 28 | --subscription_id ${{secrets.SUBSCRIPTION_ID}} --resource_group ${{secrets.RESOURCE_GROUP}} \ 29 | --workspace_name ${{secrets.WORKSPACE_NAME}} -------------------------------------------------------------------------------- /.github/workflows/deploy-website.yml: -------------------------------------------------------------------------------- 1 | name: docs 2 | 3 | on: 4 | pull_request: 5 | branches: [main] 6 | push: 7 | branches: [main] 8 | workflow_dispatch: 9 | 10 | jobs: 11 | checks: 12 | if: github.event_name != 'push' 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v1 16 | - uses: actions/setup-node@v1 17 | with: 18 | node-version: '12.x' 19 | - name: 
setup python 20 | uses: actions/setup-python@v2 21 | with: 22 | python-version: "3.8" 23 | - name: pydoc-markdown install 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install docspec-python==0.1.0 27 | pip install pydoc-markdown 28 | - name: pydoc-markdown run 29 | run: | 30 | cd website 31 | pydoc-markdown 32 | - name: Test Build 33 | run: | 34 | cd website 35 | if [ -e yarn.lock ]; then 36 | yarn install --frozen-lockfile 37 | elif [ -e package-lock.json ]; then 38 | npm ci 39 | else 40 | npm i 41 | fi 42 | npm run build 43 | gh-release: 44 | if: github.event_name != 'pull_request' 45 | runs-on: ubuntu-latest 46 | steps: 47 | - uses: actions/checkout@v1 48 | - uses: actions/setup-node@v1 49 | with: 50 | node-version: '12.x' 51 | - name: setup python 52 | uses: actions/setup-python@v2 53 | with: 54 | python-version: "3.8" 55 | - name: pydoc-markdown install 56 | run: | 57 | python -m pip install --upgrade pip 58 | pip install docspec-python==0.1.0 59 | pip install pydoc-markdown 60 | - name: pydoc-markdown run 61 | run: | 62 | cd website 63 | pydoc-markdown 64 | - name: Add key to allow access to repository 65 | env: 66 | SSH_AUTH_SOCK: /tmp/ssh_agent.sock 67 | run: | 68 | mkdir -p ~/.ssh 69 | ssh-keyscan github.com >> ~/.ssh/known_hosts 70 | echo "${{ secrets.GH_PAGES_DEPLOY }}" > ~/.ssh/id_rsa 71 | chmod 600 ~/.ssh/id_rsa 72 | cat <<EOT >> ~/.ssh/config 73 | Host github.com 74 | HostName github.com 75 | IdentityFile ~/.ssh/id_rsa 76 | EOT 77 | - name: Release to GitHub Pages 78 | env: 79 | USE_SSH: true 80 | GIT_USER: git 81 | run: | 82 | git config --global user.email "actions@github.com" 83 | git config --global user.name "gh-actions" 84 | cd website 85 | if [ -e yarn.lock ]; then 86 | yarn install --frozen-lockfile 87 | elif [ -e package-lock.json ]; then 88 | npm ci 89 | else 90 | npm i 91 | fi 92 | yarn deploy -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow builds and uploads the Python package using Twine on every push to main 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | 5 | name: pypi 6 | 7 | on: 8 | push: 9 | branches: [main] 10 | jobs: 11 | deploy: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Python 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: '3.8' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | python -m pip install --upgrade build 25 | pip install setuptools wheel twine 26 | - name: Build 27 | run: | 28 | python -m build 29 | 30 | - name: Publish to TestPyPi 31 | env: 32 | TWINE_USERNAME: ${{ secrets.TEST_PYPI_USERNAME }} 33 | TWINE_PASSWORD: ${{ secrets.TEST_PYPI_PASSWORD }} 34 | run: | 35 | python -m twine upload --repository testpypi dist/* --skip-existing 36 | 37 | - name: Publish to PyPi 38 | env: 39 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 40 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 41 | run: | 42 | python -m twine upload dist/* --skip-existing 43 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of 
Python 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: lint & test 5 | 6 | on: 7 | workflow_dispatch: 8 | push: 9 | branches: [ main ] 10 | pull_request: 11 | branches: [ main ] 12 | 13 | jobs: 14 | build: 15 | 16 | runs-on: ubuntu-latest 17 | 18 | steps: 19 | - uses: actions/checkout@v2 20 | - name: Set up Python 3.8 21 | uses: actions/setup-python@v2 22 | with: 23 | python-version: 3.8 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install torch==1.7.1+cpu -f https://download.pytorch.org/whl/torch_stable.html 28 | pip install -e .[dev] 29 | - name: lint 30 | run: | 31 | pylint pymarlin --rcfile=.pylintrc 32 | - name: test with coverage 33 | run: | 34 | pytest --cov=pymarlin --cov-report=xml --cov-config=.coveragerc 35 | - name: Upload coverage to Codecov 36 | uses: codecov/codecov-action@v1 37 | with: 38 | files: ./coverage.xml 39 | directory: ./coverage/reports/ 40 | flags: unittests 41 | env_vars: OS,PYTHON 42 | name: codecov-umbrella 43 | fail_ci_if_error: true 44 | path_to_write_report: ./coverage/codecov_report.txt 45 | verbose: true 46 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # logs 132 | logs/ 133 | 134 | # vscode 135 | .vscode/ 136 | 137 | # pydoc-markdown auto-generated reference docs 138 | /website/docs/reference/** 139 | 140 | # AzureML Workspace Config JSON files 141 | config.json 142 | 143 | # local snapshots of pymarlin submitted to azureml 144 | /examples/*/pymarlin 145 | 146 | # don't check in data as it normally comes with its own restrictive license 147 | /examples/*/data 148 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM mcr.microsoft.com/azureml/base:openmpi3.1.2-ubuntu18.04 2 | RUN apt-get update 3 | 4 | 5 | # create conda environment 6 | RUN conda update -n base -c defaults conda -y 7 | RUN conda create -n marlin python=3.8 -y 8 | RUN echo ". /opt/miniconda/etc/profile.d/conda.sh" >> ~/.bashrc 9 | 10 | # install latest torch 11 | # CUDA toolkit versions other than 10.2 make GPUs invisible (base image issue) 12 | RUN conda install pytorch cudatoolkit=10.2 -c pytorch -y -n marlin 13 | 14 | ADD . /workdir 15 | WORKDIR /workdir 16 | 17 | RUN /opt/miniconda/envs/marlin/bin/pip install -U -e . -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PyMarlin, a lightweight PyTorch library for agile deep learning! 2 | [![Unit Tests](https://github.com/microsoft/PyMarlin/actions/workflows/test.yml/badge.svg)](https://github.com/microsoft/PyMarlin/actions/workflows/test.yml) 3 | [![codecov](https://codecov.io/gh/microsoft/PyMarlin/branch/main/graph/badge.svg?token=wUF3ZODLpN)](https://codecov.io/gh/microsoft/PyMarlin) 4 | [![Docs](https://github.com/microsoft/PyMarlin/actions/workflows/deploy-website.yml/badge.svg)](https://microsoft.github.io/PyMarlin/) 5 | [![AzureML Canary](https://github.com/microsoft/PyMarlin/actions/workflows/canary.yml/badge.svg)](https://github.com/microsoft/PyMarlin/actions/workflows/canary.yml) 6 | [![pypi](https://img.shields.io/pypi/v/pymarlin)](https://pypi.org/project/pymarlin/) 7 | 8 | PyMarlin was developed with the goal of simplifying the E2E deep learning experimentation lifecycle for data scientists using PyTorch. The library enables an agile way to quickly prototype a new AI scenario on a dev box and seamlessly scale it to multi-node DDP GPU training with AzureML or other cloud services. 9 | 10 | ## Key features 11 | - Provides public and enterprise **data pre-processing** recipes with out-of-the-box vanilla and parallel processing. No additional code is required to run on AzureML or other environments. 12 | - Provides **scalable model training** with support for single-process, single-VM multi-GPU, multi-node Distributed Data Parallel, and mixed-precision (AMP, Apex) training. ORT and DeepSpeed based training are going to be available soon! 13 | - Provides out-of-the-box **Plugins** that can be used for all typical NLP tasks like Sequence Classification, Named Entity Recognition and Seq2Seq text generation. 14 | - Provides **reusable modules** for model checkpointing, stats collection, TensorBoard and compliant AML logging, which can be customized based on your scenario. 15 | - Provides a **custom arguments parser** that saves all the default values for a scenario's arguments in a YAML config file and merges user-provided arguments at runtime. 16 | - All core modules are thoroughly **linted**, **unit tested** and even run E2E (multi-node, GPU) in AzureML. 17 | - PyMarlin is minimal and has an easy-to-understand codebase. PyMarlin was designed to make it easy for others to understand the entire codebase and customize it according to their needs. 18 | 19 | ## Installation 20 | 21 | pip install pymarlin 22 | 23 | Read the [installation doc](https://microsoft.github.io/PyMarlin/docs/installation) for more information. 24 | 25 | ## Start exploring! 26 | 27 | ### Full documentation website 28 | Full website with [guides and SDK reference](https://microsoft.github.io/PyMarlin/). 29 | 30 | ### Train your first model with pymarlin 31 | Check out the [CIFAR image classification example](https://microsoft.github.io/PyMarlin/docs/examples/cifar). 32 | 33 | ### GLUE task benchmarking 34 | Explore how to use pymarlin to [benchmark your language models on GLUE tasks](https://microsoft.github.io/PyMarlin/docs/examples/glue-tasks). 
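### Minimal wiring (sketch)
The snippet below is a minimal sketch of the typical driver wiring, mirroring `examples/cnndailymail_text_summarization/train_ortds.py`. `MyModule` is a placeholder for your own `ModuleInterface` implementation (see `pymarlin/core/module_interface.py` and the examples); everything else follows the APIs used in this repo.

    from pymarlin.core import trainer
    from pymarlin.utils.config_parser.custom_arg_parser import CustomArgParser

    # merge YAML defaults with command-line overrides, e.g. --trainer.backend ddp
    config = CustomArgParser(yaml_file_arg_key="config_path").parse()

    module = MyModule(**config["module"])  # placeholder: your ModuleInterface subclass
    trainer_args = trainer.TrainerArguments(**config["trainer"])

    tr = trainer.Trainer(module=module, args=trainer_args)
    tr.train()     # runs the training loop on the configured backend
    tr.validate()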
35 | 36 | ## We want your feedback! 37 | Reach out to us with your [feedback and suggestions](https://github.com/microsoft/PyMarlin/issues). 38 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 40 | 41 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # TODO: The maintainer of this repo has not yet edited this file 2 | 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? 
4 | 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help. 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/spot](https://aka.ms/spot). CSS will work with/help you to determine next steps. More details also available at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). 7 | - **Not sure?** Fill out a SPOT intake as though the answer were "Yes". CSS will help you decide. 8 | 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.* 10 | 11 | # Support 12 | 13 | ## How to file issues and get help 14 | 15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 17 | feature request as a new Issue. 18 | 19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER 21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**. 22 | 23 | ## Microsoft Support Policy 24 | 25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 26 | -------------------------------------------------------------------------------- /VERSION_NOTES.md: -------------------------------------------------------------------------------- 1 | # Version Notes: 2 | ## 0.3.2 3 | * PyMarlin supports DP training via Opacus v1.0 4 | 5 | ## 0.3.1 6 | * Version parity for PyPI. 7 | 8 | ## 0.2.8 9 | * Incremented the dependency to torch<=1.9.1 10 | 11 | ## 0.2.7 12 | * Adding torch<=1.9 as a required dependency 13 | 14 | ## 0.2.6 15 | * Adding support for parsing multi-level args from commandline and params 16 | 17 | ## 0.2.5 18 | * Adding support for directories with config path (only one file in directory) 19 | 20 | ## 0.2.4 21 | * Fixed bug where DDP all-reduce was not working 22 | 23 | ## 0.2.3 24 | * Unbound azureml-core version 25 | 26 | ## 0.2.2 27 | * Plugins bug fix 28 | 29 | ## 0.2.0 30 | * Adding plugins: SeqClassification, NER, Seq2Seq 31 | * --params json input 32 | * DDP allreduce optimization 33 | 34 | ## 0.1.1 35 | * Tests & Lint Pipeline 36 | * Documentation Pipeline 37 | * PyPi Pipeline 38 | 39 | ## 0.1.0 40 | * Initial release 41 | * Trainer, TrainerBackend, ModuleInterface, DataProcessor/Interface, ConfigParser and more (see docs) 42 | -------------------------------------------------------------------------------- /examples/cifar_image_classification/.images/tensorboard_screenshot.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/examples/cifar_image_classification/.images/tensorboard_screenshot.jpg -------------------------------------------------------------------------------- /examples/cifar_image_classification/README.md: -------------------------------------------------------------------------------- 1 | The Jupyter notebook is a good way to quickly try out CIFAR using pymarlin. 2 | ## Run in [Colab](https://colab.research.google.com/github/microsoft/PyMarlin/blob/main/examples/cifar_image_classification/CIFAR.ipynb) 3 | 4 | To use other advanced features like distributed training, the YAML parser, TensorBoard, etc., use the Python file and follow the instructions below. 5 | 6 | Note: a machine with more than one GPU is recommended to try out all the features. 7 | 8 | # 1. 
Install PyMarlin, PyTorch, and requirements 9 | 10 | Follow the steps here: https://microsoft.github.io/PyMarlin/docs/installation 11 | 12 | pip install -r requirements.txt 13 | 14 | # 2. Run CIFAR 15 | 16 | ## Single process 17 | 18 | python cifar.py --config_path config.yaml 19 | 20 | ## Mixed Precision (Needs GPU) 21 | 22 | python cifar.py --config_path config.yaml --tr.backend sp-amp 23 | 24 | ## Multi process (Needs at least 2 GPUs) 25 | 26 | python -m torch.distributed.launch --nproc_per_node 2 cifar.py --config_path config.yaml --tr.backend ddp 27 | 28 | # Results 29 | 30 | Val accuracy at step 50000 = 61.14 31 | 32 | # Tensorboard 33 | 34 | tensorboard --logdir logs 35 | 36 | ![tensorboard](.images/tensorboard_screenshot.jpg) -------------------------------------------------------------------------------- /examples/cifar_image_classification/config.yaml: -------------------------------------------------------------------------------- 1 | tr: 2 | epochs: 2 3 | train_batch_size: 4 4 | val_batch_size: 16 5 | writers: ['tensorboard'] 6 | clip_grads: False 7 | log_level : 'INFO' 8 | backend : sp 9 | max_train_steps_per_epoch : null 10 | max_val_steps_per_epoch : null 11 | -------------------------------------------------------------------------------- /examples/cifar_image_classification/requirements.txt: -------------------------------------------------------------------------------- 1 | torchvision 2 | matplotlib -------------------------------------------------------------------------------- /examples/cnndailymail_text_summarization/ORT_README.md: -------------------------------------------------------------------------------- 1 | # Optimizing with ORT + DeepSpeed 2 | We have extended this example to be optimized with ORT+DeepSpeed! Starting from this scenario we will try to build a common backend for both ORT and DeepSpeed. 3 | 4 | ## Speed improvements (batches/second) 5 | setup: bart-base, 4xV100 (16GB), batch size 32 6 | 7 | ### configs and speed 8 | * base pytorch , OOM 9 | * ort , 1.46-1.48 batch/s 10 | * deepspeed , 1.69-1.70 batch/s 11 | * ort+deepspeed , 1.71-1.72 batch/s 12 | * deepspeed fp16 zero stage 1 , 3.36-3.41 batch/s 13 | * ort+deepspeed fp16 zero stage 1 , 3.47-3.55 batch/s 14 | 15 | ## Noteworthy files 16 | * [deepspeed_methods](deepspeed_methods): deepspeed utility methods and trainer / trainer backends. 17 | * [model_ortds.py](model_ortds.py): module interface with config checks to enable ort+deepspeed 18 | * [train_ortds.py](train_ortds.py): main train script that imports the above 19 | * [azureml/submit_ortds.py](azureml/submit_ortds.py): azureml submit script 20 | 21 | ## Submitting 22 | 1. Install azureml-sdk and create an AzureML workspace; there are great [instructions on both here](https://azure.github.io/azureml-cheatsheets/docs/cheatsheets/python/v1/installation). 23 | 2. Write out the config.json for the workspace with [write_config()](https://azure.github.io/azureml-cheatsheets/docs/cheatsheets/python/v1/workspace#helpful-methods) 24 | 3. Create a gpu cluster in the workspace; for more info go [here](https://azure.github.io/azureml-cheatsheets/docs/cheatsheets/python/v1/compute-targets#creating-compute-targets) 25 | 4. Adjust the values in submit_ortds.py to point to your new gpu cluster. 26 | 5. Upload the preprocessed CNN/DailyMail data from the original README by uncommenting line 48 and pointing it to the local path. 27 | 6. 
From examples/summarization/aml, Submit job with `python submit_ortds.py` -------------------------------------------------------------------------------- /examples/cnndailymail_text_summarization/azureml/submit_ortds.py: -------------------------------------------------------------------------------- 1 | from azureml.core import Experiment, Workspace, ScriptRunConfig 2 | from azureml.core.compute import AmlCompute 3 | from azureml.core.runconfig import MpiConfiguration 4 | 5 | # put your AML workspace config.json in this directory! 6 | ws = Workspace.from_config() 7 | ws_details = ws.get_details() 8 | ds = ws.get_default_datastore() 9 | 10 | gpu_compute_target = AmlCompute(workspace=ws, name='sriovdedicated1') 11 | print(gpu_compute_target.status.serialize()) 12 | 13 | from azureml.core import Dataset 14 | from azureml.data import OutputFileDatasetConfig 15 | 16 | 17 | # create input/output datasets 18 | def get_input_dataset(datastore, path_on_datastore, dataset_name): 19 | dataset = Dataset.File.from_files(path=[(datastore, path_on_datastore)]) 20 | return dataset.as_named_input(dataset_name).as_download() 21 | 22 | def get_output_dataset(datastore, path_on_datastore, dataset_name): 23 | return OutputFileDatasetConfig(destination=(datastore, path_on_datastore), name=dataset_name).as_mount() 24 | 25 | def get_args(outputSuffix="deepspeed_ort_amp_nopadding_v100_8"): 26 | all_params_default = [ 27 | '--data_path', get_input_dataset(ds, f'datasets/cnn_dm/preprocessed/bart/', "data_path"), 28 | '--config_path', 'config-ortds.yaml', 29 | ] 30 | 31 | return all_params_default 32 | 33 | from azureml.core import Environment 34 | 35 | # Creates the environment inside a Docker container. 36 | pytorch_env = Environment(name='myEnv') 37 | pytorch_env.docker.enabled = True 38 | # docker file in this directory built for your convenience 39 | pytorch_env.docker.base_image = "pymarlin/base-gpu:cuda11.1.cudnn8.ds.ort" 40 | pytorch_env.python.user_managed_dependencies = True 41 | pytorch_env.python.interpreter_path = '/opt/miniconda/bin/python' 42 | 43 | mpi = MpiConfiguration() 44 | #NCv3_24rs - 4 16GB V100 GPU's per node 45 | mpi.process_count_per_node = 4 46 | mpi.node_count = 2 47 | 48 | # ds.upload_files(['local path to preprocessed data'], 'datasets/cnn_dm/preprocessed/bart') 49 | 50 | script = "train_ortds.py" 51 | codepath = '..' 52 | 53 | config = ScriptRunConfig(source_directory=codepath, 54 | script=script, 55 | arguments=get_args(), 56 | compute_target=gpu_compute_target, 57 | environment=pytorch_env, 58 | distributed_job_config=mpi) 59 | 60 | experiment_name = 'pymarlin_summarization_bart_ortds' 61 | experiment = Experiment(ws, name=experiment_name) 62 | 63 | run = experiment.submit(config) 64 | 65 | run.tag('nodes', f'{mpi.node_count}') 66 | run.tag('process_count_per_node', f'{mpi.process_count_per_node}') 67 | run.tag('notes', '2 node with ort+ds') 68 | 69 | print("Submitted run") 70 | print(f"\n{run.get_portal_url()}") 71 | -------------------------------------------------------------------------------- /examples/cnndailymail_text_summarization/config-ortds.yaml: -------------------------------------------------------------------------------- 1 | data_path: 'D:/data/cnn_cln' 2 | dist: true 3 | ort: true 4 | ortds: true 5 | 6 | trainer: 7 | max_train_steps_per_epoch : null # Maximum train steps per epoch. 8 | max_val_steps_per_epoch : null # Maximum validation steps per epoch. 9 | train_batch_size: 32 # Training global batch size. 10 | val_batch_size: 32 # Validation batch size per GPU. 
11 | epochs: 3 # Total epochs to run. 12 | gpu_batch_size_limit : 4 # Max limit for GPU batch size during training. 13 | disable_tqdm : True 14 | writers: ["stdout", "aml", "tensorboard"] 15 | 16 | module: 17 | max_length_encoder : 1024 18 | max_length_decoder : 128 19 | deepspeed_config: 'deepspeed_methods/deepspeedConfig.json' 20 | deepspeed_transformer_kernel: true 21 | deepspeed_ckpt_tag: "deepspeed_ckpt" # optional, let deepspeed load specific checkpoint, unnecessary if save_latest is true (default) when checkpointing with deepspeed 22 | 23 | wrt: 24 | tb_log_dir : 'outputs/tb_logs' 25 | 26 | 27 | stat: 28 | log_steps : 20 29 | 30 | chkp: 31 | checkpoint : True 32 | delete_existing_checkpoints: False 33 | save_dir: 'outputs/chkpt' # aml output path. does not require mounting 34 | model_state_save_dir: 'outputs/model' 35 | load_dir: null 36 | load_filename: null 37 | 38 | # add more from BartForConditionalGeneration.generate? 39 | generate: 40 | max_length: 128 41 | do_sample : False 42 | num_beams : 5 43 | # support everything in a yaml. ignore (print warning) everything that's not present. 44 | # Do not add the requirement to define anything in the parser other than yamls 45 | -------------------------------------------------------------------------------- /examples/cnndailymail_text_summarization/config-prod.yaml: -------------------------------------------------------------------------------- 1 | data_path: 'D:/data/cnn_cln' 2 | 3 | trainer: 4 | max_train_steps_per_epoch : null # Maximum train steps per epoch. 5 | max_val_steps_per_epoch : null # Maximum validation steps per epoch. 6 | train_batch_size: 32 # Training global batch size. 7 | val_batch_size: 32 # Validation batch size per GPU. 8 | epochs: 3 # Total epochs to run. 9 | gpu_batch_size_limit : 4 # Max limit for GPU batch size during training. 10 | disable_tqdm : False 11 | writers: ["stdout", "aml", "tensorboard"] 12 | backend: 'ddp-amp' 13 | module: 14 | max_length_encoder : 1024 15 | max_length_decoder : 128 16 | wrt: 17 | tb_log_dir : 'logs' 18 | stat: 19 | log_steps : 50 20 | chkp: 21 | checkpoint : True 22 | delete_existing_checkpoints: False 23 | save_dir: 'outputs' #aml output path. does not require mounting 24 | load_dir: null 25 | load_filename: null 26 | 27 | # add more from BartForConditionalGeneration.generate? 28 | generate: 29 | max_length: 128 30 | do_sample : False 31 | num_beams : 5 32 | # support everything in a yaml. ignore (print warning) everything that's not present. 33 | # Do not add the requirement to define anything in the parser other than yamls 34 | -------------------------------------------------------------------------------- /examples/cnndailymail_text_summarization/config.yaml: -------------------------------------------------------------------------------- 1 | data_path: 'D:/data/cnn_cln' 2 | 3 | trainer: 4 | max_train_steps_per_epoch : 2 # Maximum train steps per epoch. 5 | max_val_steps_per_epoch : 2 # Maximum validation steps per epoch. 6 | train_batch_size: 1 # Training global batch size. 7 | val_batch_size: 1 # Validation global batch size. 8 | epochs: 3 # Total epochs to run. 9 | gpu_batch_size_limit : 16 # Max limit for GPU batch size during training. 
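# note (assumption about pymarlin's batching, not a setting in this file): the trainer is expected to derive gradient accumulation from train_batch_size / (gpu_batch_size_limit * world_size); with train_batch_size=1 no accumulation happens here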
10 | disable_tqdm : False 11 | writers: ["aml", "tensorboard"] 12 | backend: "sp" 13 | 14 | module: 15 | max_length_encoder : 128 16 | max_length_decoder : 128 17 | 18 | wrt: 19 | tb_log_dir : 'logs' 20 | 21 | stat: 22 | log_steps : 1 23 | chkp: 24 | checkpoint : False 25 | delete_existing_checkpoints: True 26 | save_dir: 'checkpoints' 27 | load_dir: null 28 | 29 | generate: 30 | max_length: 128 31 | do_sample : False 32 | num_beams : 1 -------------------------------------------------------------------------------- /examples/cnndailymail_text_summarization/data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import torch 3 | import pymarlin 4 | from pymarlin.core import data_interface 5 | import matplotlib 6 | matplotlib.use('Agg') # disable this in local machine to see plots 7 | import matplotlib.pyplot as plt 8 | import sys 9 | 10 | def get_source_target(root = 'D:/data/cnn_cln', stage = 'val'): 11 | source = f'{root}/{stage}.source' 12 | target = f'{root}/{stage}.target' 13 | return source, target 14 | 15 | class AnalyzeProcessor(data_interface.DataProcessor): 16 | def __init__(self, source, target): 17 | with open(source, 'r', encoding = 'UTF-8') as f: 18 | self.source = f.readlines() 19 | with open(target, 'r', encoding = 'UTF-8') as f: 20 | self.target = f.readlines() 21 | def process(self): 22 | pass 23 | def analyze(self): 24 | self.df = pd.DataFrame({'source':self.source, 'target': self.target}) 25 | print(self.df.head()) 26 | print('\nWord length analysis:') 27 | wordlengths = self.df.applymap(lambda x : len(x.split())) 28 | print(wordlengths.describe()) 29 | plt.plot(wordlengths) 30 | plt.legend(['source','target']) 31 | 32 | class SummarizationDataset(torch.utils.data.Dataset): 33 | def __init__(self, source, target): 34 | with open(source, 'r', encoding = 'UTF-8') as f: 35 | self.source = f.readlines() 36 | with open(target, 'r', encoding = 'UTF-8') as f: 37 | self.target = f.readlines() 38 | print('len(self.source), len(self.target) = ',len(self.source), len(self.target)) 39 | def __getitem__(self, i): 40 | # print('len(self.source), len(self.target) = ',len(self.source), len(self.target)) 41 | return self.source[i].strip(), self.target[i].strip() 42 | def __len__(self): 43 | return len(self.target) 44 | 45 | class SummarizationData(pymarlin.core.data_interface.DataInterface): 46 | ''' 47 | Class which expects input data to have different files for source and target. 48 | Returns dataset which returns non tokenized source and target text. 
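    Example (illustrative paths):
        data = SummarizationData(root='path/to/cnn_cln')
        train_ds = data.get_train_dataset()
        source_text, target_text = train_ds[0]  # raw, untokenized strings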
49 | ''' 50 | def __init__(self, root='D:/data/cnn_cln'): 51 | self.root = root 52 | self.train_ds = SummarizationDataset(*get_source_target(root, 'train')) 53 | self.val_ds = SummarizationDataset(*get_source_target(root, 'val')) 54 | print('self.train_ds length = ', len(self.train_ds)) 55 | 56 | def get_train_dataset(self, *args, **kwargs): 57 | return self.train_ds 58 | def get_val_dataset(self, *args, **kwargs): 59 | return self.val_ds 60 | def get_test_dataset(self, *args, **kwargs): 61 | pass 62 | 63 | if __name__ == '__main__': 64 | root = sys.argv[1] #'D:/data/cnn_cln' 65 | print(root) 66 | print('\n**** Analyzing Train ***') 67 | dp = AnalyzeProcessor(*get_source_target(root = root, stage='train')) 68 | dp.process_data() 69 | print('\n**** Analyzing Val ***') 70 | dp = AnalyzeProcessor(*get_source_target(root = root, stage='val')) 71 | dp.process_data() 72 | plt.show() -------------------------------------------------------------------------------- /examples/cnndailymail_text_summarization/deepspeed_methods/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/examples/cnndailymail_text_summarization/deepspeed_methods/__init__.py -------------------------------------------------------------------------------- /examples/cnndailymail_text_summarization/deepspeed_methods/deepspeedConfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 32, 3 | "gradient_accumulation_steps": 1, 4 | "optimizer": { 5 | "type": "Adam", 6 | "params": { 7 | "lr": 1.875e-4, 8 | "betas": [ 9 | 0.9, 10 | 0.98 11 | ], 12 | "eps": 1e-8, 13 | "weight_decay": 3e-7 14 | } 15 | }, 16 | "zero_allow_untested_optimizer": true, 17 | "scheduler": { 18 | "type": "OneCycle", 19 | "params": { 20 | "cycle_first_step_size": 256115, 21 | "cycle_first_stair_count": 10000, 22 | "cycle_second_step_size": 256115, 23 | "cycle_second_stair_count": 10000, 24 | "decay_step_size": 1000, 25 | "cycle_min_lr": 1.875e-5, 26 | "cycle_max_lr": 1.875e-4, 27 | "decay_lr_rate": 0.001, 28 | "cycle_min_mom": 0.85, 29 | "cycle_max_mom": 0.99, 30 | "decay_mom_rate": 0.0 31 | } 32 | }, 33 | "fp16": { 34 | "enabled": true 35 | }, 36 | "zero_optimization": { 37 | "stage": 1, 38 | "allgather_partitions": true, 39 | "allgather_bucket_size": 5e8, 40 | "overlap_comm": false, 41 | "reduce_scatter": true, 42 | "reduce_bucket_size": 5e8, 43 | "contiguous_gradients": false, 44 | "cpu_offload": false 45 | } 46 | } -------------------------------------------------------------------------------- /examples/cnndailymail_text_summarization/deepspeed_methods/deepspeed_trainer.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pymarlin import Trainer 4 | from pymarlin.utils.checkpointer.checkpoint_utils import Checkpoint 5 | 6 | 7 | class DeepSpeedTrainer(Trainer): 8 | 9 | def save_checkpoint(self, force=False) -> None: 10 | # deepspeed will require all processes to call save_checkpoint method 11 | ckpt_id = str(self.trainer_backend.get_state()["global_step_completed"]) 12 | self.module.model.save_checkpoint(os.path.join(self.args.checkpointer_args.save_dir, self.module.DEEPSPEED_CKPT_PREFIX), ckpt_id) 13 | 14 | if self.is_main_process: # only main process should checkpoint 15 | checkpoint_state = Checkpoint( 16 | module_interface_state=self.module.get_state(), 17 | trainer_state=self.get_state(), 18 | 
trainer_backend_state=self.trainer_backend.get_state() 19 | ) 20 | self.checkpointer.save(checkpoint_state, self.last_epoch, force) 21 | 22 | def save_model_checkpoint(self) -> None: 23 | if self.args.checkpointer_args.checkpoint and (self.args.checkpointer_args.model_state_save_dir is not None): 24 | ckpt_id = str(self.trainer_backend.get_state()["global_step_completed"]) 25 | self.module.model.save_checkpoint(os.path.join(self.args.checkpointer_args.model_state_save_dir, self.module.DEEPSPEED_CKPT_PREFIX), ckpt_id) 26 | -------------------------------------------------------------------------------- /examples/cnndailymail_text_summarization/deepspeed_methods/deepspeed_trainer_backend.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import torch 3 | from typing import Iterable, List, Optional, Union 4 | 5 | from pymarlin import SingleProcess 6 | from pymarlin.core import module_interface 7 | from pymarlin.core.trainer_backend import TrainerBackendArguments, OutputCollector, DDPTrainerBackend 8 | 9 | 10 | class DeepSpeedTrainerBackend(SingleProcess): 11 | 12 | def init(self, args: TrainerBackendArguments): 13 | self.args = args 14 | self.model = self.args.model 15 | if not self.distributed: 16 | assert self.args.distributed_training_args.world_size == 1 \ 17 | , 'World size > 1 . Decorate with DDPTrainerBackend' 18 | 19 | # ensure gradient_accumulation will be equal to the one set in deepspeed config json 20 | if self.args.gradient_accumulation != self.model.model.gradient_accumulation_steps(): 21 | print(f"Warning, self.args.gradient_accumulation ({self.args.gradient_accumulation}) is not equal to gradient_accumulation_steps inside deepspeedConfig.json, adjusting") 22 | print(f"Warning, setting self.args.gradient_accumulation to {self.model.model.gradient_accumulation_steps()}") 23 | self.args.gradient_accumulation = self.model.model.gradient_accumulation_steps() 24 | 25 | def train_dl(self, dataloader, callback: module_interface.CallbackInterface): 26 | 27 | epoch_collector = OutputCollector() 28 | global_step_collector = OutputCollector() 29 | self.global_step_this_epoch = 0 30 | # we could pass specific fields as arguments instead of the entire train module, 31 | # but that might hinder inheritance since different trainer_backends need different things from the train module 32 | with tqdm(dataloader, unit="batch", disable=self.args.disable_tqdm) as tbatch: 33 | for i, batch in enumerate(tbatch): 34 | if ( 35 | self.args.max_train_steps_per_epoch 36 | and self.global_step_this_epoch 37 | >= self.args.max_train_steps_per_epoch 38 | ): 39 | break 40 | 41 | tbatch.set_description(f"Global Batch: {self.global_step_completed + 1} ") 42 | # forward 43 | outputs = self.model.forward( 44 | stage=module_interface.Stage.TRAIN, 45 | batch=batch, 46 | device=self.args.device, 47 | global_step=self.global_step_completed + 1, 48 | ) 49 | # wrap a bare tensor in a list so outputs are always iterable 50 | outputs = [outputs] if type(outputs) == torch.Tensor else outputs 51 | 52 | loss = outputs[0] 53 | 54 | # backward. 
This will keep on accumulating gradients 55 | self.model.model.backward(loss) 56 | # the deepspeed model engine must be stepped each micro step 57 | self.model.model.step() 58 | callback.on_end_backward(self.global_step_completed, loss) 59 | 60 | # collect 61 | epoch_collector.collect(outputs) 62 | global_step_collector.collect(outputs) 63 | 64 | unscaled_loss = outputs[0].item() 65 | tbatch.set_postfix( 66 | loss=unscaled_loss 67 | ) # move progress bar to logger later 68 | 69 | self.batches_completed += 1 70 | 71 | if self.batches_completed % self.args.gradient_accumulation == 0: 72 | # write global step mean loss to stats 73 | self.process_global_step(global_step_collector, callback) 74 | 75 | return epoch_collector.all_outputs 76 | 77 | def process_global_step(self, global_step_collector, callback): 78 | """Aggregate the global-step mean loss and fire callbacks (the DeepSpeed engine handles optimizer and scheduler stepping) 79 | """ 80 | global_step_outputs = global_step_collector.all_outputs 81 | global_step_mean_loss = ( 82 | global_step_outputs[0].mean().item() 83 | ) 84 | global_step_collector.reset() 85 | self.stats.update("loss", global_step_mean_loss, frequent=True) 86 | 87 | self.global_step_completed += 1 88 | self.global_step_this_epoch += 1 89 | 90 | callback.on_end_train_step(self.global_step_completed, *global_step_outputs) 91 | self.stats.log_stats(self.global_step_completed) 92 | 93 | 94 | class DeepSpeedDistributedTrainerBackend(DDPTrainerBackend): 95 | 96 | def init(self, args: TrainerBackendArguments): 97 | # unpack trainer_backend arguments 98 | self.args = args 99 | self.distributed_training_args = args.distributed_training_args 100 | 101 | self.trainer_backend.init(args) 102 | -------------------------------------------------------------------------------- /examples/cnndailymail_text_summarization/deepspeed_methods/deepspeed_utils.py: -------------------------------------------------------------------------------- 1 | import deepspeed as dp 2 | from typing import Optional, Any, Dict 3 | 4 | 5 | def prepare_optimizer_parameters(deepspeed_transformer_kernel, model): 6 | param_optimizer = list(model.named_parameters()) 7 | param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] 8 | no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] 9 | if deepspeed_transformer_kernel: 10 | no_decay = no_decay + ['attn_nw', 'attn_nb', 'norm_w', 'norm_b', 11 | 'attn_qkvb', 'attn_ob', 'inter_b', 'output_b'] 12 | weight_decay = 0.01 13 | 14 | optimizer_grouped_parameters = [{ 15 | 'params': 16 | [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 17 | 'weight_decay': 18 | weight_decay 19 | }, { 20 | 'params': 21 | [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 22 | 'weight_decay': 23 | 0.0 24 | }] 25 | 26 | return optimizer_grouped_parameters 27 | 28 | 29 | def initialize_deepspeed(model, config, deepspeed_transformer_kernel): 30 | print("SystemLog: Initializing DeepSpeed") 31 | print("SystemLog: DeepSpeed parameters: deepspeed_config=%s" % (config)) 32 | 33 | optimizer_grouped_parameters = prepare_optimizer_parameters(deepspeed_transformer_kernel, model) 34 | 35 | # DeepSpeed initializer handles FP16, distributed, optimizer automatically. 
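# dp.initialize returns a 4-tuple (engine, optimizer, training dataloader, lr scheduler); only the first two are kept here, since the OneCycle scheduler from deepspeedConfig.json is stepped internally by the engine.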
36 | model_deepspeed, optimizer_deepspeed, _, _ = dp.initialize( 37 | config=config, 38 | model=model, 39 | model_parameters=optimizer_grouped_parameters) 40 | 41 | return model_deepspeed, optimizer_deepspeed 42 | 43 | 44 | def get_core_model(model, deepspeed_flag=False, ort_flag=False): 45 | module = model 46 | if deepspeed_flag: 47 | module = module.module 48 | if ort_flag: 49 | module = module._original_module 50 | 51 | return module 52 | -------------------------------------------------------------------------------- /examples/cnndailymail_text_summarization/images/tensorboard_screenshot_bart.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/examples/cnndailymail_text_summarization/images/tensorboard_screenshot_bart.jpg -------------------------------------------------------------------------------- /examples/cnndailymail_text_summarization/infer.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Inferencing after training completion 3 | ''' 4 | import torch 5 | import os 6 | import argparse 7 | 8 | from transformers import BartForConditionalGeneration, BartTokenizerFast 9 | class Summarizer(torch.nn.Module): 10 | def __init__(self, model_path = 'outputs', model_file='', isCheckpoint = True, load_weights = True): 11 | super().__init__() 12 | self.fullpath = os.path.join(model_path, model_file) 13 | self.model = BartForConditionalGeneration.from_pretrained("facebook/bart-base") 14 | self.tokenizer = BartTokenizerFast.from_pretrained("facebook/bart-base") 15 | self.device = "cuda" if torch.cuda.is_available() else "cpu" 16 | #load model weights 17 | if load_weights: 18 | self._load_weights(isCheckpoint) 19 | 20 | 21 | def _load_weights(self, isCheckpoint = True): 22 | state_dict = torch.load(self.fullpath) 23 | if isCheckpoint: 24 | state_dict = state_dict['module_interface'] 25 | self.load_state_dict(state_dict) 26 | else: 27 | self.model.load_state_dict(state_dict) 28 | self.model.to(self.device) 29 | 30 | def summarize(self, text): 31 | batch = self.tokenizer(text, return_tensors='pt').to(self.device) 32 | generated_ids = self.model.generate(batch['input_ids']) 33 | return self.tokenizer.batch_decode(generated_ids)[0] 34 | 35 | if __name__ == '__main__': 36 | parser = argparse.ArgumentParser() 37 | parser.add_argument( 38 | "--model_path", type=str, default=r"checkpoints", help="Path to model") 39 | parser.add_argument("--model_file", type=str, default = "model_0.tar") 40 | 41 | args = parser.parse_args() 42 | summ = Summarizer(model_path = args.model_path, model_file = args.model_file ) 43 | text = "Home Secretary Priti Patel warns people trying to leave UK will be turned back at airports and lashes influencers 'working' in the sun as she unveils quarantine rules for Brits returning from 30 high-risk countries" 44 | summary = summ.summarize(text) 45 | print(text) 46 | print('Summary:', summary) -------------------------------------------------------------------------------- /examples/cnndailymail_text_summarization/model_ortds.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict 2 | import os 3 | 4 | # too long import 5 | from pymarlin.utils.stats import global_stats 6 | from pymarlin.utils.logger import getlogger 7 | 8 | from onnxruntime.training.ortmodule import ORTModule 9 | 10 | from filelock import FileLock 11 | 12 | from deepspeed_methods 
import deepspeed_utils 13 | from train import SummarizationBartModule 14 | 15 | logger = getlogger(__file__) 16 | 17 | class SummarizationBartModuleORT(SummarizationBartModule): 18 | def __init__( 19 | self, 20 | *args, 21 | **kwargs 22 | ): 23 | super().__init__(*args, **kwargs) 24 | 25 | #setting this here to avoid issues after wrapping 26 | self._pad_token_id = self.model.config.pad_token_id 27 | 28 | logger.info("Employing ORT, wrapping model with ORTModule") 29 | self.model = ORTModule(self.model) 30 | 31 | def get_core_model(self): 32 | return deepspeed_utils.get_core_model(self.model, ort_flag=True) 33 | 34 | @property 35 | def pad_token_id(self): 36 | return self._pad_token_id 37 | 38 | class SummarizationBartModuleORTDeepSpeed(SummarizationBartModuleORT): 39 | def __init__( 40 | self, 41 | *args, 42 | deepspeed_config='', 43 | deepspeed_transformer_kernel=False, 44 | deepspeed_ckpt_tag=None, 45 | deepspeed_resume_from_checkpoint=None, 46 | **kwargs 47 | ): 48 | super().__init__(*args, **kwargs) 49 | logger.info(f"Employing Deepspeed, wrapping model with Deepspeed") 50 | self.model, _ = deepspeed_utils.initialize_deepspeed(self.model, deepspeed_config, deepspeed_transformer_kernel) 51 | self.deepspeed_resume_from_checkpoint = deepspeed_resume_from_checkpoint 52 | self.deepspeed_ckpt_tag = deepspeed_ckpt_tag 53 | self.DEEPSPEED_CKPT_PREFIX = "deepspeed_ckpt" 54 | 55 | def get_optimizers_schedulers( 56 | self, estimated_global_steps_per_epoch: int, epochs: int 57 | ): 58 | print(f"Deepspeed is employed, optimizer and scheduler are defined in deepspeedConfig.json file") 59 | return [], [] 60 | 61 | def get_core_model(self): 62 | return deepspeed_utils.get_core_model(self.model, ort_flag=True, deepspeed_flag=True) 63 | 64 | def train_step(self, global_step: int, batch, device): 65 | batch = batch.to(device) 66 | result = self.model(**batch) 67 | global_stats.update("lr", self.model.get_lr()[0], frequent=True) 68 | loss = result["loss"] 69 | 70 | return loss 71 | 72 | def get_state(self) -> Dict: 73 | return None 74 | 75 | def update_state(self, state: Dict): 76 | if self.deepspeed_resume_from_checkpoint is not None: 77 | 78 | import glob 79 | loading_path = os.path.join(self.deepspeed_resume_from_checkpoint, self.DEEPSPEED_CKPT_PREFIX) 80 | deepspeed_checkpoint_dirs = sorted(glob.glob(f"{loading_path}/*")) 81 | 82 | if len(deepspeed_checkpoint_dirs) > 0: 83 | logger.info(f"Attempting to resume from {loading_path}") 84 | # this magically updates self.optimizer and self.lr_scheduler 85 | load_path, _ = self.model.load_checkpoint( 86 | loading_path, 87 | load_optimizer_states=True, 88 | load_lr_scheduler_states=True, 89 | tag=self.deepspeed_ckpt_tag, 90 | ) 91 | if load_path is None: 92 | raise ValueError(f"[deepspeed] failed to resume from checkpoint {self.deepspeed_resume_from_checkpoint}") 93 | else: 94 | logger.error(f"{loading_path} doesn't have deepspeed checkpoints, doing nothing") 95 | -------------------------------------------------------------------------------- /examples/cnndailymail_text_summarization/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | matplotlib 3 | rouge-score 4 | nltk -------------------------------------------------------------------------------- /examples/cnndailymail_text_summarization/train_ortds.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pymarlin.core import trainer 3 | 4 | # too long import 5 | from 
pymarlin.core.trainer_backend import build_trainer_backend 6 | from pymarlin.utils.config_parser.custom_arg_parser import CustomArgParser 7 | 8 | from filelock import FileLock 9 | 10 | # DeepSpeed + ORT 11 | from deepspeed_methods.deepspeed_trainer import DeepSpeedTrainer 12 | from deepspeed_methods.deepspeed_trainer_backend import DeepSpeedTrainerBackend, DeepSpeedDistributedTrainerBackend 13 | from onnxruntime.training.ortmodule import ORTModule 14 | 15 | from data import SummarizationData 16 | from model_ortds import SummarizationBartModuleORT,SummarizationBartModuleORTDeepSpeed, SummarizationBartModule 17 | 18 | if __name__ == '__main__': 19 | config = CustomArgParser(yaml_file_arg_key="config_path", default_yamlfile="config-ortds.yaml").parse() 20 | 21 | print(f"config: {config}") 22 | 23 | data = SummarizationData(root=config["data_path"]) 24 | 25 | if config['ortds']: 26 | module_class = SummarizationBartModuleORTDeepSpeed 27 | elif config['ort']: 28 | module_class = SummarizationBartModuleORT 29 | else: 30 | module_class = SummarizationBartModule 31 | 32 | module = module_class(data, **config["module"], generate_kwargs=config["generate"]) 33 | 34 | trainer_args = trainer.TrainerArguments( 35 | **config["trainer"], 36 | stats_args=trainer.stats.StatInitArguments(**config["stat"]), 37 | writer_args=trainer.WriterInitArguments(**config["wrt"]), 38 | checkpointer_args=trainer.DefaultCheckpointerArguments(**config["chkp"]) 39 | ) 40 | 41 | if config['ortds']: 42 | module.deepspeed_resume_from_checkpoint = config["chkp"]["load_dir"] 43 | tr = DeepSpeedDistributedTrainerBackend(DeepSpeedTrainerBackend()) if config["dist"] else DeepSpeedTrainerBackend() 44 | trainer = DeepSpeedTrainer(trainer_backend=tr, module=module, args=trainer_args) 45 | else: 46 | trainer = trainer.Trainer(module=module, args=trainer_args) 47 | 48 | trainer.train() 49 | trainer.validate() 50 | -------------------------------------------------------------------------------- /examples/covid19_text_classification/azureml/dockerfile: -------------------------------------------------------------------------------- 1 | FROM mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.0.3-cudnn8-ubuntu18.04 2 | 3 | ############################################################################## 4 | # Custom Conda environment 5 | ############################################################################## 6 | 7 | ENV CONDAPATH /opt/miniconda/envs/pymarlin 8 | RUN conda create -p $CONDAPATH python=3.8 pip=20.2.4 9 | ENV PATH $CONDAPATH/bin:$PATH 10 | 11 | ############################################################################## 12 | # PyTorch 13 | ############################################################################## 14 | 15 | RUN pip install --no-cache-dir torch==1.8.1+cu111 torchvision==0.9.1+cu111 torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html 16 | 17 | ################################################################################################ 18 | # pymarlin[plugins] should have everything needed to run classification 19 | ################################################################################################ 20 | 21 | RUN pip install --no-cache-dir --use-feature=2020-resolver pymarlin[plugins] 22 | -------------------------------------------------------------------------------- /examples/covid19_text_classification/azureml/submit.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from azureml.core import Workspace, Dataset, 
Experiment, ScriptRunConfig, Environment
4 | from azureml.core.runconfig import PyTorchConfiguration, MpiConfiguration
5 | 
6 | def prepare_env_cmd():
7 |     """Prepare the environment and submission command for the classification example."""
8 |     env = Environment("pymarlin_requirements")
9 |     env.docker.enabled = True
10 |     env.docker.base_image = None
11 |     env.docker.base_dockerfile = 'dockerfile'
12 |     env.python.user_managed_dependencies = True
13 |     env.python.interpreter_path = "/opt/miniconda/bin/python"
14 |     env.register(ws)
15 | 
16 |     ds = ws.get_default_datastore()
17 |     # preprocessed data needs to be placed into datastore
18 |     # ds.upload_files([r'data\covid-19-nlp-text-classification\preprocessed\bert'], 'datasets/covid19_classification/preprocessed/bert/')
19 |     dataset = Dataset.File.from_files((ds, 'datasets/covid19_classification/preprocessed/bert/')).as_download()
20 | 
21 |     cmd = f'''python train.py --trainer.backend {args.backend} '''.split()
22 |     cmd.extend(['--data.preprocessed_dir', dataset])
23 | 
24 |     return env, cmd
25 | 
26 | if __name__ == "__main__":
27 |     parser = argparse.ArgumentParser()
28 |     parser.add_argument("--target_name", '-t', default="sriovdedicated1")
29 |     parser.add_argument("--node_count", "-n", type=int, default=1)
30 |     parser.add_argument("--process_count", "-p", type=int, default=1)
31 |     parser.add_argument("--experiment_name", '-e', type=str, default="marlin-tests")
32 |     parser.add_argument("--distributed_config", "-d", type=str, choices=["mpi", "pytorch"], default="pytorch")
33 |     parser.add_argument("--backend", "-b", choices=["sp", "ddp-amp"], default="sp")
34 |     parser.add_argument("--subscription_id", '-s', help='azure subscription id', required=True)
35 |     parser.add_argument("--resource_group", '-rg', help='azure resource group', required=True)
36 |     parser.add_argument("--workspace_name", '-ws', help='azure machine learning workspace', required=True)
37 |     parser.add_argument("--wait", "-w", action="store_true", help="Throw an error if the Azure ML job fails.")
38 |     args = parser.parse_args()
39 | 
40 |     ws = Workspace(args.subscription_id, args.resource_group, args.workspace_name)
41 | 
42 |     target = ws.compute_targets[args.target_name]
43 | 
44 |     if args.distributed_config == "pytorch":
45 |         distributed_job_config = PyTorchConfiguration(
46 |             process_count=args.process_count, node_count=args.node_count
47 |         )
48 |     elif args.distributed_config == "mpi":
49 |         distributed_job_config = MpiConfiguration(
50 |             process_count_per_node=args.process_count, node_count=args.node_count
51 |         )
52 |     else:
53 |         raise ValueError(f"Didn't recognize the distributed config {args.distributed_config}. Select one of 'mpi' or 'pytorch'.")
54 | 
55 |     env, cmd = prepare_env_cmd()
56 | 
57 |     src = ScriptRunConfig(
58 |         source_directory='..',
59 |         command=cmd,
60 |         compute_target=target,
61 |         distributed_job_config=distributed_job_config,
62 |         environment=env,
63 |     )
64 | 
65 |     print("Submitting experiment...")
66 |     run = Experiment(ws, args.experiment_name).submit(src)
67 | 
68 |     print(f"{run.get_portal_url()}")
69 | 
70 |     if args.wait:
71 |         print("Waiting for run completion...")
72 |         run.wait_for_completion(show_output=True, raise_on_error=True)
73 | 
-------------------------------------------------------------------------------- /examples/covid19_text_classification/config.yaml: --------------------------------------------------------------------------------
1 | # Group names below are used to parse these arguments when passed from the command line.
2 | # Example usage in command-line: --trainer.epochs 10
3 | 
4 | # trainer arguments
5 | trainer:
6 |   max_train_steps_per_epoch : 20 # Maximum train steps per epoch.
7 |   max_val_steps_per_epoch : 5 # Maximum validation steps per epoch.
8 |   train_batch_size: 8 # Training global batch size.
9 |   val_batch_size: 4 # Validation global batch size.
10 |   epochs: 1 # Total epochs to run.
11 |   gpu_batch_size_limit : 8 # Max limit for GPU batch size during training.
12 |   clip_grads : False # Enable or disable clipping of gradients.
13 |   use_gpu: True # Enable or disable use of GPU.
14 |   max_grad_norm: 1.0 # Maximum value for gradient norm.
15 |   writers: ['stdout', 'aml', 'tensorboard'] # List of all the writers to use.
16 |   reset_optimizers_schedulers: True
17 |   backend: sp
18 | 
19 | # Checkpointer arguments
20 | chkp:
21 |   checkpoint: True # Flag indicating whether to checkpoint model.
22 |   delete_existing_checkpoints: False # Flag indicating whether to delete checkpoints under save_dir before training.
23 |   period: 1 # Period of epochs at which to checkpoint model.
24 |   save_dir: 'checkpoints' # Path to directory where checkpoints are to be stored.
25 |   #load_dir: 'checkpoints' # Path to directory where checkpoints are to be loaded from.
26 |   #load_filename: 'tweetClassification_0.pt' # Filename of checkpoint under load_dir (overrides automatic loading of max epoch).
27 |   file_prefix: 'tweetClassification' # Prefix of the checkpoint filename.
28 |   file_ext: 'pt' # File extension for the checkpoint.
29 |   log_level: 'DEBUG' # Log level for checkpointer module.
30 | 
31 | # Basic-Statistics arguments
32 | stat:
33 |   log_steps: 50 # Interval between steps for logging stats.
34 |   update_system_stats: False # Enable or disable updating system stats.
35 |   log_model_steps: 1000 # Interval between steps for logging model.
36 |   exclude_list: 'bias|LayerNorm|layer\\.[3-9]|layer\\.1(?!1)|layer\\.2(?!3)' # Exclude list for logging.
37 | 
38 | # Writers arguments
39 | wrts:
40 |   model_log_level : 'INFO' # Log level for model. Set to DEBUG to enable.
41 |   tb_log_dir : 'logs' # Folder name for storing Tensorboard logs.
42 |   tb_logpath_parent_env : null # Log path parent Environment.
43 |   tb_log_multi : False # Enable or disable logging multi.
44 |   tb_log_hist_steps : 20000 # Interval between steps to log histogram.
45 | 
46 | # Scenario-specific arguments
47 | module:
48 |   max_lr : 0.00004 # Maximum learning rate.
49 |   log_level: 'INFO'
50 | 
51 | data:
52 |   filepath_train: 'data/covid-19-nlp-text-classification/Corona_NLP_train.csv'
53 |   filepath_test: 'data/covid-19-nlp-text-classification/Corona_NLP_test.csv'
54 |   preprocessed_dir: 'data/covid-19-nlp-text-classification/preprocessed/bert'
55 |   encoding: 'ISO-8859-1'
56 |   text_field: 'OriginalTweet'
57 |   label_field: 'Sentiment'
58 |   splitpct: 10
59 |   log_level: 'INFO'
60 | 
-------------------------------------------------------------------------------- /examples/covid19_text_classification/readme.md: --------------------------------------------------------------------------------
1 | # Covid-19 Text Sentiment Classification
2 | 
3 | ## Instructions
4 | 1. Install requirements (start in this directory)
5 |        pip install -r requirements.txt
6 | 2. Download data from kaggle
7 |    Ref: https://github.com/Kaggle/kaggle-api
8 | 
9 |    Get your credentials file from kaggle here: C:\Users\\.kaggle\kaggle.json
10 | 
11 |        mkdir data
12 | 
13 |        cd data
14 | 
15 |        kaggle datasets download -d datatattle/covid-19-nlp-text-classification
16 | 
17 |        (if windows): Expand-Archive .\covid-19-nlp-text-classification.zip
18 | 
19 |        (else): unzip ./covid-19-nlp-text-classification.zip
20 | 
21 |        mkdir covid-19-nlp-text-classification
22 | 
23 |        move the two csv files to the new folder
24 | 
25 | 3. Install the pymarlin library
26 | 
27 |        pip install pymarlin
28 | 
29 |    or
30 | 
31 |        $env:PYTHONPATH=
32 | 
33 | 4. Set working directory
34 | 
35 |        cd ..
36 | 
37 | 5. Prepare data
38 | 
39 |        python data.py
40 | 
41 | 6. Train
42 | 
43 |        python train.py [--trainer.max_train_steps_per_epoch 2]
44 | 
45 | ## Running AzureML
46 | You can use [`examples/covid19_text_classification/azureml/submit.py`](https://github.com/microsoft/PyMarlin/blob/main/examples/covid19_text_classification/azureml/submit.py)
47 | to submit examples to run on Azure ML.
48 | 
49 | For example:
50 | 
51 | ```bash
52 | cd examples/covid19_text_classification/azureml/
53 | python submit.py --backend ddp-amp --process_count 2 \
54 |     --subscription_id <subscription_id> --resource_group <resource_group> --workspace_name <workspace_name>
55 | ```
56 | 
57 | See `examples/covid19_text_classification/azureml/submit.py -h` for more options.
58 | 
59 | **Note.** Submitting to AzureML requires setting up an AzureML workspace. See [Azure ML CheatSheet](https://aka.ms/aml/cheatsheet) for more details.
60 | 
-------------------------------------------------------------------------------- /examples/covid19_text_classification/requirements.txt: --------------------------------------------------------------------------------
1 | kaggle
2 | transformers
-------------------------------------------------------------------------------- /examples/germ_text_ner/config_germ.yaml: --------------------------------------------------------------------------------
1 | # Group names below are used to parse these arguments when passed from the command line.
2 | # Example usage in command-line: --module.max_lr 4E-5
3 | 
4 | # data_processor args
5 | data:
6 |   train_filepath : null
7 |   val_filepath : null
8 |   labels_list: [B-LOC, B-LOCderiv, B-LOCpart, B-ORG, B-ORGderiv, B-ORGpart, B-OTH, B-OTHderiv,
9 |     B-OTHpart, B-PER, B-PERderiv, B-PERpart, I-LOC, I-LOCderiv, I-LOCpart, I-ORG, I-ORGderiv,
10 |     I-ORGpart, I-OTH, I-OTHderiv, I-OTHpart, I-PER, I-PERderiv, I-PERpart, O]
11 |   has_labels: True
12 |   file_format: "tsv"
13 | 
14 | # model arguments
15 | model:
16 |   model_name: "bert"
17 |   encoder_key: "bert"
18 |   hf_model: "bert-base-multilingual-cased"
19 |   model_file: "pytorch_model.bin"
20 |   model_config_file: "config.json"
21 |   model_path: null
22 |   model_config_path: null
23 |   tokenizer_path: null
24 | 
25 | # module_interface arguments
26 | module:
27 |   output_dir: null
28 |   max_lr : 0.00002 # Maximum learning rate.
29 |   warmup_prop: 0.1
30 |   has_labels: True
31 |   max_seq_len: 128
32 |   pad_label_id: -100
33 |   label_all_tokens: False
34 | 
35 | # distill module arguments
36 | distill:
37 |   enable: False
38 |   student_model_config_path: null
39 |   student_model_config_file: null
40 |   student_model_path: null
41 |   student_model_file: null
42 |   student_layers: [0,6,11]
43 |   loss_types: ["logits"]
44 |   loss_weights: [1]
45 |   temperature: 1
46 | 
47 | # trainer arguments
48 | trainer:
49 |   backend: "sp"
50 |   train_batch_size: 32 # Training global batch size.
51 |   val_batch_size: 16 # Validation global batch size.
52 |   epochs: 5 # Total epochs to run.
53 |   gpu_batch_size_limit : 8 # Max limit for GPU batch size during training.
54 |   clip_grads : True # Enable or disable clipping of gradients.
55 |   use_gpu: True # Enable or disable use of GPU.
56 |   max_grad_norm: 1.0 # Maximum value for gradient norm.
57 |   writers: ['stdout', 'aml', 'tensorboard'] # List of all the writers to use.
58 |   disable_tqdm: True
59 | 
60 | # Checkpointer arguments
61 | ckpt:
62 |   checkpoint: True # Flag indicating whether to checkpoint model.
63 |   delete_existing_checkpoints: True
64 |   period: 1 # Period of epochs at which to checkpoint model.
65 |   save_dir: 'ckpts' # Path to directory where checkpoints are to be stored.
66 |   model_state_save_dir: 'model_ckpts'
67 |   file_prefix: 'marlin' # Prefix of the checkpoint filename.
68 |   file_ext: 'bin' # File extension for the checkpoint.
69 | 
70 | # Basic-Statistics arguments
71 | stats:
72 |   log_steps: 50
73 |   update_system_stats: False
74 |   log_model_steps: 1000
75 |   exclude_list: 'bias|LayerNorm|layer\\.[3-9]|layer\\.1(?!1)|layer\\.2(?!3)'
76 | 
77 | # Writers arguments
78 | wrts:
79 |   tb_log_dir : 'logs'
80 |   tb_logpath_parent_env : null
81 |   tb_log_multi : False
82 |   tb_log_hist_steps : 20000
-------------------------------------------------------------------------------- /examples/germ_text_ner/readme.md: --------------------------------------------------------------------------------
1 | # GermEval NER task
2 | 
3 | Ashwin Srinivasan
4 | 
5 | This example will walk you through executing the plugin for the GermEval NER task.
6 | 
7 | For a more detailed understanding of how you can use the NER plugin, please refer to [this guide](https://microsoft.github.io/PyMarlin/docs/plugins/hf_ner).
8 | 
9 | ## Dataset format
10 | 
11 | The NER plugin expects the input to be a TSV or CSV with 2 columns: a column with the text sentences, followed by a column with the labels for the tokens in each sentence. For example: 'Sentence': 'who is harry', 'Slot': 'O O B-contact_name'.
12 | 
13 | For GermEval, we have already modified the dataset and provided it along with this example. You will find the train file under train_germ and the dev file under val_germ.
14 | 
15 | ## Running on VM
16 | ```
17 | conda create -n pymarlin
18 | conda activate pymarlin
19 | pip install -r requirements.txt
20 | ```
21 | 
22 | Moving data:
23 | 
24 |     scp -r -P $port .\examples\germ_text_ner\ $user@${machine}:/home/$user
25 |     ssh $user@$machine -p $port
26 | 
27 | Running on the CLI is as simple as:
28 | 
29 | ```
30 | python test.py --data.train_filepath ./train_germ/train.tsv --data.val_filepath ./val_germ/dev.tsv --config_path config_germ.yaml
31 | ```
32 | 
33 | Running with multi-GPU:
34 | 
35 | ```
36 | python -m torch.distributed.launch --nproc_per_node 4 test.py --data.train_filepath ./train_germ/train.tsv --data.val_filepath ./val_germ/dev.tsv --config_path config_germ.yaml --trainer.backend ddp-amp
37 | ```
38 | Results:
39 | 
40 |     tensorboard --logdir logs
41 | 
42 | Tunnel to view the tensorboard UI (if using a VM):
43 | 
44 |     ssh -N -f -L 127.0.0.1:6006:127.0.0.1:6006 $user@${machine} -p $port
45 | 
46 | View the Tensorboard UI:
47 | 
48 |     http://localhost:6006/
49 | 
50 | ## Running on Azure ML
51 | 
52 | A notebook titled 'GermEvalAML.ipynb' has been provided along with this example. Once you have a valid Azure workspace, resource group and compute target, replace the placeholders in the notebook and you should be able to submit a script to AML.
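53 | 
54 | If you prefer a plain script over the notebook, the submission boils down to a few `azureml.core` calls. The sketch below is illustrative and not part of this example's code; it assumes the azureml-sdk package is installed, and the workspace details, compute target name and experiment name are placeholders to replace with your own resources.
55 | 
56 | ```python
57 | from azureml.core import Workspace, Experiment, ScriptRunConfig, Environment
58 | 
59 | # Placeholders: fill in your own Azure resources.
60 | ws = Workspace("<subscription_id>", "<resource_group>", "<workspace_name>")
61 | env = Environment.from_pip_requirements("pymarlin_ner", "requirements.txt")
62 | 
63 | src = ScriptRunConfig(
64 |     source_directory=".",
65 |     command="python test.py --data.train_filepath ./train_germ/train.tsv "
66 |             "--data.val_filepath ./val_germ/dev.tsv --config_path config_germ.yaml".split(),
67 |     compute_target=ws.compute_targets["<compute_name>"],
68 |     environment=env,
69 | )
70 | run = Experiment(ws, "germeval-ner").submit(src)
71 | print(run.get_portal_url())
72 | ```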
73 | 
74 | ## Model checkpoint extraction + Inference
75 | 
76 | You may want to further use this model checkpoint for inference or use it in your project. The instructions are [here](https://microsoft.github.io/PyMarlin/docs/plugins/hf_ner) under the evaluation section. Further, the notebook includes an inference section with the relevant code.
77 | 
-------------------------------------------------------------------------------- /examples/germ_text_ner/requirements.txt: --------------------------------------------------------------------------------
1 | pymarlin[plugins]
2 | torch==1.8.1+cu111
3 | -f https://download.pytorch.org/whl/torch_stable.html
4 | 
-------------------------------------------------------------------------------- /examples/germ_text_ner/test.py: --------------------------------------------------------------------------------
1 | from pymarlin.utils.logger.logging_utils import getlogger
2 | logger = getlogger(__name__, 'DEBUG')
3 | from pymarlin.plugins import HfNERPlugin
4 | 
5 | if __name__ == '__main__':
6 |     ########### Usage #############
7 |     # Constructing HfNERPlugin() parses the YAML config and sets up the data
8 |     # and module interfaces; setup_trainer() then builds the PyMarlin trainer.
9 |     plugin = HfNERPlugin()
10 | 
11 |     #### Cmdline: python test.py --data.train_filepath ./train_germ/train.tsv --data.val_filepath ./val_germ/dev.tsv --config_path config_germ.yaml
12 | 
13 |     plugin.setup_trainer()
14 |     trainer = plugin.trainer
15 |     trainer.train()
16 | 
-------------------------------------------------------------------------------- /examples/glue_text_benchmark/Dockerfile: --------------------------------------------------------------------------------
1 | FROM mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.0.3-cudnn8-ubuntu18.04
2 | RUN apt-get update
3 | 
4 | 
5 | # create conda environment
6 | RUN conda update -n base -c defaults conda -y
7 | RUN conda create -n marlin python=3.8 -y
8 | RUN echo ". /opt/miniconda/etc/profile.d/conda.sh" >> ~/.bashrc
9 | 
10 | 
11 | 
12 | ADD requirements.txt /workdir/
13 | WORKDIR /workdir
14 | 
15 | RUN /opt/miniconda/envs/marlin/bin/pip install -U -r requirements.txt
16 | 
17 | # Instructions to update docker image. (replace krishansubudhi with your dockerhub account name)
18 | # https://krishansubudhi.github.io/development/2019/09/23/CreatingDockerImage.html
19 | 
20 | # In a VM, Build
21 | # docker build --rm -t krishansubudhi/marlin:latest .
22 | 
23 | # Test
24 | # docker run --gpus all -it -d -p 5000:5000 krishansubudhi/marlin:latest
25 | # docker ps
26 | # CONTAINER_ID=4dd751e87293 # replace 4dd751e87293 with your container id
27 | # docker cp ./src $CONTAINER_ID:/workdir
28 | # docker attach $CONTAINER_ID
29 | 
30 | # Push new image to dockerhub
31 | # docker login
32 | # docker push krishansubudhi/marlin:latest
-------------------------------------------------------------------------------- /examples/glue_text_benchmark/README.md: --------------------------------------------------------------------------------
1 | # GLUE Finetuning
2 | This code works well in a virtual machine with a GPU (preferably an NVIDIA V100 or A100 for AMP support). More information can be found in this [blog](https://krishansubudhi.github.io/deeplearning/2020/12/09/run-ml-on-vm.html).
3 | 
4 | We can train any GLUE task using this code. This README only shows instructions for RTE; running other tasks is relatively straightforward.
5 | 
6 | This code can be used for any other single sentence or sentence pair classifier too. A new DataInterface needs to be created for non-GLUE datasets.
7 | 
8 | ## Move the code to GPU VM (Optional)
9 | 
10 |     scp -r -P $port .\examples\glue_text_benchmark\ $user@${machine}:/home/$user
11 |     ssh $user@$machine -p $port
12 | 
13 | ## Setup environment and dependencies
14 | 
15 |     conda create -n pymarlin
16 |     conda activate pymarlin
17 |     pip install -r requirements.txt
18 | ## Analyze data
19 | Script:
20 | 
21 |     python src/data.py rte
22 | 
23 | Result:
24 | 
25 |     DatasetDict({
26 |         train: Dataset({
27 |             features: ['sentence1', 'sentence2', 'label', 'idx'],
28 |             num_rows: 2490
29 |         })
30 |         validation: Dataset({
31 |             features: ['sentence1', 'sentence2', 'label', 'idx'],
32 |             num_rows: 277
33 |         })
34 |         test: Dataset({
35 |             features: ['sentence1', 'sentence2', 'label', 'idx'],
36 |             num_rows: 3000
37 |         })
38 |     })
39 | 
40 |     train data label distribution
41 |     idx label sentence1 sentence2
42 |     0 0 1 No Weapons of Mass Destruction Found in Iraq Yet. Weapons of Mass Destruction Found in Iraq.
43 |     1 1 0 A place of sorrow, after Pope John Paul II die... Pope Benedict XVI is the new leader of the Rom...
44 |     count ratio
45 |     label
46 |     0 1249 0.501606
47 |     1 1241 0.498394
48 |     count 2490.000000
49 |     mean 43.565462
50 |     std 32.389776
51 |     min 4.000000
52 |     50% 31.000000
53 |     95% 112.000000
54 |     99% 143.000000
55 |     99.9% 170.066000
56 |     max 239.000000
57 |     Name: sentence1, dtype: float64
58 |     count 2490.000000
59 |     mean 8.790361
60 |     std 4.396781
61 |     min 3.000000
62 |     50% 8.000000
63 |     95% 18.000000
64 |     99% 26.000000
65 |     99.9% 31.000000
66 |     max 41.000000
67 |     Name: sentence2, dtype: float64
68 | 
69 |     validation data label distribution
70 |     idx label sentence1 sentence2
71 |     0 0 1 Dana Reeve, the widow of the actor Christopher... Christopher Reeve had an accident.
72 |     1 1 0 Yet, we now are discovering that antibiotics a... Bacteria is winning the war against antibiotics.
73 |     count ratio
74 |     label
75 |     0 146 0.527076
76 |     1 131 0.472924
77 | 
78 | 
79 | ## Train and validate
80 | Script (single GPU):
81 | 
82 |     python src/train.py --config_path configs-roberta-base/rte.yaml
83 | 
84 | Script (multiple GPUs):
85 | 
86 |     python -m torch.distributed.launch --nproc_per_node 4 src/train.py --config_path configs-roberta-base/rte.yaml --tr.backend ddp-amp
87 | 
88 | Results:
89 | 
90 |     tensorboard --logdir logs_roberta_base
91 | 
92 | Tunnel to view the tensorboard UI (if using a VM):
93 | 
94 |     ssh -N -f -L 127.0.0.1:6006:127.0.0.1:6006 $user@${machine} -p $port
95 | 
96 | View Tensorboard UI:
97 | 
98 |     http://localhost:6006/
99 | 
100 | ![results](images/tensorboard_screenshot.jpg)
101 | ## infer
102 | 0 is entailment, 1 is not_entailment.
103 | 104 | Script 105 | 106 | python src/infer.py --config_path configs-roberta-base/rte.yaml 107 | 108 | Input 109 | 110 | sentence1 = ['No Weapons of Mass Destruction Found in Iraq Yet.', 111 | 'India is a hot country', 112 | 'Krishan has written this inference example'] 113 | sentence2 = ['Weapons of Mass Destruction Found in Iraq.', 114 | 'It\'s warm in india', 115 | 'Krishan is the author of this example'] 116 | 117 | Result: 118 | 119 | tensor([1, 0, 0]) -------------------------------------------------------------------------------- /examples/glue_text_benchmark/configs-roberta-base/cola.yaml: -------------------------------------------------------------------------------- 1 | #https://arxiv.org/pdf/1907.11692v1.pdf 2 | #We consider a limited hyperparameter 3 | # sweep for each task, with batch sizes ∈ {16, 32} 4 | # and learning rates ∈ {1e−5, 2e−5, 3e−5}, with a 5 | # linear warmup for the first 6% of steps followed by 6 | # a linear decay to 0. We finetune for 10 epochs and 7 | # perform early stopping based on each tasks evaluation metric on the dev set. 8 | # The rest of the hyperparameters remain the same as during pretraining. 9 | 10 | glue_task: 'cola' 11 | 12 | mi: 13 | encoder : "roberta-base" 14 | tokenizer: "roberta-base" 15 | num_labels : 2 16 | max_lr : 0.00002 17 | warmup : 0.06 18 | tr: 19 | max_train_steps_per_epoch : null # Maximum train steps per epoch. 20 | max_val_steps_per_epoch : null # Maximum validation steps per epoch. 21 | train_batch_size: 32 # Training global batch size. 22 | val_batch_size: 64 # Validation global batch size. 23 | epochs: 10 # Total epochs to run. 24 | gpu_batch_size_limit : 32 # Max limit for GPU batch size during training. 25 | disable_tqdm : False 26 | writers: ["tensorboard"] 27 | backend: 'sp' 28 | 29 | wrt: 30 | tb_log_dir : 'logs_roberta_base/cola/from_pretrained' 31 | 32 | stat: 33 | log_steps : 20 34 | 35 | dist: 36 | local_rank : 3 37 | period: 5 38 | 39 | ckp: 40 | checkpoint : False -------------------------------------------------------------------------------- /examples/glue_text_benchmark/configs-roberta-base/mnli.yaml: -------------------------------------------------------------------------------- 1 | glue_task: 'mnli' 2 | 3 | mi: 4 | encoder : "roberta-base" 5 | tokenizer: "roberta-base" 6 | num_labels : 3 7 | max_lr : 0.00002 8 | s1_key : 'premise' 9 | s2_key : 'hypothesis' 10 | tr: 11 | max_train_steps_per_epoch : null # Maximum train steps per epoch. 12 | max_val_steps_per_epoch : null # Maximum validation steps per epoch. 13 | train_batch_size: 32 # Training global batch size. 14 | val_batch_size: 64 # Validation global batch size. 15 | epochs: 3 # Total epochs to run. 16 | gpu_batch_size_limit : 16 # Max limit for GPU batch size during training. 
17 | disable_tqdm : False 18 | writers: ["tensorboard"] 19 | backend: 'sp' 20 | 21 | wrt: 22 | tb_log_dir : 'logs_roberta_base/mnli/from_pretrained' 23 | 24 | stat: 25 | log_steps : 20 26 | 27 | dist: 28 | local_rank : 3 29 | 30 | ckp: 31 | checkpoint : False 32 | period: 2 33 | 34 | # python -m torch.distributed.launch --nproc_per_node 2 src/mtl/finetune_glue.py --config_path configs/roberta-base/mnli.yaml --distributed -------------------------------------------------------------------------------- /examples/glue_text_benchmark/configs-roberta-base/mrpc.yaml: -------------------------------------------------------------------------------- 1 | glue_task: 'mrpc' 2 | 3 | mi: 4 | encoder : "roberta-base" 5 | tokenizer: "roberta-base" 6 | num_labels : 2 7 | max_lr : 0.00002 8 | s1_key : 'sentence1' 9 | s2_key : 'sentence2' 10 | max_length : 128 11 | warmup : 0.06 12 | tr: 13 | max_train_steps_per_epoch : null # Maximum train steps per epoch. 14 | max_val_steps_per_epoch : null # Maximum validation steps per epoch. 15 | train_batch_size: 32 # Training global batch size. 16 | val_batch_size: 64 # Validation global batch size. 17 | epochs: 10 # Total epochs to run. 18 | gpu_batch_size_limit : 32 # Max limit for GPU batch size during training. 19 | disable_tqdm : False 20 | writers: ["tensorboard"] 21 | backend: 'sp' 22 | 23 | wrt: 24 | tb_log_dir : 'logs_roberta_base/mrpc/from_pretrained' 25 | 26 | stat: 27 | log_steps : 20 28 | dist: 29 | local_rank : 3 30 | ckp: 31 | checkpoint : False 32 | period: 5 -------------------------------------------------------------------------------- /examples/glue_text_benchmark/configs-roberta-base/qnli.yaml: -------------------------------------------------------------------------------- 1 | glue_task: 'qnli' 2 | 3 | mi: 4 | encoder : "roberta-base" 5 | tokenizer: "roberta-base" 6 | num_labels : 2 7 | max_lr : 0.00002 8 | s1_key : 'question' 9 | s2_key : 'sentence' 10 | max_length : 512 11 | tr: 12 | max_train_steps_per_epoch : null # Maximum train steps per epoch. 13 | max_val_steps_per_epoch : null # Maximum validation steps per epoch. 14 | train_batch_size: 32 # Training global batch size. 15 | val_batch_size: 64 # Validation global batch size. 16 | epochs: 4 # Total epochs to run. 17 | gpu_batch_size_limit : 16 # Max limit for GPU batch size during training. 18 | disable_tqdm : False 19 | writers: ["tensorboard"] 20 | backend: 'sp' 21 | wrt: 22 | tb_log_dir : 'logs_roberta_base/qnli/from_pretrained' 23 | 24 | stat: 25 | log_steps : 20 26 | dist: 27 | local_rank : 0 28 | ckp: 29 | checkpoint : False 30 | period: 2 -------------------------------------------------------------------------------- /examples/glue_text_benchmark/configs-roberta-base/qqp.yaml: -------------------------------------------------------------------------------- 1 | glue_task: 'qqp' 2 | 3 | mi: 4 | encoder : "roberta-base" 5 | tokenizer: "roberta-base" 6 | num_labels : 2 7 | max_lr : 0.00002 8 | max_length : 128 9 | warmup : 0.06 10 | tr: 11 | max_train_steps_per_epoch : null # Maximum train steps per epoch. 12 | max_val_steps_per_epoch : null # Maximum validation steps per epoch. 13 | train_batch_size: 32 # Training global batch size. 14 | val_batch_size: 64 # Validation global batch size. 15 | epochs: 10 # Total epochs to run. 16 | gpu_batch_size_limit : 32 # Max limit for GPU batch size during training. 
17 | disable_tqdm : False 18 | writers: ["tensorboard"] 19 | backend: 'sp' 20 | 21 | wrt: 22 | tb_log_dir : 'logs_roberta_base/qqp/from_pretrained' 23 | 24 | stat: 25 | log_steps : 20 26 | 27 | dist: 28 | local_rank : 3 29 | period: 5 30 | 31 | ckp: 32 | checkpoint : False -------------------------------------------------------------------------------- /examples/glue_text_benchmark/configs-roberta-base/rte.yaml: -------------------------------------------------------------------------------- 1 | glue_task : 'rte' 2 | 3 | mi: 4 | encoder : "roberta-base" 5 | tokenizer: "roberta-base" 6 | num_labels : 2 7 | max_lr : 0.00002 8 | s1_key : 'sentence1' 9 | s2_key : 'sentence2' 10 | max_length : 128 11 | warmup : 0.06 12 | tr: 13 | max_train_steps_per_epoch : null # Maximum train steps per epoch. 14 | max_val_steps_per_epoch : null # Maximum validation steps per epoch. 15 | train_batch_size: 32 # Training global batch size. 16 | val_batch_size: 64 # Validation global batch size. 17 | epochs: 10 # Total epochs to run. 18 | gpu_batch_size_limit : 32 # Max limit for GPU batch size during training. 19 | disable_tqdm : False 20 | writers: ["tensorboard"] 21 | backend: 'sp' 22 | 23 | wrt: 24 | tb_log_dir : 'logs_roberta_base/rte/from_pretrained' 25 | 26 | stat: 27 | log_steps : 20 28 | dist: 29 | local_rank : 1 30 | ckp: 31 | checkpoint : True 32 | period: 5 -------------------------------------------------------------------------------- /examples/glue_text_benchmark/configs-roberta-base/sst2.yaml: -------------------------------------------------------------------------------- 1 | glue_task: 'sst2' 2 | 3 | mi: 4 | encoder : "roberta-base" 5 | tokenizer: "roberta-base" 6 | num_labels : 2 7 | max_lr : 0.00002 8 | max_length : 128 9 | tr: 10 | max_train_steps_per_epoch : null # Maximum train steps per epoch. 11 | max_val_steps_per_epoch : null # Maximum validation steps per epoch. 12 | train_batch_size: 32 # Training global batch size. 13 | val_batch_size: 64 # Validation global batch size. 14 | epochs: 3 # Total epochs to run. 15 | gpu_batch_size_limit : 32 # Max limit for GPU batch size during training. 16 | disable_tqdm : False 17 | writers: ["tensorboard"] 18 | backend: 'sp' 19 | 20 | wrt: 21 | tb_log_dir : 'logs_roberta_base/sst2/from_pretrained' 22 | 23 | stat: 24 | log_steps : 20 25 | dist: 26 | local_rank : 1 27 | ckp: 28 | checkpoint : False 29 | period: 2 -------------------------------------------------------------------------------- /examples/glue_text_benchmark/configs-roberta-base/stsb.yaml: -------------------------------------------------------------------------------- 1 | glue_task: 'stsb' 2 | 3 | mi: 4 | encoder : "roberta-base" 5 | tokenizer: "roberta-base" 6 | num_labels : 1 7 | max_lr : 0.00002 8 | s1_key : 'sentence1' 9 | s2_key : 'sentence2' 10 | max_length : 128 11 | tr: 12 | max_train_steps_per_epoch : null # Maximum train steps per epoch. 13 | max_val_steps_per_epoch : null # Maximum validation steps per epoch. 14 | train_batch_size: 32 # Training global batch size. 15 | val_batch_size: 64 # Validation global batch size. 16 | epochs: 10 # Total epochs to run. 17 | gpu_batch_size_limit : 32 # Max limit for GPU batch size during training. 
18 | disable_tqdm : False 19 | writers: ["tensorboard"] 20 | backend: 'sp' 21 | 22 | wrt: 23 | tb_log_dir : 'logs_roberta_base/stsb/from_pretrained' 24 | 25 | stat: 26 | log_steps : 20 27 | dist: 28 | local_rank : 2 29 | ckp: 30 | checkpoint : False 31 | period: 5 -------------------------------------------------------------------------------- /examples/glue_text_benchmark/images/tensorboard_screenshot.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/examples/glue_text_benchmark/images/tensorboard_screenshot.jpg -------------------------------------------------------------------------------- /examples/glue_text_benchmark/logs_roberta_base/rte/from_pretrained/events.out.tfevents.1623336412.krishan-vm.20548.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/examples/glue_text_benchmark/logs_roberta_base/rte/from_pretrained/events.out.tfevents.1623336412.krishan-vm.20548.0 -------------------------------------------------------------------------------- /examples/glue_text_benchmark/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets 2 | pymarlin 3 | torch==1.8.1+cu111 4 | -f https://download.pytorch.org/whl/torch_stable.html 5 | transformers 6 | sklearn 7 | matplotlib 8 | mock -------------------------------------------------------------------------------- /examples/glue_text_benchmark/src/data.py: -------------------------------------------------------------------------------- 1 | from pymarlin.core.data_interface import DataInterface, DataProcessor 2 | from datasets import load_dataset 3 | import matplotlib.pyplot as plt 4 | import pandas as pd 5 | 6 | cache_dir =r"/tmp/hf_data" 7 | class GlueDataAnalyzer(DataProcessor): 8 | def __init__(self, glue_task): 9 | self.datasets = load_dataset("glue",glue_task, cache_dir = cache_dir) 10 | 11 | def process(self): 12 | pass 13 | 14 | def analyze(self): 15 | print(self.datasets) 16 | for split in self.datasets.keys(): 17 | self.analyze_split(split) 18 | 19 | def analyze_split(self,split = 'train'): 20 | print(f'\n{split} data label distribution') 21 | df = pd.DataFrame(self.datasets[split]) 22 | print(df.head(2)) 23 | count = df.groupby('label')['label'].count() 24 | summary = pd.DataFrame({'count':count, 'ratio': count/len(df)}) 25 | print(summary) 26 | return df 27 | 28 | def analyze_texts(self, texts): 29 | s = pd.Series( texts) 30 | s = s.apply(lambda cell : cell.split()) 31 | print(s.apply(len).describe(percentiles = [0.5,0.95,0.99,0.999])) 32 | 33 | class SentenceDataAnalyzer(GlueDataAnalyzer): 34 | def __init__(self, glue_task, sentence_key = 'sentence'): 35 | super().__init__(glue_task) 36 | self.sentence_key = sentence_key 37 | def analyze_split(self, split = 'train'): 38 | df = super().analyze_split(split) 39 | self.analyze_texts(df[self.sentence_key]) 40 | 41 | class SPDataAnalyzer(GlueDataAnalyzer): 42 | def __init__(self, glue_task, s1_key = 'question1', s2_key = 'question2'): 43 | super().__init__(glue_task) 44 | self.s1_key = s1_key 45 | self.s2_key = s2_key 46 | def analyze_split(self, split = 'train'): 47 | df = super().analyze_split(split) 48 | self.analyze_texts(df[self.s1_key]) 49 | self.analyze_texts(df[self.s2_key]) 50 | 51 | 52 | class SPRegressionDataAnalyzer(SPDataAnalyzer): 53 | def __init__(self, glue_task, 
s1_key = 'question1', s2_key = 'question2'):
54 |         super().__init__(glue_task)
55 |         self.s1_key = s1_key
56 |         self.s2_key = s2_key
57 |     def analyze_split(self, split = 'train'):
58 |         print(f'\n{split} data label distribution')
59 |         df = pd.DataFrame(self.datasets[split])
60 |         print(df.head(2))
61 |         # print(df.label.describe())
62 |         self.analyze_texts(df[self.s1_key])
63 |         self.analyze_texts(df[self.s2_key])
64 | 
65 | def analyzer_factory(glue_task):
66 |     # Store constructors instead of instances so that only the requested
67 |     # task's dataset is downloaded, not every GLUE task's.
68 |     factory = {
69 |         'default': lambda: GlueDataAnalyzer(glue_task),
70 |         'qqp': lambda: SPDataAnalyzer('qqp'),
71 |         'rte': lambda: SPDataAnalyzer('rte', 'sentence1', 'sentence2'),
72 |         'mnli': lambda: SPDataAnalyzer('mnli', 'premise', 'hypothesis'),
73 |         'qnli': lambda: SPDataAnalyzer('qnli', 'question', 'sentence'),
74 |         'sst2': lambda: SentenceDataAnalyzer('sst2'),
75 |         'stsb': lambda: SPRegressionDataAnalyzer('stsb', 'sentence1', 'sentence2'),
76 |         'wnli': lambda: SPDataAnalyzer('wnli', 'sentence1', 'sentence2'),
77 |         'mrpc': lambda: SPDataAnalyzer('mrpc', 'sentence1', 'sentence2'),
78 |     }
79 |     glue_task = glue_task if glue_task in factory else 'default'
80 |     return factory[glue_task]()
81 | 
82 | class GlueData(DataInterface):
83 |     def setup_datasets(self, glue_task = 'cola'):
84 |         self.glue_task = glue_task
85 |         datasets = load_dataset("glue", glue_task, cache_dir = cache_dir)
86 |         self.train_ds = datasets['train']
87 |         if glue_task == 'mnli':
88 |             self.val_ds = {'mnli_matched':datasets['validation_matched'],'mnli_mismatched':datasets['validation_mismatched']}
89 |             self.test_ds = [datasets['test_matched'],datasets['test_mismatched']]
90 |         else:
91 |             self.val_ds = datasets['validation']
92 |             self.test_ds = datasets['test']
93 | 
94 |     def get_train_dataset(self):
95 |         return self.train_ds
96 | 
97 |     def get_val_dataset(self):
98 |         return self.val_ds
99 | 
100 |     def get_test_dataset(self):
101 |         return self.test_ds
102 | 
103 | if __name__ == "__main__":
104 |     import sys
105 | 
106 |     glue_task = sys.argv[1] if len(sys.argv) > 1 else 'cola'
107 |     print(glue_task)
108 |     di = GlueData()
109 |     di.setup_datasets(glue_task=glue_task)
110 |     dp = analyzer_factory(glue_task)
111 |     dp.process_data()
112 | 
113 |     # python src/data.py rte
-------------------------------------------------------------------------------- /examples/glue_text_benchmark/src/infer.py: --------------------------------------------------------------------------------
1 | from train import *
2 | from mock import MagicMock
3 | 
4 | def load_classifier():
5 |     config = CustomArgParser().parse()
6 |     checkpoint_path = 'checkpoints/model_9.pt'
7 |     glue_task = config['glue_task']
8 |     # Inference never touches the datasets, so a MagicMock stands in for the
9 |     # DataInterface that recipe_factory normally receives.
10 |     data = MagicMock()
11 |     classifier = recipe_factory(glue_task, data_interface = data, **config['mi'])
12 |     sd = torch.load(checkpoint_path, map_location = 'cpu')['module_interface_state']
13 |     classifier.load_state_dict(sd)
14 |     return classifier
15 | 
16 | if __name__ == "__main__":
17 |     classifier = load_classifier()
18 | 
19 | 
20 |     #RTE
21 |     sentence1 = ['No Weapons of Mass Destruction Found in Iraq Yet.',
22 |                  'India is a hot country',
23 |                  'Krishan has written this inference example']
24 |     sentence2 = ['Weapons of Mass Destruction Found in Iraq.',
25 |                  'It\'s warm in india',
26 |                  'Krishan is the author of this example']
27 |     input = classifier.tokenizer(
28 |         text = sentence1,
29 |         text_pair = sentence2,
30 |         max_length=classifier.max_length,
31 |         return_tensors="pt",
32 |         padding=True,
33 |         truncation=True,
34 |     )
35 |     output = classifier.net(classifier.encoder(**input))
36 |     result = torch.argmax(output.logits, dim = -1)
37 |     print(result)
38 | 
39 | 
40 | 
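41 | # Example usage (same command as in the README):
42 | # python src/infer.py --config_path configs-roberta-base/rte.yaml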
-------------------------------------------------------------------------------- /examples/readme.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | A collection of examples of PyMarlin in action! 4 | 5 | 1. Classification on Kaggle `covid19-nlp-text-classification` with BERT 6 | 2. Summarization on `CNN/DailyMail` with BART (and optionally with ORT+DeepSpeed extensions) 7 | 3. `GLUE Benchmark` with RoBERTa 8 | 4. Named Entity Recognition (Plugin) on `GERM` with bert-base-multilingual-cased 9 | 5. CIFAR Image Classification notebook -------------------------------------------------------------------------------- /examples/snli_benchmark/configs-bert-base/snli.yaml: -------------------------------------------------------------------------------- 1 | glue_task : 'snli' 2 | 3 | mi: 4 | encoder : "bert-base-cased" 5 | tokenizer: "bert-base-cased" 6 | num_labels : 3 7 | lr : 0.0005 8 | s1_key : 'premise' 9 | s2_key : 'hypothesis' 10 | max_length : 128 11 | warmup : 0.06 12 | tr: 13 | clip_grads: False 14 | max_train_steps_per_epoch : null # Maximum train steps per epoch. 15 | max_val_steps_per_epoch : null # Maximum validation steps per epoch. 16 | train_batch_size: 32 # Training global batch size. 17 | val_batch_size: 64 # Validation global batch size. 18 | epochs: 3 # Total epochs to run. 19 | gpu_batch_size_limit : 4 # Max limit for GPU batch size during training. 20 | disable_tqdm : True 21 | writers: ['stdout', 'aml', 'tensorboard'] 22 | backend: 'ddp-dp' 23 | 24 | dp: 25 | per_sample_max_grad_norm: 1.0 26 | noise_multiplier: 0.4 27 | sample_rate: 0.00005818 #snli: 32/550000 28 | target_delta: 0.000001818 #snli: 1/550000 29 | 30 | wrt: 31 | tb_log_dir : 'logs_bert_base/snli/from_pretrained' 32 | 33 | stat: 34 | log_steps : 20 35 | 36 | dist: 37 | local_rank : 1 38 | 39 | ckp: 40 | checkpoint : False 41 | period: 5 42 | -------------------------------------------------------------------------------- /examples/snli_benchmark/src/data.py: -------------------------------------------------------------------------------- 1 | from pymarlin.core.data_interface import DataInterface 2 | from datasets import load_dataset 3 | 4 | class SnliData(DataInterface): 5 | def setup_datasets(self, task): 6 | self.task = task 7 | datasets = load_dataset(self.task) 8 | self.train_ds = datasets['train'] 9 | self.train_ds = self.train_ds.filter(lambda x: x["label"] != -1) 10 | 11 | self.val_ds = datasets['validation'] 12 | self.val_ds = self.val_ds.filter(lambda x: x["label"] != -1) 13 | 14 | self.test_ds = datasets['test'] 15 | self.test_ds = self.test_ds.filter(lambda x: x["label"] != -1) 16 | 17 | def get_train_dataset(self): 18 | return self.train_ds 19 | 20 | def get_val_dataset(self): 21 | return self.val_ds 22 | 23 | def get_test_dataset(self): 24 | return self.test_ds 25 | -------------------------------------------------------------------------------- /pymarlin/__init__.py: -------------------------------------------------------------------------------- 1 | """A lightweight library for Deep Learning model training""" 2 | 3 | __version__ = '0.3.5' 4 | from pymarlin.core.trainer import ( 5 | TrainerArguments, 6 | Trainer, 7 | ) 8 | from pymarlin.core.data_interface import ( 9 | DataProcessor, 10 | DataInterface, 11 | ) 12 | from pymarlin.core.module_interface import ( 13 | CallbackInterface, 14 | ModuleInterface, 15 | ) 16 | from pymarlin.core.trainer_backend import ( 17 | SingleProcess, 18 | SingleProcessAmp, 19 | SingleProcessApexAmp, 20 | 
DDPTrainerBackend, 21 | ) 22 | 23 | from pymarlin.utils.checkpointer.checkpoint_utils import ( 24 | DefaultCheckpointerArguments, 25 | DefaultCheckpointer, 26 | ) 27 | from pymarlin.utils.config_parser.custom_arg_parser import CustomArgParser 28 | from pymarlin.utils.stats.basic_stats import BasicStats 29 | -------------------------------------------------------------------------------- /pymarlin/core/__init__.py: -------------------------------------------------------------------------------- 1 | '''Empty init file''' 2 | -------------------------------------------------------------------------------- /pymarlin/plugins/__init__.py: -------------------------------------------------------------------------------- 1 | '''pymarlin.plugins''' 2 | from .plugin_module_interface import PluginModuleInterface 3 | from .hf_seq_classification import HfSeqClassificationPlugin 4 | from .hf_ner import HfNERPlugin 5 | from .hf_seq2seq import HfSeq2SeqPlugin 6 | -------------------------------------------------------------------------------- /pymarlin/plugins/base.py: -------------------------------------------------------------------------------- 1 | """ Base class for all plugins. """ 2 | from abc import abstractmethod 3 | from typing import Optional, Dict 4 | from pymarlin.core import module_interface, data_interface 5 | from pymarlin.core import trainer as trn 6 | from pymarlin.utils.config_parser.custom_arg_parser import CustomArgParser 7 | 8 | 9 | class Plugin: 10 | """Base class for all plugins. 11 | 12 | It is structured around three core components 13 | [trn.Trainer, module_interface.ModuleInterface, data_interface.DataInterface]. 14 | Derived classes should implement the methods `setup_data`, 15 | `setup_module`, and `setup`. These methods will execute the data processing 16 | pipeline and initialize the required components for training such as 17 | `trainer` and `module_interface`. `setup_trainer` initializes the PyMarlin 18 | trainer and backend. 19 | 20 | `plugin.setup` is provided to bootstrap the entire pipeline for a specific 21 | downstream task. 22 | Example:: 23 | 24 | trainer = plugin.setup() 25 | trainer.train() 26 | trainer.validate() 27 | """ 28 | 29 | def __init__(self, config: Optional[Dict] = None): 30 | """CustomArgParser parses YAML config located at cmdline --config_path. If --config_path 31 | is not provided, assumes YAML file is named config.yaml and present in working directory. 32 | self.trainer_args (trn.TrainerArguments): Instantiated dataclass containing 33 | args required to initialize trn.Trainer class. 34 | """ 35 | if config is None: 36 | config = CustomArgParser().parse() 37 | self.trainer_args = trn.TrainerArguments( 38 | **config["trainer"], 39 | stats_args=trn.stats.StatInitArguments(**config["stats"]), 40 | writer_args=trn.WriterInitArguments(**config["wrts"]), 41 | checkpointer_args=trn.DefaultCheckpointerArguments(**config["ckpt"]) 42 | ) 43 | 44 | @property 45 | def datainterface(self): 46 | """DataInterface object used for data processing. 47 | The property can be set in `setup_datainterface`. 48 | 49 | Returns: 50 | An object of type data_interface.DataInterface. 51 | """ 52 | return self._datainterface 53 | 54 | @datainterface.setter 55 | def datainterface(self, data_interface_obj: data_interface.DataInterface): 56 | assert isinstance(data_interface_obj, data_interface.DataInterface) 57 | self._datainterface = data_interface_obj 58 | 59 | @property 60 | def dataprocessor(self): 61 | """DataProcessor object(s) used for data processing. 
62 |         The property may be used in conjunction with `datainterface` in the
63 |         `setup_datainterface` method.
64 | 
65 |         Returns:
66 |             An object of type data_interface.DataProcessor.
67 |         """
68 |         return self._dataprocessor
69 | 
70 |     @dataprocessor.setter
71 |     def dataprocessor(self, data_processor_obj: data_interface.DataProcessor):
72 |         assert isinstance(data_processor_obj, data_interface.DataProcessor)
73 |         self._dataprocessor = data_processor_obj
74 | 
75 |     @property
76 |     def moduleinterface(self):
77 |         """ModuleInterface object.
78 |         The property can be set in `setup_module`.
79 | 
80 |         Returns:
81 |             An object of type module_interface.ModuleInterface.
82 |         """
83 |         return self._moduleinterface
84 | 
85 |     @moduleinterface.setter
86 |     def moduleinterface(self, module_interface_obj: module_interface.ModuleInterface):
87 |         assert isinstance(module_interface_obj, module_interface.ModuleInterface)
88 |         self._moduleinterface = module_interface_obj
89 | 
90 |     @property
91 |     def trainer(self):
92 |         """Trainer object.
93 |         The property can be set in `setup_trainer`.
94 | 
95 |         Returns:
96 |             An object of type trn.Trainer.
97 |         """
98 |         return self._trainer
99 | 
100 |     @trainer.setter
101 |     def trainer(self, trainer_obj: trn.Trainer):
102 |         assert isinstance(trainer_obj, trn.Trainer)
103 |         self._trainer = trainer_obj
104 | 
105 |     @abstractmethod
106 |     def setup_datainterface(self, *args: Optional):
107 |         """Derived plugins must implement this method. The method should
108 |         execute a generic data processing pipeline for the task and update the
109 |         TaskDataInterface object to contain the processed train and val datasets.
110 | 
111 |         NOTE to TaskPlugin designers: Typically, the plugin shouldn't need
112 |         any input arguments from the user except the YAML config. DataInterface and
113 |         DataProcessor related arguments should be processed in the __init__ method of
114 |         the TaskPlugin.
115 | 
116 |         Returns:
117 |             datainterface_obj (data_interface.DataInterface): TaskDataInterface object
118 |         """
119 | 
120 |     @abstractmethod
121 |     def setup_module(self, *args: Optional):
122 |         """Derived plugins must implement this method. The method should
123 |         create a TaskModuleInterface object (module_interface.ModuleInterface)
124 |         and set the `moduleinterface` property.
125 | 
126 |         NOTE to TaskPlugin designers: Typically, the plugin shouldn't need
127 |         any input arguments from the user. ModuleInterface related arguments should be
128 |         processed in the __init__ method of the TaskPlugin.
129 |         """
130 | 
131 |     def setup_trainer(self):
132 |         """Creates a trn.Trainer object and sets the `trainer` property.
133 |         Used by all plugins unless overridden (not recommended).
134 |         """
135 |         self.trainer = trn.Trainer(args=self.trainer_args, module=self.moduleinterface)
136 | 
137 |     @abstractmethod
138 |     def setup(self, **kwargs):
139 |         """Executes all steps from data processing to trainer initialization.
140 | 
141 |         This should be equivalent to::
142 | 
143 |             plugin.setup_datainterface()
144 |             plugin.setup_module()
145 |             plugin.setup_trainer()
146 |         """
-------------------------------------------------------------------------------- /pymarlin/plugins/hf_ner/__init__.py: --------------------------------------------------------------------------------
1 | from .implementation import HfNERPlugin
-------------------------------------------------------------------------------- /pymarlin/plugins/hf_ner/config_germ.yaml: --------------------------------------------------------------------------------
1 | # Group names below are used to parse these arguments when passed from the command line.
2 | # Example usage in command-line: --module.max_lr 4E-5
3 | 
4 | # data_processor args
5 | data:
6 |   train_dir : null
7 |   val_dir : null
8 |   labels_list: [B-LOC, B-LOCderiv, B-LOCpart, B-ORG, B-ORGderiv, B-ORGpart, B-OTH, B-OTHderiv,
9 |     B-OTHpart, B-PER, B-PERderiv, B-PERpart, I-LOC, I-LOCderiv, I-LOCpart, I-ORG, I-ORGderiv,
10 |     I-ORGpart, I-OTH, I-OTHderiv, I-OTHpart, I-PER, I-PERderiv, I-PERpart, O]
11 |   max_seq_len: 128
12 |   pad_label_id: -100
13 |   has_labels: True
14 |   tokenizer: "bert-base-multilingual-cased"
15 |   file_format: "tsv"
16 |   label_all_tokens: False
17 | 
18 | # model arguments
19 | model:
20 |   model_name: "bert"
21 |   encoder_key: "bert"
22 |   hf_model: "bert-base-multilingual-cased"
23 |   model_file: "pytorch_model.bin"
24 |   model_config_file: "config.json"
25 |   model_path: null
26 |   model_config_path: null
27 | 
28 | # module_interface arguments
29 | module:
30 |   output_dir: null
31 |   max_lr : 0.00003 # Maximum learning rate.
32 |   warmup_prop: 0.1
33 |   has_labels: True
34 | 
35 | # distill module arguments
36 | distill:
37 |   enable: False
38 |   student_model_config_path: null
39 |   student_model_config_file: null
40 |   student_model_path: null
41 |   student_model_file: null
42 |   student_layers: [0,6,11]
43 |   loss_types: ["logits"]
44 |   loss_weights: [1]
45 |   temperature: 1
46 | 
47 | # trainer arguments
48 | trainer:
49 |   backend: "sp"
50 |   train_batch_size: 32 # Training global batch size.
51 |   val_batch_size: 16 # Validation global batch size.
52 |   epochs: 1 # Total epochs to run.
53 |   gpu_batch_size_limit : 8 # Max limit for GPU batch size during training.
54 |   clip_grads : True # Enable or disable clipping of gradients.
55 |   use_gpu: True # Enable or disable use of GPU.
56 |   max_grad_norm: 1.0 # Maximum value for gradient norm.
57 |   writers: ['stdout', 'aml', 'tensorboard'] # List of all the writers to use.
58 |   disable_tqdm: True
59 |   log_level: "DEBUG"
60 |   max_train_steps_per_epoch: 1
61 |   max_val_steps_per_epoch: 1
62 | 
63 | # Checkpointer arguments
64 | ckpt:
65 |   checkpoint: False # Flag indicating whether to checkpoint model.
66 |   delete_existing_checkpoints: False
67 |   period: 1 # Period of epochs at which to checkpoint model.
68 |   save_dir: 'ckpts' # Path to directory where checkpoints are to be stored.
69 |   file_prefix: 'bert' # Prefix of the checkpoint filename.
70 |   file_ext: 'tar' # File extension for the checkpoint.
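71 | # Note: checkpoint files are written under save_dir as <file_prefix>_<epoch>.<file_ext>,
72 | # i.e. ckpts/bert_0.tar with the settings above (compare the covid19 example's
73 | # load_filename 'tweetClassification_0.pt').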
74 | 
75 | # Basic-Statistics arguments
76 | stats:
77 |   log_steps: 50
78 |   update_system_stats: False
79 |   log_model_steps: 1000
80 |   exclude_list: 'bias|LayerNorm|layer\\.[3-9]|layer\\.1(?!1)|layer\\.2(?!3)'
81 | 
82 | # Writers arguments
83 | wrts:
84 |   model_log_level : 'INFO'
85 |   tb_log_dir : 'logs'
86 |   tb_logpath_parent_env : null
87 |   tb_log_multi : False
88 |   tb_log_hist_steps : 20000
-------------------------------------------------------------------------------- /pymarlin/plugins/hf_ner/data_classes.py: --------------------------------------------------------------------------------
1 | '''pymarlin.plugins.hf_ner.data_classes'''
2 | import dataclasses
3 | import pandas as pd
4 | 
5 | from torch.utils.data import Dataset
6 | from pymarlin.utils.logger.logging_utils import getlogger
7 | from pymarlin.core import data_interface
8 | 
9 | logger = getlogger(__name__, "DEBUG")
10 | 
11 | @dataclasses.dataclass
12 | class DataArguments:
13 |     train_filepath: str = None
14 |     val_filepath: str = None
15 |     labels_list: list = None
16 |     has_labels: bool = True
17 |     file_format: str = "tsv"
18 | 
19 | 
20 | class NERBaseDataset(Dataset):
21 |     def __init__(self, args, input_filepath):
22 |         self.input_filepath = input_filepath
23 |         self.args = args
24 | 
25 |         if self.args.file_format == "tsv":
26 |             sep = "\t"
27 |         else:
28 |             sep = ","
29 |         self.df = pd.read_csv(self.input_filepath, sep=sep).dropna()
30 | 
31 |     def __len__(self):
32 |         return len(self.df)
33 | 
34 |     def __getitem__(self, idx):
35 |         # Tokens and labels are space separated and must align one to one.
36 |         record = self.df.iloc[idx]
37 |         sent = record["Sentence"].split(" ")
38 |         label = record["Slot"].split(" ")
39 |         assert len(sent) == len(label)
40 |         return sent, label
41 | 
42 | class NERDataInterface(data_interface.DataInterface):
43 |     '''NER Data Interface'''
44 |     def __init__(self, args):
45 |         super().__init__()
46 |         self.args = args
47 |         self.train_dataset = []
48 |         self.val_dataset = []
49 |         self._set_args()
50 | 
51 |     def setup_datasets(self):
52 |         self.train_dataset = NERBaseDataset(self.args, self.args.train_filepath)
53 |         self.val_dataset = NERBaseDataset(self.args, self.args.val_filepath)
54 | 
55 |     def get_train_dataset(self):
56 |         return self.train_dataset
57 | 
58 |     def get_val_dataset(self):
59 |         return self.val_dataset
60 | 
61 |     def get_labels(self):
62 |         return self.args.labels_list
63 | 
64 |     def _set_args(self):
65 |         self.label_map = (
66 |             {label: i for i, label in enumerate(self.args.labels_list)}
67 |             if self.args.labels_list is not None
68 |             else None
69 |         )
70 |         logger.info(f"Labels map = {self.label_map}")
-------------------------------------------------------------------------------- /pymarlin/plugins/hf_ner/implementation.py: --------------------------------------------------------------------------------
1 | from typing import Optional, Dict
2 | 
3 | from pymarlin.utils.config_parser.custom_arg_parser import CustomArgParser
4 | 
5 | from .data_classes import DataArguments, NERDataInterface
6 | from .module_classes import NERModule, ModuleInterfaceArguments, ModelArguments
7 | 
8 | from pymarlin.plugins.base import Plugin
9 | from pymarlin.plugins.hfdistill_utils import build_distill_module, DistillationArguments
10 | from pymarlin.utils.logger.logging_utils import getlogger
11 | 
12 | logger = getlogger(__name__, "DEBUG")
13 | 
14 | 
15 | class HfNERPlugin(Plugin):
16 |     """Named Entity Recognition or Token Classification plugin for HuggingFace models.
17 | 
18 |     Constructing the plugin parses the config and sets up the data and module
19 |     interfaces; setup_trainer() then builds a fully set up PyMarlin trainer.
20 |     Example:
21 | 
22 |         plugin = HfNERPlugin()
23 |         plugin.setup_trainer()
24 |         trainer = plugin.trainer
25 |         trainer.train()
26 |         trainer.validate()
27 |     """
28 | 
29 |     def __init__(self, config: Optional[Dict] = None):
30 |         """CustomArgParser parses YAML config located at cmdline --config_path. If --config_path
31 |         is not provided, assumes the YAML file is named config.yaml and present in the working directory.
32 |         Instantiates dataclasses:
33 |             self.data_args (DataArguments): args required to initialize the NERDataInterface class
34 |             self.module_args (ModuleInterfaceArguments): args required to initialize the NERModule class
35 | 
36 |         Sets properties:
37 |             self.datainterface: data_interface.DataInterface [NERDataInterface] object,
38 |             used to read the raw data and create the token sequences that are fed to
39 |             HuggingFace AutoModelForTokenClassification models.
40 |             self.moduleinterface: module_interface.ModuleInterface [NERModule] object,
41 |             used to initialize a Marlin trainer.
42 |         """
43 |         super().__init__(config=config)
44 |         if config is None:
45 |             config = CustomArgParser(log_level="DEBUG").parse()
46 |         self.data_args = DataArguments(**config["data"])
47 |         self.module_args = ModuleInterfaceArguments(
48 |             **config["module"], model_args=ModelArguments(**config["model"])
49 |         )
50 |         self.distill_args = DistillationArguments(**config["distill"])
51 | 
52 |         self.datainterface = NERDataInterface(self.data_args)
53 |         self.datainterface.setup_datasets()
54 |         module_class = NERModule
55 | 
56 |         module_params = [self.module_args, self.datainterface]
57 | 
58 |         if self.distill_args.enable:
59 |             module_params = [self.distill_args] + module_params
60 |             module_class = build_distill_module(module_class)
61 | 
62 |         self.moduleinterface = module_class(*module_params)
-------------------------------------------------------------------------------- /pymarlin/plugins/hf_seq2seq/__init__.py: --------------------------------------------------------------------------------
1 | from .implementation import HfSeq2SeqPlugin
-------------------------------------------------------------------------------- /pymarlin/plugins/hf_seq2seq/data_classes.py: --------------------------------------------------------------------------------
1 | import sys
2 | import dataclasses
3 | import os
4 | import pandas as pd
5 | import torch
6 | from pymarlin.core import data_interface
7 | import matplotlib
8 | 
9 | matplotlib.use("Agg")  # comment this out on a local machine to see the plots
10 | import matplotlib.pyplot as plt
11 | 
12 | def get_source_target(path="D:/data/cnn_cln", stage="val"):
13 |     source = os.path.join(path, f"{stage}.source")
14 |     target = os.path.join(path, f"{stage}.target")
15 |     return source, target
16 | 
17 | 
18 | class AnalyzeProcessor(data_interface.DataProcessor):
19 |     def __init__(self, source, target):
20 |         with open(source, "r", encoding="UTF-8") as f:
21 |             self.source = f.readlines()
22 |         with open(target, "r", encoding="UTF-8") as f:
23 |             self.target = f.readlines()
24 | 
25 |     def process(self):
26 |         pass
27 | 
28 |     def analyze(self):
29 |         self.df = pd.DataFrame({"source": self.source, "target": self.target})
30 |         print(self.df.head())
31 |         print("Word length analysis:")
32 |         wordlengths = self.df.applymap(lambda x: len(x.split()))
33 |         print(wordlengths.describe())
34 |         plt.plot(wordlengths)
35 |         plt.legend(["source", "target"])
36 | 
37 | 
38 | class HfSeq2SeqDataset(torch.utils.data.Dataset):
39 |     def __init__(self, source, target):
40 |         with open(source, "r", encoding="UTF-8") as f:
41 |             self.source = f.readlines()
42 |         with open(target, "r", encoding="UTF-8") as f:
43 |             self.target = f.readlines()
44 |         print(
45 |             "len(self.source), len(self.target) = ", len(self.source), len(self.target)
46 |         )
47 | 
48 |     def __getitem__(self, i):
49 |         # print('len(self.source), len(self.target) = ',len(self.source), len(self.target))
50 |         return self.source[i].strip(), self.target[i].strip()
51 | 
52 |     def __len__(self):
53 |         return len(self.target)
54 | 
55 | 
56 | @dataclasses.dataclass
57 | class DataInterfaceArguments:
58 |     data_dir: str = None
59 | 
60 | 
61 | class HfSeq2SeqData(data_interface.DataInterface):
62 |     """
63 |     Class which expects input data to have different files for source and target.
64 |     Returns datasets which yield non-tokenized source and target text.
65 |     """
66 | 
67 |     def __init__(self, args: DataInterfaceArguments):
68 |         self.args = args
69 | 
70 |     def setup_datasets(self):
71 |         self.train_ds = HfSeq2SeqDataset(
72 |             *get_source_target(self.args.data_dir, "train")
73 |         )
74 |         self.val_ds = HfSeq2SeqDataset(*get_source_target(self.args.data_dir, "val"))
75 |         print("self.train_ds length = ", len(self.train_ds))
76 | 
77 |     def get_train_dataset(self, *args, **kwargs):
78 |         return self.train_ds
79 | 
80 |     def get_val_dataset(self, *args, **kwargs):
81 |         return self.val_ds
82 | 
83 | 
84 | if __name__ == "__main__":
85 |     root = sys.argv[1]  # e.g. 'D:/data/cnn_cln'
86 |     dm = HfSeq2SeqData(DataInterfaceArguments(data_dir=root))
87 |     print("Train")
88 |     dm.process_data(AnalyzeProcessor(*get_source_target(path=root, stage="train")))
89 |     print("Val")
90 |     dm.process_data(AnalyzeProcessor(*get_source_target(path=root, stage="val")))
91 |     plt.show()
92 | 
93 |     # dm.setup_datasets()
94 |     # ds = dm.get_train_dataset()
95 |     # len(ds),ds[0]
96 | 
--------------------------------------------------------------------------------
/pymarlin/plugins/hf_seq2seq/implementation.py:
--------------------------------------------------------------------------------
1 | import os
2 | import multiprocessing
3 | 
4 | from pymarlin.utils.config_parser.custom_arg_parser import CustomArgParser
5 | from pymarlin.utils.logger.logging_utils import getlogger
6 | 
7 | logger = getlogger(__name__, "DEBUG")
8 | 
9 | from pymarlin.core import data_interface, module_interface
10 | from pymarlin.core import trainer as trn
11 | 
12 | from pymarlin.plugins.base import Plugin
13 | from .data_classes import HfSeq2SeqData, DataInterfaceArguments
14 | from .module_classes import (
15 |     HfSeq2SeqModule,
16 |     ModuleInterfaceArguments,
17 |     ModelArguments,
18 |     GenerateArguments,
19 | )
20 | 
21 | 
22 | class HfSeq2SeqPlugin(Plugin):
23 |     """Plugin for Text Sequence to Sequence Generation using Huggingface models.
24 | 
25 |     plugin.setup() bootstraps the entire pipeline and returns a fully set up trainer.
26 |     Example:
27 | 
28 |         trainer = plugin.setup()
29 |         trainer.train()
30 |         trainer.validate()
31 | 
32 |     Alternatively, you can run `setup_datainterface`, `setup_module`, and `setup_trainer` individually.
33 |     Example:
34 | 
35 |         plugin.setup_datainterface()
36 |         plugin.setup_module()
37 |         trainer = plugin.setup_trainer()
38 |     """
39 | 
40 |     def __init__(self, config=None):
41 |         """Accepts optional config dictionary.
42 |         CustomArgParser parses YAML config located at cmdline --config_path. If --config_path
43 |         is not provided, assumes YAML file is named config.yaml and present in working directory.
44 |         Instantiates dataclasses:
45 |             self.data_args (arguments.DataInterfaceArguments): DataInterface arguments
46 |             self.module_args (arguments.ModuleInterfaceArguments): ModuleInterface arguments
47 |         Sets properties:
48 |             self.datainterface: data_interface.DataInterface [HfSeq2SeqData] object
49 |             self.moduleinterface: module_interface.ModuleInterface [HfSeq2SeqModule] object
50 |         """
51 |         super().__init__()
52 |         if config is None:
53 |             config = CustomArgParser(log_level="DEBUG").parse()
54 |         self.data_args = DataInterfaceArguments(**config["data"])
55 |         self.module_args = ModuleInterfaceArguments(
56 |             **config["module"],
57 |             model_args=ModelArguments(**config["model"]),
58 |             generate_args=GenerateArguments(**config["generate"])
59 |         )
60 |         # self.distill_args = DistillationArguments(**config['distill'])
61 | 
62 |     def setup_datainterface(self):
63 |         """Instantiates HfSeq2SeqData and calls `datainterface.setup_datasets()`.
64 | 
65 |         Assumptions:
66 |             Training and validation files are placed in the same directory, data_args.data_dir.
67 |             Accepted file format: parallel source/target text lines in data_args.data_dir/{train,val}.{source,target}
68 |         """
69 |         self.datainterface = HfSeq2SeqData(self.data_args)
70 |         self.datainterface.setup_datasets()
71 | 
72 |     def setup_module(self):
73 |         """Sets `HfSeq2SeqModule.data` property to `datainterface` which contains
74 |         the processed datasets. Assertion error is thrown if `datainterface` retrieves no train
75 |         or val data, indicating that `datainterface` hasn't been setup with processed data.
76 |         Sets the `HfSeq2SeqModule.model` property after initializing weights:
77 |             Option 1: Load weights from specified files mentioned in YAML config
78 |                 model:
79 |                     model_config_path
80 |                     model_config_file
81 |                     model_path
82 |                     model_file
83 |             Option 2: Load from Huggingface model hub, specify string in YAML config as:
84 |                 model:
85 |                     hf_model
86 |         """
87 |         # datainterface should contain the processed datasets
88 |         assert (
89 |             len(self.datainterface.get_train_dataset()) != 0
90 |             or len(self.datainterface.get_val_dataset()) != 0
91 |         )
92 |         self.moduleinterface = HfSeq2SeqModule(self.datainterface, self.module_args)
93 | 
94 |     def setup(self):
95 |         """Executes all the setup methods required to create a trn.Trainer object.
96 |         Trainer needs `moduleinterface`, and the backend is specified by self.trainer_args.backend.
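        Supported backend strings, per the trainer section of the plugin
        config files: "sp", "sp-amp", "ddp", and "ddp-amp".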
97 | """ 98 | self.setup_datainterface() 99 | self.setup_module() 100 | self.setup_trainer() 101 | -------------------------------------------------------------------------------- /pymarlin/plugins/hf_seq2seq/metric_utils.py: -------------------------------------------------------------------------------- 1 | from rouge_score import rouge_scorer 2 | 3 | """ Metric Functions """ 4 | 5 | 6 | def get_metric_func(metric_name): 7 | METRIC_MAP = {"rouge": rouge} 8 | return METRIC_MAP[metric_name] 9 | 10 | 11 | def rouge(preds, labels): 12 | # All Rouge scores for CNN/DailyMail 13 | scorer = rouge_scorer.RougeScorer( 14 | ["rouge1", "rouge2", "rougeL", "rougeLsum"], use_stemmer=True 15 | ) 16 | agg_scores = {} 17 | 18 | # sum up fmeasures 19 | for pred, ref in zip(preds, labels): 20 | scores = scorer.score(pred, ref) 21 | for key in scores: 22 | if key not in agg_scores: 23 | agg_scores[key] = 0 24 | agg_scores[key] += scores[key].fmeasure 25 | 26 | # and divide to average 27 | for key in agg_scores: 28 | agg_scores[key] /= len(preds) 29 | 30 | return agg_scores 31 | -------------------------------------------------------------------------------- /pymarlin/plugins/hf_seq_classification/__init__.py: -------------------------------------------------------------------------------- 1 | from .implementation import HfSeqClassificationPlugin 2 | -------------------------------------------------------------------------------- /pymarlin/plugins/hf_seq_classification/config.yaml: -------------------------------------------------------------------------------- 1 | # Using abreviated letter coding for group names. This will be used to parse these arguments when passed from command line. 2 | # Example usage in command-line: --tmod.max_lr 4E-5 3 | 4 | # data arguments 5 | data: 6 | train_filepath: null 7 | val_filepath: null 8 | file_format: "csv" 9 | header: 0 # file has a header at row 0 10 | text_a_col: "OriginalTweet" 11 | text_b_col: null # null in config file is equivalent to None 12 | label_col: "Sentiment" 13 | labels_list: ["Extremely Negative","Negative","Neutral","Positive","Extremely Positive"] # list of labels which will be mapped in order from 0 to 4 for the model 14 | 15 | # model arguments 16 | model: 17 | tokenizer_path: null 18 | hf_model: "roberta-base" 19 | encoder_key: "roberta" 20 | model_config_path: null # provide path to model config dir 21 | model_config_file: "config.json" 22 | model_path: null # provide path to model weights dir 23 | model_file: "pytorch_model.bin" 24 | 25 | # module_interface arguments 26 | module: 27 | metric: "acc_and_f1" 28 | max_lr : 0.00002 29 | warmup_prop: 0.1 30 | has_labels: True 31 | max_seq_len: 128 32 | 33 | # distill module arguments 34 | distill: 35 | enable: False 36 | # config_output_dir: null 37 | student_model_config_path: null 38 | student_model_config_file: null 39 | student_model_path: null 40 | student_model_file: null 41 | student_layers: [0,6,11] 42 | loss_types: ["logits"] 43 | loss_weights: [1] 44 | temperature: 1 45 | 46 | # trainer arguments 47 | trainer: 48 | backend: "sp" # sp, sp-amp, ddp, ddp-amp 49 | train_batch_size: 32 # Training global batch size. 50 | val_batch_size: 16 # Validation global batch size. 51 | epochs: 3 # Total epochs to run. 52 | gpu_batch_size_limit : 8 # Max limit for GPU batch size during training. 53 | clip_grads : True # Enable or disable clipping of gradients. 54 | use_gpu: True # Enable or disable use of GPU. 55 | max_grad_norm: 1.0 # Maximum value for gradient norm. 
56 | disable_tqdm: True 57 | log_level: "INFO" 58 | 59 | # Checkpointer arguments 60 | ckpt: 61 | checkpoint: True # Flag indicating whether to checkpoint model. 62 | delete_existing_checkpoints: False 63 | period: 1 # Period of epochs at which to checkpoint model. 64 | save_dir: 'marlin_states' # Path to directory where checkpoints are to be stored. 65 | model_state_save_dir: 'model_ckpts' 66 | file_prefix: 'marlin' # Prefix of the checkpoint filename. 67 | file_ext: 'bin' # File extension for the checkpoint. 68 | 69 | # Basic-Statistics arguments 70 | stats: 71 | log_steps: 5 72 | update_system_stats: False 73 | log_model_steps: 1000 74 | exclude_list: 'bias|LayerNorm|layer\\.[3-9]|layer\\.1(?!1)|layer\\.2(?!3)' 75 | 76 | # Writers arguments 77 | wrts: 78 | model_log_level : 'INFO' 79 | tb_log_dir : 'logs' 80 | tb_logpath_parent_env : null 81 | tb_log_multi : False 82 | tb_log_hist_steps : 20000 -------------------------------------------------------------------------------- /pymarlin/plugins/hf_seq_classification/data_classes.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | import pandas as pd 3 | import torch 4 | from torch.utils.data import Dataset 5 | 6 | from transformers import InputFeatures 7 | 8 | from pymarlin.core import data_interface 9 | from pymarlin.utils.logger.logging_utils import getlogger 10 | logger = getlogger(__name__, "DEBUG") 11 | 12 | 13 | @dataclasses.dataclass 14 | class DataArguments: 15 | train_filepath: str = None 16 | val_filepath: str = None 17 | labels_list: list = None 18 | file_format: str = "tsv" 19 | header: int = None 20 | text_a_col: int or str = None 21 | text_b_col: int or str = None 22 | label_col: int or str = None 23 | 24 | class HfSeqClassificationDataset(Dataset): 25 | """PyTorch Dataset.""" 26 | 27 | def __init__(self, args, input_filepath, label_map): 28 | """ 29 | Args: 30 | args: DataInterface arguments 31 | input_filepath (str): Path to dataset 32 | label_map (dict): Map categorical values to numerical 33 | """ 34 | self.args = args 35 | self.label_map = label_map 36 | if self.args.file_format == "json": 37 | self.df = pd.read_json(input_filepath, lines=True) 38 | elif self.args.file_format in ["tsv", "csv"]: 39 | if self.args.file_format == "tsv": 40 | sep = "\t" 41 | else: 42 | sep = "," 43 | self.df = pd.read_csv(input_filepath, sep=sep, header=self.args.header) 44 | 45 | def __len__(self): 46 | return len(self.df) 47 | 48 | def __getitem__(self, idx): 49 | record = self.df.iloc[idx] 50 | if self.label_map is not None: 51 | label = self.label_map[record[self.args.label_col]] 52 | else: 53 | label = float(record[self.args.label_col]) 54 | 55 | if self.args.text_b_col is not None: 56 | return record[self.args.text_a_col], record[self.args.text_b_col], label 57 | else: 58 | return record[self.args.text_a_col], label 59 | 60 | class HfSeqClassificationDataInterface(data_interface.DataInterface): 61 | """Retrieves train and val PyTorch Datasets.""" 62 | 63 | def __init__(self, args): 64 | """ 65 | Args: 66 | args (arguments.DataArguments): Dataclass 67 | """ 68 | super().__init__() 69 | self.args = args 70 | self.train_dataset = [] 71 | self.val_dataset = [] 72 | self._set_args() 73 | 74 | def _set_args(self): 75 | if self.args.file_format in ["tsv", "csv"]: 76 | if self.args.file_format == "tsv": 77 | sep = "\t" 78 | else: 79 | sep = "," 80 | if self.args.header is None: # Refer by column numbers 81 | self.args.text_a_col = int(self.args.text_a_col) 82 | if 
self.args.text_b_col:
83 |                 self.args.text_b_col = int(self.args.text_b_col)
84 |             self.args.label_col = int(self.args.label_col)
85 |         self.label_map = (
86 |             {label: i for i, label in enumerate(self.args.labels_list)}
87 |             if len(self.args.labels_list) > 1
88 |             else None
89 |         )
90 | 
91 |     def setup_datasets(self):
92 |         if self.args.train_filepath is not None:
93 |             self.train_dataset = HfSeqClassificationDataset(self.args, self.args.train_filepath, self.label_map)
94 |         if self.args.val_filepath is not None:
95 |             self.val_dataset = HfSeqClassificationDataset(self.args, self.args.val_filepath, self.label_map)
96 | 
97 |     def get_train_dataset(self):
98 |         return self.train_dataset
99 | 
100 |     def get_val_dataset(self):
101 |         return self.val_dataset
102 | 
103 |     def get_labels(self):
104 |         return self.args.labels_list
105 | 
--------------------------------------------------------------------------------
/pymarlin/plugins/hf_seq_classification/implementation.py:
--------------------------------------------------------------------------------
1 | from pymarlin.utils.config_parser.custom_arg_parser import CustomArgParser
2 | from pymarlin.core import data_interface, module_interface
3 | from pymarlin.plugins.base import Plugin
4 | from pymarlin.plugins.hfdistill_utils import build_distill_module, DistillationArguments
5 | 
6 | from .data_classes import (
7 |     HfSeqClassificationDataInterface,
8 |     DataArguments,
9 | )
10 | from .module_classes import (
11 |     HfSeqClassificationModule,
12 |     ModuleInterfaceArguments,
13 |     ModelArguments,
14 | )
15 | from typing import Optional, Dict
16 | 
17 | 
18 | class HfSeqClassificationPlugin(Plugin):
19 |     """Plugin for Text Sequence Classification using Huggingface models.
20 | 
21 | 
22 |     plugin.setup() bootstraps the entire pipeline and returns a fully set up trainer.
23 |     Example::
24 | 
25 |         trainer = plugin.setup()
26 |         trainer.train()
27 |         trainer.validate()
28 | 
29 |     Alternatively, you can run `setup_datainterface`, `setup_module`, and `setup_trainer` individually.
30 |     Example::
31 | 
32 |         plugin.setup_datainterface()
33 |         plugin.setup_module()
34 |         trainer = plugin.setup_trainer()
35 |     """
36 | 
37 |     def __init__(self, config: Optional[Dict] = None):
38 |         """CustomArgParser parses YAML config located at cmdline --config_path. If --config_path
39 |         is not provided, assumes YAML file is named config.yaml and present in working directory.
40 |         Instantiates dataclasses:
41 |             self.data_args (data_classes.DataArguments): Instantiated dataclass containing
42 |                 args required to initialize HfSeqClassificationDataInterface.
43 |             self.module_args (arguments.ModuleInterfaceArguments): Instantiated dataclass containing
44 |                 args required to initialize HfSeqClassificationModule class.
45 |             self.distill_args (arguments.DistillationArguments): Instantiated dataclass
46 |                 required to initialize DistillHfModule.
47 |                 Set self.distill_args.enable = True in config file to do knowledge distillation
48 |                 instead of regular training.
49 |         Sets properties:
50 |             self.datainterface: data_interface.DataInterface [HfSeqClassificationDataInterface] object
51 |             self.moduleinterface: module_interface.ModuleInterface [HfSeqClassificationModule] object.
52 |                 This is used to initialize a Marlin trainer.
53 | """ 54 | super().__init__(config=None) 55 | if config is None: 56 | config = CustomArgParser(log_level="DEBUG").parse() 57 | self.data_args = DataArguments(**config["data"]) 58 | self.module_args = ModuleInterfaceArguments( 59 | **config["module"], model_args=ModelArguments(**config["model"]) 60 | ) 61 | self.distill_args = DistillationArguments(**config["distill"]) 62 | 63 | def setup_datainterface(self): 64 | """Calls `datainterface.setup_datasets(train_data, val_data)`. 65 | 66 | Assumptions: 67 | Training and validation files are placed in separate directories. 68 | Accepted file formats: source/target text lines in data_args.data_dir/{train,val}.{source,targets} 69 | """ 70 | self.datainterface = HfSeqClassificationDataInterface(self.data_args) 71 | self.datainterface.setup_datasets() 72 | 73 | def setup_module(self): 74 | """Sets `HfSeqClassificationModule.data` property to `datainterface` which contains 75 | the processed datasets. Assertion error is thrown if `datainterface` retrieves no train 76 | or val data, indicating that `datainterface` hasn't been setup with processed data. 77 | Sets the `HfSeqClassificationModule.model` property after initializing weights: 78 | Option 1: Load weights from specified files mentioned in YAML config 79 | model: 80 | model_config_path 81 | model_config_file 82 | model_path 83 | model_file 84 | Option 2: Load from Huggingface model hub, specify string in YAML config as: 85 | model: 86 | hf_model 87 | """ 88 | # datainterface should contain the processed datasets 89 | assert ( 90 | len(self.datainterface.get_train_dataset()) != 0 91 | or len(self.datainterface.get_val_dataset()) != 0 92 | ) 93 | module_class = HfSeqClassificationModule 94 | module_params = [self.module_args, self.datainterface] 95 | if self.distill_args.enable: 96 | module_params = [self.distill_args] + module_params 97 | module_class = build_distill_module(module_class) 98 | self.moduleinterface = module_class(*module_params) 99 | 100 | def setup(self): 101 | """Executes all the setup methods required to create a trn.Trainer object. 102 | Trainer needs `moduleinterface` and backend is specified by self.trainer_args.backend. 
103 | """ 104 | self.setup_datainterface() 105 | self.setup_module() 106 | self.setup_trainer() -------------------------------------------------------------------------------- /pymarlin/plugins/hf_seq_classification/metric_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import math 4 | from scipy.stats import pearsonr, spearmanr 5 | from sklearn.metrics import ( 6 | matthews_corrcoef, 7 | f1_score, 8 | precision_score, 9 | recall_score, 10 | classification_report, 11 | accuracy_score, 12 | ) 13 | 14 | """ Metric Functions """ 15 | 16 | 17 | def get_metric_func(metric_name): 18 | METRIC_MAP = { 19 | "acc": simple_accuracy, 20 | "acc_and_f1": acc_and_f1, 21 | "pcc_and_scc": pearson_and_spearman, 22 | "mcc": mcc, 23 | } 24 | return METRIC_MAP[metric_name] 25 | 26 | 27 | def mcc(labels, preds): 28 | return {"mcc": matthews_corrcoef(labels, preds)} 29 | 30 | 31 | def simple_accuracy(labels, preds): 32 | return {"acc": accuracy_score(preds, labels)} 33 | 34 | 35 | def acc_and_f1(labels, preds, average="weighted", target_labels=None): 36 | f1 = f1_score(y_true=labels, y_pred=preds, average=average, labels=target_labels) 37 | precision = precision_score( 38 | y_true=labels, y_pred=preds, average=average, labels=target_labels 39 | ) 40 | recall = recall_score( 41 | y_true=labels, y_pred=preds, average=average, labels=target_labels 42 | ) 43 | metrics_dict = { 44 | "f1": f1, 45 | "precision": precision, 46 | "recall": recall, 47 | } 48 | metrics_dict.update(simple_accuracy(labels, preds)) 49 | return metrics_dict 50 | 51 | 52 | def pearson_and_spearman(labels, preds): 53 | pearson_corr = pearsonr(preds, labels)[0] 54 | spearman_corr = spearmanr(preds, labels)[0] 55 | return { 56 | "pearson": pearson_corr, 57 | "spearmanr": spearman_corr, 58 | "corr": (pearson_corr + spearman_corr) / 2, 59 | } 60 | -------------------------------------------------------------------------------- /pymarlin/plugins/plugin_module_interface.py: -------------------------------------------------------------------------------- 1 | '''plugin module interface''' 2 | import os 3 | from transformers import AutoTokenizer, AutoConfig 4 | from pymarlin.core import module_interface, data_interface 5 | 6 | class PluginModuleInterface(module_interface.ModuleInterface): 7 | '''Common plugin module interface to easily load Huggingface tokenizers and Configs''' 8 | def auto_setup(self, automodel_class): 9 | """Run all (tokenizer,config,model) setups""" 10 | self.setup_tokenizer() 11 | self.setup_model_config() 12 | self.setup_model(automodel_class) 13 | 14 | @property 15 | def data(self): 16 | """DataInterface object that is used to retrieve corresponding train or val dataset. 17 | 18 | Returns: 19 | data: DataInterface object with at least one of train or val data. 
20 | """ 21 | return self._data 22 | 23 | @data.setter 24 | def data(self, datainterface): 25 | assert isinstance(datainterface, data_interface.DataInterface) 26 | assert ( 27 | len(datainterface.get_train_dataset()) != 0 28 | or len(datainterface.get_val_dataset()) != 0 29 | ) 30 | self._data = datainterface 31 | 32 | @property 33 | def model(self): 34 | """Pytorch model.""" 35 | return self._model 36 | 37 | @model.setter 38 | def model(self, newmodel): 39 | self._model = newmodel 40 | 41 | def setup_tokenizer(self): 42 | """Initializes AutoTokenizer from 43 | model_args.tokenizer_path or model_args.hf_model string 44 | """ 45 | if self.args.model_args.tokenizer_path is not None: 46 | tokenizer = AutoTokenizer.from_pretrained( 47 | self.args.model_args.tokenizer_path 48 | ) 49 | else: 50 | tokenizer = AutoTokenizer.from_pretrained(self.args.model_args.hf_model) 51 | 52 | self.tokenizer = tokenizer 53 | 54 | def setup_model_config(self): 55 | """Initializes AutoConfig from 56 | model_args.model_config + model_args.model_config_file path or model_args.hf_model string 57 | """ 58 | if self.args.model_args.model_config_path is not None: 59 | model_config = AutoConfig.from_pretrained( 60 | os.path.join( 61 | self.args.model_args.model_config_path, 62 | self.args.model_args.model_config_file, 63 | ) 64 | ) 65 | else: 66 | model_config = AutoConfig.from_pretrained( 67 | self.args.model_args.hf_model 68 | ) 69 | 70 | model_config.num_labels = ( 71 | len(self.data.get_labels()) if hasattr(self.data, "get_labels") else None 72 | ) 73 | self.model_config = model_config 74 | 75 | def setup_model(self, automodel_class): 76 | """Initializes automodel_class arg by either: 77 | Option 1: Load weights from specified files mentioned in YAML config 78 | model: 79 | model_config_path 80 | model_config_file 81 | model_path 82 | model_file 83 | Option 2: Load from Huggingface model hub, specify string in YAML config as: 84 | model: 85 | hf_model 86 | 87 | Args: 88 | automodel_class: Huggingface AutoModelFor* class 89 | """ 90 | if ( 91 | self.args.model_args.model_path is not None 92 | and self.args.model_args.model_file is not None 93 | ): 94 | self.model = automodel_class.from_pretrained( 95 | os.path.join( 96 | self.args.model_args.model_path, self.args.model_args.model_file 97 | ), 98 | config=self.model_config, 99 | ) 100 | else: 101 | self.model = automodel_class.from_pretrained( 102 | self.args.model_args.hf_model, config=self.model_config 103 | ) 104 | -------------------------------------------------------------------------------- /pymarlin/utils/__init__.py: -------------------------------------------------------------------------------- 1 | '''Empty init file''' 2 | -------------------------------------------------------------------------------- /pymarlin/utils/checkpointer/__init__.py: -------------------------------------------------------------------------------- 1 | '''checkpointer utils''' 2 | from .checkpoint_utils import AbstractCheckpointer, DefaultCheckpointer, DefaultCheckpointerArguments 3 | -------------------------------------------------------------------------------- /pymarlin/utils/config_parser/__init__.py: -------------------------------------------------------------------------------- 1 | '''config parser''' 2 | -------------------------------------------------------------------------------- /pymarlin/utils/differential_privacy.py: -------------------------------------------------------------------------------- 1 | """Differential Privacy utils""" 2 | from typing import Optional 3 
4 | 
5 | @dataclass
6 | class DifferentialPrivacyArguments:
7 |     noise_multiplier: float = 1.0  # Scaling for the noise variance
8 |     per_sample_max_grad_norm: float = 1.0  # Clips the per sample gradients
9 |     sample_rate: float = 0.0  # Should be set as batch_size/number_of_samples (see doc for special cases)
10 |     delta: Optional[float] = None  # Typically set as o(1/number_of_samples), only required to calculate privacy budget (epsilon)
11 | 
12 | # Wrap any No-DP optimizer to distinguish from the DP optimizer
13 | # This is expected to be a very rare situation
14 | class NoDPWrap:
15 |     def __init__(self, optimizer):
16 |         self.optimizer = optimizer
--------------------------------------------------------------------------------
/pymarlin/utils/distributed.py:
--------------------------------------------------------------------------------
1 | """distributed utils"""
2 | import os
3 | from dataclasses import dataclass
4 | from typing import Optional
5 | from functools import wraps
6 | from azureml.core.run import Run
7 | import torch
8 | 
9 | @dataclass
10 | class DistributedTrainingArguments:
11 |     local_rank: int = 0
12 |     global_rank: int = 0
13 |     world_size: int = 1
14 |     backend: str = "nccl"
15 |     init_method: str = "env://"
16 |     gather_frequency: Optional[int] = None
17 | 
18 | @dataclass
19 | class DistributedPreprocessArguments:
20 |     local_rank: int = 0
21 |     global_rank: int = 0
22 |     world_size: int = 1
23 |     node_count: int = 1
24 |     local_size: int = 1
25 |     node_rank: Optional[int] = None
26 | 
27 | class SequentialDistributedSampler(torch.utils.data.distributed.DistributedSampler):
28 |     def __init__(self, dataset, num_replicas=None, rank=None, seed=0, drop_last=False, **kwargs):
29 |         super().__init__(dataset, shuffle=False, num_replicas=num_replicas, rank=rank, seed=seed, drop_last=drop_last, **kwargs)
30 | 
31 | def ranks_already_set(args) -> bool:
32 |     """Return True if both local and global ranks have been set."""
33 |     is_local_rank_set = args.local_rank > -1
34 |     is_global_rank_set = args.global_rank > -1
35 |     return is_local_rank_set and is_global_rank_set
36 | 
37 | def fetch_ranks_from_azureml_preprocess():
38 |     """Look up distributed arguments from Azure ML environment variables.
39 | 
40 |     Assumes OpenMPI image.
41 | 
42 |     Note:
43 |         Combine with set_environment_variables_for_nccl_backend() to set the
44 |         NCCL environment variables used by Azure ML:
45 |         - NCCL_SOCKET_IFNAME
46 |         - NCCL_IB_DISABLE
47 |     """
48 |     ranks = DistributedPreprocessArguments()
49 | 
50 |     run = Run.get_context()
51 |     run.get_status()
52 |     ranks.node_count = run.get_details()['runDefinition']['nodeCount']
53 |     ranks.local_size = run.get_details()['runDefinition']['mpi']['processCountPerNode']
54 | 
55 |     ranks.local_rank = int(os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK"))
56 |     ranks.global_rank = int(os.environ.get("OMPI_COMM_WORLD_RANK"))
57 |     ranks.world_size = int(os.environ.get("OMPI_COMM_WORLD_SIZE"))
58 | 
59 |     return ranks
60 | 
61 | def fetch_ranks_from_azureml():
62 |     """Look up distributed arguments from Azure ML environment variables.
63 | 
64 |     Assumes OpenMPI image.
65 | 
66 |     Note:
67 |         Combine with set_environment_variables_for_nccl_backend() to set the
68 |         NCCL environment variables used by Azure ML:
69 |         - NCCL_SOCKET_IFNAME
70 |         - NCCL_IB_DISABLE
71 |     """
72 |     ranks = DistributedTrainingArguments()
73 |     ranks.local_rank = int(os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK"))
74 |     ranks.global_rank = int(os.environ.get("OMPI_COMM_WORLD_RANK"))
75 |     ranks.world_size = int(os.environ.get("OMPI_COMM_WORLD_SIZE"))
76 |     return ranks
77 | 
78 | 
79 | def fetch_ranks_from_torch_distributed_launch():
80 |     """Read distributed arguments set by torch.distributed.launch via environment variables."""
81 |     ranks = DistributedTrainingArguments()
82 |     ranks.local_rank = int(os.environ["LOCAL_RANK"])
83 |     ranks.global_rank = int(os.environ["RANK"])
84 |     ranks.world_size = int(os.environ["WORLD_SIZE"])
85 |     return ranks
86 | 
87 | 
88 | def set_environment_variables_for_nccl_backend():
89 |     """Sets distributed training environments for azureml openmpi runs with NCCL backend."""
90 | 
91 |     # NCCL environment. Still works without it.
92 |     os.environ["NCCL_SOCKET_IFNAME"] = "^docker0,lo"
93 |     os.environ["NCCL_IB_DISABLE"] = "0"  # for IB
94 | 
95 |     master_node = os.environ["AZ_BATCHAI_MPI_MASTER_NODE"]
96 |     master_port = "54965"
97 | 
98 |     # set env variables
99 |     os.environ["MASTER_ADDR"] = master_node
100 |     os.environ["MASTER_PORT"] = master_port
101 | 
102 | 
103 | def rank_zero_only(fn):
104 |     """Decorates functions to only execute on global rank 0, else wait via torch.distributed"""
105 | 
106 |     @wraps(fn)
107 |     def wrapped_fn(*args, **kwargs):
108 |         if rank_zero_only.rank == 0:
109 |             res = fn(*args, **kwargs)
110 |             if torch.distributed.is_initialized():
111 |                 torch.distributed.barrier()
112 |             return res
113 |         else:
114 |             if torch.distributed.is_initialized():
115 |                 torch.distributed.barrier()
116 | 
117 |     return wrapped_fn
118 | rank_zero_only.rank = 0  # by default
--------------------------------------------------------------------------------
/pymarlin/utils/fabrics.py:
--------------------------------------------------------------------------------
1 | """Compute fabric specific utility methods."""
2 | import os
3 | import importlib.util
4 | 
5 | 
6 | def is_azureml_mpirun() -> bool:
7 |     """Check if the run was set up by azureml using an OpenMPI image.
8 | 
9 |     When running MPIRUN with OpenMPI images, AzureML sets a specific combination
10 |     of environment variables which we check for here, specifically::
11 | 
12 |         OMPI_COMM_WORLD_RANK  # the rank of the process
13 |         OMPI_COMM_WORLD_SIZE  # the world size
14 |         OMPI_COMM_WORLD_LOCAL_RANK  # the local rank of the process on the node
15 |         OMPI_COMM_WORLD_LOCAL_SIZE  # number of processes on the node
16 | 
17 |     and one of the following::
18 | 
19 |         AZ_BATCH_MASTER_NODE  # multiple nodes
20 |         AZ_BATCHAI_MPI_MASTER_NODE  # single node
21 |     """
22 |     is_openmpi_image: bool = (
23 |         "OMPI_COMM_WORLD_RANK" in os.environ
24 |         and "OMPI_COMM_WORLD_SIZE" in os.environ
25 |         and "OMPI_COMM_WORLD_LOCAL_RANK" in os.environ
26 |         and "OMPI_COMM_WORLD_LOCAL_SIZE" in os.environ
27 |     )
28 | 
29 |     is_azureml_mpirun_env: bool = (
30 |         "AZ_BATCH_MASTER_NODE" in os.environ
31 |         or "AZ_BATCHAI_MPI_MASTER_NODE" in os.environ
32 |     )
33 | 
34 |     return bool(is_openmpi_image and is_azureml_mpirun_env)
35 | 
36 | 
37 | def is_torch_distributed_launch_via_environment_variables() -> bool:
38 |     """Check if torch.distributed.launch was used to submit the job, based on environment variables."""
39 | 
40 |     env_vars = os.environ
41 |     is_using_environment_vars: bool = (
42 |         "RANK" in env_vars
43 |         and "MASTER_ADDR" in env_vars
44 |         and "MASTER_PORT" in env_vars
45 |         and "WORLD_SIZE" in env_vars
46 |     )
47 | 
48 |     return is_using_environment_vars
49 | 
50 | 
51 | def is_azureml_run_with_sdk() -> bool:
52 |     """Check if we are running on Azure ML with azureml-sdk."""
53 |     if not _is_azureml_available():
54 |         print("Unable to import azureml sdk.")
55 |         return False
56 | 
57 |     import azureml.core.run
58 | 
59 |     run = azureml.core.run.Run.get_context()
60 |     is_azureml_run = False
61 | 
62 |     try:
63 |         run.get_status()
64 |         is_azureml_run = True
65 |     except AttributeError:
66 |         print("This is not an Azure ML run")
67 | 
68 |     return is_azureml_run
69 | 
70 | 
71 | def _is_azureml_available() -> bool:
72 |     """Check sys.modules to see if azureml.core.run is available.
73 |     See https://github.com/huggingface/transformers/blob/02e05fb0a532e572b56ba75dad6ba3db625bbdeb/src/transformers/integrations.py#L81
74 |     """
75 |     if importlib.util.find_spec("azureml") is None:
76 |         return False
77 |     if importlib.util.find_spec("azureml.core") is None:
78 |         return False
79 |     return importlib.util.find_spec("azureml.core.run") is not None
--------------------------------------------------------------------------------
/pymarlin/utils/logger/__init__.py:
--------------------------------------------------------------------------------
1 | ''' logging utils '''
2 | from .logging_utils import getlogger
--------------------------------------------------------------------------------
/pymarlin/utils/logger/logging_utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Logging util module
3 | """
4 | import logging
5 | 
6 | # create console handler for pymarlin format
7 | console_handler = logging.StreamHandler()
8 | formatter = logging.Formatter('%(asctime)s:%(levelname)s : %(name)s : %(lineno)d : %(message)s')
9 | console_handler.setFormatter(formatter)
10 | 
11 | def getlogger(name, log_level='INFO'):
12 |     """
13 |     This method returns a logger object to be used by the calling class.
14 |     The logger object returned has the following format for all the logs:
15 |     '%(asctime)s:%(levelname)s : %(name)s : %(lineno)d : %(message)s'
16 | 
17 |     Args:
18 |         name (str): Name of the logger, typically the calling module's __name__.
19 |         log_level (str, optional): Logging level for the logger, e.g. 'INFO' or 'DEBUG'.
20 |             Defaults to 'INFO'.
21 | 
22 |     Returns:
23 |         logger (object): logger object to use for logging.
24 |     """
25 |     logger = logging.getLogger(name)
26 |     logger.handlers = [console_handler]
27 |     logger.setLevel(log_level)
28 |     return logger
29 | 
30 | if __name__ == '__main__':
31 |     # pylint: disable=pointless-string-statement
32 |     """
33 |     Running this command: "python logging_utils.py" will print the following to console (timestamps elided):
34 |         logging level for logger1 is INFO
35 |         logging level for logger2 is DEBUG
36 |         :ERROR: logger1 : 34: hello printing error message here for l1
37 |         :ERROR: logger2 : 35: hello printing error message here for l2
38 |         :DEBUG: logger2 : 36: hello printing debug message here for l2
39 |         :INFO: logger2 : 37: hello printing info message here for l2
40 |     """
41 |     l1 = getlogger('logger1')
42 |     l2 = getlogger('logger2', log_level='DEBUG')
43 |     l1.error('hello printing error message here for l1')
44 |     l2.error('hello printing error message here for l2')
45 |     l2.debug('hello printing debug message here for l2')
46 |     l2.info('hello printing info message here for l2')
47 | 
--------------------------------------------------------------------------------
/pymarlin/utils/misc/__init__.py:
--------------------------------------------------------------------------------
1 | '''Empty init file'''
2 | 
--------------------------------------------------------------------------------
/pymarlin/utils/misc/misc_utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Miscellaneous utility functions
3 | """
4 | 
5 | from functools import wraps
6 | import os
7 | import re
8 | import shutil
9 | from pymarlin.utils.logger.logging_utils import getlogger
10 | 
11 | #https://docs.python.org/2/library/functools.html#functools.wraps
12 | 
13 | def snake2camel(name):
14 |     """
15 |     This method changes input name from snake format to camel format.
16 | 
17 |     Args:
18 |         name (str): snake format input name.
19 | 
20 |     Returns:
21 |         name (str): camel format input name.
22 | 
23 |     """
24 |     return re.sub(r'(?:^|_)([a-z])', lambda x: x.group(1).upper(), name)
25 | 
26 | def clear_dir(path, skips=None):
27 |     """
28 |     This method deletes the contents of the directory for which path
29 |     has been provided and which are not included in the skips list.
30 | 
31 |     Args:
32 |         path (str): Path for directory to be deleted.
33 |         skips (List[str], optional): List of paths for sub directories to be skipped from deleting.
34 | 
35 |     """
36 |     if os.path.isdir(path):
37 |         with os.scandir(path) as path_iter:
38 |             for entry in path_iter:
39 |                 if skips and entry.path in skips:
40 |                     continue
41 |                 try:
42 |                     if entry.is_file() or entry.is_symlink():
43 |                         os.remove(entry.path)
44 |                     else:
45 |                         shutil.rmtree(entry.path)
46 |                 except PermissionError:
47 |                     getlogger(__name__).warning(f"could not delete path: {entry.path}")
48 | 
49 | def debug(method):
50 |     """
51 |     This decorator wraps the input method with DEBUG logs marking when the
52 |     method starts and when it finishes.
53 | 
54 |     Args:
55 |         method (function): Method to be wrapped with entry/exit logs.
56 | 
57 |     Returns:
58 |         debugged (method): debugged function.
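
        Example (a sketch)::

            @debug
            def load_model():
                ...

        The wrapped call logs "Inside method: load_model" before executing and
        "Finished method: load_model" once it returns, both at DEBUG level.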
59 | 60 | """ 61 | @wraps(method) 62 | def debugged(*args, **kw): 63 | logger = getlogger(__name__) 64 | logger.debug('Inside method: %s', method.__name__) 65 | result = method(*args, **kw) 66 | logger.debug('Finished method: %s', method.__name__) 67 | return result 68 | return debugged 69 | -------------------------------------------------------------------------------- /pymarlin/utils/stats/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Stats package. 3 | ''' 4 | from .basic_stats import BasicStats, StatInitArguments 5 | #singleton object 6 | global_stats = BasicStats(StatInitArguments(), writers=[]) 7 | -------------------------------------------------------------------------------- /pymarlin/utils/writer/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Writers package. 3 | """ 4 | from pymarlin.utils.logger.logging_utils import getlogger 5 | from .base import WriterInitArguments 6 | from .aml import Aml 7 | from .stdout import Stdout 8 | from .tensorboard import Tensorboard 9 | logger = getlogger(__name__) 10 | 11 | def build_writer(writer, args: WriterInitArguments): 12 | """ 13 | Initializes and returns writer object based on writer type. 14 | """ 15 | logger.debug(f'Building Writer {writer}') 16 | if writer == 'stdout': 17 | return Stdout(args) 18 | if writer == 'aml': 19 | return Aml() 20 | if writer == 'tensorboard': 21 | return Tensorboard(args) 22 | logger.error(f'Error initializing writer {writer}') 23 | raise Exception(f"Invalid writer type:{writer} requested.") 24 | -------------------------------------------------------------------------------- /pymarlin/utils/writer/aml.py: -------------------------------------------------------------------------------- 1 | """ 2 | AML writer module. 3 | """ 4 | from pymarlin.utils.logger.logging_utils import getlogger 5 | from .base import Writer 6 | 7 | class Aml(Writer): 8 | """ 9 | This class implements the Azure ML writer for stats. 10 | """ 11 | def __init__(self): 12 | super().__init__(getlogger(__name__)) 13 | self.run = None 14 | try: 15 | from azureml.core.run import Run 16 | self.run = Run.get_context() 17 | self.logger.info(self.run.get_status()) 18 | except Exception: # pylint: disable=broad-except 19 | self.run = None 20 | self.logger.warning('AML writer failed to initialize.') 21 | self.logger.info(f'run = {self.run}') 22 | 23 | def log_scalar(self, k, v, step): 24 | """ 25 | Log metric to AML. 26 | """ 27 | kwargs = { 28 | 'global_step': step, 29 | k: v 30 | } 31 | if self.run is not None: 32 | self.run.log_row(k, **kwargs) 33 | 34 | def log_multi(self, k, v, step): 35 | """ 36 | Log metrics to stdout. 37 | """ 38 | for key, val in v.items(): 39 | key = k+'/'+key 40 | self.log_scalar(key, val, step) 41 | -------------------------------------------------------------------------------- /pymarlin/utils/writer/base.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base class for Writers 3 | """ 4 | from abc import ABC 5 | import dataclasses 6 | 7 | @dataclasses.dataclass 8 | class WriterInitArguments: 9 | """ 10 | Writer Arguments. 11 | """ 12 | tb_log_dir: str = 'logs' 13 | tb_logpath_parent_env: str = None 14 | tb_log_multi: bool = False 15 | tb_log_hist_steps: int = 20000 16 | model_log_level: str = 'INFO' 17 | 18 | class Writer(ABC): 19 | """ 20 | Abstract Base class for Writers. 
21 | """ 22 | def __init__(self, logger): 23 | self.logger = logger 24 | 25 | def log_scalar(self, k, v, step): 26 | pass 27 | 28 | def log_multi(self, k, v, step): 29 | pass 30 | 31 | def log_model(self, flat_weights, flat_grads, step): 32 | pass 33 | 34 | def log_args(self, args): 35 | pass 36 | 37 | def log_graph(self, model, device=None): 38 | pass 39 | 40 | def log_image(self, k, v, step, dataformats='HW'): 41 | pass 42 | 43 | def log_pr_curve(self, k, preds, labels, step): 44 | pass 45 | 46 | def log_histogram(self, param_name, vals, step): 47 | pass 48 | 49 | def log_embedding(self, tag, mat, labels, step): 50 | pass 51 | 52 | def _log_norms(self, step, param_name, weight_norm, grad_norm): 53 | pass 54 | 55 | def log_multi_line(self, string): 56 | lines = string.split('\n') 57 | for line in lines: 58 | self.logger.info(line) 59 | 60 | def flush(self): 61 | pass 62 | 63 | def finish(self): 64 | pass 65 | -------------------------------------------------------------------------------- /pymarlin/utils/writer/stdout.py: -------------------------------------------------------------------------------- 1 | """ 2 | Stdout writer module. 3 | """ 4 | from pymarlin.utils.logger.logging_utils import getlogger 5 | from .base import Writer, WriterInitArguments 6 | 7 | class Stdout(Writer): 8 | """ 9 | This class implements the stdout writer for stats. 10 | """ 11 | def __init__(self, args: WriterInitArguments): 12 | super().__init__(getlogger(__name__)) 13 | self.args = args 14 | 15 | def log_scalar(self, k, v, step): 16 | """ 17 | Log metric to stdout. 18 | """ 19 | self.logger.info(f'step = {step}, {k} : {v}') 20 | 21 | def log_multi(self, k, v, step): 22 | """ 23 | Log metric to stdout. 24 | """ 25 | self.logger.info(f'step = {step}, {k} : {v}') 26 | 27 | def log_model(self, flat_weights, flat_grads, step): 28 | """ 29 | Log model to stdout. 30 | Can slow down training. Only use for debugging. 31 | It's logged in Tensorboard by default. 32 | """ 33 | if self.args.model_log_level == 'DEBUG': 34 | for name in flat_weights: 35 | weight_norm = flat_weights[name].norm().item() 36 | grad_norm = None 37 | if name in flat_grads: 38 | grad_norm = flat_grads[name].norm().item() 39 | self._log_norms(step, name, weight_norm, grad_norm) 40 | 41 | def log_graph(self, model, device=None): 42 | """ 43 | Log model graph to stdout. 
44 | """ 45 | self.logger.debug('Logging model graph') 46 | self.log_multi_line(str(model)) 47 | 48 | def _log_norms(self, step, param_name, weight_norm, grad_norm): 49 | self.logger.debug(f'step = {step} , {param_name} : weight_norm = {weight_norm}, grad_norm = {grad_norm}') 50 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=42", 4 | "wheel" 5 | ] 6 | build-backend = "setuptools.build_meta" -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | filterwarnings = 3 | ignore::DeprecationWarning:tensorboard.* 4 | ignore::DeprecationWarning:pywintypes.* -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open("README.md", "r") as fh: 4 | long_description = fh.read() 5 | 6 | required = ['torch<=1.9.1','tqdm','tensorboard', 'Pillow','azureml-core','pyyaml','pandas'] 7 | extras = { 8 | 'dev': ['pylint', 'pytest', 'pytest-cov'], 9 | 'plugins': ['transformers','pandas','matplotlib','sklearn','scipy','rouge-score'] 10 | } 11 | 12 | setup( 13 | name="pymarlin", 14 | version="0.3.6", 15 | author="ELR Team", 16 | author_email="elrcore@microsoft.com", 17 | description="Lightweight Deeplearning Library", 18 | long_description=long_description, 19 | long_description_content_type = "text/markdown", 20 | url="https://microsoft.github.io/PyMarlin/", 21 | packages=find_packages(), 22 | classifiers=[ 23 | "Programming Language :: Python :: 3", 24 | "License :: OSI Approved :: MIT License", 25 | "Operating System :: OS Independent", 26 | ], 27 | install_requires=required, 28 | extras_require=extras, 29 | python_requires=">=3.6", 30 | ) 31 | # https://packaging.python.org/discussions/install-requires-vs-requirements/ -------------------------------------------------------------------------------- /tests/core/test_data_interface.py: -------------------------------------------------------------------------------- 1 | """Test module for data_interface""" 2 | 3 | import os 4 | from dataclasses import dataclass 5 | import unittest 6 | import pandas as pd 7 | from torch.utils.data import Dataset 8 | from pymarlin.core.data_interface import DataInterface, DataProcessor 9 | 10 | @dataclass 11 | class MyArgs: 12 | filepath_train: str = os.path.join("outputs", "file1.csv") 13 | filepath_test: str = os.path.join("outputs", "file2.csv") 14 | text_field: str = "text" 15 | label_field: str = "label" 16 | 17 | 18 | class MyDataset(Dataset): 19 | def __init__(self, df, text_field, label_field): 20 | self.df = df 21 | self.text_field = text_field 22 | self.label_field = label_field 23 | 24 | def __len__(self): 25 | return len(self.df) 26 | 27 | def __getitem__(self, idx): 28 | row = self.df.iloc[idx] 29 | return (row[self.text_field], row[self.label_field]) 30 | 31 | 32 | class MyData(DataInterface): 33 | 34 | def __init__(self): 35 | self._train_ds = None 36 | self._val_ds = None 37 | 38 | def setup_datasets(self, train_ds, val_ds): 39 | self._train_ds = train_ds 40 | self._val_ds = val_ds 41 | 42 | def get_train_dataset(self): 43 | return self._train_ds 44 | 45 | def get_val_dataset(self): 46 | return self._val_ds 47 | 48 | 49 | 
49 | class MyDataProcessor(DataProcessor):
50 | 
51 |     def __init__(self, args):
52 |         super().__init__()
53 |         self.args = args
54 | 
55 |     def process(self):
56 |         df = pd.read_csv(self.args.filepath_train)
57 |         return MyDataset(df, self.args.text_field,
58 |                          self.args.label_field)
59 | 
60 | 
61 | class MyDataMultiProcessor(DataProcessor):
62 | 
63 |     def __init__(self, args):
64 |         super().__init__()
65 |         self.args = args
66 | 
67 |     def process(self, filename):
68 |         df = pd.read_csv(filename)
69 |         return MyDataset(df, self.args.text_field,
70 |                          self.args.label_field)
71 | 
72 | 
73 | class TestDataInterface(unittest.TestCase):
74 | 
75 |     def setUp(self):
76 | 
77 |         self.args = MyArgs()
78 |         self.data_interface = MyData()
79 |         self.data_processor = MyDataProcessor(self.args)
80 |         self.data_multiprocessor = MyDataMultiProcessor(self.args)
81 | 
82 |         if not os.path.exists("outputs"):
83 |             os.makedirs("outputs")
84 | 
85 |     def tearDown(self):
86 |         for f in os.listdir("outputs"):
87 |             if f.endswith(".csv"):
88 |                 os.remove(os.path.join("outputs", f))
89 | 
90 |     def test_process_data(self):
91 |         df = pd.DataFrame({self.args.text_field: ['one', 'two'],
92 |                            self.args.label_field: [1, 2]})
93 |         df.to_csv(self.args.filepath_train)
94 | 
95 |         train_ds = self.data_processor.process_data()
96 |         assert train_ds[0] == ('one', 1)
97 |         assert len(train_ds) == 2
98 | 
99 |     def test_multi_process_data(self):
100 |         df1 = pd.DataFrame({self.args.text_field: ['one', 'two'],
101 |                             self.args.label_field: [1, 2]})
102 |         df2 = pd.DataFrame({self.args.text_field: ['three', 'four', 'five'],
103 |                             self.args.label_field: [3, 4, 5]})
104 |         df1.to_csv(self.args.filepath_train)
105 |         df2.to_csv(self.args.filepath_test)
106 | 
107 |         train_ds_list = self.data_multiprocessor.multi_process_data(
108 |             [self.args.filepath_train, self.args.filepath_test],
109 |             process_count=2)
110 |         assert train_ds_list[0] == ('one', 1)
111 |         assert train_ds_list[2] == ('three', 3)
112 |         assert len(train_ds_list) == 5
113 | 
114 |     def test_collect_params(self):
115 |         a_number = 1
116 |         single_argument = 'single_argument'
117 |         list_to_split = ['first', 'second', 'third']
118 |         a_dict = {'test': 'dict'}
119 |         list_not_to_split = [['this', 'lists', 'elements', 'dont', 'split']]
120 |         self.data_processor._set_ranks()
121 |         list_params = self.data_processor._collect_params(
122 |             a_number,
123 |             single_argument,
124 |             list_to_split,
125 |             a_dict,
126 |             list_not_to_split
127 |         )
128 |         assert len(list_params) == 5
129 |         assert list_params[2][0] == list_to_split[0]
130 |         assert list_params[2][1] == list_to_split[1]
131 |         assert list_params[2][2] == list_to_split[2]
132 | 
133 |         for param in list_params[1]:
134 |             assert param == single_argument
135 | 
136 |         for param in list_params:
137 |             assert len(param) == 3
138 | 
--------------------------------------------------------------------------------
/tests/core/test_trainer_backend.py:
--------------------------------------------------------------------------------
1 | """Test module for trainer_backend"""
2 | 
3 | import unittest
4 | from unittest import mock
5 | import torch
6 | import pytest
7 | from pymarlin.core import module_interface, trainer_backend
8 | from pymarlin.utils.distributed import DistributedTrainingArguments
9 | #https://docs.python.org/3/library/unittest.mock.html
10 | 
11 | class TestSingleProcess(unittest.TestCase):
12 |     def setUp(self):
13 |         self.trainer_backend = trainer_backend.SingleProcess()
14 |         self.mock_module = mock.MagicMock(spec = module_interface.ModuleInterface)
15 |         # make x^2 as loss
16 |         # self.x = torch.Tensor([1])
17 |         # self.x.requires_grad = True
18 |         # self.loss = self.x*self.x
19 |         self.loss = torch.randn(1, requires_grad=True)
20 |         self.mock_module.forward = mock.MagicMock(return_value = [self.loss])
21 | 
22 |         self.mock_scheduler = mock.MagicMock()
23 | 
24 |         self.mock_optimizer = mock.MagicMock(spec = torch.optim.Optimizer)
25 |         self.trainer_backendArgs = trainer_backend.TrainerBackendArguments(
26 |             model=self.mock_module,
27 |             device='cpu',
28 |             train_batch_size=1,
29 |             max_train_steps_per_epoch=1,
30 |             max_val_steps_per_epoch=1,
31 |             distributed_training_args=DistributedTrainingArguments(),
32 |             optimizers=[self.mock_optimizer],
33 |             schedulers=[self.mock_scheduler],
34 |             gradient_accumulation=1,
35 |             clip_grads=False,
36 |         )
37 | 
38 |         self.trainer_backend.init(self.trainer_backendArgs)
39 | 
40 |         self.mock_callback = mock.MagicMock()
41 |         self.mock_dataloader = [mock.MagicMock()]*10
42 | 
43 |     def test_train_dl(self):
44 | 
45 |         # make x^2 as loss
46 |         x = torch.Tensor([1])
47 |         x.requires_grad = True
48 |         loss = x*x
49 |         self.mock_module.forward = mock.MagicMock(return_value = [loss])
50 | 
51 | 
52 |         self.trainer_backend.train_dl(self.mock_dataloader, self.mock_callback)
53 | 
54 | 
55 |         # test forward
56 |         self.mock_module.forward.assert_called_once_with(
57 |             stage = module_interface.Stage.TRAIN,
58 |             batch = self.mock_dataloader[0],
59 |             device = 'cpu',
60 |             global_step = 1)
61 |         print(self.mock_module.forward.return_value)
62 |         # test backward
63 |         assert x.grad == 2 *x
64 |         # test optimization
65 |         self.mock_optimizer.step.assert_called_once()
66 |         self.mock_optimizer.zero_grad.assert_called_once()
67 |         #test callback
68 |         self.mock_callback.on_end_train_step.assert_called_once()
69 |         self.mock_callback.on_end_train_step.assert_called_with( 1, loss.detach())
70 | 
71 |     def test_eval_dl(self):
72 | 
73 |         self.trainer_backend.validate_dl(self.mock_dataloader)
74 | 
75 |         # test forward
76 |         self.mock_module.forward.assert_called_once_with(
77 |             stage = module_interface.Stage.VAL,
78 |             batch = self.mock_dataloader[0],
79 |             device = 'cpu',
80 |             global_step = 0 )
81 | 
82 |     def test_gradient_accumulation(self):
83 |         self.trainer_backend.args.gradient_accumulation = 2
84 |         self.trainer_backend.train_dl(self.mock_dataloader, self.mock_callback)
85 |         assert self.mock_module.forward.call_count == 2
86 |         assert self.mock_optimizer.step.call_count == 1
87 |         assert self.mock_optimizer.zero_grad.call_count == 1
88 | 
89 |     def test_gradient_clipping(self):
90 |         # make x^2 as loss
91 | 
92 | 
93 |         self.trainer_backend.args.clip_grads = True
94 |         self.trainer_backend.args.max_grad_norm = 1
95 | 
96 |         for val in range(-10, 10):
97 |             x = torch.Tensor([val])
98 |             x.requires_grad = True
99 |             loss = x*x
100 |             self.mock_module.parameters = mock.MagicMock(return_value = [x])
101 |             self.mock_module.forward = mock.MagicMock(return_value = [loss])
102 |             self.trainer_backend.train_dl(self.mock_dataloader, self.mock_callback)
103 | 
104 |             assert min(0, 2*val -1) < x.grad.item() <= 1
105 | 
106 |     def test_output_collection(self):
107 | 
108 |         self.trainer_backendArgs.max_train_steps_per_epoch = 2
109 |         self.trainer_backend.args.gradient_accumulation = 2
110 | 
111 |         losses = [torch.randn(1, requires_grad=True).squeeze(), torch.randn(1, requires_grad=True).squeeze()] * 2
112 |         labels = [torch.randint(0,10, size = (4,3)), torch.randint(0,10, size = (3,3))] * 2
113 |         # guids = range(4)
114 |         self.mock_module.forward = mock.MagicMock()
115 |         self.mock_module.forward.side_effect = zip(losses, labels)#, guids)
116 | 
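        # Each forward() call consumes the next (loss, labels) pair from the
        # side_effect iterator; with max_train_steps_per_epoch=2 and
        # gradient_accumulation=2, train_dl should make 2 * 2 = 4 forward calls.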
117 |         outputs = self.trainer_backend.train_dl(self.mock_dataloader, self.mock_callback)
118 |         assert self.mock_module.forward.call_count == 4
119 | 
120 |         assert outputs[0].shape == torch.Size([4])
121 |         assert outputs[1].shape == torch.Size([4+3+4+3, 3]) # concatenated across axis 0
122 |         # assert outputs[2] == [0,1,2,3]
123 |         #print(outputs, outputs[0].shape)
124 | 
125 | 
126 |     def test_get_state(self):
127 |         state = self.trainer_backend.get_state()
128 |         assert state['global_step_completed'] == 0
129 |         assert state['batches_completed'] == 0
130 | 
131 |     def test_update_state(self):
132 |         state_dict = {
133 |             'global_step_completed': 1,
134 |             'batches_completed': 2
135 |         }
136 |         self.trainer_backend.update_state(state_dict)
137 |         assert self.trainer_backend.get_global_steps_completed() == 1
138 |         assert self.trainer_backend.get_batches_completed() == 2
139 | 
140 | 
141 | @pytest.mark.filterwarnings("ignore::UserWarning: torch.cuda.amp.")
142 | class TestSingleProcessAmp(TestSingleProcess):
143 |     def setUp(self):
144 |         super().setUp()
145 |         self.trainer_backend = trainer_backend.SingleProcessAmp()
146 |         self.trainer_backend.init(self.trainer_backendArgs)
--------------------------------------------------------------------------------
/tests/test_sanity.py:
--------------------------------------------------------------------------------
1 | '''Class to test import of marlin library'''
2 | def test_import():
3 |     import pymarlin
4 |     assert True
--------------------------------------------------------------------------------
/tests/utils/config.yaml:
--------------------------------------------------------------------------------
1 | # Test YAML file for unit testing.
2 | 
3 | test_data_path: 'c:\tmp'
4 | test_empty_data_path: null
5 | 
6 | # test arguments
7 | test:
8 |     test_float: -1.0
9 |     test_int: -1
10 |     test_list_float: [-1.0, -1.0, -1.0]
11 |     test_list_int: [-1, -1, -1]
12 |     test_list_str: [ 'this', 'is', 'a', 'test', 'list']
13 |     test_str: null
14 |     test_true: False
15 |     test_false: True
16 |     test_two_levels:
17 |         test_int: -1
18 |         test_three_levels:
19 |             test_list_str: [ 'this', 'is', 'a', 'test', 'list']
20 | 
--------------------------------------------------------------------------------
/tests/utils/corrupt_files/config.yaml:
--------------------------------------------------------------------------------
1 | # Test for corrupt YAML file for unit testing.
2 | date: 2021-02-03 3 | dummy -------------------------------------------------------------------------------- /tests/utils/test_stats.py: -------------------------------------------------------------------------------- 1 | """Module to test stats module class""" 2 | import os 3 | import torch 4 | import numpy as np 5 | import pytest 6 | import shutil 7 | import unittest 8 | from unittest import mock 9 | from pymarlin.utils import stats 10 | from pymarlin.utils.writer import build_writer, WriterInitArguments 11 | import collections 12 | import functools 13 | 14 | class TestStats(unittest.TestCase): 15 | def setUp(self): 16 | self.stats = stats.global_stats 17 | self.stat_args = stats.StatInitArguments( 18 | log_steps = 50, 19 | update_system_stats = False, 20 | log_model_steps = 1000, 21 | exclude_list = None 22 | ) 23 | self.writer_args = WriterInitArguments( 24 | tb_log_dir='logs' 25 | ) 26 | self.writers = [ 27 | build_writer(writer, self.writer_args) 28 | if isinstance(writer, str) 29 | else writer 30 | for writer in ['stdout','tensorboard'] 31 | ] 32 | self.stats.rebuild(args=self.stat_args, writers=self.writers) 33 | 34 | def tearDown(self): 35 | self.stats.rebuild(args=None, writers=[]) 36 | 37 | @pytest.fixture(scope='module') 38 | def project_file(self, tmpdir_factory): 39 | print('deleting temp folder') 40 | my_tmpdir = tmpdir_factory.mktemp(self.writer_args.tb_log_dir) 41 | yield my_tmpdir 42 | shutil.rmtree(str(my_tmpdir)) 43 | 44 | def test_short(self): 45 | scalars = {'F1': 0.5, 'acc':0.8} 46 | for k, v in scalars.items(): 47 | self.stats.update(k,v, frequent = True) 48 | assert self.stats.scalars_short[k] == v 49 | self.stats.update(k,v+0.1, frequent = True) 50 | assert self.stats.scalars_short[k] == v+0.1 51 | self.stat_args.log_steps = 2 52 | self.stats.rebuild(args=self.stat_args, writers=self.writers) 53 | print('log stats for step 1. Nothing should be logged here.') 54 | self.stats.log_stats(step = 1) 55 | assert len(self.stats.scalars_short) > 0 56 | print('log stats for step 2. should be logged now.') 57 | self.stats.log_stats(step = 2) 58 | assert len(self.stats.scalars_short) == 0 59 | 60 | def test_long(self): 61 | scalars = {'epochs': 1} 62 | for k,v in scalars.items(): 63 | self.stats.update(k,v, frequent = False) 64 | assert self.stats.scalars_long[k] == v 65 | multi = {'losses': {'train':0.5, 'val_email':0.8, 'val_wiki':0.3}} 66 | for k,v in multi.items(): 67 | self.stats.update_multi(k,v, frequent = False) 68 | assert self.stats.multi_long[k] == v 69 | print('log long stats . should be logged') 70 | self.stats.log_long_stats(step = 1000) 71 | 72 | def test_log_model(self): 73 | # 2 layer NN with layer norm and sigmoid 74 | model = MyModel() 75 | self.stats.log_graph(model, device='cpu') 76 | optim = torch.optim.SGD(params = model.parameters(), lr = 1) 77 | 78 | self.stat_args.log_steps = 1 79 | self.writer_args.tb_hist_interval = 2 80 | self.writers = [ 81 | build_writer(writer, self.writer_args) 82 | if isinstance(writer, str) 83 | else writer 84 | for writer in ['stdout','tensorboard'] 85 | ] 86 | self.stats.rebuild(args=self.stat_args, writers=self.writers) 87 | 88 | for step in range(1, 5): 89 | op = model.forward(torch.rand(2,3)) 90 | loss = torch.nn.MSELoss()(op, torch.rand(2,1)) 91 | loss.backward() 92 | 93 | self.stats.log_model(step, model) 94 | optim.step() 95 | optim.zero_grad() 96 | #expectation. 
97 |         # histogram should be logged only twice in tensorboard
98 | 
99 |     def test_log_image(self):
100 |         random_image = np.random.randint(100, size = (1,100)).reshape(10,10)
101 |         random_image = random_image / 100
102 |         self.stats.update_image('random_image',
103 |                                 random_image,
104 |                                 dataformats = 'HW')
105 |         self.stats.log_long_stats(step = 1000)
106 | 
107 |     def test_log_pr(self):
108 |         preds = np.random.rand(100)
109 |         labels = np.random.randint(2, size=100)
110 |         self.stats.update_pr('binary_pr',
111 |                              preds, labels)
112 |         self.stats.log_long_stats(step = 1000)
113 | 
114 | class MyModel(torch.nn.Module):
115 |     def __init__(self):
116 |         super().__init__()
117 |         self.hidden = torch.nn.Linear(3,5)
118 |         self.hidden_activation = torch.nn.Tanh()
119 |         self.hidden_layernorm = torch.nn.LayerNorm(5)
120 |         self.output = torch.nn.Linear(5,1)
121 | 
122 |     def forward(self, input):
123 |         hidden_op = self.hidden_activation(
124 |             self.hidden_layernorm(
125 |                 self.hidden(input)))
126 |         op = self.output(hidden_op)
127 |         return op
128 | 
129 |     def get_sample_input(self):
130 |         return torch.ones(1,3, dtype = torch.float32)
131 | 
--------------------------------------------------------------------------------
/website/.gitignore:
--------------------------------------------------------------------------------
1 | # Dependencies
2 | /node_modules
3 | 
4 | # Production
5 | /build
6 | 
7 | # Generated files
8 | .docusaurus
9 | .cache-loader
10 | 
11 | # Misc
12 | .DS_Store
13 | .env.local
14 | .env.development.local
15 | .env.test.local
16 | .env.production.local
17 | 
18 | npm-debug.log*
19 | yarn-debug.log*
20 | yarn-error.log*
--------------------------------------------------------------------------------
/website/README.md:
--------------------------------------------------------------------------------
1 | # Website
2 | 
3 | This website is built using [Docusaurus 2](https://docusaurus.io/), a modern static website generator.
4 | 
5 | ## Prerequisites
6 | 
7 | To build and test documentation locally, begin by downloading and installing [Node.js](https://nodejs.org/en/download/), and then installing [Yarn](https://classic.yarnpkg.com/en/).
8 | On Windows, you can install Yarn via the npm package manager, which comes bundled with Node.js:
9 | 
10 | ```console
11 | npm install --global yarn
12 | ```
13 | 
14 | ## Installation
15 | 
16 | ```console
17 | yarn install
18 | pip install pydoc-markdown
19 | ```
20 | 
21 | ## Local Development
22 | 
23 | Navigate to the website folder and run:
24 | 
25 | ```console
26 | pydoc-markdown
27 | yarn start
28 | ```
29 | 
30 | This command starts a local development server and opens up a browser window. Most changes are reflected live without having to restart the server.
31 | 
32 | ## Build
33 | 
34 | ```console
35 | yarn build
36 | ```
37 | 
38 | This command generates static content into the `build` directory, which can be served using any static content hosting service.
39 | 
40 | ## Deployment
41 | 
42 | ```console
43 | GIT_USER=<Your GitHub username> USE_SSH=true yarn deploy
44 | ```
45 | 
46 | If you are using GitHub Pages for hosting, this command is a convenient way to build the website and push to the `gh-pages` branch.
47 | 
--------------------------------------------------------------------------------
/website/UML/diagrams/out/classes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/UML/diagrams/out/classes.png
--------------------------------------------------------------------------------
/website/UML/diagrams/out/classification_data_processing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/UML/diagrams/out/classification_data_processing.png
--------------------------------------------------------------------------------
/website/UML/diagrams/out/classification_train.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/UML/diagrams/out/classification_train.png
--------------------------------------------------------------------------------
/website/UML/diagrams/out/classifier.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/UML/diagrams/out/classifier.png
--------------------------------------------------------------------------------
/website/UML/diagrams/out/training_lifecycle.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/UML/diagrams/out/training_lifecycle.png
--------------------------------------------------------------------------------
/website/UML/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 | 
3 | pushd %~dp0
4 | 
5 | REM Command file for UML diagram generation
6 | 
7 | if "%1" == "svg" goto svg
8 | if "%1" == "png" goto png
9 | 
10 | java -jar plantuml.jar -h
11 | goto end
12 | 
13 | :svg
14 | java -jar plantuml.jar -tsvg -o "../out" "diagrams/src"
15 | goto end
16 | 
17 | :png
18 | java -jar plantuml.jar -tpng -o "../out" "diagrams/src"
19 | 
20 | :end
21 | popd
--------------------------------------------------------------------------------
/website/UML/plantuml.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/UML/plantuml.jar
--------------------------------------------------------------------------------
/website/UML/readme.md:
--------------------------------------------------------------------------------
1 | ## UML Diagrams
2 | 
3 | We created all diagrams using PlantUML. If you are using Visual Studio Code,
4 | you can leverage the PlantUML extension for modifying and previewing diagrams.
5 | 
6 | https://plantuml.com/
7 | 
8 | ### Prerequisites
9 | 
10 | - PlantUML.jar: https://sourceforge.net/projects/plantuml/files/plantuml.jar/download
11 | - Java: https://www.java.com/en/download/
12 | - Add the `java` executable to PATH, or if using the PlantUML extension for VS Code,
13 |   add the full path to `java.exe` under `plantuml.java` in PlantUML's `settings.json` file.
14 | - Graphviz (not needed for sequence diagrams): https://graphviz.org/download/
15 | 
16 | Note: the latest versions of PlantUML include a minimalistic Graphviz dot.exe.
17 | 
18 | ### Building the diagrams
19 | 
20 | All diagram sources are stored under `website/UML/diagrams/src`, and outputs
21 | under `website/UML/diagrams/out`.
22 | 
23 | To automatically generate all diagrams, run `make svg` for .svg outputs or `make png` for .png outputs.
24 | 
25 | ```bash
26 | cd website/UML
27 | make svg
28 | make png
29 | ```
30 | 
31 | To manually build the diagrams, you can use the following command, which will
32 | search the directory for .pu files containing `@startuml` and `@enduml`, and
33 | create all diagrams found under `website/UML/diagrams/src`.
34 | 
35 | ```bash
36 | cd website/UML
37 | java -jar plantuml.jar -tsvg -o "../out" "diagrams/src"
38 | ```
39 | 
40 | Note: all diagram names are specified via `@startuml diagram_name` in each file.
41 | 
42 | 
43 | ### Contribution
44 | 
45 | To modify an existing diagram, simply edit between `@startuml diagram_name` and `@enduml`,
46 | and regenerate the diagrams. To create new diagrams, please create a new .pu file; it will
47 | automatically be detected when calling `make svg`.
--------------------------------------------------------------------------------
/website/babel.config.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 |   presets: [require.resolve('@docusaurus/core/lib/babel/preset')],
3 | };
--------------------------------------------------------------------------------
/website/docs/contributing.md:
--------------------------------------------------------------------------------
1 | # Contribution and Feedback
2 | ## Contributing
3 | PyMarlin welcomes your contributions!
4 | 
5 | ## Contributor License Agreement
6 | This project welcomes contributions and suggestions. Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
7 | 
8 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions provided by the bot. You will only need to do this once across all repos using our CLA.
9 | 
10 | ## Code of Conduct
11 | This project has adopted the Microsoft Open Source Code of Conduct. For more information, see the Code of Conduct FAQ or contact opencode@microsoft.com with any additional questions or comments.
12 | 
13 | ## Feedback
14 | The PyMarlin library is developed by Microsoft engineers, and we cannot wait to see how our library will be adopted by the wider community and help bring more innovation into the world of Artificial Intelligence!
15 | 
16 | Please reach out to us with any feedback and suggestions about the library in the GitHub issues here: https://github.com/microsoft/PyMarlin/issues
--------------------------------------------------------------------------------
/website/docs/examples/classification.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Covid-19 Classification
3 | ---
4 | 
5 | In this example, we finetune BERT for Covid-19 tweet sentiment detection. We also provide instructions for running seamlessly in AzureML with DDP distributed multi-GPU training. Check out the latest instructions in the GitHub repo [here](https://github.com/microsoft/PyMarlin/tree/main/examples/covid19_text_classification).
6 | 
--------------------------------------------------------------------------------
/website/docs/examples/datamodule-example.md:
--------------------------------------------------------------------------------
1 | # Data interface single and multi process
2 | 
3 | This is an example explaining how to leverage the built-in multiprocessing capability of DataInterface for large amounts of data.
4 | For this example, we use 27 files of raw Wikipedia text. The setups covered are:
5 | 1) Azure virtual machine: single node, multi-process, single selective machine
6 | 2) AML: single node vs multi-node, single selective machine
7 | ## Configs - YAML and Parsing
8 | 
9 | For ease of use, configs are passed in as YAML files.
10 | In this case we use the config file `config_prod.yaml` included with the example code.
11 | 
12 | Snippet of the config (modify file paths according to your folder structure):
13 | 
14 | ```yaml
15 | input_dir: 'C:/Users/ashwinsr/wikipedia.part1'
16 | out_dir: 'C:/Users/ashwinsr/out_fold'
17 | process_count: 10
18 | run_type: ''
19 | ```
20 | 
21 | This config can be read in as below:
22 | 
23 | ```python
24 | # Create arg parser and read config
25 | parser = CustomArgParser(log_level='DEBUG', default_yamlfile="config_prod.yaml")
26 | config = parser.parse()
27 | ```
28 | 
29 | Our data processor is a simple token splitter: given raw text, it splits the text into tokens and stores the results back in a file. The processor runs one file at a time.
30 | 
31 | ## Virtual machine
32 | ### Single virtual machine with multi process
33 | 
34 | ```python
35 | dataInterface = Ex_dataInterface()
36 | file_list = dataInterface.get_file_names(config["input_dir"])
37 | # create and run processor1
38 | example_processor = Processor1(config["input_dir"], config["out_dir"])
39 | out = example_processor.multi_process_data(file_list, process_count=config["process_count"])
40 | ```
41 | Here we create a list of the files in the directory and initialize the processor with the input and output directories. We then call the multi_process_data function on the processor, passing the list of files along with the process count. The processor spins up that many processes to create the corresponding output.
42 | 
43 | ### Selective node preprocessing
44 | This covers the case where we have a single node but want to process the data in batches: we want the processor to run on a different subset of files depending on the rank we assign. This emulates multi-node behaviour on a single node by controlling the node rank parameter, as illustrated in the sketch below.
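To make the file-splitting arithmetic concrete, here is a minimal, self-contained sketch of rank-based sharding. This is illustrative only: the even-chunking scheme and the variable names are assumptions for exposition, not the exact internals of pymarlin's DataProcessor.

```python
# Illustrative sketch of rank-based file sharding (not pymarlin's exact internals).
file_list = [f"wiki_{i:02d}.txt" for i in range(30)]  # e.g., 30 input files

node_count = 5  # total number of (real or emulated) nodes
node_rank = 3   # rank assigned to this run, 0-indexed

# Split the files into node_count even chunks and keep only this rank's chunk.
chunk_size = len(file_list) // node_count
my_files = file_list[node_rank * chunk_size : (node_rank + 1) * chunk_size]

print(my_files)  # the fourth of five chunks: files 18..23 (0-indexed)
```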
45 | 
46 | For instance, if we have 30 files to process over 5 separate runs, then we need to add the following to the config and initialize the DataProcessor accordingly:
47 | 
48 | ```yaml
49 | distribArgs:
50 |     local_rank: 0
51 |     global_rank: 0
52 |     world_size: 1
53 |     node_count: 5
54 |     local_size: 1
55 |     node_rank: 3
56 | ```
57 | 
58 | ```python
59 | distrib = DistributedPreprocessArguments(**config["distribArgs"])
60 | example_processor = Processor1(config["input_dir"], config["out_dir"], distrib)
61 | ```
62 | Remember to initialize the base DataProcessor class with the distributed arguments as shown below; the default of `None` treats the job like a regular multi-process preprocessing job.
63 | 
64 | ```python
65 | class Processor1(data_interface.DataProcessor):
66 |     def __init__(self, input_dir, out_dir, distrib_args = None):
67 |         super(Processor1, self).__init__(distrib_args)
68 |         self.input_dir = input_dir
69 |         self.out_dir = out_dir
70 | ```
71 | 
72 | With the above settings we would process files 19-24 of the 30 (the fourth of five chunks), since node_rank is 3 (0-indexed) and can be at most 4 here; node_count gives the total number of nodes available.
73 | This gives flexibility for large data processing with limited compute.
74 | 
75 | To run on a virtual machine, copy the files over to the machine using SCP,
76 | install pymarlin and the requirements, and run the example:
77 | ```bash
78 | > ssh $user@$machine -p $port
79 | $ pip install ./pymarlin --force-reinstall
80 | $ pip install -r pymarlin/requirements.txt
81 | $ cd data_ex
82 | $ python data.py
83 | ```
84 | 
85 | ### AML
86 | We can do both single-node and multi-node processing with AML. The datamodule handles AML ranking internally for both single- and multi-node runs, to appropriately divide the files across nodes.
87 | You will find a notebook along with the example to submit an AML job, with placeholders for the storage and compute accounts.
--------------------------------------------------------------------------------
/website/docs/examples/distillation.md:
--------------------------------------------------------------------------------
1 | # Distillation
2 | 
3 | With the `pymarlin` library, distillation can be done in a standalone manner or as an extension to your original training scenario. In this example, we will go through how the [GLUE Task](glue-tasks.md) setup was extended to also perform distillation.
4 | 
5 | Data preprocessing is the same as [here](glue-tasks.md). The main implementation is in the `ModuleInterface`, which we chose to call `DistillRecipe` (inheriting from the GLUE `Recipe`).
6 | 
7 | The key methods of `DistillRecipe` that we want to override:
8 | 1. Setting up the teacher and student models, and related items such as configs, as needed. Here, we have the option to modify the student config depending on the desired changes to the depth or width of the model.
9 | ```python
10 | def setup_models(self):
11 |     self._setup_configs()
12 |     # teacher setup
13 |     self.teacher = AutoModelForSequenceClassification.from_pretrained(
14 |         os.path.join(self.args.model_args.model_wts_path, self.args.model_args.model_file),
15 |         config=self.model_config
16 |     )
17 |     # student setup
18 |     self.model = copy.deepcopy(self.teacher)
19 |     if len(self.student_layers) > 0:
20 |         layer_modules = getattr(self.model, self.args.model_args.encoder_key).encoder.layer
21 |         new_layer_modules = distill_utils.extract_layers(layer_modules, self.student_layers)
22 |         getattr(self.model, self.args.model_args.encoder_key).encoder.layer = new_layer_modules
23 | 
24 |     self.teacher.eval()
25 |     self.output_hidden = True if 'hidden_states' in self.loss_types else False
26 |     self.output_attentions = True if 'attentions' in self.loss_types else False
27 |     return (self.model, self.teacher)
28 | ```
29 | 
30 | 2. Modify `train_step` to set the teacher in eval mode, get the teacher outputs, get the student outputs, and compute a custom loss. The loss can be a combination of `logits`, `labels`, or various intermediate representations such as `hidden_states` and `attentions`. You have the flexibility to determine your distillation logic.
31 | ```python
32 | def train_step(self, global_step, batch, device):
33 |     self.teacher.eval()
34 |     inputs = self._inputs_to_device(batch, device)
35 |     teacher_outputs = self.teacher.forward(**inputs,
36 |                                            output_hidden_states=self.output_hidden,
37 |                                            output_attentions=self.output_attentions,
38 |                                            )  # label_loss, logits, hidden, attns
39 |     student_outputs = self.model.forward(**inputs,
40 |                                          output_hidden_states=self.output_hidden,
41 |                                          output_attentions=self.output_attentions,
42 |                                          )
43 |     total_loss = torch.zeros([1], dtype=student_outputs[0].dtype, device=device)
44 |     for i, k in enumerate(self.loss_types):
45 |         if k == 'labels':
46 |             student_scores = student_outputs.loss
47 |             teacher_scores = teacher_outputs.loss
48 |         else:
49 |             student_scores = getattr(student_outputs, k)
50 |             teacher_scores = getattr(teacher_outputs, k)
51 | 
52 |         if student_scores is not None and teacher_scores is not None:
53 |             if k == 'logits':
54 |                 total_loss += self.loss_weights[i] * distill_utils.logits_loss(
55 |                     student_scores, teacher_scores,
56 |                     temperature=self.distill_args.temperature,
57 |                 )
58 |             elif k != 'logits' and self.distill_args.width_shrinkage == 0:
59 |                 total_loss += self.loss_weights[i] * distill_utils.representations_loss(
60 |                     student_scores,
61 |                     teacher_scores,
62 |                     [*range(len(self.student_layers))],
63 |                     self.student_layers
64 |                 )
65 |     return total_loss
66 | ```
67 | 
68 | 3. As an example, `on_end_train` can be used to clean up any changes made to the final student model config and save it to the output directory along with the student model.
69 | 
70 | That's it! If you already have a scenario set up, it's as easy as overriding just two methods.
--------------------------------------------------------------------------------
/website/docs/examples/glue-tasks.md:
--------------------------------------------------------------------------------
1 | # GLUE Tasks
2 | 
3 | You can use the `pymarlin` library to easily benchmark your models on the GLUE tasks.
4 | Check out the README in the GitHub repo [here](https://github.com/microsoft/PyMarlin/tree/main/examples/glue_text_benchmark).
--------------------------------------------------------------------------------
/website/docs/examples/images/cifar.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/docs/examples/images/cifar.png
--------------------------------------------------------------------------------
/website/docs/examples/images/tb.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/docs/examples/images/tb.jpg
--------------------------------------------------------------------------------
/website/docs/examples/images/tensorboard_screenshot_bart.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/docs/examples/images/tensorboard_screenshot_bart.jpg
--------------------------------------------------------------------------------
/website/docs/examples/summarization.md:
--------------------------------------------------------------------------------
1 | # CNN/DailyMail Summarization
2 | 
3 | In this example, we finetune a BART model for summarizing CNN/Daily Mail news articles. Check out the README in the GitHub repo [here](https://github.com/microsoft/PyMarlin/tree/main/examples/cnndailymail_text_summarization) for the latest instructions on how to run.
--------------------------------------------------------------------------------
/website/docs/getting-started.md:
--------------------------------------------------------------------------------
1 | # Getting Started
2 | 
3 | ### Welcome to PyMarlin, a lightweight PyTorch library for agile deep learning!
4 | PyMarlin is a lightweight PyTorch extension library for agile deep learning experimentation. PyMarlin was developed with the goal of simplifying the E2E deep learning experimentation lifecycle for data scientists. The library enables an agile way to quickly prototype a new AI scenario on your dev box and seamlessly scale to multi-node GPU training in AzureML or any other cloud service.
5 | 
6 | ## Key features
7 | - **Data pre-processing** module which enables data preprocessing recipes to scale from a single CPU to multi-CPU and multi-node setups.
8 | - **Infra-agnostic design**: native Azure ML integration implies the same code running on a local dev box can also run directly on any VM or Azure ML cluster.
9 | - **Trainer backend abstraction** with support for single-process (CPU/GPU), Distributed Data Parallel, and mixed-precision (AMP, Apex) training. The ORT and DeepSpeed libraries are also integrated to get the best distributed training throughput.
10 | - Out-of-the-box **Plugins** that can be used for typical NLP tasks like sequence classification, named entity recognition, and seq2seq text generation.
11 | - **Utility modules** for model checkpointing, stats collection, and Tensorboard event logging, which can be customized based on your scenario.
12 | - **Custom arguments parser** that allows for saving all the default values for arguments related to a scenario in a YAML config file, merging user-supplied arguments at runtime.
13 | 
14 | 
15 | ## Start exploring!
16 | 
17 | ### Train your first model with pymarlin
18 | 
19 | Check out [CIFAR image classification](examples/cifar.md) from the EXAMPLES section.
20 | 
21 | ### GLUE task benchmarking
22 | 
23 | Explore how to use pymarlin to [benchmark your models on GLUE tasks](examples/glue-tasks.md).
24 | 
25 | ## We want your feedback!
26 | 
27 | Reach out to us with your [feedback and suggestions](https://github.com/microsoft/PyMarlin/issues).
--------------------------------------------------------------------------------
/website/docs/installation.md:
--------------------------------------------------------------------------------
1 | # Installation
2 | In this guide, we share instructions on how to set up pymarlin in the following environments:
3 | * Local/dev machine
4 | * AzureML
5 | 
6 | ## Local/Dev Machine
7 | ### Environment setup
8 |     conda create -n pymarlin python=3.8
9 |     conda activate pymarlin
10 | 
11 | ### Install pytorch
12 | [Latest documentation](https://pytorch.org/get-started/locally/)
13 | 
14 |     conda install pytorch cpuonly -c pytorch
15 | 
16 | ### Install PyMarlin
17 | You can install from pip, or alternatively install from source.
18 | 
19 | #### Install from pip (once available in PyPI)
20 | 
21 |     pip install pymarlin
22 | 
23 | #### Install from source
24 | 
25 |     git clone https://github.com/microsoft/PyMarlin.git
26 |     cd PyMarlin
27 |     pip install -e .
28 | 
29 | ## AzureML
30 | Specify the pip package in a supplied conda_env.yml file.
--------------------------------------------------------------------------------
/website/docs/marlin-in-pictures.md:
--------------------------------------------------------------------------------
1 | # PyMarlin in Pictures
2 | 
3 | pymarlin is designed to improve agility, scalability, and code manageability, all while
4 | providing flexibility and control across DL environments. To this end, pymarlin is divided
5 | into various components corresponding to extensible classes, only a few of which need to
6 | be implemented by users. As shown in the following class diagram, the key classes that need
7 | to be implemented are **DataInterface** and **ModuleInterface**. These interact with **Trainer**, which
8 | acts as an orchestrator, and **TrainerBackend**; both are extensible and configurable via
9 | arguments.
10 | 
11 | ![](../UML/diagrams/out/classes.svg)
12 | 
13 | ## Classification task example
14 | 
15 | Below you can find a collection of class and sequence diagrams
16 | for a classification task that exemplifies the use of pymarlin. The example
17 | shown implements the DataInterface and ModuleInterface for classifying tweet
18 | sentiment, as TweetSentDataModule and TweetSentModule. The class diagram
19 | illustrates which classes correspond to built-in pymarlin classes, user-extended
20 | classes specific to the scenario, and some important external dependencies used.
21 | 
22 | ### Class Diagram
23 | 
24 | This diagram shows the classes implemented for the classification task, as well as
25 | relationships between the important pymarlin modules. Here, ModuleInterface has been
26 | extended as *TweetSentModule*, DataInterface as *TweetSentDataModule*, and
27 | data processing is managed by the classes *Stage1* and *Stage2*, extended from DataProcessor. The SingleProcess class is used as the TrainerBackend for this example.
28 | Most customization and settings come from modifying the default TrainerArguments
29 | via the config.yaml file. A minimal sketch of how these pieces fit together in code follows.
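The sketch below shows the overall shape of the user code implied by these diagrams. It is a hedged illustration: the module paths and interface names come from the pymarlin source tree, but the ellipses and the commented-out wiring stand in for scenario-specific details and exact constructor signatures, which may differ.

```python
# Illustrative sketch only; constructor arguments and method sets are assumptions.
from pymarlin.core import data_interface, module_interface, trainer

class TweetSentDataModule(data_interface.DataInterface):
    ...  # wraps the Stage1/Stage2 DataProcessors and exposes the processed datasets

class TweetSentModule(module_interface.ModuleInterface):
    ...  # implements the training/validation hooks, e.g. train_step(global_step, batch, device)

# Typical wiring (see the Training sequence diagram below):
# module = TweetSentModule(data=TweetSentDataModule(), ...)
# t = trainer.Trainer(module=module, args=trainer_args)  # trainer_args parsed from config.yaml
# t.train()
```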
30 | 
31 | ![](../UML/diagrams/out/classifier.svg)
32 | 
33 | ### Training
34 | 
35 | The following sequence diagram illustrates how to train the above classification
36 | model, with emphasis on the module instantiation users need to perform as part
37 | of their main script. Details on the data processing and the training lifecycle referenced
38 | in this diagram can be found below.
39 | 
40 | ![](../UML/diagrams/out/classification_train.svg)
41 | 
42 | ### Data Processing
43 | 
44 | Sequence diagram for the data processing prior to training.
45 | 
46 | ![](../UML/diagrams/out/classification_data_processing.svg)
47 | 
48 | ### Training Lifecycle
49 | 
50 | Sequence diagram for the training loop.
51 | 
52 | ![](../UML/diagrams/out/training_lifecycle.svg)
--------------------------------------------------------------------------------
/website/docs/plugins/images/hfner/ner_dataset_mod.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/docs/plugins/images/hfner/ner_dataset_mod.png
--------------------------------------------------------------------------------
/website/docs/plugins/images/hfseqclass/loss.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/docs/plugins/images/hfseqclass/loss.jpg
--------------------------------------------------------------------------------
/website/docs/plugins/images/hfseqclass/loss.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/docs/plugins/images/hfseqclass/loss.png
--------------------------------------------------------------------------------
/website/docs/plugins/images/hfseqclass/lr.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/docs/plugins/images/hfseqclass/lr.jpg
--------------------------------------------------------------------------------
/website/docs/plugins/images/hfseqclass/lr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/docs/plugins/images/hfseqclass/lr.png
--------------------------------------------------------------------------------
/website/docs/plugins/images/hfseqclass/train_metrics.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/docs/plugins/images/hfseqclass/train_metrics.jpg
--------------------------------------------------------------------------------
/website/docs/plugins/images/hfseqclass/train_metrics.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/docs/plugins/images/hfseqclass/train_metrics.png
--------------------------------------------------------------------------------
/website/docs/utils/images/tb_example.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/docs/utils/images/tb_example.jpg
--------------------------------------------------------------------------------
/website/docs/utils/stats.md:
--------------------------------------------------------------------------------
1 | # Stats and Tensorboard logging
2 | We have implemented a wrapper on Tensorboard's SummaryWriter for logging stats to Tensorboard (TB), which makes it easy to save TB events and visualize them later for tracking the progress of your training experiment. We also have Azure ML and stdout writers, so you can write your stats out to the logs.
3 | 
4 | Usage is demonstrated here:
5 | ```python
6 | import os
7 | os.listdir()
8 | ```
9 | 
10 | 
11 | 
12 | 
13 |     ['.ipynb_checkpoints', 'Untitled.ipynb']
14 | 
15 | 
16 | 
17 | 
18 | ```python
19 | import random
20 | import pymarlin
21 | from pymarlin.utils.stats import global_stats, StatInitArguments
22 | from pymarlin.utils.writer import build_writer, WriterInitArguments
23 | ```
24 | `global_stats` is a singleton variable that can be used across the entire application to log stats.
25 | 
26 | ```python
27 | writers = ['tensorboard','stdout','aml']
28 | writerargs = WriterInitArguments(
29 |     tb_log_dir = './logs'
30 | )
31 | writer_objects = [build_writer(w, writerargs) for w in writers]
32 | global_stats.rebuild(StatInitArguments(), writer_objects)
33 | ```
34 | 
35 |     SystemLog: 2021-01-29 16:02:21,033:INFO : pymarlin.utils.writer.tensorboard : 37 : Cleared directory ./logs (skipping azureml dirs)
36 |     SystemLog: 2021-01-29 16:02:21,040:INFO : pymarlin.utils.writer.tensorboard : 40 : Created tensorboard folder ./logs : []
37 | 
38 | ## Write out the stats you care about
39 | ```python
40 | for i in range(10):
41 |     global_stats.update('loss', random.uniform(0,2), frequent = True)  # adds a new in-memory stat
42 |     global_stats.log_stats(step = i)  # actually logs stats to stdout, tensorboard and aml simultaneously
43 | ```
44 | 
45 |     SystemLog: 2021-01-29 16:06:40,276:INFO : pymarlin.utils.writer.stdout : 10 : step = 0, iteration : 0
46 |     SystemLog: 2021-01-29 16:06:40,279:INFO : pymarlin.utils.writer.stdout : 10 : step = 0, loss : 0.44372909088471446
47 |     SystemLog: 2021-01-29 16:06:40,284:INFO : pymarlin.utils.writer.stdout : 10 : step = 1, loss : 0.5985009500820384
48 |     SystemLog: 2021-01-29 16:06:40,285:INFO : pymarlin.utils.writer.stdout : 10 : step = 2, loss : 1.5669796666205043
49 |     SystemLog: 2021-01-29 16:06:40,286:INFO : pymarlin.utils.writer.stdout : 10 : step = 3, loss : 0.8748342474891679
50 |     SystemLog: 2021-01-29 16:06:40,288:INFO : pymarlin.utils.writer.stdout : 10 : step = 4, loss : 1.8371541447672195
51 |     SystemLog: 2021-01-29 16:06:40,290:INFO : pymarlin.utils.writer.stdout : 10 : step = 5, loss : 0.18000397399047174
52 |     SystemLog: 2021-01-29 16:06:40,292:INFO : pymarlin.utils.writer.stdout : 10 : step = 6, loss : 0.1455008149921977
53 |     SystemLog: 2021-01-29 16:06:40,293:INFO : pymarlin.utils.writer.stdout : 10 : step = 7, loss : 1.4704800219353158
54 |     SystemLog: 2021-01-29 16:06:40,297:INFO : pymarlin.utils.writer.stdout : 10 : step = 8, loss : 0.8764679987392285
55 |     SystemLog: 2021-01-29 16:06:40,298:INFO : pymarlin.utils.writer.stdout : 10 : step = 9, loss : 0.6293567937040325
56 | 
57 | ## Check the Tensorboard logs
58 | ```python
59 | os.listdir('logs')
60 | ```
61 | 
62 |     ['events.out.tfevents.1611964941.krishan-surface.16776.1']
63 | 
64 | ```python
65 | !tensorboard --logdir logs
66 | ```
67 | 
68 | ![img](images/tb_example.jpg)
69 | 
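Beyond single scalars, grouped metrics can be logged with `update_multi` and flushed with `log_long_stats`; the snippet below mirrors the usage exercised in `tests/utils/test_stats.py`.

```python
# Grouped (multi-series) stats, mirroring tests/utils/test_stats.py.
from pymarlin.utils.stats import global_stats

# One chart ('losses') carrying several named series.
global_stats.update_multi('losses', {'train': 0.5, 'val_email': 0.8, 'val_wiki': 0.3}, frequent=False)

# Infrequent ("long") stats are flushed explicitly, e.g. once per evaluation.
global_stats.log_long_stats(step=1000)
```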
70 | For more info on stats, check the stats module docstring in the **pymarlin API** section.
71 | 
--------------------------------------------------------------------------------
/website/docusaurus.config.js:
--------------------------------------------------------------------------------
1 | /** @type {import('@docusaurus/types').DocusaurusConfig} */
2 | module.exports = {
3 |   title: 'PyMarlin',
4 |   tagline: 'Lightweight PyTorch Training Framework',
5 |   url: 'https://github.com/microsoft/PyMarlin',
6 |   baseUrl: '/PyMarlin/',
7 |   onBrokenLinks: 'throw',
8 |   onBrokenMarkdownLinks: 'warn',
9 |   favicon: 'img/favicon.ico',
10 |   organizationName: 'microsoft', // Usually your GitHub org/user name.
11 |   projectName: 'PyMarlin', // Usually your repo name.
12 |   themeConfig: {
13 |     navbar: {
14 |       title: 'PyMarlin',
15 |       logo: {
16 |         alt: 'My Site Logo',
17 |         src: 'img/logo.svg',
18 |       },
19 |       items: [
20 |         {
21 |           type: 'doc',
22 |           docId: 'getting-started',
23 |           position: 'left',
24 |           label: 'Docs',
25 |         },
26 |         {
27 |           type: 'doc',
28 |           docId: 'reference/core/module_interface',
29 |           position: 'left',
30 |           label: 'SDK',
31 |         },
32 |         {
33 |           href: 'https://github.com/microsoft/PyMarlin',
34 |           label: 'GitHub',
35 |           position: 'right',
36 |         },
37 |       ],
38 |     },
39 |     footer: {
40 |       style: 'dark',
41 |       links: [
42 |         {
43 |           title: 'Docs',
44 |           items: [
45 |             {
46 |               label: 'Getting Started',
47 |               to: 'docs/getting-started',
48 |             },
49 |           ],
50 |         },
51 |         {
52 |           title: 'Community',
53 |           items: [
54 |             {
55 |               label: 'Stack Overflow',
56 |               href: 'https://stackoverflow.com/questions/tagged/pymarlin',
57 |             },
58 |             // {
59 |             //   label: 'Discord',
60 |             //   href: 'https://discordapp.com/invite/docusaurus',
61 |             // },
62 |             // {
63 |             //   label: 'Twitter',
64 |             //   href: 'https://twitter.com/docusaurus',
65 |             // },
66 |           ],
67 |         },
68 |         {
69 |           title: 'More',
70 |           items: [
71 |             {
72 |               label: 'GitHub',
73 |               href: 'https://github.com/microsoft/PyMarlin',
74 |             },
75 |           ],
76 |         },
77 |       ],
78 |       copyright: `Copyright © ${new Date().getFullYear()} Microsoft Inc. Built with Docusaurus.`,
79 |     },
80 |   },
81 |   presets: [
82 |     [
83 |       '@docusaurus/preset-classic',
84 |       {
85 |         docs: {
86 |           sidebarPath: require.resolve('./sidebars.js'),
87 |           // Please change this to your repo.
88 |           editUrl:
89 |             'https://github.com/microsoft/PyMarlin/edit/master/website/',
90 |         },
91 |         // blog: {
92 |         //   showReadingTime: true,
93 |         //   // Please change this to your repo.
94 |         //   editUrl:
95 |         //     'https://github.com/facebook/docusaurus/edit/master/website/blog/',
96 |         // },
97 |         theme: {
98 |           customCss: require.resolve('./src/css/custom.css'),
99 |         },
100 |       },
101 |     ],
102 |   ],
103 | };
104 | 
--------------------------------------------------------------------------------
/website/package.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "website",
3 |   "version": "0.0.0",
4 |   "private": true,
5 |   "scripts": {
6 |     "docusaurus": "docusaurus",
7 |     "start": "docusaurus start",
8 |     "build": "docusaurus build",
9 |     "swizzle": "docusaurus swizzle",
10 |     "deploy": "docusaurus deploy",
11 |     "clear": "docusaurus clear",
12 |     "serve": "docusaurus serve",
13 |     "write-translations": "docusaurus write-translations",
14 |     "write-heading-ids": "docusaurus write-heading-ids"
15 |   },
16 |   "dependencies": {
17 |     "@docusaurus/core": "2.0.0-alpha.75",
18 |     "@docusaurus/preset-classic": "2.0.0-alpha.75",
19 |     "@mdx-js/react": "^1.6.21",
20 |     "@svgr/webpack": "^5.5.0",
21 |     "clsx": "^1.1.1",
22 |     "file-loader": "^6.2.0",
23 |     "react": "^17.0.1",
24 |     "react-dom": "^17.0.1",
25 |     "url-loader": "^4.1.1",
26 |     "trim": "^0.0.3"
27 |   },
28 |   "browserslist": {
29 |     "production": [
30 |       ">0.5%",
31 |       "not dead",
32 |       "not op_mini all"
33 |     ],
34 |     "development": [
35 |       "last 1 chrome version",
36 |       "last 1 firefox version",
37 |       "last 1 safari version"
38 |     ]
39 |   }
40 | }
--------------------------------------------------------------------------------
/website/pydoc-markdown.yml:
--------------------------------------------------------------------------------
1 | loaders:
2 |   - type: python
3 |     search_path: [../pymarlin]
4 | processors:
5 |   - type: filter
6 |     skip_empty_modules: true
7 |   - type: smart
8 |   - type: crossref
9 | renderer:
10 |   type: docusaurus
11 |   docs_base_path: docs
12 |   relative_output_path: reference
13 |   relative_sidebar_path: sidebar.json
14 |   sidebar_top_level_label: Reference
--------------------------------------------------------------------------------
/website/sidebars.js:
--------------------------------------------------------------------------------
1 | /**
2 |  * Creating a sidebar enables you to:
3 |  - create an ordered group of docs
4 |  - render a sidebar for each doc of that group
5 |  - provide next/previous navigation
6 | 
7 |  The sidebars can be generated from the filesystem, or explicitly defined here.
8 | 
9 |  Create as many sidebars as you want.
10 |  */
11 | 
12 | module.exports = {
13 |   docsSidebar: [
14 |     'getting-started',
15 |     'installation',
16 |     'marlin-in-pictures',
17 |     {'Examples': [{type: 'autogenerated', dirName: 'examples'}]},
18 |     {'Plugins': [{type: 'autogenerated', dirName: 'plugins'}]},
19 |     'utils/stats',
20 |     'contributing'
21 |   ],
22 |   // pydoc-markdown auto-generated markdowns from docstrings
23 |   referenceSideBar: [require("./docs/reference/sidebar.json")]
24 | };
25 | 
--------------------------------------------------------------------------------
/website/src/components/HomepageFeatures.js:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import clsx from 'clsx';
3 | import styles from './HomepageFeatures.module.css';
4 | 
5 | const FeatureList = [
6 |   {
7 |     title: 'Easy to Use',
8 |     Svg: require('../../static/img/undraw_docusaurus_mountain.svg').default,
9 |     description: (
10 |       <>
11 |         PyMarlin was designed to make PyTorch training as easy as possible while still getting the benefits of GPU and distributed node acceleration.
12 |       </>
13 |     ),
14 |   },
15 |   {
16 |     title: 'Focus on What Matters',
17 |     Svg: require('../../static/img/undraw_docusaurus_tree.svg').default,
18 |     description: (
19 |       <>
20 |         Focus on your scenario code and data preprocessing pipeline; we'll take care of the training loop and optimizations for you.
21 |       </>
22 |     ),
23 |   },
24 |   {
25 |     title: 'Scale out to hundreds of GPUs with AzureML',
26 |     Svg: require('../../static/img/undraw_docusaurus_react.svg').default,
27 |     description: (
28 |       <>
29 |         Run the same code in all environments and just use a simple configuration change to scale from a single CPU on your dev machines
30 |         to hundreds of GPUs in AzureML or other cloud services.
31 |       </>
32 |     ),
33 |   },
34 | ];
35 | 
36 | function Feature({Svg, title, description}) {
37 |   return (
38 |     <div className={clsx('col col--4')}>
39 |       <div className="text--center">
40 |         <Svg className={styles.featureSvg} alt={title} />
41 |       </div>
42 |       <div className="text--center padding-horiz--md">
43 |         <h3>{title}</h3>
44 |         <p>{description}</p>
45 |       </div>
46 |     </div>
47 |   );
48 | }
49 | 
50 | export default function HomepageFeatures() {
51 |   return (
52 |     <section className={styles.features}>
53 |       <div className="container">
54 |         <div className="row">
55 |           {FeatureList.map((props, idx) => (
56 |             <Feature key={idx} {...props} />
57 |           ))}
58 |         </div>
59 |       </div>
60 |     </section>
61 |   );
62 | }
63 | 
--------------------------------------------------------------------------------
/website/src/components/HomepageFeatures.module.css:
--------------------------------------------------------------------------------
1 | /* stylelint-disable docusaurus/copyright-header */
2 | 
3 | .features {
4 |   display: flex;
5 |   align-items: center;
6 |   padding: 2rem 0;
7 |   width: 100%;
8 | }
9 | 
10 | .featureSvg {
11 |   height: 200px;
12 |   width: 200px;
13 | }
--------------------------------------------------------------------------------
/website/src/css/custom.css:
--------------------------------------------------------------------------------
1 | /* stylelint-disable docusaurus/copyright-header */
2 | /**
3 |  * Any CSS included here will be global. The classic template
4 |  * bundles Infima by default. Infima is a CSS framework designed to
5 |  * work well for content-centric websites.
6 |  */
7 | 
8 | /* You can override the default Infima variables here. */
9 | :root {
10 |   --ifm-color-primary: #25c2a0;
11 |   --ifm-color-primary-dark: rgb(33, 175, 144);
12 |   --ifm-color-primary-darker: rgb(31, 165, 136);
13 |   --ifm-color-primary-darkest: rgb(26, 136, 112);
14 |   --ifm-color-primary-light: rgb(70, 203, 174);
15 |   --ifm-color-primary-lighter: rgb(102, 212, 189);
16 |   --ifm-color-primary-lightest: rgb(146, 224, 208);
17 |   --ifm-code-font-size: 95%;
18 | }
19 | 
20 | .docusaurus-highlight-code-line {
21 |   background-color: rgb(72, 77, 91);
22 |   display: block;
23 |   margin: 0 calc(-1 * var(--ifm-pre-padding));
24 |   padding: 0 var(--ifm-pre-padding);
25 | }
--------------------------------------------------------------------------------
/website/src/pages/index.js:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import clsx from 'clsx';
3 | import Layout from '@theme/Layout';
4 | import Link from '@docusaurus/Link';
5 | import useDocusaurusContext from '@docusaurus/useDocusaurusContext';
6 | import styles from './index.module.css';
7 | import HomepageFeatures from '../components/HomepageFeatures';
8 | 
9 | function HomepageHeader() {
10 |   const {siteConfig} = useDocusaurusContext();
11 |   return (
12 |     <header className={clsx('hero hero--primary', styles.heroBanner)}>
13 |       <div className="container">
14 |         <h1 className="hero__title">{siteConfig.title}</h1>
15 |         <p className="hero__subtitle">{siteConfig.tagline}</p>
16 |         <div className={styles.buttons}>
17 |           <Link
18 |             className="button button--secondary button--lg"
19 |             to="/docs/getting-started">
20 |             PyMarlin Getting Started - 5min ⏱️
21 |           </Link>
22 |         </div>
23 |       </div>
24 |     </header>
36 | 37 |
38 |
39 | ); 40 | } 41 | -------------------------------------------------------------------------------- /website/src/pages/index.module.css: -------------------------------------------------------------------------------- 1 | /* stylelint-disable docusaurus/copyright-header */ 2 | 3 | /** 4 | * CSS files with the .module.css suffix will be treated as CSS modules 5 | * and scoped locally. 6 | */ 7 | 8 | .heroBanner { 9 | padding: 4rem 0; 10 | text-align: center; 11 | position: relative; 12 | overflow: hidden; 13 | } 14 | 15 | @media screen and (max-width: 966px) { 16 | .heroBanner { 17 | padding: 2rem; 18 | } 19 | } 20 | 21 | .buttons { 22 | display: flex; 23 | align-items: center; 24 | justify-content: center; 25 | } 26 | -------------------------------------------------------------------------------- /website/src/pages/markdown-page.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Markdown page example 3 | --- 4 | 5 | # Markdown page example 6 | 7 | You don't need React to write simple standalone pages. 8 | -------------------------------------------------------------------------------- /website/static/.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/static/.nojekyll -------------------------------------------------------------------------------- /website/static/img/docusaurus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/static/img/docusaurus.png -------------------------------------------------------------------------------- /website/static/img/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/static/img/favicon.ico -------------------------------------------------------------------------------- /website/static/img/tutorial/docsVersionDropdown.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/static/img/tutorial/docsVersionDropdown.png -------------------------------------------------------------------------------- /website/static/img/tutorial/localeDropdown.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/static/img/tutorial/localeDropdown.png --------------------------------------------------------------------------------