├── .coveragerc ├── .github └── workflows │ ├── canary.yml │ ├── deploy-website.yml │ ├── python-publish.yml │ └── test.yml ├── .gitignore ├── .pylintrc ├── CODE_OF_CONDUCT.md ├── Dockerfile ├── LICENSE ├── README.md ├── SECURITY.md ├── SUPPORT.md ├── VERSION_NOTES.md ├── examples ├── cifar_image_classification │ ├── .images │ │ └── tensorboard_screenshot.jpg │ ├── CIFAR.ipynb │ ├── README.md │ ├── cifar.py │ ├── config.yaml │ └── requirements.txt ├── cnndailymail_text_summarization │ ├── ORT_README.md │ ├── azureml │ │ ├── Dockerfile │ │ └── submit_ortds.py │ ├── config-ortds.yaml │ ├── config-prod.yaml │ ├── config.yaml │ ├── data.py │ ├── deepspeed_methods │ │ ├── __init__.py │ │ ├── deepspeedConfig.json │ │ ├── deepspeed_trainer.py │ │ ├── deepspeed_trainer_backend.py │ │ └── deepspeed_utils.py │ ├── images │ │ └── tensorboard_screenshot_bart.jpg │ ├── infer.py │ ├── model_ortds.py │ ├── readme.md │ ├── requirements.txt │ ├── train.py │ └── train_ortds.py ├── covid19_text_classification │ ├── azureml │ │ ├── dockerfile │ │ └── submit.py │ ├── config.yaml │ ├── data.py │ ├── readme.md │ ├── requirements.txt │ └── train.py ├── germ_text_ner │ ├── GermEvalAML.ipynb │ ├── config_germ.yaml │ ├── readme.md │ ├── requirements.txt │ ├── test.py │ ├── train_germ │ │ └── train.tsv │ └── val_germ │ │ └── dev.tsv ├── glue_text_benchmark │ ├── Dockerfile │ ├── README.md │ ├── configs-roberta-base │ │ ├── cola.yaml │ │ ├── mnli.yaml │ │ ├── mrpc.yaml │ │ ├── qnli.yaml │ │ ├── qqp.yaml │ │ ├── rte.yaml │ │ ├── sst2.yaml │ │ └── stsb.yaml │ ├── images │ │ └── tensorboard_screenshot.jpg │ ├── logs_roberta_base │ │ └── rte │ │ │ └── from_pretrained │ │ │ └── events.out.tfevents.1623336412.krishan-vm.20548.0 │ ├── requirements.txt │ └── src │ │ ├── data.py │ │ ├── infer.py │ │ └── train.py ├── readme.md └── snli_benchmark │ ├── SNLI.ipynb │ ├── configs-bert-base │ └── snli.yaml │ └── src │ ├── data.py │ └── train.py ├── pymarlin ├── __init__.py ├── core │ ├── __init__.py │ ├── data_interface.py │ ├── module_interface.py │ ├── trainer.py │ └── trainer_backend.py ├── plugins │ ├── __init__.py │ ├── base.py │ ├── hf_ner │ │ ├── __init__.py │ │ ├── config_germ.yaml │ │ ├── data_classes.py │ │ ├── implementation.py │ │ ├── module_classes.py │ │ └── sequence_labelling_metrics.py │ ├── hf_seq2seq │ │ ├── __init__.py │ │ ├── data_classes.py │ │ ├── implementation.py │ │ ├── metric_utils.py │ │ └── module_classes.py │ ├── hf_seq_classification │ │ ├── __init__.py │ │ ├── config.yaml │ │ ├── data_classes.py │ │ ├── implementation.py │ │ ├── metric_utils.py │ │ └── module_classes.py │ ├── hfdistill_utils.py │ └── plugin_module_interface.py └── utils │ ├── __init__.py │ ├── checkpointer │ ├── __init__.py │ └── checkpoint_utils.py │ ├── config_parser │ ├── __init__.py │ └── custom_arg_parser.py │ ├── differential_privacy.py │ ├── distributed.py │ ├── fabrics.py │ ├── logger │ ├── __init__.py │ └── logging_utils.py │ ├── misc │ ├── __init__.py │ └── misc_utils.py │ ├── stats │ ├── __init__.py │ └── basic_stats.py │ └── writer │ ├── __init__.py │ ├── aml.py │ ├── base.py │ ├── stdout.py │ └── tensorboard.py ├── pyproject.toml ├── pytest.ini ├── setup.py ├── tests ├── core │ ├── test_data_interface.py │ ├── test_trainer.py │ └── test_trainer_backend.py ├── test_sanity.py └── utils │ ├── config.yaml │ ├── corrupt_files │ └── config.yaml │ ├── test_checkpointer.py │ ├── test_config_parser.py │ └── test_stats.py └── website ├── .gitignore ├── README.md ├── UML ├── diagrams │ ├── out │ │ ├── classes.png │ │ ├── classes.svg 
│ │ ├── classification_data_processing.png │ │ ├── classification_data_processing.svg │ │ ├── classification_train.png │ │ ├── classification_train.svg │ │ ├── classifier.png │ │ ├── classifier.svg │ │ ├── training_lifecycle.png │ │ └── training_lifecycle.svg │ └── src │ │ ├── classes.pu │ │ ├── classifier.pu │ │ └── train_manager_sequence.pu ├── make.bat ├── plantuml.jar └── readme.md ├── babel.config.js ├── docs ├── contributing.md ├── examples │ ├── checkpointing.md │ ├── cifar.md │ ├── classification.md │ ├── datamodule-example.md │ ├── distillation.md │ ├── glue-tasks.md │ ├── images │ │ ├── cifar.png │ │ ├── tb.jpg │ │ └── tensorboard_screenshot_bart.jpg │ └── summarization.md ├── getting-started.md ├── installation.md ├── marlin-in-pictures.md ├── plugins │ ├── hf_ner.md │ ├── hf_seq_classification.md │ └── images │ │ ├── hfner │ │ └── ner_dataset_mod.png │ │ └── hfseqclass │ │ ├── loss.jpg │ │ ├── loss.png │ │ ├── lr.jpg │ │ ├── lr.png │ │ ├── train_metrics.jpg │ │ └── train_metrics.png └── utils │ ├── images │ └── tb_example.jpg │ └── stats.md ├── docusaurus.config.js ├── package.json ├── pydoc-markdown.yml ├── sidebars.js ├── src ├── components │ ├── HomepageFeatures.js │ └── HomepageFeatures.module.css ├── css │ └── custom.css └── pages │ ├── index.js │ ├── index.module.css │ └── markdown-page.md ├── static ├── .nojekyll └── img │ ├── docusaurus.png │ ├── favicon.ico │ ├── logo.svg │ ├── tutorial │ ├── docsVersionDropdown.png │ └── localeDropdown.png │ ├── undraw_docusaurus_mountain.svg │ ├── undraw_docusaurus_react.svg │ └── undraw_docusaurus_tree.svg └── yarn.lock /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = 3 | # omit everything under plugins for now 4 | pymarlin/plugins/* 5 | 6 | [report] 7 | # Regexes for lines to exclude from consideration 8 | exclude_lines = 9 | # exclude abstract functions that will most likely never get run anyway 10 | pass -------------------------------------------------------------------------------- /.github/workflows/canary.yml: -------------------------------------------------------------------------------- 1 | name: azureml canary 2 | 3 | on: 4 | workflow_dispatch: 5 | schedule: 6 | - cron: '5 8 * * 0' # runs once a week at 08:05 on day 0 (Sunday) 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: check out repo 13 | uses: actions/checkout@v2 14 | - name: setup python 15 | uses: actions/setup-python@v2 16 | with: 17 | python-version: "3.8" 18 | - name: requirements 19 | run: pip install azureml-sdk>=1.20.0 20 | - name: azure login 21 | uses: azure/login@v1 22 | with: 23 | creds: ${{secrets.AZURE_CREDENTIALS}} 24 | - name: release canary 25 | run: | 26 | cd examples/covid19_text_classification/azureml/ 27 | python submit.py --backend ddp-amp --process_count 2 --wait \ 28 | --subscription_id ${{secrets.SUBSCRIPTION_ID}} --resource_group ${{secrets.RESOURCE_GROUP}} \ 29 | --workspace_name ${{secrets.WORKSPACE_NAME}} -------------------------------------------------------------------------------- /.github/workflows/deploy-website.yml: -------------------------------------------------------------------------------- 1 | name: docs 2 | 3 | on: 4 | pull_request: 5 | branches: [main] 6 | push: 7 | branches: [main] 8 | workflow_dispatch: 9 | 10 | jobs: 11 | checks: 12 | if: github.event_name != 'push' 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v1 16 | - uses: actions/setup-node@v1 17 | with: 18 | node-version: '12.x' 19 | - name: 
setup python 20 | uses: actions/setup-python@v2 21 | with: 22 | python-version: "3.8" 23 | - name: pydoc-markdown install 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install docspec-python==0.1.0 27 | pip install pydoc-markdown 28 | - name: pydoc-markdown run 29 | run: | 30 | cd website 31 | pydoc-markdown 32 | - name: Test Build 33 | run: | 34 | cd website 35 | if [ -e yarn.lock ]; then 36 | yarn install --frozen-lockfile 37 | elif [ -e package-lock.json ]; then 38 | npm ci 39 | else 40 | npm i 41 | fi 42 | npm run build 43 | gh-release: 44 | if: github.event_name != 'pull_request' 45 | runs-on: ubuntu-latest 46 | steps: 47 | - uses: actions/checkout@v1 48 | - uses: actions/setup-node@v1 49 | with: 50 | node-version: '12.x' 51 | - name: setup python 52 | uses: actions/setup-python@v2 53 | with: 54 | python-version: "3.8" 55 | - name: pydoc-markdown install 56 | run: | 57 | python -m pip install --upgrade pip 58 | pip install docspec-python==0.1.0 59 | pip install pydoc-markdown 60 | - name: pydoc-markdown run 61 | run: | 62 | cd website 63 | pydoc-markdown 64 | - name: Add key to allow access to repository 65 | env: 66 | SSH_AUTH_SOCK: /tmp/ssh_agent.sock 67 | run: | 68 | mkdir -p ~/.ssh 69 | ssh-keyscan github.com >> ~/.ssh/known_hosts 70 | echo "${{ secrets.GH_PAGES_DEPLOY }}" > ~/.ssh/id_rsa 71 | chmod 600 ~/.ssh/id_rsa 72 | cat <<EOT >> ~/.ssh/config 73 | Host github.com 74 | HostName github.com 75 | IdentityFile ~/.ssh/id_rsa 76 | EOT 77 | - name: Release to GitHub Pages 78 | env: 79 | USE_SSH: true 80 | GIT_USER: git 81 | run: | 82 | git config --global user.email "actions@github.com" 83 | git config --global user.name "gh-actions" 84 | cd website 85 | if [ -e yarn.lock ]; then 86 | yarn install --frozen-lockfile 87 | elif [ -e package-lock.json ]; then 88 | npm ci 89 | else 90 | npm i 91 | fi 92 | yarn deploy -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow builds and uploads the Python package using Twine on every push to main 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | 5 | name: pypi 6 | 7 | on: 8 | push: 9 | branches: [main] 10 | jobs: 11 | deploy: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Python 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: '3.8' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | python -m pip install --upgrade build 25 | pip install setuptools wheel twine 26 | - name: Build 27 | run: | 28 | python -m build 29 | 30 | - name: Publish to TestPyPi 31 | env: 32 | TWINE_USERNAME: ${{ secrets.TEST_PYPI_USERNAME }} 33 | TWINE_PASSWORD: ${{ secrets.TEST_PYPI_PASSWORD }} 34 | run: | 35 | python -m twine upload --repository testpypi dist/* --skip-existing 36 | 37 | - name: Publish to PyPi 38 | env: 39 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 40 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 41 | run: | 42 | python -m twine upload dist/* --skip-existing 43 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of 
Python 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: lint & test 5 | 6 | on: 7 | workflow_dispatch: 8 | push: 9 | branches: [ main ] 10 | pull_request: 11 | branches: [ main ] 12 | 13 | jobs: 14 | build: 15 | 16 | runs-on: ubuntu-latest 17 | 18 | steps: 19 | - uses: actions/checkout@v2 20 | - name: Set up Python 3.8 21 | uses: actions/setup-python@v2 22 | with: 23 | python-version: 3.8 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install torch==1.7.1+cpu -f https://download.pytorch.org/whl/torch_stable.html 28 | pip install -e .[dev] 29 | - name: lint 30 | run: | 31 | pylint pymarlin --rcfile=.pylintrc 32 | - name: test with coverage 33 | run: | 34 | pytest --cov=pymarlin --cov-report=xml --cov-config=.coveragerc 35 | - name: Upload coverage to Codecov 36 | uses: codecov/codecov-action@v1 37 | with: 38 | files: ./coverage.xml 39 | directory: ./coverage/reports/ 40 | flags: unittests 41 | env_vars: OS,PYTHON 42 | name: codecov-umbrella 43 | fail_ci_if_error: true 44 | path_to_write_report: ./coverage/codecov_report.txt 45 | verbose: true 46 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # logs 132 | logs/ 133 | 134 | # vscode 135 | .vscode/ 136 | 137 | # pydoc-markdown auto-generated reference docs 138 | /website/docs/reference/** 139 | 140 | # AzureML Workspace Config JSON files 141 | config.json 142 | 143 | # local snapshots of pymarlin submitted to azureml 144 | /examples/*/pymarlin 145 | 146 | # don't check in data as it normally comes with its own restrictive license 147 | /examples/*/data 148 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM mcr.microsoft.com/azureml/base:openmpi3.1.2-ubuntu18.04 2 | RUN apt-get update 3 | 4 | 5 | # create conda environment 6 | RUN conda update -n base -c defaults conda -y 7 | RUN conda create -n marlin python=3.8 -y 8 | RUN echo ". /opt/miniconda/etc/profile.d/conda.sh" >> ~/.bashrc 9 | 10 | # install latest torch 11 | # CUDA toolkit versions other than 10.2 make GPUs invisible (base image issue) 12 | RUN conda install pytorch cudatoolkit=10.2 -c pytorch -y -n marlin 13 | 14 | ADD . /workdir 15 | WORKDIR /workdir 16 | 17 | RUN /opt/miniconda/envs/marlin/bin/pip install -U -e . -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PyMarlin, a lightweight PyTorch library for agile deep learning! 2 | [![Unit Tests](https://github.com/microsoft/PyMarlin/actions/workflows/test.yml/badge.svg)](https://github.com/microsoft/PyMarlin/actions/workflows/test.yml) 3 | [![codecov](https://codecov.io/gh/microsoft/PyMarlin/branch/main/graph/badge.svg?token=wUF3ZODLpN)](https://codecov.io/gh/microsoft/PyMarlin) 4 | [![Docs](https://github.com/microsoft/PyMarlin/actions/workflows/deploy-website.yml/badge.svg)](https://microsoft.github.io/PyMarlin/) 5 | [![AzureML Canary](https://github.com/microsoft/PyMarlin/actions/workflows/canary.yml/badge.svg)](https://github.com/microsoft/PyMarlin/actions/workflows/canary.yml) 6 | [![pypi](https://img.shields.io/pypi/v/pymarlin)](https://pypi.org/project/pymarlin/) 7 | 8 | PyMarlin was developed with the goal of simplifying the E2E deep learning experimentation lifecycle for data scientists using PyTorch. The library enables an agile way to quickly prototype a new AI scenario on a dev box and seamlessly scale it to multi-node DDP GPU training with AzureML or other cloud services. 9 | 10 | ## Key features 11 | - Provides public and enterprise **data pre-processing** recipes with out-of-the-box vanilla and parallel processing. No additional code is required to run on AzureML or other environments. 12 | - Provides **scalable model training** with support for single-process, single-VM multi-GPU, multi-node Distributed Data Parallel, and mixed-precision (AMP, Apex) training. ORT and DeepSpeed based training are going to be available soon! 13 | - Provides out-of-the-box **Plugins** that can be used for all typical NLP tasks like Sequence Classification, Named Entity Recognition and Seq2Seq text generation. 14 | - Provides **reusable modules** for model checkpointing, stats collection, TensorBoard and compliant AML logging, which can be customized based on your scenario. 15 | - Provides a **custom arguments parser** that saves all the default values for a scenario's arguments in a YAML config file and merges user-provided arguments at runtime. 16 | - All core modules are thoroughly **linted**, **unit tested** and even run E2E (multi-node, GPU) in AzureML. 17 | - PyMarlin is minimal and has an easy-to-understand codebase. PyMarlin was designed to make it easy for others to understand the entire codebase and customize it according to their needs. 18 | 19 | ## Installation 20 | 21 | pip install pymarlin 22 | 23 | Read the [installation doc](https://microsoft.github.io/PyMarlin/docs/installation) for more information. 24 | 25 | ## Start exploring! 26 | 27 | ### Full documentation website 28 | Full website with [guides and SDK reference](https://microsoft.github.io/PyMarlin/). 29 | 30 | ### Train your first model with pymarlin 31 | Check out the [CIFAR image classification example](https://microsoft.github.io/PyMarlin/docs/examples/cifar). 32 | 33 | ### GLUE task benchmarking 34 | Explore how to use pymarlin to [benchmark your language models on GLUE tasks](https://microsoft.github.io/PyMarlin/docs/examples/glue-tasks). 
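### Minimal wiring (sketch)
The snippet below is a minimal sketch of the typical driver wiring, mirroring `examples/cnndailymail_text_summarization/train_ortds.py`. `MyModule` is a placeholder for your own `ModuleInterface` implementation (see `pymarlin/core/module_interface.py` and the examples); everything else follows the APIs used in this repo.

    from pymarlin.core import trainer
    from pymarlin.utils.config_parser.custom_arg_parser import CustomArgParser

    # merge YAML defaults with command-line overrides, e.g. --trainer.backend ddp
    config = CustomArgParser(yaml_file_arg_key="config_path").parse()

    module = MyModule(**config["module"])  # placeholder: your ModuleInterface subclass
    trainer_args = trainer.TrainerArguments(**config["trainer"])

    tr = trainer.Trainer(module=module, args=trainer_args)
    tr.train()     # runs the training loop on the configured backend
    tr.validate()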
35 | 36 | ## We want your feedback! 37 | Reach out to us with your [feedback and suggestions](https://github.com/microsoft/PyMarlin/issues). 38 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 40 | 41 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # TODO: The maintainer of this repo has not yet edited this file 2 | 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? 
4 | 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help. 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/spot](https://aka.ms/spot). CSS will work with/help you to determine next steps. More details also available at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). 7 | - **Not sure?** Fill out a SPOT intake as though the answer were "Yes". CSS will help you decide. 8 | 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.* 10 | 11 | # Support 12 | 13 | ## How to file issues and get help 14 | 15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 17 | feature request as a new Issue. 18 | 19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER 21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**. 22 | 23 | ## Microsoft Support Policy 24 | 25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 26 | -------------------------------------------------------------------------------- /VERSION_NOTES.md: -------------------------------------------------------------------------------- 1 | # Version Notes: 2 | ## 0.3.2 3 | * PyMarlin supports DP training via Opacus v1.0 4 | 5 | ## 0.3.1 6 | * Version parity for PyPI. 7 | 8 | ## 0.2.8 9 | * Incremented the dependency to torch<=1.9.1 10 | 11 | ## 0.2.7 12 | * Adding torch<=1.9 as a required dependency 13 | 14 | ## 0.2.6 15 | * Adding support for parsing multi-level args from commandline and params 16 | 17 | ## 0.2.5 18 | * Adding support for directories with config path (only one file in directory) 19 | 20 | ## 0.2.4 21 | * Fixed bug where DDP all-reduce was not working 22 | 23 | ## 0.2.3 24 | * Unbound azureml-core version 25 | 26 | ## 0.2.2 27 | * Plugins bug fix 28 | 29 | ## 0.2.0 30 | * Adding plugins: SeqClassification, NER, Seq2Seq 31 | * --params json input 32 | * DDP allreduce optimization 33 | 34 | ## 0.1.1 35 | * Tests & Lint Pipeline 36 | * Documentation Pipeline 37 | * PyPi Pipeline 38 | 39 | ## 0.1.0 40 | * Initial release 41 | * Trainer, TrainerBackend, ModuleInterface, DataProcessor/Interface, ConfigParser and more (see docs) 42 | -------------------------------------------------------------------------------- /examples/cifar_image_classification/.images/tensorboard_screenshot.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/examples/cifar_image_classification/.images/tensorboard_screenshot.jpg -------------------------------------------------------------------------------- /examples/cifar_image_classification/README.md: -------------------------------------------------------------------------------- 1 | The Jupyter notebook is a good way to quickly try out CIFAR using pymarlin. 2 | ## Run in [Colab](https://colab.research.google.com/github/microsoft/PyMarlin/blob/main/examples/cifar_image_classification/CIFAR.ipynb) 3 | 4 | To use other advanced features like distributed training, the YAML parser, TensorBoard, etc., use the Python file and follow the instructions below. 5 | 6 | Note: a machine with more than one GPU is recommended to try out all the features. 7 | 8 | # 1. 
Install PyMarlin, PyTorch, and requirements 9 | 10 | Follow the steps here: https://microsoft.github.io/PyMarlin/docs/installation 11 | 12 | pip install -r requirements.txt 13 | 14 | # 2. Run CIFAR 15 | 16 | ## Single process 17 | 18 | python cifar.py --config_path config.yaml 19 | 20 | ## Mixed Precision (Needs GPU) 21 | 22 | python cifar.py --config_path config.yaml --tr.backend sp-amp 23 | 24 | ## Multi process (Needs at least 2 GPUs) 25 | 26 | python -m torch.distributed.launch --nproc_per_node 2 cifar.py --config_path config.yaml --tr.backend ddp 27 | 28 | # Results 29 | 30 | Val accuracy at step 50000 = 61.14 31 | 32 | # Tensorboard 33 | 34 | tensorboard --logdir logs 35 | 36 | ![tensorboard](.images/tensorboard_screenshot.jpg) -------------------------------------------------------------------------------- /examples/cifar_image_classification/config.yaml: -------------------------------------------------------------------------------- 1 | tr: 2 | epochs: 2 3 | train_batch_size: 4 4 | val_batch_size: 16 5 | writers: ['tensorboard'] 6 | clip_grads: False 7 | log_level : 'INFO' 8 | backend : sp 9 | max_train_steps_per_epoch : null 10 | max_val_steps_per_epoch : null 11 | -------------------------------------------------------------------------------- /examples/cifar_image_classification/requirements.txt: -------------------------------------------------------------------------------- 1 | torchvision 2 | matplotlib -------------------------------------------------------------------------------- /examples/cnndailymail_text_summarization/ORT_README.md: -------------------------------------------------------------------------------- 1 | # Optimizing with ORT + DeepSpeed 2 | We have extended this example to be optimized with ORT+DeepSpeed! Starting from this scenario we will try to build a common backend for both ORT and DeepSpeed. 3 | 4 | ## Speed improvements (batches/second) 5 | setup: bart-base, 4xV100 (16GB), batch size 32 6 | 7 | ### configs and speed 8 | * base pytorch , OOM 9 | * ort , 1.46-1.48 batch/s 10 | * deepspeed , 1.69-1.70 batch/s 11 | * ort+deepspeed , 1.71-1.72 batch/s 12 | * deepspeed fp16 zero stage 1 , 3.36-3.41 batch/s 13 | * ort+deepspeed fp16 zero stage 1 , 3.47-3.55 batch/s 14 | 15 | ## Noteworthy files 16 | * [deepspeed_methods](deepspeed_methods): deepspeed utility methods and trainer / trainer backends. 17 | * [model_ortds.py](model_ortds.py): module interface with config checks to enable ort+deepspeed 18 | * [train_ortds.py](train_ortds.py): main train script that imports the above 19 | * [azureml/submit_ortds.py](azureml/submit_ortds.py): azureml submit script 20 | 21 | ## Submitting 22 | 1. Install azureml-sdk and create an AzureML workspace; there are great [instructions on both here](https://azure.github.io/azureml-cheatsheets/docs/cheatsheets/python/v1/installation). 23 | 2. Write out the config.json for the workspace with [write_config()](https://azure.github.io/azureml-cheatsheets/docs/cheatsheets/python/v1/workspace#helpful-methods) 24 | 3. Create a gpu cluster in the workspace; for more info go [here](https://azure.github.io/azureml-cheatsheets/docs/cheatsheets/python/v1/compute-targets#creating-compute-targets) 25 | 4. Adjust the values in submit_ortds.py to point to your new gpu cluster. 26 | 5. Upload the preprocessed CNN/DailyMail data from the original README by uncommenting line 48 and pointing it to the local path. 27 | 6. 
From examples/summarization/aml, Submit job with `python submit_ortds.py` -------------------------------------------------------------------------------- /examples/cnndailymail_text_summarization/azureml/submit_ortds.py: -------------------------------------------------------------------------------- 1 | from azureml.core import Experiment, Workspace, ScriptRunConfig 2 | from azureml.core.compute import AmlCompute 3 | from azureml.core.runconfig import MpiConfiguration 4 | 5 | # put your AML workspace config.json in this directory! 6 | ws = Workspace.from_config() 7 | ws_details = ws.get_details() 8 | ds = ws.get_default_datastore() 9 | 10 | gpu_compute_target = AmlCompute(workspace=ws, name='sriovdedicated1') 11 | print(gpu_compute_target.status.serialize()) 12 | 13 | from azureml.core import Dataset 14 | from azureml.data import OutputFileDatasetConfig 15 | 16 | 17 | # create input/output datasets 18 | def get_input_dataset(datastore, path_on_datastore, dataset_name): 19 | dataset = Dataset.File.from_files(path=[(datastore, path_on_datastore)]) 20 | return dataset.as_named_input(dataset_name).as_download() 21 | 22 | def get_output_dataset(datastore, path_on_datastore, dataset_name): 23 | return OutputFileDatasetConfig(destination=(datastore, path_on_datastore), name=dataset_name).as_mount() 24 | 25 | def get_args(outputSuffix="deepspeed_ort_amp_nopadding_v100_8"): 26 | all_params_default = [ 27 | '--data_path', get_input_dataset(ds, f'datasets/cnn_dm/preprocessed/bart/', "data_path"), 28 | '--config_path', 'config-ortds.yaml', 29 | ] 30 | 31 | return all_params_default 32 | 33 | from azureml.core import Environment 34 | 35 | # Creates the environment inside a Docker container. 36 | pytorch_env = Environment(name='myEnv') 37 | pytorch_env.docker.enabled = True 38 | # docker file in this directory built for your convenience 39 | pytorch_env.docker.base_image = "pymarlin/base-gpu:cuda11.1.cudnn8.ds.ort" 40 | pytorch_env.python.user_managed_dependencies = True 41 | pytorch_env.python.interpreter_path = '/opt/miniconda/bin/python' 42 | 43 | mpi = MpiConfiguration() 44 | #NCv3_24rs - 4 16GB V100 GPU's per node 45 | mpi.process_count_per_node = 4 46 | mpi.node_count = 2 47 | 48 | # ds.upload_files(['local path to preprocessed data'], 'datasets/cnn_dm/preprocessed/bart') 49 | 50 | script = "train_ortds.py" 51 | codepath = '..' 52 | 53 | config = ScriptRunConfig(source_directory=codepath, 54 | script=script, 55 | arguments=get_args(), 56 | compute_target=gpu_compute_target, 57 | environment=pytorch_env, 58 | distributed_job_config=mpi) 59 | 60 | experiment_name = 'pymarlin_summarization_bart_ortds' 61 | experiment = Experiment(ws, name=experiment_name) 62 | 63 | run = experiment.submit(config) 64 | 65 | run.tag('nodes', f'{mpi.node_count}') 66 | run.tag('process_count_per_node', f'{mpi.process_count_per_node}') 67 | run.tag('notes', '2 node with ort+ds') 68 | 69 | print("Submitted run") 70 | print(f"\n{run.get_portal_url()}") 71 | -------------------------------------------------------------------------------- /examples/cnndailymail_text_summarization/config-ortds.yaml: -------------------------------------------------------------------------------- 1 | data_path: 'D:/data/cnn_cln' 2 | dist: true 3 | ort: true 4 | ortds: true 5 | 6 | trainer: 7 | max_train_steps_per_epoch : null # Maximum train steps per epoch. 8 | max_val_steps_per_epoch : null # Maximum validation steps per epoch. 9 | train_batch_size: 32 # Training global batch size. 10 | val_batch_size: 32 # Validation batch size per GPU. 
11 | epochs: 3 # Total epochs to run. 12 | gpu_batch_size_limit : 4 # Max limit for GPU batch size during training. 13 | disable_tqdm : True 14 | writers: ["stdout", "aml", "tensorboard"] 15 | 16 | module: 17 | max_length_encoder : 1024 18 | max_length_decoder : 128 19 | deepspeed_config: 'deepspeed_methods/deepspeedConfig.json' 20 | deepspeed_transformer_kernel: true 21 | deepspeed_ckpt_tag: "deepspeed_ckpt" # optional, let deepspeed load specific checkpoint, unnecessary if save_latest is true (default) when checkpointing with deepspeed 22 | 23 | wrt: 24 | tb_log_dir : 'outputs/tb_logs' 25 | 26 | 27 | stat: 28 | log_steps : 20 29 | 30 | chkp: 31 | checkpoint : True 32 | delete_existing_checkpoints: False 33 | save_dir: 'outputs/chkpt' # aml output path. does not require mounting 34 | model_state_save_dir: 'outputs/model' 35 | load_dir: null 36 | load_filename: null 37 | 38 | # add more from BartForConditionalGeneration.generate? 39 | generate: 40 | max_length: 128 41 | do_sample : False 42 | num_beams : 5 43 | # support everything in a yaml. ignore (print warning) everything that's not present. 44 | # Do not add the requirement to define anything in the parser other than yamls 45 | -------------------------------------------------------------------------------- /examples/cnndailymail_text_summarization/config-prod.yaml: -------------------------------------------------------------------------------- 1 | data_path: 'D:/data/cnn_cln' 2 | 3 | trainer: 4 | max_train_steps_per_epoch : null # Maximum train steps per epoch. 5 | max_val_steps_per_epoch : null # Maximum validation steps per epoch. 6 | train_batch_size: 32 # Training global batch size. 7 | val_batch_size: 32 # Validation batch size per GPU. 8 | epochs: 3 # Total epochs to run. 9 | gpu_batch_size_limit : 4 # Max limit for GPU batch size during training. 10 | disable_tqdm : False 11 | writers: ["stdout", "aml", "tensorboard"] 12 | backend: 'ddp-amp' 13 | module: 14 | max_length_encoder : 1024 15 | max_length_decoder : 128 16 | wrt: 17 | tb_log_dir : 'logs' 18 | stat: 19 | log_steps : 50 20 | chkp: 21 | checkpoint : True 22 | delete_existing_checkpoints: False 23 | save_dir: 'outputs' #aml output path. does not require mounting 24 | load_dir: null 25 | load_filename: null 26 | 27 | # add more from BartForConditionalGeneration.generate? 28 | generate: 29 | max_length: 128 30 | do_sample : False 31 | num_beams : 5 32 | # support everything in a yaml. ignore (print warning) everything that's not present. 33 | # Do not add the requirement to define anything in the parser other than yamls 34 | -------------------------------------------------------------------------------- /examples/cnndailymail_text_summarization/config.yaml: -------------------------------------------------------------------------------- 1 | data_path: 'D:/data/cnn_cln' 2 | 3 | trainer: 4 | max_train_steps_per_epoch : 2 # Maximum train steps per epoch. 5 | max_val_steps_per_epoch : 2 # Maximum validation steps per epoch. 6 | train_batch_size: 1 # Training global batch size. 7 | val_batch_size: 1 # Validation global batch size. 8 | epochs: 3 # Total epochs to run. 9 | gpu_batch_size_limit : 16 # Max limit for GPU batch size during training. 
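# note (assumption about pymarlin's batching, not a setting in this file): the trainer is expected to derive gradient accumulation from train_batch_size / (gpu_batch_size_limit * world_size); with train_batch_size=1 no accumulation happens here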
10 | disable_tqdm : False 11 | writers: ["aml", "tensorboard"] 12 | backend: "sp" 13 | 14 | module: 15 | max_length_encoder : 128 16 | max_length_decoder : 128 17 | 18 | wrt: 19 | tb_log_dir : 'logs' 20 | 21 | stat: 22 | log_steps : 1 23 | chkp: 24 | checkpoint : False 25 | delete_existing_checkpoints: True 26 | save_dir: 'checkpoints' 27 | load_dir: null 28 | 29 | generate: 30 | max_length: 128 31 | do_sample : False 32 | num_beams : 1 -------------------------------------------------------------------------------- /examples/cnndailymail_text_summarization/data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import torch 3 | import pymarlin 4 | from pymarlin.core import data_interface 5 | import matplotlib 6 | matplotlib.use('Agg') # disable this in local machine to see plots 7 | import matplotlib.pyplot as plt 8 | import sys 9 | 10 | def get_source_target(root = 'D:/data/cnn_cln', stage = 'val'): 11 | source = f'{root}/{stage}.source' 12 | target = f'{root}/{stage}.target' 13 | return source, target 14 | 15 | class AnalyzeProcessor(data_interface.DataProcessor): 16 | def __init__(self, source, target): 17 | with open(source, 'r', encoding = 'UTF-8') as f: 18 | self.source = f.readlines() 19 | with open(target, 'r', encoding = 'UTF-8') as f: 20 | self.target = f.readlines() 21 | def process(self): 22 | pass 23 | def analyze(self): 24 | self.df = pd.DataFrame({'source':self.source, 'target': self.target}) 25 | print(self.df.head()) 26 | print('\nWord length analysis:') 27 | wordlengths = self.df.applymap(lambda x : len(x.split())) 28 | print(wordlengths.describe()) 29 | plt.plot(wordlengths) 30 | plt.legend(['source','target']) 31 | 32 | class SummarizationDataset(torch.utils.data.Dataset): 33 | def __init__(self, source, target): 34 | with open(source, 'r', encoding = 'UTF-8') as f: 35 | self.source = f.readlines() 36 | with open(target, 'r', encoding = 'UTF-8') as f: 37 | self.target = f.readlines() 38 | print('len(self.source), len(self.target) = ',len(self.source), len(self.target)) 39 | def __getitem__(self, i): 40 | # print('len(self.source), len(self.target) = ',len(self.source), len(self.target)) 41 | return self.source[i].strip(), self.target[i].strip() 42 | def __len__(self): 43 | return len(self.target) 44 | 45 | class SummarizationData(pymarlin.core.data_interface.DataInterface): 46 | ''' 47 | Class which expects input data to have different files for source and target. 48 | Returns dataset which returns non tokenized source and target text. 
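    Example (illustrative paths):
        data = SummarizationData(root='path/to/cnn_cln')
        train_ds = data.get_train_dataset()
        source_text, target_text = train_ds[0]  # raw, untokenized strings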
49 | ''' 50 | def __init__(self, root='D:/data/cnn_cln'): 51 | self.root = root 52 | self.train_ds = SummarizationDataset(*get_source_target(root, 'train')) 53 | self.val_ds = SummarizationDataset(*get_source_target(root, 'val')) 54 | print('self.train_ds length = ', len(self.train_ds)) 55 | 56 | def get_train_dataset(self, *args, **kwargs): 57 | return self.train_ds 58 | def get_val_dataset(self, *args, **kwargs): 59 | return self.val_ds 60 | def get_test_dataset(self, *args, **kwargs): 61 | pass 62 | 63 | if __name__ == '__main__': 64 | root = sys.argv[1] #'D:/data/cnn_cln' 65 | print(root) 66 | print('\n**** Analyzing Train ***') 67 | dp = AnalyzeProcessor(*get_source_target(root = root, stage='train')) 68 | dp.process_data() 69 | print('\n**** Analyzing Val ***') 70 | dp = AnalyzeProcessor(*get_source_target(root = root, stage='val')) 71 | dp.process_data() 72 | plt.show() -------------------------------------------------------------------------------- /examples/cnndailymail_text_summarization/deepspeed_methods/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/examples/cnndailymail_text_summarization/deepspeed_methods/__init__.py -------------------------------------------------------------------------------- /examples/cnndailymail_text_summarization/deepspeed_methods/deepspeedConfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 32, 3 | "gradient_accumulation_steps": 1, 4 | "optimizer": { 5 | "type": "Adam", 6 | "params": { 7 | "lr": 1.875e-4, 8 | "betas": [ 9 | 0.9, 10 | 0.98 11 | ], 12 | "eps": 1e-8, 13 | "weight_decay": 3e-7 14 | } 15 | }, 16 | "zero_allow_untested_optimizer": true, 17 | "scheduler": { 18 | "type": "OneCycle", 19 | "params": { 20 | "cycle_first_step_size": 256115, 21 | "cycle_first_stair_count": 10000, 22 | "cycle_second_step_size": 256115, 23 | "cycle_second_stair_count": 10000, 24 | "decay_step_size": 1000, 25 | "cycle_min_lr": 1.875e-5, 26 | "cycle_max_lr": 1.875e-4, 27 | "decay_lr_rate": 0.001, 28 | "cycle_min_mom": 0.85, 29 | "cycle_max_mom": 0.99, 30 | "decay_mom_rate": 0.0 31 | } 32 | }, 33 | "fp16": { 34 | "enabled": true 35 | }, 36 | "zero_optimization": { 37 | "stage": 1, 38 | "allgather_partitions": true, 39 | "allgather_bucket_size": 5e8, 40 | "overlap_comm": false, 41 | "reduce_scatter": true, 42 | "reduce_bucket_size": 5e8, 43 | "contiguous_gradients": false, 44 | "cpu_offload": false 45 | } 46 | } -------------------------------------------------------------------------------- /examples/cnndailymail_text_summarization/deepspeed_methods/deepspeed_trainer.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pymarlin import Trainer 4 | from pymarlin.utils.checkpointer.checkpoint_utils import Checkpoint 5 | 6 | 7 | class DeepSpeedTrainer(Trainer): 8 | 9 | def save_checkpoint(self, force=False) -> None: 10 | # deepspeed will require all processes to call save_checkpoint method 11 | ckpt_id = str(self.trainer_backend.get_state()["global_step_completed"]) 12 | self.module.model.save_checkpoint(os.path.join(self.args.checkpointer_args.save_dir, self.module.DEEPSPEED_CKPT_PREFIX), ckpt_id) 13 | 14 | if self.is_main_process: # only main process should checkpoint 15 | checkpoint_state = Checkpoint( 16 | module_interface_state=self.module.get_state(), 17 | trainer_state=self.get_state(), 18 | 
trainer_backend_state=self.trainer_backend.get_state() 19 | ) 20 | self.checkpointer.save(checkpoint_state, self.last_epoch, force) 21 | 22 | def save_model_checkpoint(self) -> None: 23 | if self.args.checkpointer_args.checkpoint and (self.args.checkpointer_args.model_state_save_dir is not None): 24 | ckpt_id = str(self.trainer_backend.get_state()["global_step_completed"]) 25 | self.module.model.save_checkpoint(os.path.join(self.args.checkpointer_args.model_state_save_dir, self.module.DEEPSPEED_CKPT_PREFIX), ckpt_id) 26 | -------------------------------------------------------------------------------- /examples/cnndailymail_text_summarization/deepspeed_methods/deepspeed_trainer_backend.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import torch 3 | from typing import Iterable, List, Optional, Union 4 | 5 | from pymarlin import SingleProcess 6 | from pymarlin.core import module_interface 7 | from pymarlin.core.trainer_backend import TrainerBackendArguments, OutputCollector, DDPTrainerBackend 8 | 9 | 10 | class DeepSpeedTrainerBackend(SingleProcess): 11 | 12 | def init(self, args: TrainerBackendArguments): 13 | self.args = args 14 | self.model = self.args.model 15 | if not self.distributed: 16 | assert self.args.distributed_training_args.world_size == 1 \ 17 | , 'World size > 1 . Decorate with DDPTrainerBackend' 18 | 19 | # ensure gradient_accumulation will be equal to the one set in deepspeed config json 20 | if self.args.gradient_accumulation != self.model.model.gradient_accumulation_steps(): 21 | print(f"Warning, self.args.gradient_accumulation ({self.args.gradient_accumulation}) is not equal to gradient_accumulation_steps inside deepspeedConfig.json, adjusting") 22 | print(f"Warning, setting self.args.gradient_accumulation to {self.model.model.gradient_accumulation_steps()}") 23 | self.args.gradient_accumulation = self.model.model.gradient_accumulation_steps() 24 | 25 | def train_dl(self, dataloader, callback: module_interface.CallbackInterface): 26 | 27 | epoch_collector = OutputCollector() 28 | global_step_collector = OutputCollector() 29 | self.global_step_this_epoch = 0 30 | # we could pass specific fields as arguments instead of the entire train module, 31 | # but that might hinder inheritance since different trainer_backends need different things from the train module 32 | with tqdm(dataloader, unit="batch", disable=self.args.disable_tqdm) as tbatch: 33 | for i, batch in enumerate(tbatch): 34 | if ( 35 | self.args.max_train_steps_per_epoch 36 | and self.global_step_this_epoch 37 | >= self.args.max_train_steps_per_epoch 38 | ): 39 | break 40 | 41 | tbatch.set_description(f"Global Batch: {self.global_step_completed + 1} ") 42 | # forward 43 | outputs = self.model.forward( 44 | stage=module_interface.Stage.TRAIN, 45 | batch=batch, 46 | device=self.args.device, 47 | global_step=self.global_step_completed + 1, 48 | ) 49 | # wrap a bare tensor in a list so outputs are always iterable 50 | outputs = [outputs] if type(outputs) == torch.Tensor else outputs 51 | 52 | loss = outputs[0] 53 | 54 | # backward. 
This will keep on accumulating gradients 55 | self.model.model.backward(loss) 56 | # the deepspeed model engine must be stepped each micro step 57 | self.model.model.step() 58 | callback.on_end_backward(self.global_step_completed, loss) 59 | 60 | # collect 61 | epoch_collector.collect(outputs) 62 | global_step_collector.collect(outputs) 63 | 64 | unscaled_loss = outputs[0].item() 65 | tbatch.set_postfix( 66 | loss=unscaled_loss 67 | ) # move progress bar to logger later 68 | 69 | self.batches_completed += 1 70 | 71 | if self.batches_completed % self.args.gradient_accumulation == 0: 72 | # write global step mean loss to stats 73 | self.process_global_step(global_step_collector, callback) 74 | 75 | return epoch_collector.all_outputs 76 | 77 | def process_global_step(self, global_step_collector, callback): 78 | """Aggregate the global-step mean loss and fire callbacks (the DeepSpeed engine handles optimizer and scheduler stepping) 79 | """ 80 | global_step_outputs = global_step_collector.all_outputs 81 | global_step_mean_loss = ( 82 | global_step_outputs[0].mean().item() 83 | ) 84 | global_step_collector.reset() 85 | self.stats.update("loss", global_step_mean_loss, frequent=True) 86 | 87 | self.global_step_completed += 1 88 | self.global_step_this_epoch += 1 89 | 90 | callback.on_end_train_step(self.global_step_completed, *global_step_outputs) 91 | self.stats.log_stats(self.global_step_completed) 92 | 93 | 94 | class DeepSpeedDistributedTrainerBackend(DDPTrainerBackend): 95 | 96 | def init(self, args: TrainerBackendArguments): 97 | # unpack trainer_backend arguments 98 | self.args = args 99 | self.distributed_training_args = args.distributed_training_args 100 | 101 | self.trainer_backend.init(args) 102 | -------------------------------------------------------------------------------- /examples/cnndailymail_text_summarization/deepspeed_methods/deepspeed_utils.py: -------------------------------------------------------------------------------- 1 | import deepspeed as dp 2 | from typing import Optional, Any, Dict 3 | 4 | 5 | def prepare_optimizer_parameters(deepspeed_transformer_kernel, model): 6 | param_optimizer = list(model.named_parameters()) 7 | param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] 8 | no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] 9 | if deepspeed_transformer_kernel: 10 | no_decay = no_decay + ['attn_nw', 'attn_nb', 'norm_w', 'norm_b', 11 | 'attn_qkvb', 'attn_ob', 'inter_b', 'output_b'] 12 | weight_decay = 0.01 13 | 14 | optimizer_grouped_parameters = [{ 15 | 'params': 16 | [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 17 | 'weight_decay': 18 | weight_decay 19 | }, { 20 | 'params': 21 | [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 22 | 'weight_decay': 23 | 0.0 24 | }] 25 | 26 | return optimizer_grouped_parameters 27 | 28 | 29 | def initialize_deepspeed(model, config, deepspeed_transformer_kernel): 30 | print("SystemLog: Initializing DeepSpeed") 31 | print("SystemLog: DeepSpeed parameters: deepspeed_config=%s" % (config)) 32 | 33 | optimizer_grouped_parameters = prepare_optimizer_parameters(deepspeed_transformer_kernel, model) 34 | 35 | # DeepSpeed initializer handles FP16, distributed, optimizer automatically. 
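# dp.initialize returns a 4-tuple (engine, optimizer, training dataloader, lr scheduler); only the first two are kept here, since the OneCycle scheduler from deepspeedConfig.json is stepped internally by the engine.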
36 | model_deepspeed, optimizer_deepspeed, _, _ = dp.initialize( 37 | config=config, 38 | model=model, 39 | model_parameters=optimizer_grouped_parameters) 40 | 41 | return model_deepspeed, optimizer_deepspeed 42 | 43 | 44 | def get_core_model(model, deepspeed_flag=False, ort_flag=False): 45 | module = model 46 | if deepspeed_flag: 47 | module = module.module 48 | if ort_flag: 49 | module = module._original_module 50 | 51 | return module 52 | -------------------------------------------------------------------------------- /examples/cnndailymail_text_summarization/images/tensorboard_screenshot_bart.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/examples/cnndailymail_text_summarization/images/tensorboard_screenshot_bart.jpg -------------------------------------------------------------------------------- /examples/cnndailymail_text_summarization/infer.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Inferencing after training completion 3 | ''' 4 | import torch 5 | import os 6 | import argparse 7 | 8 | from transformers import BartForConditionalGeneration, BartTokenizerFast 9 | class Summarizer(torch.nn.Module): 10 | def __init__(self, model_path = 'outputs', model_file='', isCheckpoint = True, load_weights = True): 11 | super().__init__() 12 | self.fullpath = os.path.join(model_path, model_file) 13 | self.model = BartForConditionalGeneration.from_pretrained("facebook/bart-base") 14 | self.tokenizer = BartTokenizerFast.from_pretrained("facebook/bart-base") 15 | self.device = "cuda" if torch.cuda.is_available() else "cpu" 16 | #load model weights 17 | if load_weights: 18 | self._load_weights(isCheckpoint) 19 | 20 | 21 | def _load_weights(self, isCheckpoint = True): 22 | state_dict = torch.load(self.fullpath) 23 | if isCheckpoint: 24 | state_dict = state_dict['module_interface'] 25 | self.load_state_dict(state_dict) 26 | else: 27 | self.model.load_state_dict(state_dict) 28 | self.model.to(self.device) 29 | 30 | def summarize(self, text): 31 | batch = self.tokenizer(text, return_tensors='pt').to(self.device) 32 | generated_ids = self.model.generate(batch['input_ids']) 33 | return self.tokenizer.batch_decode(generated_ids)[0] 34 | 35 | if __name__ == '__main__': 36 | parser = argparse.ArgumentParser() 37 | parser.add_argument( 38 | "--model_path", type=str, default=r"checkpoints", help="Path to model") 39 | parser.add_argument("--model_file", type=str, default = "model_0.tar") 40 | 41 | args = parser.parse_args() 42 | summ = Summarizer(model_path = args.model_path, model_file = args.model_file ) 43 | text = "Home Secretary Priti Patel warns people trying to leave UK will be turned back at airports and lashes influencers 'working' in the sun as she unveils quarantine rules for Brits returning from 30 high-risk countries" 44 | summary = summ.summarize(text) 45 | print(text) 46 | print('Summary:', summary) -------------------------------------------------------------------------------- /examples/cnndailymail_text_summarization/model_ortds.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict 2 | import os 3 | 4 | # too long import 5 | from pymarlin.utils.stats import global_stats 6 | from pymarlin.utils.logger import getlogger 7 | 8 | from onnxruntime.training.ortmodule import ORTModule 9 | 10 | from filelock import FileLock 11 | 12 | from deepspeed_methods 
import deepspeed_utils 13 | from train import SummarizationBartModule 14 | 15 | logger = getlogger(__file__) 16 | 17 | class SummarizationBartModuleORT(SummarizationBartModule): 18 | def __init__( 19 | self, 20 | *args, 21 | **kwargs 22 | ): 23 | super().__init__(*args, **kwargs) 24 | 25 | #setting this here to avoid issues after wrapping 26 | self._pad_token_id = self.model.config.pad_token_id 27 | 28 | logger.info("Employing ORT, wrapping model with ORTModule") 29 | self.model = ORTModule(self.model) 30 | 31 | def get_core_model(self): 32 | return deepspeed_utils.get_core_model(self.model, ort_flag=True) 33 | 34 | @property 35 | def pad_token_id(self): 36 | return self._pad_token_id 37 | 38 | class SummarizationBartModuleORTDeepSpeed(SummarizationBartModuleORT): 39 | def __init__( 40 | self, 41 | *args, 42 | deepspeed_config='', 43 | deepspeed_transformer_kernel=False, 44 | deepspeed_ckpt_tag=None, 45 | deepspeed_resume_from_checkpoint=None, 46 | **kwargs 47 | ): 48 | super().__init__(*args, **kwargs) 49 | logger.info(f"Employing Deepspeed, wrapping model with Deepspeed") 50 | self.model, _ = deepspeed_utils.initialize_deepspeed(self.model, deepspeed_config, deepspeed_transformer_kernel) 51 | self.deepspeed_resume_from_checkpoint = deepspeed_resume_from_checkpoint 52 | self.deepspeed_ckpt_tag = deepspeed_ckpt_tag 53 | self.DEEPSPEED_CKPT_PREFIX = "deepspeed_ckpt" 54 | 55 | def get_optimizers_schedulers( 56 | self, estimated_global_steps_per_epoch: int, epochs: int 57 | ): 58 | print(f"Deepspeed is employed, optimizer and scheduler are defined in deepspeedConfig.json file") 59 | return [], [] 60 | 61 | def get_core_model(self): 62 | return deepspeed_utils.get_core_model(self.model, ort_flag=True, deepspeed_flag=True) 63 | 64 | def train_step(self, global_step: int, batch, device): 65 | batch = batch.to(device) 66 | result = self.model(**batch) 67 | global_stats.update("lr", self.model.get_lr()[0], frequent=True) 68 | loss = result["loss"] 69 | 70 | return loss 71 | 72 | def get_state(self) -> Dict: 73 | return None 74 | 75 | def update_state(self, state: Dict): 76 | if self.deepspeed_resume_from_checkpoint is not None: 77 | 78 | import glob 79 | loading_path = os.path.join(self.deepspeed_resume_from_checkpoint, self.DEEPSPEED_CKPT_PREFIX) 80 | deepspeed_checkpoint_dirs = sorted(glob.glob(f"{loading_path}/*")) 81 | 82 | if len(deepspeed_checkpoint_dirs) > 0: 83 | logger.info(f"Attempting to resume from {loading_path}") 84 | # this magically updates self.optimizer and self.lr_scheduler 85 | load_path, _ = self.model.load_checkpoint( 86 | loading_path, 87 | load_optimizer_states=True, 88 | load_lr_scheduler_states=True, 89 | tag=self.deepspeed_ckpt_tag, 90 | ) 91 | if load_path is None: 92 | raise ValueError(f"[deepspeed] failed to resume from checkpoint {self.deepspeed_resume_from_checkpoint}") 93 | else: 94 | logger.error(f"{loading_path} doesn't have deepspeed checkpoints, doing nothing") 95 | -------------------------------------------------------------------------------- /examples/cnndailymail_text_summarization/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | matplotlib 3 | rouge-score 4 | nltk -------------------------------------------------------------------------------- /examples/cnndailymail_text_summarization/train_ortds.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pymarlin.core import trainer 3 | 4 | # too long import 5 | from 
pymarlin.core.trainer_backend import build_trainer_backend 6 | from pymarlin.utils.config_parser.custom_arg_parser import CustomArgParser 7 | 8 | from filelock import FileLock 9 | 10 | # DeepSpeed + ORT 11 | from deepspeed_methods.deepspeed_trainer import DeepSpeedTrainer 12 | from deepspeed_methods.deepspeed_trainer_backend import DeepSpeedTrainerBackend, DeepSpeedDistributedTrainerBackend 13 | from onnxruntime.training.ortmodule import ORTModule 14 | 15 | from data import SummarizationData 16 | from model_ortds import SummarizationBartModuleORT,SummarizationBartModuleORTDeepSpeed, SummarizationBartModule 17 | 18 | if __name__ == '__main__': 19 | config = CustomArgParser(yaml_file_arg_key="config_path", default_yamlfile="config-ortds.yaml").parse() 20 | 21 | print(f"config: {config}") 22 | 23 | data = SummarizationData(root=config["data_path"]) 24 | 25 | if config['ortds']: 26 | module_class = SummarizationBartModuleORTDeepSpeed 27 | elif config['ort']: 28 | module_class = SummarizationBartModuleORT 29 | else: 30 | module_class = SummarizationBartModule 31 | 32 | module = module_class(data, **config["module"], generate_kwargs=config["generate"]) 33 | 34 | trainer_args = trainer.TrainerArguments( 35 | **config["trainer"], 36 | stats_args=trainer.stats.StatInitArguments(**config["stat"]), 37 | writer_args=trainer.WriterInitArguments(**config["wrt"]), 38 | checkpointer_args=trainer.DefaultCheckpointerArguments(**config["chkp"]) 39 | ) 40 | 41 | if config['ortds']: 42 | module.deepspeed_resume_from_checkpoint = config["chkp"]["load_dir"] 43 | tr = DeepSpeedDistributedTrainerBackend(DeepSpeedTrainerBackend()) if config["dist"] else DeepSpeedTrainerBackend() 44 | trainer = DeepSpeedTrainer(trainer_backend=tr, module=module, args=trainer_args) 45 | else: 46 | trainer = trainer.Trainer(module=module, args=trainer_args) 47 | 48 | trainer.train() 49 | trainer.validate() 50 | -------------------------------------------------------------------------------- /examples/covid19_text_classification/azureml/dockerfile: -------------------------------------------------------------------------------- 1 | FROM mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.0.3-cudnn8-ubuntu18.04 2 | 3 | ############################################################################## 4 | # Custom Conda environment 5 | ############################################################################## 6 | 7 | ENV CONDAPATH /opt/miniconda/envs/pymarlin 8 | RUN conda create -p $CONDAPATH python=3.8 pip=20.2.4 9 | ENV PATH $CONDAPATH/bin:$PATH 10 | 11 | ############################################################################## 12 | # PyTorch 13 | ############################################################################## 14 | 15 | RUN pip install --no-cache-dir torch==1.8.1+cu111 torchvision==0.9.1+cu111 torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html 16 | 17 | ################################################################################################ 18 | # pymarlin[plugins] should have everything needed to run classification 19 | ################################################################################################ 20 | 21 | RUN pip install --no-cache-dir --use-feature=2020-resolver pymarlin[plugins] 22 | -------------------------------------------------------------------------------- /examples/covid19_text_classification/azureml/submit.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from azureml.core import Workspace, Dataset, 
Experiment, ScriptRunConfig, Environment
4 | from azureml.core.runconfig import PyTorchConfiguration, MpiConfiguration
5 | 
6 | def prepare_env_cmd():
7 |     """Prepare the environment and submission command for the classification example."""
8 |     env = Environment("pymarlin_requirements")
9 |     env.docker.enabled = True
10 |     env.docker.base_image = None
11 |     env.docker.base_dockerfile = 'dockerfile'
12 |     env.python.user_managed_dependencies = True
13 |     env.python.interpreter_path = "/opt/miniconda/bin/python"
14 |     env.register(ws)
15 | 
16 |     ds = ws.get_default_datastore()
17 |     # preprocessed data needs to be placed into datastore
18 |     # ds.upload_files([r'data\covid-19-nlp-text-classification\preprocessed\bert'], 'datasets/covid19_classification/preprocessed/bert/')
19 |     dataset = Dataset.File.from_files((ds, 'datasets/covid19_classification/preprocessed/bert/')).as_download()
20 | 
21 |     cmd = f'''python train.py --trainer.backend {args.backend} '''.split()
22 |     cmd.extend(['--data.preprocessed_dir', dataset])
23 | 
24 |     return env, cmd
25 | 
26 | if __name__ == "__main__":
27 |     parser = argparse.ArgumentParser()
28 |     parser.add_argument("--target_name", '-t', default="sriovdedicated1")
29 |     parser.add_argument("--node_count", "-n", type=int, default=1)
30 |     parser.add_argument("--process_count", "-p", type=int, default=1)
31 |     parser.add_argument("--experiment_name", '-e', type=str, default="marlin-tests")
32 |     parser.add_argument("--distributed_config", "-d", type=str, choices=["mpi", "pytorch"], default="pytorch")
33 |     parser.add_argument("--backend", "-b", choices=["sp", "ddp-amp"], default="sp")
34 |     parser.add_argument("--subscription_id", '-s', help='azure subscription id', required=True)
35 |     parser.add_argument("--resource_group", '-rg', help='azure resource group', required=True)
36 |     parser.add_argument("--workspace_name", '-ws', help='azure machine learning workspace', required=True)
37 |     parser.add_argument("--wait", "-w", action="store_true", help="Throw an error if the Azure ML job fails.")
38 |     args = parser.parse_args()
39 | 
40 |     ws = Workspace(args.subscription_id, args.resource_group, args.workspace_name)
41 | 
42 |     target = ws.compute_targets[args.target_name]
43 | 
44 |     if args.distributed_config == "pytorch":
45 |         distributed_job_config = PyTorchConfiguration(
46 |             process_count=args.process_count, node_count=args.node_count
47 |         )
48 |     elif args.distributed_config == "mpi":
49 |         distributed_job_config = MpiConfiguration(
50 |             process_count_per_node=args.process_count, node_count=args.node_count
51 |         )
52 |     else:
53 |         raise ValueError(f"Didn't recognize the distributed config {args.distributed_config}. Select one of 'mpi' or 'pytorch'.")
54 | 
55 |     env, cmd = prepare_env_cmd()
56 | 
57 |     src = ScriptRunConfig(
58 |         source_directory='..',
59 |         command=cmd,
60 |         compute_target=target,
61 |         distributed_job_config=distributed_job_config,
62 |         environment=env,
63 |     )
64 | 
65 |     print("Submitting experiment...")
66 |     run = Experiment(ws, args.experiment_name).submit(src)
67 | 
68 |     print(f"{run.get_portal_url()}")
69 | 
70 |     if args.wait:
71 |         print("Waiting for run completion...")
72 |         run.wait_for_completion(show_output=True, raise_on_error=True)
73 | 
-------------------------------------------------------------------------------- /examples/covid19_text_classification/config.yaml: --------------------------------------------------------------------------------
1 | # Group names below are used to parse these arguments when passed from the command line.
2 | # Example usage in command-line: --trainer.epochs 10
3 | 
4 | # trainer arguments
5 | trainer:
6 |   max_train_steps_per_epoch : 20 # Maximum train steps per epoch.
7 |   max_val_steps_per_epoch : 5 # Maximum validation steps per epoch.
8 |   train_batch_size: 8 # Training global batch size.
9 |   val_batch_size: 4 # Validation global batch size.
10 |   epochs: 1 # Total epochs to run.
11 |   gpu_batch_size_limit : 8 # Max limit for GPU batch size during training.
12 |   clip_grads : False # Enable or disable clipping of gradients.
13 |   use_gpu: True # Enable or disable use of GPU.
14 |   max_grad_norm: 1.0 # Maximum value for gradient norm.
15 |   writers: ['stdout', 'aml', 'tensorboard'] # List of all the writers to use.
16 |   reset_optimizers_schedulers: True
17 |   backend: sp
18 | 
19 | # Checkpointer arguments
20 | chkp:
21 |   checkpoint: True # Flag indicating whether to checkpoint model.
22 |   delete_existing_checkpoints: False # Flag indicating whether to delete checkpoints under save_dir before training.
23 |   period: 1 # Period of epochs at which to checkpoint model.
24 |   save_dir: 'checkpoints' # Path to directory where checkpoints are to be stored.
25 |   #load_dir: 'checkpoints' # Path to directory where checkpoints are to be loaded from.
26 |   #load_filename: 'tweetClassification_0.pt' # Filename of checkpoint under load_dir (overrides automatic loading of max epoch).
27 |   file_prefix: 'tweetClassification' # Prefix of the checkpoint filename.
28 |   file_ext: 'pt' # File extension for the checkpoint.
29 |   log_level: 'DEBUG' # Log level for checkpointer module.
30 | 
31 | # Basic-Statistics arguments
32 | stat:
33 |   log_steps: 50 # Interval between steps for logging stats.
34 |   update_system_stats: False # Enable or disable updating system stats.
35 |   log_model_steps: 1000 # Interval between steps for logging model.
36 |   exclude_list: 'bias|LayerNorm|layer\\.[3-9]|layer\\.1(?!1)|layer\\.2(?!3)' # Exclude list for logging.
37 | 
38 | # Writers arguments
39 | wrts:
40 |   model_log_level : 'INFO' # Log level for model. Set to DEBUG to enable.
41 |   tb_log_dir : 'logs' # Folder name for storing Tensorboard logs.
42 |   tb_logpath_parent_env : null # Log path parent Environment.
43 |   tb_log_multi : False # Enable or disable logging multi.
44 |   tb_log_hist_steps : 20000 # Interval between steps to log histogram.
45 | 
46 | # Scenario-specific arguments
47 | module:
48 |   max_lr : 0.00004 # Maximum learning rate.
49 |   log_level: 'INFO'
50 | 
51 | data:
52 |   filepath_train: 'data/covid-19-nlp-text-classification/Corona_NLP_train.csv'
53 |   filepath_test: 'data/covid-19-nlp-text-classification/Corona_NLP_test.csv'
54 |   preprocessed_dir: 'data/covid-19-nlp-text-classification/preprocessed/bert'
55 |   encoding: 'ISO-8859-1'
56 |   text_field: 'OriginalTweet'
57 |   label_field: 'Sentiment'
58 |   splitpct: 10
59 |   log_level: 'INFO'
60 | 
-------------------------------------------------------------------------------- /examples/covid19_text_classification/readme.md: --------------------------------------------------------------------------------
1 | # Covid-19 Text Sentiment Classification
2 | 
3 | ## Instructions
4 | 1. Install requirements (start in this directory)
5 |        pip install -r requirements.txt
6 | 2. Download data from kaggle
7 |    Ref: https://github.com/Kaggle/kaggle-api
8 | 
9 |    Get your credentials file from kaggle here: C:\Users\\.kaggle\kaggle.json
10 | 
11 |        mkdir data
12 | 
13 |        cd data
14 | 
15 |        kaggle datasets download -d datatattle/covid-19-nlp-text-classification
16 | 
17 |        (if windows): Expand-Archive .\covid-19-nlp-text-classification.zip
18 | 
19 |        (else): unzip ./covid-19-nlp-text-classification.zip
20 | 
21 |        mkdir covid-19-nlp-text-classification
22 | 
23 |        move the two csv files to the new folder
24 | 
25 | 3. Install the pymarlin library
26 | 
27 |        pip install pymarlin
28 | 
29 |    or
30 | 
31 |        $env:PYTHONPATH=
32 | 
33 | 4. Set working directory
34 | 
35 |        cd ..
36 | 
37 | 5. Prepare data
38 | 
39 |        python data.py
40 | 
41 | 6. Train
42 | 
43 |        python train.py [--trainer.max_train_steps_per_epoch 2]
44 | 
45 | ## Running AzureML
46 | You can use [`examples/covid19_text_classification/azureml/submit.py`](https://github.com/microsoft/PyMarlin/blob/main/examples/covid19_text_classification/azureml/submit.py)
47 | to submit examples to run on Azure ML.
48 | 
49 | For example:
50 | 
51 | ```bash
52 | cd examples/covid19_text_classification/azureml/
53 | python submit.py --backend ddp-amp --process_count 2 \
54 |     --subscription_id <subscription_id> --resource_group <resource_group> --workspace_name <workspace_name>
55 | ```
56 | 
57 | See `examples/covid19_text_classification/azureml/submit.py -h` for more options.
58 | 
59 | **Note.** Submitting to AzureML requires setting up an AzureML workspace. See [Azure ML CheatSheet](https://aka.ms/aml/cheatsheet) for more details.
60 | 
-------------------------------------------------------------------------------- /examples/covid19_text_classification/requirements.txt: --------------------------------------------------------------------------------
1 | kaggle
2 | transformers
-------------------------------------------------------------------------------- /examples/germ_text_ner/config_germ.yaml: --------------------------------------------------------------------------------
1 | # Group names below are used to parse these arguments when passed from the command line.
2 | # Example usage in command-line: --module.max_lr 4E-5
3 | 
4 | # data_processor args
5 | data:
6 |   train_filepath : null
7 |   val_filepath : null
8 |   labels_list: [B-LOC, B-LOCderiv, B-LOCpart, B-ORG, B-ORGderiv, B-ORGpart, B-OTH, B-OTHderiv,
9 |     B-OTHpart, B-PER, B-PERderiv, B-PERpart, I-LOC, I-LOCderiv, I-LOCpart, I-ORG, I-ORGderiv,
10 |     I-ORGpart, I-OTH, I-OTHderiv, I-OTHpart, I-PER, I-PERderiv, I-PERpart, O]
11 |   has_labels: True
12 |   file_format: "tsv"
13 | 
14 | # model arguments
15 | model:
16 |   model_name: "bert"
17 |   encoder_key: "bert"
18 |   hf_model: "bert-base-multilingual-cased"
19 |   model_file: "pytorch_model.bin"
20 |   model_config_file: "config.json"
21 |   model_path: null
22 |   model_config_path: null
23 |   tokenizer_path: null
24 | 
25 | # module_interface arguments
26 | module:
27 |   output_dir: null
28 |   max_lr : 0.00002 # Maximum learning rate.
29 |   warmup_prop: 0.1
30 |   has_labels: True
31 |   max_seq_len: 128
32 |   pad_label_id: -100
33 |   label_all_tokens: False
34 | 
35 | # distill module arguments
36 | distill:
37 |   enable: False
38 |   student_model_config_path: null
39 |   student_model_config_file: null
40 |   student_model_path: null
41 |   student_model_file: null
42 |   student_layers: [0,6,11]
43 |   loss_types: ["logits"]
44 |   loss_weights: [1]
45 |   temperature: 1
46 | 
47 | # trainer arguments
48 | trainer:
49 |   backend: "sp"
50 |   train_batch_size: 32 # Training global batch size.
51 |   val_batch_size: 16 # Validation global batch size.
52 |   epochs: 5 # Total epochs to run.
53 |   gpu_batch_size_limit : 8 # Max limit for GPU batch size during training.
54 |   clip_grads : True # Enable or disable clipping of gradients.
55 |   use_gpu: True # Enable or disable use of GPU.
56 |   max_grad_norm: 1.0 # Maximum value for gradient norm.
57 |   writers: ['stdout', 'aml', 'tensorboard'] # List of all the writers to use.
58 |   disable_tqdm: True
59 | 
60 | # Checkpointer arguments
61 | ckpt:
62 |   checkpoint: True # Flag indicating whether to checkpoint model.
63 |   delete_existing_checkpoints: True
64 |   period: 1 # Period of epochs at which to checkpoint model.
65 |   save_dir: 'ckpts' # Path to directory where checkpoints are to be stored.
66 |   model_state_save_dir: 'model_ckpts'
67 |   file_prefix: 'marlin' # Prefix of the checkpoint filename.
68 |   file_ext: 'bin' # File extension for the checkpoint.
69 | 
70 | # Basic-Statistics arguments
71 | stats:
72 |   log_steps: 50
73 |   update_system_stats: False
74 |   log_model_steps: 1000
75 |   exclude_list: 'bias|LayerNorm|layer\\.[3-9]|layer\\.1(?!1)|layer\\.2(?!3)'
76 | 
77 | # Writers arguments
78 | wrts:
79 |   tb_log_dir : 'logs'
80 |   tb_logpath_parent_env : null
81 |   tb_log_multi : False
82 |   tb_log_hist_steps : 20000
-------------------------------------------------------------------------------- /examples/germ_text_ner/readme.md: --------------------------------------------------------------------------------
1 | # GermEval NER task
2 | 
3 | Ashwin Srinivasan
4 | 
5 | This example will walk you through executing the plugin for the GermEval NER task.
6 | 
7 | For a more detailed understanding of how you can use the NER plugin, please refer to [this guide](https://microsoft.github.io/PyMarlin/docs/plugins/hf_ner).
8 | 
9 | ## Dataset format
10 | 
11 | The NER plugin expects the input to be a TSV or CSV with 2 columns: a column with the text sentences, followed by a column with the labels for the tokens in each sentence. For example: 'Sentence': 'who is harry', 'Slot': 'O O B-contact_name'.
12 | 
13 | For GermEval, we have already modified the dataset and provided it along with this example. You will find the train file under train_germ and the dev file under val_germ.
14 | 
15 | ## Running on VM
16 | ```
17 | conda create -n pymarlin
18 | conda activate pymarlin
19 | pip install -r requirements.txt
20 | ```
21 | 
22 | Moving data:
23 | 
24 |     scp -r -P $port .\examples\germ_text_ner\ $user@${machine}:/home/$user
25 |     ssh $user@$machine -p $port
26 | 
27 | Running on the CLI is as simple as:
28 | 
29 | ```
30 | python test.py --data.train_filepath ./train_germ/train.tsv --data.val_filepath ./val_germ/dev.tsv --config_path config_germ.yaml
31 | ```
32 | 
33 | Running with multi-GPU:
34 | 
35 | ```
36 | python -m torch.distributed.launch --nproc_per_node 4 test.py --data.train_filepath ./train_germ/train.tsv --data.val_filepath ./val_germ/dev.tsv --config_path config_germ.yaml --trainer.backend ddp-amp
37 | ```
38 | Results:
39 | 
40 |     tensorboard --logdir logs
41 | 
42 | Tunnel to view the tensorboard UI (if using a VM):
43 | 
44 |     ssh -N -f -L 127.0.0.1:6006:127.0.0.1:6006 $user@${machine} -p $port
45 | 
46 | View the Tensorboard UI:
47 | 
48 |     http://localhost:6006/
49 | 
50 | ## Running on Azure ML
51 | 
52 | A notebook titled 'GermEvalAML.ipynb' has been provided along with this example. Once you have a valid Azure workspace, resource group and compute target, replace the placeholders in the notebook and you should be able to submit a script to AML.
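53 | 
54 | If you prefer a plain script over the notebook, the submission boils down to a few `azureml.core` calls. The sketch below is illustrative and not part of this example's code; it assumes the azureml-sdk package is installed, and the workspace details, compute target name and experiment name are placeholders to replace with your own resources.
55 | 
56 | ```python
57 | from azureml.core import Workspace, Experiment, ScriptRunConfig, Environment
58 | 
59 | # Placeholders: fill in your own Azure resources.
60 | ws = Workspace("<subscription_id>", "<resource_group>", "<workspace_name>")
61 | env = Environment.from_pip_requirements("pymarlin_ner", "requirements.txt")
62 | 
63 | src = ScriptRunConfig(
64 |     source_directory=".",
65 |     command="python test.py --data.train_filepath ./train_germ/train.tsv "
66 |             "--data.val_filepath ./val_germ/dev.tsv --config_path config_germ.yaml".split(),
67 |     compute_target=ws.compute_targets["<compute_name>"],
68 |     environment=env,
69 | )
70 | run = Experiment(ws, "germeval-ner").submit(src)
71 | print(run.get_portal_url())
72 | ```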
73 | 
74 | ## Model checkpoint extraction + Inference
75 | 
76 | You may want to further use this model checkpoint for inference or use it in your project. The instructions are [here](https://microsoft.github.io/PyMarlin/docs/plugins/hf_ner) under the evaluation section. Further, the notebook includes an inference section with the relevant code.
77 | 
-------------------------------------------------------------------------------- /examples/germ_text_ner/requirements.txt: --------------------------------------------------------------------------------
1 | pymarlin[plugins]
2 | torch==1.8.1+cu111
3 | -f https://download.pytorch.org/whl/torch_stable.html
4 | 
-------------------------------------------------------------------------------- /examples/germ_text_ner/test.py: --------------------------------------------------------------------------------
1 | from pymarlin.utils.logger.logging_utils import getlogger
2 | logger = getlogger(__name__, 'DEBUG')
3 | from pymarlin.plugins import HfNERPlugin
4 | 
5 | if __name__ == '__main__':
6 |     ########### Usage #############
7 |     # Constructing HfNERPlugin() parses the YAML config and sets up the data
8 |     # and module interfaces; setup_trainer() then builds the PyMarlin trainer.
9 |     plugin = HfNERPlugin()
10 | 
11 |     #### Cmdline: python test.py --data.train_filepath ./train_germ/train.tsv --data.val_filepath ./val_germ/dev.tsv --config_path config_germ.yaml
12 | 
13 |     plugin.setup_trainer()
14 |     trainer = plugin.trainer
15 |     trainer.train()
16 | 
-------------------------------------------------------------------------------- /examples/glue_text_benchmark/Dockerfile: --------------------------------------------------------------------------------
1 | FROM mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.0.3-cudnn8-ubuntu18.04
2 | RUN apt-get update
3 | 
4 | 
5 | # create conda environment
6 | RUN conda update -n base -c defaults conda -y
7 | RUN conda create -n marlin python=3.8 -y
8 | RUN echo ". /opt/miniconda/etc/profile.d/conda.sh" >> ~/.bashrc
9 | 
10 | 
11 | 
12 | ADD requirements.txt /workdir/
13 | WORKDIR /workdir
14 | 
15 | RUN /opt/miniconda/envs/marlin/bin/pip install -U -r requirements.txt
16 | 
17 | # Instructions to update docker image. (replace krishansubudhi with your dockerhub account name)
18 | # https://krishansubudhi.github.io/development/2019/09/23/CreatingDockerImage.html
19 | 
20 | # In a VM, Build
21 | # docker build --rm -t krishansubudhi/marlin:latest .
22 | 
23 | # Test
24 | # docker run --gpus all -it -d -p 5000:5000 krishansubudhi/marlin:latest
25 | # docker ps
26 | # CONTAINER_ID=4dd751e87293 # replace 4dd751e87293 with your container id
27 | # docker cp ./src $CONTAINER_ID:/workdir
28 | # docker attach $CONTAINER_ID
29 | 
30 | # Push new image to dockerhub
31 | # docker login
32 | # docker push krishansubudhi/marlin:latest
-------------------------------------------------------------------------------- /examples/glue_text_benchmark/README.md: --------------------------------------------------------------------------------
1 | # GLUE Finetuning
2 | This code works well in a virtual machine with a GPU (preferably an NVIDIA V100 or A100 for AMP support). More information can be found in this [blog](https://krishansubudhi.github.io/deeplearning/2020/12/09/run-ml-on-vm.html).
3 | 
4 | We can train any GLUE task using this code. This README only shows instructions for RTE; running other tasks is relatively straightforward.
5 | 
6 | This code can be used for any other single sentence or sentence pair classifier too. A new DataInterface needs to be created for non-GLUE datasets.
7 | 
8 | ## Move the code to GPU VM (Optional)
9 | 
10 |     scp -r -P $port .\examples\glue_text_benchmark\ $user@${machine}:/home/$user
11 |     ssh $user@$machine -p $port
12 | 
13 | ## Setup environment and dependencies
14 | 
15 |     conda create -n pymarlin
16 |     conda activate pymarlin
17 |     pip install -r requirements.txt
18 | ## Analyze data
19 | Script:
20 | 
21 |     python src/data.py rte
22 | 
23 | Result:
24 | 
25 |     DatasetDict({
26 |         train: Dataset({
27 |             features: ['sentence1', 'sentence2', 'label', 'idx'],
28 |             num_rows: 2490
29 |         })
30 |         validation: Dataset({
31 |             features: ['sentence1', 'sentence2', 'label', 'idx'],
32 |             num_rows: 277
33 |         })
34 |         test: Dataset({
35 |             features: ['sentence1', 'sentence2', 'label', 'idx'],
36 |             num_rows: 3000
37 |         })
38 |     })
39 | 
40 |     train data label distribution
41 |     idx label sentence1 sentence2
42 |     0 0 1 No Weapons of Mass Destruction Found in Iraq Yet. Weapons of Mass Destruction Found in Iraq.
43 |     1 1 0 A place of sorrow, after Pope John Paul II die... Pope Benedict XVI is the new leader of the Rom...
44 |     count ratio
45 |     label
46 |     0 1249 0.501606
47 |     1 1241 0.498394
48 |     count 2490.000000
49 |     mean 43.565462
50 |     std 32.389776
51 |     min 4.000000
52 |     50% 31.000000
53 |     95% 112.000000
54 |     99% 143.000000
55 |     99.9% 170.066000
56 |     max 239.000000
57 |     Name: sentence1, dtype: float64
58 |     count 2490.000000
59 |     mean 8.790361
60 |     std 4.396781
61 |     min 3.000000
62 |     50% 8.000000
63 |     95% 18.000000
64 |     99% 26.000000
65 |     99.9% 31.000000
66 |     max 41.000000
67 |     Name: sentence2, dtype: float64
68 | 
69 |     validation data label distribution
70 |     idx label sentence1 sentence2
71 |     0 0 1 Dana Reeve, the widow of the actor Christopher... Christopher Reeve had an accident.
72 |     1 1 0 Yet, we now are discovering that antibiotics a... Bacteria is winning the war against antibiotics.
73 |     count ratio
74 |     label
75 |     0 146 0.527076
76 |     1 131 0.472924
77 | 
78 | 
79 | ## Train and validate
80 | Script (single GPU):
81 | 
82 |     python src/train.py --config_path configs-roberta-base/rte.yaml
83 | 
84 | Script (multiple GPUs):
85 | 
86 |     python -m torch.distributed.launch --nproc_per_node 4 src/train.py --config_path configs-roberta-base/rte.yaml --tr.backend ddp-amp
87 | 
88 | Results:
89 | 
90 |     tensorboard --logdir logs_roberta_base
91 | 
92 | Tunnel to view the tensorboard UI (if using a VM):
93 | 
94 |     ssh -N -f -L 127.0.0.1:6006:127.0.0.1:6006 $user@${machine} -p $port
95 | 
96 | View Tensorboard UI:
97 | 
98 |     http://localhost:6006/
99 | 
100 | ![results](images/tensorboard_screenshot.jpg)
101 | ## infer
102 | 0 is entailment, 1 is not_entailment.
103 | 104 | Script 105 | 106 | python src/infer.py --config_path configs-roberta-base/rte.yaml 107 | 108 | Input 109 | 110 | sentence1 = ['No Weapons of Mass Destruction Found in Iraq Yet.', 111 | 'India is a hot country', 112 | 'Krishan has written this inference example'] 113 | sentence2 = ['Weapons of Mass Destruction Found in Iraq.', 114 | 'It\'s warm in india', 115 | 'Krishan is the author of this example'] 116 | 117 | Result: 118 | 119 | tensor([1, 0, 0]) -------------------------------------------------------------------------------- /examples/glue_text_benchmark/configs-roberta-base/cola.yaml: -------------------------------------------------------------------------------- 1 | #https://arxiv.org/pdf/1907.11692v1.pdf 2 | #We consider a limited hyperparameter 3 | # sweep for each task, with batch sizes ∈ {16, 32} 4 | # and learning rates ∈ {1e−5, 2e−5, 3e−5}, with a 5 | # linear warmup for the first 6% of steps followed by 6 | # a linear decay to 0. We finetune for 10 epochs and 7 | # perform early stopping based on each tasks evaluation metric on the dev set. 8 | # The rest of the hyperparameters remain the same as during pretraining. 9 | 10 | glue_task: 'cola' 11 | 12 | mi: 13 | encoder : "roberta-base" 14 | tokenizer: "roberta-base" 15 | num_labels : 2 16 | max_lr : 0.00002 17 | warmup : 0.06 18 | tr: 19 | max_train_steps_per_epoch : null # Maximum train steps per epoch. 20 | max_val_steps_per_epoch : null # Maximum validation steps per epoch. 21 | train_batch_size: 32 # Training global batch size. 22 | val_batch_size: 64 # Validation global batch size. 23 | epochs: 10 # Total epochs to run. 24 | gpu_batch_size_limit : 32 # Max limit for GPU batch size during training. 25 | disable_tqdm : False 26 | writers: ["tensorboard"] 27 | backend: 'sp' 28 | 29 | wrt: 30 | tb_log_dir : 'logs_roberta_base/cola/from_pretrained' 31 | 32 | stat: 33 | log_steps : 20 34 | 35 | dist: 36 | local_rank : 3 37 | period: 5 38 | 39 | ckp: 40 | checkpoint : False -------------------------------------------------------------------------------- /examples/glue_text_benchmark/configs-roberta-base/mnli.yaml: -------------------------------------------------------------------------------- 1 | glue_task: 'mnli' 2 | 3 | mi: 4 | encoder : "roberta-base" 5 | tokenizer: "roberta-base" 6 | num_labels : 3 7 | max_lr : 0.00002 8 | s1_key : 'premise' 9 | s2_key : 'hypothesis' 10 | tr: 11 | max_train_steps_per_epoch : null # Maximum train steps per epoch. 12 | max_val_steps_per_epoch : null # Maximum validation steps per epoch. 13 | train_batch_size: 32 # Training global batch size. 14 | val_batch_size: 64 # Validation global batch size. 15 | epochs: 3 # Total epochs to run. 16 | gpu_batch_size_limit : 16 # Max limit for GPU batch size during training. 
17 | disable_tqdm : False 18 | writers: ["tensorboard"] 19 | backend: 'sp' 20 | 21 | wrt: 22 | tb_log_dir : 'logs_roberta_base/mnli/from_pretrained' 23 | 24 | stat: 25 | log_steps : 20 26 | 27 | dist: 28 | local_rank : 3 29 | 30 | ckp: 31 | checkpoint : False 32 | period: 2 33 | 34 | # python -m torch.distributed.launch --nproc_per_node 2 src/mtl/finetune_glue.py --config_path configs/roberta-base/mnli.yaml --distributed -------------------------------------------------------------------------------- /examples/glue_text_benchmark/configs-roberta-base/mrpc.yaml: -------------------------------------------------------------------------------- 1 | glue_task: 'mrpc' 2 | 3 | mi: 4 | encoder : "roberta-base" 5 | tokenizer: "roberta-base" 6 | num_labels : 2 7 | max_lr : 0.00002 8 | s1_key : 'sentence1' 9 | s2_key : 'sentence2' 10 | max_length : 128 11 | warmup : 0.06 12 | tr: 13 | max_train_steps_per_epoch : null # Maximum train steps per epoch. 14 | max_val_steps_per_epoch : null # Maximum validation steps per epoch. 15 | train_batch_size: 32 # Training global batch size. 16 | val_batch_size: 64 # Validation global batch size. 17 | epochs: 10 # Total epochs to run. 18 | gpu_batch_size_limit : 32 # Max limit for GPU batch size during training. 19 | disable_tqdm : False 20 | writers: ["tensorboard"] 21 | backend: 'sp' 22 | 23 | wrt: 24 | tb_log_dir : 'logs_roberta_base/mrpc/from_pretrained' 25 | 26 | stat: 27 | log_steps : 20 28 | dist: 29 | local_rank : 3 30 | ckp: 31 | checkpoint : False 32 | period: 5 -------------------------------------------------------------------------------- /examples/glue_text_benchmark/configs-roberta-base/qnli.yaml: -------------------------------------------------------------------------------- 1 | glue_task: 'qnli' 2 | 3 | mi: 4 | encoder : "roberta-base" 5 | tokenizer: "roberta-base" 6 | num_labels : 2 7 | max_lr : 0.00002 8 | s1_key : 'question' 9 | s2_key : 'sentence' 10 | max_length : 512 11 | tr: 12 | max_train_steps_per_epoch : null # Maximum train steps per epoch. 13 | max_val_steps_per_epoch : null # Maximum validation steps per epoch. 14 | train_batch_size: 32 # Training global batch size. 15 | val_batch_size: 64 # Validation global batch size. 16 | epochs: 4 # Total epochs to run. 17 | gpu_batch_size_limit : 16 # Max limit for GPU batch size during training. 18 | disable_tqdm : False 19 | writers: ["tensorboard"] 20 | backend: 'sp' 21 | wrt: 22 | tb_log_dir : 'logs_roberta_base/qnli/from_pretrained' 23 | 24 | stat: 25 | log_steps : 20 26 | dist: 27 | local_rank : 0 28 | ckp: 29 | checkpoint : False 30 | period: 2 -------------------------------------------------------------------------------- /examples/glue_text_benchmark/configs-roberta-base/qqp.yaml: -------------------------------------------------------------------------------- 1 | glue_task: 'qqp' 2 | 3 | mi: 4 | encoder : "roberta-base" 5 | tokenizer: "roberta-base" 6 | num_labels : 2 7 | max_lr : 0.00002 8 | max_length : 128 9 | warmup : 0.06 10 | tr: 11 | max_train_steps_per_epoch : null # Maximum train steps per epoch. 12 | max_val_steps_per_epoch : null # Maximum validation steps per epoch. 13 | train_batch_size: 32 # Training global batch size. 14 | val_batch_size: 64 # Validation global batch size. 15 | epochs: 10 # Total epochs to run. 16 | gpu_batch_size_limit : 32 # Max limit for GPU batch size during training. 
17 | disable_tqdm : False 18 | writers: ["tensorboard"] 19 | backend: 'sp' 20 | 21 | wrt: 22 | tb_log_dir : 'logs_roberta_base/qqp/from_pretrained' 23 | 24 | stat: 25 | log_steps : 20 26 | 27 | dist: 28 | local_rank : 3 29 | period: 5 30 | 31 | ckp: 32 | checkpoint : False -------------------------------------------------------------------------------- /examples/glue_text_benchmark/configs-roberta-base/rte.yaml: -------------------------------------------------------------------------------- 1 | glue_task : 'rte' 2 | 3 | mi: 4 | encoder : "roberta-base" 5 | tokenizer: "roberta-base" 6 | num_labels : 2 7 | max_lr : 0.00002 8 | s1_key : 'sentence1' 9 | s2_key : 'sentence2' 10 | max_length : 128 11 | warmup : 0.06 12 | tr: 13 | max_train_steps_per_epoch : null # Maximum train steps per epoch. 14 | max_val_steps_per_epoch : null # Maximum validation steps per epoch. 15 | train_batch_size: 32 # Training global batch size. 16 | val_batch_size: 64 # Validation global batch size. 17 | epochs: 10 # Total epochs to run. 18 | gpu_batch_size_limit : 32 # Max limit for GPU batch size during training. 19 | disable_tqdm : False 20 | writers: ["tensorboard"] 21 | backend: 'sp' 22 | 23 | wrt: 24 | tb_log_dir : 'logs_roberta_base/rte/from_pretrained' 25 | 26 | stat: 27 | log_steps : 20 28 | dist: 29 | local_rank : 1 30 | ckp: 31 | checkpoint : True 32 | period: 5 -------------------------------------------------------------------------------- /examples/glue_text_benchmark/configs-roberta-base/sst2.yaml: -------------------------------------------------------------------------------- 1 | glue_task: 'sst2' 2 | 3 | mi: 4 | encoder : "roberta-base" 5 | tokenizer: "roberta-base" 6 | num_labels : 2 7 | max_lr : 0.00002 8 | max_length : 128 9 | tr: 10 | max_train_steps_per_epoch : null # Maximum train steps per epoch. 11 | max_val_steps_per_epoch : null # Maximum validation steps per epoch. 12 | train_batch_size: 32 # Training global batch size. 13 | val_batch_size: 64 # Validation global batch size. 14 | epochs: 3 # Total epochs to run. 15 | gpu_batch_size_limit : 32 # Max limit for GPU batch size during training. 16 | disable_tqdm : False 17 | writers: ["tensorboard"] 18 | backend: 'sp' 19 | 20 | wrt: 21 | tb_log_dir : 'logs_roberta_base/sst2/from_pretrained' 22 | 23 | stat: 24 | log_steps : 20 25 | dist: 26 | local_rank : 1 27 | ckp: 28 | checkpoint : False 29 | period: 2 -------------------------------------------------------------------------------- /examples/glue_text_benchmark/configs-roberta-base/stsb.yaml: -------------------------------------------------------------------------------- 1 | glue_task: 'stsb' 2 | 3 | mi: 4 | encoder : "roberta-base" 5 | tokenizer: "roberta-base" 6 | num_labels : 1 7 | max_lr : 0.00002 8 | s1_key : 'sentence1' 9 | s2_key : 'sentence2' 10 | max_length : 128 11 | tr: 12 | max_train_steps_per_epoch : null # Maximum train steps per epoch. 13 | max_val_steps_per_epoch : null # Maximum validation steps per epoch. 14 | train_batch_size: 32 # Training global batch size. 15 | val_batch_size: 64 # Validation global batch size. 16 | epochs: 10 # Total epochs to run. 17 | gpu_batch_size_limit : 32 # Max limit for GPU batch size during training. 
18 | disable_tqdm : False 19 | writers: ["tensorboard"] 20 | backend: 'sp' 21 | 22 | wrt: 23 | tb_log_dir : 'logs_roberta_base/stsb/from_pretrained' 24 | 25 | stat: 26 | log_steps : 20 27 | dist: 28 | local_rank : 2 29 | ckp: 30 | checkpoint : False 31 | period: 5 -------------------------------------------------------------------------------- /examples/glue_text_benchmark/images/tensorboard_screenshot.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/examples/glue_text_benchmark/images/tensorboard_screenshot.jpg -------------------------------------------------------------------------------- /examples/glue_text_benchmark/logs_roberta_base/rte/from_pretrained/events.out.tfevents.1623336412.krishan-vm.20548.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/examples/glue_text_benchmark/logs_roberta_base/rte/from_pretrained/events.out.tfevents.1623336412.krishan-vm.20548.0 -------------------------------------------------------------------------------- /examples/glue_text_benchmark/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets 2 | pymarlin 3 | torch==1.8.1+cu111 4 | -f https://download.pytorch.org/whl/torch_stable.html 5 | transformers 6 | sklearn 7 | matplotlib 8 | mock -------------------------------------------------------------------------------- /examples/glue_text_benchmark/src/data.py: -------------------------------------------------------------------------------- 1 | from pymarlin.core.data_interface import DataInterface, DataProcessor 2 | from datasets import load_dataset 3 | import matplotlib.pyplot as plt 4 | import pandas as pd 5 | 6 | cache_dir =r"/tmp/hf_data" 7 | class GlueDataAnalyzer(DataProcessor): 8 | def __init__(self, glue_task): 9 | self.datasets = load_dataset("glue",glue_task, cache_dir = cache_dir) 10 | 11 | def process(self): 12 | pass 13 | 14 | def analyze(self): 15 | print(self.datasets) 16 | for split in self.datasets.keys(): 17 | self.analyze_split(split) 18 | 19 | def analyze_split(self,split = 'train'): 20 | print(f'\n{split} data label distribution') 21 | df = pd.DataFrame(self.datasets[split]) 22 | print(df.head(2)) 23 | count = df.groupby('label')['label'].count() 24 | summary = pd.DataFrame({'count':count, 'ratio': count/len(df)}) 25 | print(summary) 26 | return df 27 | 28 | def analyze_texts(self, texts): 29 | s = pd.Series( texts) 30 | s = s.apply(lambda cell : cell.split()) 31 | print(s.apply(len).describe(percentiles = [0.5,0.95,0.99,0.999])) 32 | 33 | class SentenceDataAnalyzer(GlueDataAnalyzer): 34 | def __init__(self, glue_task, sentence_key = 'sentence'): 35 | super().__init__(glue_task) 36 | self.sentence_key = sentence_key 37 | def analyze_split(self, split = 'train'): 38 | df = super().analyze_split(split) 39 | self.analyze_texts(df[self.sentence_key]) 40 | 41 | class SPDataAnalyzer(GlueDataAnalyzer): 42 | def __init__(self, glue_task, s1_key = 'question1', s2_key = 'question2'): 43 | super().__init__(glue_task) 44 | self.s1_key = s1_key 45 | self.s2_key = s2_key 46 | def analyze_split(self, split = 'train'): 47 | df = super().analyze_split(split) 48 | self.analyze_texts(df[self.s1_key]) 49 | self.analyze_texts(df[self.s2_key]) 50 | 51 | 52 | class SPRegressionDataAnalyzer(SPDataAnalyzer): 53 | def __init__(self, glue_task, 
s1_key = 'question1', s2_key = 'question2'):
54 |         super().__init__(glue_task)
55 |         self.s1_key = s1_key
56 |         self.s2_key = s2_key
57 |     def analyze_split(self, split = 'train'):
58 |         print(f'\n{split} data label distribution')
59 |         df = pd.DataFrame(self.datasets[split])
60 |         print(df.head(2))
61 |         # print(df.label.describe())
62 |         self.analyze_texts(df[self.s1_key])
63 |         self.analyze_texts(df[self.s2_key])
64 | 
65 | def analyzer_factory(glue_task):
66 |     # Store constructors instead of instances so that only the requested
67 |     # task's dataset is downloaded, not every GLUE task's.
68 |     factory = {
69 |         'default': lambda: GlueDataAnalyzer(glue_task),
70 |         'qqp': lambda: SPDataAnalyzer('qqp'),
71 |         'rte': lambda: SPDataAnalyzer('rte', 'sentence1', 'sentence2'),
72 |         'mnli': lambda: SPDataAnalyzer('mnli', 'premise', 'hypothesis'),
73 |         'qnli': lambda: SPDataAnalyzer('qnli', 'question', 'sentence'),
74 |         'sst2': lambda: SentenceDataAnalyzer('sst2'),
75 |         'stsb': lambda: SPRegressionDataAnalyzer('stsb', 'sentence1', 'sentence2'),
76 |         'wnli': lambda: SPDataAnalyzer('wnli', 'sentence1', 'sentence2'),
77 |         'mrpc': lambda: SPDataAnalyzer('mrpc', 'sentence1', 'sentence2'),
78 |     }
79 |     glue_task = glue_task if glue_task in factory else 'default'
80 |     return factory[glue_task]()
81 | 
82 | class GlueData(DataInterface):
83 |     def setup_datasets(self, glue_task = 'cola'):
84 |         self.glue_task = glue_task
85 |         datasets = load_dataset("glue", glue_task, cache_dir = cache_dir)
86 |         self.train_ds = datasets['train']
87 |         if glue_task == 'mnli':
88 |             self.val_ds = {'mnli_matched':datasets['validation_matched'],'mnli_mismatched':datasets['validation_mismatched']}
89 |             self.test_ds = [datasets['test_matched'],datasets['test_mismatched']]
90 |         else:
91 |             self.val_ds = datasets['validation']
92 |             self.test_ds = datasets['test']
93 | 
94 |     def get_train_dataset(self):
95 |         return self.train_ds
96 | 
97 |     def get_val_dataset(self):
98 |         return self.val_ds
99 | 
100 |     def get_test_dataset(self):
101 |         return self.test_ds
102 | 
103 | if __name__ == "__main__":
104 |     import sys
105 | 
106 |     glue_task = sys.argv[1] if len(sys.argv) > 1 else 'cola'
107 |     print(glue_task)
108 |     di = GlueData()
109 |     di.setup_datasets(glue_task=glue_task)
110 |     dp = analyzer_factory(glue_task)
111 |     dp.process_data()
112 | 
113 |     # python src/data.py rte
-------------------------------------------------------------------------------- /examples/glue_text_benchmark/src/infer.py: --------------------------------------------------------------------------------
1 | from train import *
2 | from mock import MagicMock
3 | 
4 | def load_classifier():
5 |     config = CustomArgParser().parse()
6 |     checkpoint_path = 'checkpoints/model_9.pt'
7 |     glue_task = config['glue_task']
8 |     # Inference never touches the datasets, so a MagicMock stands in for the
9 |     # DataInterface that recipe_factory normally receives.
10 |     data = MagicMock()
11 |     classifier = recipe_factory(glue_task, data_interface = data, **config['mi'])
12 |     sd = torch.load(checkpoint_path, map_location = 'cpu')['module_interface_state']
13 |     classifier.load_state_dict(sd)
14 |     return classifier
15 | 
16 | if __name__ == "__main__":
17 |     classifier = load_classifier()
18 | 
19 | 
20 |     #RTE
21 |     sentence1 = ['No Weapons of Mass Destruction Found in Iraq Yet.',
22 |                  'India is a hot country',
23 |                  'Krishan has written this inference example']
24 |     sentence2 = ['Weapons of Mass Destruction Found in Iraq.',
25 |                  'It\'s warm in india',
26 |                  'Krishan is the author of this example']
27 |     input = classifier.tokenizer(
28 |         text = sentence1,
29 |         text_pair = sentence2,
30 |         max_length=classifier.max_length,
31 |         return_tensors="pt",
32 |         padding=True,
33 |         truncation=True,
34 |     )
35 |     output = classifier.net(classifier.encoder(**input))
36 |     result = torch.argmax(output.logits, dim = -1)
37 |     print(result)
38 | 
39 | 
40 | 
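41 | # Example usage (same command as in the README):
42 | # python src/infer.py --config_path configs-roberta-base/rte.yaml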
-------------------------------------------------------------------------------- /examples/readme.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | A collection of examples of PyMarlin in action! 4 | 5 | 1. Classification on Kaggle `covid19-nlp-text-classification` with BERT 6 | 2. Summarization on `CNN/DailyMail` with BART (and optionally with ORT+DeepSpeed extensions) 7 | 3. `GLUE Benchmark` with RoBERTa 8 | 4. Named Entity Recognition (Plugin) on `GERM` with bert-base-multilingual-cased 9 | 5. CIFAR Image Classification notebook -------------------------------------------------------------------------------- /examples/snli_benchmark/configs-bert-base/snli.yaml: -------------------------------------------------------------------------------- 1 | glue_task : 'snli' 2 | 3 | mi: 4 | encoder : "bert-base-cased" 5 | tokenizer: "bert-base-cased" 6 | num_labels : 3 7 | lr : 0.0005 8 | s1_key : 'premise' 9 | s2_key : 'hypothesis' 10 | max_length : 128 11 | warmup : 0.06 12 | tr: 13 | clip_grads: False 14 | max_train_steps_per_epoch : null # Maximum train steps per epoch. 15 | max_val_steps_per_epoch : null # Maximum validation steps per epoch. 16 | train_batch_size: 32 # Training global batch size. 17 | val_batch_size: 64 # Validation global batch size. 18 | epochs: 3 # Total epochs to run. 19 | gpu_batch_size_limit : 4 # Max limit for GPU batch size during training. 20 | disable_tqdm : True 21 | writers: ['stdout', 'aml', 'tensorboard'] 22 | backend: 'ddp-dp' 23 | 24 | dp: 25 | per_sample_max_grad_norm: 1.0 26 | noise_multiplier: 0.4 27 | sample_rate: 0.00005818 #snli: 32/550000 28 | target_delta: 0.000001818 #snli: 1/550000 29 | 30 | wrt: 31 | tb_log_dir : 'logs_bert_base/snli/from_pretrained' 32 | 33 | stat: 34 | log_steps : 20 35 | 36 | dist: 37 | local_rank : 1 38 | 39 | ckp: 40 | checkpoint : False 41 | period: 5 42 | -------------------------------------------------------------------------------- /examples/snli_benchmark/src/data.py: -------------------------------------------------------------------------------- 1 | from pymarlin.core.data_interface import DataInterface 2 | from datasets import load_dataset 3 | 4 | class SnliData(DataInterface): 5 | def setup_datasets(self, task): 6 | self.task = task 7 | datasets = load_dataset(self.task) 8 | self.train_ds = datasets['train'] 9 | self.train_ds = self.train_ds.filter(lambda x: x["label"] != -1) 10 | 11 | self.val_ds = datasets['validation'] 12 | self.val_ds = self.val_ds.filter(lambda x: x["label"] != -1) 13 | 14 | self.test_ds = datasets['test'] 15 | self.test_ds = self.test_ds.filter(lambda x: x["label"] != -1) 16 | 17 | def get_train_dataset(self): 18 | return self.train_ds 19 | 20 | def get_val_dataset(self): 21 | return self.val_ds 22 | 23 | def get_test_dataset(self): 24 | return self.test_ds 25 | -------------------------------------------------------------------------------- /pymarlin/__init__.py: -------------------------------------------------------------------------------- 1 | """A lightweight library for Deep Learning model training""" 2 | 3 | __version__ = '0.3.5' 4 | from pymarlin.core.trainer import ( 5 | TrainerArguments, 6 | Trainer, 7 | ) 8 | from pymarlin.core.data_interface import ( 9 | DataProcessor, 10 | DataInterface, 11 | ) 12 | from pymarlin.core.module_interface import ( 13 | CallbackInterface, 14 | ModuleInterface, 15 | ) 16 | from pymarlin.core.trainer_backend import ( 17 | SingleProcess, 18 | SingleProcessAmp, 19 | SingleProcessApexAmp, 20 | 
DDPTrainerBackend, 21 | ) 22 | 23 | from pymarlin.utils.checkpointer.checkpoint_utils import ( 24 | DefaultCheckpointerArguments, 25 | DefaultCheckpointer, 26 | ) 27 | from pymarlin.utils.config_parser.custom_arg_parser import CustomArgParser 28 | from pymarlin.utils.stats.basic_stats import BasicStats 29 | -------------------------------------------------------------------------------- /pymarlin/core/__init__.py: -------------------------------------------------------------------------------- 1 | '''Empty init file''' 2 | -------------------------------------------------------------------------------- /pymarlin/plugins/__init__.py: -------------------------------------------------------------------------------- 1 | '''pymarlin.plugins''' 2 | from .plugin_module_interface import PluginModuleInterface 3 | from .hf_seq_classification import HfSeqClassificationPlugin 4 | from .hf_ner import HfNERPlugin 5 | from .hf_seq2seq import HfSeq2SeqPlugin 6 | -------------------------------------------------------------------------------- /pymarlin/plugins/base.py: -------------------------------------------------------------------------------- 1 | """ Base class for all plugins. """ 2 | from abc import abstractmethod 3 | from typing import Optional, Dict 4 | from pymarlin.core import module_interface, data_interface 5 | from pymarlin.core import trainer as trn 6 | from pymarlin.utils.config_parser.custom_arg_parser import CustomArgParser 7 | 8 | 9 | class Plugin: 10 | """Base class for all plugins. 11 | 12 | It is structured around three core components 13 | [trn.Trainer, module_interface.ModuleInterface, data_interface.DataInterface]. 14 | Derived classes should implement the methods `setup_data`, 15 | `setup_module`, and `setup`. These methods will execute the data processing 16 | pipeline and initialize the required components for training such as 17 | `trainer` and `module_interface`. `setup_trainer` initializes the PyMarlin 18 | trainer and backend. 19 | 20 | `plugin.setup` is provided to bootstrap the entire pipeline for a specific 21 | downstream task. 22 | Example:: 23 | 24 | trainer = plugin.setup() 25 | trainer.train() 26 | trainer.validate() 27 | """ 28 | 29 | def __init__(self, config: Optional[Dict] = None): 30 | """CustomArgParser parses YAML config located at cmdline --config_path. If --config_path 31 | is not provided, assumes YAML file is named config.yaml and present in working directory. 32 | self.trainer_args (trn.TrainerArguments): Instantiated dataclass containing 33 | args required to initialize trn.Trainer class. 34 | """ 35 | if config is None: 36 | config = CustomArgParser().parse() 37 | self.trainer_args = trn.TrainerArguments( 38 | **config["trainer"], 39 | stats_args=trn.stats.StatInitArguments(**config["stats"]), 40 | writer_args=trn.WriterInitArguments(**config["wrts"]), 41 | checkpointer_args=trn.DefaultCheckpointerArguments(**config["ckpt"]) 42 | ) 43 | 44 | @property 45 | def datainterface(self): 46 | """DataInterface object used for data processing. 47 | The property can be set in `setup_datainterface`. 48 | 49 | Returns: 50 | An object of type data_interface.DataInterface. 51 | """ 52 | return self._datainterface 53 | 54 | @datainterface.setter 55 | def datainterface(self, data_interface_obj: data_interface.DataInterface): 56 | assert isinstance(data_interface_obj, data_interface.DataInterface) 57 | self._datainterface = data_interface_obj 58 | 59 | @property 60 | def dataprocessor(self): 61 | """DataProcessor object(s) used for data processing. 
62 |         The property may be used in conjunction with `datainterface` in the
63 |         `setup_datainterface` method.
64 | 
65 |         Returns:
66 |             An object of type data_interface.DataProcessor.
67 |         """
68 |         return self._dataprocessor
69 | 
70 |     @dataprocessor.setter
71 |     def dataprocessor(self, data_processor_obj: data_interface.DataProcessor):
72 |         assert isinstance(data_processor_obj, data_interface.DataProcessor)
73 |         self._dataprocessor = data_processor_obj
74 | 
75 |     @property
76 |     def moduleinterface(self):
77 |         """ModuleInterface object.
78 |         The property can be set in `setup_module`.
79 | 
80 |         Returns:
81 |             An object of type module_interface.ModuleInterface.
82 |         """
83 |         return self._moduleinterface
84 | 
85 |     @moduleinterface.setter
86 |     def moduleinterface(self, module_interface_obj: module_interface.ModuleInterface):
87 |         assert isinstance(module_interface_obj, module_interface.ModuleInterface)
88 |         self._moduleinterface = module_interface_obj
89 | 
90 |     @property
91 |     def trainer(self):
92 |         """Trainer object.
93 |         The property can be set in `setup_trainer`.
94 | 
95 |         Returns:
96 |             An object of type trn.Trainer.
97 |         """
98 |         return self._trainer
99 | 
100 |     @trainer.setter
101 |     def trainer(self, trainer_obj: trn.Trainer):
102 |         assert isinstance(trainer_obj, trn.Trainer)
103 |         self._trainer = trainer_obj
104 | 
105 |     @abstractmethod
106 |     def setup_datainterface(self, *args: Optional):
107 |         """Derived plugins must implement this method. The method should
108 |         execute a generic data processing pipeline for the task and update the
109 |         TaskDataInterface object to contain the processed train and val datasets.
110 | 
111 |         NOTE to TaskPlugin designers: Typically, the plugin shouldn't need
112 |         any input arguments from the user except the YAML config. DataInterface and
113 |         DataProcessor related arguments should be processed in the __init__ method of
114 |         the TaskPlugin.
115 | 
116 |         Returns:
117 |             datainterface_obj (data_interface.DataInterface): TaskDataInterface object
118 |         """
119 | 
120 |     @abstractmethod
121 |     def setup_module(self, *args: Optional):
122 |         """Derived plugins must implement this method. The method should
123 |         create a TaskModuleInterface object (module_interface.ModuleInterface)
124 |         and set the `moduleinterface` property.
125 | 
126 |         NOTE to TaskPlugin designers: Typically, the plugin shouldn't need
127 |         any input arguments from the user. ModuleInterface related arguments should be
128 |         processed in the __init__ method of the TaskPlugin.
129 |         """
130 | 
131 |     def setup_trainer(self):
132 |         """Creates a trn.Trainer object and sets the `trainer` property.
133 |         Used by all plugins unless overridden (not recommended).
134 |         """
135 |         self.trainer = trn.Trainer(args=self.trainer_args, module=self.moduleinterface)
136 | 
137 |     @abstractmethod
138 |     def setup(self, **kwargs):
139 |         """Executes all steps from data processing to trainer initialization.
140 | 
141 |         This should be equivalent to::
142 | 
143 |             plugin.setup_datainterface()
144 |             plugin.setup_module()
145 |             plugin.setup_trainer()
146 |         """
-------------------------------------------------------------------------------- /pymarlin/plugins/hf_ner/__init__.py: --------------------------------------------------------------------------------
1 | from .implementation import HfNERPlugin
-------------------------------------------------------------------------------- /pymarlin/plugins/hf_ner/config_germ.yaml: --------------------------------------------------------------------------------
1 | # Group names below are used to parse these arguments when passed from the command line.
2 | # Example usage in command-line: --module.max_lr 4E-5
3 | 
4 | # data_processor args
5 | data:
6 |   train_dir : null
7 |   val_dir : null
8 |   labels_list: [B-LOC, B-LOCderiv, B-LOCpart, B-ORG, B-ORGderiv, B-ORGpart, B-OTH, B-OTHderiv,
9 |     B-OTHpart, B-PER, B-PERderiv, B-PERpart, I-LOC, I-LOCderiv, I-LOCpart, I-ORG, I-ORGderiv,
10 |     I-ORGpart, I-OTH, I-OTHderiv, I-OTHpart, I-PER, I-PERderiv, I-PERpart, O]
11 |   max_seq_len: 128
12 |   pad_label_id: -100
13 |   has_labels: True
14 |   tokenizer: "bert-base-multilingual-cased"
15 |   file_format: "tsv"
16 |   label_all_tokens: False
17 | 
18 | # model arguments
19 | model:
20 |   model_name: "bert"
21 |   encoder_key: "bert"
22 |   hf_model: "bert-base-multilingual-cased"
23 |   model_file: "pytorch_model.bin"
24 |   model_config_file: "config.json"
25 |   model_path: null
26 |   model_config_path: null
27 | 
28 | # module_interface arguments
29 | module:
30 |   output_dir: null
31 |   max_lr : 0.00003 # Maximum learning rate.
32 |   warmup_prop: 0.1
33 |   has_labels: True
34 | 
35 | # distill module arguments
36 | distill:
37 |   enable: False
38 |   student_model_config_path: null
39 |   student_model_config_file: null
40 |   student_model_path: null
41 |   student_model_file: null
42 |   student_layers: [0,6,11]
43 |   loss_types: ["logits"]
44 |   loss_weights: [1]
45 |   temperature: 1
46 | 
47 | # trainer arguments
48 | trainer:
49 |   backend: "sp"
50 |   train_batch_size: 32 # Training global batch size.
51 |   val_batch_size: 16 # Validation global batch size.
52 |   epochs: 1 # Total epochs to run.
53 |   gpu_batch_size_limit : 8 # Max limit for GPU batch size during training.
54 |   clip_grads : True # Enable or disable clipping of gradients.
55 |   use_gpu: True # Enable or disable use of GPU.
56 |   max_grad_norm: 1.0 # Maximum value for gradient norm.
57 |   writers: ['stdout', 'aml', 'tensorboard'] # List of all the writers to use.
58 |   disable_tqdm: True
59 |   log_level: "DEBUG"
60 |   max_train_steps_per_epoch: 1
61 |   max_val_steps_per_epoch: 1
62 | 
63 | # Checkpointer arguments
64 | ckpt:
65 |   checkpoint: False # Flag indicating whether to checkpoint model.
66 |   delete_existing_checkpoints: False
67 |   period: 1 # Period of epochs at which to checkpoint model.
68 |   save_dir: 'ckpts' # Path to directory where checkpoints are to be stored.
69 |   file_prefix: 'bert' # Prefix of the checkpoint filename.
70 |   file_ext: 'tar' # File extension for the checkpoint.
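71 | # Note: checkpoint files are written under save_dir as <file_prefix>_<epoch>.<file_ext>,
72 | # i.e. ckpts/bert_0.tar with the settings above (compare the covid19 example's
73 | # load_filename 'tweetClassification_0.pt').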
74 | 
75 | # Basic-Statistics arguments
76 | stats:
77 |   log_steps: 50
78 |   update_system_stats: False
79 |   log_model_steps: 1000
80 |   exclude_list: 'bias|LayerNorm|layer\\.[3-9]|layer\\.1(?!1)|layer\\.2(?!3)'
81 | 
82 | # Writers arguments
83 | wrts:
84 |   model_log_level : 'INFO'
85 |   tb_log_dir : 'logs'
86 |   tb_logpath_parent_env : null
87 |   tb_log_multi : False
88 |   tb_log_hist_steps : 20000
-------------------------------------------------------------------------------- /pymarlin/plugins/hf_ner/data_classes.py: --------------------------------------------------------------------------------
1 | '''pymarlin.plugins.hf_ner.data_classes'''
2 | import dataclasses
3 | import pandas as pd
4 | 
5 | from torch.utils.data import Dataset
6 | from pymarlin.utils.logger.logging_utils import getlogger
7 | from pymarlin.core import data_interface
8 | 
9 | logger = getlogger(__name__, "DEBUG")
10 | 
11 | @dataclasses.dataclass
12 | class DataArguments:
13 |     train_filepath: str = None
14 |     val_filepath: str = None
15 |     labels_list: list = None
16 |     has_labels: bool = True
17 |     file_format: str = "tsv"
18 | 
19 | 
20 | class NERBaseDataset(Dataset):
21 |     def __init__(self, args, input_filepath):
22 |         self.input_filepath = input_filepath
23 |         self.args = args
24 | 
25 |         if self.args.file_format == "tsv":
26 |             sep = "\t"
27 |         else:
28 |             sep = ","
29 |         self.df = pd.read_csv(self.input_filepath, sep=sep).dropna()
30 | 
31 |     def __len__(self):
32 |         return len(self.df)
33 | 
34 |     def __getitem__(self, idx):
35 |         # Tokens and labels are space separated and must align one to one.
36 |         record = self.df.iloc[idx]
37 |         sent = record["Sentence"].split(" ")
38 |         label = record["Slot"].split(" ")
39 |         assert len(sent) == len(label)
40 |         return sent, label
41 | 
42 | class NERDataInterface(data_interface.DataInterface):
43 |     '''NER Data Interface'''
44 |     def __init__(self, args):
45 |         super().__init__()
46 |         self.args = args
47 |         self.train_dataset = []
48 |         self.val_dataset = []
49 |         self._set_args()
50 | 
51 |     def setup_datasets(self):
52 |         self.train_dataset = NERBaseDataset(self.args, self.args.train_filepath)
53 |         self.val_dataset = NERBaseDataset(self.args, self.args.val_filepath)
54 | 
55 |     def get_train_dataset(self):
56 |         return self.train_dataset
57 | 
58 |     def get_val_dataset(self):
59 |         return self.val_dataset
60 | 
61 |     def get_labels(self):
62 |         return self.args.labels_list
63 | 
64 |     def _set_args(self):
65 |         self.label_map = (
66 |             {label: i for i, label in enumerate(self.args.labels_list)}
67 |             if self.args.labels_list is not None
68 |             else None
69 |         )
70 |         logger.info(f"Labels map = {self.label_map}")
-------------------------------------------------------------------------------- /pymarlin/plugins/hf_ner/implementation.py: --------------------------------------------------------------------------------
1 | from typing import Optional, Dict
2 | 
3 | from pymarlin.utils.config_parser.custom_arg_parser import CustomArgParser
4 | 
5 | from .data_classes import DataArguments, NERDataInterface
6 | from .module_classes import NERModule, ModuleInterfaceArguments, ModelArguments
7 | 
8 | from pymarlin.plugins.base import Plugin
9 | from pymarlin.plugins.hfdistill_utils import build_distill_module, DistillationArguments
10 | from pymarlin.utils.logger.logging_utils import getlogger
11 | 
12 | logger = getlogger(__name__, "DEBUG")
13 | 
14 | 
15 | class HfNERPlugin(Plugin):
16 |     """Named Entity Recognition or Token Classification plugin for HuggingFace models.
17 | 
18 |     Constructing the plugin parses the config and sets up the data and module
19 |     interfaces; setup_trainer() then builds a fully set up PyMarlin trainer.
20 |     Example:
21 | 
22 |         plugin = HfNERPlugin()
23 |         plugin.setup_trainer()
24 |         trainer = plugin.trainer
25 |         trainer.train()
26 |         trainer.validate()
27 |     """
28 | 
29 |     def __init__(self, config: Optional[Dict] = None):
30 |         """CustomArgParser parses YAML config located at cmdline --config_path. If --config_path
31 |         is not provided, assumes the YAML file is named config.yaml and present in the working directory.
32 |         Instantiates dataclasses:
33 |             self.data_args (DataArguments): args required to initialize the NERDataInterface class
34 |             self.module_args (ModuleInterfaceArguments): args required to initialize the NERModule class
35 | 
36 |         Sets properties:
37 |             self.datainterface: data_interface.DataInterface [NERDataInterface] object,
38 |             used to read the raw data and create the token sequences that are fed to
39 |             HuggingFace AutoModelForTokenClassification models.
40 |             self.moduleinterface: module_interface.ModuleInterface [NERModule] object,
41 |             used to initialize a Marlin trainer.
42 |         """
43 |         super().__init__(config=config)
44 |         if config is None:
45 |             config = CustomArgParser(log_level="DEBUG").parse()
46 |         self.data_args = DataArguments(**config["data"])
47 |         self.module_args = ModuleInterfaceArguments(
48 |             **config["module"], model_args=ModelArguments(**config["model"])
49 |         )
50 |         self.distill_args = DistillationArguments(**config["distill"])
51 | 
52 |         self.datainterface = NERDataInterface(self.data_args)
53 |         self.datainterface.setup_datasets()
54 |         module_class = NERModule
55 | 
56 |         module_params = [self.module_args, self.datainterface]
57 | 
58 |         if self.distill_args.enable:
59 |             module_params = [self.distill_args] + module_params
60 |             module_class = build_distill_module(module_class)
61 | 
62 |         self.moduleinterface = module_class(*module_params)
-------------------------------------------------------------------------------- /pymarlin/plugins/hf_seq2seq/__init__.py: --------------------------------------------------------------------------------
1 | from .implementation import HfSeq2SeqPlugin
-------------------------------------------------------------------------------- /pymarlin/plugins/hf_seq2seq/data_classes.py: --------------------------------------------------------------------------------
1 | import sys
2 | import dataclasses
3 | import os
4 | import pandas as pd
5 | import torch
6 | from pymarlin.core import data_interface
7 | import matplotlib
8 | 
9 | matplotlib.use("Agg")  # comment this out on a local machine to see the plots
10 | import matplotlib.pyplot as plt
11 | 
12 | def get_source_target(path="D:/data/cnn_cln", stage="val"):
13 |     source = os.path.join(path, f"{stage}.source")
14 |     target = os.path.join(path, f"{stage}.target")
15 |     return source, target
16 | 
17 | 
18 | class AnalyzeProcessor(data_interface.DataProcessor):
19 |     def __init__(self, source, target):
20 |         with open(source, "r", encoding="UTF-8") as f:
21 |             self.source = f.readlines()
22 |         with open(target, "r", encoding="UTF-8") as f:
23 |             self.target = f.readlines()
24 | 
25 |     def process(self):
26 |         pass
27 | 
28 |     def analyze(self):
29 |         self.df = pd.DataFrame({"source": self.source, "target": self.target})
30 |         print(self.df.head())
31 |         print("Word length analysis:")
32 |         wordlengths = self.df.applymap(lambda x: len(x.split()))
33 |         print(wordlengths.describe())
34 |         plt.plot(wordlengths)
35 |         plt.legend(["source", "target"])
36 | 
37 | 
38 | class HfSeq2SeqDataset(torch.utils.data.Dataset):
39 |     def __init__(self, source, target):
40 |         with open(source, "r", encoding="UTF-8") as f:
41 |             self.source = f.readlines()
42 |         with open(target, "r", encoding="UTF-8") as f:
43 |             self.target = f.readlines()
44 |         print(
45 |             "len(self.source), len(self.target) = ", len(self.source), len(self.target)
46 |         )
47 | 
48 |     def __getitem__(self, i):
49 |         # print('len(self.source), len(self.target) = ',len(self.source), len(self.target))
50 |         return self.source[i].strip(), self.target[i].strip()
51 | 
52 |     def __len__(self):
53 |         return len(self.target)
54 | 
55 | 
56 | @dataclasses.dataclass
57 | class DataInterfaceArguments:
58 |     data_dir: str = None
59 | 
60 | 
61 | class HfSeq2SeqData(data_interface.DataInterface):
62 |     """
63 |     Class which expects input data to have different files for source and target.
64 |     Returns datasets which yield non-tokenized source and target text.
65 |     """
66 | 
67 |     def __init__(self, args: DataInterfaceArguments):
68 |         self.args = args
69 | 
70 |     def setup_datasets(self):
71 |         self.train_ds = HfSeq2SeqDataset(
72 |             *get_source_target(self.args.data_dir, "train")
73 |         )
74 |         self.val_ds = HfSeq2SeqDataset(*get_source_target(self.args.data_dir, "val"))
75 |         print("self.train_ds length = ", len(self.train_ds))
76 | 
77 |     def get_train_dataset(self, *args, **kwargs):
78 |         return self.train_ds
79 | 
80 |     def get_val_dataset(self, *args, **kwargs):
81 |         return self.val_ds
82 | 
83 | 
84 | if __name__ == "__main__":
85 |     root = sys.argv[1]  # e.g. 'D:/data/cnn_cln'
86 |     dm = HfSeq2SeqData(DataInterfaceArguments(data_dir=root))
87 |     print("Train")
88 |     dm.process_data(AnalyzeProcessor(*get_source_target(path=root, stage="train")))
89 |     print("Val")
90 |     dm.process_data(AnalyzeProcessor(*get_source_target(path=root, stage="val")))
91 |     plt.show()
92 | 
93 |     # dm.setup_datasets()
94 |     # ds = dm.get_train_dataset()
95 |     # len(ds),ds[0]
96 | 
--------------------------------------------------------------------------------
/pymarlin/plugins/hf_seq2seq/implementation.py:
--------------------------------------------------------------------------------
1 | import os
2 | import multiprocessing
3 | 
4 | from pymarlin.utils.config_parser.custom_arg_parser import CustomArgParser
5 | from pymarlin.utils.logger.logging_utils import getlogger
6 | 
7 | logger = getlogger(__name__, "DEBUG")
8 | 
9 | from pymarlin.core import data_interface, module_interface
10 | from pymarlin.core import trainer as trn
11 | 
12 | from pymarlin.plugins.base import Plugin
13 | from .data_classes import HfSeq2SeqData, DataInterfaceArguments
14 | from .module_classes import (
15 |     HfSeq2SeqModule,
16 |     ModuleInterfaceArguments,
17 |     ModelArguments,
18 |     GenerateArguments,
19 | )
20 | 
21 | 
22 | class HfSeq2SeqPlugin(Plugin):
23 |     """Plugin for Text Sequence to Sequence Generation using Huggingface models.
24 | 
25 |     plugin.setup() bootstraps the entire pipeline and returns a fully set up trainer.
26 |     Example:
27 | 
28 |         trainer = plugin.setup()
29 |         trainer.train()
30 |         trainer.validate()
31 | 
32 |     Alternatively, you can run `setup_datainterface`, `setup_module`, and `setup_trainer` individually.
33 |     Example:
34 | 
35 |         plugin.setup_datainterface()
36 |         plugin.setup_module()
37 |         trainer = plugin.setup_trainer()
38 |     """
39 | 
40 |     def __init__(self, config=None):
41 |         """Accepts optional config dictionary.
42 |         CustomArgParser parses YAML config located at cmdline --config_path. If --config_path
43 |         is not provided, assumes YAML file is named config.yaml and present in working directory.
44 |         Instantiates dataclasses:
45 |             self.data_args (arguments.DataInterfaceArguments): DataInterface arguments
46 |             self.module_args (arguments.ModuleInterfaceArguments): ModuleInterface arguments
47 |         Sets properties:
48 |             self.datainterface: data_interface.DataInterface [HfSeq2SeqData] object
49 |             self.moduleinterface: module_interface.ModuleInterface [HfSeq2SeqModule] object
50 |         """
51 |         super().__init__()
52 |         if config is None:
53 |             config = CustomArgParser(log_level="DEBUG").parse()
54 |         self.data_args = DataInterfaceArguments(**config["data"])
55 |         self.module_args = ModuleInterfaceArguments(
56 |             **config["module"],
57 |             model_args=ModelArguments(**config["model"]),
58 |             generate_args=GenerateArguments(**config["generate"])
59 |         )
60 |         # self.distill_args = DistillationArguments(**config['distill'])
61 | 
62 |     def setup_datainterface(self):
63 |         """Instantiates HfSeq2SeqData and calls `datainterface.setup_datasets()`.
64 | 
65 |         Assumptions:
66 |             Training and validation files are placed in the same directory, data_args.data_dir.
67 |             Accepted file format: parallel source/target text lines in data_args.data_dir/{train,val}.{source,target}
68 |         """
69 |         self.datainterface = HfSeq2SeqData(self.data_args)
70 |         self.datainterface.setup_datasets()
71 | 
72 |     def setup_module(self):
73 |         """Sets `HfSeq2SeqModule.data` property to `datainterface` which contains
74 |         the processed datasets. Assertion error is thrown if `datainterface` retrieves no train
75 |         or val data, indicating that `datainterface` hasn't been setup with processed data.
76 |         Sets the `HfSeq2SeqModule.model` property after initializing weights:
77 |             Option 1: Load weights from specified files mentioned in YAML config
78 |                 model:
79 |                     model_config_path
80 |                     model_config_file
81 |                     model_path
82 |                     model_file
83 |             Option 2: Load from Huggingface model hub, specify string in YAML config as:
84 |                 model:
85 |                     hf_model
86 |         """
87 |         # datainterface should contain the processed datasets
88 |         assert (
89 |             len(self.datainterface.get_train_dataset()) != 0
90 |             or len(self.datainterface.get_val_dataset()) != 0
91 |         )
92 |         self.moduleinterface = HfSeq2SeqModule(self.datainterface, self.module_args)
93 | 
94 |     def setup(self):
95 |         """Executes all the setup methods required to create a trn.Trainer object.
96 |         Trainer needs `moduleinterface`, and the backend is specified by self.trainer_args.backend.
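        Supported backend strings, per the trainer section of the plugin
        config files: "sp", "sp-amp", "ddp", and "ddp-amp".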
97 | """ 98 | self.setup_datainterface() 99 | self.setup_module() 100 | self.setup_trainer() 101 | -------------------------------------------------------------------------------- /pymarlin/plugins/hf_seq2seq/metric_utils.py: -------------------------------------------------------------------------------- 1 | from rouge_score import rouge_scorer 2 | 3 | """ Metric Functions """ 4 | 5 | 6 | def get_metric_func(metric_name): 7 | METRIC_MAP = {"rouge": rouge} 8 | return METRIC_MAP[metric_name] 9 | 10 | 11 | def rouge(preds, labels): 12 | # All Rouge scores for CNN/DailyMail 13 | scorer = rouge_scorer.RougeScorer( 14 | ["rouge1", "rouge2", "rougeL", "rougeLsum"], use_stemmer=True 15 | ) 16 | agg_scores = {} 17 | 18 | # sum up fmeasures 19 | for pred, ref in zip(preds, labels): 20 | scores = scorer.score(pred, ref) 21 | for key in scores: 22 | if key not in agg_scores: 23 | agg_scores[key] = 0 24 | agg_scores[key] += scores[key].fmeasure 25 | 26 | # and divide to average 27 | for key in agg_scores: 28 | agg_scores[key] /= len(preds) 29 | 30 | return agg_scores 31 | -------------------------------------------------------------------------------- /pymarlin/plugins/hf_seq_classification/__init__.py: -------------------------------------------------------------------------------- 1 | from .implementation import HfSeqClassificationPlugin 2 | -------------------------------------------------------------------------------- /pymarlin/plugins/hf_seq_classification/config.yaml: -------------------------------------------------------------------------------- 1 | # Using abreviated letter coding for group names. This will be used to parse these arguments when passed from command line. 2 | # Example usage in command-line: --tmod.max_lr 4E-5 3 | 4 | # data arguments 5 | data: 6 | train_filepath: null 7 | val_filepath: null 8 | file_format: "csv" 9 | header: 0 # file has a header at row 0 10 | text_a_col: "OriginalTweet" 11 | text_b_col: null # null in config file is equivalent to None 12 | label_col: "Sentiment" 13 | labels_list: ["Extremely Negative","Negative","Neutral","Positive","Extremely Positive"] # list of labels which will be mapped in order from 0 to 4 for the model 14 | 15 | # model arguments 16 | model: 17 | tokenizer_path: null 18 | hf_model: "roberta-base" 19 | encoder_key: "roberta" 20 | model_config_path: null # provide path to model config dir 21 | model_config_file: "config.json" 22 | model_path: null # provide path to model weights dir 23 | model_file: "pytorch_model.bin" 24 | 25 | # module_interface arguments 26 | module: 27 | metric: "acc_and_f1" 28 | max_lr : 0.00002 29 | warmup_prop: 0.1 30 | has_labels: True 31 | max_seq_len: 128 32 | 33 | # distill module arguments 34 | distill: 35 | enable: False 36 | # config_output_dir: null 37 | student_model_config_path: null 38 | student_model_config_file: null 39 | student_model_path: null 40 | student_model_file: null 41 | student_layers: [0,6,11] 42 | loss_types: ["logits"] 43 | loss_weights: [1] 44 | temperature: 1 45 | 46 | # trainer arguments 47 | trainer: 48 | backend: "sp" # sp, sp-amp, ddp, ddp-amp 49 | train_batch_size: 32 # Training global batch size. 50 | val_batch_size: 16 # Validation global batch size. 51 | epochs: 3 # Total epochs to run. 52 | gpu_batch_size_limit : 8 # Max limit for GPU batch size during training. 53 | clip_grads : True # Enable or disable clipping of gradients. 54 | use_gpu: True # Enable or disable use of GPU. 55 | max_grad_norm: 1.0 # Maximum value for gradient norm. 
56 | disable_tqdm: True 57 | log_level: "INFO" 58 | 59 | # Checkpointer arguments 60 | ckpt: 61 | checkpoint: True # Flag indicating whether to checkpoint model. 62 | delete_existing_checkpoints: False 63 | period: 1 # Period of epochs at which to checkpoint model. 64 | save_dir: 'marlin_states' # Path to directory where checkpoints are to be stored. 65 | model_state_save_dir: 'model_ckpts' 66 | file_prefix: 'marlin' # Prefix of the checkpoint filename. 67 | file_ext: 'bin' # File extension for the checkpoint. 68 | 69 | # Basic-Statistics arguments 70 | stats: 71 | log_steps: 5 72 | update_system_stats: False 73 | log_model_steps: 1000 74 | exclude_list: 'bias|LayerNorm|layer\\.[3-9]|layer\\.1(?!1)|layer\\.2(?!3)' 75 | 76 | # Writers arguments 77 | wrts: 78 | model_log_level : 'INFO' 79 | tb_log_dir : 'logs' 80 | tb_logpath_parent_env : null 81 | tb_log_multi : False 82 | tb_log_hist_steps : 20000 -------------------------------------------------------------------------------- /pymarlin/plugins/hf_seq_classification/data_classes.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | import pandas as pd 3 | import torch 4 | from torch.utils.data import Dataset 5 | 6 | from transformers import InputFeatures 7 | 8 | from pymarlin.core import data_interface 9 | from pymarlin.utils.logger.logging_utils import getlogger 10 | logger = getlogger(__name__, "DEBUG") 11 | 12 | 13 | @dataclasses.dataclass 14 | class DataArguments: 15 | train_filepath: str = None 16 | val_filepath: str = None 17 | labels_list: list = None 18 | file_format: str = "tsv" 19 | header: int = None 20 | text_a_col: int or str = None 21 | text_b_col: int or str = None 22 | label_col: int or str = None 23 | 24 | class HfSeqClassificationDataset(Dataset): 25 | """PyTorch Dataset.""" 26 | 27 | def __init__(self, args, input_filepath, label_map): 28 | """ 29 | Args: 30 | args: DataInterface arguments 31 | input_filepath (str): Path to dataset 32 | label_map (dict): Map categorical values to numerical 33 | """ 34 | self.args = args 35 | self.label_map = label_map 36 | if self.args.file_format == "json": 37 | self.df = pd.read_json(input_filepath, lines=True) 38 | elif self.args.file_format in ["tsv", "csv"]: 39 | if self.args.file_format == "tsv": 40 | sep = "\t" 41 | else: 42 | sep = "," 43 | self.df = pd.read_csv(input_filepath, sep=sep, header=self.args.header) 44 | 45 | def __len__(self): 46 | return len(self.df) 47 | 48 | def __getitem__(self, idx): 49 | record = self.df.iloc[idx] 50 | if self.label_map is not None: 51 | label = self.label_map[record[self.args.label_col]] 52 | else: 53 | label = float(record[self.args.label_col]) 54 | 55 | if self.args.text_b_col is not None: 56 | return record[self.args.text_a_col], record[self.args.text_b_col], label 57 | else: 58 | return record[self.args.text_a_col], label 59 | 60 | class HfSeqClassificationDataInterface(data_interface.DataInterface): 61 | """Retrieves train and val PyTorch Datasets.""" 62 | 63 | def __init__(self, args): 64 | """ 65 | Args: 66 | args (arguments.DataArguments): Dataclass 67 | """ 68 | super().__init__() 69 | self.args = args 70 | self.train_dataset = [] 71 | self.val_dataset = [] 72 | self._set_args() 73 | 74 | def _set_args(self): 75 | if self.args.file_format in ["tsv", "csv"]: 76 | if self.args.file_format == "tsv": 77 | sep = "\t" 78 | else: 79 | sep = "," 80 | if self.args.header is None: # Refer by column numbers 81 | self.args.text_a_col = int(self.args.text_a_col) 82 | if 
self.args.text_b_col:
83 |                 self.args.text_b_col = int(self.args.text_b_col)
84 |             self.args.label_col = int(self.args.label_col)
85 |         self.label_map = (
86 |             {label: i for i, label in enumerate(self.args.labels_list)}
87 |             if len(self.args.labels_list) > 1
88 |             else None
89 |         )
90 | 
91 |     def setup_datasets(self):
92 |         if self.args.train_filepath is not None:
93 |             self.train_dataset = HfSeqClassificationDataset(self.args, self.args.train_filepath, self.label_map)
94 |         if self.args.val_filepath is not None:
95 |             self.val_dataset = HfSeqClassificationDataset(self.args, self.args.val_filepath, self.label_map)
96 | 
97 |     def get_train_dataset(self):
98 |         return self.train_dataset
99 | 
100 |     def get_val_dataset(self):
101 |         return self.val_dataset
102 | 
103 |     def get_labels(self):
104 |         return self.args.labels_list
105 | 
--------------------------------------------------------------------------------
/pymarlin/plugins/hf_seq_classification/implementation.py:
--------------------------------------------------------------------------------
1 | from pymarlin.utils.config_parser.custom_arg_parser import CustomArgParser
2 | from pymarlin.core import data_interface, module_interface
3 | from pymarlin.plugins.base import Plugin
4 | from pymarlin.plugins.hfdistill_utils import build_distill_module, DistillationArguments
5 | 
6 | from .data_classes import (
7 |     HfSeqClassificationDataInterface,
8 |     DataArguments,
9 | )
10 | from .module_classes import (
11 |     HfSeqClassificationModule,
12 |     ModuleInterfaceArguments,
13 |     ModelArguments,
14 | )
15 | from typing import Optional, Dict
16 | 
17 | 
18 | class HfSeqClassificationPlugin(Plugin):
19 |     """Plugin for Text Sequence Classification using Huggingface models.
20 | 
21 | 
22 |     plugin.setup() bootstraps the entire pipeline and returns a fully set up trainer.
23 |     Example::
24 | 
25 |         trainer = plugin.setup()
26 |         trainer.train()
27 |         trainer.validate()
28 | 
29 |     Alternatively, you can run `setup_datainterface`, `setup_module`, and `setup_trainer` individually.
30 |     Example::
31 | 
32 |         plugin.setup_datainterface()
33 |         plugin.setup_module()
34 |         trainer = plugin.setup_trainer()
35 |     """
36 | 
37 |     def __init__(self, config: Optional[Dict] = None):
38 |         """CustomArgParser parses YAML config located at cmdline --config_path. If --config_path
39 |         is not provided, assumes YAML file is named config.yaml and present in working directory.
40 |         Instantiates dataclasses:
41 |             self.data_args (data_classes.DataArguments): Instantiated dataclass containing
42 |                 args required to initialize HfSeqClassificationDataInterface.
43 |             self.module_args (arguments.ModuleInterfaceArguments): Instantiated dataclass containing
44 |                 args required to initialize HfSeqClassificationModule class.
45 |             self.distill_args (arguments.DistillationArguments): Instantiated dataclass
46 |                 required to initialize DistillHfModule.
47 |                 Set self.distill_args.enable = True in config file to do knowledge distillation
48 |                 instead of regular training.
49 |         Sets properties:
50 |             self.datainterface: data_interface.DataInterface [HfSeqClassificationDataInterface] object
51 |             self.moduleinterface: module_interface.ModuleInterface [HfSeqClassificationModule] object.
52 |                 This is used to initialize a Marlin trainer.
53 | """ 54 | super().__init__(config=None) 55 | if config is None: 56 | config = CustomArgParser(log_level="DEBUG").parse() 57 | self.data_args = DataArguments(**config["data"]) 58 | self.module_args = ModuleInterfaceArguments( 59 | **config["module"], model_args=ModelArguments(**config["model"]) 60 | ) 61 | self.distill_args = DistillationArguments(**config["distill"]) 62 | 63 | def setup_datainterface(self): 64 | """Calls `datainterface.setup_datasets(train_data, val_data)`. 65 | 66 | Assumptions: 67 | Training and validation files are placed in separate directories. 68 | Accepted file formats: source/target text lines in data_args.data_dir/{train,val}.{source,targets} 69 | """ 70 | self.datainterface = HfSeqClassificationDataInterface(self.data_args) 71 | self.datainterface.setup_datasets() 72 | 73 | def setup_module(self): 74 | """Sets `HfSeqClassificationModule.data` property to `datainterface` which contains 75 | the processed datasets. Assertion error is thrown if `datainterface` retrieves no train 76 | or val data, indicating that `datainterface` hasn't been setup with processed data. 77 | Sets the `HfSeqClassificationModule.model` property after initializing weights: 78 | Option 1: Load weights from specified files mentioned in YAML config 79 | model: 80 | model_config_path 81 | model_config_file 82 | model_path 83 | model_file 84 | Option 2: Load from Huggingface model hub, specify string in YAML config as: 85 | model: 86 | hf_model 87 | """ 88 | # datainterface should contain the processed datasets 89 | assert ( 90 | len(self.datainterface.get_train_dataset()) != 0 91 | or len(self.datainterface.get_val_dataset()) != 0 92 | ) 93 | module_class = HfSeqClassificationModule 94 | module_params = [self.module_args, self.datainterface] 95 | if self.distill_args.enable: 96 | module_params = [self.distill_args] + module_params 97 | module_class = build_distill_module(module_class) 98 | self.moduleinterface = module_class(*module_params) 99 | 100 | def setup(self): 101 | """Executes all the setup methods required to create a trn.Trainer object. 102 | Trainer needs `moduleinterface` and backend is specified by self.trainer_args.backend. 
103 | """ 104 | self.setup_datainterface() 105 | self.setup_module() 106 | self.setup_trainer() -------------------------------------------------------------------------------- /pymarlin/plugins/hf_seq_classification/metric_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import math 4 | from scipy.stats import pearsonr, spearmanr 5 | from sklearn.metrics import ( 6 | matthews_corrcoef, 7 | f1_score, 8 | precision_score, 9 | recall_score, 10 | classification_report, 11 | accuracy_score, 12 | ) 13 | 14 | """ Metric Functions """ 15 | 16 | 17 | def get_metric_func(metric_name): 18 | METRIC_MAP = { 19 | "acc": simple_accuracy, 20 | "acc_and_f1": acc_and_f1, 21 | "pcc_and_scc": pearson_and_spearman, 22 | "mcc": mcc, 23 | } 24 | return METRIC_MAP[metric_name] 25 | 26 | 27 | def mcc(labels, preds): 28 | return {"mcc": matthews_corrcoef(labels, preds)} 29 | 30 | 31 | def simple_accuracy(labels, preds): 32 | return {"acc": accuracy_score(preds, labels)} 33 | 34 | 35 | def acc_and_f1(labels, preds, average="weighted", target_labels=None): 36 | f1 = f1_score(y_true=labels, y_pred=preds, average=average, labels=target_labels) 37 | precision = precision_score( 38 | y_true=labels, y_pred=preds, average=average, labels=target_labels 39 | ) 40 | recall = recall_score( 41 | y_true=labels, y_pred=preds, average=average, labels=target_labels 42 | ) 43 | metrics_dict = { 44 | "f1": f1, 45 | "precision": precision, 46 | "recall": recall, 47 | } 48 | metrics_dict.update(simple_accuracy(labels, preds)) 49 | return metrics_dict 50 | 51 | 52 | def pearson_and_spearman(labels, preds): 53 | pearson_corr = pearsonr(preds, labels)[0] 54 | spearman_corr = spearmanr(preds, labels)[0] 55 | return { 56 | "pearson": pearson_corr, 57 | "spearmanr": spearman_corr, 58 | "corr": (pearson_corr + spearman_corr) / 2, 59 | } 60 | -------------------------------------------------------------------------------- /pymarlin/plugins/plugin_module_interface.py: -------------------------------------------------------------------------------- 1 | '''plugin module interface''' 2 | import os 3 | from transformers import AutoTokenizer, AutoConfig 4 | from pymarlin.core import module_interface, data_interface 5 | 6 | class PluginModuleInterface(module_interface.ModuleInterface): 7 | '''Common plugin module interface to easily load Huggingface tokenizers and Configs''' 8 | def auto_setup(self, automodel_class): 9 | """Run all (tokenizer,config,model) setups""" 10 | self.setup_tokenizer() 11 | self.setup_model_config() 12 | self.setup_model(automodel_class) 13 | 14 | @property 15 | def data(self): 16 | """DataInterface object that is used to retrieve corresponding train or val dataset. 17 | 18 | Returns: 19 | data: DataInterface object with at least one of train or val data. 
20 | """ 21 | return self._data 22 | 23 | @data.setter 24 | def data(self, datainterface): 25 | assert isinstance(datainterface, data_interface.DataInterface) 26 | assert ( 27 | len(datainterface.get_train_dataset()) != 0 28 | or len(datainterface.get_val_dataset()) != 0 29 | ) 30 | self._data = datainterface 31 | 32 | @property 33 | def model(self): 34 | """Pytorch model.""" 35 | return self._model 36 | 37 | @model.setter 38 | def model(self, newmodel): 39 | self._model = newmodel 40 | 41 | def setup_tokenizer(self): 42 | """Initializes AutoTokenizer from 43 | model_args.tokenizer_path or model_args.hf_model string 44 | """ 45 | if self.args.model_args.tokenizer_path is not None: 46 | tokenizer = AutoTokenizer.from_pretrained( 47 | self.args.model_args.tokenizer_path 48 | ) 49 | else: 50 | tokenizer = AutoTokenizer.from_pretrained(self.args.model_args.hf_model) 51 | 52 | self.tokenizer = tokenizer 53 | 54 | def setup_model_config(self): 55 | """Initializes AutoConfig from 56 | model_args.model_config + model_args.model_config_file path or model_args.hf_model string 57 | """ 58 | if self.args.model_args.model_config_path is not None: 59 | model_config = AutoConfig.from_pretrained( 60 | os.path.join( 61 | self.args.model_args.model_config_path, 62 | self.args.model_args.model_config_file, 63 | ) 64 | ) 65 | else: 66 | model_config = AutoConfig.from_pretrained( 67 | self.args.model_args.hf_model 68 | ) 69 | 70 | model_config.num_labels = ( 71 | len(self.data.get_labels()) if hasattr(self.data, "get_labels") else None 72 | ) 73 | self.model_config = model_config 74 | 75 | def setup_model(self, automodel_class): 76 | """Initializes automodel_class arg by either: 77 | Option 1: Load weights from specified files mentioned in YAML config 78 | model: 79 | model_config_path 80 | model_config_file 81 | model_path 82 | model_file 83 | Option 2: Load from Huggingface model hub, specify string in YAML config as: 84 | model: 85 | hf_model 86 | 87 | Args: 88 | automodel_class: Huggingface AutoModelFor* class 89 | """ 90 | if ( 91 | self.args.model_args.model_path is not None 92 | and self.args.model_args.model_file is not None 93 | ): 94 | self.model = automodel_class.from_pretrained( 95 | os.path.join( 96 | self.args.model_args.model_path, self.args.model_args.model_file 97 | ), 98 | config=self.model_config, 99 | ) 100 | else: 101 | self.model = automodel_class.from_pretrained( 102 | self.args.model_args.hf_model, config=self.model_config 103 | ) 104 | -------------------------------------------------------------------------------- /pymarlin/utils/__init__.py: -------------------------------------------------------------------------------- 1 | '''Empty init file''' 2 | -------------------------------------------------------------------------------- /pymarlin/utils/checkpointer/__init__.py: -------------------------------------------------------------------------------- 1 | '''checkpointer utils''' 2 | from .checkpoint_utils import AbstractCheckpointer, DefaultCheckpointer, DefaultCheckpointerArguments 3 | -------------------------------------------------------------------------------- /pymarlin/utils/config_parser/__init__.py: -------------------------------------------------------------------------------- 1 | '''config parser''' 2 | -------------------------------------------------------------------------------- /pymarlin/utils/differential_privacy.py: -------------------------------------------------------------------------------- 1 | """Differential Privacy utils""" 2 | from typing import Optional 3 
4 | 
5 | @dataclass
6 | class DifferentialPrivacyArguments:
7 |     noise_multiplier: float = 1.0  # Scaling for the noise variance
8 |     per_sample_max_grad_norm: float = 1.0  # Clips the per sample gradients
9 |     sample_rate: float = 0.0  # Should be set as batch_size/number_of_samples (see doc for special cases)
10 |     delta: Optional[float] = None  # Typically set as o(1/number_of_samples), only required to calculate privacy budget (epsilon)
11 | 
12 | # Wrap any No-DP optimizer to distinguish from the DP optimizer
13 | # This is expected to be a very rare situation
14 | class NoDPWrap:
15 |     def __init__(self, optimizer):
16 |         self.optimizer = optimizer
--------------------------------------------------------------------------------
/pymarlin/utils/distributed.py:
--------------------------------------------------------------------------------
1 | """distributed utils"""
2 | import os
3 | from dataclasses import dataclass
4 | from typing import Optional
5 | from functools import wraps
6 | from azureml.core.run import Run
7 | import torch
8 | 
9 | @dataclass
10 | class DistributedTrainingArguments:
11 |     local_rank: int = 0
12 |     global_rank: int = 0
13 |     world_size: int = 1
14 |     backend: str = "nccl"
15 |     init_method: str = "env://"
16 |     gather_frequency: Optional[int] = None
17 | 
18 | @dataclass
19 | class DistributedPreprocessArguments:
20 |     local_rank: int = 0
21 |     global_rank: int = 0
22 |     world_size: int = 1
23 |     node_count: int = 1
24 |     local_size: int = 1
25 |     node_rank: Optional[int] = None
26 | 
27 | class SequentialDistributedSampler(torch.utils.data.distributed.DistributedSampler):
28 |     def __init__(self, dataset, num_replicas=None, rank=None, seed=0, drop_last=False, **kwargs):
29 |         super().__init__(dataset, shuffle=False, num_replicas=num_replicas, rank=rank, seed=seed, drop_last=drop_last, **kwargs)
30 | 
31 | def ranks_already_set(args) -> bool:
32 |     """Return True if both local and global ranks have been set."""
33 |     is_local_rank_set = args.local_rank > -1
34 |     is_global_rank_set = args.global_rank > -1
35 |     return is_local_rank_set and is_global_rank_set
36 | 
37 | def fetch_ranks_from_azureml_preprocess():
38 |     """Look up distributed arguments from Azure ML environment variables.
39 | 
40 |     Assumes OpenMPI image.
41 | 
42 |     Note:
43 |         Combine with set_environment_variables_for_nccl_backend() to set the
44 |         NCCL environment variables used by Azure ML:
45 |         - NCCL_SOCKET_IFNAME
46 |         - NCCL_IB_DISABLE
47 |     """
48 |     ranks = DistributedPreprocessArguments()
49 | 
50 |     run = Run.get_context()
51 |     run.get_status()
52 |     ranks.node_count = run.get_details()['runDefinition']['nodeCount']
53 |     ranks.local_size = run.get_details()['runDefinition']['mpi']['processCountPerNode']
54 | 
55 |     ranks.local_rank = int(os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK"))
56 |     ranks.global_rank = int(os.environ.get("OMPI_COMM_WORLD_RANK"))
57 |     ranks.world_size = int(os.environ.get("OMPI_COMM_WORLD_SIZE"))
58 | 
59 |     return ranks
60 | 
61 | def fetch_ranks_from_azureml():
62 |     """Look up distributed arguments from Azure ML environment variables.
63 | 
64 |     Assumes OpenMPI image.
65 | 
66 |     Note:
67 |         Combine with set_environment_variables_for_nccl_backend() to set the
68 |         NCCL environment variables used by Azure ML:
69 |         - NCCL_SOCKET_IFNAME
70 |         - NCCL_IB_DISABLE
71 |     """
72 |     ranks = DistributedTrainingArguments()
73 |     ranks.local_rank = int(os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK"))
74 |     ranks.global_rank = int(os.environ.get("OMPI_COMM_WORLD_RANK"))
75 |     ranks.world_size = int(os.environ.get("OMPI_COMM_WORLD_SIZE"))
76 |     return ranks
77 | 
78 | 
79 | def fetch_ranks_from_torch_distributed_launch():
80 |     """Read distributed arguments set by torch.distributed.launch via environment variables."""
81 |     ranks = DistributedTrainingArguments()
82 |     ranks.local_rank = int(os.environ["LOCAL_RANK"])
83 |     ranks.global_rank = int(os.environ["RANK"])
84 |     ranks.world_size = int(os.environ["WORLD_SIZE"])
85 |     return ranks
86 | 
87 | 
88 | def set_environment_variables_for_nccl_backend():
89 |     """Sets distributed training environments for azureml openmpi runs with NCCL backend."""
90 | 
91 |     # NCCL environment. Still works without it.
92 |     os.environ["NCCL_SOCKET_IFNAME"] = "^docker0,lo"
93 |     os.environ["NCCL_IB_DISABLE"] = "0"  # for IB
94 | 
95 |     master_node = os.environ["AZ_BATCHAI_MPI_MASTER_NODE"]
96 |     master_port = "54965"
97 | 
98 |     # set env variables
99 |     os.environ["MASTER_ADDR"] = master_node
100 |     os.environ["MASTER_PORT"] = master_port
101 | 
102 | 
103 | def rank_zero_only(fn):
104 |     """Decorates functions to only execute on global rank 0, else wait via torch.distributed"""
105 | 
106 |     @wraps(fn)
107 |     def wrapped_fn(*args, **kwargs):
108 |         if rank_zero_only.rank == 0:
109 |             res = fn(*args, **kwargs)
110 |             if torch.distributed.is_initialized():
111 |                 torch.distributed.barrier()
112 |             return res
113 |         else:
114 |             if torch.distributed.is_initialized():
115 |                 torch.distributed.barrier()
116 | 
117 |     return wrapped_fn
118 | rank_zero_only.rank = 0  # by default
--------------------------------------------------------------------------------
/pymarlin/utils/fabrics.py:
--------------------------------------------------------------------------------
1 | """Compute fabric specific utility methods."""
2 | import os
3 | import importlib.util
4 | 
5 | 
6 | def is_azureml_mpirun() -> bool:
7 |     """Check if the run was set up by azureml using an OpenMPI image.
8 | 
9 |     When running MPIRUN with OpenMPI images, AzureML sets a specific combination
10 |     of environment variables which we check for here, specifically::
11 | 
12 |         OMPI_COMM_WORLD_RANK  # the rank of the process
13 |         OMPI_COMM_WORLD_SIZE  # the world size
14 |         OMPI_COMM_WORLD_LOCAL_RANK  # the local rank of the process on the node
15 |         OMPI_COMM_WORLD_LOCAL_SIZE  # number of processes on the node
16 | 
17 |     and one of the following::
18 | 
19 |         AZ_BATCH_MASTER_NODE  # multiple nodes
20 |         AZ_BATCHAI_MPI_MASTER_NODE  # single node
21 |     """
22 |     is_openmpi_image: bool = (
23 |         "OMPI_COMM_WORLD_RANK" in os.environ
24 |         and "OMPI_COMM_WORLD_SIZE" in os.environ
25 |         and "OMPI_COMM_WORLD_LOCAL_RANK" in os.environ
26 |         and "OMPI_COMM_WORLD_LOCAL_SIZE" in os.environ
27 |     )
28 | 
29 |     is_azureml_mpirun_env: bool = (
30 |         "AZ_BATCH_MASTER_NODE" in os.environ
31 |         or "AZ_BATCHAI_MPI_MASTER_NODE" in os.environ
32 |     )
33 | 
34 |     return bool(is_openmpi_image and is_azureml_mpirun_env)
35 | 
36 | 
37 | def is_torch_distributed_launch_via_environment_variables() -> bool:
38 |     """Check if torch.distributed.launch was used to submit the job, based on environment variables."""
39 | 
40 |     env_vars = os.environ
41 |     is_using_environment_vars: bool = (
42 |         "RANK" in env_vars
43 |         and "MASTER_ADDR" in env_vars
44 |         and "MASTER_PORT" in env_vars
45 |         and "WORLD_SIZE" in env_vars
46 |     )
47 | 
48 |     return is_using_environment_vars
49 | 
50 | 
51 | def is_azureml_run_with_sdk() -> bool:
52 |     """Check if we are running on Azure ML with azureml-sdk."""
53 |     if not _is_azureml_available():
54 |         print("Unable to import azureml sdk.")
55 |         return False
56 | 
57 |     import azureml.core.run
58 | 
59 |     run = azureml.core.run.Run.get_context()
60 |     is_azureml_run = False
61 | 
62 |     try:
63 |         run.get_status()
64 |         is_azureml_run = True
65 |     except AttributeError:
66 |         print("This is not an Azure ML run")
67 | 
68 |     return is_azureml_run
69 | 
70 | 
71 | def _is_azureml_available() -> bool:
72 |     """Check sys.modules to see if azureml.core.run is available.
73 |     See https://github.com/huggingface/transformers/blob/02e05fb0a532e572b56ba75dad6ba3db625bbdeb/src/transformers/integrations.py#L81
74 |     """
75 |     if importlib.util.find_spec("azureml") is None:
76 |         return False
77 |     if importlib.util.find_spec("azureml.core") is None:
78 |         return False
79 |     return importlib.util.find_spec("azureml.core.run") is not None
--------------------------------------------------------------------------------
/pymarlin/utils/logger/__init__.py:
--------------------------------------------------------------------------------
1 | ''' logging utils '''
2 | from .logging_utils import getlogger
--------------------------------------------------------------------------------
/pymarlin/utils/logger/logging_utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Logging util module
3 | """
4 | import logging
5 | 
6 | # create console handler for pymarlin format
7 | console_handler = logging.StreamHandler()
8 | formatter = logging.Formatter('%(asctime)s:%(levelname)s : %(name)s : %(lineno)d : %(message)s')
9 | console_handler.setFormatter(formatter)
10 | 
11 | def getlogger(name, log_level='INFO'):
12 |     """
13 |     This method returns a logger object to be used by the calling class.
14 |     The logger object returned has the following format for all the logs:
15 |     '%(asctime)s:%(levelname)s : %(name)s : %(lineno)d : %(message)s'
16 | 
17 |     Args:
18 |         name (str): Name of the logger, typically the calling module's __name__.
19 |         log_level (str, optional): Logging level for the logger, e.g. 'INFO' or 'DEBUG'.
20 |             Defaults to 'INFO'.
21 | 
22 |     Returns:
23 |         logger (object): logger object to use for logging.
24 |     """
25 |     logger = logging.getLogger(name)
26 |     logger.handlers = [console_handler]
27 |     logger.setLevel(log_level)
28 |     return logger
29 | 
30 | if __name__ == '__main__':
31 |     # pylint: disable=pointless-string-statement
32 |     """
33 |     Running this command: "python logging_utils.py" will print the following to console (timestamps elided):
34 |         logging level for logger1 is INFO
35 |         logging level for logger2 is DEBUG
36 |         :ERROR: logger1 : 34: hello printing error message here for l1
37 |         :ERROR: logger2 : 35: hello printing error message here for l2
38 |         :DEBUG: logger2 : 36: hello printing debug message here for l2
39 |         :INFO: logger2 : 37: hello printing info message here for l2
40 |     """
41 |     l1 = getlogger('logger1')
42 |     l2 = getlogger('logger2', log_level='DEBUG')
43 |     l1.error('hello printing error message here for l1')
44 |     l2.error('hello printing error message here for l2')
45 |     l2.debug('hello printing debug message here for l2')
46 |     l2.info('hello printing info message here for l2')
47 | 
--------------------------------------------------------------------------------
/pymarlin/utils/misc/__init__.py:
--------------------------------------------------------------------------------
1 | '''Empty init file'''
2 | 
--------------------------------------------------------------------------------
/pymarlin/utils/misc/misc_utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Miscellaneous utility functions
3 | """
4 | 
5 | from functools import wraps
6 | import os
7 | import re
8 | import shutil
9 | from pymarlin.utils.logger.logging_utils import getlogger
10 | 
11 | #https://docs.python.org/2/library/functools.html#functools.wraps
12 | 
13 | def snake2camel(name):
14 |     """
15 |     This method changes input name from snake format to camel format.
16 | 
17 |     Args:
18 |         name (str): snake format input name.
19 | 
20 |     Returns:
21 |         name (str): camel format input name.
22 | 
23 |     """
24 |     return re.sub(r'(?:^|_)([a-z])', lambda x: x.group(1).upper(), name)
25 | 
26 | def clear_dir(path, skips=None):
27 |     """
28 |     This method deletes the contents of the directory for which path
29 |     has been provided and which are not included in the skips list.
30 | 
31 |     Args:
32 |         path (str): Path for directory to be deleted.
33 |         skips (List[str], optional): List of paths for sub directories to be skipped from deleting.
34 | 
35 |     """
36 |     if os.path.isdir(path):
37 |         with os.scandir(path) as path_iter:
38 |             for entry in path_iter:
39 |                 if skips and entry.path in skips:
40 |                     continue
41 |                 try:
42 |                     if entry.is_file() or entry.is_symlink():
43 |                         os.remove(entry.path)
44 |                     else:
45 |                         shutil.rmtree(entry.path)
46 |                 except PermissionError:
47 |                     getlogger(__name__).warning(f"could not delete path: {entry.path}")
48 | 
49 | def debug(method):
50 |     """
51 |     This decorator wraps the input method with DEBUG logs marking when the
52 |     method starts and when it finishes.
53 | 
54 |     Args:
55 |         method (function): Method to be wrapped with entry/exit logs.
56 | 
57 |     Returns:
58 |         debugged (method): debugged function.
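
        Example (a sketch)::

            @debug
            def load_model():
                ...

        The wrapped call logs "Inside method: load_model" before executing and
        "Finished method: load_model" once it returns, both at DEBUG level.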
59 | 60 | """ 61 | @wraps(method) 62 | def debugged(*args, **kw): 63 | logger = getlogger(__name__) 64 | logger.debug('Inside method: %s', method.__name__) 65 | result = method(*args, **kw) 66 | logger.debug('Finished method: %s', method.__name__) 67 | return result 68 | return debugged 69 | -------------------------------------------------------------------------------- /pymarlin/utils/stats/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Stats package. 3 | ''' 4 | from .basic_stats import BasicStats, StatInitArguments 5 | #singleton object 6 | global_stats = BasicStats(StatInitArguments(), writers=[]) 7 | -------------------------------------------------------------------------------- /pymarlin/utils/writer/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Writers package. 3 | """ 4 | from pymarlin.utils.logger.logging_utils import getlogger 5 | from .base import WriterInitArguments 6 | from .aml import Aml 7 | from .stdout import Stdout 8 | from .tensorboard import Tensorboard 9 | logger = getlogger(__name__) 10 | 11 | def build_writer(writer, args: WriterInitArguments): 12 | """ 13 | Initializes and returns writer object based on writer type. 14 | """ 15 | logger.debug(f'Building Writer {writer}') 16 | if writer == 'stdout': 17 | return Stdout(args) 18 | if writer == 'aml': 19 | return Aml() 20 | if writer == 'tensorboard': 21 | return Tensorboard(args) 22 | logger.error(f'Error initializing writer {writer}') 23 | raise Exception(f"Invalid writer type:{writer} requested.") 24 | -------------------------------------------------------------------------------- /pymarlin/utils/writer/aml.py: -------------------------------------------------------------------------------- 1 | """ 2 | AML writer module. 3 | """ 4 | from pymarlin.utils.logger.logging_utils import getlogger 5 | from .base import Writer 6 | 7 | class Aml(Writer): 8 | """ 9 | This class implements the Azure ML writer for stats. 10 | """ 11 | def __init__(self): 12 | super().__init__(getlogger(__name__)) 13 | self.run = None 14 | try: 15 | from azureml.core.run import Run 16 | self.run = Run.get_context() 17 | self.logger.info(self.run.get_status()) 18 | except Exception: # pylint: disable=broad-except 19 | self.run = None 20 | self.logger.warning('AML writer failed to initialize.') 21 | self.logger.info(f'run = {self.run}') 22 | 23 | def log_scalar(self, k, v, step): 24 | """ 25 | Log metric to AML. 26 | """ 27 | kwargs = { 28 | 'global_step': step, 29 | k: v 30 | } 31 | if self.run is not None: 32 | self.run.log_row(k, **kwargs) 33 | 34 | def log_multi(self, k, v, step): 35 | """ 36 | Log metrics to stdout. 37 | """ 38 | for key, val in v.items(): 39 | key = k+'/'+key 40 | self.log_scalar(key, val, step) 41 | -------------------------------------------------------------------------------- /pymarlin/utils/writer/base.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base class for Writers 3 | """ 4 | from abc import ABC 5 | import dataclasses 6 | 7 | @dataclasses.dataclass 8 | class WriterInitArguments: 9 | """ 10 | Writer Arguments. 11 | """ 12 | tb_log_dir: str = 'logs' 13 | tb_logpath_parent_env: str = None 14 | tb_log_multi: bool = False 15 | tb_log_hist_steps: int = 20000 16 | model_log_level: str = 'INFO' 17 | 18 | class Writer(ABC): 19 | """ 20 | Abstract Base class for Writers. 
21 | """ 22 | def __init__(self, logger): 23 | self.logger = logger 24 | 25 | def log_scalar(self, k, v, step): 26 | pass 27 | 28 | def log_multi(self, k, v, step): 29 | pass 30 | 31 | def log_model(self, flat_weights, flat_grads, step): 32 | pass 33 | 34 | def log_args(self, args): 35 | pass 36 | 37 | def log_graph(self, model, device=None): 38 | pass 39 | 40 | def log_image(self, k, v, step, dataformats='HW'): 41 | pass 42 | 43 | def log_pr_curve(self, k, preds, labels, step): 44 | pass 45 | 46 | def log_histogram(self, param_name, vals, step): 47 | pass 48 | 49 | def log_embedding(self, tag, mat, labels, step): 50 | pass 51 | 52 | def _log_norms(self, step, param_name, weight_norm, grad_norm): 53 | pass 54 | 55 | def log_multi_line(self, string): 56 | lines = string.split('\n') 57 | for line in lines: 58 | self.logger.info(line) 59 | 60 | def flush(self): 61 | pass 62 | 63 | def finish(self): 64 | pass 65 | -------------------------------------------------------------------------------- /pymarlin/utils/writer/stdout.py: -------------------------------------------------------------------------------- 1 | """ 2 | Stdout writer module. 3 | """ 4 | from pymarlin.utils.logger.logging_utils import getlogger 5 | from .base import Writer, WriterInitArguments 6 | 7 | class Stdout(Writer): 8 | """ 9 | This class implements the stdout writer for stats. 10 | """ 11 | def __init__(self, args: WriterInitArguments): 12 | super().__init__(getlogger(__name__)) 13 | self.args = args 14 | 15 | def log_scalar(self, k, v, step): 16 | """ 17 | Log metric to stdout. 18 | """ 19 | self.logger.info(f'step = {step}, {k} : {v}') 20 | 21 | def log_multi(self, k, v, step): 22 | """ 23 | Log metric to stdout. 24 | """ 25 | self.logger.info(f'step = {step}, {k} : {v}') 26 | 27 | def log_model(self, flat_weights, flat_grads, step): 28 | """ 29 | Log model to stdout. 30 | Can slow down training. Only use for debugging. 31 | It's logged in Tensorboard by default. 32 | """ 33 | if self.args.model_log_level == 'DEBUG': 34 | for name in flat_weights: 35 | weight_norm = flat_weights[name].norm().item() 36 | grad_norm = None 37 | if name in flat_grads: 38 | grad_norm = flat_grads[name].norm().item() 39 | self._log_norms(step, name, weight_norm, grad_norm) 40 | 41 | def log_graph(self, model, device=None): 42 | """ 43 | Log model graph to stdout. 
44 | """ 45 | self.logger.debug('Logging model graph') 46 | self.log_multi_line(str(model)) 47 | 48 | def _log_norms(self, step, param_name, weight_norm, grad_norm): 49 | self.logger.debug(f'step = {step} , {param_name} : weight_norm = {weight_norm}, grad_norm = {grad_norm}') 50 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=42", 4 | "wheel" 5 | ] 6 | build-backend = "setuptools.build_meta" -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | filterwarnings = 3 | ignore::DeprecationWarning:tensorboard.* 4 | ignore::DeprecationWarning:pywintypes.* -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open("README.md", "r") as fh: 4 | long_description = fh.read() 5 | 6 | required = ['torch<=1.9.1','tqdm','tensorboard', 'Pillow','azureml-core','pyyaml','pandas'] 7 | extras = { 8 | 'dev': ['pylint', 'pytest', 'pytest-cov'], 9 | 'plugins': ['transformers','pandas','matplotlib','sklearn','scipy','rouge-score'] 10 | } 11 | 12 | setup( 13 | name="pymarlin", 14 | version="0.3.6", 15 | author="ELR Team", 16 | author_email="elrcore@microsoft.com", 17 | description="Lightweight Deeplearning Library", 18 | long_description=long_description, 19 | long_description_content_type = "text/markdown", 20 | url="https://microsoft.github.io/PyMarlin/", 21 | packages=find_packages(), 22 | classifiers=[ 23 | "Programming Language :: Python :: 3", 24 | "License :: OSI Approved :: MIT License", 25 | "Operating System :: OS Independent", 26 | ], 27 | install_requires=required, 28 | extras_require=extras, 29 | python_requires=">=3.6", 30 | ) 31 | # https://packaging.python.org/discussions/install-requires-vs-requirements/ -------------------------------------------------------------------------------- /tests/core/test_data_interface.py: -------------------------------------------------------------------------------- 1 | """Test module for data_interface""" 2 | 3 | import os 4 | from dataclasses import dataclass 5 | import unittest 6 | import pandas as pd 7 | from torch.utils.data import Dataset 8 | from pymarlin.core.data_interface import DataInterface, DataProcessor 9 | 10 | @dataclass 11 | class MyArgs: 12 | filepath_train: str = os.path.join("outputs", "file1.csv") 13 | filepath_test: str = os.path.join("outputs", "file2.csv") 14 | text_field: str = "text" 15 | label_field: str = "label" 16 | 17 | 18 | class MyDataset(Dataset): 19 | def __init__(self, df, text_field, label_field): 20 | self.df = df 21 | self.text_field = text_field 22 | self.label_field = label_field 23 | 24 | def __len__(self): 25 | return len(self.df) 26 | 27 | def __getitem__(self, idx): 28 | row = self.df.iloc[idx] 29 | return (row[self.text_field], row[self.label_field]) 30 | 31 | 32 | class MyData(DataInterface): 33 | 34 | def __init__(self): 35 | self._train_ds = None 36 | self._val_ds = None 37 | 38 | def setup_datasets(self, train_ds, val_ds): 39 | self._train_ds = train_ds 40 | self._val_ds = val_ds 41 | 42 | def get_train_dataset(self): 43 | return self._train_ds 44 | 45 | def get_val_dataset(self): 46 | return self._val_ds 47 | 48 | 49 | 
49 | class MyDataProcessor(DataProcessor):
50 | 
51 |     def __init__(self, args):
52 |         super().__init__()
53 |         self.args = args
54 | 
55 |     def process(self):
56 |         df = pd.read_csv(self.args.filepath_train)
57 |         return MyDataset(df, self.args.text_field,
58 |                          self.args.label_field)
59 | 
60 | 
61 | class MyDataMultiProcessor(DataProcessor):
62 | 
63 |     def __init__(self, args):
64 |         super().__init__()
65 |         self.args = args
66 | 
67 |     def process(self, filename):
68 |         df = pd.read_csv(filename)
69 |         return MyDataset(df, self.args.text_field,
70 |                          self.args.label_field)
71 | 
72 | 
73 | class TestDataInterface(unittest.TestCase):
74 | 
75 |     def setUp(self):
76 | 
77 |         self.args = MyArgs()
78 |         self.data_interface = MyData()
79 |         self.data_processor = MyDataProcessor(self.args)
80 |         self.data_multiprocessor = MyDataMultiProcessor(self.args)
81 | 
82 |         if not os.path.exists("outputs"):
83 |             os.makedirs("outputs")
84 | 
85 |     def tearDown(self):
86 |         for f in os.listdir("outputs"):
87 |             if f.endswith(".csv"):
88 |                 os.remove(os.path.join("outputs", f))
89 | 
90 |     def test_process_data(self):
91 |         df = pd.DataFrame({self.args.text_field: ['one', 'two'],
92 |                            self.args.label_field: [1, 2]})
93 |         df.to_csv(self.args.filepath_train)
94 | 
95 |         train_ds = self.data_processor.process_data()
96 |         assert train_ds[0] == ('one', 1)
97 |         assert len(train_ds) == 2
98 | 
99 |     def test_multi_process_data(self):
100 |         df1 = pd.DataFrame({self.args.text_field: ['one', 'two'],
101 |                             self.args.label_field: [1, 2]})
102 |         df2 = pd.DataFrame({self.args.text_field: ['three', 'four', 'five'],
103 |                             self.args.label_field: [3, 4, 5]})
104 |         df1.to_csv(self.args.filepath_train)
105 |         df2.to_csv(self.args.filepath_test)
106 | 
107 |         train_ds_list = self.data_multiprocessor.multi_process_data(
108 |             [self.args.filepath_train, self.args.filepath_test],
109 |             process_count=2)
110 |         assert train_ds_list[0] == ('one', 1)
111 |         assert train_ds_list[2] == ('three', 3)
112 |         assert len(train_ds_list) == 5
113 | 
114 |     def test_collect_params(self):
115 |         a_number = 1
116 |         single_argument = 'single_argument'
117 |         list_to_split = ['first', 'second', 'third']
118 |         a_dict = {'test': 'dict'}
119 |         list_not_to_split = [['this', 'lists', 'elements', 'dont', 'split']]
120 |         self.data_processor._set_ranks()
121 |         list_params = self.data_processor._collect_params(
122 |             a_number,
123 |             single_argument,
124 |             list_to_split,
125 |             a_dict,
126 |             list_not_to_split
127 |         )
128 |         assert len(list_params) == 5
129 |         assert list_params[2][0] == list_to_split[0]
130 |         assert list_params[2][1] == list_to_split[1]
131 |         assert list_params[2][2] == list_to_split[2]
132 | 
133 |         for param in list_params[1]:
134 |             assert param == single_argument
135 | 
136 |         for param in list_params:
137 |             assert len(param) == 3
138 | 
--------------------------------------------------------------------------------
/tests/core/test_trainer_backend.py:
--------------------------------------------------------------------------------
1 | """Test module for trainer_backend"""
2 | 
3 | import unittest
4 | from unittest import mock
5 | import torch
6 | import pytest
7 | from pymarlin.core import module_interface, trainer_backend
8 | from pymarlin.utils.distributed import DistributedTrainingArguments
9 | #https://docs.python.org/3/library/unittest.mock.html
10 | 
11 | class TestSingleProcess(unittest.TestCase):
12 |     def setUp(self):
13 |         self.trainer_backend = trainer_backend.SingleProcess()
14 |         self.mock_module = mock.MagicMock(spec = module_interface.ModuleInterface)
15 |         # make x^2 as loss
16 |         # self.x = torch.Tensor([1])
17 |         # self.x.requires_grad = True
18 |         # self.loss = self.x*self.x
19 |         self.loss = torch.randn(1, requires_grad=True)
20 |         self.mock_module.forward = mock.MagicMock(return_value = [self.loss])
21 | 
22 |         self.mock_scheduler = mock.MagicMock()
23 | 
24 |         self.mock_optimizer = mock.MagicMock(spec = torch.optim.Optimizer)
25 |         self.trainer_backendArgs = trainer_backend.TrainerBackendArguments(
26 |             model=self.mock_module,
27 |             device='cpu',
28 |             train_batch_size=1,
29 |             max_train_steps_per_epoch=1,
30 |             max_val_steps_per_epoch=1,
31 |             distributed_training_args=DistributedTrainingArguments(),
32 |             optimizers=[self.mock_optimizer],
33 |             schedulers=[self.mock_scheduler],
34 |             gradient_accumulation=1,
35 |             clip_grads=False,
36 |         )
37 | 
38 |         self.trainer_backend.init(self.trainer_backendArgs)
39 | 
40 |         self.mock_callback = mock.MagicMock()
41 |         self.mock_dataloader = [mock.MagicMock()]*10
42 | 
43 |     def test_train_dl(self):
44 | 
45 |         # make x^2 as loss
46 |         x = torch.Tensor([1])
47 |         x.requires_grad = True
48 |         loss = x*x
49 |         self.mock_module.forward = mock.MagicMock(return_value = [loss])
50 | 
51 | 
52 |         self.trainer_backend.train_dl(self.mock_dataloader, self.mock_callback)
53 | 
54 | 
55 |         # test forward
56 |         self.mock_module.forward.assert_called_once_with(
57 |             stage = module_interface.Stage.TRAIN,
58 |             batch = self.mock_dataloader[0],
59 |             device = 'cpu',
60 |             global_step = 1)
61 |         print(self.mock_module.forward.return_value)
62 |         # test backward
63 |         assert x.grad == 2 *x
64 |         # test optimization
65 |         self.mock_optimizer.step.assert_called_once()
66 |         self.mock_optimizer.zero_grad.assert_called_once()
67 |         #test callback
68 |         self.mock_callback.on_end_train_step.assert_called_once()
69 |         self.mock_callback.on_end_train_step.assert_called_with( 1, loss.detach())
70 | 
71 |     def test_eval_dl(self):
72 | 
73 |         self.trainer_backend.validate_dl(self.mock_dataloader)
74 | 
75 |         # test forward
76 |         self.mock_module.forward.assert_called_once_with(
77 |             stage = module_interface.Stage.VAL,
78 |             batch = self.mock_dataloader[0],
79 |             device = 'cpu',
80 |             global_step = 0 )
81 | 
82 |     def test_gradient_accumulation(self):
83 |         self.trainer_backend.args.gradient_accumulation = 2
84 |         self.trainer_backend.train_dl(self.mock_dataloader, self.mock_callback)
85 |         assert self.mock_module.forward.call_count == 2
86 |         assert self.mock_optimizer.step.call_count == 1
87 |         assert self.mock_optimizer.zero_grad.call_count == 1
88 | 
89 |     def test_gradient_clipping(self):
90 |         # make x^2 as loss
91 | 
92 | 
93 |         self.trainer_backend.args.clip_grads = True
94 |         self.trainer_backend.args.max_grad_norm = 1
95 | 
96 |         for val in range(-10, 10):
97 |             x = torch.Tensor([val])
98 |             x.requires_grad = True
99 |             loss = x*x
100 |             self.mock_module.parameters = mock.MagicMock(return_value = [x])
101 |             self.mock_module.forward = mock.MagicMock(return_value = [loss])
102 |             self.trainer_backend.train_dl(self.mock_dataloader, self.mock_callback)
103 | 
104 |             assert min(0, 2*val -1) < x.grad.item() <= 1
105 | 
106 |     def test_output_collection(self):
107 | 
108 |         self.trainer_backendArgs.max_train_steps_per_epoch = 2
109 |         self.trainer_backend.args.gradient_accumulation = 2
110 | 
111 |         losses = [torch.randn(1, requires_grad=True).squeeze(), torch.randn(1, requires_grad=True).squeeze()] * 2
112 |         labels = [torch.randint(0,10, size = (4,3)), torch.randint(0,10, size = (3,3))] * 2
113 |         # guids = range(4)
114 |         self.mock_module.forward = mock.MagicMock()
115 |         self.mock_module.forward.side_effect = zip(losses, labels)#, guids)
116 | 
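        # Each forward() call consumes the next (loss, labels) pair from the
        # side_effect iterator; with max_train_steps_per_epoch=2 and
        # gradient_accumulation=2, train_dl should make 2 * 2 = 4 forward calls.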
117 |         outputs = self.trainer_backend.train_dl(self.mock_dataloader, self.mock_callback)
118 |         assert self.mock_module.forward.call_count == 4
119 | 
120 |         assert outputs[0].shape == torch.Size([4])
121 |         assert outputs[1].shape == torch.Size([4+3+4+3, 3]) # concatenated across axis 0
122 |         # assert outputs[2] == [0,1,2,3]
123 |         #print(outputs, outputs[0].shape)
124 | 
125 | 
126 |     def test_get_state(self):
127 |         state = self.trainer_backend.get_state()
128 |         assert state['global_step_completed'] == 0
129 |         assert state['batches_completed'] == 0
130 | 
131 |     def test_update_state(self):
132 |         state_dict = {
133 |             'global_step_completed': 1,
134 |             'batches_completed': 2
135 |         }
136 |         self.trainer_backend.update_state(state_dict)
137 |         assert self.trainer_backend.get_global_steps_completed() == 1
138 |         assert self.trainer_backend.get_batches_completed() == 2
139 | 
140 | 
141 | @pytest.mark.filterwarnings("ignore::UserWarning: torch.cuda.amp.")
142 | class TestSingleProcessAmp(TestSingleProcess):
143 |     def setUp(self):
144 |         super().setUp()
145 |         self.trainer_backend = trainer_backend.SingleProcessAmp()
146 |         self.trainer_backend.init(self.trainer_backendArgs)
--------------------------------------------------------------------------------
/tests/test_sanity.py:
--------------------------------------------------------------------------------
1 | '''Class to test import of marlin library'''
2 | def test_import():
3 |     import pymarlin
4 |     assert True
--------------------------------------------------------------------------------
/tests/utils/config.yaml:
--------------------------------------------------------------------------------
1 | # Test YAML file for unit testing.
2 | 
3 | test_data_path: 'c:\tmp'
4 | test_empty_data_path: null
5 | 
6 | # test arguments
7 | test:
8 |     test_float: -1.0
9 |     test_int: -1
10 |     test_list_float: [-1.0, -1.0, -1.0]
11 |     test_list_int: [-1, -1, -1]
12 |     test_list_str: [ 'this', 'is', 'a', 'test', 'list']
13 |     test_str: null
14 |     test_true: False
15 |     test_false: True
16 |     test_two_levels:
17 |         test_int: -1
18 |         test_three_levels:
19 |             test_list_str: [ 'this', 'is', 'a', 'test', 'list']
20 | 
--------------------------------------------------------------------------------
/tests/utils/corrupt_files/config.yaml:
--------------------------------------------------------------------------------
1 | # Test for corrupt YAML file for unit testing.
2 | date: 2021-02-03 3 | dummy -------------------------------------------------------------------------------- /tests/utils/test_stats.py: -------------------------------------------------------------------------------- 1 | """Module to test stats module class""" 2 | import os 3 | import torch 4 | import numpy as np 5 | import pytest 6 | import shutil 7 | import unittest 8 | from unittest import mock 9 | from pymarlin.utils import stats 10 | from pymarlin.utils.writer import build_writer, WriterInitArguments 11 | import collections 12 | import functools 13 | 14 | class TestStats(unittest.TestCase): 15 | def setUp(self): 16 | self.stats = stats.global_stats 17 | self.stat_args = stats.StatInitArguments( 18 | log_steps = 50, 19 | update_system_stats = False, 20 | log_model_steps = 1000, 21 | exclude_list = None 22 | ) 23 | self.writer_args = WriterInitArguments( 24 | tb_log_dir='logs' 25 | ) 26 | self.writers = [ 27 | build_writer(writer, self.writer_args) 28 | if isinstance(writer, str) 29 | else writer 30 | for writer in ['stdout','tensorboard'] 31 | ] 32 | self.stats.rebuild(args=self.stat_args, writers=self.writers) 33 | 34 | def tearDown(self): 35 | self.stats.rebuild(args=None, writers=[]) 36 | 37 | @pytest.fixture(scope='module') 38 | def project_file(self, tmpdir_factory): 39 | print('deleting temp folder') 40 | my_tmpdir = tmpdir_factory.mktemp(self.writer_args.tb_log_dir) 41 | yield my_tmpdir 42 | shutil.rmtree(str(my_tmpdir)) 43 | 44 | def test_short(self): 45 | scalars = {'F1': 0.5, 'acc':0.8} 46 | for k, v in scalars.items(): 47 | self.stats.update(k,v, frequent = True) 48 | assert self.stats.scalars_short[k] == v 49 | self.stats.update(k,v+0.1, frequent = True) 50 | assert self.stats.scalars_short[k] == v+0.1 51 | self.stat_args.log_steps = 2 52 | self.stats.rebuild(args=self.stat_args, writers=self.writers) 53 | print('log stats for step 1. Nothing should be logged here.') 54 | self.stats.log_stats(step = 1) 55 | assert len(self.stats.scalars_short) > 0 56 | print('log stats for step 2. should be logged now.') 57 | self.stats.log_stats(step = 2) 58 | assert len(self.stats.scalars_short) == 0 59 | 60 | def test_long(self): 61 | scalars = {'epochs': 1} 62 | for k,v in scalars.items(): 63 | self.stats.update(k,v, frequent = False) 64 | assert self.stats.scalars_long[k] == v 65 | multi = {'losses': {'train':0.5, 'val_email':0.8, 'val_wiki':0.3}} 66 | for k,v in multi.items(): 67 | self.stats.update_multi(k,v, frequent = False) 68 | assert self.stats.multi_long[k] == v 69 | print('log long stats . should be logged') 70 | self.stats.log_long_stats(step = 1000) 71 | 72 | def test_log_model(self): 73 | # 2 layer NN with layer norm and sigmoid 74 | model = MyModel() 75 | self.stats.log_graph(model, device='cpu') 76 | optim = torch.optim.SGD(params = model.parameters(), lr = 1) 77 | 78 | self.stat_args.log_steps = 1 79 | self.writer_args.tb_hist_interval = 2 80 | self.writers = [ 81 | build_writer(writer, self.writer_args) 82 | if isinstance(writer, str) 83 | else writer 84 | for writer in ['stdout','tensorboard'] 85 | ] 86 | self.stats.rebuild(args=self.stat_args, writers=self.writers) 87 | 88 | for step in range(1, 5): 89 | op = model.forward(torch.rand(2,3)) 90 | loss = torch.nn.MSELoss()(op, torch.rand(2,1)) 91 | loss.backward() 92 | 93 | self.stats.log_model(step, model) 94 | optim.step() 95 | optim.zero_grad() 96 | #expectation. 
97 |         # histogram should be logged only twice in tensorboard
98 | 
99 |     def test_log_image(self):
100 |         random_image = np.random.randint(100, size = (1,100)).reshape(10,10)
101 |         random_image = random_image / 100
102 |         self.stats.update_image('random_image',
103 |                                 random_image,
104 |                                 dataformats = 'HW')
105 |         self.stats.log_long_stats(step = 1000)
106 | 
107 |     def test_log_pr(self):
108 |         preds = np.random.rand(100)
109 |         labels = np.random.randint(2, size=100)
110 |         self.stats.update_pr('binary_pr',
111 |                              preds, labels)
112 |         self.stats.log_long_stats(step = 1000)
113 | 
114 | class MyModel(torch.nn.Module):
115 |     def __init__(self):
116 |         super().__init__()
117 |         self.hidden = torch.nn.Linear(3,5)
118 |         self.hidden_activation = torch.nn.Tanh()
119 |         self.hidden_layernorm = torch.nn.LayerNorm(5)
120 |         self.output = torch.nn.Linear(5,1)
121 | 
122 |     def forward(self, input):
123 |         hidden_op = self.hidden_activation(
124 |             self.hidden_layernorm(
125 |                 self.hidden(input)))
126 |         op = self.output(hidden_op)
127 |         return op
128 | 
129 |     def get_sample_input(self):
130 |         return torch.ones(1,3, dtype = torch.float32)
131 | 
--------------------------------------------------------------------------------
/website/.gitignore:
--------------------------------------------------------------------------------
1 | # Dependencies
2 | /node_modules
3 | 
4 | # Production
5 | /build
6 | 
7 | # Generated files
8 | .docusaurus
9 | .cache-loader
10 | 
11 | # Misc
12 | .DS_Store
13 | .env.local
14 | .env.development.local
15 | .env.test.local
16 | .env.production.local
17 | 
18 | npm-debug.log*
19 | yarn-debug.log*
20 | yarn-error.log*
--------------------------------------------------------------------------------
/website/README.md:
--------------------------------------------------------------------------------
1 | # Website
2 | 
3 | This website is built using [Docusaurus 2](https://docusaurus.io/), a modern static website generator.
4 | 
5 | ## Prerequisites
6 | 
7 | To build and test documentation locally, begin by downloading and installing [Node.js](https://nodejs.org/en/download/), and then installing [Yarn](https://classic.yarnpkg.com/en/).
8 | On Windows, you can install Yarn via the npm package manager, which comes bundled with Node.js:
9 | 
10 | ```console
11 | npm install --global yarn
12 | ```
13 | 
14 | ## Installation
15 | 
16 | ```console
17 | yarn install
18 | pip install pydoc-markdown
19 | ```
20 | 
21 | ## Local Development
22 | 
23 | Navigate to the website folder and run:
24 | 
25 | ```console
26 | pydoc-markdown
27 | yarn start
28 | ```
29 | 
30 | This command starts a local development server and opens up a browser window. Most changes are reflected live without having to restart the server.
31 | 
32 | ## Build
33 | 
34 | ```console
35 | yarn build
36 | ```
37 | 
38 | This command generates static content into the `build` directory, which can be served using any static content hosting service.
39 | 
40 | ## Deployment
41 | 
42 | ```console
43 | GIT_USER=<Your GitHub username> USE_SSH=true yarn deploy
44 | ```
45 | 
46 | If you are using GitHub Pages for hosting, this command is a convenient way to build the website and push to the `gh-pages` branch.
47 | 
--------------------------------------------------------------------------------
/website/UML/diagrams/out/classes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/UML/diagrams/out/classes.png
--------------------------------------------------------------------------------
/website/UML/diagrams/out/classification_data_processing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/UML/diagrams/out/classification_data_processing.png
--------------------------------------------------------------------------------
/website/UML/diagrams/out/classification_train.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/UML/diagrams/out/classification_train.png
--------------------------------------------------------------------------------
/website/UML/diagrams/out/classifier.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/UML/diagrams/out/classifier.png
--------------------------------------------------------------------------------
/website/UML/diagrams/out/training_lifecycle.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/UML/diagrams/out/training_lifecycle.png
--------------------------------------------------------------------------------
/website/UML/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 | 
3 | pushd %~dp0
4 | 
5 | REM Command file for UML diagram generation
6 | 
7 | if "%1" == "svg" goto svg
8 | if "%1" == "png" goto png
9 | 
10 | java -jar plantuml.jar -h
11 | goto end
12 | 
13 | :svg
14 | java -jar plantuml.jar -tsvg -o "../out" "diagrams/src"
15 | goto end
16 | 
17 | :png
18 | java -jar plantuml.jar -tpng -o "../out" "diagrams/src"
19 | 
20 | :end
21 | popd
--------------------------------------------------------------------------------
/website/UML/plantuml.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/UML/plantuml.jar
--------------------------------------------------------------------------------
/website/UML/readme.md:
--------------------------------------------------------------------------------
1 | ## UML Diagrams
2 | 
3 | We created all diagrams using PlantUML. If you are using Visual Studio Code,
4 | you can leverage the PlantUML extension for modifying and previewing diagrams.
5 | 
6 | https://plantuml.com/
7 | 
8 | ### Prerequisites
9 | 
10 | - PlantUML.jar: https://sourceforge.net/projects/plantuml/files/plantuml.jar/download
11 | - Java: https://www.java.com/en/download/
12 | - Add the `java` executable to PATH, or if using the PlantUML extension for VS Code,
13 |   add the full path to `java.exe` under `plantuml.java` in PlantUML's `settings.json` file.
14 | - Graphviz (not needed for sequence diagrams): https://graphviz.org/download/
15 | 
16 | Note: the latest versions of PlantUML include a minimalistic Graphviz dot.exe.
17 | 
18 | ### Building the diagrams
19 | 
20 | All diagram sources are stored under `website/UML/diagrams/src`, and outputs
21 | under `website/UML/diagrams/out`.
22 | 
23 | To automatically generate all diagrams, run `make svg` for .svg outputs or `make png` for .png outputs.
24 | 
25 | ```bash
26 | cd website/UML
27 | make svg
28 | make png
29 | ```
30 | 
31 | To manually build the diagrams, you can use the following command, which will
32 | search the directory for .pu files containing `@startuml` and `@enduml`, and
33 | create all diagrams found under `website/UML/diagrams/src`.
34 | 
35 | ```bash
36 | cd website/UML
37 | java -jar plantuml.jar -tsvg -o "../out" "diagrams/src"
38 | ```
39 | 
40 | Note: all diagram names are specified via `@startuml diagram_name` in each file.
41 | 
42 | 
43 | ### Contribution
44 | 
45 | To modify an existing diagram, simply edit between `@startuml diagram_name` and `@enduml`,
46 | and regenerate the diagrams. To create new diagrams, please create a new .pu file; it will
47 | automatically be detected when calling `make svg`.
--------------------------------------------------------------------------------
/website/babel.config.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 |   presets: [require.resolve('@docusaurus/core/lib/babel/preset')],
3 | };
--------------------------------------------------------------------------------
/website/docs/contributing.md:
--------------------------------------------------------------------------------
1 | # Contribution and Feedback
2 | ## Contributing
3 | PyMarlin welcomes your contributions!
4 | 
5 | ## Contributor License Agreement
6 | This project welcomes contributions and suggestions. Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
7 | 
8 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions provided by the bot. You will only need to do this once across all repos using our CLA.
9 | 
10 | ## Code of Conduct
11 | This project has adopted the Microsoft Open Source Code of Conduct. For more information, see the Code of Conduct FAQ or contact opencode@microsoft.com with any additional questions or comments.
12 | 
13 | ## Feedback
14 | The PyMarlin library is developed by Microsoft engineers, and we cannot wait to see how our library will be adopted by the wider community and help bring more innovation into the world of Artificial Intelligence!
15 | 
16 | Please reach out to us with any feedback and suggestions about the library in the GitHub issues here: https://github.com/microsoft/PyMarlin/issues
--------------------------------------------------------------------------------
/website/docs/examples/classification.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Covid-19 Classification
3 | ---
4 | 
5 | In this example, we finetune BERT for Covid-19 tweet sentiment detection. We also provide instructions for running seamlessly in AzureML with DDP distributed multi-GPU training. Check out the latest instructions in the GitHub repo [here](https://github.com/microsoft/PyMarlin/tree/main/examples/covid19_text_classification).
6 | 
--------------------------------------------------------------------------------
/website/docs/examples/datamodule-example.md:
--------------------------------------------------------------------------------
1 | # Data interface single and multi process
2 | 
3 | This is an example explaining how to leverage the built-in multiprocessing capability of DataInterface for large amounts of data.
4 | For this example, we use 27 files of raw Wikipedia text. The setups covered are:
5 | 1) Azure virtual machine: single node, multi-process, single selective machine
6 | 2) AML: single node vs multi-node, single selective machine
7 | ## Configs - YAML and Parsing
8 | 
9 | For ease of use, configs are passed in as YAML files.
10 | In this case we use the config file `config_prod.yaml` included with the example code.
11 | 
12 | Snippet of the config (modify file paths according to your folder structure):
13 | 
14 | ```yaml
15 | input_dir: 'C:/Users/ashwinsr/wikipedia.part1'
16 | out_dir: 'C:/Users/ashwinsr/out_fold'
17 | process_count: 10
18 | run_type: ''
19 | ```
20 | 
21 | This config can be read in as below:
22 | 
23 | ```python
24 | # Create arg parser and read config
25 | parser = CustomArgParser(log_level='DEBUG', default_yamlfile="config_prod.yaml")
26 | config = parser.parse()
27 | ```
28 | 
29 | Our data processor is a simple token splitter: given raw text, it splits the text into tokens and stores the results back in a file. The processor runs one file at a time.
30 | 
31 | ## Virtual machine
32 | ### Single virtual machine with multi process
33 | 
34 | ```python
35 | dataInterface = Ex_dataInterface()
36 | file_list = dataInterface.get_file_names(config["input_dir"])
37 | # create and run processor1
38 | example_processor = Processor1(config["input_dir"], config["out_dir"])
39 | out = example_processor.multi_process_data(file_list, process_count=config["process_count"])
40 | ```
41 | Here we create a list of the files in the directory and initialize the processor with the input and output directories. We then call the multi_process_data function on the processor, passing the list of files along with the process count. The processor spins up that many processes to create the corresponding output.
42 | 
43 | ### Selective node preprocessing
44 | This covers the case where we have a single node but want to process the data in batches: we want the processor to run on a different subset of files depending on the rank we assign. This emulates multi-node behaviour on a single node by controlling the node rank parameter, as illustrated in the sketch below.
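To make the file-splitting arithmetic concrete, here is a minimal, self-contained sketch of rank-based sharding. This is illustrative only: the even-chunking scheme and the variable names are assumptions for exposition, not the exact internals of pymarlin's DataProcessor.

```python
# Illustrative sketch of rank-based file sharding (not pymarlin's exact internals).
file_list = [f"wiki_{i:02d}.txt" for i in range(30)]  # e.g., 30 input files

node_count = 5  # total number of (real or emulated) nodes
node_rank = 3   # rank assigned to this run, 0-indexed

# Split the files into node_count even chunks and keep only this rank's chunk.
chunk_size = len(file_list) // node_count
my_files = file_list[node_rank * chunk_size : (node_rank + 1) * chunk_size]

print(my_files)  # the fourth of five chunks: files 18..23 (0-indexed)
```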
45 | 
46 | For instance, if we have 30 files to process over 5 separate runs, then we need to add the following to the config and initialize the DataProcessor accordingly:
47 | 
48 | ```yaml
49 | distribArgs:
50 |     local_rank: 0
51 |     global_rank: 0
52 |     world_size: 1
53 |     node_count: 5
54 |     local_size: 1
55 |     node_rank: 3
56 | ```
57 | 
58 | ```python
59 | distrib = DistributedPreprocessArguments(**config["distribArgs"])
60 | example_processor = Processor1(config["input_dir"], config["out_dir"], distrib)
61 | ```
62 | Remember to initialize the base DataProcessor class with the distributed arguments as shown below; the default of `None` treats the job like a regular multi-process preprocessing job.
63 | 
64 | ```python
65 | class Processor1(data_interface.DataProcessor):
66 |     def __init__(self, input_dir, out_dir, distrib_args = None):
67 |         super(Processor1, self).__init__(distrib_args)
68 |         self.input_dir = input_dir
69 |         self.out_dir = out_dir
70 | ```
71 | 
72 | With the above settings we would process files 19-24 of the 30 (the fourth of five chunks), since node_rank is 3 (0-indexed) and can be at most 4 here; node_count gives the total number of nodes available.
73 | This gives flexibility for large data processing with limited compute.
74 | 
75 | To run on a virtual machine, copy the files over to the machine using SCP,
76 | install pymarlin and the requirements, and run the example:
77 | ```bash
78 | > ssh $user@$machine -p $port
79 | $ pip install ./pymarlin --force-reinstall
80 | $ pip install -r pymarlin/requirements.txt
81 | $ cd data_ex
82 | $ python data.py
83 | ```
84 | 
85 | ### AML
86 | We can do both single-node and multi-node processing with AML. The datamodule handles AML ranking internally for both single- and multi-node runs, to appropriately divide the files across nodes.
87 | You will find a notebook along with the example to submit an AML job, with placeholders for the storage and compute accounts.
--------------------------------------------------------------------------------
/website/docs/examples/distillation.md:
--------------------------------------------------------------------------------
1 | # Distillation
2 | 
3 | With the `pymarlin` library, distillation can be done in a standalone manner or as an extension to your original training scenario. In this example, we will go through how the [GLUE Task](glue-tasks.md) setup was extended to also perform distillation.
4 | 
5 | Data preprocessing is the same as [here](glue-tasks.md). The main implementation is in the `ModuleInterface`, which we chose to call `DistillRecipe` (inheriting from the GLUE `Recipe`).
6 | 
7 | The key methods of `DistillRecipe` that we want to override:
8 | 1. Setting up the teacher and student models, and related items such as configs, as needed. Here, we have the option to modify the student config depending on the desired changes to the depth or width of the model.
9 | ```python
10 | def setup_models(self):
11 |     self._setup_configs()
12 |     # teacher setup
13 |     self.teacher = AutoModelForSequenceClassification.from_pretrained(
14 |         os.path.join(self.args.model_args.model_wts_path, self.args.model_args.model_file),
15 |         config=self.model_config
16 |     )
17 |     # student setup
18 |     self.model = copy.deepcopy(self.teacher)
19 |     if len(self.student_layers) > 0:
20 |         layer_modules = getattr(self.model, self.args.model_args.encoder_key).encoder.layer
21 |         new_layer_modules = distill_utils.extract_layers(layer_modules, self.student_layers)
22 |         getattr(self.model, self.args.model_args.encoder_key).encoder.layer = new_layer_modules
23 | 
24 |     self.teacher.eval()
25 |     self.output_hidden = True if 'hidden_states' in self.loss_types else False
26 |     self.output_attentions = True if 'attentions' in self.loss_types else False
27 |     return (self.model, self.teacher)
28 | ```
29 | 
30 | 2. Modify `train_step` to set the teacher in eval mode, get the teacher outputs, get the student outputs, and compute a custom loss. The loss can be a combination of `logits`, `labels`, or various intermediate representations such as `hidden_states` and `attentions`. You have the flexibility to determine your distillation logic.
31 | ```python
32 | def train_step(self, global_step, batch, device):
33 |     self.teacher.eval()
34 |     inputs = self._inputs_to_device(batch, device)
35 |     teacher_outputs = self.teacher.forward(**inputs,
36 |                                            output_hidden_states=self.output_hidden,
37 |                                            output_attentions=self.output_attentions,
38 |                                            )  # label_loss, logits, hidden, attns
39 |     student_outputs = self.model.forward(**inputs,
40 |                                          output_hidden_states=self.output_hidden,
41 |                                          output_attentions=self.output_attentions,
42 |                                          )
43 |     total_loss = torch.zeros([1], dtype=student_outputs[0].dtype, device=device)
44 |     for i, k in enumerate(self.loss_types):
45 |         if k == 'labels':
46 |             student_scores = student_outputs.loss
47 |             teacher_scores = teacher_outputs.loss
48 |         else:
49 |             student_scores = getattr(student_outputs, k)
50 |             teacher_scores = getattr(teacher_outputs, k)
51 | 
52 |         if student_scores is not None and teacher_scores is not None:
53 |             if k == 'logits':
54 |                 total_loss += self.loss_weights[i] * distill_utils.logits_loss(
55 |                     student_scores, teacher_scores,
56 |                     temperature=self.distill_args.temperature,
57 |                 )
58 |             elif k != 'logits' and self.distill_args.width_shrinkage == 0:
59 |                 total_loss += self.loss_weights[i] * distill_utils.representations_loss(
60 |                     student_scores,
61 |                     teacher_scores,
62 |                     [*range(len(self.student_layers))],
63 |                     self.student_layers
64 |                 )
65 |     return total_loss
66 | ```
67 | 
68 | 3. As an example, `on_end_train` can be used to clean up any changes made to the final student model config and save it to the output directory along with the student model.
69 | 
70 | That's it! If you already have a scenario set up, it's as easy as overriding just two methods.
--------------------------------------------------------------------------------
/website/docs/examples/glue-tasks.md:
--------------------------------------------------------------------------------
1 | # GLUE Tasks
2 | 
3 | You can use the `pymarlin` library to easily benchmark your models on the GLUE tasks.
4 | Check out the README in the GitHub repo [here](https://github.com/microsoft/PyMarlin/tree/main/examples/glue_text_benchmark).
--------------------------------------------------------------------------------
/website/docs/examples/images/cifar.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/docs/examples/images/cifar.png
--------------------------------------------------------------------------------
/website/docs/examples/images/tb.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/docs/examples/images/tb.jpg
--------------------------------------------------------------------------------
/website/docs/examples/images/tensorboard_screenshot_bart.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/docs/examples/images/tensorboard_screenshot_bart.jpg
--------------------------------------------------------------------------------
/website/docs/examples/summarization.md:
--------------------------------------------------------------------------------
1 | # CNN/DailyMail Summarization
2 | 
3 | In this example, we finetune a BART model for summarizing CNN/Daily Mail news articles. Check out the README in the GitHub repo [here](https://github.com/microsoft/PyMarlin/tree/main/examples/cnndailymail_text_summarization) for the latest instructions on how to run.
--------------------------------------------------------------------------------
/website/docs/getting-started.md:
--------------------------------------------------------------------------------
1 | # Getting Started
2 | 
3 | ### Welcome to PyMarlin, a lightweight PyTorch library for agile deep learning!
4 | PyMarlin is a lightweight PyTorch extension library for agile deep learning experimentation. PyMarlin was developed with the goal of simplifying the E2E deep learning experimentation lifecycle for data scientists. The library enables an agile way to quickly prototype a new AI scenario on your dev box and seamlessly scale to multi-node GPU training in AzureML or any other cloud service.
5 | 
6 | ## Key features
7 | - **Data pre-processing** module which enables data preprocessing recipes to scale from a single CPU to multi-CPU and multi-node setups.
8 | - **Infra-agnostic design**: native Azure ML integration implies the same code running on a local dev box can also run directly on any VM or Azure ML cluster.
9 | - **Trainer backend abstraction** with support for single-process (CPU/GPU), Distributed Data Parallel, and mixed-precision (AMP, Apex) training. The ORT and DeepSpeed libraries are also integrated to get the best distributed training throughput.
10 | - Out-of-the-box **Plugins** that can be used for typical NLP tasks like sequence classification, named entity recognition, and seq2seq text generation.
11 | - **Utility modules** for model checkpointing, stats collection, and Tensorboard event logging, which can be customized based on your scenario.
12 | - **Custom arguments parser** that allows for saving all the default values for arguments related to a scenario in a YAML config file, merging user-supplied arguments at runtime.
13 | 
14 | 
15 | ## Start exploring!
16 | 
17 | ### Train your first model with pymarlin
18 | 
19 | Check out [CIFAR image classification](examples/cifar.md) from the EXAMPLES section.
20 | 
21 | ### GLUE task benchmarking
22 | 
23 | Explore how to use pymarlin to [benchmark your models on GLUE tasks](examples/glue-tasks.md).
24 | 
25 | ## We want your feedback!
26 | 
27 | Reach out to us with your [feedback and suggestions](https://github.com/microsoft/PyMarlin/issues).
--------------------------------------------------------------------------------
/website/docs/installation.md:
--------------------------------------------------------------------------------
1 | # Installation
2 | In this guide, we share instructions on how to set up pymarlin in the following environments:
3 | * Local/dev machine
4 | * AzureML
5 | 
6 | ## Local/Dev Machine
7 | ### Environment setup
8 |     conda create -n pymarlin python=3.8
9 |     conda activate pymarlin
10 | 
11 | ### Install pytorch
12 | [Latest documentation](https://pytorch.org/get-started/locally/)
13 | 
14 |     conda install pytorch cpuonly -c pytorch
15 | 
16 | ### Install PyMarlin
17 | You can install from pip, or alternatively install from source.
18 | 
19 | #### Install from pip (once available in PyPI)
20 | 
21 |     pip install pymarlin
22 | 
23 | #### Install from source
24 | 
25 |     git clone https://github.com/microsoft/PyMarlin.git
26 |     cd PyMarlin
27 |     pip install -e .
28 | 
29 | ## AzureML
30 | Specify the pip package in a supplied conda_env.yml file.
--------------------------------------------------------------------------------
/website/docs/marlin-in-pictures.md:
--------------------------------------------------------------------------------
1 | # PyMarlin in Pictures
2 | 
3 | pymarlin is designed to improve agility, scalability, and code manageability, all while
4 | providing flexibility and control across DL environments. To this end, pymarlin is divided
5 | into various components corresponding to extensible classes, only a few of which need to
6 | be implemented by users. As shown in the following class diagram, the key classes that need
7 | to be implemented are **DataInterface** and **ModuleInterface**. These interact with **Trainer**, which
8 | acts as an orchestrator, and **TrainerBackend**; both are extensible and configurable via
9 | arguments.
10 | 
11 | ![](../UML/diagrams/out/classes.svg)
12 | 
13 | ## Classification task example
14 | 
15 | Below you can find a collection of class and sequence diagrams
16 | for a classification task that exemplifies the use of pymarlin. The example
17 | shown implements the DataInterface and ModuleInterface for classifying tweet
18 | sentiment, as TweetSentDataModule and TweetSentModule. The class diagram
19 | illustrates which classes correspond to built-in pymarlin classes, user-extended
20 | classes specific to the scenario, and some important external dependencies used.
21 | 
22 | ### Class Diagram
23 | 
24 | This diagram shows the classes implemented for the classification task, as well as
25 | relationships between the important pymarlin modules. Here, ModuleInterface has been
26 | extended as *TweetSentModule*, DataInterface as *TweetSentDataModule*, and
27 | data processing is managed by the classes *Stage1* and *Stage2*, extended from DataProcessor. The SingleProcess class is used as the TrainerBackend for this example.
28 | Most customization and settings come from modifying the default TrainerArguments
29 | via the config.yaml file. A minimal sketch of how these pieces fit together in code follows.
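The sketch below shows the overall shape of the user code implied by these diagrams. It is a hedged illustration: the module paths and interface names come from the pymarlin source tree, but the ellipses and the commented-out wiring stand in for scenario-specific details and exact constructor signatures, which may differ.

```python
# Illustrative sketch only; constructor arguments and method sets are assumptions.
from pymarlin.core import data_interface, module_interface, trainer

class TweetSentDataModule(data_interface.DataInterface):
    ...  # wraps the Stage1/Stage2 DataProcessors and exposes the processed datasets

class TweetSentModule(module_interface.ModuleInterface):
    ...  # implements the training/validation hooks, e.g. train_step(global_step, batch, device)

# Typical wiring (see the Training sequence diagram below):
# module = TweetSentModule(data=TweetSentDataModule(), ...)
# t = trainer.Trainer(module=module, args=trainer_args)  # trainer_args parsed from config.yaml
# t.train()
```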
30 | 
31 | ![](../UML/diagrams/out/classifier.svg)
32 | 
33 | ### Training
34 | 
35 | The following sequence diagram illustrates how to train the above classification
36 | model, with emphasis on the module instantiation users need to perform as part
37 | of their main script. Details on the data processing and the training lifecycle referenced
38 | in this diagram can be found below.
39 | 
40 | ![](../UML/diagrams/out/classification_train.svg)
41 | 
42 | ### Data Processing
43 | 
44 | Sequence diagram for the data processing prior to training.
45 | 
46 | ![](../UML/diagrams/out/classification_data_processing.svg)
47 | 
48 | ### Training Lifecycle
49 | 
50 | Sequence diagram for the training loop.
51 | 
52 | ![](../UML/diagrams/out/training_lifecycle.svg)
--------------------------------------------------------------------------------
/website/docs/plugins/images/hfner/ner_dataset_mod.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/docs/plugins/images/hfner/ner_dataset_mod.png
--------------------------------------------------------------------------------
/website/docs/plugins/images/hfseqclass/loss.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/docs/plugins/images/hfseqclass/loss.jpg
--------------------------------------------------------------------------------
/website/docs/plugins/images/hfseqclass/loss.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/docs/plugins/images/hfseqclass/loss.png
--------------------------------------------------------------------------------
/website/docs/plugins/images/hfseqclass/lr.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/docs/plugins/images/hfseqclass/lr.jpg
--------------------------------------------------------------------------------
/website/docs/plugins/images/hfseqclass/lr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/docs/plugins/images/hfseqclass/lr.png
--------------------------------------------------------------------------------
/website/docs/plugins/images/hfseqclass/train_metrics.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/docs/plugins/images/hfseqclass/train_metrics.jpg
--------------------------------------------------------------------------------
/website/docs/plugins/images/hfseqclass/train_metrics.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/docs/plugins/images/hfseqclass/train_metrics.png
--------------------------------------------------------------------------------
/website/docs/utils/images/tb_example.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/docs/utils/images/tb_example.jpg
--------------------------------------------------------------------------------
/website/docs/utils/stats.md:
--------------------------------------------------------------------------------
1 | # Stats and Tensorboard logging
2 | We have implemented a wrapper on Tensorboard's SummaryWriter for logging stats to Tensorboard (TB), which makes it easy to save TB events and visualize them later for tracking the progress of your training experiment. We also have Azure ML and stdout writers, so you can write your stats out to the logs.
3 | 
4 | Usage is demonstrated here:
5 | ```python
6 | import os
7 | os.listdir()
8 | ```
9 | 
10 | 
11 | 
12 | 
13 |     ['.ipynb_checkpoints', 'Untitled.ipynb']
14 | 
15 | 
16 | 
17 | 
18 | ```python
19 | import random
20 | import pymarlin
21 | from pymarlin.utils.stats import global_stats, StatInitArguments
22 | from pymarlin.utils.writer import build_writer, WriterInitArguments
23 | ```
24 | `global_stats` is a singleton variable that can be used across the entire application to log stats.
25 | 
26 | ```python
27 | writers = ['tensorboard','stdout','aml']
28 | writerargs = WriterInitArguments(
29 |     tb_log_dir = './logs'
30 | )
31 | writer_objects = [build_writer(w, writerargs) for w in writers]
32 | global_stats.rebuild(StatInitArguments(), writer_objects)
33 | ```
34 | 
35 |     SystemLog: 2021-01-29 16:02:21,033:INFO : pymarlin.utils.writer.tensorboard : 37 : Cleared directory ./logs (skipping azureml dirs)
36 |     SystemLog: 2021-01-29 16:02:21,040:INFO : pymarlin.utils.writer.tensorboard : 40 : Created tensorboard folder ./logs : []
37 | 
38 | ## Write out the stats you care about
39 | ```python
40 | for i in range(10):
41 |     global_stats.update('loss', random.uniform(0,2), frequent = True)  # adds a new in-memory stat
42 |     global_stats.log_stats(step = i)  # actually logs stats to stdout, tensorboard and aml simultaneously
43 | ```
44 | 
45 |     SystemLog: 2021-01-29 16:06:40,276:INFO : pymarlin.utils.writer.stdout : 10 : step = 0, iteration : 0
46 |     SystemLog: 2021-01-29 16:06:40,279:INFO : pymarlin.utils.writer.stdout : 10 : step = 0, loss : 0.44372909088471446
47 |     SystemLog: 2021-01-29 16:06:40,284:INFO : pymarlin.utils.writer.stdout : 10 : step = 1, loss : 0.5985009500820384
48 |     SystemLog: 2021-01-29 16:06:40,285:INFO : pymarlin.utils.writer.stdout : 10 : step = 2, loss : 1.5669796666205043
49 |     SystemLog: 2021-01-29 16:06:40,286:INFO : pymarlin.utils.writer.stdout : 10 : step = 3, loss : 0.8748342474891679
50 |     SystemLog: 2021-01-29 16:06:40,288:INFO : pymarlin.utils.writer.stdout : 10 : step = 4, loss : 1.8371541447672195
51 |     SystemLog: 2021-01-29 16:06:40,290:INFO : pymarlin.utils.writer.stdout : 10 : step = 5, loss : 0.18000397399047174
52 |     SystemLog: 2021-01-29 16:06:40,292:INFO : pymarlin.utils.writer.stdout : 10 : step = 6, loss : 0.1455008149921977
53 |     SystemLog: 2021-01-29 16:06:40,293:INFO : pymarlin.utils.writer.stdout : 10 : step = 7, loss : 1.4704800219353158
54 |     SystemLog: 2021-01-29 16:06:40,297:INFO : pymarlin.utils.writer.stdout : 10 : step = 8, loss : 0.8764679987392285
55 |     SystemLog: 2021-01-29 16:06:40,298:INFO : pymarlin.utils.writer.stdout : 10 : step = 9, loss : 0.6293567937040325
56 | 
57 | ## Check the Tensorboard logs
58 | ```python
59 | os.listdir('logs')
60 | ```
61 | 
62 |     ['events.out.tfevents.1611964941.krishan-surface.16776.1']
63 | 
64 | ```python
65 | !tensorboard --logdir logs
66 | ```
67 | 
68 | ![img](images/tb_example.jpg)
69 | 
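Beyond single scalars, grouped metrics can be logged with `update_multi` and flushed with `log_long_stats`; the snippet below mirrors the usage exercised in `tests/utils/test_stats.py`.

```python
# Grouped (multi-series) stats, mirroring tests/utils/test_stats.py.
from pymarlin.utils.stats import global_stats

# One chart ('losses') carrying several named series.
global_stats.update_multi('losses', {'train': 0.5, 'val_email': 0.8, 'val_wiki': 0.3}, frequent=False)

# Infrequent ("long") stats are flushed explicitly, e.g. once per evaluation.
global_stats.log_long_stats(step=1000)
```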
70 | For more info on stats, check the stats module docstring in the **pymarlin API** section.
71 | 
--------------------------------------------------------------------------------
/website/docusaurus.config.js:
--------------------------------------------------------------------------------
1 | /** @type {import('@docusaurus/types').DocusaurusConfig} */
2 | module.exports = {
3 |   title: 'PyMarlin',
4 |   tagline: 'Lightweight PyTorch Training Framework',
5 |   url: 'https://github.com/microsoft/PyMarlin',
6 |   baseUrl: '/PyMarlin/',
7 |   onBrokenLinks: 'throw',
8 |   onBrokenMarkdownLinks: 'warn',
9 |   favicon: 'img/favicon.ico',
10 |   organizationName: 'microsoft', // Usually your GitHub org/user name.
11 |   projectName: 'PyMarlin', // Usually your repo name.
12 |   themeConfig: {
13 |     navbar: {
14 |       title: 'PyMarlin',
15 |       logo: {
16 |         alt: 'My Site Logo',
17 |         src: 'img/logo.svg',
18 |       },
19 |       items: [
20 |         {
21 |           type: 'doc',
22 |           docId: 'getting-started',
23 |           position: 'left',
24 |           label: 'Docs',
25 |         },
26 |         {
27 |           type: 'doc',
28 |           docId: 'reference/core/module_interface',
29 |           position: 'left',
30 |           label: 'SDK',
31 |         },
32 |         {
33 |           href: 'https://github.com/microsoft/PyMarlin',
34 |           label: 'GitHub',
35 |           position: 'right',
36 |         },
37 |       ],
38 |     },
39 |     footer: {
40 |       style: 'dark',
41 |       links: [
42 |         {
43 |           title: 'Docs',
44 |           items: [
45 |             {
46 |               label: 'Getting Started',
47 |               to: 'docs/getting-started',
48 |             },
49 |           ],
50 |         },
51 |         {
52 |           title: 'Community',
53 |           items: [
54 |             {
55 |               label: 'Stack Overflow',
56 |               href: 'https://stackoverflow.com/questions/tagged/pymarlin',
57 |             },
58 |             // {
59 |             //   label: 'Discord',
60 |             //   href: 'https://discordapp.com/invite/docusaurus',
61 |             // },
62 |             // {
63 |             //   label: 'Twitter',
64 |             //   href: 'https://twitter.com/docusaurus',
65 |             // },
66 |           ],
67 |         },
68 |         {
69 |           title: 'More',
70 |           items: [
71 |             {
72 |               label: 'GitHub',
73 |               href: 'https://github.com/microsoft/PyMarlin',
74 |             },
75 |           ],
76 |         },
77 |       ],
78 |       copyright: `Copyright © ${new Date().getFullYear()} Microsoft Inc. Built with Docusaurus.`,
79 |     },
80 |   },
81 |   presets: [
82 |     [
83 |       '@docusaurus/preset-classic',
84 |       {
85 |         docs: {
86 |           sidebarPath: require.resolve('./sidebars.js'),
87 |           // Please change this to your repo.
88 |           editUrl:
89 |             'https://github.com/microsoft/PyMarlin/edit/master/website/',
90 |         },
91 |         // blog: {
92 |         //   showReadingTime: true,
93 |         //   // Please change this to your repo.
94 |         //   editUrl:
95 |         //     'https://github.com/facebook/docusaurus/edit/master/website/blog/',
96 |         // },
97 |         theme: {
98 |           customCss: require.resolve('./src/css/custom.css'),
99 |         },
100 |       },
101 |     ],
102 |   ],
103 | };
104 | 
--------------------------------------------------------------------------------
/website/package.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "website",
3 |   "version": "0.0.0",
4 |   "private": true,
5 |   "scripts": {
6 |     "docusaurus": "docusaurus",
7 |     "start": "docusaurus start",
8 |     "build": "docusaurus build",
9 |     "swizzle": "docusaurus swizzle",
10 |     "deploy": "docusaurus deploy",
11 |     "clear": "docusaurus clear",
12 |     "serve": "docusaurus serve",
13 |     "write-translations": "docusaurus write-translations",
14 |     "write-heading-ids": "docusaurus write-heading-ids"
15 |   },
16 |   "dependencies": {
17 |     "@docusaurus/core": "2.0.0-alpha.75",
18 |     "@docusaurus/preset-classic": "2.0.0-alpha.75",
19 |     "@mdx-js/react": "^1.6.21",
20 |     "@svgr/webpack": "^5.5.0",
21 |     "clsx": "^1.1.1",
22 |     "file-loader": "^6.2.0",
23 |     "react": "^17.0.1",
24 |     "react-dom": "^17.0.1",
25 |     "url-loader": "^4.1.1",
26 |     "trim": "^0.0.3"
27 |   },
28 |   "browserslist": {
29 |     "production": [
30 |       ">0.5%",
31 |       "not dead",
32 |       "not op_mini all"
33 |     ],
34 |     "development": [
35 |       "last 1 chrome version",
36 |       "last 1 firefox version",
37 |       "last 1 safari version"
38 |     ]
39 |   }
40 | }
--------------------------------------------------------------------------------
/website/pydoc-markdown.yml:
--------------------------------------------------------------------------------
1 | loaders:
2 |   - type: python
3 |     search_path: [../pymarlin]
4 | processors:
5 |   - type: filter
6 |     skip_empty_modules: true
7 |   - type: smart
8 |   - type: crossref
9 | renderer:
10 |   type: docusaurus
11 |   docs_base_path: docs
12 |   relative_output_path: reference
13 |   relative_sidebar_path: sidebar.json
14 |   sidebar_top_level_label: Reference
--------------------------------------------------------------------------------
/website/sidebars.js:
--------------------------------------------------------------------------------
1 | /**
2 |  * Creating a sidebar enables you to:
3 |  - create an ordered group of docs
4 |  - render a sidebar for each doc of that group
5 |  - provide next/previous navigation
6 | 
7 |  The sidebars can be generated from the filesystem, or explicitly defined here.
8 | 
9 |  Create as many sidebars as you want.
10 |  */
11 | 
12 | module.exports = {
13 |   docsSidebar: [
14 |     'getting-started',
15 |     'installation',
16 |     'marlin-in-pictures',
17 |     {'Examples': [{type: 'autogenerated', dirName: 'examples'}]},
18 |     {'Plugins': [{type: 'autogenerated', dirName: 'plugins'}]},
19 |     'utils/stats',
20 |     'contributing'
21 |   ],
22 |   // pydoc-markdown auto-generated markdowns from docstrings
23 |   referenceSideBar: [require("./docs/reference/sidebar.json")]
24 | };
25 | 
--------------------------------------------------------------------------------
/website/src/components/HomepageFeatures.js:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import clsx from 'clsx';
3 | import styles from './HomepageFeatures.module.css';
4 | 
5 | const FeatureList = [
6 |   {
7 |     title: 'Easy to Use',
8 |     Svg: require('../../static/img/undraw_docusaurus_mountain.svg').default,
9 |     description: (
10 |       <>
11 |         PyMarlin was designed to make PyTorch training as easy as possible while still getting the benefits of GPU and distributed node acceleration.
12 |       </>
13 |     ),
14 |   },
15 |   {
16 |     title: 'Focus on What Matters',
17 |     Svg: require('../../static/img/undraw_docusaurus_tree.svg').default,
18 |     description: (
19 |       <>
20 |         Focus on your scenario code and data preprocessing pipeline; we'll take care of the training loop and optimizations for you.
21 |       </>
22 |     ),
23 |   },
24 |   {
25 |     title: 'Scale out to hundreds of GPUs with AzureML',
26 |     Svg: require('../../static/img/undraw_docusaurus_react.svg').default,
27 |     description: (
28 |       <>
29 |         Run the same code in all environments and just use a simple configuration change to scale from a single CPU on your dev machines
30 |         to hundreds of GPUs in AzureML or other cloud services.
31 |       </>
32 |     ),
33 |   },
34 | ];
35 | 
36 | function Feature({Svg, title, description}) {
37 |   return (
38 |     <div className={clsx('col col--4')}>
39 |       <div className="text--center">
40 |         <Svg className={styles.featureSvg} alt={title} />
41 |       </div>
42 |       <div className="text--center padding-horiz--md">
43 |         <h3>{title}</h3>
44 |         <p>{description}</p>
45 |       </div>
46 |     </div>
47 |   );
48 | }
49 | 
50 | export default function HomepageFeatures() {
51 |   return (
52 |     <section className={styles.features}>
53 |       <div className="container">
54 |         <div className="row">
55 |           {FeatureList.map((props, idx) => (
56 |             <Feature key={idx} {...props} />
57 |           ))}
58 |         </div>
59 |       </div>
60 |     </section>
61 |   );
62 | }
63 | 
--------------------------------------------------------------------------------
/website/src/components/HomepageFeatures.module.css:
--------------------------------------------------------------------------------
1 | /* stylelint-disable docusaurus/copyright-header */
2 | 
3 | .features {
4 |   display: flex;
5 |   align-items: center;
6 |   padding: 2rem 0;
7 |   width: 100%;
8 | }
9 | 
10 | .featureSvg {
11 |   height: 200px;
12 |   width: 200px;
13 | }
--------------------------------------------------------------------------------
/website/src/css/custom.css:
--------------------------------------------------------------------------------
1 | /* stylelint-disable docusaurus/copyright-header */
2 | /**
3 |  * Any CSS included here will be global. The classic template
4 |  * bundles Infima by default. Infima is a CSS framework designed to
5 |  * work well for content-centric websites.
6 |  */
7 | 
8 | /* You can override the default Infima variables here. */
9 | :root {
10 |   --ifm-color-primary: #25c2a0;
11 |   --ifm-color-primary-dark: rgb(33, 175, 144);
12 |   --ifm-color-primary-darker: rgb(31, 165, 136);
13 |   --ifm-color-primary-darkest: rgb(26, 136, 112);
14 |   --ifm-color-primary-light: rgb(70, 203, 174);
15 |   --ifm-color-primary-lighter: rgb(102, 212, 189);
16 |   --ifm-color-primary-lightest: rgb(146, 224, 208);
17 |   --ifm-code-font-size: 95%;
18 | }
19 | 
20 | .docusaurus-highlight-code-line {
21 |   background-color: rgb(72, 77, 91);
22 |   display: block;
23 |   margin: 0 calc(-1 * var(--ifm-pre-padding));
24 |   padding: 0 var(--ifm-pre-padding);
25 | }
--------------------------------------------------------------------------------
/website/src/pages/index.js:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import clsx from 'clsx';
3 | import Layout from '@theme/Layout';
4 | import Link from '@docusaurus/Link';
5 | import useDocusaurusContext from '@docusaurus/useDocusaurusContext';
6 | import styles from './index.module.css';
7 | import HomepageFeatures from '../components/HomepageFeatures';
8 | 
9 | function HomepageHeader() {
10 |   const {siteConfig} = useDocusaurusContext();
11 |   return (
12 |     <header className={clsx('hero hero--primary', styles.heroBanner)}>
13 |       <div className="container">
14 |         <h1 className="hero__title">{siteConfig.title}</h1>
15 |         <p className="hero__subtitle">{siteConfig.tagline}</p>
16 |         <div className={styles.buttons}>
17 |           <Link
18 |             className="button button--secondary button--lg"
19 |             to="/docs/getting-started">
20 |             PyMarlin Getting Started - 5min ⏱️
21 |           </Link>
22 |         </div>
23 |       </div>
24 |     </header>
36 | 37 |
38 |
39 | ); 40 | } 41 | -------------------------------------------------------------------------------- /website/src/pages/index.module.css: -------------------------------------------------------------------------------- 1 | /* stylelint-disable docusaurus/copyright-header */ 2 | 3 | /** 4 | * CSS files with the .module.css suffix will be treated as CSS modules 5 | * and scoped locally. 6 | */ 7 | 8 | .heroBanner { 9 | padding: 4rem 0; 10 | text-align: center; 11 | position: relative; 12 | overflow: hidden; 13 | } 14 | 15 | @media screen and (max-width: 966px) { 16 | .heroBanner { 17 | padding: 2rem; 18 | } 19 | } 20 | 21 | .buttons { 22 | display: flex; 23 | align-items: center; 24 | justify-content: center; 25 | } 26 | -------------------------------------------------------------------------------- /website/src/pages/markdown-page.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Markdown page example 3 | --- 4 | 5 | # Markdown page example 6 | 7 | You don't need React to write simple standalone pages. 8 | -------------------------------------------------------------------------------- /website/static/.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/static/.nojekyll -------------------------------------------------------------------------------- /website/static/img/docusaurus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/static/img/docusaurus.png -------------------------------------------------------------------------------- /website/static/img/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/static/img/favicon.ico -------------------------------------------------------------------------------- /website/static/img/tutorial/docsVersionDropdown.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/static/img/tutorial/docsVersionDropdown.png -------------------------------------------------------------------------------- /website/static/img/tutorial/localeDropdown.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PyMarlin/3d851768ddace94585867435468187938c61f377/website/static/img/tutorial/localeDropdown.png --------------------------------------------------------------------------------