├── .devcontainer
│   └── devcontainer.json
├── .github
│   └── workflows
│       ├── pre-commit.yml
│       └── template-cleanup.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .vscode
│   ├── extensions.json
│   ├── launch.json
│   └── settings.json
├── CODE_OF_CONDUCT.md
├── License
├── README.md
├── README_template.md
├── SECURITY.md
├── assets
│   └── images
│       └── table-of-contents.png
├── azure-pipelines.yml
├── configs
│   └── train.yaml
├── docker
│   ├── Dockerfile_base_azureml
│   ├── Dockerfile_base_azureml_cu116
│   ├── Dockerfile_base_azureml_nightly
│   ├── Dockerfile_base_nvidia
│   ├── environment.yml
│   └── environment_cu116.yml
├── setup.cfg
├── setup.py
├── src
│   ├── __init__.py
│   ├── datamodules
│   │   ├── __init__.py
│   │   └── mnist_datamodule.py
│   ├── models
│   │   ├── __init__.py
│   │   ├── components
│   │   │   ├── __init__.py
│   │   │   └── simple_dense_net.py
│   │   └── mnist_module.py
│   ├── train.py
│   └── utils
│       ├── pl_utils.py
│       └── system_monitor.py
└── tests
    ├── helpers.py
    └── test_dev_fast_run.py

--------------------------------------------------------------------------------
/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
1 | // Remote devcontainer file that describes what container to build, how to build it,
2 | // and the extensions VS Code needs to enable the best remote development experience
3 | {
4 |   "name": "devcontainer",
5 |   "build": {
6 |     "context": "../docker",
7 |     // Uncomment the base dockerfile of your choice
8 |     "dockerfile": "../docker/Dockerfile_base_azureml",
9 |     // "dockerfile": "../docker/Dockerfile_base_nvidia",
10 |     "args": {
11 |       // Edit docker build args here as appropriate
12 |       // find latest BASE_IMAGE for Dockerfile_base_azureml at https://github.com/Azure/AzureML-Containers/tree/master/base/gpu
13 |       "BASE_IMAGE": "openmpi4.1.0-cuda11.3-cudnn8-ubuntu20.04:latest"
14 |       // find latest BASE_IMAGE for Dockerfile_base_nvidia at https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch/tags
15 |       // Uncomment the following for nvidia base image
16 |       // "BASE_IMAGE": "22.06-py3"
17 |     },
18 |   },
19 |   // Configure tool-specific properties.
20 |   "customizations": {
21 |     // Configure properties specific to VS Code.
22 |     "vscode": {
23 |       // Add the IDs of extensions you want installed when the container is created.
24 |       "extensions": [
25 |         "eamodio.gitlens",
26 |         "ms-python.python",
27 |         "ms-python.vscode-pylance",
28 |         "ms-azuretools.vscode-docker",
29 |         "ms-vscode-remote.remote-containers",
30 |         "ms-vscode-remote.remote-ssh",
31 |         "ms-vscode-remote.remote-ssh-edit",
32 |         "ms-vscode-remote.remote-wsl",
33 |         "ms-vscode-remote.vscode-remote-extensionpack",
34 |         "redhat.vscode-yaml",
35 |         "yzhang.markdown-all-in-one",
36 |         "TrungNgo.autoflake",
37 |         "Shan.code-settings-sync",
38 |         "njpwerner.autodocstring",
39 |         "jbockle.jbockle-format-files"
40 |       ]
41 |     }
42 |   },
43 |   // Docker run args
44 |   "runArgs": [
45 |     // Run with GPU support
46 |     "--privileged",
47 |     "--gpus",
48 |     "all",
49 |     // The next lines are needed if you will be using a ptrace-based debugger (e.g. for C++, Go, and Rust).
50 |     "--cap-add=SYS_PTRACE",
51 |     "--security-opt",
52 |     "seccomp=unconfined",
53 |     // Use Docker from inside the container. See https://aka.ms/vscode-remote/samples/docker-in-docker for details.
54 |     "-v",
55 |     "/var/run/docker.sock:/var/run/docker.sock"
56 |   ],
57 |   // Run the following command after the container has started and the workspace is mounted
58 |   "postStartCommand": "conda env config vars set -n base PYTHONPATH=${containerWorkspaceFolder} && git config --global --add safe.directory ${containerWorkspaceFolder}"
59 |   // Use 'forwardPorts' to make a list of ports inside the container available locally.
60 |   // "forwardPorts": [],
61 |   // Use 'postCreateCommand' to run commands after the container is created.
62 |   // "postCreateCommand": "python --version"
63 | }
64 |
--------------------------------------------------------------------------------
/.github/workflows/pre-commit.yml:
--------------------------------------------------------------------------------
1 | # ref: https://github.com/pre-commit-ci-demo/demo/blob/main/.github/workflows/pre-commit.yml
2 | name: pre-commit
3 |
4 | on:
5 |   push:
6 |     branches: [main]
7 |   pull_request:
8 |     branches: [main]
9 |
10 | jobs:
11 |   pre-commit:
12 |     runs-on: ubuntu-latest
13 |     steps:
14 |       - uses: actions/checkout@v2
15 |       - uses: actions/setup-python@v2
16 |       - uses: pre-commit/action@v2.0.0
--------------------------------------------------------------------------------
/.github/workflows/template-cleanup.yml:
--------------------------------------------------------------------------------
1 | # credits: https://github.com/JetBrains/intellij-platform-plugin-template/blob/main/.github/workflows/template-cleanup.yml
2 |
3 | # GitHub Actions workflow responsible for removing the Autonomous Research Systems ml_template's
4 | # template-specific files and configurations from a newly generated repository. This workflow is supposed
5 | # to be triggered automatically when a new template-based repository has been created.
6 |
7 | name: Template Cleanup
8 | on:
9 |   push:
10 |     branches:
11 |       - main
12 |
13 | jobs:
14 |   # Run the cleaning process only if the workflow is NOT triggered by the template repository itself.
15 |   template-cleanup:
16 |     name: Template Cleanup
17 |     runs-on: ubuntu-latest
18 |     if: github.event.repository.name != 'auto-sys-ml-template'
19 |     steps:
20 |       # Check out current repository
21 |       - name: Fetch Sources
22 |         uses: actions/checkout@v2.4.0
23 |
24 |       # Cleanup project
25 |       - name: Cleanup
26 |         run: |
27 |           rm -r assets/
28 |           mv README_template.md README.md
29 |           rm .github/workflows/template-cleanup.yml
30 |
31 |       # Commit modified files
32 |       - name: Commit files
33 |         run: |
34 |           git config --local user.email "ratneshmadaan@gmail.com"
35 |           git config --local user.name "madratman"
36 |           git add .
37 |           git commit -m "bla"
38 |           git reset $(git commit-tree HEAD^{tree} -m "microsoft/AutonomousSystemsResearchGroup: init ml template repo")
39 |
40 |       # Push changes
41 |       - name: Push changes
42 |         uses: ad-m/github-push-action@master
43 |         with:
44 |           branch: main
45 |           github_token: ${{ secrets.GITHUB_TOKEN }}
46 |           force: true
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # amulet
2 | .amltconfig
3 | amlt/
4 |
5 | # Byte-compiled / optimized / DLL files
6 | __pycache__/
7 | *.py[cod]
8 | *$py.class
9 |
10 | # Distribution / packaging
11 | build/
12 | dist/
13 | *.egg-info/
14 |
15 | # data
16 | data/
17 |
18 | # logs
19 | logs/
20 | outputs/
21 |
22 | # env
23 | .env
24 | .autoenv
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | default_language_version:
2 |   python: python3
3 |
4 | ci:
5 |   autofix_prs: true
6 |   autoupdate_commit_msg: "[pre-commit.ci] pre-commit suggestions"
7 |   autoupdate_schedule: quarterly
8 |
9 | repos:
10 |   - repo: https://github.com/pre-commit/pre-commit-hooks
11 |     rev: v4.4.0
12 |     hooks:
13 |       # list of supported hooks: https://pre-commit.com/hooks.html
14 |       - id: trailing-whitespace
15 |       - id: end-of-file-fixer
16 |       - id: check-yaml
17 |       - id: check-case-conflict
18 |       - id: debug-statements
19 |       - id: detect-private-key
20 |       - id: check-added-large-files
21 |         args: ["--maxkb=500", "--enforce-all"]
22 |         exclude: |
23 |           (?x)^(
24 |           )$
25 |
26 |   - repo: https://github.com/asottile/pyupgrade
27 |     rev: v3.3.1
28 |     hooks:
29 |       - id: pyupgrade
30 |         args: [--py37-plus]
31 |         name: Upgrade code
32 |
33 |   # python formatting
34 |   - repo: https://github.com/psf/black
35 |     rev: 23.1.0
36 |     hooks:
37 |       - id: black
38 |         name: Format code
39 |         args: ["--line-length=120"]
40 |
41 |   - repo: https://github.com/hadialqattan/pycln
42 |     rev: v2.1.3 # Possible releases: https://github.com/hadialqattan/pycln/releases
43 |     hooks:
44 |       - id: pycln
45 |         args: [--all]
46 |
47 |   # ref: https://github.com/microsoft/vscode-isort
48 |   - repo: https://github.com/pycqa/isort
49 |     rev: 5.12.0
50 |     hooks:
51 |       - id: isort
52 |         name: isort (python)
53 |         args: [--profile, "black"]
54 |
55 |   # python docstring formatting
56 |   - repo: https://github.com/myint/docformatter
57 |     rev: v1.5.1
58 |     hooks:
59 |       - id: docformatter
60 |         args: [--in-place, --wrap-summaries, "99", --wrap-descriptions, "92"]
61 |
62 |   # yaml formatting
63 |   - repo: https://github.com/pre-commit/mirrors-prettier
64 |     rev: v3.0.0-alpha.6
65 |     hooks:
66 |       - id: prettier
67 |         types: [yaml]
68 |
69 |   # markdown formatting
70 |   - repo: https://github.com/executablebooks/mdformat
71 |     rev: 0.7.16
72 |     hooks:
73 |       - id: mdformat
74 |         additional_dependencies:
75 |           - mdformat-gfm
76 |           #- mdformat-black
77 |           - mdformat_frontmatter
78 |         exclude: CHANGELOG.md
--------------------------------------------------------------------------------
/.vscode/extensions.json:
--------------------------------------------------------------------------------
1 | {
2 |   // See https://go.microsoft.com/fwlink/?LinkId=827846 to learn about workspace recommendations.
3 |   // Extension identifier format: ${publisher}.${name}. Example: vscode.csharp
4 |   // List of extensions which should be recommended for users of this workspace.
5 | "recommendations": [ 6 | "eamodio.gitlens", 7 | "ms-python.python", 8 | "ms-python.vscode-pylance", 9 | "ms-azuretools.vscode-docker", 10 | "ms-vscode-remote.remote-containers", 11 | "ms-vscode-remote.remote-ssh", 12 | "ms-vscode-remote.remote-ssh-edit", 13 | "ms-vscode-remote.remote-wsl", 14 | "ms-vscode-remote.vscode-remote-extensionpack", 15 | "redhat.vscode-yaml", 16 | "yzhang.markdown-all-in-one", 17 | "TrungNgo.autoflake", 18 | "Shan.code-settings-sync", 19 | "njpwerner.autodocstring", 20 | "jbockle.jbockle-format-files" 21 | ], 22 | // List of extensions recommended by VS Code that should not be recommended for users of this workspace. 23 | "unwantedRecommendations": [] 24 | } 25 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "Python: Current File", 9 | "type": "python", 10 | "request": "launch", 11 | "program": "${file}", 12 | "console": "integratedTerminal", 13 | "justMyCode": true 14 | }, 15 | { 16 | "name": "train.py", 17 | "type": "python", 18 | "request": "launch", 19 | "program": "src/train.py", 20 | "console": "integratedTerminal", 21 | "justMyCode": true, 22 | "args": [ 23 | "base=configs/train.yaml", 24 | "trainer.num_nodes=1", 25 | "trainer.devices=1", 26 | "data.train_params.batch_size=256", 27 | "model.optimizer_config.lr=1e-3" 28 | ] 29 | } 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "editor.defaultFormatter": "ms-python.black-formatter", 3 | "editor.formatOnPaste": true, 4 | "editor.formatOnSave": true, 5 | "editor.codeActionsOnSave": { 6 | "source.organizeImports": true 7 | }, 8 | "python.analysis.typeCheckingMode": "basic", 9 | "python.formatting.provider": "black", 10 | "python.formatting.blackArgs": [ 11 | "--line-length", 12 | "120" 13 | ], 14 | "python.linting.enabled": true, 15 | "python.linting.pylintEnabled": false, 16 | "python.linting.flake8Enabled": true, 17 | "python.linting.flake8Args": [ 18 | "--max-line-length=120", 19 | ], 20 | "python.testing.pytestArgs": [ 21 | "tests" 22 | ], 23 | "python.testing.unittestEnabled": false, 24 | "python.testing.pytestEnabled": true, 25 | "isort.args": [ 26 | "--profile", 27 | "black" 28 | ], 29 | } 30 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 
4 |
5 | Resources:
6 |
7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
--------------------------------------------------------------------------------
/License:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) Microsoft Corporation.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Autonomous Systems Research Group: ML Template
2 |
3 | This repository serves both as an onboarding document and as a template to quickstart machine learning experimentation at the [Autonomous Systems Research Group at Microsoft](https://www.microsoft.com/en-us/research/group/autonomous-systems-group-robotics/).
4 |
5 | **Note** Use the table of contents icon in the top-left corner of this document to get to a specific section quickly.
6 |
7 | ### Using this template to generate a repository:
8 |
9 | - Click the green **Use this template** button at the top right, and name your new repository.
10 | - You can clone your repo once it looks like [example_repo_generated_from_ml_template](https://github.com/AutonomousSystemsResearch/example_repo_generated_from_ml_template).
11 |
12 | > **Note** that after you create a repository from the template, it will take about **20 seconds** for an automated GitHub Action to clean up the generated repository using an auto-commit. Please ensure your repository looks like [example_repo_generated_from_ml_template](https://github.com/AutonomousSystemsResearch/example_repo_generated_from_ml_template) before cloning it.
13 |
14 | ## Introduction
15 |
16 | For the template repository, we will use:
17 |
18 | - [Pytorch Lightning](https://pytorch-lightning.readthedocs.io/en/stable/)
19 |   - For minimizing boilerplate code
20 | - [OmegaConf](https://omegaconf.readthedocs.io/)
21 |   - For config management
22 |   - Please go through [OmegaConf's github readme](https://github.com/omry/omegaconf#releases) for tutorials.
23 |   > **Note**: we have an [archived branch called `hydra`](https://github.com/AutonomousSystemsResearch/ml_template/tree/hydra) which uses [hydra](https://hydra.cc/) for config management.
24 | - Logging
25 |   - We primarily use tensorboard. Amulet automatically patches tensorboard scalars to MLFlow for viewing metrics in Azure ML Studio.
26 | - Conda and Docker
27 |   - For development
28 |
29 | ## Using this repository
30 |
31 | ### **Running locally**
32 |
33 | #### Setup
34 |
35 | - **VSCode**
36 |
37 |   - Extensions:
38 |
39 |     - Hit `Ctrl+Shift+P`, type `Show Recommended Extensions`, and install them from the sidebar.
40 |       Or click "yes" when you get a VS Code pop-up to install the recommended extensions, which are specified in [.vscode/extensions.json](.vscode/extensions.json).
41 |       Follow [this doc](https://code.visualstudio.com/docs/editor/extension-marketplace#_recommended-extensions) for more details.
42 |     - `Python`, `Pylance`, `Docker`, `GitLens`, `YAML`, and the [Remote development extension pack](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.vscode-remote-extensionpack) are strongly recommended.
43 |
44 |   - Debugging:
45 |
46 |     - Please follow [VSCode docs and tutorials](https://code.visualstudio.com/docs/python/debugging) on Python debugging.
47 |     - A minimal debugging configuration has been provided in [.vscode/launch.json](.vscode/launch.json). Please see VSCode docs on [launch.json configs](https://code.visualstudio.com/docs/python/debugging#_additional-configurations) and [config options](https://code.visualstudio.com/docs/python/debugging#_set-configuration-options).
48 |
49 | - **Conda**
50 |
51 |   - Recommended for local development and debugging.
52 |   - Note: For CUDA 11.6, see `Creating the conda environment from scratch (click to expand)` below.
53 |
54 |   ```
55 |   # create env
56 |   conda env create --file docker/environment.yml
57 |
58 |   # activate it
59 |   conda activate ml_template
60 |
61 |   # install this repo
62 |   (ml_template) $ pip install -e .
63 |
64 |   # install pre-commit (recommended). Scroll down to the #Developing section for details.
65 |   (ml_template) $ pre-commit install
66 |   ```
67 |
68 |   > **Note** If you install additional packages in your environment manually, you should update the `environment.yml` accordingly by running `$ conda env export | grep -v "^prefix: " > docker/environment.yml`.
69 |
70 | <details>
71 | 72 | Creating the conda environment from scratch (click to expand) 73 | 74 | 75 | ``` 76 | conda update -n base -c defaults conda 77 | conda create --name ml_template python=3.9 78 | conda activate ml_template 79 | conda install pip 80 | conda install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch 81 | conda install pytorch-lightning -c conda-forge 82 | pip install omegaconf \ 83 | pytest \ 84 | sh \ 85 | pre-commit \ 86 | mlflow \ 87 | azureml-mlflow \ 88 | azureml-core \ 89 | torch_tb_profiler \ 90 | opencv-python \ 91 | black isort flake8 \ 92 | psutil \ 93 | rich 94 | conda env export | grep -v "^prefix: " > docker/environment.yml 95 | pre-commit install 96 | pre-commit run --all-files 97 | pip install -e . 98 | ``` 99 | 100 | For CUDA 11.6: 101 | 102 | ``` 103 | conda update -n base -c defaults conda 104 | conda create --name ml_template_cu116 python=3.9 105 | conda activate ml_template_cu116 106 | conda install pip 107 | conda install pytorch torchvision torchaudio cudatoolkit=11.6 -c pytorch -c conda-forge 108 | pip install pytorch-lightning 109 | pip install omegaconf \ 110 | pytest \ 111 | sh \ 112 | pre-commit \ 113 | mlflow \ 114 | azureml-mlflow \ 115 | azureml-core \ 116 | torch_tb_profiler \ 117 | opencv-python \ 118 | black isort flake8 \ 119 | psutil \ 120 | rich 121 | conda env export | grep -v "^prefix: " > docker/environment_cu116.yml 122 | pre-commit install 123 | pre-commit run --all-files 124 | pip install -e . 125 | ``` 126 | 127 |
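Whichever route you take, a quick sanity check (suggested commands, not part of the template) confirms that the GPU build of pytorch is active in the new environment:

```
python -c "import torch; print(torch.__version__, torch.version.cuda, torch.cuda.is_available())"
python -c "import pytorch_lightning as pl; print(pl.__version__)"
```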
128 | 129 |
130 | 131 | Upgrading pytorch and cudatoolkit (click to expand) 132 | 133 | 134 | ``` 135 | conda remove pytorch torchvision torchaudio cudatoolkit 136 | # then follow pytorch installation steps, for example: 137 | conda install pytorch torchvision torchaudio cudatoolkit=11.6 -c pytorch -c conda-forge 138 | # then update pytorch lightning: 139 | pip install pytorch-lightning --upgrade 140 | pip install pytorch-lightning[extra] --upgrade 141 | pip install -U jsonargparse[signatures] --upgrade 142 | ``` 143 | 144 |
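After upgrading, remember to re-export the environment so that the docker images built from it stay in sync (same command as in the note above):

```
conda env export | grep -v "^prefix: " > docker/environment.yml
```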
145 |
146 | - **Docker**
147 |
148 |   - While submitting jobs to AzureML, we take our local conda environment and overlay it on an appropriate docker base image. For a new project / a custom conda environment, you can build the docker image locally as explained in a note later in this section. Optionally, the docker image building can be automated by CI (as explained later) if your project has a frequently updated conda environment.
149 |
150 |   - For `ml_template`, we have [three docker images](docker/) built automatically on each commit to the `main` branch or a branch corresponding to a Pull Request.
151 |     Docker images are pushed to the [PRIVATEAZURECONTAINERREGISTRYNAME](https://ms.portal.azure.com/#@microsoft.onmicrosoft.com/resource/subscriptions/964a24a8-8835-43c1-9633-7d78841facf1/resourceGroups/research_team/providers/Microsoft.ContainerRegistry/registries/PRIVATEAZURECONTAINERREGISTRYNAME/repository) container registry under [ml_template](https://ms.portal.azure.com/#view/Microsoft_Azure_ContainerRegistries/RepositoryBlade/id/%2Fsubscriptions%2F964a24a8-8835-43c1-9633-7d78841facf1%2FresourceGroups%2Fresearch_team%2Fproviders%2FMicrosoft.ContainerRegistry%2Fregistries%25PRIVATEAZURECONTAINERREGISTRYNAME/repository/ml_template).
152 |     To automate this for your repository generated from this template, please create an Azure Pipeline for it based on [azure-pipelines.yml](azure-pipelines.yml).
153 |
154 |   - The following tags correspond to the *latest commit on the main branch*.
155 |
156 |     | Tag | Dockerfile | docker pull command | Base Image |
157 |     | :-: | :-: | :-: | :-: |
158 |     | `latest` or `latest-azureml` | [azureml](docker/Dockerfile_base_azureml) | `docker pull PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest` | [mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.3-cudnn8-ubuntu20.04:latest](https://github.com/Azure/AzureML-Containers/tree/master/base/gpu/openmpi4.1.0-cuda11.3-cudnn8-ubuntu20.04) |
159 |     | `latest-nightly` or `latest-azureml-nightly` | [azureml_nightly](docker/Dockerfile_base_azureml_nightly) | `docker pull PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest-nightly` | [mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.3-cudnn8-ubuntu20.04:latest](https://github.com/Azure/AzureML-Containers/tree/master/base/gpu/openmpi4.1.0-cuda11.3-cudnn8-ubuntu20.04) |
160 |     | `latest-nvidia` | [nvidia](docker/Dockerfile_base_nvidia) | `docker pull PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest-nvidia` | [nvcr.io/nvidia/pytorch:22.06-py3](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html) |
161 |
162 |   - Building docker images and running docker containers locally can be useful to reproduce, on your local machine, issues which might occur while submitting to AzureML. Please peruse public documentation on docker + vscode.
163 |
164 |   ```
165 |   # pull image with [azureml image](https://hub.docker.com/_/microsoft-azureml?tab=description) as base with docker/environment.yml on top
166 |   docker pull PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest
167 |
168 |   # (optional) pull image with nvidia pytorch image as base
169 |   docker pull PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest-nvidia (for nvidia pytorch base image. See the note below for more details.)
170 |
171 |   # run image
172 |   docker run -it --gpus=all -v <local_dir>:<container_dir> PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest
173 |
174 |   # (optional but recommended) give your container a name
175 |   docker run -it --rm --name=MYFANCYCONTAINERNAME --gpus=all -v <local_dir>:<container_dir> PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest
176 |
177 |   # setup the repo (run inside the container)
178 |   pip install -e .
179 |
180 |   # install pre-commit (recommended). Scroll down to the "Developing" section for details.
181 |   pre-commit install
182 |   ```
183 |
184 |
<details>
185 |
186 | <summary>More details on docker image tags for Pull Request and main branch builds (click to expand)</summary>
187 |
188 | Similar to the `main` branch, for each pull request, we have:
189 |
190 | - `PR-<#pr_number>-latest` aka `PR-<#pr_number>-latest-azureml`
191 | - `PR-<#pr_number>-latest-nightly` aka `PR-<#pr_number>-latest-azureml-nightly`
192 | - `PR-<#pr_number>-latest-nvidia`
193 |
194 | And finally, for both `main` and PR branches, we have tags corresponding to git commit hashes:
195 |
196 | - `main-<short_git_hash>-azureml` and `PR-<#pr_number>-<short_git_hash>-azureml`
197 | - `main-<short_git_hash>-azureml-nightly` and `PR-<#pr_number>-<short_git_hash>-azureml-nightly`
198 | - `main-<short_git_hash>-nvidia` and `PR-<#pr_number>-<short_git_hash>-nvidia`
199 |
200 | For example:
201 |
202 | - `main-7fadad2b-azureml`, `main-7fadad2b-azureml-nightly`, `main-7fadad2b-nvidia`: correspond to [commit 7fadad2b](https://github.com/AutonomousSystemsResearch/ml_template/commit/7fadad2b1391cdbbc46422a6865caaf0300b9af8) on the `main` branch with our three different dockerfiles
203 | - `PR-50-latest-azureml`, `PR-50-latest-azureml-nightly`, `PR-50-latest-nvidia`: correspond to the latest commit on [PR#50](https://github.com/AutonomousSystemsResearch/ml_template/pull/50) with our three different dockerfiles
204 | - `PR-50-eef3b90-azureml`, `PR-50-eef3b90-azureml-nightly`, `PR-50-eef3b90-nvidia`: correspond to [commit eef3b90](https://github.com/AutonomousSystemsResearch/ml_template/pull/50/commits/eef3b900fc956614c7d45eac6fa9245b57f7bd72) on [PR#50](https://github.com/AutonomousSystemsResearch/ml_template/pull/50) with our three different dockerfiles
205 |
206 |
</details>
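For example, to smoke-test the image built for a pull request locally, pull it by one of the PR tags listed above (`PR-50` here; the registry name is the placeholder used throughout this README):

```
docker pull PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:PR-50-latest-azureml
docker run -it --rm --gpus=all PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:PR-50-latest-azureml
```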
207 |
208 | <summary>Building and understanding our Dockerfiles (click to expand)</summary>
209 |
210 |
211 |
212 | - We have four docker files:
213 |
214 |   - azureml base:
215 |     - [docker/Dockerfile_base_azureml](docker/Dockerfile_base_azureml)
216 |     - [docker/Dockerfile_base_azureml_cu116](docker/Dockerfile_base_azureml_cu116)
217 |     - [docker/Dockerfile_base_azureml_nightly](docker/Dockerfile_base_azureml_nightly)
218 |   - nvidia pytorch base:
219 |     - [docker/Dockerfile_base_nvidia](docker/Dockerfile_base_nvidia)
220 |
221 | - All of the azureml base images grab a base image from [here](https://github.com/Azure/AzureML-Containers/tree/master/base/gpu), and put the user's conda environment ([docker/environment.yml](docker/environment.yml), or [docker/environment_cu116.yml](docker/environment_cu116.yml) for the CUDA 11.6 variant) on top of the base image.
222 |
223 |   - In the `latest-azureml` version, packages in your local conda environment should match the docker image exactly.
224 |
225 |   - In the `latest-azureml-nightly` image, pytorch (including cudatoolkit) and pytorch lightning are updated to the nightly versions.
226 |
227 | - The nvidia pytorch base image grabs a base image from [here](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch/tags) ([here](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html) for details), which already has the latest version of pytorch.
228 |   Instead of using the user's conda environment, this dockerfile uses `pip` to install pytorch lightning and other dependencies on top of the base image. So this image can have different versions of packages compared to your conda environment.
229 |
230 | All docker images accept a build argument to update the base image version easily:
231 |
232 | - azureml images:
233 |   - take the base azureml image name's suffix **and** tag; see available options [here](https://github.com/Azure/AzureML-Containers/tree/master/base/gpu):
234 |     - examples: `openmpi4.1.0-cuda11.3-cudnn8-ubuntu20.04:latest`, `openmpi4.1.0-cuda11.6-cudnn8-ubuntu20.04:latest`, and so on.
235 | - nvidia pytorch image:
236 |   - takes the base nvidia image name's tag only.
237 |   - see [available tags here](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch/tags) and [the release notes for their contents](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html)
238 |   - examples: `22.06-py3`, `22.05-py3`, and so on.
239 |
240 | Please review the arguments in the dockerfiles carefully. These can also be seen by reading through [azure-pipelines.yml](azure-pipelines.yml).
241 |
242 | Building the azure-ml base + conda env images locally:
243 |
244 | ```
245 | cd docker;
246 |
247 | docker build \
248 |     -f Dockerfile_base_azureml \
249 |     --build-arg BASE_IMAGE=openmpi4.1.0-cuda11.3-cudnn8-ubuntu20.04:latest \
250 |     -t PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest-azureml .
251 |
252 | # note that in the PRIVATEAZURECONTAINERREGISTRYNAME acr, latest is equivalent to the latest-azureml tag. So, we can just re-tag the image:
253 | docker tag PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest-azureml PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest
254 | ```
255 |
256 | For the CUDA 11.6 version:
257 |
258 | ```
259 | cd docker;
260 |
261 | docker build \
262 |     -f Dockerfile_base_azureml_cu116 \
263 |     --build-arg BASE_IMAGE=openmpi4.1.0-cuda11.6-cudnn8-ubuntu20.04:latest \
264 |     -t PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest-azureml-cu116 .
265 |
266 | # note that in the PRIVATEAZURECONTAINERREGISTRYNAME acr, latest is equivalent to the latest-azureml tag. So, we can just re-tag the image:
267 | docker tag PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest-azureml-cu116 PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest-cu116
268 | ```
269 |
270 | Building the nvidia-pytorch image locally:
271 |
272 | ```
273 | # building the nvidia-pytorch image locally
274 | cd docker;
275 |
276 | docker build \
277 |     -f Dockerfile_base_nvidia \
278 |     --build-arg BASE_IMAGE=22.06-py3 \
279 |     -t PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest-nvidia .
280 | ```
281 |

</details>
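If you also want to push a locally built image to the registry (this assumes you have push permissions on the ACR; `az acr login` is the standard Azure CLI login step):

```
az acr login --name PRIVATEAZURECONTAINERREGISTRYNAME
docker push PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest-azureml
```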
282 | 283 |
284 |
285 | <summary>Developing inside docker containers with VSCode: (click to expand)</summary>
286 |
287 |
288 | - [Attach to a docker container](https://code.visualstudio.com/docs/remote/attach-container)
289 |
290 | - [Devcontainer](https://code.visualstudio.com/docs/remote/containers)
291 |
292 |   > **Note**: This method can be used on an Azure VM or locally with no change, and uses docker.
293 |
294 |   Follow the steps below:
295 |
296 |   - Connect to your remote Azure VM using VS Code
297 |   - Open the workspace within a docker container for development, either using the VS Code popup, or by searching for `(Re)Build and (Re)open in container` in the command palette (hit `Ctrl+Shift+P` to open the command palette)
298 |   - After setup is complete, it is time to set up the repository:
299 |     ```
300 |     pip install -e .
301 |     pre-commit install
302 |     ```
303 |   - > **Note**: By default, the devcontainer uses the [azureml-conda base image](docker/Dockerfile_base_azureml). We can also use the [nvidia base image](docker/Dockerfile_base_nvidia) by modifying the `dockerfile` line in [devcontainer.json](.devcontainer/devcontainer.json). Similarly, we can edit the docker build args there.
304 |
305 |

</details>
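Before running the MNIST example below, it may help to see the config plumbing in isolation. Here is a minimal sketch of the OmegaConf pattern that `src/train.py` presumably follows (the `base=` key and the `trainer.*` dotlist overrides match the commands in the next section; the actual parsing in `train.py` may differ):

```python
from omegaconf import OmegaConf

# parse dotlist args from sys.argv, e.g. `base=configs/train.yaml trainer.devices=4`
cli = OmegaConf.from_cli()

# load the base yaml, then merge the CLI overrides on top (CLI wins)
base = OmegaConf.load(cli.pop("base", "configs/train.yaml"))
cfg = OmegaConf.merge(base, cli)

print(cfg.trainer.devices)     # the override value, not the yaml default
print(OmegaConf.to_yaml(cfg))  # fully merged config, handy for logging
```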
306 |
307 | #### Running MNIST example
308 |
309 | - Understanding OmegaConf and config files
310 |
311 |   - Please review OmegaConf's [github readme](https://github.com/omry/omegaconf#releases) for [their documentation](https://omegaconf.readthedocs.io/en/2.2_branch/), [slides (for ver 2.1)](https://docs.google.com/presentation/d/e/2PACX-1vT_UIV7hCnquIbLUm4NnkUpXvPEh33IKiUEvPRF850WKA8opOlZOszjKdZ3tPmf8u7hGNP6HpqS-NT5/pub?start=false&loop=false&delayms=3000&slide=id.p), and a [live tutorial](https://github.com/omry/omegaconf#live-tutorial).
312 |
313 | - Single GPU
314 |
315 |   ```
316 |   python src/train.py base=configs/train.yaml trainer.num_nodes=1 trainer.devices=1
317 |   ```
318 |
319 | - Multiple GPUs
320 |
321 |   ```
322 |   python src/train.py base=configs/train.yaml trainer.num_nodes=1 trainer.devices=4
323 |   ```
324 |
325 | ### Running on Azure
326 |
327 | Note: This section relies on internal tools for job submission to Azure ML workspaces and is not supported publicly at the time of writing. However, one may peruse existing public documentation on Azure ML.
328 |
329 | ### Developing
330 |
331 | #### Tests
332 |
333 | The template has some basic tests in the `tests/` directory. To run them:
334 |
335 | ```
336 | # run all tests
337 | pytest
338 |
339 | # run single test
340 | pytest tests/test_dev_fast_run.py
341 | ```
342 |
343 | List of tests implemented:
344 |
345 | - [fast_dev_run](https://pytorch-lightning.readthedocs.io/en/stable/common/debugging.html#fast-dev-run): a simple check that runs your trainer on a single batch of the train, validation, and test datasets.
346 |   It is also useful for quickly checking that your code works while adding new features:
347 |   ```
348 |   python src/train.py base=configs/train.yaml trainer.fast_dev_run=True
349 |   ```
350 |
351 | #### Code formatting and Linting
352 |
353 | We use:
354 |
355 | - [black](https://black.readthedocs.io/en/stable/) for code formatting
356 |
357 | - [isort](https://pycqa.github.io/isort/) for import ordering
358 |
359 | - [pycln](https://hadialqattan.github.io/pycln/#/) for removing unused imports
360 |
361 | - Running locally:
362 |
363 |   ```
364 |   $ cd ml_template;
365 |   $ black .
366 |   $ isort .
367 |   $ pycln --all .
368 |   ```
369 |
370 | #### Pre-commit Hooks: Automating Code formatting and Linting
371 |
372 | [pre-commit](https://pre-commit.com/) hooks automate black autoformatting and ensure PEP8 compliance.
373 |
374 | - Setting up:
375 |
376 |   ```
377 |   $ cd ml_template;
378 |   $ pre-commit install
379 |   ```
380 |
381 | - Running:
382 |
383 |   After the above step, `pre-commit` will run **automatically** when you `git commit`.
384 |   If the run fails with errors in red, you can check the edits made by `pre-commit` with `git diff`.
385 |   If the changes look good, (1) `git add` those files again, and then (2) run `git commit` again.
386 |
387 |   Optionally, you can also run pre-commit manually by:
388 |
389 |   ```
390 |   $ pre-commit run --all-files
391 |   ```
392 |
393 | - Updating hooks:
394 |   Use the `autoupdate` command to keep the versions of the formatters in `.pre-commit-config.yaml` up to date.
395 |
396 |   ```
397 |   $ pre-commit autoupdate
398 |   ```
399 |
400 | ### Continuous Integration
401 |
402 | - **Github Actions**
403 |
404 |   - [Pre-commit checks](.github/workflows/pre-commit.yml)
405 |   - [Template cleanup](.github/workflows/template-cleanup.yml):
406 |     When a new repository is generated using this template, this action replaces `README.md` with `README_template.md` to keep microsoft links internal.
407 |
408 | - **Azure Pipelines**
409 |
410 |   - Create an Azure DevOps pipeline for your repository.
411 |     This automates the building of your docker images, and also runs pytest on them.
412 |
413 |   - The Azure Pipelines logs can be seen on the Azure DevOps webpage, but not directly in the GitHub UI.
414 |
415 |     Pull Request example:
416 |
417 |     - You can click `View more details on Azure Pipelines` under the `Checks` section of a GitHub PR.
418 |     - See [PR#6/checks](https://github.com/AutonomousSystemsResearch/ml_template/pull/6/checks) for an example.
419 |
420 |
421 |
422 |   - [Docker Build and Push Image](azure-pipelines.yml)
423 |
424 |     See the jobs under the stage `BuildDockerImagesAndRunPytest` in [azure-pipelines.yml](azure-pipelines.yml). They build the images from the Dockerfiles in [docker/](docker/) and push them to a private azure container registry.
425 |
426 |     See the docker section under [#running-locally](#running-locally) for details.
427 |
428 | ### Contributing
429 |
430 | - conda `environment.yml` update:
431 |
432 |   If you install packages in conda, update `docker/environment.yml` via `conda env export | grep -v "^prefix: " > docker/environment.yml`, and send a PR.
433 |
434 | ## Reference Repositories
435 |
436 | - Pytorch Lightning:
437 |
438 |   - Pytorch vs Pytorch Lightning
439 |
440 |     - [PyTorch Lightning for Dummies - A Tutorial and Overview](https://www.assemblyai.com/blog/pytorch-lightning-for-dummies/)
441 |     - [PyTorch Lightning: DataModules, Callbacks, TPU, and Loggers](https://dev.to/krypticmouse/pytorch-lightning-datamodules-callbacks-tpu-and-loggers-4nhb)
442 |
443 |   - Template / reference repositories
444 |
445 |     - https://github.com/ashleve/lightning-hydra-template
446 |     - https://github.com/lkhphuc/lightning-hydra-template
447 |     - [Pytorch lightning bolts](https://lightning-bolts.readthedocs.io/en/latest/)
448 |       - Look inside the code for datamodules, datasets, models, etc: https://github.com/PyTorchLightning/lightning-bolts/tree/master/pl_bolts
449 |
450 | - Pytorch Geometric:
451 |
452 |   - [lightning-examples](https://github.com/pyg-team/pytorch_geometric/tree/d451d6d20287b03cbe5036e5c53ee5f633f3c429/examples/pytorch_lightning)
453 |   - [torch_geometric.data.lightning_datamodule](https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/data/lightning_datamodule.html)
454 |   - [Graph Gym](https://pytorch-geometric.readthedocs.io/en/latest/notes/graphgym.html)
455 |
456 | - Pytorch data, datapipes, dataloaders:
457 |
458 |   - https://pytorch.org/data/main/examples.html
459 |   - https://github.com/tcapelle/torchdata
460 |   - https://github.com/pytorch/data
--------------------------------------------------------------------------------
/README_template.md:
--------------------------------------------------------------------------------
1 | # Name_of_Your_Project
2 |
3 | ## Setting up
4 |
5 | - Using conda
6 |
7 |   ```
8 |   # create env
9 |   conda env create --file docker/environment.yml
10 |
11 |   # activate it
12 |   conda activate NAMEOFYOURPROJECT
13 |
14 |   # install this repo
15 |   (NAMEOFYOURPROJECT) $ pip install -e .
16 |   ```
17 |
18 | - Using docker
19 |
20 |   ```
21 |   # pull image with [azureml image](https://hub.docker.com/_/microsoft-azureml?tab=description) as base with docker/environment.yml on top
22 |   docker pull NAMEOFYOURPROJECT:latest
23 |
24 |   # pull image with nvidia pytorch image as base
25 |   # docker pull NAMEOFYOURPROJECT:latest-nvidia
26 |
27 |   # run image
28 |   docker run -it --gpus=all -v <local_dir>:<container_dir> NAMEOFYOURPROJECT:latest
29 |   ```
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ## Security
4 |
5 | Microsoft takes the security of our software products and services seriously. This includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | - Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | - Full paths of source file(s) related to the manifestation of the issue 23 | - The location of the affected source code (tag/branch/commit or direct URL) 24 | - Any special configuration required to reproduce the issue 25 | - Step-by-step instructions to reproduce the issue 26 | - Proof-of-concept or exploit code (if possible) 27 | - Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 
40 | 41 | 42 | -------------------------------------------------------------------------------- /assets/images/table-of-contents.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/auto-sys-ml-template/8884d8f3b826cab300cd8f0e227fd7ad6b887da8/assets/images/table-of-contents.png -------------------------------------------------------------------------------- /azure-pipelines.yml: -------------------------------------------------------------------------------- 1 | trigger: 2 | - main 3 | # paths: 4 | # include: 5 | # - docker 6 | 7 | pr: 8 | - main 9 | 10 | resources: 11 | - repo: self 12 | 13 | variables: 14 | # Note: to customize the pipeline to use private ACRs other than commondockerimages, 15 | # you need to change BOTH dockerRegistryServiceConnection and containerRegistryName 16 | 17 | # Container registry service connection established during pipeline creation 18 | dockerRegistryServiceConnection: "442ea973-c852-4792-aa09-fab4a9df791f" 19 | containerRegistryName: "commondockerimages.azurecr.io" 20 | 21 | dockerfileazuremlPath: "$(Build.SourcesDirectory)/docker/Dockerfile_base_azureml" 22 | dockerfileazuremlnightlyPath: "$(Build.SourcesDirectory)/docker/Dockerfile_base_azureml_nightly" 23 | dockerfilenvidiaPath: "$(Build.SourcesDirectory)/docker/Dockerfile_base_nvidia" 24 | 25 | tagLatest: "latest" 26 | 27 | # Agent VM image name 28 | vmImageName: "ubuntu-latest" 29 | 30 | stages: 31 | # docs: https://docs.microsoft.com/azure/devops/pipelines/languages/docker 32 | - stage: BuildDockerImagesAndRunPytest 33 | displayName: Build docker; run pytests on built images 34 | jobs: 35 | - job: DefineDockerTags 36 | displayName: define docker tags 37 | pool: 38 | vmImage: $(vmImageName) 39 | steps: 40 | - bash: | 41 | github_organization_prefix="AutonomousSystemsResearch/" 42 | full_repo_name=$(Build.Repository.Name) 43 | repo_name=${full_repo_name#"$github_organization_prefix"} 44 | branch_name=$(Build.SourceBranchName) 45 | 46 | git_short_hash_main=`git rev-parse --short=7 HEAD` 47 | git_hash_pr=$(System.PullRequest.SourceCommitId) 48 | git_short_hash_pr=${git_hash_pr:0:7} 49 | pr_number=$(System.PullRequest.PullRequestNumber) 50 | 51 | tag_main_git_commit=main-$git_short_hash_main 52 | tag_pr_git_commit=PR-$pr_number-$git_short_hash_pr 53 | tag_pr_latest=PR-$pr_number-latest 54 | 55 | echo "" 56 | echo "full repo name: $(Build.Repository.Name)" 57 | echo "repo name: $repo_name" 58 | echo "Build Id: $(Build.BuildId)" 59 | echo "Build BuildNumber: $(Build.BuildNumber)" 60 | echo "Build Reason: $(Build.Reason)" 61 | echo "Build Branch Name: $(Build.SourceBranchName)" 62 | echo "git commit message: $(Build.SourceVersionMessage)" 63 | echo "git hash (main branch): $(Build.SourceVersion)" 64 | echo "git hash short (main branch): $git_short_hash_main" 65 | echo "PR branch: $(System.PullRequest.SourceBranch)" 66 | echo "PR number: $(System.PullRequest.PullRequestNumber)" 67 | echo "PR ID: $(System.PullRequest.PullRequestId)" 68 | echo "git hash (PR branch): $(System.PullRequest.SourceCommitId)" 69 | echo "git hash short (PR branch): : $git_short_hash_pr" 70 | 71 | # set pipeline variables which can be referenced in the jobs that follow to tag docker images appropriately 72 | echo "##vso[task.setvariable variable=repoName;isoutput=true]$repo_name" 73 | echo "##vso[task.setvariable variable=tagPRLatest;isoutput=true]$tag_pr_latest" 74 | 75 | if [[ "$branch_name" == "main" ]]; then 76 | echo "##vso[task.setvariable 
variable=tagOfThisBuild;isoutput=true]$tag_main_git_commit" 77 | fi 78 | if [[ "$branch_name" == "merge" ]]; then 79 | echo "##vso[task.setvariable variable=tagOfThisBuild;isoutput=true]$tag_pr_git_commit" 80 | fi 81 | 82 | ## deprecated; but might be of use in the future 83 | # echo "##vso[task.setvariable variable=tagMainGitCommitHash;isoutput=true]$tag_main_git_commit" 84 | # echo "##vso[task.setvariable variable=tagPRGitCommitHash;isoutput=true]$tag_pr_git_commit" 85 | 86 | # print tags: 87 | echo "tag_pr_latest: $tag_pr_latest" 88 | echo "tag_pr_git_commit: $tag_pr_git_commit" 89 | echo "tag_main_git_commit: $tag_main_git_commit" 90 | 91 | # print outputvars: 92 | echo "tag_pr_latest: $tag_pr_latest" 93 | echo "tag_pr_git_commit: $tag_pr_git_commit" 94 | echo "tag_main_git_commit: $tag_main_git_commit" 95 | name: DockerTagVars # because we're going to depend on it, we need to name the step 96 | displayName: (debug) print git info 97 | 98 | - job: BuildDockerAzureMLBase 99 | dependsOn: DefineDockerTags 100 | displayName: build azureml; run pytest 101 | pool: 102 | vmImage: $(vmImageName) 103 | variables: 104 | tagOfThisBuild: $[ dependencies.DefineDockerTags.outputs['DockerTagVars.tagOfThisBuild'] ] 105 | tagPRLatest: $[ dependencies.DefineDockerTags.outputs['DockerTagVars.tagPRLatest'] ] 106 | repoName: $[ dependencies.DefineDockerTags.outputs['DockerTagVars.repoName'] ] 107 | steps: 108 | - bash: | 109 | echo "repoName: $(repoName)" 110 | echo "tagOfThisBuild: $(tagOfThisBuild)" 111 | echo "tagPRLatest: $(tagPRLatest)" 112 | displayName: (debug) print pipeline vars 113 | 114 | - task: Docker@2 115 | displayName: Build and Push Image 116 | inputs: 117 | command: buildAndPush 118 | repository: $(repoName) 119 | dockerfile: $(dockerfileazuremlPath) 120 | containerRegistry: $(dockerRegistryServiceConnection) 121 | ${{ if eq(variables['Build.SourceBranchName'], 'merge') }}: 122 | tags: | 123 | $(tagOfThisBuild)-azureml 124 | $(tagPRLatest)-azureml 125 | $(tagPRLatest) 126 | ${{ if eq(variables['Build.SourceBranchName'], 'main') }}: 127 | tags: | 128 | $(tagOfThisBuild)-azureml 129 | $(tagLatest)-azureml 130 | $(tagLatest) 131 | 132 | - bash: | 133 | docker run -d -i --name testapp -v $(Build.SourcesDirectory):/workdir -w /workdir $(containerRegistryName)/$(repoName):$(tagOfThisBuild)-azureml 134 | docker exec testapp bash -c "python3 -c \"import torch; print('torch: version', torch.__version__)\"" 135 | docker exec testapp bash -c "python3 -c \"import pytorch_lightning; print('pytorch_lightning: version', pytorch_lightning.__version__)\"" 136 | docker exec testapp bash -c "python3 -c \"import torch; print('torch.cuda.version:', torch.version.cuda)\"" 137 | displayName: print versions 138 | 139 | - script: | 140 | docker exec testapp bash -c "pip install -e .; pytest tests/test_dev_fast_run.py -k test_fast_dev_run_cpu" 141 | displayName: pytest 142 | 143 | - job: BuildDockerAzureMLBaseNightly 144 | dependsOn: DefineDockerTags 145 | displayName: build azureml nightly; run pytest 146 | pool: 147 | vmImage: $(vmImageName) 148 | variables: 149 | tagOfThisBuild: $[ dependencies.DefineDockerTags.outputs['DockerTagVars.tagOfThisBuild'] ] 150 | tagPRLatest: $[ dependencies.DefineDockerTags.outputs['DockerTagVars.tagPRLatest'] ] 151 | repoName: $[ dependencies.DefineDockerTags.outputs['DockerTagVars.repoName'] ] 152 | steps: 153 | - bash: | 154 | echo "repoName: $(repoName)" 155 | echo "tagOfThisBuild: $(tagOfThisBuild)-azureml-nightly" 156 | echo "tagPRLatest: $(tagPRLatest)-azureml-nightly" 157 | 
displayName: (debug) print pipeline vars 158 | 159 | - task: Docker@2 160 | displayName: Build and Push Image 161 | inputs: 162 | command: buildAndPush 163 | repository: $(repoName) 164 | dockerfile: $(dockerfileazuremlnightlyPath) 165 | containerRegistry: $(dockerRegistryServiceConnection) 166 | ${{ if eq(variables['Build.SourceBranchName'], 'merge') }}: 167 | tags: | 168 | $(tagOfThisBuild)-azureml-nightly 169 | $(tagPRLatest)-azureml-nightly 170 | ${{ if eq(variables['Build.SourceBranchName'], 'main') }}: 171 | tags: | 172 | $(tagOfThisBuild)-azureml-nightly 173 | $(tagLatest)-azureml-nightly 174 | 175 | - bash: | 176 | docker run -d -i --name testapp -v $(Build.SourcesDirectory):/workdir -w /workdir $(containerRegistryName)/$(repoName):$(tagOfThisBuild)-azureml-nightly 177 | docker exec testapp bash -c "python3 -c \"import torch; print('torch: version', torch.__version__)\"" 178 | docker exec testapp bash -c "python3 -c \"import pytorch_lightning; print('pytorch_lightning: version', pytorch_lightning.__version__)\"" 179 | docker exec testapp bash -c "python3 -c \"import torch; print('torch.cuda.version:', torch.version.cuda)\"" 180 | displayName: print versions 181 | 182 | - script: | 183 | docker exec testapp bash -c "pip install -e .; pytest tests/test_dev_fast_run.py -k test_fast_dev_run_cpu" 184 | displayName: pytest 185 | 186 | - job: BuildDockerNvidiaBasePipInstall 187 | dependsOn: DefineDockerTags 188 | displayName: build nvidia pytorch; run pytest 189 | pool: 190 | vmImage: $(vmImageName) 191 | variables: 192 | tagOfThisBuild: $[ dependencies.DefineDockerTags.outputs['DockerTagVars.tagOfThisBuild'] ] 193 | tagPRLatest: $[ dependencies.DefineDockerTags.outputs['DockerTagVars.tagPRLatest'] ] 194 | repoName: $[ dependencies.DefineDockerTags.outputs['DockerTagVars.repoName'] ] 195 | steps: 196 | - bash: | 197 | echo "repoName: $(repoName)" 198 | echo "tagOfThisBuild: $(tagOfThisBuild)-nvidia" 199 | echo "tagPRLatest: $(tagPRLatest)-nvidia" 200 | 201 | # echo "tagMainGitCommitHash: $(tagMainGitCommitHash)" 202 | # echo "tagPRGitCommitHash: $(tagPRGitCommitHash)" 203 | displayName: (debug) print pipeline vars 204 | 205 | - task: Docker@2 206 | displayName: Build and Push Image 207 | inputs: 208 | command: buildAndPush 209 | repository: $(repoName) 210 | dockerfile: $(dockerfilenvidiaPath) 211 | containerRegistry: $(dockerRegistryServiceConnection) 212 | ${{ if eq(variables['Build.SourceBranchName'], 'merge') }}: 213 | tags: | 214 | $(tagOfThisBuild)-nvidia 215 | $(tagPRLatest)-nvidia 216 | ${{ if eq(variables['Build.SourceBranchName'], 'main') }}: 217 | tags: | 218 | $(tagOfThisBuild)-nvidia 219 | $(tagLatest)-nvidia 220 | 221 | - bash: | 222 | docker run -d -i --name testapp -v $(Build.SourcesDirectory):/workdir -w /workdir $(containerRegistryName)/$(repoName):$(tagOfThisBuild)-nvidia 223 | docker exec testapp bash -c "python3 -c \"import torch; print('torch: version', torch.__version__)\"" 224 | docker exec testapp bash -c "python3 -c \"import pytorch_lightning; print('pytorch_lightning: version', pytorch_lightning.__version__)\"" 225 | docker exec testapp bash -c "python3 -c \"import torch; print('torch.cuda.version:', torch.version.cuda)\"" 226 | displayName: print versions 227 | 228 | - script: | 229 | docker exec testapp bash -c "pip install -e .; pytest tests/test_dev_fast_run.py -k test_fast_dev_run_cpu" 230 | displayName: pytest 231 | 232 | - job: PytestAzureMLBaseTagLatest 233 | dependsOn: DefineDockerTags 234 | displayName: pytest latest-azureml 235 | pool: 236 | vmImage: 
$(vmImageName) 237 | variables: 238 | repoName: $[ dependencies.DefineDockerTags.outputs['DockerTagVars.repoName'] ] 239 | steps: 240 | - checkout: self 241 | 242 | - task: Docker@2 243 | displayName: Login to ACR 244 | inputs: 245 | command: login 246 | containerRegistry: $(dockerRegistryServiceConnection) 247 | 248 | - script: | 249 | docker run -d -i --name testapp -v $(Build.SourcesDirectory):/workdir -w /workdir $(containerRegistryName)/$(repoName):$(tagLatest)-azureml 250 | displayName: docker pull and run 251 | 252 | - script: | 253 | docker exec testapp bash -c "pip install -e .; pytest tests/test_dev_fast_run.py -k test_fast_dev_run_cpu" 254 | displayName: pytest 255 | 256 | - job: PytestAzureMLBaseTagLatestNightly 257 | dependsOn: DefineDockerTags 258 | displayName: pytest latest-azureml-nightly 259 | pool: 260 | vmImage: $(vmImageName) 261 | variables: 262 | repoName: $[ dependencies.DefineDockerTags.outputs['DockerTagVars.repoName'] ] 263 | steps: 264 | - checkout: self 265 | 266 | - task: Docker@2 267 | displayName: Login to ACR 268 | inputs: 269 | command: login 270 | containerRegistry: $(dockerRegistryServiceConnection) 271 | 272 | - script: | 273 | docker run -d -i --name testapp -v $(Build.SourcesDirectory):/workdir -w /workdir $(containerRegistryName)/$(repoName):$(tagLatest)-azureml-nightly 274 | displayName: docker pull and run 275 | 276 | - script: | 277 | docker exec testapp bash -c "pip install -e .; pytest tests/test_dev_fast_run.py -k test_fast_dev_run_cpu" 278 | displayName: pytest 279 | 280 | - job: PytestNvidiaBaseTagLatest 281 | dependsOn: DefineDockerTags 282 | displayName: pytest latest-nvidia 283 | pool: 284 | vmImage: $(vmImageName) 285 | variables: 286 | repoName: $[ dependencies.DefineDockerTags.outputs['DockerTagVars.repoName'] ] 287 | steps: 288 | - checkout: self 289 | 290 | - task: Docker@2 291 | displayName: Login to ACR 292 | inputs: 293 | command: login 294 | containerRegistry: $(dockerRegistryServiceConnection) 295 | 296 | - script: | 297 | docker run -d -i --name testapp -v $(Build.SourcesDirectory):/workdir -w /workdir $(containerRegistryName)/$(repoName):$(tagLatest)-nvidia 298 | displayName: docker pull and run 299 | 300 | - script: | 301 | docker exec testapp bash -c "pip install -e .; pytest tests/test_dev_fast_run.py -k test_fast_dev_run_cpu" 302 | displayName: pytest 303 | -------------------------------------------------------------------------------- /configs/train.yaml: -------------------------------------------------------------------------------- 1 | seed_everything: 42 2 | 3 | trainer: 4 | default_root_dir: ${oc.env:AMLT_OUTPUT_DIR,outputs} 5 | 6 | num_nodes: 1 7 | devices: 1 8 | accelerator: gpu 9 | strategy: ddp_find_unused_parameters_false 10 | 11 | min_epochs: 1 12 | max_epochs: 10 13 | enable_progress_bar: true 14 | 15 | sync_batchnorm: True 16 | enable_checkpointing: True 17 | resume_from_checkpoint: null 18 | 19 | # debugging 20 | fast_dev_run: false 21 | 22 | data: 23 | _target_: datamodules.mnist_datamodule.MNISTDataModule 24 | 25 | file_params: 26 | base_dir: data/ 27 | train_val_test_split: [55_000, 5_000, 10_000] 28 | 29 | train_params: 30 | batch_size: 128 31 | num_workers: 0 32 | pin_memory: False 33 | 34 | model: 35 | _target_: models.mnist_module.MNISTLitModule 36 | 37 | mlp_config: 38 | input_size: 784 39 | lin1_size: 256 40 | lin2_size: 256 41 | lin3_size: 256 42 | output_size: 10 43 | 44 | optimizer_config: 45 | lr: 0.001 46 | weight_decay: 0.0005 47 | 48 | logger: 49 | tensorboard: 50 | _target_: 
pytorch_lightning.loggers.tensorboard.TensorBoardLogger
51 |     save_dir: ${trainer.default_root_dir}/logs
52 |     name: null
53 |     version: null
54 |     log_graph: False
55 |     default_hp_metric: True
56 |     prefix: ""
57 |
58 | callbacks:
59 |   checkpoint:
60 |     _target_: pytorch_lightning.callbacks.ModelCheckpoint
61 |     dirpath: "${trainer.default_root_dir}/checkpoints/"
62 |     monitor: "val/acc" # name of the logged metric which determines when model is improving
63 |     mode: "max" # "max" means higher metric value is better, can be also "min"
64 |     save_top_k: 1 # save k best models (determined by above metric)
65 |     save_last: True # additionally, always save the model from the last epoch
66 |     verbose: False
67 |     filename: "epoch_{epoch:03d}"
68 |     auto_insert_metric_name: False
69 |
70 |   early_stopping:
71 |     _target_: pytorch_lightning.callbacks.EarlyStopping
72 |     monitor: "val/loss" # name of the logged metric which determines when model is improving
73 |     mode: "min" # "min" means lower metric value is better, can be also "max"
74 |     patience: 100 # how many validation epochs of not improving until training stops
75 |     min_delta: 0 # minimum change in the monitored metric needed to qualify as an improvement
76 |
77 |   model_summary:
78 |     _target_: pytorch_lightning.callbacks.RichModelSummary
79 |     max_depth: -1
80 |
81 |   progress:
82 |     _target_: pytorch_lightning.callbacks.RichProgressBar
83 |
84 |   lr_mon:
85 |     _target_: pytorch_lightning.callbacks.LearningRateMonitor
86 |     logging_interval: "epoch"
--------------------------------------------------------------------------------
/docker/Dockerfile_base_azureml:
--------------------------------------------------------------------------------
1 | # see latest azureml base images tags here
2 | # - https://github.com/Azure/AzureML-Containers/tree/master/base/gpu
3 | # - https://hub.docker.com/_/microsoft-azureml?tab=description
4 |
5 | ARG BASE_IMAGE=openmpi4.1.0-cuda11.3-cudnn8-ubuntu20.04:latest
6 |
7 | FROM mcr.microsoft.com/azureml/${BASE_IMAGE}
8 |
9 | ARG DEBIAN_FRONTEND=noninteractive
10 |
11 | RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
12 |     build-essential \
13 |     software-properties-common \
14 |     cmake \
15 |     g++-7 \
16 |     git \
17 |     gpg \
18 |     curl \
19 |     vim \
20 |     wget \
21 |     ca-certificates \
22 |     libjpeg-dev \
23 |     libpng-dev \
24 |     librdmacm1 \
25 |     libibverbs1 \
26 |     ibverbs-providers \
27 |     openssh-client \
28 |     openssh-server \
29 |     libsm6 \
30 |     libxext6 \
31 |     ffmpeg \
32 |     libfontconfig1 \
33 |     libxrender1 \
34 |     libgl1-mesa-glx &&\
35 |     apt-get clean && rm -rf /var/lib/apt/lists/*
36 |
37 | ADD environment.yml /tmp/environment.yml
38 | RUN conda env update -n base -f /tmp/environment.yml
--------------------------------------------------------------------------------
/docker/Dockerfile_base_azureml_cu116:
--------------------------------------------------------------------------------
1 | # see latest azureml base images tags here
2 | # - https://github.com/Azure/AzureML-Containers/tree/master/base/gpu
3 | # - https://hub.docker.com/_/microsoft-azureml?tab=description
4 |
5 | ARG BASE_IMAGE=openmpi4.1.0-cuda11.6-cudnn8-ubuntu20.04:latest
6 |
7 | FROM mcr.microsoft.com/azureml/${BASE_IMAGE}
8 |
9 | ARG DEBIAN_FRONTEND=noninteractive
10 |
11 | RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
12 |     build-essential \
13 |     software-properties-common \
14 |     cmake \
15 |     g++-7 \
16 |     git \
17 |     gpg \
18 |     curl \
19 
| vim \ 20 | wget \ 21 | ca-certificates \ 22 | libjpeg-dev \ 23 | libpng-dev \ 24 | librdmacm1 \ 25 | libibverbs1 \ 26 | ibverbs-providers \ 27 | openssh-client \ 28 | openssh-server \ 29 | libsm6 \ 30 | libxext6 \ 31 | ffmpeg \ 32 | libfontconfig1 \ 33 | libxrender1 \ 34 | libgl1-mesa-glx &&\ 35 | apt-get clean && rm -rf /var/lib/apt/lists/* 36 | 37 | ADD environment_cu116.yml /tmp/environment.yml 38 | RUN conda env update -n base -f /tmp/environment.yml 39 | -------------------------------------------------------------------------------- /docker/Dockerfile_base_azureml_nightly: -------------------------------------------------------------------------------- 1 | # see latest azureml base images tags here 2 | # - https://github.com/Azure/AzureML-Containers/tree/master/base/gpu 3 | # - https://hub.docker.com/_/microsoft-azureml?tab=description 4 | 5 | ARG BASE_IMAGE=openmpi4.1.0-cuda11.3-cudnn8-ubuntu20.04:latest 6 | 7 | FROM mcr.microsoft.com/azureml/${BASE_IMAGE} 8 | 9 | ARG DEBIAN_FRONTEND=noninteractive 10 | 11 | RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \ 12 | build-essential \ 13 | software-properties-common \ 14 | cmake \ 15 | g++-7 \ 16 | git \ 17 | gpg \ 18 | curl \ 19 | vim \ 20 | wget \ 21 | ca-certificates \ 22 | libjpeg-dev \ 23 | libpng-dev \ 24 | librdmacm1 \ 25 | libibverbs1 \ 26 | ibverbs-providers \ 27 | openssh-client \ 28 | openssh-server \ 29 | libsm6 \ 30 | libxext6 \ 31 | ffmpeg \ 32 | libfontconfig1 \ 33 | libxrender1 \ 34 | libgl1-mesa-glx &&\ 35 | apt-get clean && rm -rf /var/lib/apt/lists/* 36 | 37 | # use user's conda env as base 38 | ADD environment.yml /tmp/environment.yml 39 | RUN conda env update -n base -f /tmp/environment.yml 40 | 41 | # update pytorch installed from the above step to nightly 42 | RUN conda update pytorch torchvision torchaudio -c pytorch-nightly -y 43 | 44 | # install pytorch lightning nightly 45 | RUN pip install https://github.com/PyTorchLightning/pytorch-lightning/archive/master.zip && \ 46 | pip install jsonargparse[signatures] --upgrade 47 | 48 | # (optional) update all conda pkgs 49 | # RUN conda update --all 50 | -------------------------------------------------------------------------------- /docker/Dockerfile_base_nvidia: -------------------------------------------------------------------------------- 1 | # tags release notes: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html 2 | # tags: https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch/tags?quick-deploy=false 3 | 4 | ARG BASE_IMAGE=22.06-py3 5 | 6 | FROM nvcr.io/nvidia/pytorch:${BASE_IMAGE} 7 | 8 | ARG DEBIAN_FRONTEND=noninteractive 9 | 10 | RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \ 11 | build-essential \ 12 | software-properties-common \ 13 | cmake \ 14 | g++-7 \ 15 | git \ 16 | gpg \ 17 | curl \ 18 | vim \ 19 | wget \ 20 | ca-certificates \ 21 | libjpeg-dev \ 22 | libpng-dev \ 23 | librdmacm1 \ 24 | libibverbs1 \ 25 | ibverbs-providers \ 26 | openssh-client \ 27 | openssh-server \ 28 | libsm6 \ 29 | libxext6 \ 30 | ffmpeg \ 31 | libfontconfig1 \ 32 | libxrender1 \ 33 | libgl1-mesa-glx &&\ 34 | apt-get clean && rm -rf /var/lib/apt/lists/* 35 | 36 | RUN pip install click termcolor future python-dateutil \ 37 | azureml-core azureml-mlflow \ 38 | opencv-python scipy psutil 39 | 40 | # jsonargparse[signatures] does not work in docker, so need lightning[extra] 41 | # in conda, 
jsonargparse[signatures] is enough 42 | RUN pip install pytorch-lightning[extra] einops pre-commit pytest sh rich 43 | # RUN pip install pytorch-lightning jsonargparse[signatures] einops 44 | -------------------------------------------------------------------------------- /docker/environment.yml: -------------------------------------------------------------------------------- 1 | name: ml_template 2 | channels: 3 | - pytorch 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - _libgcc_mutex=0.1=main 8 | - _openmp_mutex=5.1=1_gnu 9 | - absl-py=1.2.0=pyhd8ed1ab_0 10 | - aiohttp=3.8.1=py39hb9d737c_1 11 | - aiosignal=1.2.0=pyhd8ed1ab_0 12 | - async-timeout=4.0.2=pyhd8ed1ab_0 13 | - attrs=22.1.0=pyh71513ae_1 14 | - blas=1.0=mkl 15 | - blinker=1.4=py_1 16 | - brotlipy=0.7.0=py39h27cfd23_1003 17 | - bzip2=1.0.8=h7b6447c_0 18 | - c-ares=1.18.1=h7f98852_0 19 | - ca-certificates=2022.9.24=ha878542_0 20 | - cachetools=5.2.0=pyhd8ed1ab_0 21 | - certifi=2022.9.24=pyhd8ed1ab_0 22 | - cffi=1.15.1=py39h74dc2b5_0 23 | - charset-normalizer=2.0.4=pyhd3eb1b0_0 24 | - click=8.1.3=py39hf3d152e_0 25 | - colorama=0.4.5=pyhd8ed1ab_0 26 | - cryptography=37.0.1=py39h9ce1e76_0 27 | - cudatoolkit=11.3.1=h2bc3f7f_2 28 | - ffmpeg=4.3=hf484d3e_0 29 | - freetype=2.11.0=h70c0345_0 30 | - frozenlist=1.2.0=py39h7f8727e_0 31 | - fsspec=2022.8.2=pyhd8ed1ab_0 32 | - giflib=5.2.1=h7b6447c_0 33 | - gmp=6.2.1=h295c915_3 34 | - gnutls=3.6.15=he1e5248_0 35 | - google-auth=2.11.1=pyh1a96a4e_0 36 | - google-auth-oauthlib=0.4.6=pyhd8ed1ab_0 37 | - grpcio=1.42.0=py39hce63b2e_0 38 | - idna=3.3=pyhd3eb1b0_0 39 | - importlib-metadata=4.11.4=py39hf3d152e_0 40 | - intel-openmp=2021.4.0=h06a4308_3561 41 | - jpeg=9e=h7f8727e_0 42 | - lame=3.100=h7b6447c_0 43 | - lcms2=2.12=h3be6417_0 44 | - ld_impl_linux-64=2.38=h1181459_1 45 | - lerc=3.0=h295c915_0 46 | - libdeflate=1.8=h7f8727e_5 47 | - libffi=3.3=he6710b0_2 48 | - libgcc-ng=11.2.0=h1234567_1 49 | - libgomp=11.2.0=h1234567_1 50 | - libiconv=1.16=h7f8727e_2 51 | - libidn2=2.3.2=h7f8727e_0 52 | - libpng=1.6.37=hbc83047_0 53 | - libprotobuf=3.15.8=h780b84a_1 54 | - libstdcxx-ng=11.2.0=h1234567_1 55 | - libtasn1=4.16.0=h27cfd23_0 56 | - libtiff=4.4.0=hecacb30_0 57 | - libunistring=0.9.10=h27cfd23_0 58 | - libwebp=1.2.2=h55f646e_0 59 | - libwebp-base=1.2.2=h7f8727e_0 60 | - lz4-c=1.9.3=h295c915_1 61 | - markdown=3.4.1=pyhd8ed1ab_0 62 | - markupsafe=2.1.1=py39hb9d737c_1 63 | - mkl=2021.4.0=h06a4308_640 64 | - mkl-service=2.4.0=py39h7f8727e_0 65 | - mkl_fft=1.3.1=py39hd3c417c_0 66 | - mkl_random=1.2.2=py39h51133e4_0 67 | - multidict=6.0.2=py39hb9d737c_1 68 | - ncurses=6.3=h5eee18b_3 69 | - nettle=3.7.3=hbbd107a_1 70 | - numpy=1.23.1=py39h6c91a56_0 71 | - numpy-base=1.23.1=py39ha15fc14_0 72 | - oauthlib=3.2.1=pyhd8ed1ab_0 73 | - openh264=2.1.1=h4ff587b_0 74 | - openssl=1.1.1q=h7f8727e_0 75 | - packaging=21.3=pyhd8ed1ab_0 76 | - pillow=9.2.0=py39hace64e9_1 77 | - pip=22.1.2=py39h06a4308_0 78 | - protobuf=3.15.8=py39he80948d_0 79 | - pyasn1=0.4.8=py_0 80 | - pyasn1-modules=0.2.7=py_0 81 | - pycparser=2.21=pyhd3eb1b0_0 82 | - pydeprecate=0.3.2=pyhd8ed1ab_0 83 | - pyjwt=2.5.0=pyhd8ed1ab_0 84 | - pyopenssl=22.0.0=pyhd3eb1b0_0 85 | - pyparsing=3.0.9=pyhd8ed1ab_0 86 | - pysocks=1.7.1=py39h06a4308_0 87 | - python=3.9.13=haa1d7c7_1 88 | - python_abi=3.9=2_cp39 89 | - pytorch=1.12.1=py3.9_cuda11.3_cudnn8.3.2_0 90 | - pytorch-lightning=1.7.7=pyhd8ed1ab_0 91 | - pytorch-mutex=1.0=cuda 92 | - pyu2f=0.1.5=pyhd8ed1ab_0 93 | - pyyaml=6.0=py39hb9d737c_4 94 | - readline=8.1.2=h7f8727e_1 95 | - 
requests=2.28.1=py39h06a4308_0 96 | - requests-oauthlib=1.3.1=pyhd8ed1ab_0 97 | - rsa=4.9=pyhd8ed1ab_0 98 | - setuptools=63.4.1=py39h06a4308_0 99 | - six=1.16.0=pyhd3eb1b0_1 100 | - sqlite=3.39.3=h5082296_0 101 | - tensorboard=2.10.1=pyhd8ed1ab_0 102 | - tensorboard-data-server=0.6.0=py39hd97740a_2 103 | - tensorboard-plugin-wit=1.8.1=pyhd8ed1ab_0 104 | - tk=8.6.12=h1ccaba5_0 105 | - torchaudio=0.12.1=py39_cu113 106 | - torchmetrics=0.9.3=pyhd8ed1ab_0 107 | - torchvision=0.13.1=py39_cu113 108 | - tqdm=4.64.1=pyhd8ed1ab_0 109 | - typing-extensions=4.3.0=py39h06a4308_0 110 | - typing_extensions=4.3.0=py39h06a4308_0 111 | - tzdata=2022c=h04d1e81_0 112 | - urllib3=1.26.11=py39h06a4308_0 113 | - werkzeug=2.2.2=pyhd8ed1ab_0 114 | - wheel=0.37.1=pyhd3eb1b0_0 115 | - xz=5.2.6=h5eee18b_0 116 | - yaml=0.2.5=h7f98852_2 117 | - yarl=1.7.2=py39hb9d737c_2 118 | - zipp=3.8.1=pyhd8ed1ab_0 119 | - zlib=1.2.12=h5eee18b_3 120 | - zstd=1.5.2=ha4553b6_0 121 | - pip: 122 | - adal==1.2.7 123 | - alembic==1.8.1 124 | - antlr4-python3-runtime==4.9.3 125 | - argcomplete==2.0.0 126 | - azure-common==1.1.28 127 | - azure-core==1.25.1 128 | - azure-graphrbac==0.61.1 129 | - azure-identity==1.11.0 130 | - azure-mgmt-authorization==2.0.0 131 | - azure-mgmt-containerregistry==10.0.0 132 | - azure-mgmt-core==1.3.2 133 | - azure-mgmt-keyvault==10.1.0 134 | - azure-mgmt-resource==21.1.0 135 | - azure-mgmt-storage==20.0.0 136 | - azure-storage-blob==12.13.0 137 | - azureml-core==1.45.0.post2 138 | - azureml-mlflow==1.45.0 139 | - backports-tempfile==1.0 140 | - backports-weakref==1.0.post1 141 | - bcrypt==4.0.0 142 | - black==22.8.0 143 | - cfgv==3.3.1 144 | - cloudpickle==2.2.0 145 | - commonmark==0.9.1 146 | - contextlib2==21.6.0 147 | - databricks-cli==0.17.3 148 | - distlib==0.3.6 149 | - docker==5.0.3 150 | - filelock==3.8.0 151 | - flake8==5.0.4 152 | - flask==2.2.2 153 | - gitdb==4.0.9 154 | - gitpython==3.1.27 155 | - greenlet==1.1.3 156 | - gunicorn==20.1.0 157 | - humanfriendly==10.0 158 | - identify==2.5.5 159 | - iniconfig==1.1.1 160 | - isodate==0.6.1 161 | - isort==5.10.1 162 | - itsdangerous==2.1.2 163 | - jeepney==0.8.0 164 | - jmespath==1.0.1 165 | - jsonpickle==2.2.0 166 | - knack==0.9.0 167 | - mako==1.2.3 168 | - mccabe==0.7.0 169 | - mlflow==1.29.0 170 | - mlflow-skinny==1.29.0 171 | - msal==1.19.0 172 | - msal-extensions==1.0.0 173 | - msrest==0.7.1 174 | - msrestazure==0.6.4 175 | - mypy-extensions==0.4.3 176 | - ndg-httpsclient==0.5.1 177 | - nodeenv==1.7.0 178 | - omegaconf==2.2.3 179 | - opencv-python==4.6.0.66 180 | - pandas==1.5.0 181 | - paramiko==2.11.0 182 | - pathspec==0.10.1 183 | - pkginfo==1.8.3 184 | - platformdirs==2.5.2 185 | - pluggy==1.0.0 186 | - portalocker==2.5.1 187 | - pre-commit==2.20.0 188 | - prometheus-flask-exporter==0.20.3 189 | - psutil==5.9.2 190 | - py==1.11.0 191 | - pycodestyle==2.9.1 192 | - pyflakes==2.5.0 193 | - pynacl==1.5.0 194 | - pytest==7.1.3 195 | - pytz==2022.2.1 196 | - querystring-parser==1.2.4 197 | - rich==12.5.1 198 | - scipy==1.9.1 199 | - secretstorage==3.3.3 200 | - sh==1.14.3 201 | - smmap==5.0.0 202 | - sqlalchemy==1.4.41 203 | - sqlparse==0.4.3 204 | - tabulate==0.8.10 205 | - toml==0.10.2 206 | - tomli==2.0.1 207 | - torch-tb-profiler==0.4.0 208 | - types-cryptography==3.3.23 209 | - virtualenv==20.16.5 210 | - websocket-client==1.4.1 211 | -------------------------------------------------------------------------------- /docker/environment_cu116.yml: -------------------------------------------------------------------------------- 1 | name: 
ml_template_cu_116 2 | channels: 3 | - pytorch 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - _libgcc_mutex=0.1=main 8 | - _openmp_mutex=5.1=1_gnu 9 | - absl-py=1.2.0=pyhd8ed1ab_0 10 | - aiohttp=3.8.1=py39hb9d737c_1 11 | - aiosignal=1.2.0=pyhd8ed1ab_0 12 | - async-timeout=4.0.2=pyhd8ed1ab_0 13 | - attrs=22.1.0=pyh71513ae_1 14 | - blas=1.0=mkl 15 | - blinker=1.4=py_1 16 | - brotlipy=0.7.0=py39hb9d737c_1004 17 | - bzip2=1.0.8=h7f98852_4 18 | - c-ares=1.18.1=h7f98852_0 19 | - ca-certificates=2022.9.24=ha878542_0 20 | - cachetools=5.2.0=pyhd8ed1ab_0 21 | - certifi=2022.9.24=pyhd8ed1ab_0 22 | - cffi=1.14.6=py39he32792d_0 23 | - charset-normalizer=2.1.1=pyhd8ed1ab_0 24 | - click=8.1.3=py39hf3d152e_0 25 | - colorama=0.4.5=pyhd8ed1ab_0 26 | - cryptography=37.0.2=py39hd97740a_0 27 | - cudatoolkit=11.6.0=hecad31d_10 28 | - ffmpeg=4.3=hf484d3e_0 29 | - freetype=2.10.4=h0708190_1 30 | - frozenlist=1.2.0=py39h7f8727e_0 31 | - fsspec=2022.8.2=pyhd8ed1ab_0 32 | - gmp=6.2.1=h58526e2_0 33 | - gnutls=3.6.13=h85f3911_1 34 | - google-auth=2.11.1=pyh1a96a4e_0 35 | - google-auth-oauthlib=0.4.6=pyhd8ed1ab_0 36 | - grpcio=1.42.0=py39hce63b2e_0 37 | - idna=3.4=pyhd8ed1ab_0 38 | - importlib-metadata=4.11.4=py39hf3d152e_0 39 | - intel-openmp=2021.4.0=h06a4308_3561 40 | - jpeg=9e=h166bdaf_1 41 | - lame=3.100=h7f98852_1001 42 | - lcms2=2.12=hddcbb42_0 43 | - ld_impl_linux-64=2.38=h1181459_1 44 | - libffi=3.3=he6710b0_2 45 | - libgcc-ng=11.2.0=h1234567_1 46 | - libgomp=11.2.0=h1234567_1 47 | - libiconv=1.17=h166bdaf_0 48 | - libpng=1.6.37=h21135ba_2 49 | - libprotobuf=3.15.8=h780b84a_1 50 | - libstdcxx-ng=11.2.0=h1234567_1 51 | - libtiff=4.2.0=hf544144_3 52 | - libwebp-base=1.2.2=h7f98852_1 53 | - lz4-c=1.9.3=h9c3ff4c_1 54 | - markdown=3.4.1=pyhd8ed1ab_0 55 | - markupsafe=2.1.1=py39hb9d737c_1 56 | - mkl=2021.4.0=h06a4308_640 57 | - mkl-service=2.4.0=py39h7e14d7c_0 58 | - mkl_fft=1.3.1=py39h0c7bc48_1 59 | - mkl_random=1.2.2=py39hde0f152_0 60 | - multidict=6.0.2=py39hb9d737c_1 61 | - ncurses=6.3=h5eee18b_3 62 | - nettle=3.6=he412f7d_0 63 | - numpy=1.23.1=py39h6c91a56_0 64 | - numpy-base=1.23.1=py39ha15fc14_0 65 | - oauthlib=3.2.1=pyhd8ed1ab_0 66 | - olefile=0.46=pyh9f0ad1d_1 67 | - openh264=2.1.1=h780b84a_0 68 | - openjpeg=2.4.0=hb52868f_1 69 | - openssl=1.1.1q=h7f8727e_0 70 | - packaging=21.3=pyhd8ed1ab_0 71 | - pillow=8.2.0=py39hf95b381_1 72 | - pip=22.1.2=py39h06a4308_0 73 | - pyasn1=0.4.8=py_0 74 | - pyasn1-modules=0.2.7=py_0 75 | - pycparser=2.21=pyhd8ed1ab_0 76 | - pydeprecate=0.3.2=pyhd8ed1ab_0 77 | - pyjwt=2.5.0=pyhd8ed1ab_0 78 | - pyopenssl=22.0.0=pyhd8ed1ab_1 79 | - pyparsing=3.0.9=pyhd8ed1ab_0 80 | - pysocks=1.7.1=pyha2e5f31_6 81 | - python=3.9.13=haa1d7c7_1 82 | - python_abi=3.9=2_cp39 83 | - pytorch=1.12.1=py3.9_cuda11.6_cudnn8.3.2_0 84 | - pytorch-lightning=1.7.7=pyhd8ed1ab_0 85 | - pytorch-mutex=1.0=cuda 86 | - pyu2f=0.1.5=pyhd8ed1ab_0 87 | - pyyaml=6.0=py39hb9d737c_4 88 | - readline=8.1.2=h7f8727e_1 89 | - requests=2.28.1=pyhd8ed1ab_1 90 | - requests-oauthlib=1.3.1=pyhd8ed1ab_0 91 | - rsa=4.9=pyhd8ed1ab_0 92 | - setuptools=63.4.1=py39h06a4308_0 93 | - six=1.16.0=pyh6c4a22f_0 94 | - sqlite=3.39.2=h5082296_0 95 | - tensorboard=2.10.1=pyhd8ed1ab_0 96 | - tensorboard-data-server=0.6.0=py39hd97740a_2 97 | - tensorboard-plugin-wit=1.8.1=pyhd8ed1ab_0 98 | - tk=8.6.12=h1ccaba5_0 99 | - torchaudio=0.12.1=py39_cu116 100 | - torchmetrics=0.9.3=pyhd8ed1ab_0 101 | - torchvision=0.13.1=py39_cu116 102 | - tqdm=4.64.1=pyhd8ed1ab_0 103 | - typing-extensions=4.3.0=hd8ed1ab_0 104 | - 
typing_extensions=4.3.0=pyha770c72_0 105 | - tzdata=2022c=h04d1e81_0 106 | - urllib3=1.26.11=pyhd8ed1ab_0 107 | - werkzeug=2.2.2=pyhd8ed1ab_0 108 | - wheel=0.37.1=pyhd3eb1b0_0 109 | - xz=5.2.6=h5eee18b_0 110 | - yaml=0.2.5=h7f98852_2 111 | - yarl=1.7.2=py39hb9d737c_2 112 | - zipp=3.8.1=pyhd8ed1ab_0 113 | - zlib=1.2.12=h5eee18b_3 114 | - zstd=1.5.0=ha95c52a_0 115 | - pip: 116 | - adal==1.2.7 117 | - alembic==1.8.1 118 | - antlr4-python3-runtime==4.9.3 119 | - argcomplete==2.0.0 120 | - azure-common==1.1.28 121 | - azure-core==1.25.1 122 | - azure-graphrbac==0.61.1 123 | - azure-identity==1.11.0 124 | - azure-mgmt-authorization==2.0.0 125 | - azure-mgmt-containerregistry==10.0.0 126 | - azure-mgmt-core==1.3.2 127 | - azure-mgmt-keyvault==10.1.0 128 | - azure-mgmt-resource==21.1.0 129 | - azure-mgmt-storage==20.0.0 130 | - azure-storage-blob==12.13.0 131 | - azureml-core==1.45.0.post2 132 | - azureml-mlflow==1.45.0 133 | - backports-tempfile==1.0 134 | - backports-weakref==1.0.post1 135 | - bcrypt==4.0.0 136 | - black==22.8.0 137 | - cfgv==3.3.1 138 | - cloudpickle==2.2.0 139 | - commonmark==0.9.1 140 | - contextlib2==21.6.0 141 | - contourpy==1.0.5 142 | - databricks-cli==0.17.3 143 | - distlib==0.3.6 144 | - docker==5.0.3 145 | - docstring-parser==0.15 146 | - filelock==3.8.0 147 | - flake8==5.0.4 148 | - flask==2.2.2 149 | - fonttools==4.37.3 150 | - gcsfs==2022.8.2 151 | - gitdb==4.0.9 152 | - gitpython==3.1.27 153 | - google-api-core==2.10.1 154 | - google-cloud-core==2.3.2 155 | - google-cloud-storage==2.5.0 156 | - google-crc32c==1.5.0 157 | - google-resumable-media==2.3.3 158 | - googleapis-common-protos==1.56.4 159 | - greenlet==1.1.3 160 | - gunicorn==20.1.0 161 | - humanfriendly==10.0 162 | - hydra-core==1.2.0 163 | - identify==2.5.5 164 | - iniconfig==1.1.1 165 | - isodate==0.6.1 166 | - isort==5.10.1 167 | - itsdangerous==2.1.2 168 | - jeepney==0.8.0 169 | - jmespath==1.0.1 170 | - jsonargparse==4.15.0 171 | - jsonpickle==2.2.0 172 | - kiwisolver==1.4.4 173 | - knack==0.9.0 174 | - mako==1.2.3 175 | - matplotlib==3.6.0 176 | - mccabe==0.7.0 177 | - mlflow==1.29.0 178 | - mlflow-skinny==1.29.0 179 | - msal==1.19.0 180 | - msal-extensions==1.0.0 181 | - msrest==0.7.1 182 | - msrestazure==0.6.4 183 | - mypy-extensions==0.4.3 184 | - ndg-httpsclient==0.5.1 185 | - nodeenv==1.7.0 186 | - omegaconf==2.2.3 187 | - opencv-python==4.6.0.66 188 | - pandas==1.5.0 189 | - paramiko==2.11.0 190 | - pathspec==0.10.1 191 | - pkginfo==1.8.3 192 | - platformdirs==2.5.2 193 | - pluggy==1.0.0 194 | - portalocker==2.5.1 195 | - pre-commit==2.20.0 196 | - prometheus-flask-exporter==0.20.3 197 | - protobuf==3.20.1 198 | - psutil==5.9.2 199 | - py==1.11.0 200 | - pycodestyle==2.9.1 201 | - pyflakes==2.5.0 202 | - pynacl==1.5.0 203 | - pytest==7.1.3 204 | - pytz==2022.2.1 205 | - querystring-parser==1.2.4 206 | - rich==12.5.1 207 | - scipy==1.9.1 208 | - secretstorage==3.3.3 209 | - sh==1.14.3 210 | - smmap==5.0.0 211 | - sqlalchemy==1.4.41 212 | - sqlparse==0.4.3 213 | - tabulate==0.8.10 214 | - toml==0.10.2 215 | - tomli==2.0.1 216 | - torch-tb-profiler==0.4.0 217 | - torchtext==0.13.1 218 | - types-cryptography==3.3.23 219 | - virtualenv==20.16.5 220 | - websocket-client==1.4.1 221 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = asmr_ml_template 3 | version = 0.0.1 4 | author = Example Author 5 | author_email = author@example.com 6 | description = A 
small example package 7 | long_description = file: README.md 8 | long_description_content_type = text/markdown 9 | url = https://github.com/pypa/sampleproject 10 | project_urls = 11 | Bug Tracker = https://github.com/pypa/sampleproject/issues 12 | classifiers = 13 | Programming Language :: Python :: 3 14 | License :: OSI Approved :: MIT License 15 | Operating System :: OS Independent 16 | 17 | [options] 18 | package_dir = 19 | = . 20 | packages = find: 21 | python_requires = >=3.6 22 | 23 | [options.packages.find] 24 | where = . 25 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup() 4 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/auto-sys-ml-template/8884d8f3b826cab300cd8f0e227fd7ad6b887da8/src/__init__.py -------------------------------------------------------------------------------- /src/datamodules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/auto-sys-ml-template/8884d8f3b826cab300cd8f0e227fd7ad6b887da8/src/datamodules/__init__.py -------------------------------------------------------------------------------- /src/datamodules/mnist_datamodule.py: -------------------------------------------------------------------------------- 1 | # credits: https://github.com/ashleve/lightning-hydra-template/tree/main/src/datamodules 2 | from typing import Dict, Optional, Tuple, Union 3 | 4 | import torch 5 | from pytorch_lightning import LightningDataModule 6 | from torch.utils.data import ConcatDataset, DataLoader, Dataset, random_split 7 | from torchvision.datasets import MNIST 8 | from torchvision.transforms import transforms 9 | 10 | 11 | class MNISTDataModule(LightningDataModule): 12 | """Example of a LightningDataModule for the MNIST dataset. 13 | 14 | A DataModule implements 5 key methods: 15 | - prepare_data (things to do on 1 GPU/TPU, not on every GPU/TPU in distributed mode) 16 | - setup (things to do on every accelerator in distributed mode) 17 | - train_dataloader (the training dataloader) 18 | - val_dataloader (the validation dataloader(s)) 19 | - test_dataloader (the test dataloader(s)) 20 | 21 | This allows you to share a full dataset without explaining how to download, 22 | split, transform and process the data. 23 | 24 | Read the docs: 25 | https://pytorch-lightning.readthedocs.io/en/latest/extensions/datamodules.html 26 | """ 27 | 28 | def __init__( 29 | self, file_params: Dict[str, Union[str, Tuple[int, int, int]]], train_params: Dict[str, Union[int, bool]] 30 | ): 31 | super().__init__() 32 | 33 | # this line allows access to init params via the 'self.hparams' attribute 34 | self.save_hyperparameters(logger=False) 35 | 36 | # data transformations 37 | self.transforms = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]) 38 | 39 | self.data_train: Optional[Dataset] = None 40 | self.data_val: Optional[Dataset] = None 41 | self.data_test: Optional[Dataset] = None 42 | 43 | @property 44 | def num_classes(self) -> int: 45 | return 10 46 | 47 | def prepare_data(self): 48 | """Download data if needed. 49 | 50 | This method is called only from a single GPU. 51 | Do not use it to assign state (self.x = y). 
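Downloads belong here; per-process state such as the split datasets belongs in `setup` below.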
52 | """ 53 | MNIST(self.hparams.file_params["base_dir"], train=True, download=True) 54 | MNIST(self.hparams.file_params["base_dir"], train=False, download=True) 55 | 56 | def setup(self, stage: Optional[str] = None): 57 | """Load data. Set variables: `self.data_train`, `self.data_val`, `self.data_test`. 58 | 59 | This method is called by lightning when doing `trainer.fit()` and `trainer.test()`, 60 | so be careful not to execute the random split twice! The `stage` can be used to 61 | differentiate whether it's called before trainer.fit()` or `trainer.test()`. 62 | """ 63 | 64 | # load datasets only if they're not loaded already 65 | if not self.data_train and not self.data_val and not self.data_test: 66 | trainset = MNIST(self.hparams.file_params["base_dir"], train=True, transform=self.transforms) 67 | testset = MNIST(self.hparams.file_params["base_dir"], train=False, transform=self.transforms) 68 | dataset = ConcatDataset(datasets=[trainset, testset]) 69 | self.data_train, self.data_val, self.data_test = random_split( 70 | dataset=dataset, 71 | lengths=self.hparams.file_params["train_val_test_split"], 72 | generator=torch.Generator().manual_seed(42), 73 | ) 74 | 75 | def train_dataloader(self): 76 | return DataLoader( 77 | dataset=self.data_train, 78 | batch_size=self.hparams.train_params["batch_size"], 79 | num_workers=self.hparams.train_params["num_workers"], 80 | pin_memory=self.hparams.train_params["pin_memory"], 81 | shuffle=True, 82 | ) 83 | 84 | def val_dataloader(self): 85 | return DataLoader( 86 | dataset=self.data_val, 87 | batch_size=self.hparams.train_params["batch_size"], 88 | num_workers=self.hparams.train_params["num_workers"], 89 | pin_memory=self.hparams.train_params["pin_memory"], 90 | shuffle=False, 91 | ) 92 | 93 | def test_dataloader(self): 94 | return DataLoader( 95 | dataset=self.data_test, 96 | batch_size=self.hparams.train_params["batch_size"], 97 | num_workers=self.hparams.train_params["num_workers"], 98 | pin_memory=self.hparams.train_params["pin_memory"], 99 | shuffle=False, 100 | ) 101 | -------------------------------------------------------------------------------- /src/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/auto-sys-ml-template/8884d8f3b826cab300cd8f0e227fd7ad6b887da8/src/models/__init__.py -------------------------------------------------------------------------------- /src/models/components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/auto-sys-ml-template/8884d8f3b826cab300cd8f0e227fd7ad6b887da8/src/models/components/__init__.py -------------------------------------------------------------------------------- /src/models/components/simple_dense_net.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | 4 | class SimpleDenseNet(nn.Module): 5 | def __init__( 6 | self, 7 | input_size: int = 784, 8 | lin1_size: int = 256, 9 | lin2_size: int = 256, 10 | lin3_size: int = 256, 11 | output_size: int = 10, 12 | ): 13 | super().__init__() 14 | 15 | self.model = nn.Sequential( 16 | nn.Linear(input_size, lin1_size), 17 | nn.BatchNorm1d(lin1_size), 18 | nn.ReLU(), 19 | nn.Linear(lin1_size, lin2_size), 20 | nn.BatchNorm1d(lin2_size), 21 | nn.ReLU(), 22 | nn.Linear(lin2_size, lin3_size), 23 | nn.BatchNorm1d(lin3_size), 24 | nn.ReLU(), 25 | nn.Linear(lin3_size, output_size), 26 | ) 27 | 28 | def forward(self, x): 
29 | batch_size, channels, width, height = x.size() 30 | 31 | # (batch, 1, width, height) -> (batch, 1*width*height) 32 | x = x.view(batch_size, -1) 33 | 34 | return self.model(x) 35 | -------------------------------------------------------------------------------- /src/models/mnist_module.py: -------------------------------------------------------------------------------- 1 | # credits: https://github.com/ashleve/lightning-hydra-template/blob/main/src/models/mnist_module.py 2 | from typing import Any, Dict, List, Union 3 | 4 | import torch 5 | from pytorch_lightning import LightningModule 6 | from torchmetrics import MaxMetric 7 | from torchmetrics.classification.accuracy import Accuracy 8 | 9 | from src.models.components.simple_dense_net import SimpleDenseNet 10 | 11 | 12 | class MNISTLitModule(LightningModule): 13 | """Example of a LightningModule for MNIST classification. 14 | 15 | A LightningModule organizes your PyTorch code into 5 sections: 16 | - Computations (init) 17 | - Train loop (training_step) 18 | - Validation loop (validation_step) 19 | - Test loop (test_step) 20 | - Optimizers (configure_optimizers) 21 | 22 | Read the docs: 23 | https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html 24 | """ 25 | 26 | def __init__( 27 | self, 28 | mlp_config: Dict[str, dict], 29 | optimizer_config: Dict[str, Union[float, str, list]], 30 | ): 31 | super().__init__() 32 | 33 | # this line allows access to init params via the 'self.hparams' attribute 34 | # it also ensures init params will be stored in the ckpt 35 | self.save_hyperparameters("mlp_config", "optimizer_config") 36 | 37 | self.net = SimpleDenseNet(**mlp_config) 38 | 39 | # loss function 40 | self.criterion = torch.nn.CrossEntropyLoss() 41 | 42 | # use a separate metric instance for the train, val and test steps 43 | # to ensure a proper reduction over the epoch 44 | self.train_acc = Accuracy() 45 | self.val_acc = Accuracy() 46 | self.test_acc = Accuracy() 47 | 48 | # for logging the best-so-far validation accuracy 49 | self.val_acc_best = MaxMetric() 50 | 51 | def forward(self, x: torch.Tensor): 52 | return self.net(x) 53 | 54 | def on_train_start(self): 55 | # by default, Lightning executes validation-step sanity checks before training starts, 56 | # so we need to make sure val_acc_best doesn't store accuracy from these checks 57 | self.val_acc_best.reset() 58 | 59 | def step(self, batch: Any): 60 | x, y = batch 61 | logits = self.forward(x) 62 | loss = self.criterion(logits, y) 63 | preds = torch.argmax(logits, dim=1) 64 | return loss, preds, y 65 | 66 | def training_step(self, batch: Any, batch_idx: int): 67 | loss, preds, targets = self.step(batch) 68 | 69 | # log train metrics 70 | acc = self.train_acc(preds, targets) 71 | self.log("train/loss", loss, on_step=False, on_epoch=True, prog_bar=False) 72 | self.log("train/acc", acc, on_step=False, on_epoch=True, prog_bar=True) 73 | 74 | # we can return a dict with any tensors here 75 | # and then read it in some callback or in `training_epoch_end()` below 76 | # remember to always return the loss from `training_step()`, or else backpropagation will fail! 
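# (for example, a callback could concatenate `preds` and `targets` across batches to build a confusion matrix)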
77 | return {"loss": loss, "preds": preds, "targets": targets} 78 | 79 | def training_epoch_end(self, outputs: List[Any]): 80 | # `outputs` is a list of dicts returned from `training_step()` 81 | self.train_acc.reset() 82 | 83 | def validation_step(self, batch: Any, batch_idx: int): 84 | loss, preds, targets = self.step(batch) 85 | 86 | # log val metrics 87 | acc = self.val_acc(preds, targets) 88 | self.log("val/loss", loss, on_step=False, on_epoch=True, prog_bar=False) 89 | self.log("val/acc", acc, on_step=False, on_epoch=True, prog_bar=True) 90 | 91 | return {"loss": loss, "preds": preds, "targets": targets} 92 | 93 | def validation_epoch_end(self, outputs: List[Any]): 94 | acc = self.val_acc.compute() # get val accuracy from current epoch 95 | self.val_acc_best.update(acc) 96 | self.log("val/acc_best", self.val_acc_best.compute(), on_epoch=True, prog_bar=True) 97 | 98 | self.val_acc.reset() # reset val accuracy for next epoch 99 | 100 | def test_step(self, batch: Any, batch_idx: int): 101 | loss, preds, targets = self.step(batch) 102 | 103 | # log test metrics 104 | acc = self.test_acc(preds, targets) 105 | self.log("test/loss", loss, on_step=False, on_epoch=True) 106 | self.log("test/acc", acc, on_step=False, on_epoch=True) 107 | 108 | return {"loss": loss, "preds": preds, "targets": targets} 109 | 110 | def test_epoch_end(self, outputs: List[Any]): 111 | self.test_acc.reset() 112 | 113 | def configure_optimizers(self): 114 | """Choose what optimizers and learning-rate schedulers to use in your optimization. 115 | Normally you'd need one. But in the case of GANs or similar you might have multiple. 116 | 117 | See examples here: https://pytorch- 118 | lightning.readthedocs.io/en/latest/common/lightning_module.html#configure-optimizers 119 | """ 120 | return torch.optim.Adam( 121 | params=self.parameters(), 122 | lr=self.hparams.optimizer_config["lr"], 123 | weight_decay=self.hparams.optimizer_config["weight_decay"], 124 | ) 125 | -------------------------------------------------------------------------------- /src/train.py: -------------------------------------------------------------------------------- 1 | from omegaconf import OmegaConf 2 | 3 | from utils import pl_utils 4 | 5 | 6 | def main(cfg): 7 | trainer = pl_utils.instantiate_trainer(cfg) 8 | model = pl_utils.instantiate_class(cfg["model"]) 9 | datamodule = pl_utils.instantiate_class(cfg["data"]) 10 | 11 | trainer.fit(model, datamodule) 12 | 13 | 14 | if __name__ == "__main__": 15 | cfg = OmegaConf.from_cli() 16 | 17 | if "base" in cfg: 18 | basecfg = OmegaConf.load(cfg.base) 19 | del cfg.base 20 | cfg = OmegaConf.merge(basecfg, cfg) 21 | cfg = OmegaConf.to_container(cfg, resolve=True) 22 | print(OmegaConf.to_yaml(cfg)) 23 | main(cfg) 24 | else: 25 | raise SystemExit("Base configuration file not specified! Exiting.") 26 | -------------------------------------------------------------------------------- /src/utils/pl_utils.py: -------------------------------------------------------------------------------- 1 | from importlib import import_module 2 | from typing import Any, Dict, List 3 | 4 | import pytorch_lightning as pl 5 | from pytorch_lightning import Callback, Trainer 6 | from pytorch_lightning.loggers import LightningLoggerBase 7 | 8 | 9 | def instantiate_class(init: Dict[str, Any]) -> Any: 10 | """Instantiates a class with the given args and init. 11 | 12 | Args: 13 | todo 14 | 15 | Returns: 16 | The instantiated class object. 
17 | """ 18 | kwargs = {k: init[k] for k in set(list(init.keys())) - {"_target_"}} 19 | 20 | class_module, class_name = init["_target_"].rsplit(".", 1) 21 | module = import_module(class_module, package=class_name) 22 | args_class = getattr(module, class_name) 23 | return args_class(**kwargs) 24 | 25 | 26 | def instantiate_callbacks(callbacks_cfg: dict) -> List[Callback]: 27 | """Instantiates callbacks from config.""" 28 | callbacks: List[Callback] = [] 29 | 30 | if not callbacks_cfg: 31 | return callbacks 32 | 33 | if not isinstance(callbacks_cfg, dict): 34 | raise TypeError("Callbacks config must be a DictConfig!") 35 | 36 | for _, cb_conf in callbacks_cfg.items(): 37 | if isinstance(cb_conf, dict) and "_target_" in cb_conf: 38 | callbacks.append(instantiate_class(cb_conf)) 39 | 40 | return callbacks 41 | 42 | 43 | def instantiate_loggers(logger_cfg: dict) -> List[LightningLoggerBase]: 44 | """Instantiates loggers from config.""" 45 | logger: List[LightningLoggerBase] = [] 46 | 47 | if not logger_cfg: 48 | return logger 49 | 50 | if not isinstance(logger_cfg, dict): 51 | raise TypeError("Logger config must be a Dict!") 52 | 53 | for _, lg_conf in logger_cfg.items(): 54 | if isinstance(lg_conf, dict) and "_target_" in lg_conf: 55 | logger.append(instantiate_class(lg_conf)) 56 | 57 | return logger 58 | 59 | 60 | def instantiate_trainer(cfg: dict): 61 | if cfg.get("seed", None): 62 | pl.seed_everything(cfg["seed"], workers=True) 63 | 64 | callbacks: List[Callback] = instantiate_callbacks(cfg.get("callbacks")) 65 | logger: List[LightningLoggerBase] = instantiate_loggers(cfg.get("logger")) 66 | trainer: Trainer = Trainer(**cfg["trainer"], callbacks=callbacks, logger=logger) 67 | 68 | return trainer 69 | -------------------------------------------------------------------------------- /src/utils/system_monitor.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import subprocess as sp 4 | import time 5 | 6 | import mlflow 7 | import psutil 8 | 9 | 10 | def get_gpu_mem_info(): 11 | output_to_list = lambda x: x.decode("ascii").split("\n")[:-1] 12 | COMMAND = "nvidia-smi --query-gpu=index,utilization.gpu,memory.used,memory.total --format=csv" 13 | 14 | try: 15 | memory_use_info = output_to_list(sp.check_output(COMMAND.split(), stderr=sp.STDOUT))[1:] 16 | except sp.CalledProcessError as e: 17 | raise RuntimeError(f"command '{e.cmd}' return with error (code {e.returncode}): {e.output}") 18 | return memory_use_info 19 | 20 | 21 | def get_dist_info(): 22 | dist_info_config = { 23 | "node_rank": "NODE_RANK", 24 | "local_rank": "LOCAL_RANK", 25 | "world_rank": "RANK", 26 | "world_size": "WORLD_SIZE", 27 | } 28 | 29 | dist_info = {key: os.environ.get(value) for key, value in dist_info_config.items()} 30 | 31 | # Single GPU job 32 | if dist_info["world_size"] is None: 33 | dist_info["node_rank"] = 0 34 | dist_info["world_rank"] = 0 35 | dist_info["local_rank"] = 0 36 | dist_info["world_size"] = 1 37 | 38 | dist_info = {key: int(value) for (key, value) in dist_info.items()} 39 | return dist_info 40 | 41 | 42 | def main(args): 43 | # run on each node but only on one process corresponding to first gpu 44 | dist_info = get_dist_info() 45 | if not (dist_info["local_rank"] == 0): 46 | return 47 | 48 | node_rank = dist_info["node_rank"] + 1 # one index for display 49 | 50 | metrics = {} 51 | metrics = { 52 | f"monitor/node_{node_rank:02}/ram_usage_percent": 0.0, 53 | f"monitor/node_{node_rank:02}/ram_usage_GB": 0.0, 54 | 
f"monitor/node_{node_rank:02}/cpu_usage_percent": 0.0, 55 | f"monitor/node_{node_rank:02}/swap": 0.0, 56 | } 57 | 58 | memory_use_info_list = get_gpu_mem_info() 59 | idx, gpu_percent, gpu_mem_used, gpu_mem_total = memory_use_info_list[0].split(",") 60 | gpu_mem_total = float(gpu_mem_total.split("MiB")[0]) 61 | 62 | for gpu_idx, gpu_info in enumerate(memory_use_info_list, 1): 63 | metrics[f"monitor/node_{node_rank:02}/gpu_{gpu_idx:02}/usage_percent"] = 0.0 64 | metrics[f"monitor/node_{node_rank:02}/gpu_{gpu_idx:02}/mem_used_GB"] = 0.0 65 | metrics[f"monitor/node_{node_rank:02}/gpu_{gpu_idx:02}/mem_used_percent"] = 0.0 66 | 67 | now = 0 68 | dt_sleep = args.watch_every_n_seconds 69 | 70 | while True: 71 | metrics[f"monitor/node_{node_rank:02}/ram_usage_GB"] = psutil.virtual_memory().used / 2**30 72 | metrics[f"monitor/node_{node_rank:02}/ram_usage_percent"] = psutil.virtual_memory().percent 73 | metrics[f"monitor/node_{node_rank:02}/cpu_usage_percent"] = psutil.cpu_percent() 74 | metrics[f"monitor/node_{node_rank:02}/swap"] = psutil.swap_memory().percent 75 | 76 | memory_use_info_list = get_gpu_mem_info() 77 | for gpu_idx, gpu_info in enumerate(memory_use_info_list, 1): 78 | _, gpu_percent, gpu_mem_used, _ = gpu_info.split(",") 79 | gpu_percent = float(gpu_percent.split("%")[0]) 80 | gpu_mem_used = float(gpu_mem_used.split("MiB")[0]) 81 | gpu_mem_percent = gpu_mem_used / gpu_mem_total * 100.0 82 | gpu_mem_used /= 1024.0 83 | 84 | metrics[f"monitor/node_{node_rank:02}/gpu_{gpu_idx:02}/usage_percent"] = gpu_percent 85 | metrics[f"monitor/node_{node_rank:02}/gpu_{gpu_idx:02}/mem_used_GB"] = gpu_mem_used 86 | metrics[f"monitor/node_{node_rank:02}/gpu_{gpu_idx:02}/mem_used_percent"] = gpu_mem_percent 87 | 88 | # for key, value in metrics.items(): 89 | # print(f"{key}: {value}") 90 | mlflow.log_metrics(metrics, step=now) 91 | 92 | time.sleep(dt_sleep) 93 | now += dt_sleep 94 | 95 | 96 | def get_parsed_args(): 97 | parser = argparse.ArgumentParser() 98 | parser.add_argument("--watch_every_n_seconds", type=int, default=5) 99 | args = parser.parse_args() 100 | return args 101 | 102 | 103 | if __name__ == "__main__": 104 | print("system_monitor.py begins") 105 | args = get_parsed_args() 106 | main(args) 107 | print("system_monitor.py done") 108 | -------------------------------------------------------------------------------- /tests/helpers.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import pytest 4 | import sh 5 | 6 | 7 | def run_command(command: List[str]): 8 | """Default method for executing shell commands with pytest.""" 9 | msg = None 10 | try: 11 | sh.python(command) 12 | except sh.ErrorReturnCode as e: 13 | msg = e.stderr.decode() 14 | if msg: 15 | pytest.fail(msg=msg) 16 | -------------------------------------------------------------------------------- /tests/test_dev_fast_run.py: -------------------------------------------------------------------------------- 1 | import pytest # nopycln: import 2 | from helpers import run_command 3 | 4 | 5 | def test_fast_dev_run(): 6 | """Test running for 1 train, val and test batch.""" 7 | command = [ 8 | "src/train.py", 9 | "base=configs/train.yaml", 10 | "trainer.fast_dev_run=true", 11 | ] 12 | run_command(command) 13 | 14 | 15 | # cpu only test for CI 16 | def test_fast_dev_run_cpu(): 17 | """Test running for 1 train, val and test batch.""" 18 | command = [ 19 | "src/train.py", 20 | "base=configs/train.yaml", 21 | "trainer.fast_dev_run=true", 22 | "trainer.accelerator=cpu", 23 | 
"trainer.sync_batchnorm=false", 24 | ] 25 | run_command(command) 26 | --------------------------------------------------------------------------------