├── .devcontainer
└── devcontainer.json
├── .github
└── workflows
│ ├── pre-commit.yml
│ └── template-cleanup.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .vscode
├── extensions.json
├── launch.json
└── settings.json
├── CODE_OF_CONDUCT.md
├── License
├── README.md
├── README_template.md
├── SECURITY.md
├── assets
└── images
│ └── table-of-contents.png
├── azure-pipelines.yml
├── configs
└── train.yaml
├── docker
├── Dockerfile_base_azureml
├── Dockerfile_base_azureml_cu116
├── Dockerfile_base_azureml_nightly
├── Dockerfile_base_nvidia
├── environment.yml
└── environment_cu116.yml
├── setup.cfg
├── setup.py
├── src
├── __init__.py
├── datamodules
│ ├── __init__.py
│ └── mnist_datamodule.py
├── models
│ ├── __init__.py
│ ├── components
│ │ ├── __init__.py
│ │ └── simple_dense_net.py
│ └── mnist_module.py
├── train.py
└── utils
│ ├── pl_utils.py
│ └── system_monitor.py
└── tests
├── helpers.py
└── test_dev_fast_run.py
/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
1 | // Remote devcontainer file that describes what container to build, how to build it,
2 | // and the extensions VS Code needs to enable the best remote development experience
3 | {
4 | "name": "devcontainer",
5 | "build": {
6 | "context": "../docker",
7 | // Uncomment the base dockerfile of your choice
8 | "dockerfile": "../docker/Dockerfile_base_azureml",
9 | // "dockerfile": "../docker/Dockerfile_base_nvidia",
10 | "args": {
11 | // Edit docker build args here as appropriate
12 | // find latest BASE_IMAGE for Dockerfile_base_conda at https://github.com/Azure/AzureML-Containers/tree/master/base/gpu
13 | "BASE_IMAGE": "openmpi4.1.0-cuda11.3-cudnn8-ubuntu20.04:latest"
14 | // find latest BASE_IMAGE for Dockerfile_base_nvidia at https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch/tags
15 | // Uncomment the following for nvidia base image
16 | // "BASE_IMAGE": "22.06-py3"
17 | },
18 | },
19 | // Configure tool-specific properties.
20 | "customizations": {
21 | // Configure properties specific to VS Code.
22 | "vscode": {
23 | // Add the IDs of extensions you want installed when the container is created.
24 | "extensions": [
25 | "eamodio.gitlens",
26 | "ms-python.python",
27 | "ms-python.vscode-pylance",
28 | "ms-azuretools.vscode-docker",
29 | "ms-vscode-remote.remote-containers",
30 | "ms-vscode-remote.remote-ssh",
31 | "ms-vscode-remote.remote-ssh-edit",
32 | "ms-vscode-remote.remote-wsl",
33 | "ms-vscode-remote.vscode-remote-extensionpack",
34 | "redhat.vscode-yaml",
35 | "yzhang.markdown-all-in-one",
36 | "TrungNgo.autoflake",
37 | "Shan.code-settings-sync",
38 | "njpwerner.autodocstring",
39 | "jbockle.jbockle-format-files"
40 | ]
41 | }
42 | },
43 | // Docker run args
44 | "runArgs": [
45 | // Run with GPU support
46 | "--privileged",
47 | "--gpus",
48 | "all",
49 | // Uncomment the next line if you will be using a ptrace-based debugger like C++, Go, and Rust.
50 | "--cap-add=SYS_PTRACE",
51 | "--security-opt",
52 | "seccomp=unconfined",
53 | // Use Docker from inside the container. See https://aka.ms/vscode-remote/samples/docker-in-docker for details.
54 | "-v",
55 | "/var/run/docker.sock:/var/run/docker.sock"
56 | ],
57 | // Run the following command after the container has started and workspace mounted
58 | "postStartCommand": "conda env config vars set -n base PYTHONPATH=${containerWorkspaceFolder} && git config --global --add safe.directory ${containerWorkspaceFolder}"
59 | // Use 'forwardPorts' to make a list of ports inside the container available locally.
60 | // "forwardPorts": [],
61 | // Use 'postCreateCommand' to run commands after the container is created.
62 | // "postCreateCommand": "python --version"
63 | }
64 |
--------------------------------------------------------------------------------
/.github/workflows/pre-commit.yml:
--------------------------------------------------------------------------------
1 | # ref: https://github.com/pre-commit-ci-demo/demo/blob/main/.github/workflows/pre-commit.yml
2 | name: pre-commit
3 |
4 | on:
5 | push:
6 | branches: [main]
7 | pull_request:
8 | branches: [main]
9 |
10 | jobs:
11 | pre-commit:
12 | runs-on: ubuntu-latest
13 | steps:
14 | - uses: actions/checkout@v2
15 | - uses: actions/setup-python@v2
16 | - uses: pre-commit/action@v2.0.0
17 |
--------------------------------------------------------------------------------
/.github/workflows/template-cleanup.yml:
--------------------------------------------------------------------------------
1 | # credits: https://github.com/JetBrains/intellij-platform-plugin-template/blob/main/.github/workflows/template-cleanup.yml
2 |
3 | # GitHub Actions Workflow responsible for cleaning up the Autonomous Research Systems' ml_template repository from
4 | # the template-specific files and configurations. This workflow is supposed to be triggered automatically
5 | # when a new template-based repository has been created.
6 |
7 | name: Template Cleanup
8 | on:
9 | push:
10 | branches:
11 | - main
12 |
13 | jobs:
14 | # Run cleaning process only if workflow is triggered by a repository generated from the template (not the template repository itself).
15 | template-cleanup:
16 | name: Template Cleanup
17 | runs-on: ubuntu-latest
18 | if: github.event.repository.name != 'auto-sys-ml-template'
19 | steps:
20 | # Check out current repository
21 | - name: Fetch Sources
22 | uses: actions/checkout@v2.4.0
23 |
24 | # Cleanup project
25 | - name: Cleanup
26 | run: |
27 | rm -r assets/
28 | mv README_template.md README.md
29 | rm .github/workflows/template-cleanup.yml
30 |
31 | # Commit modified files
32 | - name: Commit files
33 | run: |
34 | git config --local user.email "ratneshmadaan@gmail.com"
35 | git config --local user.name "madratman"
36 | git add .
37 | git commit -m "bla"
38 | git reset $(git commit-tree HEAD^{tree} -m "microsoft/AutonomousSystemsResearchGroup: init ml template repo")
39 |
40 | # Push changes
41 | - name: Push changes
42 | uses: ad-m/github-push-action@master
43 | with:
44 | branch: main
45 | github_token: ${{ secrets.GITHUB_TOKEN }}
46 | force: true
47 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # amulet
2 | .amltconfig
3 | amlt/
4 |
5 | # Byte-compiled / optimized / DLL files
6 | __pycache__/
7 | *.py[cod]
8 | *$py.class
9 |
10 | # Distribution / packaging
11 | build/
12 | dist/
13 | *.egg-info/
14 |
15 | # data
16 | data/
17 |
18 | # logs
19 | logs/
20 | outputs/
21 |
22 | # env
23 | .env
24 | .autoenv
25 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | default_language_version:
2 | python: python3
3 |
4 | ci:
5 | autofix_prs: true
6 | autoupdate_commit_msg: "[pre-commit.ci] pre-commit suggestions"
7 | autoupdate_schedule: quarterly
8 |
9 | repos:
10 | - repo: https://github.com/pre-commit/pre-commit-hooks
11 | rev: v4.4.0
12 | hooks:
13 | # list of supported hooks: https://pre-commit.com/hooks.html
14 | - id: trailing-whitespace
15 | - id: end-of-file-fixer
16 | - id: check-yaml
17 | - id: check-case-conflict
18 | - id: debug-statements
19 | - id: detect-private-key
20 | - id: check-added-large-files
21 | args: ["--maxkb=500", "--enforce-all"]
22 | exclude: |
23 | (?x)^(
24 | )$
25 |
26 | - repo: https://github.com/asottile/pyupgrade
27 | rev: v3.3.1
28 | hooks:
29 | - id: pyupgrade
30 | args: [--py37-plus]
31 | name: Upgrade code
32 |
33 | # python formatting
34 | - repo: https://github.com/psf/black
35 | rev: 23.1.0
36 | hooks:
37 | - id: black
38 | name: Format code
39 | args: ["--line-length=120"]
40 |
41 | - repo: https://github.com/hadialqattan/pycln
42 | rev: v2.1.3 # Possible releases: https://github.com/hadialqattan/pycln/releases
43 | hooks:
44 | - id: pycln
45 | args: [--all]
46 |
47 | # ref: https://github.com/microsoft/vscode-isort]
48 | - repo: https://github.com/pycqa/isort
49 | rev: 5.12.0
50 | hooks:
51 | - id: isort
52 | name: isort (python)
53 | args: [--profile, "black"]
54 |
55 | # python docstring formatting
56 | - repo: https://github.com/myint/docformatter
57 | rev: v1.5.1
58 | hooks:
59 | - id: docformatter
60 | args: [--in-place, --wrap-summaries, "99", --wrap-descriptions, "92"]
61 |
62 | # yaml formatting
63 | - repo: https://github.com/pre-commit/mirrors-prettier
64 | rev: v3.0.0-alpha.6
65 | hooks:
66 | - id: prettier
67 | types: [yaml]
68 |
69 | # markdown formatting
70 | - repo: https://github.com/executablebooks/mdformat
71 | rev: 0.7.16
72 | hooks:
73 | - id: mdformat
74 | additional_dependencies:
75 | - mdformat-gfm
76 | #- mdformat-black
77 | - mdformat_frontmatter
78 | exclude: CHANGELOG.md
79 |
--------------------------------------------------------------------------------
/.vscode/extensions.json:
--------------------------------------------------------------------------------
1 | {
2 | // See https://go.microsoft.com/fwlink/?LinkId=827846 to learn about workspace recommendations.
3 | // Extension identifier format: ${publisher}.${name}. Example: vscode.csharp
4 | // List of extensions which should be recommended for users of this workspace.
5 | "recommendations": [
6 | "eamodio.gitlens",
7 | "ms-python.python",
8 | "ms-python.vscode-pylance",
9 | "ms-azuretools.vscode-docker",
10 | "ms-vscode-remote.remote-containers",
11 | "ms-vscode-remote.remote-ssh",
12 | "ms-vscode-remote.remote-ssh-edit",
13 | "ms-vscode-remote.remote-wsl",
14 | "ms-vscode-remote.vscode-remote-extensionpack",
15 | "redhat.vscode-yaml",
16 | "yzhang.markdown-all-in-one",
17 | "TrungNgo.autoflake",
18 | "Shan.code-settings-sync",
19 | "njpwerner.autodocstring",
20 | "jbockle.jbockle-format-files"
21 | ],
22 | // List of extensions recommended by VS Code that should not be recommended for users of this workspace.
23 | "unwantedRecommendations": []
24 | }
25 |
--------------------------------------------------------------------------------
/.vscode/launch.json:
--------------------------------------------------------------------------------
1 | {
2 | // Use IntelliSense to learn about possible attributes.
3 | // Hover to view descriptions of existing attributes.
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5 | "version": "0.2.0",
6 | "configurations": [
7 | {
8 | "name": "Python: Current File",
9 | "type": "python",
10 | "request": "launch",
11 | "program": "${file}",
12 | "console": "integratedTerminal",
13 | "justMyCode": true
14 | },
15 | {
16 | "name": "train.py",
17 | "type": "python",
18 | "request": "launch",
19 | "program": "src/train.py",
20 | "console": "integratedTerminal",
21 | "justMyCode": true,
22 | "args": [
23 | "base=configs/train.yaml",
24 | "trainer.num_nodes=1",
25 | "trainer.devices=1",
26 | "data.train_params.batch_size=256",
27 | "model.optimizer_config.lr=1e-3"
28 | ]
29 | }
30 | ]
31 | }
32 |
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "editor.defaultFormatter": "ms-python.black-formatter",
3 | "editor.formatOnPaste": true,
4 | "editor.formatOnSave": true,
5 | "editor.codeActionsOnSave": {
6 | "source.organizeImports": true
7 | },
8 | "python.analysis.typeCheckingMode": "basic",
9 | "python.formatting.provider": "black",
10 | "python.formatting.blackArgs": [
11 | "--line-length",
12 | "120"
13 | ],
14 | "python.linting.enabled": true,
15 | "python.linting.pylintEnabled": false,
16 | "python.linting.flake8Enabled": true,
17 | "python.linting.flake8Args": [
18 | "--max-line-length=120",
19 | ],
20 | "python.testing.pytestArgs": [
21 | "tests"
22 | ],
23 | "python.testing.unittestEnabled": false,
24 | "python.testing.pytestEnabled": true,
25 | "isort.args": [
26 | "--profile",
27 | "black"
28 | ],
29 | }
30 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Microsoft Open Source Code of Conduct
2 |
3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
4 |
5 | Resources:
6 |
7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
10 |
--------------------------------------------------------------------------------
/License:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) Microsoft Corporation.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Autonomous Systems Research Group: ML Template
2 |
3 | This document serves as an onboarding document as well as a template repository to quickstart machine learning experimentation at the [Autonomous Systems Research Group at Microsoft](https://www.microsoft.com/en-us/research/group/autonomous-systems-group-robotics/)
4 |
5 | **Note** Use the table of contents icon
on the top left corner of this document to get to a specific section quickly.
6 |
7 | ### Using this template to generate a repository:
8 |
9 | - Click on the green colored box titled **Use this template** top right, and name your new repository.
10 | - You can clone your repo when it looks like [example_repo_generated_from_ml_template](https://github.com/AutonomousSystemsResearch/example_repo_generated_from_ml_template).
11 |
12 | > **Note** that after you create the template, it will take about **20 seconds** for an automated github action to clean up the generated repository using an auto-commit. Please ensure your repository looks like [example_repo_generated_from_ml_template](https://github.com/AutonomousSystemsResearch/example_repo_generated_from_ml_template) before cloning it.
13 |
14 | ## Introduction
15 |
16 | For the template repository, we will use:
17 |
18 | - [Pytorch Lightning](https://pytorch-lightning.readthedocs.io/en/stable/)
19 | - For minimizing boilerplate code
20 | - [OmegaConf](https://omegaconf.readthedocs.io/)
21 | - Please go through [OmegaConf's github readme](https://github.com/omry/omegaconf#releases) for tutorials.
22 | - For config management
23 | > **Note**: we have an [archived branch called `hydra`](https://github.com/AutonomousSystemsResearch/ml_template/tree/hydra) which uses [hydra](https://hydra.cc/) for config management.
24 | - Logging
25 | - We primarily use tensorboard. Amulet automatically patches tensorboard scalars to MLFlow for viewing metrics in Azure ML Studio.
26 | - Conda and Docker
27 | - For development
28 |
29 | ## Using this repository
30 |
31 | ### **Running locally**
32 |
33 | #### Setup
34 |
35 | - **VSCode**
36 |
37 | - Extensions:
38 |
39 | - Hit `Ctrl+Shift+P` and type `Show Recommended Extensions` and install them from the sidebar.
40 | Or click "yes" when you get a VS Code pop up to install the recommended extensions, which are specified in [.vscode/extensions.json](.vscode/extensions.json).
41 | Follow [this doc](https://code.visualstudio.com/docs/editor/extension-marketplace#_recommended-extensions) for more details.
42 | - `Python`, `Pylance`, `Docker`, `GitLens`, `YAML`, and the [Remote development extension pack](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.vscode-remote-extensionpack) are strongly recommended.
43 |
44 | - Debugging:
45 |
46 | - Please follow [VSCode docs and tutorials](https://code.visualstudio.com/docs/python/debugging) on Python debugging
47 | - A minimal debugging configuration has been provided in [.vscode/launch.json](.vscode/launch.json). Please see VSCode docs on [launch.json configs](https://code.visualstudio.com/docs/python/debugging#_additional-configurations) and [config options](https://code.visualstudio.com/docs/python/debugging#_set-configuration-options).
48 |
49 | - **Conda**
50 |
51 | - Recommended for local development and debugging.
52 | - Note: For CUDA 11.6, see `Creating the conda environment from scratch (click to expand)` below.
53 |
54 | ```
55 | # create env
56 | conda env create --file docker/environment.yml
57 |
58 | # activate it
59 | conda activate ml_template
60 |
61 | # install this repo
62 | (ml_template) $ pip install -e .
63 |
64 | # install pre-commit (recommended). Scroll down to the #Developing section for details.
65 | (ml_template) $ pre-commit install
66 | ```
67 |
68 | > **Note** If you install additional packages in your environment manually, you should update the `environment.yml` correspondingly by doing a `$ conda env export | grep -v "^prefix: " > docker/environment.yml`.
69 |
70 |
71 |
72 | Creating the conda environment from scratch (click to expand)
73 |
74 |
75 | ```
76 | conda update -n base -c defaults conda
77 | conda create --name ml_template python=3.9
78 | conda activate ml_template
79 | conda install pip
80 | conda install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch
81 | conda install pytorch-lightning -c conda-forge
82 | pip install omegaconf \
83 | pytest \
84 | sh \
85 | pre-commit \
86 | mlflow \
87 | azureml-mlflow \
88 | azureml-core \
89 | torch_tb_profiler \
90 | opencv-python \
91 | black isort flake8 \
92 | psutil \
93 | rich
94 | conda env export | grep -v "^prefix: " > docker/environment.yml
95 | pre-commit install
96 | pre-commit run --all-files
97 | pip install -e .
98 | ```
99 |
100 | For CUDA 11.6:
101 |
102 | ```
103 | conda update -n base -c defaults conda
104 | conda create --name ml_template_cu116 python=3.9
105 | conda activate ml_template_cu116
106 | conda install pip
107 | conda install pytorch torchvision torchaudio cudatoolkit=11.6 -c pytorch -c conda-forge
108 | pip install pytorch-lightning
109 | pip install omegaconf \
110 | pytest \
111 | sh \
112 | pre-commit \
113 | mlflow \
114 | azureml-mlflow \
115 | azureml-core \
116 | torch_tb_profiler \
117 | opencv-python \
118 | black isort flake8 \
119 | psutil \
120 | rich
121 | conda env export | grep -v "^prefix: " > docker/environment_cu116.yml
122 | pre-commit install
123 | pre-commit run --all-files
124 | pip install -e .
125 | ```
126 |
127 |
128 |
129 |
130 |
131 | Upgrading pytorch and cudatoolkit (click to expand)
132 |
133 |
134 | ```
135 | conda remove pytorch torchvision torchaudio cudatoolkit
136 | # then follow pytorch installation steps, for example:
137 | conda install pytorch torchvision torchaudio cudatoolkit=11.6 -c pytorch -c conda-forge
138 | # then update pytorch lightning:
139 | pip install pytorch-lightning --upgrade
140 | pip install pytorch-lightning[extra] --upgrade
141 | pip install -U jsonargparse[signatures] --upgrade
142 | ```
143 |
144 |
145 |
146 | - **Docker**
147 |
148 | - While submitting jobs to AzureML, we take our local conda environment and overlay them on an appropriate docker base image. For a new project / a custom conda environment, you can build the docker image locally as explained in a note later in this section. Optionally, the docker image building can be automated by CI (as explained later) if your project has a frequently update conda environment.
149 |
150 | - For `ml_template`, we have [three docker images](docker/) built automatically on each commit to `main` branch or a branch corresponding to a Pull Request.
151 | Docker images are pushed to [PRIVATEAZURECONTAINERREGISTRYNAME](https://ms.portal.azure.com/#@microsoft.onmicrosoft.com/resource/subscriptions/964a24a8-8835-43c1-9633-7d78841facf1/resourceGroups/research_team/providers/Microsoft.ContainerRegistry/registries/PRIVATEAZURECONTAINERREGISTRYNAME/repository) container registry under [ml_template](https://ms.portal.azure.com/#view/Microsoft_Azure_ContainerRegistries/RepositoryBlade/id/%2Fsubscriptions%2F964a24a8-8835-43c1-9633-7d78841facf1%2FresourceGroups%2Fresearch_team%2Fproviders%2FMicrosoft.ContainerRegistry%2Fregistries%25PRIVATEAZURECONTAINERREGISTRYNAME/repository/ml_template).
152 | To automate this for your generated repository from this template, please create an Azure Pipeline that uses `azure-pipelines.yml`.
153 |
154 | - The following tags correspond to the *latest commit on the main branch.*
155 |
156 | | Tag | Dockerfile | docker pull command | Base Image |
157 | | :-------------------------------------------: | :-------------------------------------------------------: | :-----------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
158 | | `latest` or `latest-azureml` | [azureml](docker/Dockerfile_base_azureml) | `docker pull PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest` | [mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.3-cudnn8-ubuntu20.04:latest](https://github.com/Azure/AzureML-Containers/tree/master/base/gpu/openmpi4.1.0-cuda11.3-cudnn8-ubuntu20.04) |
159 | | `latest-nightly` or `latest-azureml-nightly` | [azureml_nightly](docker/Dockerfile_base_azureml_nightly) | `docker pull PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest-nightly` | [mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.3-cudnn8-ubuntu20.04:latest](https://github.com/Azure/AzureML-Containers/tree/master/base/gpu/openmpi4.1.0-cuda11.3-cudnn8-ubuntu20.04) |
160 | | `latest-nvidia` | [nvidia](docker/Dockerfile_base_nvidia) | `docker pull PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest-nvidia` | [nvcr.io/nvidia/pytorch:22.06-py3](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html) |
161 |
162 | - Building docker images and running docker containers locally - can be useful to reproduce issues which might occur while submitting to AzureML on your local machine. Please peruse public documentation on docker + vscode.
163 |
164 | ```
165 | # pull image with [azureml image](https://hub.docker.com/_/microsoft-azureml?tab=description) as base with docker/environment.yml on top
166 | docker pull PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest
167 |
168 | # (optional) pull image with nvidia pytorch image as base
169 | docker pull PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest-nvidia (for nvidia pytorch base image. See the note below for more details.)
170 |
171 | # run image
172 | docker run -it --gpus=all -v <local_dir>:<container_dir> PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest
173 |
174 | # (optional) recommended give a name to your container
175 | docker run -it --rm --name=MYFANCYCONTAINERNAME --gpus=all -v <local_dir>:<container_dir> PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest
176 |
177 | # setup the repo (run inside the container)
178 | pip install -e .
179 |
180 | # install pre-commit (recommended). Scroll down to the "Developing" section for details.
181 | pre-commit install
182 | ```
183 |
184 |
185 |
186 | More details on docker image tags for Pull Request and main branch builds (click to expand)
187 |
188 | Similar to the `main` branch, for each pull request, we have:
189 |
190 | - `PR-<#pr_number>-latest` aka `PR-<#pr_number>-latest-azureml`
191 | - `PR-<#pr_number>-latest-nightly` aka `PR-<#pr_number>-latest-azureml-nightly`
192 | - `PR-<#pr_number>-latest-nvidia`
193 |
194 | And finally for both `main` and PR branches, we have tags corresponding to git commit hashes
195 |
196 | - `main-<commit_hash>-azureml` and `PR-<#pr_number>-<commit_hash>-azureml`
197 | - `main-<commit_hash>-azureml-nightly` and `PR-<#pr_number>-<commit_hash>-azureml-nightly`
198 | - `main-<commit_hash>-nvidia` and `PR-<#pr_number>-<commit_hash>-nvidia`
199 |
200 | For example:
201 |
202 | - `main-7fadad2b-azureml`, `main-7fadad2b-azureml-nightly`, `main-7fadad2b-nvidia`: correspond to [commit 7fadad2b](https://github.com/AutonomousSystemsResearch/ml_template/commit/7fadad2b1391cdbbc46422a6865caaf0300b9af8) on `main` branch with our three different dockerfiles
203 | - `PR-50-latest-azureml`, `PR-50-latest-azureml-nightly`, `PR-50-latest-nvidia`: correspond to latest commit on [PR#50](https://github.com/AutonomousSystemsResearch/ml_template/pull/50) with our three different dockerfiles
204 | - `PR-50-eef3b90-azureml`, `PR-50-eef3b90-azureml-nightly`, `PR-50-eef3b90-nvidia`: correspond to [commit eef3b90](https://github.com/AutonomousSystemsResearch/ml_template/pull/50/commits/eef3b900fc956614c7d45eac6fa9245b57f7bd72) on [PR#50](https://github.com/AutonomousSystemsResearch/ml_template/pull/50) with our three different dockerfiles
205 |
206 |
207 |
208 |
209 | Building and understanding our Dockerfiles (click to expand)
210 |
211 |
212 | - We have three docker files:
213 |
214 | - azureml base:
215 | - [docker/Dockerfile_base_azureml](docker/Dockerfile_base_azureml)
216 | - [docker/Dockerfile_base_azureml_nightly](docker/Dockerfile_base_azureml_nightly)
217 | - nvidia pytorch base:
218 | - [docker/Dockerfile_base_nvidia](docker/Dockerfile_base_nvidia).
219 |
220 | - Both of the azureml base images grabs a base image from [here](https://github.com/Azure/AzureML-Containers/tree/master/base/gpu), and put the user's conda environment ([docker/environment.yml](docker/environment.yml)) on top of the base page.
221 |
222 | - In the `latest-azureml` version, packages in your local conda environment should match the docker image exactly.
223 |
224 | - In the `latest-azureml-nightly` image, pytorch (including cudatoolkit) and pytorch lightning are updated to the nightly versions.
225 |
226 | - The nvidia pytorch base image grabs a base image from [here](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch/tags) ([here](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html) for details), which already has the latest version of pytorch.
227 | Instead of using user's conda environment, this docker file uses `pip` to install pytorch lightning and other dependencies on top of base image. So this image can have different versions of packages as compared to your conda environment.
228 |
229 | All docker images accept a build argument to update the base image version easily:
230 |
231 | - azureml images:
232 | - take base azure image name's suffix **and** tag. see available options [here](https://github.com/Azure/AzureML-Containers/tree/master/base/gpu):
233 | - examples: `openmpi4.1.0-cuda11.3-cudnn8-ubuntu20.04:latest`, `openmpi4.1.0-cuda11.6-cudnn8-ubuntu20.04:latest`, and so on.
234 | - nvidia pytorch image:
235 | - takes base nvidia image name's tag only.
236 | - see [available tags here](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch/tags) and [the release notes for their contents](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html)
237 | - examples: `22.06-py3`, `22.05-py3`, and so on.
238 |
239 | Please review the arguments in the dockerfiles carefully. These can also be seen by reading through [azure-pipelines.yml](azure-pipelines.yml).
240 |
241 | Building the azure-ml base + conda env images locally:
242 |
243 | ```
244 | cd docker;
245 |
246 | docker build \
247 | -f Dockerfile_base_azureml \
248 | --build-arg BASE_IMAGE=openmpi4.1.0-cuda11.3-cudnn8-ubuntu20.04:latest \
249 | -t PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest-azureml .
250 |
251 | # note that in the PRIVATEAZURECONTAINERREGISTRYNAME acr, latest is equivalent to latest-azureml tag. So, we can just re-tag the image:
252 | docker tag PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest-azureml PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest
253 | ```
254 |
255 | For the CUDA 11.6 version:
256 |
257 | ```
258 | cd docker;
259 |
260 | docker build \
261 | -f Dockerfile_base_azureml_cu116 \
262 | --build-arg BASE_IMAGE=openmpi4.1.0-cuda11.6-cudnn8-ubuntu20.04:latest \
263 | -t PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest-azureml-cu116 .
264 |
265 | # note that in the PRIVATEAZURECONTAINERREGISTRYNAME acr, latest is equivalent to latest-azureml tag. So, we can just re-tag the image:
266 | docker tag PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest-azureml-cu116 PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest-cu116
267 | ```
268 |
269 | Building the nvidia-pytorch image locally:
270 |
271 | ```
272 | # building nvidia-pytorch image with locally.
273 | cd docker;
274 |
275 | docker build \
276 | -f Dockerfile_base_nvidia \
277 | --build-arg BASE_IMAGE=22.06-py3 \
278 | -t PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest-nvidia .
279 | ```
280 |
281 |
282 |
283 |
284 |
285 | Developing inside docker containers with VSCode: (click to expand)
286 |
287 |
288 | - [Attach to a docker container](https://code.visualstudio.com/docs/remote/attach-container)
289 |
290 | - [Devcontainer](https://code.visualstudio.com/docs/remote/containers)
291 |
292 | > **Note**: This method can be used on an Azure VM or locally with no change and uses docker
293 |
294 | Follow the steps below:
295 |
296 | - Connect to your remote Azure VM using VS Code
297 | - Open the workspace within a docker container for development, either using the popup as shown in the animation above, or by searching for `(Re)Build and (Re)open in container` in the command palette (hit `Ctrl+Shift+P` to open the command palette)
298 | - After setup is complete, it is time to set up the repository:
299 | ```
300 | pip install -e .
301 | pre-commit install
302 | ```
303 | - > **Note**: By default, the devcontainer uses the [azureml-conda base image](docker/Dockerfile_base_azureml). We can also use the [nvidia base image](docker/Dockerfile_base_nvidia) by modifying the `dockerfile` line in [devcontainer.json](.devcontainer/devcontainer.json). Similarly, we can edit the docker files build argument therein itself.
304 |
305 |
306 |
307 | #### Running MNIST example
308 |
309 | - Understanding OmegaConf and config files
310 |
311 | - Please review OmegaConf's [github readme](https://github.com/omry/omegaconf#releases) for [their documentation](https://omegaconf.readthedocs.io/en/2.2_branch/), [slides (for ver 2.1)](https://docs.google.com/presentation/d/e/2PACX-1vT_UIV7hCnquIbLUm4NnkUpXvPEh33IKiUEvPRF850WKA8opOlZOszjKdZ3tPmf8u7hGNP6HpqS-NT5/pub?start=false&loop=false&delayms=3000&slide=id.p), and a [live tutorial](https://github.com/omry/omegaconf#live-tutorial).
312 |
313 | - Single GPU
314 |
315 | ```
316 | python src/train.py base=configs/train.yaml trainer.num_nodes=1 trainer.devices=1
317 | ```
318 |
319 | - Multiple GPUs
320 |
321 | ```
322 | python src/train.py base=configs/train.yaml trainer.num_nodes=1 trainer.devices=4
323 | ```
324 |
325 | ### Running on Azure
326 |
327 | Note: This section uses internal tools for job submission to Azure ML workspaces. This section is not supported publicly at the time of writing. However, one may peruse existing public documentation on azure ml.
328 |
329 | ### Developing
330 |
331 | #### Tests
332 |
333 | The template has some basic tests in `tests/` directory. To run them, run:
334 |
335 | ```
336 | # run all tests
337 | pytest
338 |
339 | # run single test
340 | pytest tests/test_dev_fast_run.py
341 | ```
342 |
343 | List of tests implemented:
344 |
345 | - [fast_dev_run](https://pytorch-lightning.readthedocs.io/en/stable/common/debugging.html#fast-dev-run): a simple check that runs your trainer on a single batch of the train, valid, and test datasets.
346 |   It can also be useful for quickly checking that your code works while adding new features:
347 | ```
348 | python src/train.py base=configs/train.yaml trainer.fast_dev_run=True
349 | ```
350 |
351 | #### Code formatting and Linting
352 |
353 | We use:
354 |
355 | - [black](https://black.readthedocs.io/en/stable/) for code formatting
356 |
357 | - [isort](https://pycqa.github.io/isort/) for import ordering
358 |
359 | - [pycln](https://hadialqattan.github.io/pycln/#/) for removing unused imports
360 |
361 | - Running locally:
362 |
363 | ```
364 | $ cd ml_template;
365 | $ black .
366 | $ isort .
367 | $ pycln --all .
368 | ```
369 |
370 | #### Pre-commit Hooks: Automating Code formatting and Linting
371 |
372 | [pre-commit](https://pre-commit.com/) hooks automate black autoformatting and ensuring PEP8 compliance.
373 |
374 | - Setting up:
375 |
376 | ```
377 | $ cd ml_template;
378 | $ pre-commit install
379 | ```
380 |
381 | - Running:
382 |
383 | After the above step, `pre-commit` will run **automatically** when you `git commit`.
384 | If the run fails with errors in red, you can check the edits made by `pre-commit` by `git diff`.
385 | If the changes look good, (1) `git add` those files again, and then (2) run `git commit` again.
386 |
387 | Optionally, you can also run pre-commit manually by:
388 |
389 | ```
390 | $ pre-commit run --all-files
391 | ```
392 |
393 | - Updating hooks:
394 | Use the `autoupdate` command to keep the versions of formatters in `.pre-commit-config.yaml` up to date.
395 |
396 | ```
397 | $ pre-commit autoupdate
398 | ```
399 |
400 | ### Continuous Integration
401 |
402 | - **Github Actions**
403 |
404 | - [Pre-commit checks](.github/workflows/pre-commit.yml)
405 | - [Template cleanup](.github/workflows/template-cleanup.yml):
406 | When a new repository is generated using this template, this action replaces `README.md` with `README_template.md` to keep microsoft links internal.
407 |
408 | - **Azure Pipelines**
409 |
410 | - Create an azure devops pipeline for your repository.
411 | This automates building your docker images, and also runs pytests on them.
412 |
413 | - The azure pipeline logs can be seen on the Azure DevOps webpage, but not in the github UI directly.
414 |
415 | Pull Request example:
416 |
417 | - You can click `View more details on Azure Pipelines` under the `Checks` section of a github PR.
418 | - See [PR#6/checks](https://github.com/AutonomousSystemsResearch/ml_template/pull/6/checks) for an example.
419 |
420 |
421 |
422 | - [Docker Build and Push Image](azure-pipelines.yml)
423 |
424 | See the job `BuildDockerImageAndPush` in [azure-pipelines.yml](azure-pipelines.yml). It will build the image in [docker/Dockerfile](docker/Dockerfile) and push it to a private azure container registry
425 |
426 | See docker section under #running-locally for details
427 |
428 | ### Contributing
429 |
430 | - conda `environment.yml` update:
431 |
432 | If you install packages in conda, update the `docker/environment.yml` by `conda env export | grep -v "^prefix: " > docker/environment.yml`, and send a PR.
433 |
434 | ## Reference Repositories
435 |
436 | - Pytorch Lightning:
437 |
438 | - Pytorch v/s Pytorch Lightning
439 |
440 | - [PyTorch Lightning for Dummies - A Tutorial and Overview
441 | ](https://www.assemblyai.com/blog/pytorch-lightning-for-dummies/)
442 | - [PyTorch Lightning: DataModules, Callbacks, TPU, and Loggers
443 | ](https://dev.to/krypticmouse/pytorch-lightning-datamodules-callbacks-tpu-and-loggers-4nhb)
444 |
445 | - Template / reference repositories
446 |
447 | - https://github.com/ashleve/lightning-hydra-template
448 | - https://github.com/lkhphuc/lightning-hydra-template
449 | - [Pytorch lightning bolts](https://lightning-bolts.readthedocs.io/en/latest/)
450 | - Look inside the code for datamodules, datasets, models, etc: https://github.com/PyTorchLightning/lightning-bolts/tree/master/pl_bolts
451 |
452 | - Pytorch Geometric:
453 |
454 | - [lightning-examples](https://github.com/pyg-team/pytorch_geometric/tree/d451d6d20287b03cbe5036e5c53ee5f633f3c429/examples/pytorch_lightning)
455 | - [torch_geometric.data.lightning_datamodule](https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/data/lightning_datamodule.html)
456 | - [Graph Gym](https://pytorch-geometric.readthedocs.io/en/latest/notes/graphgym.html)
457 |
458 | - Pytorch data, datapipes, dataloaders:
459 |
460 | - https://pytorch.org/data/main/examples.html
461 | - https://github.com/tcapelle/torchdata
462 | - https://github.com/pytorch/data
463 |
--------------------------------------------------------------------------------
/README_template.md:
--------------------------------------------------------------------------------
1 | # Name_of_Your_Project
2 |
3 | ## Setting up
4 |
5 | - Using conda
6 |
7 | ```
8 | # create env
9 | conda env create --file docker/environment.yml
10 |
11 | # activate it
12 | conda activate NAMEOFYOURPROJECT
13 |
14 | # install this repo
15 | (NAMEOFYOURPROJECT) $ pip install -e .
16 | ```
17 |
18 | - Using docker
19 |
20 | ```
21 | # pull image with [azureml image](https://hub.docker.com/_/microsoft-azureml?tab=description) as base with docker/environment.yml on top
22 | docker pull NAMEOFYOURPROJECT:latest
23 |
24 | # pull image with nvidia pytorch image as base
25 | # docker pull NAMEOFYOURPROJECT:latest-nvidia
26 |
27 | # run image
28 | docker run -it --gpus=all -v :
2 |
3 | ## Security
4 |
5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
6 |
7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below.
8 |
9 | ## Reporting Security Issues
10 |
11 | **Please do not report security vulnerabilities through public GitHub issues.**
12 |
13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report).
14 |
15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey).
16 |
17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc).
18 |
19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20 |
21 | - Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22 | - Full paths of source file(s) related to the manifestation of the issue
23 | - The location of the affected source code (tag/branch/commit or direct URL)
24 | - Any special configuration required to reproduce the issue
25 | - Step-by-step instructions to reproduce the issue
26 | - Proof-of-concept or exploit code (if possible)
27 | - Impact of the issue, including how an attacker might exploit the issue
28 |
29 | This information will help us triage your report more quickly.
30 |
31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs.
32 |
33 | ## Preferred Languages
34 |
35 | We prefer all communications to be in English.
36 |
37 | ## Policy
38 |
39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd).
40 |
41 |
42 |
--------------------------------------------------------------------------------
/assets/images/table-of-contents.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/auto-sys-ml-template/8884d8f3b826cab300cd8f0e227fd7ad6b887da8/assets/images/table-of-contents.png
--------------------------------------------------------------------------------
/azure-pipelines.yml:
--------------------------------------------------------------------------------
1 | trigger:
2 | - main
3 | # paths:
4 | # include:
5 | # - docker
6 |
7 | pr:
8 | - main
9 |
10 | resources:
11 | - repo: self
12 |
13 | variables:
14 | # Note: to customize the pipeline to use private ACRs other than commondockerimages,
15 | # you need to change BOTH dockerRegistryServiceConnection and containerRegistryName
16 |
17 | # Container registry service connection established during pipeline creation
18 | dockerRegistryServiceConnection: "442ea973-c852-4792-aa09-fab4a9df791f"
19 | containerRegistryName: "commondockerimages.azurecr.io"
20 |
21 | dockerfileazuremlPath: "$(Build.SourcesDirectory)/docker/Dockerfile_base_azureml"
22 | dockerfileazuremlnightlyPath: "$(Build.SourcesDirectory)/docker/Dockerfile_base_azureml_nightly"
23 | dockerfilenvidiaPath: "$(Build.SourcesDirectory)/docker/Dockerfile_base_nvidia"
24 |
25 | tagLatest: "latest"
26 |
27 | # Agent VM image name
28 | vmImageName: "ubuntu-latest"
29 |
30 | stages:
31 | # docs: https://docs.microsoft.com/azure/devops/pipelines/languages/docker
32 | - stage: BuildDockerImagesAndRunPytest
33 | displayName: Build docker; run pytests on built images
34 | jobs:
35 | - job: DefineDockerTags
36 | displayName: define docker tags
37 | pool:
38 | vmImage: $(vmImageName)
39 | steps:
40 | - bash: |
41 | github_organization_prefix="AutonomousSystemsResearch/"
42 | full_repo_name=$(Build.Repository.Name)
43 | repo_name=${full_repo_name#"$github_organization_prefix"}
44 | branch_name=$(Build.SourceBranchName)
45 |
46 | git_short_hash_main=`git rev-parse --short=7 HEAD`
47 | git_hash_pr=$(System.PullRequest.SourceCommitId)
48 | git_short_hash_pr=${git_hash_pr:0:7}
49 | pr_number=$(System.PullRequest.PullRequestNumber)
50 |
51 | tag_main_git_commit=main-$git_short_hash_main
52 | tag_pr_git_commit=PR-$pr_number-$git_short_hash_pr
53 | tag_pr_latest=PR-$pr_number-latest
54 |
55 | echo ""
56 | echo "full repo name: $(Build.Repository.Name)"
57 | echo "repo name: $repo_name"
58 | echo "Build Id: $(Build.BuildId)"
59 | echo "Build BuildNumber: $(Build.BuildNumber)"
60 | echo "Build Reason: $(Build.Reason)"
61 | echo "Build Branch Name: $(Build.SourceBranchName)"
62 | echo "git commit message: $(Build.SourceVersionMessage)"
63 | echo "git hash (main branch): $(Build.SourceVersion)"
64 | echo "git hash short (main branch): $git_short_hash_main"
65 | echo "PR branch: $(System.PullRequest.SourceBranch)"
66 | echo "PR number: $(System.PullRequest.PullRequestNumber)"
67 | echo "PR ID: $(System.PullRequest.PullRequestId)"
68 | echo "git hash (PR branch): $(System.PullRequest.SourceCommitId)"
69 | echo "git hash short (PR branch): $git_short_hash_pr"
70 |
71 | # set pipeline variables which can be referenced in the jobs that follow to tag docker images appropriately
72 | echo "##vso[task.setvariable variable=repoName;isoutput=true]$repo_name"
73 | echo "##vso[task.setvariable variable=tagPRLatest;isoutput=true]$tag_pr_latest"
74 |
75 | if [[ "$branch_name" == "main" ]]; then
76 | echo "##vso[task.setvariable variable=tagOfThisBuild;isoutput=true]$tag_main_git_commit"
77 | fi
78 | if [[ "$branch_name" == "merge" ]]; then
79 | echo "##vso[task.setvariable variable=tagOfThisBuild;isoutput=true]$tag_pr_git_commit"
80 | fi
81 |
82 | ## deprecated; but might be of use in the future
83 | # echo "##vso[task.setvariable variable=tagMainGitCommitHash;isoutput=true]$tag_main_git_commit"
84 | # echo "##vso[task.setvariable variable=tagPRGitCommitHash;isoutput=true]$tag_pr_git_commit"
85 |
86 | # print tags:
87 | echo "tag_pr_latest: $tag_pr_latest"
88 | echo "tag_pr_git_commit: $tag_pr_git_commit"
89 | echo "tag_main_git_commit: $tag_main_git_commit"
90 |
91 | # print outputvars:
92 | echo "tag_pr_latest: $tag_pr_latest"
93 | echo "tag_pr_git_commit: $tag_pr_git_commit"
94 | echo "tag_main_git_commit: $tag_main_git_commit"
95 | name: DockerTagVars # because we're going to depend on it, we need to name the step
96 | displayName: (debug) print git info
97 |
98 | - job: BuildDockerAzureMLBase
99 | dependsOn: DefineDockerTags
100 | displayName: build azureml; run pytest
101 | pool:
102 | vmImage: $(vmImageName)
103 | variables:
104 | tagOfThisBuild: $[ dependencies.DefineDockerTags.outputs['DockerTagVars.tagOfThisBuild'] ]
105 | tagPRLatest: $[ dependencies.DefineDockerTags.outputs['DockerTagVars.tagPRLatest'] ]
106 | repoName: $[ dependencies.DefineDockerTags.outputs['DockerTagVars.repoName'] ]
107 | steps:
108 | - bash: |
109 | echo "repoName: $(repoName)"
110 | echo "tagOfThisBuild: $(tagOfThisBuild)"
111 | echo "tagPRLatest: $(tagPRLatest)"
112 | displayName: (debug) print pipeline vars
113 |
114 | - task: Docker@2
115 | displayName: Build and Push Image
116 | inputs:
117 | command: buildAndPush
118 | repository: $(repoName)
119 | dockerfile: $(dockerfileazuremlPath)
120 | containerRegistry: $(dockerRegistryServiceConnection)
121 | ${{ if eq(variables['Build.SourceBranchName'], 'merge') }}:
122 | tags: |
123 | $(tagOfThisBuild)-azureml
124 | $(tagPRLatest)-azureml
125 | $(tagPRLatest)
126 | ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
127 | tags: |
128 | $(tagOfThisBuild)-azureml
129 | $(tagLatest)-azureml
130 | $(tagLatest)
131 |
132 | - bash: |
133 | docker run -d -i --name testapp -v $(Build.SourcesDirectory):/workdir -w /workdir $(containerRegistryName)/$(repoName):$(tagOfThisBuild)-azureml
134 | docker exec testapp bash -c "python3 -c \"import torch; print('torch: version', torch.__version__)\""
135 | docker exec testapp bash -c "python3 -c \"import pytorch_lightning; print('pytorch_lightning: version', pytorch_lightning.__version__)\""
136 | docker exec testapp bash -c "python3 -c \"import torch; print('torch.cuda.version:', torch.version.cuda)\""
137 | displayName: print versions
138 |
139 | - script: |
140 | docker exec testapp bash -c "pip install -e .; pytest tests/test_dev_fast_run.py -k test_fast_dev_run_cpu"
141 | displayName: pytest
142 |
143 | - job: BuildDockerAzureMLBaseNightly
144 | dependsOn: DefineDockerTags
145 | displayName: build azureml nightly; run pytest
146 | pool:
147 | vmImage: $(vmImageName)
148 | variables:
149 | tagOfThisBuild: $[ dependencies.DefineDockerTags.outputs['DockerTagVars.tagOfThisBuild'] ]
150 | tagPRLatest: $[ dependencies.DefineDockerTags.outputs['DockerTagVars.tagPRLatest'] ]
151 | repoName: $[ dependencies.DefineDockerTags.outputs['DockerTagVars.repoName'] ]
152 | steps:
153 | - bash: |
154 | echo "repoName: $(repoName)"
155 | echo "tagOfThisBuild: $(tagOfThisBuild)-azureml-nightly"
156 | echo "tagPRLatest: $(tagPRLatest)-azureml-nightly"
157 | displayName: (debug) print pipeline vars
158 |
159 | - task: Docker@2
160 | displayName: Build and Push Image
161 | inputs:
162 | command: buildAndPush
163 | repository: $(repoName)
164 | dockerfile: $(dockerfileazuremlnightlyPath)
165 | containerRegistry: $(dockerRegistryServiceConnection)
166 | ${{ if eq(variables['Build.SourceBranchName'], 'merge') }}:
167 | tags: |
168 | $(tagOfThisBuild)-azureml-nightly
169 | $(tagPRLatest)-azureml-nightly
170 | ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
171 | tags: |
172 | $(tagOfThisBuild)-azureml-nightly
173 | $(tagLatest)-azureml-nightly
174 |
175 | - bash: |
176 | docker run -d -i --name testapp -v $(Build.SourcesDirectory):/workdir -w /workdir $(containerRegistryName)/$(repoName):$(tagOfThisBuild)-azureml-nightly
177 | docker exec testapp bash -c "python3 -c \"import torch; print('torch: version', torch.__version__)\""
178 | docker exec testapp bash -c "python3 -c \"import pytorch_lightning; print('pytorch_lightning: version', pytorch_lightning.__version__)\""
179 | docker exec testapp bash -c "python3 -c \"import torch; print('torch.cuda.version:', torch.version.cuda)\""
180 | displayName: print versions
181 |
182 | - script: |
183 | docker exec testapp bash -c "pip install -e .; pytest tests/test_dev_fast_run.py -k test_fast_dev_run_cpu"
184 | displayName: pytest
185 |
186 | - job: BuildDockerNvidiaBasePipInstall
187 | dependsOn: DefineDockerTags
188 | displayName: build nvidia pytorch; run pytest
189 | pool:
190 | vmImage: $(vmImageName)
191 | variables:
192 | tagOfThisBuild: $[ dependencies.DefineDockerTags.outputs['DockerTagVars.tagOfThisBuild'] ]
193 | tagPRLatest: $[ dependencies.DefineDockerTags.outputs['DockerTagVars.tagPRLatest'] ]
194 | repoName: $[ dependencies.DefineDockerTags.outputs['DockerTagVars.repoName'] ]
195 | steps:
196 | - bash: |
197 | echo "repoName: $(repoName)"
198 | echo "tagOfThisBuild: $(tagOfThisBuild)-nvidia"
199 | echo "tagPRLatest: $(tagPRLatest)-nvidia"
200 |
201 | # echo "tagMainGitCommitHash: $(tagMainGitCommitHash)"
202 | # echo "tagPRGitCommitHash: $(tagPRGitCommitHash)"
203 | displayName: (debug) print pipeline vars
204 |
205 | - task: Docker@2
206 | displayName: Build and Push Image
207 | inputs:
208 | command: buildAndPush
209 | repository: $(repoName)
210 | dockerfile: $(dockerfilenvidiaPath)
211 | containerRegistry: $(dockerRegistryServiceConnection)
212 | ${{ if eq(variables['Build.SourceBranchName'], 'merge') }}:
213 | tags: |
214 | $(tagOfThisBuild)-nvidia
215 | $(tagPRLatest)-nvidia
216 | ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
217 | tags: |
218 | $(tagOfThisBuild)-nvidia
219 | $(tagLatest)-nvidia
220 |
221 | - bash: |
222 | docker run -d -i --name testapp -v $(Build.SourcesDirectory):/workdir -w /workdir $(containerRegistryName)/$(repoName):$(tagOfThisBuild)-nvidia
223 | docker exec testapp bash -c "python3 -c \"import torch; print('torch: version', torch.__version__)\""
224 | docker exec testapp bash -c "python3 -c \"import pytorch_lightning; print('pytorch_lightning: version', pytorch_lightning.__version__)\""
225 | docker exec testapp bash -c "python3 -c \"import torch; print('torch.cuda.version:', torch.version.cuda)\""
226 | displayName: print versions
227 |
228 | - script: |
229 | docker exec testapp bash -c "pip install -e .; pytest tests/test_dev_fast_run.py -k test_fast_dev_run_cpu"
230 | displayName: pytest
231 |
232 | - job: PytestAzureMLBaseTagLatest
233 | dependsOn: DefineDockerTags
234 | displayName: pytest latest-azureml
235 | pool:
236 | vmImage: $(vmImageName)
237 | variables:
238 | repoName: $[ dependencies.DefineDockerTags.outputs['DockerTagVars.repoName'] ]
239 | steps:
240 | - checkout: self
241 |
242 | - task: Docker@2
243 | displayName: Login to ACR
244 | inputs:
245 | command: login
246 | containerRegistry: $(dockerRegistryServiceConnection)
247 |
248 | - script: |
249 | docker run -d -i --name testapp -v $(Build.SourcesDirectory):/workdir -w /workdir $(containerRegistryName)/$(repoName):$(tagLatest)-azureml
250 | displayName: docker pull and run
251 |
252 | - script: |
253 | docker exec testapp bash -c "pip install -e .; pytest tests/test_dev_fast_run.py -k test_fast_dev_run_cpu"
254 | displayName: pytest
255 |
256 | - job: PytestAzureMLBaseTagLatestNightly
257 | dependsOn: DefineDockerTags
258 | displayName: pytest latest-azureml-nightly
259 | pool:
260 | vmImage: $(vmImageName)
261 | variables:
262 | repoName: $[ dependencies.DefineDockerTags.outputs['DockerTagVars.repoName'] ]
263 | steps:
264 | - checkout: self
265 |
266 | - task: Docker@2
267 | displayName: Login to ACR
268 | inputs:
269 | command: login
270 | containerRegistry: $(dockerRegistryServiceConnection)
271 |
272 | - script: |
273 | docker run -d -i --name testapp -v $(Build.SourcesDirectory):/workdir -w /workdir $(containerRegistryName)/$(repoName):$(tagLatest)-azureml-nightly
274 | displayName: docker pull and run
275 |
276 | - script: |
277 | docker exec testapp bash -c "pip install -e .; pytest tests/test_dev_fast_run.py -k test_fast_dev_run_cpu"
278 | displayName: pytest
279 |
280 | - job: PytestNvidiaBaseTagLatest
281 | dependsOn: DefineDockerTags
282 | displayName: pytest latest-nvidia
283 | pool:
284 | vmImage: $(vmImageName)
285 | variables:
286 | repoName: $[ dependencies.DefineDockerTags.outputs['DockerTagVars.repoName'] ]
287 | steps:
288 | - checkout: self
289 |
290 | - task: Docker@2
291 | displayName: Login to ACR
292 | inputs:
293 | command: login
294 | containerRegistry: $(dockerRegistryServiceConnection)
295 |
296 | - script: |
297 | docker run -d -i --name testapp -v $(Build.SourcesDirectory):/workdir -w /workdir $(containerRegistryName)/$(repoName):$(tagLatest)-nvidia
298 | displayName: docker pull and run
299 |
300 | - script: |
301 | docker exec testapp bash -c "pip install -e .; pytest tests/test_dev_fast_run.py -k test_fast_dev_run_cpu"
302 | displayName: pytest
303 |
--------------------------------------------------------------------------------
/configs/train.yaml:
--------------------------------------------------------------------------------
1 | seed_everything: 42
2 |
3 | trainer:
4 | default_root_dir: ${oc.env:AMLT_OUTPUT_DIR,outputs}
5 |
6 | num_nodes: 1
7 | devices: 1
8 | accelerator: gpu
9 | strategy: ddp_find_unused_parameters_false
10 |
11 | min_epochs: 1
12 | max_epochs: 10
13 | enable_progress_bar: true
14 |
15 | sync_batchnorm: True
16 | enable_checkpointing: True
17 | resume_from_checkpoint: null
18 |
19 | # debugging
20 | fast_dev_run: false
21 |
22 | data:
23 | _target_: datamodules.mnist_datamodule.MNISTDataModule
24 |
25 | file_params:
26 | base_dir: data/
27 | train_val_test_split: [55_000, 5_000, 10_000]
28 |
29 | train_params:
30 | batch_size: 128
31 | num_workers: 0
32 | pin_memory: False
33 |
34 | model:
35 | _target_: models.mnist_module.MNISTLitModule
36 |
37 | mlp_config:
38 | input_size: 784
39 | lin1_size: 256
40 | lin2_size: 256
41 | lin3_size: 256
42 | output_size: 10
43 |
44 | optimizer_config:
45 | lr: 0.001
46 | weight_decay: 0.0005
47 |
48 | logger:
49 | tensorboard:
50 | _target_: pytorch_lightning.loggers.tensorboard.TensorBoardLogger
51 | save_dir: ${trainer.default_root_dir}/logs
52 | name: null
53 | version: null
54 | log_graph: False
55 | default_hp_metric: True
56 | prefix: ""
57 |
58 | callbacks:
59 | checkpoint:
60 | _target_: pytorch_lightning.callbacks.ModelCheckpoint
61 | dirpath: "${trainer.default_root_dir}/checkpoints/"
62 | monitor: "val/acc" # name of the logged metric which determines when model is improving
63 | mode: "max" # "max" means higher metric value is better, can be also "min"
64 | save_top_k: 1 # save k best models (determined by above metric)
65 | save_last: True # additionally always save model from last epoch
66 | verbose: False
67 | filename: "epoch_{epoch:03d}"
68 | auto_insert_metric_name: False
69 |
70 | early_stopping:
71 | _target_: pytorch_lightning.callbacks.EarlyStopping
72 | monitor: "val/loss" # name of the logged metric which determines when model is improving
73 | mode: "min" # "max" means higher metric value is better, can be also "min"
74 | patience: 100 # how many validation epochs of not improving until training stops
75 | min_delta: 0 # minimum change in the monitored metric needed to qualify as an improvement
76 |
77 | model_summary:
78 | _target_: pytorch_lightning.callbacks.RichModelSummary
79 | max_depth: -1
80 |
81 | progress:
82 | _target_: pytorch_lightning.callbacks.RichProgressBar
83 |
84 | lr_mon:
85 | _target_: pytorch_lightning.callbacks.LearningRateMonitor
86 | logging_interval: "epoch"
87 |
--------------------------------------------------------------------------------
/docker/Dockerfile_base_azureml:
--------------------------------------------------------------------------------
1 | # see latest azureml base images tags here
2 | # - https://github.com/Azure/AzureML-Containers/tree/master/base/gpu
3 | # - https://hub.docker.com/_/microsoft-azureml?tab=description
4 |
5 | ARG BASE_IMAGE=openmpi4.1.0-cuda11.3-cudnn8-ubuntu20.04:latest
6 |
7 | FROM mcr.microsoft.com/azureml/${BASE_IMAGE}
8 |
9 | ARG DEBIAN_FRONTEND=noninteractive
10 |
11 | RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
12 | build-essential \
13 | software-properties-common \
14 | cmake \
15 | g++-7 \
16 | git \
17 | gpg \
18 | curl \
19 | vim \
20 | wget \
21 | ca-certificates \
22 | libjpeg-dev \
23 | libpng-dev \
24 | librdmacm1 \
25 | libibverbs1 \
26 | ibverbs-providers \
27 | openssh-client \
28 | openssh-server \
29 | libsm6 \
30 | libxext6 \
31 | ffmpeg \
32 | libfontconfig1 \
33 | libxrender1 \
34 | libgl1-mesa-glx &&\
35 | apt-get clean && rm -rf /var/lib/apt/lists/*
36 |
37 | ADD environment.yml /tmp/environment.yml
38 | RUN conda env update -n base -f /tmp/environment.yml
39 |
--------------------------------------------------------------------------------
/docker/Dockerfile_base_azureml_cu116:
--------------------------------------------------------------------------------
1 | # see latest azureml base images tags here
2 | # - https://github.com/Azure/AzureML-Containers/tree/master/base/gpu
3 | # - https://hub.docker.com/_/microsoft-azureml?tab=description
4 |
5 | ARG BASE_IMAGE=openmpi4.1.0-cuda11.6-cudnn8-ubuntu20.04:latest
6 |
7 | FROM mcr.microsoft.com/azureml/${BASE_IMAGE}
8 |
9 | ARG DEBIAN_FRONTEND=noninteractive
10 |
11 | RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
12 | build-essential \
13 | software-properties-common \
14 | cmake \
15 | g++-7 \
16 | git \
17 | gpg \
18 | curl \
19 | vim \
20 | wget \
21 | ca-certificates \
22 | libjpeg-dev \
23 | libpng-dev \
24 | librdmacm1 \
25 | libibverbs1 \
26 | ibverbs-providers \
27 | openssh-client \
28 | openssh-server \
29 | libsm6 \
30 | libxext6 \
31 | ffmpeg \
32 | libfontconfig1 \
33 | libxrender1 \
34 | libgl1-mesa-glx &&\
35 | apt-get clean && rm -rf /var/lib/apt/lists/*
36 |
37 | ADD environment_cu116.yml /tmp/environment.yml
38 | RUN conda env update -n base -f /tmp/environment.yml
39 |
--------------------------------------------------------------------------------
/docker/Dockerfile_base_azureml_nightly:
--------------------------------------------------------------------------------
1 | # see latest azureml base images tags here
2 | # - https://github.com/Azure/AzureML-Containers/tree/master/base/gpu
3 | # - https://hub.docker.com/_/microsoft-azureml?tab=description
4 |
5 | ARG BASE_IMAGE=openmpi4.1.0-cuda11.3-cudnn8-ubuntu20.04:latest
6 |
7 | FROM mcr.microsoft.com/azureml/${BASE_IMAGE}
8 |
9 | ARG DEBIAN_FRONTEND=noninteractive
10 |
11 | RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
12 | build-essential \
13 | software-properties-common \
14 | cmake \
15 | g++-7 \
16 | git \
17 | gpg \
18 | curl \
19 | vim \
20 | wget \
21 | ca-certificates \
22 | libjpeg-dev \
23 | libpng-dev \
24 | librdmacm1 \
25 | libibverbs1 \
26 | ibverbs-providers \
27 | openssh-client \
28 | openssh-server \
29 | libsm6 \
30 | libxext6 \
31 | ffmpeg \
32 | libfontconfig1 \
33 | libxrender1 \
34 | libgl1-mesa-glx &&\
35 | apt-get clean && rm -rf /var/lib/apt/lists/*
36 |
37 | # use user's conda env as base
38 | ADD environment.yml /tmp/environment.yml
39 | RUN conda env update -n base -f /tmp/environment.yml
40 |
41 | # update pytorch installed from the above step to nightly
42 | RUN conda update pytorch torchvision torchaudio -c pytorch-nightly -y
43 |
44 | # install pytorch lightning nightly
45 | RUN pip install https://github.com/PyTorchLightning/pytorch-lightning/archive/master.zip && \
46 | pip install jsonargparse[signatures] --upgrade
47 |
48 | # (optional) update all conda pkgs
49 | # RUN conda update --all
50 |
--------------------------------------------------------------------------------
/docker/Dockerfile_base_nvidia:
--------------------------------------------------------------------------------
1 | # tags release notes: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html
2 | # tags: https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch/tags?quick-deploy=false
3 |
4 | ARG BASE_IMAGE=22.06-py3
5 |
6 | FROM nvcr.io/nvidia/pytorch:${BASE_IMAGE}
7 |
8 | ARG DEBIAN_FRONTEND=noninteractive
9 |
10 | RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
11 | build-essential \
12 | software-properties-common \
13 | cmake \
14 | g++-7 \
15 | git \
16 | gpg \
17 | curl \
18 | vim \
19 | wget \
20 | ca-certificates \
21 | libjpeg-dev \
22 | libpng-dev \
23 | librdmacm1 \
24 | libibverbs1 \
25 | ibverbs-providers \
26 | openssh-client \
27 | openssh-server \
28 | libsm6 \
29 | libxext6 \
30 | ffmpeg \
31 | libfontconfig1 \
32 | libxrender1 \
33 | libgl1-mesa-glx &&\
34 | apt-get clean && rm -rf /var/lib/apt/lists/*
35 |
36 | RUN pip install click termcolor future python-dateutil \
37 | azureml-core azureml-mlflow \
38 | opencv-python scipy psutil
39 |
40 | # jsonargparse[signatures] does not work in docker, so need lightning[extra]
41 | # in conda, jsonargparse[signatures] is enough
42 | RUN pip install pytorch-lightning[extra] einops pre-commit pytest sh rich
43 | # RUN pip install pytorch-lightning jsonargparse[signatures] einops
44 |
--------------------------------------------------------------------------------
/docker/environment.yml:
--------------------------------------------------------------------------------
1 | name: ml_template
2 | channels:
3 | - pytorch
4 | - conda-forge
5 | - defaults
6 | dependencies:
7 | - _libgcc_mutex=0.1=main
8 | - _openmp_mutex=5.1=1_gnu
9 | - absl-py=1.2.0=pyhd8ed1ab_0
10 | - aiohttp=3.8.1=py39hb9d737c_1
11 | - aiosignal=1.2.0=pyhd8ed1ab_0
12 | - async-timeout=4.0.2=pyhd8ed1ab_0
13 | - attrs=22.1.0=pyh71513ae_1
14 | - blas=1.0=mkl
15 | - blinker=1.4=py_1
16 | - brotlipy=0.7.0=py39h27cfd23_1003
17 | - bzip2=1.0.8=h7b6447c_0
18 | - c-ares=1.18.1=h7f98852_0
19 | - ca-certificates=2022.9.24=ha878542_0
20 | - cachetools=5.2.0=pyhd8ed1ab_0
21 | - certifi=2022.9.24=pyhd8ed1ab_0
22 | - cffi=1.15.1=py39h74dc2b5_0
23 | - charset-normalizer=2.0.4=pyhd3eb1b0_0
24 | - click=8.1.3=py39hf3d152e_0
25 | - colorama=0.4.5=pyhd8ed1ab_0
26 | - cryptography=37.0.1=py39h9ce1e76_0
27 | - cudatoolkit=11.3.1=h2bc3f7f_2
28 | - ffmpeg=4.3=hf484d3e_0
29 | - freetype=2.11.0=h70c0345_0
30 | - frozenlist=1.2.0=py39h7f8727e_0
31 | - fsspec=2022.8.2=pyhd8ed1ab_0
32 | - giflib=5.2.1=h7b6447c_0
33 | - gmp=6.2.1=h295c915_3
34 | - gnutls=3.6.15=he1e5248_0
35 | - google-auth=2.11.1=pyh1a96a4e_0
36 | - google-auth-oauthlib=0.4.6=pyhd8ed1ab_0
37 | - grpcio=1.42.0=py39hce63b2e_0
38 | - idna=3.3=pyhd3eb1b0_0
39 | - importlib-metadata=4.11.4=py39hf3d152e_0
40 | - intel-openmp=2021.4.0=h06a4308_3561
41 | - jpeg=9e=h7f8727e_0
42 | - lame=3.100=h7b6447c_0
43 | - lcms2=2.12=h3be6417_0
44 | - ld_impl_linux-64=2.38=h1181459_1
45 | - lerc=3.0=h295c915_0
46 | - libdeflate=1.8=h7f8727e_5
47 | - libffi=3.3=he6710b0_2
48 | - libgcc-ng=11.2.0=h1234567_1
49 | - libgomp=11.2.0=h1234567_1
50 | - libiconv=1.16=h7f8727e_2
51 | - libidn2=2.3.2=h7f8727e_0
52 | - libpng=1.6.37=hbc83047_0
53 | - libprotobuf=3.15.8=h780b84a_1
54 | - libstdcxx-ng=11.2.0=h1234567_1
55 | - libtasn1=4.16.0=h27cfd23_0
56 | - libtiff=4.4.0=hecacb30_0
57 | - libunistring=0.9.10=h27cfd23_0
58 | - libwebp=1.2.2=h55f646e_0
59 | - libwebp-base=1.2.2=h7f8727e_0
60 | - lz4-c=1.9.3=h295c915_1
61 | - markdown=3.4.1=pyhd8ed1ab_0
62 | - markupsafe=2.1.1=py39hb9d737c_1
63 | - mkl=2021.4.0=h06a4308_640
64 | - mkl-service=2.4.0=py39h7f8727e_0
65 | - mkl_fft=1.3.1=py39hd3c417c_0
66 | - mkl_random=1.2.2=py39h51133e4_0
67 | - multidict=6.0.2=py39hb9d737c_1
68 | - ncurses=6.3=h5eee18b_3
69 | - nettle=3.7.3=hbbd107a_1
70 | - numpy=1.23.1=py39h6c91a56_0
71 | - numpy-base=1.23.1=py39ha15fc14_0
72 | - oauthlib=3.2.1=pyhd8ed1ab_0
73 | - openh264=2.1.1=h4ff587b_0
74 | - openssl=1.1.1q=h7f8727e_0
75 | - packaging=21.3=pyhd8ed1ab_0
76 | - pillow=9.2.0=py39hace64e9_1
77 | - pip=22.1.2=py39h06a4308_0
78 | - protobuf=3.15.8=py39he80948d_0
79 | - pyasn1=0.4.8=py_0
80 | - pyasn1-modules=0.2.7=py_0
81 | - pycparser=2.21=pyhd3eb1b0_0
82 | - pydeprecate=0.3.2=pyhd8ed1ab_0
83 | - pyjwt=2.5.0=pyhd8ed1ab_0
84 | - pyopenssl=22.0.0=pyhd3eb1b0_0
85 | - pyparsing=3.0.9=pyhd8ed1ab_0
86 | - pysocks=1.7.1=py39h06a4308_0
87 | - python=3.9.13=haa1d7c7_1
88 | - python_abi=3.9=2_cp39
89 | - pytorch=1.12.1=py3.9_cuda11.3_cudnn8.3.2_0
90 | - pytorch-lightning=1.7.7=pyhd8ed1ab_0
91 | - pytorch-mutex=1.0=cuda
92 | - pyu2f=0.1.5=pyhd8ed1ab_0
93 | - pyyaml=6.0=py39hb9d737c_4
94 | - readline=8.1.2=h7f8727e_1
95 | - requests=2.28.1=py39h06a4308_0
96 | - requests-oauthlib=1.3.1=pyhd8ed1ab_0
97 | - rsa=4.9=pyhd8ed1ab_0
98 | - setuptools=63.4.1=py39h06a4308_0
99 | - six=1.16.0=pyhd3eb1b0_1
100 | - sqlite=3.39.3=h5082296_0
101 | - tensorboard=2.10.1=pyhd8ed1ab_0
102 | - tensorboard-data-server=0.6.0=py39hd97740a_2
103 | - tensorboard-plugin-wit=1.8.1=pyhd8ed1ab_0
104 | - tk=8.6.12=h1ccaba5_0
105 | - torchaudio=0.12.1=py39_cu113
106 | - torchmetrics=0.9.3=pyhd8ed1ab_0
107 | - torchvision=0.13.1=py39_cu113
108 | - tqdm=4.64.1=pyhd8ed1ab_0
109 | - typing-extensions=4.3.0=py39h06a4308_0
110 | - typing_extensions=4.3.0=py39h06a4308_0
111 | - tzdata=2022c=h04d1e81_0
112 | - urllib3=1.26.11=py39h06a4308_0
113 | - werkzeug=2.2.2=pyhd8ed1ab_0
114 | - wheel=0.37.1=pyhd3eb1b0_0
115 | - xz=5.2.6=h5eee18b_0
116 | - yaml=0.2.5=h7f98852_2
117 | - yarl=1.7.2=py39hb9d737c_2
118 | - zipp=3.8.1=pyhd8ed1ab_0
119 | - zlib=1.2.12=h5eee18b_3
120 | - zstd=1.5.2=ha4553b6_0
121 | - pip:
122 | - adal==1.2.7
123 | - alembic==1.8.1
124 | - antlr4-python3-runtime==4.9.3
125 | - argcomplete==2.0.0
126 | - azure-common==1.1.28
127 | - azure-core==1.25.1
128 | - azure-graphrbac==0.61.1
129 | - azure-identity==1.11.0
130 | - azure-mgmt-authorization==2.0.0
131 | - azure-mgmt-containerregistry==10.0.0
132 | - azure-mgmt-core==1.3.2
133 | - azure-mgmt-keyvault==10.1.0
134 | - azure-mgmt-resource==21.1.0
135 | - azure-mgmt-storage==20.0.0
136 | - azure-storage-blob==12.13.0
137 | - azureml-core==1.45.0.post2
138 | - azureml-mlflow==1.45.0
139 | - backports-tempfile==1.0
140 | - backports-weakref==1.0.post1
141 | - bcrypt==4.0.0
142 | - black==22.8.0
143 | - cfgv==3.3.1
144 | - cloudpickle==2.2.0
145 | - commonmark==0.9.1
146 | - contextlib2==21.6.0
147 | - databricks-cli==0.17.3
148 | - distlib==0.3.6
149 | - docker==5.0.3
150 | - filelock==3.8.0
151 | - flake8==5.0.4
152 | - flask==2.2.2
153 | - gitdb==4.0.9
154 | - gitpython==3.1.27
155 | - greenlet==1.1.3
156 | - gunicorn==20.1.0
157 | - humanfriendly==10.0
158 | - identify==2.5.5
159 | - iniconfig==1.1.1
160 | - isodate==0.6.1
161 | - isort==5.10.1
162 | - itsdangerous==2.1.2
163 | - jeepney==0.8.0
164 | - jmespath==1.0.1
165 | - jsonpickle==2.2.0
166 | - knack==0.9.0
167 | - mako==1.2.3
168 | - mccabe==0.7.0
169 | - mlflow==1.29.0
170 | - mlflow-skinny==1.29.0
171 | - msal==1.19.0
172 | - msal-extensions==1.0.0
173 | - msrest==0.7.1
174 | - msrestazure==0.6.4
175 | - mypy-extensions==0.4.3
176 | - ndg-httpsclient==0.5.1
177 | - nodeenv==1.7.0
178 | - omegaconf==2.2.3
179 | - opencv-python==4.6.0.66
180 | - pandas==1.5.0
181 | - paramiko==2.11.0
182 | - pathspec==0.10.1
183 | - pkginfo==1.8.3
184 | - platformdirs==2.5.2
185 | - pluggy==1.0.0
186 | - portalocker==2.5.1
187 | - pre-commit==2.20.0
188 | - prometheus-flask-exporter==0.20.3
189 | - psutil==5.9.2
190 | - py==1.11.0
191 | - pycodestyle==2.9.1
192 | - pyflakes==2.5.0
193 | - pynacl==1.5.0
194 | - pytest==7.1.3
195 | - pytz==2022.2.1
196 | - querystring-parser==1.2.4
197 | - rich==12.5.1
198 | - scipy==1.9.1
199 | - secretstorage==3.3.3
200 | - sh==1.14.3
201 | - smmap==5.0.0
202 | - sqlalchemy==1.4.41
203 | - sqlparse==0.4.3
204 | - tabulate==0.8.10
205 | - toml==0.10.2
206 | - tomli==2.0.1
207 | - torch-tb-profiler==0.4.0
208 | - types-cryptography==3.3.23
209 | - virtualenv==20.16.5
210 | - websocket-client==1.4.1
211 |
--------------------------------------------------------------------------------
/docker/environment_cu116.yml:
--------------------------------------------------------------------------------
1 | name: ml_template_cu_116
2 | channels:
3 | - pytorch
4 | - conda-forge
5 | - defaults
6 | dependencies:
7 | - _libgcc_mutex=0.1=main
8 | - _openmp_mutex=5.1=1_gnu
9 | - absl-py=1.2.0=pyhd8ed1ab_0
10 | - aiohttp=3.8.1=py39hb9d737c_1
11 | - aiosignal=1.2.0=pyhd8ed1ab_0
12 | - async-timeout=4.0.2=pyhd8ed1ab_0
13 | - attrs=22.1.0=pyh71513ae_1
14 | - blas=1.0=mkl
15 | - blinker=1.4=py_1
16 | - brotlipy=0.7.0=py39hb9d737c_1004
17 | - bzip2=1.0.8=h7f98852_4
18 | - c-ares=1.18.1=h7f98852_0
19 | - ca-certificates=2022.9.24=ha878542_0
20 | - cachetools=5.2.0=pyhd8ed1ab_0
21 | - certifi=2022.9.24=pyhd8ed1ab_0
22 | - cffi=1.14.6=py39he32792d_0
23 | - charset-normalizer=2.1.1=pyhd8ed1ab_0
24 | - click=8.1.3=py39hf3d152e_0
25 | - colorama=0.4.5=pyhd8ed1ab_0
26 | - cryptography=37.0.2=py39hd97740a_0
27 | - cudatoolkit=11.6.0=hecad31d_10
28 | - ffmpeg=4.3=hf484d3e_0
29 | - freetype=2.10.4=h0708190_1
30 | - frozenlist=1.2.0=py39h7f8727e_0
31 | - fsspec=2022.8.2=pyhd8ed1ab_0
32 | - gmp=6.2.1=h58526e2_0
33 | - gnutls=3.6.13=h85f3911_1
34 | - google-auth=2.11.1=pyh1a96a4e_0
35 | - google-auth-oauthlib=0.4.6=pyhd8ed1ab_0
36 | - grpcio=1.42.0=py39hce63b2e_0
37 | - idna=3.4=pyhd8ed1ab_0
38 | - importlib-metadata=4.11.4=py39hf3d152e_0
39 | - intel-openmp=2021.4.0=h06a4308_3561
40 | - jpeg=9e=h166bdaf_1
41 | - lame=3.100=h7f98852_1001
42 | - lcms2=2.12=hddcbb42_0
43 | - ld_impl_linux-64=2.38=h1181459_1
44 | - libffi=3.3=he6710b0_2
45 | - libgcc-ng=11.2.0=h1234567_1
46 | - libgomp=11.2.0=h1234567_1
47 | - libiconv=1.17=h166bdaf_0
48 | - libpng=1.6.37=h21135ba_2
49 | - libprotobuf=3.15.8=h780b84a_1
50 | - libstdcxx-ng=11.2.0=h1234567_1
51 | - libtiff=4.2.0=hf544144_3
52 | - libwebp-base=1.2.2=h7f98852_1
53 | - lz4-c=1.9.3=h9c3ff4c_1
54 | - markdown=3.4.1=pyhd8ed1ab_0
55 | - markupsafe=2.1.1=py39hb9d737c_1
56 | - mkl=2021.4.0=h06a4308_640
57 | - mkl-service=2.4.0=py39h7e14d7c_0
58 | - mkl_fft=1.3.1=py39h0c7bc48_1
59 | - mkl_random=1.2.2=py39hde0f152_0
60 | - multidict=6.0.2=py39hb9d737c_1
61 | - ncurses=6.3=h5eee18b_3
62 | - nettle=3.6=he412f7d_0
63 | - numpy=1.23.1=py39h6c91a56_0
64 | - numpy-base=1.23.1=py39ha15fc14_0
65 | - oauthlib=3.2.1=pyhd8ed1ab_0
66 | - olefile=0.46=pyh9f0ad1d_1
67 | - openh264=2.1.1=h780b84a_0
68 | - openjpeg=2.4.0=hb52868f_1
69 | - openssl=1.1.1q=h7f8727e_0
70 | - packaging=21.3=pyhd8ed1ab_0
71 | - pillow=8.2.0=py39hf95b381_1
72 | - pip=22.1.2=py39h06a4308_0
73 | - pyasn1=0.4.8=py_0
74 | - pyasn1-modules=0.2.7=py_0
75 | - pycparser=2.21=pyhd8ed1ab_0
76 | - pydeprecate=0.3.2=pyhd8ed1ab_0
77 | - pyjwt=2.5.0=pyhd8ed1ab_0
78 | - pyopenssl=22.0.0=pyhd8ed1ab_1
79 | - pyparsing=3.0.9=pyhd8ed1ab_0
80 | - pysocks=1.7.1=pyha2e5f31_6
81 | - python=3.9.13=haa1d7c7_1
82 | - python_abi=3.9=2_cp39
83 | - pytorch=1.12.1=py3.9_cuda11.6_cudnn8.3.2_0
84 | - pytorch-lightning=1.7.7=pyhd8ed1ab_0
85 | - pytorch-mutex=1.0=cuda
86 | - pyu2f=0.1.5=pyhd8ed1ab_0
87 | - pyyaml=6.0=py39hb9d737c_4
88 | - readline=8.1.2=h7f8727e_1
89 | - requests=2.28.1=pyhd8ed1ab_1
90 | - requests-oauthlib=1.3.1=pyhd8ed1ab_0
91 | - rsa=4.9=pyhd8ed1ab_0
92 | - setuptools=63.4.1=py39h06a4308_0
93 | - six=1.16.0=pyh6c4a22f_0
94 | - sqlite=3.39.2=h5082296_0
95 | - tensorboard=2.10.1=pyhd8ed1ab_0
96 | - tensorboard-data-server=0.6.0=py39hd97740a_2
97 | - tensorboard-plugin-wit=1.8.1=pyhd8ed1ab_0
98 | - tk=8.6.12=h1ccaba5_0
99 | - torchaudio=0.12.1=py39_cu116
100 | - torchmetrics=0.9.3=pyhd8ed1ab_0
101 | - torchvision=0.13.1=py39_cu116
102 | - tqdm=4.64.1=pyhd8ed1ab_0
103 | - typing-extensions=4.3.0=hd8ed1ab_0
104 | - typing_extensions=4.3.0=pyha770c72_0
105 | - tzdata=2022c=h04d1e81_0
106 | - urllib3=1.26.11=pyhd8ed1ab_0
107 | - werkzeug=2.2.2=pyhd8ed1ab_0
108 | - wheel=0.37.1=pyhd3eb1b0_0
109 | - xz=5.2.6=h5eee18b_0
110 | - yaml=0.2.5=h7f98852_2
111 | - yarl=1.7.2=py39hb9d737c_2
112 | - zipp=3.8.1=pyhd8ed1ab_0
113 | - zlib=1.2.12=h5eee18b_3
114 | - zstd=1.5.0=ha95c52a_0
115 | - pip:
116 | - adal==1.2.7
117 | - alembic==1.8.1
118 | - antlr4-python3-runtime==4.9.3
119 | - argcomplete==2.0.0
120 | - azure-common==1.1.28
121 | - azure-core==1.25.1
122 | - azure-graphrbac==0.61.1
123 | - azure-identity==1.11.0
124 | - azure-mgmt-authorization==2.0.0
125 | - azure-mgmt-containerregistry==10.0.0
126 | - azure-mgmt-core==1.3.2
127 | - azure-mgmt-keyvault==10.1.0
128 | - azure-mgmt-resource==21.1.0
129 | - azure-mgmt-storage==20.0.0
130 | - azure-storage-blob==12.13.0
131 | - azureml-core==1.45.0.post2
132 | - azureml-mlflow==1.45.0
133 | - backports-tempfile==1.0
134 | - backports-weakref==1.0.post1
135 | - bcrypt==4.0.0
136 | - black==22.8.0
137 | - cfgv==3.3.1
138 | - cloudpickle==2.2.0
139 | - commonmark==0.9.1
140 | - contextlib2==21.6.0
141 | - contourpy==1.0.5
142 | - databricks-cli==0.17.3
143 | - distlib==0.3.6
144 | - docker==5.0.3
145 | - docstring-parser==0.15
146 | - filelock==3.8.0
147 | - flake8==5.0.4
148 | - flask==2.2.2
149 | - fonttools==4.37.3
150 | - gcsfs==2022.8.2
151 | - gitdb==4.0.9
152 | - gitpython==3.1.27
153 | - google-api-core==2.10.1
154 | - google-cloud-core==2.3.2
155 | - google-cloud-storage==2.5.0
156 | - google-crc32c==1.5.0
157 | - google-resumable-media==2.3.3
158 | - googleapis-common-protos==1.56.4
159 | - greenlet==1.1.3
160 | - gunicorn==20.1.0
161 | - humanfriendly==10.0
162 | - hydra-core==1.2.0
163 | - identify==2.5.5
164 | - iniconfig==1.1.1
165 | - isodate==0.6.1
166 | - isort==5.10.1
167 | - itsdangerous==2.1.2
168 | - jeepney==0.8.0
169 | - jmespath==1.0.1
170 | - jsonargparse==4.15.0
171 | - jsonpickle==2.2.0
172 | - kiwisolver==1.4.4
173 | - knack==0.9.0
174 | - mako==1.2.3
175 | - matplotlib==3.6.0
176 | - mccabe==0.7.0
177 | - mlflow==1.29.0
178 | - mlflow-skinny==1.29.0
179 | - msal==1.19.0
180 | - msal-extensions==1.0.0
181 | - msrest==0.7.1
182 | - msrestazure==0.6.4
183 | - mypy-extensions==0.4.3
184 | - ndg-httpsclient==0.5.1
185 | - nodeenv==1.7.0
186 | - omegaconf==2.2.3
187 | - opencv-python==4.6.0.66
188 | - pandas==1.5.0
189 | - paramiko==2.11.0
190 | - pathspec==0.10.1
191 | - pkginfo==1.8.3
192 | - platformdirs==2.5.2
193 | - pluggy==1.0.0
194 | - portalocker==2.5.1
195 | - pre-commit==2.20.0
196 | - prometheus-flask-exporter==0.20.3
197 | - protobuf==3.20.1
198 | - psutil==5.9.2
199 | - py==1.11.0
200 | - pycodestyle==2.9.1
201 | - pyflakes==2.5.0
202 | - pynacl==1.5.0
203 | - pytest==7.1.3
204 | - pytz==2022.2.1
205 | - querystring-parser==1.2.4
206 | - rich==12.5.1
207 | - scipy==1.9.1
208 | - secretstorage==3.3.3
209 | - sh==1.14.3
210 | - smmap==5.0.0
211 | - sqlalchemy==1.4.41
212 | - sqlparse==0.4.3
213 | - tabulate==0.8.10
214 | - toml==0.10.2
215 | - tomli==2.0.1
216 | - torch-tb-profiler==0.4.0
217 | - torchtext==0.13.1
218 | - types-cryptography==3.3.23
219 | - virtualenv==20.16.5
220 | - websocket-client==1.4.1
221 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = asmr_ml_template
3 | version = 0.0.1
4 | author = Example Author
5 | author_email = author@example.com
6 | description = A small example package
7 | long_description = file: README.md
8 | long_description_content_type = text/markdown
9 | url = https://github.com/pypa/sampleproject
10 | project_urls =
11 | Bug Tracker = https://github.com/pypa/sampleproject/issues
12 | classifiers =
13 | Programming Language :: Python :: 3
14 | License :: OSI Approved :: MIT License
15 | Operating System :: OS Independent
16 |
17 | [options]
18 | package_dir =
19 | = .
20 | packages = find:
21 | python_requires = >=3.6
22 |
23 | [options.packages.find]
24 | where = .
25 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup

# All package metadata and options live in setup.cfg;
# this stub exists only so `pip install -e .` / builds have an entry point.
setup()
4 |
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/auto-sys-ml-template/8884d8f3b826cab300cd8f0e227fd7ad6b887da8/src/__init__.py
--------------------------------------------------------------------------------
/src/datamodules/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/auto-sys-ml-template/8884d8f3b826cab300cd8f0e227fd7ad6b887da8/src/datamodules/__init__.py
--------------------------------------------------------------------------------
/src/datamodules/mnist_datamodule.py:
--------------------------------------------------------------------------------
1 | # credits: https://github.com/ashleve/lightning-hydra-template/tree/main/src/datamodules
2 | from typing import Dict, Optional, Tuple, Union
3 |
4 | import torch
5 | from pytorch_lightning import LightningDataModule
6 | from torch.utils.data import ConcatDataset, DataLoader, Dataset, random_split
7 | from torchvision.datasets import MNIST
8 | from torchvision.transforms import transforms
9 |
10 |
class MNISTDataModule(LightningDataModule):
    """LightningDataModule wrapping the MNIST dataset.

    Implements the standard DataModule hooks:
        - prepare_data: download (runs on a single process only)
        - setup: build the train/val/test splits (runs on every process)
        - train_dataloader / val_dataloader / test_dataloader

    This bundles download, split and transform logic so the dataset can be
    shared without callers knowing the details.

    Read the docs:
        https://pytorch-lightning.readthedocs.io/en/latest/extensions/datamodules.html
    """

    def __init__(
        self, file_params: Dict[str, Union[str, Tuple[int, int, int]]], train_params: Dict[str, Union[int, bool]]
    ):
        super().__init__()

        # expose init params via `self.hparams` (and persist them in checkpoints)
        self.save_hyperparameters(logger=False)

        # per-sample transforms: tensor conversion + MNIST mean/std normalization
        self.transforms = transforms.Compose(
            [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
        )

        # populated lazily in `setup`
        self.data_train: Optional[Dataset] = None
        self.data_val: Optional[Dataset] = None
        self.data_test: Optional[Dataset] = None

    @property
    def num_classes(self) -> int:
        """Number of target classes (digits 0-9)."""
        return 10

    def prepare_data(self):
        """Download both MNIST splits if needed.

        Called on a single process only — do not assign state here.
        """
        root = self.hparams.file_params["base_dir"]
        for is_train in (True, False):
            MNIST(root, train=is_train, download=True)

    def setup(self, stage: Optional[str] = None):
        """Assign `self.data_train`, `self.data_val`, `self.data_test`.

        Invoked by Lightning for both `trainer.fit()` and `trainer.test()`;
        the guard below keeps the random split from running twice.
        """
        if self.data_train or self.data_val or self.data_test:
            return

        root = self.hparams.file_params["base_dir"]
        full_dataset = ConcatDataset(
            datasets=[
                MNIST(root, train=True, transform=self.transforms),
                MNIST(root, train=False, transform=self.transforms),
            ]
        )
        # fixed generator seed keeps the split identical across processes/runs
        self.data_train, self.data_val, self.data_test = random_split(
            dataset=full_dataset,
            lengths=self.hparams.file_params["train_val_test_split"],
            generator=torch.Generator().manual_seed(42),
        )

    def _make_dataloader(self, dataset: Dataset, shuffle: bool) -> DataLoader:
        """Build a DataLoader with the shared batch/worker/pinning settings."""
        params = self.hparams.train_params
        return DataLoader(
            dataset=dataset,
            batch_size=params["batch_size"],
            num_workers=params["num_workers"],
            pin_memory=params["pin_memory"],
            shuffle=shuffle,
        )

    def train_dataloader(self):
        return self._make_dataloader(self.data_train, shuffle=True)

    def val_dataloader(self):
        return self._make_dataloader(self.data_val, shuffle=False)

    def test_dataloader(self):
        return self._make_dataloader(self.data_test, shuffle=False)
101 |
--------------------------------------------------------------------------------
/src/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/auto-sys-ml-template/8884d8f3b826cab300cd8f0e227fd7ad6b887da8/src/models/__init__.py
--------------------------------------------------------------------------------
/src/models/components/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/auto-sys-ml-template/8884d8f3b826cab300cd8f0e227fd7ad6b887da8/src/models/components/__init__.py
--------------------------------------------------------------------------------
/src/models/components/simple_dense_net.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 |
3 |
class SimpleDenseNet(nn.Module):
    """A small MLP classifier: three Linear+BatchNorm+ReLU stages and a head.

    Expects 4D image batches of shape (batch, channels, width, height),
    which are flattened to (batch, channels*width*height) before the MLP.
    """

    def __init__(
        self,
        input_size: int = 784,
        lin1_size: int = 256,
        lin2_size: int = 256,
        lin3_size: int = 256,
        output_size: int = 10,
    ):
        super().__init__()

        # hidden stack: Linear -> BatchNorm1d -> ReLU for each width transition
        widths = [input_size, lin1_size, lin2_size, lin3_size]
        layers = []
        for in_dim, out_dim in zip(widths[:-1], widths[1:]):
            layers += [nn.Linear(in_dim, out_dim), nn.BatchNorm1d(out_dim), nn.ReLU()]
        # classification head (no activation; pairs with CrossEntropyLoss)
        layers.append(nn.Linear(lin3_size, output_size))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        # (batch, channels, width, height) -> (batch, channels*width*height)
        batch_size, _, _, _ = x.size()
        return self.model(x.view(batch_size, -1))
35 |
--------------------------------------------------------------------------------
/src/models/mnist_module.py:
--------------------------------------------------------------------------------
1 | # credits: https://github.com/ashleve/lightning-hydra-template/blob/main/src/models/mnist_module.py
2 | from typing import Any, Dict, List, Union
3 |
4 | import torch
5 | from pytorch_lightning import LightningModule
6 | from torchmetrics import MaxMetric
7 | from torchmetrics.classification.accuracy import Accuracy
8 |
9 | from src.models.components.simple_dense_net import SimpleDenseNet
10 |
11 |
class MNISTLitModule(LightningModule):
    """Example of LightningModule for MNIST classification.

    A LightningModule organizes your PyTorch code into 5 sections:
        - Computations (init).
        - Train loop (training_step)
        - Validation loop (validation_step)
        - Test loop (test_step)
        - Optimizers (configure_optimizers)

    Read the docs:
        https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html
    """

    def __init__(
        self,
        mlp_config: Dict[str, dict],
        optimizer_config: Dict[str, Union[float, str, list]],
    ):
        """Build the network, loss and per-split metrics.

        Args:
            mlp_config: kwargs forwarded verbatim to ``SimpleDenseNet``.
            optimizer_config: must contain ``"lr"`` and ``"weight_decay"``
                (read in ``configure_optimizers``).
        """
        super().__init__()

        # this line allows to access init params with 'self.hparams' attribute
        # it also ensures init params will be stored in ckpt
        self.save_hyperparameters("mlp_config", "optimizer_config")

        self.net = SimpleDenseNet(**mlp_config)

        # loss function
        self.criterion = torch.nn.CrossEntropyLoss()

        # use separate metric instance for train, val and test step
        # to ensure a proper reduction over the epoch
        self.train_acc = Accuracy()
        self.val_acc = Accuracy()
        self.test_acc = Accuracy()

        # for logging best so far validation accuracy
        self.val_acc_best = MaxMetric()

    def forward(self, x: torch.Tensor):
        """Run a forward pass through the wrapped network, returning logits."""
        return self.net(x)

    def on_train_start(self):
        # by default lightning executes validation step sanity checks before training starts,
        # so we need to make sure val_acc_best doesn't store accuracy from these checks
        self.val_acc_best.reset()

    def step(self, batch: Any):
        """Shared forward/loss computation used by train, val and test steps.

        Returns:
            Tuple of (loss, predicted class indices, targets).
        """
        x, y = batch
        logits = self.forward(x)
        loss = self.criterion(logits, y)
        preds = torch.argmax(logits, dim=1)
        return loss, preds, y

    def training_step(self, batch: Any, batch_idx: int):
        loss, preds, targets = self.step(batch)

        # log train metrics (aggregated per epoch, not per step)
        acc = self.train_acc(preds, targets)
        self.log("train/loss", loss, on_step=False, on_epoch=True, prog_bar=False)
        self.log("train/acc", acc, on_step=False, on_epoch=True, prog_bar=True)

        # we can return here dict with any tensors
        # and then read it in some callback or in `training_epoch_end()` below
        # remember to always return loss from `training_step()` or else backpropagation will fail!
        return {"loss": loss, "preds": preds, "targets": targets}

    def training_epoch_end(self, outputs: List[Any]):
        # `outputs` is a list of dicts returned from `training_step()`
        # reset the metric so its state does not leak into the next epoch
        self.train_acc.reset()

    def validation_step(self, batch: Any, batch_idx: int):
        loss, preds, targets = self.step(batch)

        # log val metrics
        acc = self.val_acc(preds, targets)
        self.log("val/loss", loss, on_step=False, on_epoch=True, prog_bar=False)
        self.log("val/acc", acc, on_step=False, on_epoch=True, prog_bar=True)

        return {"loss": loss, "preds": preds, "targets": targets}

    def validation_epoch_end(self, outputs: List[Any]):
        acc = self.val_acc.compute()  # get val accuracy from current epoch
        # MaxMetric keeps the running maximum across epochs
        self.val_acc_best.update(acc)
        self.log("val/acc_best", self.val_acc_best.compute(), on_epoch=True, prog_bar=True)

        self.val_acc.reset()  # reset val accuracy for next epoch

    def test_step(self, batch: Any, batch_idx: int):
        loss, preds, targets = self.step(batch)

        # log test metrics
        acc = self.test_acc(preds, targets)
        self.log("test/loss", loss, on_step=False, on_epoch=True)
        self.log("test/acc", acc, on_step=False, on_epoch=True)

        return {"loss": loss, "preds": preds, "targets": targets}

    def test_epoch_end(self, outputs: List[Any]):
        # reset the metric so its state does not leak into a subsequent run
        self.test_acc.reset()

    def configure_optimizers(self):
        """Choose what optimizers and learning-rate schedulers to use in your optimization.
        Normally you'd need one. But in the case of GANs or similar you might have multiple.

        See examples here: https://pytorch-
        lightning.readthedocs.io/en/latest/common/lightning_module.html#configure-optimizers
        """
        return torch.optim.Adam(
            params=self.parameters(),
            lr=self.hparams.optimizer_config["lr"],
            weight_decay=self.hparams.optimizer_config["weight_decay"],
        )
125 |
--------------------------------------------------------------------------------
/src/train.py:
--------------------------------------------------------------------------------
1 | from omegaconf import OmegaConf
2 |
3 | from utils import pl_utils
4 |
5 |
def main(cfg):
    """Instantiate model, datamodule and trainer from `cfg`, then fit."""
    model = pl_utils.instantiate_class(cfg["model"])
    datamodule = pl_utils.instantiate_class(cfg["data"])
    trainer = pl_utils.instantiate_trainer(cfg)
    trainer.fit(model, datamodule)
12 |
13 |
if __name__ == "__main__":
    # CLI overrides are merged on top of the YAML file given via `base=...`
    cfg = OmegaConf.from_cli()

    if "base" not in cfg:
        raise SystemExit("Base configuration file not specified! Exiting.")

    base_cfg = OmegaConf.load(cfg.base)
    del cfg.base
    merged = OmegaConf.merge(base_cfg, cfg)
    resolved = OmegaConf.to_container(merged, resolve=True)
    print(OmegaConf.to_yaml(resolved))
    main(resolved)
26 |
--------------------------------------------------------------------------------
/src/utils/pl_utils.py:
--------------------------------------------------------------------------------
1 | from importlib import import_module
2 | from typing import Any, Dict, List
3 |
4 | import pytorch_lightning as pl
5 | from pytorch_lightning import Callback, Trainer
6 | from pytorch_lightning.loggers import LightningLoggerBase
7 |
8 |
def instantiate_class(init: Dict[str, Any]) -> Any:
    """Instantiate a class described by a config dict.

    Args:
        init: Config whose ``_target_`` key holds the fully qualified class
            name (e.g. ``"src.models.mnist_module.MNISTLitModule"``); every
            other key/value pair is passed to the constructor as a kwarg.

    Returns:
        The instantiated class object.

    Raises:
        KeyError: If ``init`` has no ``_target_`` key.
        ImportError: If the target module cannot be imported.
        AttributeError: If the class is not found in the target module.
    """
    # everything except the target spec becomes constructor kwargs
    kwargs = {k: v for k, v in init.items() if k != "_target_"}

    module_path, class_name = init["_target_"].rsplit(".", 1)
    # NOTE: import_module's `package` argument only anchors *relative*
    # imports; the original passed the class name there, which was wrong
    # (silently ignored for absolute paths like these).
    target_class = getattr(import_module(module_path), class_name)
    return target_class(**kwargs)
24 |
25 |
def instantiate_callbacks(callbacks_cfg: dict) -> List[Callback]:
    """Instantiate every callback entry found in the config.

    Entries that are not dicts carrying a ``_target_`` key are skipped.
    An empty/None config yields an empty list.
    """
    if not callbacks_cfg:
        return []

    if not isinstance(callbacks_cfg, dict):
        raise TypeError("Callbacks config must be a DictConfig!")

    return [
        instantiate_class(cb_conf)
        for cb_conf in callbacks_cfg.values()
        if isinstance(cb_conf, dict) and "_target_" in cb_conf
    ]
41 |
42 |
def instantiate_loggers(logger_cfg: dict) -> List[LightningLoggerBase]:
    """Instantiate every logger entry found in the config.

    Entries that are not dicts carrying a ``_target_`` key are skipped.
    An empty/None config yields an empty list.
    """
    if not logger_cfg:
        return []

    if not isinstance(logger_cfg, dict):
        raise TypeError("Logger config must be a Dict!")

    return [
        instantiate_class(lg_conf)
        for lg_conf in logger_cfg.values()
        if isinstance(lg_conf, dict) and "_target_" in lg_conf
    ]
58 |
59 |
def instantiate_trainer(cfg: dict):
    """Build a Lightning Trainer (plus callbacks/loggers) from the config."""
    seed = cfg.get("seed", None)
    if seed:
        pl.seed_everything(seed, workers=True)

    return Trainer(
        **cfg["trainer"],
        callbacks=instantiate_callbacks(cfg.get("callbacks")),
        logger=instantiate_loggers(cfg.get("logger")),
    )
69 |
--------------------------------------------------------------------------------
/src/utils/system_monitor.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import subprocess as sp
4 | import time
5 |
6 | import mlflow
7 | import psutil
8 |
9 |
def get_gpu_mem_info():
    """Query per-GPU utilization and memory via ``nvidia-smi``.

    Returns:
        A list of CSV strings, one per GPU, each of the form
        ``"<index>, <util %>, <mem used MiB>, <mem total MiB>"``
        (the CSV header row is stripped).

    Raises:
        RuntimeError: If ``nvidia-smi`` exits with a non-zero status.
    """
    command = "nvidia-smi --query-gpu=index,utilization.gpu,memory.used,memory.total --format=csv"

    # keep the try body minimal: only the subprocess call can raise here
    try:
        raw_output = sp.check_output(command.split(), stderr=sp.STDOUT)
    except sp.CalledProcessError as e:
        # chain the original error so the subprocess traceback is preserved
        raise RuntimeError(f"command '{e.cmd}' return with error (code {e.returncode}): {e.output}") from e

    # drop the header row ([1:]) and the trailing empty line ([:-1])
    return raw_output.decode("ascii").split("\n")[1:-1]
19 |
20 |
def get_dist_info():
    """Read torch-distributed rank information from the environment.

    Returns:
        Dict with int values for ``node_rank``, ``local_rank``,
        ``world_rank`` and ``world_size``. When WORLD_SIZE is unset the
        job is treated as single-GPU (all ranks 0, world size 1).
    """
    env_vars = {
        "node_rank": "NODE_RANK",
        "local_rank": "LOCAL_RANK",
        "world_rank": "RANK",
        "world_size": "WORLD_SIZE",
    }

    dist_info = {name: os.environ.get(var) for name, var in env_vars.items()}

    # Single GPU job
    if dist_info["world_size"] is None:
        dist_info.update(node_rank=0, world_rank=0, local_rank=0, world_size=1)

    return {name: int(value) for name, value in dist_info.items()}
40 |
41 |
def main(args):
    """Periodically log this node's CPU/RAM/GPU usage to MLflow.

    Runs forever (one sample every ``args.watch_every_n_seconds`` seconds);
    intended to be launched alongside training and terminated with it.
    Only the process with local_rank 0 reports, i.e. one reporter per node.

    Args:
        args: Namespace with ``watch_every_n_seconds`` (int).
    """
    # run on each node but only on one process corresponding to first gpu
    dist_info = get_dist_info()
    if dist_info["local_rank"] != 0:
        return

    node_rank = dist_info["node_rank"] + 1  # one-indexed for display
    prefix = f"monitor/node_{node_rank:02}"

    # pre-register every metric key with 0.0 so each MLflow step logs the
    # full set of keys even before the first real sample
    metrics = {
        f"{prefix}/ram_usage_percent": 0.0,
        f"{prefix}/ram_usage_GB": 0.0,
        f"{prefix}/cpu_usage_percent": 0.0,
        f"{prefix}/swap": 0.0,
    }
    for gpu_idx in range(1, len(get_gpu_mem_info()) + 1):
        metrics[f"{prefix}/gpu_{gpu_idx:02}/usage_percent"] = 0.0
        metrics[f"{prefix}/gpu_{gpu_idx:02}/mem_used_GB"] = 0.0
        metrics[f"{prefix}/gpu_{gpu_idx:02}/mem_used_percent"] = 0.0

    now = 0
    dt_sleep = args.watch_every_n_seconds

    while True:
        vmem = psutil.virtual_memory()
        metrics[f"{prefix}/ram_usage_GB"] = vmem.used / 2**30
        metrics[f"{prefix}/ram_usage_percent"] = vmem.percent
        metrics[f"{prefix}/cpu_usage_percent"] = psutil.cpu_percent()
        metrics[f"{prefix}/swap"] = psutil.swap_memory().percent

        for gpu_idx, gpu_info in enumerate(get_gpu_mem_info(), 1):
            _, gpu_percent, gpu_mem_used, gpu_mem_total = gpu_info.split(",")
            gpu_percent = float(gpu_percent.split("%")[0])
            gpu_mem_used = float(gpu_mem_used.split("MiB")[0])
            # use each GPU's own total memory (the original reused GPU 0's
            # total for every GPU, which is wrong on heterogeneous nodes)
            gpu_mem_total = float(gpu_mem_total.split("MiB")[0])

            metrics[f"{prefix}/gpu_{gpu_idx:02}/usage_percent"] = gpu_percent
            metrics[f"{prefix}/gpu_{gpu_idx:02}/mem_used_GB"] = gpu_mem_used / 1024.0
            metrics[f"{prefix}/gpu_{gpu_idx:02}/mem_used_percent"] = gpu_mem_used / gpu_mem_total * 100.0

        mlflow.log_metrics(metrics, step=now)

        time.sleep(dt_sleep)
        now += dt_sleep
94 |
95 |
def get_parsed_args(argv=None):
    """Parse command-line options for the system monitor.

    Args:
        argv: Optional list of argument strings; defaults to ``sys.argv[1:]``
            (the new parameter keeps the zero-argument call backward
            compatible while making the parser testable).

    Returns:
        argparse.Namespace with ``watch_every_n_seconds`` (int, default 5).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--watch_every_n_seconds", type=int, default=5)
    return parser.parse_args(argv)
101 |
102 |
if __name__ == "__main__":
    print("system_monitor.py begins")
    main(get_parsed_args())
    print("system_monitor.py done")
108 |
--------------------------------------------------------------------------------
/tests/helpers.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | import pytest
4 | import sh
5 |
6 |
def run_command(command: List[str]):
    """Run ``python <command...>`` and fail the calling test on non-zero exit.

    Args:
        command: Arguments handed to the ``python`` interpreter, e.g.
            ``["src/train.py", "base=configs/train.yaml"]``.
    """
    msg = None
    try:
        sh.python(command)
    except sh.ErrorReturnCode as e:
        msg = e.stderr.decode()
    if msg:
        # pass the message positionally: the `msg=` keyword is deprecated
        # in pytest 7 in favor of `reason=`; positional works in both
        pytest.fail(msg)
16 |
--------------------------------------------------------------------------------
/tests/test_dev_fast_run.py:
--------------------------------------------------------------------------------
1 | import pytest # nopycln: import
2 | from helpers import run_command
3 |
4 |
def test_fast_dev_run():
    """Smoke test: run exactly one train, val and test batch."""
    run_command(
        [
            "src/train.py",
            "base=configs/train.yaml",
            "trainer.fast_dev_run=true",
        ]
    )
13 |
14 |
15 | # cpu only test for CI
# cpu only test for CI
def test_fast_dev_run_cpu():
    """Smoke test on CPU: one train/val/test batch, no sync-batchnorm."""
    run_command(
        [
            "src/train.py",
            "base=configs/train.yaml",
            "trainer.fast_dev_run=true",
            "trainer.accelerator=cpu",
            "trainer.sync_batchnorm=false",
        ]
    )
26 |
--------------------------------------------------------------------------------