├── .devcontainer
│   └── devcontainer.json
├── .github
│   └── workflows
│       ├── pre-commit.yml
│       └── template-cleanup.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .vscode
│   ├── extensions.json
│   ├── launch.json
│   └── settings.json
├── CODE_OF_CONDUCT.md
├── License
├── README.md
├── README_template.md
├── SECURITY.md
├── assets
│   └── images
│       └── table-of-contents.png
├── azure-pipelines.yml
├── configs
│   └── train.yaml
├── docker
│   ├── Dockerfile_base_azureml
│   ├── Dockerfile_base_azureml_cu116
│   ├── Dockerfile_base_azureml_nightly
│   ├── Dockerfile_base_nvidia
│   ├── environment.yml
│   └── environment_cu116.yml
├── setup.cfg
├── setup.py
├── src
│   ├── __init__.py
│   ├── datamodules
│   │   ├── __init__.py
│   │   └── mnist_datamodule.py
│   ├── models
│   │   ├── __init__.py
│   │   ├── components
│   │   │   ├── __init__.py
│   │   │   └── simple_dense_net.py
│   │   └── mnist_module.py
│   ├── train.py
│   └── utils
│       ├── pl_utils.py
│       └── system_monitor.py
└── tests
    ├── helpers.py
    └── test_dev_fast_run.py

--------------------------------------------------------------------------------
/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
1 | // Remote devcontainer file that describes what container to build, how to build it,
2 | // and the extensions VS Code needs to enable the best remote development experience
3 | {
4 |   "name": "devcontainer",
5 |   "build": {
6 |     "context": "../docker",
7 |     // Uncomment the base dockerfile of your choice
8 |     "dockerfile": "../docker/Dockerfile_base_azureml",
9 |     // "dockerfile": "../docker/Dockerfile_base_nvidia",
10 |     "args": {
11 |       // Edit docker build args here as appropriate
12 |       // find latest BASE_IMAGE for Dockerfile_base_azureml at https://github.com/Azure/AzureML-Containers/tree/master/base/gpu
13 |       "BASE_IMAGE": "openmpi4.1.0-cuda11.3-cudnn8-ubuntu20.04:latest"
14 |       // find latest BASE_IMAGE for Dockerfile_base_nvidia at https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch/tags
15 |       // Uncomment the following for nvidia base image
16 |       // "BASE_IMAGE": "22.06-py3"
17 |     },
18 |   },
19 |   // Configure tool-specific properties.
20 |   "customizations": {
21 |     // Configure properties specific to VS Code.
22 |     "vscode": {
23 |       // Add the IDs of extensions you want installed when the container is created.
24 |       "extensions": [
25 |         "eamodio.gitlens",
26 |         "ms-python.python",
27 |         "ms-python.vscode-pylance",
28 |         "ms-azuretools.vscode-docker",
29 |         "ms-vscode-remote.remote-containers",
30 |         "ms-vscode-remote.remote-ssh",
31 |         "ms-vscode-remote.remote-ssh-edit",
32 |         "ms-vscode-remote.remote-wsl",
33 |         "ms-vscode-remote.vscode-remote-extensionpack",
34 |         "redhat.vscode-yaml",
35 |         "yzhang.markdown-all-in-one",
36 |         "TrungNgo.autoflake",
37 |         "Shan.code-settings-sync",
38 |         "njpwerner.autodocstring",
39 |         "jbockle.jbockle-format-files"
40 |       ]
41 |     }
42 |   },
43 |   // Docker run args
44 |   "runArgs": [
45 |     // Run with GPU support
46 |     "--privileged",
47 |     "--gpus",
48 |     "all",
49 |     // The next lines are needed if you will be using a ptrace-based debugger (e.g. for C++, Go, and Rust).
50 |     "--cap-add=SYS_PTRACE",
51 |     "--security-opt",
52 |     "seccomp=unconfined",
53 |     // Use Docker from inside the container. See https://aka.ms/vscode-remote/samples/docker-in-docker for details.
54 |     "-v",
55 |     "/var/run/docker.sock:/var/run/docker.sock"
56 |   ],
57 |   // Run the following command after the container has started and the workspace is mounted
58 |   "postStartCommand": "conda env config vars set -n base PYTHONPATH=${containerWorkspaceFolder} && git config --global --add safe.directory ${containerWorkspaceFolder}"
59 |   // Use 'forwardPorts' to make a list of ports inside the container available locally.
60 |   // "forwardPorts": [],
61 |   // Use 'postCreateCommand' to run commands after the container is created.
62 |   // "postCreateCommand": "python --version"
63 | }
64 |
--------------------------------------------------------------------------------
/.github/workflows/pre-commit.yml:
--------------------------------------------------------------------------------
1 | # ref: https://github.com/pre-commit-ci-demo/demo/blob/main/.github/workflows/pre-commit.yml
2 | name: pre-commit
3 |
4 | on:
5 |   push:
6 |     branches: [main]
7 |   pull_request:
8 |     branches: [main]
9 |
10 | jobs:
11 |   pre-commit:
12 |     runs-on: ubuntu-latest
13 |     steps:
14 |       - uses: actions/checkout@v2
15 |       - uses: actions/setup-python@v2
16 |       - uses: pre-commit/action@v2.0.0
--------------------------------------------------------------------------------
/.github/workflows/template-cleanup.yml:
--------------------------------------------------------------------------------
1 | # credits: https://github.com/JetBrains/intellij-platform-plugin-template/blob/main/.github/workflows/template-cleanup.yml
2 |
3 | # GitHub Actions workflow responsible for removing the Autonomous Research Systems ml_template's
4 | # template-specific files and configurations from a newly generated repository. This workflow is supposed
5 | # to be triggered automatically when a new template-based repository has been created.
6 |
7 | name: Template Cleanup
8 | on:
9 |   push:
10 |     branches:
11 |       - main
12 |
13 | jobs:
14 |   # Run the cleaning process only if the workflow is NOT triggered by the template repository itself.
15 |   template-cleanup:
16 |     name: Template Cleanup
17 |     runs-on: ubuntu-latest
18 |     if: github.event.repository.name != 'auto-sys-ml-template'
19 |     steps:
20 |       # Check out current repository
21 |       - name: Fetch Sources
22 |         uses: actions/checkout@v2.4.0
23 |
24 |       # Cleanup project
25 |       - name: Cleanup
26 |         run: |
27 |           rm -r assets/
28 |           mv README_template.md README.md
29 |           rm .github/workflows/template-cleanup.yml
30 |
31 |       # Commit modified files
32 |       - name: Commit files
33 |         run: |
34 |           git config --local user.email "ratneshmadaan@gmail.com"
35 |           git config --local user.name "madratman"
36 |           git add .
37 |           git commit -m "bla"
38 |           git reset $(git commit-tree HEAD^{tree} -m "microsoft/AutonomousSystemsResearchGroup: init ml template repo")
39 |
40 |       # Push changes
41 |       - name: Push changes
42 |         uses: ad-m/github-push-action@master
43 |         with:
44 |           branch: main
45 |           github_token: ${{ secrets.GITHUB_TOKEN }}
46 |           force: true
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # amulet
2 | .amltconfig
3 | amlt/
4 |
5 | # Byte-compiled / optimized / DLL files
6 | __pycache__/
7 | *.py[cod]
8 | *$py.class
9 |
10 | # Distribution / packaging
11 | build/
12 | dist/
13 | *.egg-info/
14 |
15 | # data
16 | data/
17 |
18 | # logs
19 | logs/
20 | outputs/
21 |
22 | # env
23 | .env
24 | .autoenv
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | default_language_version:
2 |   python: python3
3 |
4 | ci:
5 |   autofix_prs: true
6 |   autoupdate_commit_msg: "[pre-commit.ci] pre-commit suggestions"
7 |   autoupdate_schedule: quarterly
8 |
9 | repos:
10 |   - repo: https://github.com/pre-commit/pre-commit-hooks
11 |     rev: v4.4.0
12 |     hooks:
13 |       # list of supported hooks: https://pre-commit.com/hooks.html
14 |       - id: trailing-whitespace
15 |       - id: end-of-file-fixer
16 |       - id: check-yaml
17 |       - id: check-case-conflict
18 |       - id: debug-statements
19 |       - id: detect-private-key
20 |       - id: check-added-large-files
21 |         args: ["--maxkb=500", "--enforce-all"]
22 |         exclude: |
23 |           (?x)^(
24 |           )$
25 |
26 |   - repo: https://github.com/asottile/pyupgrade
27 |     rev: v3.3.1
28 |     hooks:
29 |       - id: pyupgrade
30 |         args: [--py37-plus]
31 |         name: Upgrade code
32 |
33 |   # python formatting
34 |   - repo: https://github.com/psf/black
35 |     rev: 23.1.0
36 |     hooks:
37 |       - id: black
38 |         name: Format code
39 |         args: ["--line-length=120"]
40 |
41 |   - repo: https://github.com/hadialqattan/pycln
42 |     rev: v2.1.3 # Possible releases: https://github.com/hadialqattan/pycln/releases
43 |     hooks:
44 |       - id: pycln
45 |         args: [--all]
46 |
47 |   # ref: https://github.com/microsoft/vscode-isort
48 |   - repo: https://github.com/pycqa/isort
49 |     rev: 5.12.0
50 |     hooks:
51 |       - id: isort
52 |         name: isort (python)
53 |         args: [--profile, "black"]
54 |
55 |   # python docstring formatting
56 |   - repo: https://github.com/myint/docformatter
57 |     rev: v1.5.1
58 |     hooks:
59 |       - id: docformatter
60 |         args: [--in-place, --wrap-summaries, "99", --wrap-descriptions, "92"]
61 |
62 |   # yaml formatting
63 |   - repo: https://github.com/pre-commit/mirrors-prettier
64 |     rev: v3.0.0-alpha.6
65 |     hooks:
66 |       - id: prettier
67 |         types: [yaml]
68 |
69 |   # markdown formatting
70 |   - repo: https://github.com/executablebooks/mdformat
71 |     rev: 0.7.16
72 |     hooks:
73 |       - id: mdformat
74 |         additional_dependencies:
75 |           - mdformat-gfm
76 |           #- mdformat-black
77 |           - mdformat_frontmatter
78 |         exclude: CHANGELOG.md
--------------------------------------------------------------------------------
/.vscode/extensions.json:
--------------------------------------------------------------------------------
1 | {
2 |   // See https://go.microsoft.com/fwlink/?LinkId=827846 to learn about workspace recommendations.
3 |   // Extension identifier format: ${publisher}.${name}. Example: vscode.csharp
4 |   // List of extensions which should be recommended for users of this workspace.
5 | "recommendations": [ 6 | "eamodio.gitlens", 7 | "ms-python.python", 8 | "ms-python.vscode-pylance", 9 | "ms-azuretools.vscode-docker", 10 | "ms-vscode-remote.remote-containers", 11 | "ms-vscode-remote.remote-ssh", 12 | "ms-vscode-remote.remote-ssh-edit", 13 | "ms-vscode-remote.remote-wsl", 14 | "ms-vscode-remote.vscode-remote-extensionpack", 15 | "redhat.vscode-yaml", 16 | "yzhang.markdown-all-in-one", 17 | "TrungNgo.autoflake", 18 | "Shan.code-settings-sync", 19 | "njpwerner.autodocstring", 20 | "jbockle.jbockle-format-files" 21 | ], 22 | // List of extensions recommended by VS Code that should not be recommended for users of this workspace. 23 | "unwantedRecommendations": [] 24 | } 25 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "Python: Current File", 9 | "type": "python", 10 | "request": "launch", 11 | "program": "${file}", 12 | "console": "integratedTerminal", 13 | "justMyCode": true 14 | }, 15 | { 16 | "name": "train.py", 17 | "type": "python", 18 | "request": "launch", 19 | "program": "src/train.py", 20 | "console": "integratedTerminal", 21 | "justMyCode": true, 22 | "args": [ 23 | "base=configs/train.yaml", 24 | "trainer.num_nodes=1", 25 | "trainer.devices=1", 26 | "data.train_params.batch_size=256", 27 | "model.optimizer_config.lr=1e-3" 28 | ] 29 | } 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "editor.defaultFormatter": "ms-python.black-formatter", 3 | "editor.formatOnPaste": true, 4 | "editor.formatOnSave": true, 5 | "editor.codeActionsOnSave": { 6 | "source.organizeImports": true 7 | }, 8 | "python.analysis.typeCheckingMode": "basic", 9 | "python.formatting.provider": "black", 10 | "python.formatting.blackArgs": [ 11 | "--line-length", 12 | "120" 13 | ], 14 | "python.linting.enabled": true, 15 | "python.linting.pylintEnabled": false, 16 | "python.linting.flake8Enabled": true, 17 | "python.linting.flake8Args": [ 18 | "--max-line-length=120", 19 | ], 20 | "python.testing.pytestArgs": [ 21 | "tests" 22 | ], 23 | "python.testing.unittestEnabled": false, 24 | "python.testing.pytestEnabled": true, 25 | "isort.args": [ 26 | "--profile", 27 | "black" 28 | ], 29 | } 30 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 
4 |
5 | Resources:
6 |
7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
--------------------------------------------------------------------------------
/License:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) Microsoft Corporation.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Autonomous Systems Research Group: ML Template
2 |
3 | This repository serves both as an onboarding document and as a template to quickstart machine learning experimentation at the [Autonomous Systems Research Group at Microsoft](https://www.microsoft.com/en-us/research/group/autonomous-systems-group-robotics/).
4 |
5 | **Note** Use the table of contents icon in the top-left corner of this document to get to a specific section quickly.
6 |
7 | ### Using this template to generate a repository:
8 |
9 | - Click the green **Use this template** button at the top right, and name your new repository.
10 | - You can clone your repo once it looks like [example_repo_generated_from_ml_template](https://github.com/AutonomousSystemsResearch/example_repo_generated_from_ml_template).
11 |
12 | > **Note** that after you create a repository from the template, it will take about **20 seconds** for an automated GitHub Action to clean up the generated repository using an auto-commit. Please ensure your repository looks like [example_repo_generated_from_ml_template](https://github.com/AutonomousSystemsResearch/example_repo_generated_from_ml_template) before cloning it.
13 |
14 | ## Introduction
15 |
16 | For the template repository, we will use:
17 |
18 | - [Pytorch Lightning](https://pytorch-lightning.readthedocs.io/en/stable/)
19 |   - For minimizing boilerplate code
20 | - [OmegaConf](https://omegaconf.readthedocs.io/)
21 |   - For config management
22 |   - Please go through [OmegaConf's github readme](https://github.com/omry/omegaconf#releases) for tutorials.
23 |   > **Note**: we have an [archived branch called `hydra`](https://github.com/AutonomousSystemsResearch/ml_template/tree/hydra) which uses [hydra](https://hydra.cc/) for config management.
24 | - Logging
25 |   - We primarily use tensorboard. Amulet automatically patches tensorboard scalars to MLFlow for viewing metrics in Azure ML Studio.
26 | - Conda and Docker
27 |   - For development
28 |
29 | ## Using this repository
30 |
31 | ### **Running locally**
32 |
33 | #### Setup
34 |
35 | - **VSCode**
36 |
37 |   - Extensions:
38 |
39 |     - Hit `Ctrl+Shift+P`, type `Show Recommended Extensions`, and install them from the sidebar.
40 |       Or click "yes" when you get a VS Code pop-up to install the recommended extensions, which are specified in [.vscode/extensions.json](.vscode/extensions.json).
41 |       Follow [this doc](https://code.visualstudio.com/docs/editor/extension-marketplace#_recommended-extensions) for more details.
42 |     - `Python`, `Pylance`, `Docker`, `GitLens`, `YAML`, and the [Remote development extension pack](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.vscode-remote-extensionpack) are strongly recommended.
43 |
44 |   - Debugging:
45 |
46 |     - Please follow [VSCode docs and tutorials](https://code.visualstudio.com/docs/python/debugging) on Python debugging.
47 |     - A minimal debugging configuration has been provided in [.vscode/launch.json](.vscode/launch.json). Please see VSCode docs on [launch.json configs](https://code.visualstudio.com/docs/python/debugging#_additional-configurations) and [config options](https://code.visualstudio.com/docs/python/debugging#_set-configuration-options).
48 |
49 | - **Conda**
50 |
51 |   - Recommended for local development and debugging.
52 |   - Note: For CUDA 11.6, see `Creating the conda environment from scratch (click to expand)` below.
53 |
54 |   ```
55 |   # create env
56 |   conda env create --file docker/environment.yml
57 |
58 |   # activate it
59 |   conda activate ml_template
60 |
61 |   # install this repo
62 |   (ml_template) $ pip install -e .
63 |
64 |   # install pre-commit (recommended). Scroll down to the #Developing section for details.
65 |   (ml_template) $ pre-commit install
66 |   ```
67 |
68 |   > **Note** If you install additional packages in your environment manually, you should update the `environment.yml` accordingly by running `$ conda env export | grep -v "^prefix: " > docker/environment.yml`.
69 |
70 | <details>
71 | 72 | Creating the conda environment from scratch (click to expand) 73 | 74 | 75 | ``` 76 | conda update -n base -c defaults conda 77 | conda create --name ml_template python=3.9 78 | conda activate ml_template 79 | conda install pip 80 | conda install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch 81 | conda install pytorch-lightning -c conda-forge 82 | pip install omegaconf \ 83 | pytest \ 84 | sh \ 85 | pre-commit \ 86 | mlflow \ 87 | azureml-mlflow \ 88 | azureml-core \ 89 | torch_tb_profiler \ 90 | opencv-python \ 91 | black isort flake8 \ 92 | psutil \ 93 | rich 94 | conda env export | grep -v "^prefix: " > docker/environment.yml 95 | pre-commit install 96 | pre-commit run --all-files 97 | pip install -e . 98 | ``` 99 | 100 | For CUDA 11.6: 101 | 102 | ``` 103 | conda update -n base -c defaults conda 104 | conda create --name ml_template_cu116 python=3.9 105 | conda activate ml_template_cu116 106 | conda install pip 107 | conda install pytorch torchvision torchaudio cudatoolkit=11.6 -c pytorch -c conda-forge 108 | pip install pytorch-lightning 109 | pip install omegaconf \ 110 | pytest \ 111 | sh \ 112 | pre-commit \ 113 | mlflow \ 114 | azureml-mlflow \ 115 | azureml-core \ 116 | torch_tb_profiler \ 117 | opencv-python \ 118 | black isort flake8 \ 119 | psutil \ 120 | rich 121 | conda env export | grep -v "^prefix: " > docker/environment_cu116.yml 122 | pre-commit install 123 | pre-commit run --all-files 124 | pip install -e . 125 | ``` 126 | 127 |
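Whichever route you take, a quick sanity check (suggested commands, not part of the template) confirms that the GPU build of pytorch is active in the new environment:

```
python -c "import torch; print(torch.__version__, torch.version.cuda, torch.cuda.is_available())"
python -c "import pytorch_lightning as pl; print(pl.__version__)"
```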
128 | 129 |
130 | 131 | Upgrading pytorch and cudatoolkit (click to expand) 132 | 133 | 134 | ``` 135 | conda remove pytorch torchvision torchaudio cudatoolkit 136 | # then follow pytorch installation steps, for example: 137 | conda install pytorch torchvision torchaudio cudatoolkit=11.6 -c pytorch -c conda-forge 138 | # then update pytorch lightning: 139 | pip install pytorch-lightning --upgrade 140 | pip install pytorch-lightning[extra] --upgrade 141 | pip install -U jsonargparse[signatures] --upgrade 142 | ``` 143 | 144 |
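After upgrading, remember to re-export the environment so that the docker images built from it stay in sync (same command as in the note above):

```
conda env export | grep -v "^prefix: " > docker/environment.yml
```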
145 |
146 | - **Docker**
147 |
148 |   - While submitting jobs to AzureML, we take our local conda environment and overlay it on an appropriate docker base image. For a new project / a custom conda environment, you can build the docker image locally as explained in a note later in this section. Optionally, the docker image building can be automated by CI (as explained later) if your project has a frequently updated conda environment.
149 |
150 |   - For `ml_template`, we have [three docker images](docker/) built automatically on each commit to the `main` branch or a branch corresponding to a Pull Request.
151 |     Docker images are pushed to the [PRIVATEAZURECONTAINERREGISTRYNAME](https://ms.portal.azure.com/#@microsoft.onmicrosoft.com/resource/subscriptions/964a24a8-8835-43c1-9633-7d78841facf1/resourceGroups/research_team/providers/Microsoft.ContainerRegistry/registries/PRIVATEAZURECONTAINERREGISTRYNAME/repository) container registry under [ml_template](https://ms.portal.azure.com/#view/Microsoft_Azure_ContainerRegistries/RepositoryBlade/id/%2Fsubscriptions%2F964a24a8-8835-43c1-9633-7d78841facf1%2FresourceGroups%2Fresearch_team%2Fproviders%2FMicrosoft.ContainerRegistry%2Fregistries%25PRIVATEAZURECONTAINERREGISTRYNAME/repository/ml_template).
152 |     To automate this for your repository generated from this template, please create an Azure Pipeline for it based on [azure-pipelines.yml](azure-pipelines.yml).
153 |
154 |   - The following tags correspond to the *latest commit on the main branch*.
155 |
156 |     | Tag | Dockerfile | docker pull command | Base Image |
157 |     | :-: | :-: | :-: | :-: |
158 |     | `latest` or `latest-azureml` | [azureml](docker/Dockerfile_base_azureml) | `docker pull PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest` | [mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.3-cudnn8-ubuntu20.04:latest](https://github.com/Azure/AzureML-Containers/tree/master/base/gpu/openmpi4.1.0-cuda11.3-cudnn8-ubuntu20.04) |
159 |     | `latest-nightly` or `latest-azureml-nightly` | [azureml_nightly](docker/Dockerfile_base_azureml_nightly) | `docker pull PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest-nightly` | [mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.3-cudnn8-ubuntu20.04:latest](https://github.com/Azure/AzureML-Containers/tree/master/base/gpu/openmpi4.1.0-cuda11.3-cudnn8-ubuntu20.04) |
160 |     | `latest-nvidia` | [nvidia](docker/Dockerfile_base_nvidia) | `docker pull PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest-nvidia` | [nvcr.io/nvidia/pytorch:22.06-py3](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html) |
161 |
162 |   - Building docker images and running docker containers locally can be useful to reproduce, on your local machine, issues which might occur while submitting to AzureML. Please peruse public documentation on docker + vscode.
163 |
164 |   ```
165 |   # pull image with [azureml image](https://hub.docker.com/_/microsoft-azureml?tab=description) as base with docker/environment.yml on top
166 |   docker pull PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest
167 |
168 |   # (optional) pull image with nvidia pytorch image as base
169 |   docker pull PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest-nvidia (for nvidia pytorch base image. See the note below for more details.)
170 |
171 |   # run image
172 |   docker run -it --gpus=all -v <local_dir>:<container_dir> PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest
173 |
174 |   # (optional but recommended) give your container a name
175 |   docker run -it --rm --name=MYFANCYCONTAINERNAME --gpus=all -v <local_dir>:<container_dir> PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest
176 |
177 |   # setup the repo (run inside the container)
178 |   pip install -e .
179 |
180 |   # install pre-commit (recommended). Scroll down to the "Developing" section for details.
181 |   pre-commit install
182 |   ```
183 |
184 |
<details>
185 |
186 | <summary>More details on docker image tags for Pull Request and main branch builds (click to expand)</summary>
187 |
188 | Similar to the `main` branch, for each pull request, we have:
189 |
190 | - `PR-<#pr_number>-latest` aka `PR-<#pr_number>-latest-azureml`
191 | - `PR-<#pr_number>-latest-nightly` aka `PR-<#pr_number>-latest-azureml-nightly`
192 | - `PR-<#pr_number>-latest-nvidia`
193 |
194 | And finally, for both `main` and PR branches, we have tags corresponding to git commit hashes:
195 |
196 | - `main-<short_git_hash>-azureml` and `PR-<#pr_number>-<short_git_hash>-azureml`
197 | - `main-<short_git_hash>-azureml-nightly` and `PR-<#pr_number>-<short_git_hash>-azureml-nightly`
198 | - `main-<short_git_hash>-nvidia` and `PR-<#pr_number>-<short_git_hash>-nvidia`
199 |
200 | For example:
201 |
202 | - `main-7fadad2b-azureml`, `main-7fadad2b-azureml-nightly`, `main-7fadad2b-nvidia`: correspond to [commit 7fadad2b](https://github.com/AutonomousSystemsResearch/ml_template/commit/7fadad2b1391cdbbc46422a6865caaf0300b9af8) on the `main` branch with our three different dockerfiles
203 | - `PR-50-latest-azureml`, `PR-50-latest-azureml-nightly`, `PR-50-latest-nvidia`: correspond to the latest commit on [PR#50](https://github.com/AutonomousSystemsResearch/ml_template/pull/50) with our three different dockerfiles
204 | - `PR-50-eef3b90-azureml`, `PR-50-eef3b90-azureml-nightly`, `PR-50-eef3b90-nvidia`: correspond to [commit eef3b90](https://github.com/AutonomousSystemsResearch/ml_template/pull/50/commits/eef3b900fc956614c7d45eac6fa9245b57f7bd72) on [PR#50](https://github.com/AutonomousSystemsResearch/ml_template/pull/50) with our three different dockerfiles
205 |
206 |
</details>
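For example, to smoke-test the image built for a pull request locally, pull it by one of the PR tags listed above (`PR-50` here; the registry name is the placeholder used throughout this README):

```
docker pull PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:PR-50-latest-azureml
docker run -it --rm --gpus=all PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:PR-50-latest-azureml
```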
207 |
208 | <summary>Building and understanding our Dockerfiles (click to expand)</summary>
209 |
210 |
211 |
212 | - We have four docker files:
213 |
214 |   - azureml base:
215 |     - [docker/Dockerfile_base_azureml](docker/Dockerfile_base_azureml)
216 |     - [docker/Dockerfile_base_azureml_cu116](docker/Dockerfile_base_azureml_cu116)
217 |     - [docker/Dockerfile_base_azureml_nightly](docker/Dockerfile_base_azureml_nightly)
218 |   - nvidia pytorch base:
219 |     - [docker/Dockerfile_base_nvidia](docker/Dockerfile_base_nvidia)
220 |
221 | - All of the azureml base images grab a base image from [here](https://github.com/Azure/AzureML-Containers/tree/master/base/gpu), and put the user's conda environment ([docker/environment.yml](docker/environment.yml), or [docker/environment_cu116.yml](docker/environment_cu116.yml) for the CUDA 11.6 variant) on top of the base image.
222 |
223 |   - In the `latest-azureml` version, packages in your local conda environment should match the docker image exactly.
224 |
225 |   - In the `latest-azureml-nightly` image, pytorch (including cudatoolkit) and pytorch lightning are updated to the nightly versions.
226 |
227 | - The nvidia pytorch base image grabs a base image from [here](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch/tags) ([here](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html) for details), which already has the latest version of pytorch.
228 |   Instead of using the user's conda environment, this dockerfile uses `pip` to install pytorch lightning and other dependencies on top of the base image. So this image can have different versions of packages compared to your conda environment.
229 |
230 | All docker images accept a build argument to update the base image version easily:
231 |
232 | - azureml images:
233 |   - take the base azureml image name's suffix **and** tag; see available options [here](https://github.com/Azure/AzureML-Containers/tree/master/base/gpu):
234 |     - examples: `openmpi4.1.0-cuda11.3-cudnn8-ubuntu20.04:latest`, `openmpi4.1.0-cuda11.6-cudnn8-ubuntu20.04:latest`, and so on.
235 | - nvidia pytorch image:
236 |   - takes the base nvidia image name's tag only.
237 |   - see [available tags here](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch/tags) and [the release notes for their contents](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html)
238 |   - examples: `22.06-py3`, `22.05-py3`, and so on.
239 |
240 | Please review the arguments in the dockerfiles carefully. These can also be seen by reading through [azure-pipelines.yml](azure-pipelines.yml).
241 |
242 | Building the azure-ml base + conda env images locally:
243 |
244 | ```
245 | cd docker;
246 |
247 | docker build \
248 |     -f Dockerfile_base_azureml \
249 |     --build-arg BASE_IMAGE=openmpi4.1.0-cuda11.3-cudnn8-ubuntu20.04:latest \
250 |     -t PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest-azureml .
251 |
252 | # note that in the PRIVATEAZURECONTAINERREGISTRYNAME acr, latest is equivalent to the latest-azureml tag. So, we can just re-tag the image:
253 | docker tag PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest-azureml PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest
254 | ```
255 |
256 | For the CUDA 11.6 version:
257 |
258 | ```
259 | cd docker;
260 |
261 | docker build \
262 |     -f Dockerfile_base_azureml_cu116 \
263 |     --build-arg BASE_IMAGE=openmpi4.1.0-cuda11.6-cudnn8-ubuntu20.04:latest \
264 |     -t PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest-azureml-cu116 .
265 |
266 | # note that in the PRIVATEAZURECONTAINERREGISTRYNAME acr, latest is equivalent to the latest-azureml tag. So, we can just re-tag the image:
267 | docker tag PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest-azureml-cu116 PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest-cu116
268 | ```
269 |
270 | Building the nvidia-pytorch image locally:
271 |
272 | ```
273 | # building the nvidia-pytorch image locally
274 | cd docker;
275 |
276 | docker build \
277 |     -f Dockerfile_base_nvidia \
278 |     --build-arg BASE_IMAGE=22.06-py3 \
279 |     -t PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest-nvidia .
280 | ```
281 |

</details>
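If you also want to push a locally built image to the registry (this assumes you have push permissions on the ACR; `az acr login` is the standard Azure CLI login step):

```
az acr login --name PRIVATEAZURECONTAINERREGISTRYNAME
docker push PRIVATEAZURECONTAINERREGISTRYNAME.azurecr.io/ml_template:latest-azureml
```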
282 | 283 |
284 |
285 | <summary>Developing inside docker containers with VSCode: (click to expand)</summary>
286 |
287 |
288 | - [Attach to a docker container](https://code.visualstudio.com/docs/remote/attach-container)
289 |
290 | - [Devcontainer](https://code.visualstudio.com/docs/remote/containers)
291 |
292 |   > **Note**: This method can be used on an Azure VM or locally with no change, and uses docker.
293 |
294 |   Follow the steps below:
295 |
296 |   - Connect to your remote Azure VM using VS Code
297 |   - Open the workspace within a docker container for development, either using the VS Code popup, or by searching for `(Re)Build and (Re)open in container` in the command palette (hit `Ctrl+Shift+P` to open the command palette)
298 |   - After setup is complete, it is time to set up the repository:
299 |     ```
300 |     pip install -e .
301 |     pre-commit install
302 |     ```
303 |   - > **Note**: By default, the devcontainer uses the [azureml-conda base image](docker/Dockerfile_base_azureml). We can also use the [nvidia base image](docker/Dockerfile_base_nvidia) by modifying the `dockerfile` line in [devcontainer.json](.devcontainer/devcontainer.json). Similarly, we can edit the docker build args there.
304 |
305 |

</details>
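Before running the MNIST example below, it may help to see the config plumbing in isolation. Here is a minimal sketch of the OmegaConf pattern that `src/train.py` presumably follows (the `base=` key and the `trainer.*` dotlist overrides match the commands in the next section; the actual parsing in `train.py` may differ):

```python
from omegaconf import OmegaConf

# parse dotlist args from sys.argv, e.g. `base=configs/train.yaml trainer.devices=4`
cli = OmegaConf.from_cli()

# load the base yaml, then merge the CLI overrides on top (CLI wins)
base = OmegaConf.load(cli.pop("base", "configs/train.yaml"))
cfg = OmegaConf.merge(base, cli)

print(cfg.trainer.devices)     # the override value, not the yaml default
print(OmegaConf.to_yaml(cfg))  # fully merged config, handy for logging
```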
306 |
307 | #### Running MNIST example
308 |
309 | - Understanding OmegaConf and config files
310 |
311 |   - Please review OmegaConf's [github readme](https://github.com/omry/omegaconf#releases) for [their documentation](https://omegaconf.readthedocs.io/en/2.2_branch/), [slides (for ver 2.1)](https://docs.google.com/presentation/d/e/2PACX-1vT_UIV7hCnquIbLUm4NnkUpXvPEh33IKiUEvPRF850WKA8opOlZOszjKdZ3tPmf8u7hGNP6HpqS-NT5/pub?start=false&loop=false&delayms=3000&slide=id.p), and a [live tutorial](https://github.com/omry/omegaconf#live-tutorial).
312 |
313 | - Single GPU
314 |
315 |   ```
316 |   python src/train.py base=configs/train.yaml trainer.num_nodes=1 trainer.devices=1
317 |   ```
318 |
319 | - Multiple GPUs
320 |
321 |   ```
322 |   python src/train.py base=configs/train.yaml trainer.num_nodes=1 trainer.devices=4
323 |   ```
324 |
325 | ### Running on Azure
326 |
327 | Note: This section relies on internal tools for job submission to Azure ML workspaces and is not supported publicly at the time of writing. However, one may peruse existing public documentation on Azure ML.
328 |
329 | ### Developing
330 |
331 | #### Tests
332 |
333 | The template has some basic tests in the `tests/` directory. To run them:
334 |
335 | ```
336 | # run all tests
337 | pytest
338 |
339 | # run single test
340 | pytest tests/test_dev_fast_run.py
341 | ```
342 |
343 | List of tests implemented:
344 |
345 | - [fast_dev_run](https://pytorch-lightning.readthedocs.io/en/stable/common/debugging.html#fast-dev-run): a simple check that runs your trainer on a single batch of the train, validation, and test datasets.
346 |   It is also useful for quickly checking that your code works while adding new features:
347 |   ```
348 |   python src/train.py base=configs/train.yaml trainer.fast_dev_run=True
349 |   ```
350 |
351 | #### Code formatting and Linting
352 |
353 | We use:
354 |
355 | - [black](https://black.readthedocs.io/en/stable/) for code formatting
356 |
357 | - [isort](https://pycqa.github.io/isort/) for import ordering
358 |
359 | - [pycln](https://hadialqattan.github.io/pycln/#/) for removing unused imports
360 |
361 | - Running locally:
362 |
363 |   ```
364 |   $ cd ml_template;
365 |   $ black .
366 |   $ isort .
367 |   $ pycln --all .
368 |   ```
369 |
370 | #### Pre-commit Hooks: Automating Code formatting and Linting
371 |
372 | [pre-commit](https://pre-commit.com/) hooks automate black autoformatting and ensure PEP8 compliance.
373 |
374 | - Setting up:
375 |
376 |   ```
377 |   $ cd ml_template;
378 |   $ pre-commit install
379 |   ```
380 |
381 | - Running:
382 |
383 |   After the above step, `pre-commit` will run **automatically** when you `git commit`.
384 |   If the run fails with errors in red, you can check the edits made by `pre-commit` with `git diff`.
385 |   If the changes look good, (1) `git add` those files again, and then (2) run `git commit` again.
386 |
387 |   Optionally, you can also run pre-commit manually by:
388 |
389 |   ```
390 |   $ pre-commit run --all-files
391 |   ```
392 |
393 | - Updating hooks:
394 |   Use the `autoupdate` command to keep the versions of the formatters in `.pre-commit-config.yaml` up to date.
395 |
396 |   ```
397 |   $ pre-commit autoupdate
398 |   ```
399 |
400 | ### Continuous Integration
401 |
402 | - **Github Actions**
403 |
404 |   - [Pre-commit checks](.github/workflows/pre-commit.yml)
405 |   - [Template cleanup](.github/workflows/template-cleanup.yml):
406 |     When a new repository is generated using this template, this action replaces `README.md` with `README_template.md` to keep microsoft links internal.
407 |
408 | - **Azure Pipelines**
409 |
410 |   - Create an Azure DevOps pipeline for your repository.
411 |     This automates the building of your docker images, and also runs pytest on them.
412 |
413 |   - The Azure Pipelines logs can be seen on the Azure DevOps webpage, but not directly in the GitHub UI.
414 |
415 |     Pull Request example:
416 |
417 |     - You can click `View more details on Azure Pipelines` under the `Checks` section of a GitHub PR.
418 |     - See [PR#6/checks](https://github.com/AutonomousSystemsResearch/ml_template/pull/6/checks) for an example.
419 |
420 |
421 |
422 |   - [Docker Build and Push Image](azure-pipelines.yml)
423 |
424 |     See the jobs under the stage `BuildDockerImagesAndRunPytest` in [azure-pipelines.yml](azure-pipelines.yml). They build the images from the Dockerfiles in [docker/](docker/) and push them to a private azure container registry.
425 |
426 |     See the docker section under [#running-locally](#running-locally) for details.
427 |
428 | ### Contributing
429 |
430 | - conda `environment.yml` update:
431 |
432 |   If you install packages in conda, update `docker/environment.yml` via `conda env export | grep -v "^prefix: " > docker/environment.yml`, and send a PR.
433 |
434 | ## Reference Repositories
435 |
436 | - Pytorch Lightning:
437 |
438 |   - Pytorch vs Pytorch Lightning
439 |
440 |     - [PyTorch Lightning for Dummies - A Tutorial and Overview](https://www.assemblyai.com/blog/pytorch-lightning-for-dummies/)
441 |     - [PyTorch Lightning: DataModules, Callbacks, TPU, and Loggers](https://dev.to/krypticmouse/pytorch-lightning-datamodules-callbacks-tpu-and-loggers-4nhb)
442 |
443 |   - Template / reference repositories
444 |
445 |     - https://github.com/ashleve/lightning-hydra-template
446 |     - https://github.com/lkhphuc/lightning-hydra-template
447 |     - [Pytorch lightning bolts](https://lightning-bolts.readthedocs.io/en/latest/)
448 |       - Look inside the code for datamodules, datasets, models, etc: https://github.com/PyTorchLightning/lightning-bolts/tree/master/pl_bolts
449 |
450 | - Pytorch Geometric:
451 |
452 |   - [lightning-examples](https://github.com/pyg-team/pytorch_geometric/tree/d451d6d20287b03cbe5036e5c53ee5f633f3c429/examples/pytorch_lightning)
453 |   - [torch_geometric.data.lightning_datamodule](https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/data/lightning_datamodule.html)
454 |   - [Graph Gym](https://pytorch-geometric.readthedocs.io/en/latest/notes/graphgym.html)
455 |
456 | - Pytorch data, datapipes, dataloaders:
457 |
458 |   - https://pytorch.org/data/main/examples.html
459 |   - https://github.com/tcapelle/torchdata
460 |   - https://github.com/pytorch/data
--------------------------------------------------------------------------------
/README_template.md:
--------------------------------------------------------------------------------
1 | # Name_of_Your_Project
2 |
3 | ## Setting up
4 |
5 | - Using conda
6 |
7 |   ```
8 |   # create env
9 |   conda env create --file docker/environment.yml
10 |
11 |   # activate it
12 |   conda activate NAMEOFYOURPROJECT
13 |
14 |   # install this repo
15 |   (NAMEOFYOURPROJECT) $ pip install -e .
16 |   ```
17 |
18 | - Using docker
19 |
20 |   ```
21 |   # pull image with [azureml image](https://hub.docker.com/_/microsoft-azureml?tab=description) as base with docker/environment.yml on top
22 |   docker pull NAMEOFYOURPROJECT:latest
23 |
24 |   # pull image with nvidia pytorch image as base
25 |   # docker pull NAMEOFYOURPROJECT:latest-nvidia
26 |
27 |   # run image
28 |   docker run -it --gpus=all -v <local_dir>:<container_dir> NAMEOFYOURPROJECT:latest
29 |   ```
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ## Security
4 |
5 | Microsoft takes the security of our software products and services seriously. This includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | - Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | - Full paths of source file(s) related to the manifestation of the issue 23 | - The location of the affected source code (tag/branch/commit or direct URL) 24 | - Any special configuration required to reproduce the issue 25 | - Step-by-step instructions to reproduce the issue 26 | - Proof-of-concept or exploit code (if possible) 27 | - Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 
40 | 41 | 42 | -------------------------------------------------------------------------------- /assets/images/table-of-contents.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/auto-sys-ml-template/8884d8f3b826cab300cd8f0e227fd7ad6b887da8/assets/images/table-of-contents.png -------------------------------------------------------------------------------- /azure-pipelines.yml: -------------------------------------------------------------------------------- 1 | trigger: 2 | - main 3 | # paths: 4 | # include: 5 | # - docker 6 | 7 | pr: 8 | - main 9 | 10 | resources: 11 | - repo: self 12 | 13 | variables: 14 | # Note: to customize the pipeline to use private ACRs other than commondockerimages, 15 | # you need to change BOTH dockerRegistryServiceConnection and containerRegistryName 16 | 17 | # Container registry service connection established during pipeline creation 18 | dockerRegistryServiceConnection: "442ea973-c852-4792-aa09-fab4a9df791f" 19 | containerRegistryName: "commondockerimages.azurecr.io" 20 | 21 | dockerfileazuremlPath: "$(Build.SourcesDirectory)/docker/Dockerfile_base_azureml" 22 | dockerfileazuremlnightlyPath: "$(Build.SourcesDirectory)/docker/Dockerfile_base_azureml_nightly" 23 | dockerfilenvidiaPath: "$(Build.SourcesDirectory)/docker/Dockerfile_base_nvidia" 24 | 25 | tagLatest: "latest" 26 | 27 | # Agent VM image name 28 | vmImageName: "ubuntu-latest" 29 | 30 | stages: 31 | # docs: https://docs.microsoft.com/azure/devops/pipelines/languages/docker 32 | - stage: BuildDockerImagesAndRunPytest 33 | displayName: Build docker; run pytests on built images 34 | jobs: 35 | - job: DefineDockerTags 36 | displayName: define docker tags 37 | pool: 38 | vmImage: $(vmImageName) 39 | steps: 40 | - bash: | 41 | github_organization_prefix="AutonomousSystemsResearch/" 42 | full_repo_name=$(Build.Repository.Name) 43 | repo_name=${full_repo_name#"$github_organization_prefix"} 44 | branch_name=$(Build.SourceBranchName) 45 | 46 | git_short_hash_main=`git rev-parse --short=7 HEAD` 47 | git_hash_pr=$(System.PullRequest.SourceCommitId) 48 | git_short_hash_pr=${git_hash_pr:0:7} 49 | pr_number=$(System.PullRequest.PullRequestNumber) 50 | 51 | tag_main_git_commit=main-$git_short_hash_main 52 | tag_pr_git_commit=PR-$pr_number-$git_short_hash_pr 53 | tag_pr_latest=PR-$pr_number-latest 54 | 55 | echo "" 56 | echo "full repo name: $(Build.Repository.Name)" 57 | echo "repo name: $repo_name" 58 | echo "Build Id: $(Build.BuildId)" 59 | echo "Build BuildNumber: $(Build.BuildNumber)" 60 | echo "Build Reason: $(Build.Reason)" 61 | echo "Build Branch Name: $(Build.SourceBranchName)" 62 | echo "git commit message: $(Build.SourceVersionMessage)" 63 | echo "git hash (main branch): $(Build.SourceVersion)" 64 | echo "git hash short (main branch): $git_short_hash_main" 65 | echo "PR branch: $(System.PullRequest.SourceBranch)" 66 | echo "PR number: $(System.PullRequest.PullRequestNumber)" 67 | echo "PR ID: $(System.PullRequest.PullRequestId)" 68 | echo "git hash (PR branch): $(System.PullRequest.SourceCommitId)" 69 | echo "git hash short (PR branch): : $git_short_hash_pr" 70 | 71 | # set pipeline variables which can be referenced in the jobs that follow to tag docker images appropriately 72 | echo "##vso[task.setvariable variable=repoName;isoutput=true]$repo_name" 73 | echo "##vso[task.setvariable variable=tagPRLatest;isoutput=true]$tag_pr_latest" 74 | 75 | if [[ "$branch_name" == "main" ]]; then 76 | echo "##vso[task.setvariable 
variable=tagOfThisBuild;isoutput=true]$tag_main_git_commit" 77 | fi 78 | if [[ "$branch_name" == "merge" ]]; then 79 | echo "##vso[task.setvariable variable=tagOfThisBuild;isoutput=true]$tag_pr_git_commit" 80 | fi 81 | 82 | ## deprecated; but might be of use in the future 83 | # echo "##vso[task.setvariable variable=tagMainGitCommitHash;isoutput=true]$tag_main_git_commit" 84 | # echo "##vso[task.setvariable variable=tagPRGitCommitHash;isoutput=true]$tag_pr_git_commit" 85 | 86 | # print tags: 87 | echo "tag_pr_latest: $tag_pr_latest" 88 | echo "tag_pr_git_commit: $tag_pr_git_commit" 89 | echo "tag_main_git_commit: $tag_main_git_commit" 90 | 91 | # print outputvars: 92 | echo "tag_pr_latest: $tag_pr_latest" 93 | echo "tag_pr_git_commit: $tag_pr_git_commit" 94 | echo "tag_main_git_commit: $tag_main_git_commit" 95 | name: DockerTagVars # because we're going to depend on it, we need to name the step 96 | displayName: (debug) print git info 97 | 98 | - job: BuildDockerAzureMLBase 99 | dependsOn: DefineDockerTags 100 | displayName: build azureml; run pytest 101 | pool: 102 | vmImage: $(vmImageName) 103 | variables: 104 | tagOfThisBuild: $[ dependencies.DefineDockerTags.outputs['DockerTagVars.tagOfThisBuild'] ] 105 | tagPRLatest: $[ dependencies.DefineDockerTags.outputs['DockerTagVars.tagPRLatest'] ] 106 | repoName: $[ dependencies.DefineDockerTags.outputs['DockerTagVars.repoName'] ] 107 | steps: 108 | - bash: | 109 | echo "repoName: $(repoName)" 110 | echo "tagOfThisBuild: $(tagOfThisBuild)" 111 | echo "tagPRLatest: $(tagPRLatest)" 112 | displayName: (debug) print pipeline vars 113 | 114 | - task: Docker@2 115 | displayName: Build and Push Image 116 | inputs: 117 | command: buildAndPush 118 | repository: $(repoName) 119 | dockerfile: $(dockerfileazuremlPath) 120 | containerRegistry: $(dockerRegistryServiceConnection) 121 | ${{ if eq(variables['Build.SourceBranchName'], 'merge') }}: 122 | tags: | 123 | $(tagOfThisBuild)-azureml 124 | $(tagPRLatest)-azureml 125 | $(tagPRLatest) 126 | ${{ if eq(variables['Build.SourceBranchName'], 'main') }}: 127 | tags: | 128 | $(tagOfThisBuild)-azureml 129 | $(tagLatest)-azureml 130 | $(tagLatest) 131 | 132 | - bash: | 133 | docker run -d -i --name testapp -v $(Build.SourcesDirectory):/workdir -w /workdir $(containerRegistryName)/$(repoName):$(tagOfThisBuild)-azureml 134 | docker exec testapp bash -c "python3 -c \"import torch; print('torch: version', torch.__version__)\"" 135 | docker exec testapp bash -c "python3 -c \"import pytorch_lightning; print('pytorch_lightning: version', pytorch_lightning.__version__)\"" 136 | docker exec testapp bash -c "python3 -c \"import torch; print('torch.cuda.version:', torch.version.cuda)\"" 137 | displayName: print versions 138 | 139 | - script: | 140 | docker exec testapp bash -c "pip install -e .; pytest tests/test_dev_fast_run.py -k test_fast_dev_run_cpu" 141 | displayName: pytest 142 | 143 | - job: BuildDockerAzureMLBaseNightly 144 | dependsOn: DefineDockerTags 145 | displayName: build azureml nightly; run pytest 146 | pool: 147 | vmImage: $(vmImageName) 148 | variables: 149 | tagOfThisBuild: $[ dependencies.DefineDockerTags.outputs['DockerTagVars.tagOfThisBuild'] ] 150 | tagPRLatest: $[ dependencies.DefineDockerTags.outputs['DockerTagVars.tagPRLatest'] ] 151 | repoName: $[ dependencies.DefineDockerTags.outputs['DockerTagVars.repoName'] ] 152 | steps: 153 | - bash: | 154 | echo "repoName: $(repoName)" 155 | echo "tagOfThisBuild: $(tagOfThisBuild)-azureml-nightly" 156 | echo "tagPRLatest: $(tagPRLatest)-azureml-nightly" 157 | 
displayName: (debug) print pipeline vars 158 | 159 | - task: Docker@2 160 | displayName: Build and Push Image 161 | inputs: 162 | command: buildAndPush 163 | repository: $(repoName) 164 | dockerfile: $(dockerfileazuremlnightlyPath) 165 | containerRegistry: $(dockerRegistryServiceConnection) 166 | ${{ if eq(variables['Build.SourceBranchName'], 'merge') }}: 167 | tags: | 168 | $(tagOfThisBuild)-azureml-nightly 169 | $(tagPRLatest)-azureml-nightly 170 | ${{ if eq(variables['Build.SourceBranchName'], 'main') }}: 171 | tags: | 172 | $(tagOfThisBuild)-azureml-nightly 173 | $(tagLatest)-azureml-nightly 174 | 175 | - bash: | 176 | docker run -d -i --name testapp -v $(Build.SourcesDirectory):/workdir -w /workdir $(containerRegistryName)/$(repoName):$(tagOfThisBuild)-azureml-nightly 177 | docker exec testapp bash -c "python3 -c \"import torch; print('torch: version', torch.__version__)\"" 178 | docker exec testapp bash -c "python3 -c \"import pytorch_lightning; print('pytorch_lightning: version', pytorch_lightning.__version__)\"" 179 | docker exec testapp bash -c "python3 -c \"import torch; print('torch.cuda.version:', torch.version.cuda)\"" 180 | displayName: print versions 181 | 182 | - script: | 183 | docker exec testapp bash -c "pip install -e .; pytest tests/test_dev_fast_run.py -k test_fast_dev_run_cpu" 184 | displayName: pytest 185 | 186 | - job: BuildDockerNvidiaBasePipInstall 187 | dependsOn: DefineDockerTags 188 | displayName: build nvidia pytorch; run pytest 189 | pool: 190 | vmImage: $(vmImageName) 191 | variables: 192 | tagOfThisBuild: $[ dependencies.DefineDockerTags.outputs['DockerTagVars.tagOfThisBuild'] ] 193 | tagPRLatest: $[ dependencies.DefineDockerTags.outputs['DockerTagVars.tagPRLatest'] ] 194 | repoName: $[ dependencies.DefineDockerTags.outputs['DockerTagVars.repoName'] ] 195 | steps: 196 | - bash: | 197 | echo "repoName: $(repoName)" 198 | echo "tagOfThisBuild: $(tagOfThisBuild)-nvidia" 199 | echo "tagPRLatest: $(tagPRLatest)-nvidia" 200 | 201 | # echo "tagMainGitCommitHash: $(tagMainGitCommitHash)" 202 | # echo "tagPRGitCommitHash: $(tagPRGitCommitHash)" 203 | displayName: (debug) print pipeline vars 204 | 205 | - task: Docker@2 206 | displayName: Build and Push Image 207 | inputs: 208 | command: buildAndPush 209 | repository: $(repoName) 210 | dockerfile: $(dockerfilenvidiaPath) 211 | containerRegistry: $(dockerRegistryServiceConnection) 212 | ${{ if eq(variables['Build.SourceBranchName'], 'merge') }}: 213 | tags: | 214 | $(tagOfThisBuild)-nvidia 215 | $(tagPRLatest)-nvidia 216 | ${{ if eq(variables['Build.SourceBranchName'], 'main') }}: 217 | tags: | 218 | $(tagOfThisBuild)-nvidia 219 | $(tagLatest)-nvidia 220 | 221 | - bash: | 222 | docker run -d -i --name testapp -v $(Build.SourcesDirectory):/workdir -w /workdir $(containerRegistryName)/$(repoName):$(tagOfThisBuild)-nvidia 223 | docker exec testapp bash -c "python3 -c \"import torch; print('torch: version', torch.__version__)\"" 224 | docker exec testapp bash -c "python3 -c \"import pytorch_lightning; print('pytorch_lightning: version', pytorch_lightning.__version__)\"" 225 | docker exec testapp bash -c "python3 -c \"import torch; print('torch.cuda.version:', torch.version.cuda)\"" 226 | displayName: print versions 227 | 228 | - script: | 229 | docker exec testapp bash -c "pip install -e .; pytest tests/test_dev_fast_run.py -k test_fast_dev_run_cpu" 230 | displayName: pytest 231 | 232 | - job: PytestAzureMLBaseTagLatest 233 | dependsOn: DefineDockerTags 234 | displayName: pytest latest-azureml 235 | pool: 236 | vmImage: 
$(vmImageName) 237 | variables: 238 | repoName: $[ dependencies.DefineDockerTags.outputs['DockerTagVars.repoName'] ] 239 | steps: 240 | - checkout: self 241 | 242 | - task: Docker@2 243 | displayName: Login to ACR 244 | inputs: 245 | command: login 246 | containerRegistry: $(dockerRegistryServiceConnection) 247 | 248 | - script: | 249 | docker run -d -i --name testapp -v $(Build.SourcesDirectory):/workdir -w /workdir $(containerRegistryName)/$(repoName):$(tagLatest)-azureml 250 | displayName: docker pull and run 251 | 252 | - script: | 253 | docker exec testapp bash -c "pip install -e .; pytest tests/test_dev_fast_run.py -k test_fast_dev_run_cpu" 254 | displayName: pytest 255 | 256 | - job: PytestAzureMLBaseTagLatestNightly 257 | dependsOn: DefineDockerTags 258 | displayName: pytest latest-azureml-nightly 259 | pool: 260 | vmImage: $(vmImageName) 261 | variables: 262 | repoName: $[ dependencies.DefineDockerTags.outputs['DockerTagVars.repoName'] ] 263 | steps: 264 | - checkout: self 265 | 266 | - task: Docker@2 267 | displayName: Login to ACR 268 | inputs: 269 | command: login 270 | containerRegistry: $(dockerRegistryServiceConnection) 271 | 272 | - script: | 273 | docker run -d -i --name testapp -v $(Build.SourcesDirectory):/workdir -w /workdir $(containerRegistryName)/$(repoName):$(tagLatest)-azureml-nightly 274 | displayName: docker pull and run 275 | 276 | - script: | 277 | docker exec testapp bash -c "pip install -e .; pytest tests/test_dev_fast_run.py -k test_fast_dev_run_cpu" 278 | displayName: pytest 279 | 280 | - job: PytestNvidiaBaseTagLatest 281 | dependsOn: DefineDockerTags 282 | displayName: pytest latest-nvidia 283 | pool: 284 | vmImage: $(vmImageName) 285 | variables: 286 | repoName: $[ dependencies.DefineDockerTags.outputs['DockerTagVars.repoName'] ] 287 | steps: 288 | - checkout: self 289 | 290 | - task: Docker@2 291 | displayName: Login to ACR 292 | inputs: 293 | command: login 294 | containerRegistry: $(dockerRegistryServiceConnection) 295 | 296 | - script: | 297 | docker run -d -i --name testapp -v $(Build.SourcesDirectory):/workdir -w /workdir $(containerRegistryName)/$(repoName):$(tagLatest)-nvidia 298 | displayName: docker pull and run 299 | 300 | - script: | 301 | docker exec testapp bash -c "pip install -e .; pytest tests/test_dev_fast_run.py -k test_fast_dev_run_cpu" 302 | displayName: pytest 303 | -------------------------------------------------------------------------------- /configs/train.yaml: -------------------------------------------------------------------------------- 1 | seed_everything: 42 2 | 3 | trainer: 4 | default_root_dir: ${oc.env:AMLT_OUTPUT_DIR,outputs} 5 | 6 | num_nodes: 1 7 | devices: 1 8 | accelerator: gpu 9 | strategy: ddp_find_unused_parameters_false 10 | 11 | min_epochs: 1 12 | max_epochs: 10 13 | enable_progress_bar: true 14 | 15 | sync_batchnorm: True 16 | enable_checkpointing: True 17 | resume_from_checkpoint: null 18 | 19 | # debugging 20 | fast_dev_run: false 21 | 22 | data: 23 | _target_: datamodules.mnist_datamodule.MNISTDataModule 24 | 25 | file_params: 26 | base_dir: data/ 27 | train_val_test_split: [55_000, 5_000, 10_000] 28 | 29 | train_params: 30 | batch_size: 128 31 | num_workers: 0 32 | pin_memory: False 33 | 34 | model: 35 | _target_: models.mnist_module.MNISTLitModule 36 | 37 | mlp_config: 38 | input_size: 784 39 | lin1_size: 256 40 | lin2_size: 256 41 | lin3_size: 256 42 | output_size: 10 43 | 44 | optimizer_config: 45 | lr: 0.001 46 | weight_decay: 0.0005 47 | 48 | logger: 49 | tensorboard: 50 | _target_: 
pytorch_lightning.loggers.tensorboard.TensorBoardLogger
51 |     save_dir: ${trainer.default_root_dir}/logs
52 |     name: null
53 |     version: null
54 |     log_graph: False
55 |     default_hp_metric: True
56 |     prefix: ""
57 |
58 | callbacks:
59 |   checkpoint:
60 |     _target_: pytorch_lightning.callbacks.ModelCheckpoint
61 |     dirpath: "${trainer.default_root_dir}/checkpoints/"
62 |     monitor: "val/acc" # name of the logged metric which determines when model is improving
63 |     mode: "max" # "max" means higher metric value is better, can be also "min"
64 |     save_top_k: 1 # save k best models (determined by above metric)
65 |     save_last: True # additionally, always save the model from the last epoch
66 |     verbose: False
67 |     filename: "epoch_{epoch:03d}"
68 |     auto_insert_metric_name: False
69 |
70 |   early_stopping:
71 |     _target_: pytorch_lightning.callbacks.EarlyStopping
72 |     monitor: "val/loss" # name of the logged metric which determines when model is improving
73 |     mode: "min" # "min" means lower metric value is better, can be also "max"
74 |     patience: 100 # how many validation epochs of not improving until training stops
75 |     min_delta: 0 # minimum change in the monitored metric needed to qualify as an improvement
76 |
77 |   model_summary:
78 |     _target_: pytorch_lightning.callbacks.RichModelSummary
79 |     max_depth: -1
80 |
81 |   progress:
82 |     _target_: pytorch_lightning.callbacks.RichProgressBar
83 |
84 |   lr_mon:
85 |     _target_: pytorch_lightning.callbacks.LearningRateMonitor
86 |     logging_interval: "epoch"
--------------------------------------------------------------------------------
/docker/Dockerfile_base_azureml:
--------------------------------------------------------------------------------
1 | # see latest azureml base images tags here
2 | # - https://github.com/Azure/AzureML-Containers/tree/master/base/gpu
3 | # - https://hub.docker.com/_/microsoft-azureml?tab=description
4 |
5 | ARG BASE_IMAGE=openmpi4.1.0-cuda11.3-cudnn8-ubuntu20.04:latest
6 |
7 | FROM mcr.microsoft.com/azureml/${BASE_IMAGE}
8 |
9 | ARG DEBIAN_FRONTEND=noninteractive
10 |
11 | RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
12 |     build-essential \
13 |     software-properties-common \
14 |     cmake \
15 |     g++-7 \
16 |     git \
17 |     gpg \
18 |     curl \
19 |     vim \
20 |     wget \
21 |     ca-certificates \
22 |     libjpeg-dev \
23 |     libpng-dev \
24 |     librdmacm1 \
25 |     libibverbs1 \
26 |     ibverbs-providers \
27 |     openssh-client \
28 |     openssh-server \
29 |     libsm6 \
30 |     libxext6 \
31 |     ffmpeg \
32 |     libfontconfig1 \
33 |     libxrender1 \
34 |     libgl1-mesa-glx &&\
35 |     apt-get clean && rm -rf /var/lib/apt/lists/*
36 |
37 | ADD environment.yml /tmp/environment.yml
38 | RUN conda env update -n base -f /tmp/environment.yml
--------------------------------------------------------------------------------
/docker/Dockerfile_base_azureml_cu116:
--------------------------------------------------------------------------------
1 | # see latest azureml base images tags here
2 | # - https://github.com/Azure/AzureML-Containers/tree/master/base/gpu
3 | # - https://hub.docker.com/_/microsoft-azureml?tab=description
4 |
5 | ARG BASE_IMAGE=openmpi4.1.0-cuda11.6-cudnn8-ubuntu20.04:latest
6 |
7 | FROM mcr.microsoft.com/azureml/${BASE_IMAGE}
8 |
9 | ARG DEBIAN_FRONTEND=noninteractive
10 |
11 | RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
12 |     build-essential \
13 |     software-properties-common \
14 |     cmake \
15 |     g++-7 \
16 |     git \
17 |     gpg \
18 |     curl \
19 
| vim \ 20 | wget \ 21 | ca-certificates \ 22 | libjpeg-dev \ 23 | libpng-dev \ 24 | librdmacm1 \ 25 | libibverbs1 \ 26 | ibverbs-providers \ 27 | openssh-client \ 28 | openssh-server \ 29 | libsm6 \ 30 | libxext6 \ 31 | ffmpeg \ 32 | libfontconfig1 \ 33 | libxrender1 \ 34 | libgl1-mesa-glx &&\ 35 | apt-get clean && rm -rf /var/lib/apt/lists/* 36 | 37 | ADD environment_cu116.yml /tmp/environment.yml 38 | RUN conda env update -n base -f /tmp/environment.yml 39 | -------------------------------------------------------------------------------- /docker/Dockerfile_base_azureml_nightly: -------------------------------------------------------------------------------- 1 | # see latest azureml base images tags here 2 | # - https://github.com/Azure/AzureML-Containers/tree/master/base/gpu 3 | # - https://hub.docker.com/_/microsoft-azureml?tab=description 4 | 5 | ARG BASE_IMAGE=openmpi4.1.0-cuda11.3-cudnn8-ubuntu20.04:latest 6 | 7 | FROM mcr.microsoft.com/azureml/${BASE_IMAGE} 8 | 9 | ARG DEBIAN_FRONTEND=noninteractive 10 | 11 | RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \ 12 | build-essential \ 13 | software-properties-common \ 14 | cmake \ 15 | g++-7 \ 16 | git \ 17 | gpg \ 18 | curl \ 19 | vim \ 20 | wget \ 21 | ca-certificates \ 22 | libjpeg-dev \ 23 | libpng-dev \ 24 | librdmacm1 \ 25 | libibverbs1 \ 26 | ibverbs-providers \ 27 | openssh-client \ 28 | openssh-server \ 29 | libsm6 \ 30 | libxext6 \ 31 | ffmpeg \ 32 | libfontconfig1 \ 33 | libxrender1 \ 34 | libgl1-mesa-glx &&\ 35 | apt-get clean && rm -rf /var/lib/apt/lists/* 36 | 37 | # use user's conda env as base 38 | ADD environment.yml /tmp/environment.yml 39 | RUN conda env update -n base -f /tmp/environment.yml 40 | 41 | # update pytorch installed from the above step to nightly 42 | RUN conda update pytorch torchvision torchaudio -c pytorch-nightly -y 43 | 44 | # install pytorch lightning nightly 45 | RUN pip install https://github.com/PyTorchLightning/pytorch-lightning/archive/master.zip && \ 46 | pip install jsonargparse[signatures] --upgrade 47 | 48 | # (optional) update all conda pkgs 49 | # RUN conda update --all 50 | -------------------------------------------------------------------------------- /docker/Dockerfile_base_nvidia: -------------------------------------------------------------------------------- 1 | # tags release notes: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html 2 | # tags: https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch/tags?quick-deploy=false 3 | 4 | ARG BASE_IMAGE=22.06-py3 5 | 6 | FROM nvcr.io/nvidia/pytorch:${BASE_IMAGE} 7 | 8 | ARG DEBIAN_FRONTEND=noninteractive 9 | 10 | RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \ 11 | build-essential \ 12 | software-properties-common \ 13 | cmake \ 14 | g++-7 \ 15 | git \ 16 | gpg \ 17 | curl \ 18 | vim \ 19 | wget \ 20 | ca-certificates \ 21 | libjpeg-dev \ 22 | libpng-dev \ 23 | librdmacm1 \ 24 | libibverbs1 \ 25 | ibverbs-providers \ 26 | openssh-client \ 27 | openssh-server \ 28 | libsm6 \ 29 | libxext6 \ 30 | ffmpeg \ 31 | libfontconfig1 \ 32 | libxrender1 \ 33 | libgl1-mesa-glx &&\ 34 | apt-get clean && rm -rf /var/lib/apt/lists/* 35 | 36 | RUN pip install click termcolor future python-dateutil \ 37 | azureml-core azureml-mlflow \ 38 | opencv-python scipy psutil 39 | 40 | # jsonargparse[signatures] does not work in docker, so need lightning[extra] 41 | # in conda, 
jsonargparse[signatures] is enough 42 | RUN pip install pytorch-lightning[extra] einops pre-commit pytest sh rich 43 | # RUN pip install pytorch-lightning jsonargparse[signatures] einops 44 | -------------------------------------------------------------------------------- /docker/environment.yml: -------------------------------------------------------------------------------- 1 | name: ml_template 2 | channels: 3 | - pytorch 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - _libgcc_mutex=0.1=main 8 | - _openmp_mutex=5.1=1_gnu 9 | - absl-py=1.2.0=pyhd8ed1ab_0 10 | - aiohttp=3.8.1=py39hb9d737c_1 11 | - aiosignal=1.2.0=pyhd8ed1ab_0 12 | - async-timeout=4.0.2=pyhd8ed1ab_0 13 | - attrs=22.1.0=pyh71513ae_1 14 | - blas=1.0=mkl 15 | - blinker=1.4=py_1 16 | - brotlipy=0.7.0=py39h27cfd23_1003 17 | - bzip2=1.0.8=h7b6447c_0 18 | - c-ares=1.18.1=h7f98852_0 19 | - ca-certificates=2022.9.24=ha878542_0 20 | - cachetools=5.2.0=pyhd8ed1ab_0 21 | - certifi=2022.9.24=pyhd8ed1ab_0 22 | - cffi=1.15.1=py39h74dc2b5_0 23 | - charset-normalizer=2.0.4=pyhd3eb1b0_0 24 | - click=8.1.3=py39hf3d152e_0 25 | - colorama=0.4.5=pyhd8ed1ab_0 26 | - cryptography=37.0.1=py39h9ce1e76_0 27 | - cudatoolkit=11.3.1=h2bc3f7f_2 28 | - ffmpeg=4.3=hf484d3e_0 29 | - freetype=2.11.0=h70c0345_0 30 | - frozenlist=1.2.0=py39h7f8727e_0 31 | - fsspec=2022.8.2=pyhd8ed1ab_0 32 | - giflib=5.2.1=h7b6447c_0 33 | - gmp=6.2.1=h295c915_3 34 | - gnutls=3.6.15=he1e5248_0 35 | - google-auth=2.11.1=pyh1a96a4e_0 36 | - google-auth-oauthlib=0.4.6=pyhd8ed1ab_0 37 | - grpcio=1.42.0=py39hce63b2e_0 38 | - idna=3.3=pyhd3eb1b0_0 39 | - importlib-metadata=4.11.4=py39hf3d152e_0 40 | - intel-openmp=2021.4.0=h06a4308_3561 41 | - jpeg=9e=h7f8727e_0 42 | - lame=3.100=h7b6447c_0 43 | - lcms2=2.12=h3be6417_0 44 | - ld_impl_linux-64=2.38=h1181459_1 45 | - lerc=3.0=h295c915_0 46 | - libdeflate=1.8=h7f8727e_5 47 | - libffi=3.3=he6710b0_2 48 | - libgcc-ng=11.2.0=h1234567_1 49 | - libgomp=11.2.0=h1234567_1 50 | - libiconv=1.16=h7f8727e_2 51 | - libidn2=2.3.2=h7f8727e_0 52 | - libpng=1.6.37=hbc83047_0 53 | - libprotobuf=3.15.8=h780b84a_1 54 | - libstdcxx-ng=11.2.0=h1234567_1 55 | - libtasn1=4.16.0=h27cfd23_0 56 | - libtiff=4.4.0=hecacb30_0 57 | - libunistring=0.9.10=h27cfd23_0 58 | - libwebp=1.2.2=h55f646e_0 59 | - libwebp-base=1.2.2=h7f8727e_0 60 | - lz4-c=1.9.3=h295c915_1 61 | - markdown=3.4.1=pyhd8ed1ab_0 62 | - markupsafe=2.1.1=py39hb9d737c_1 63 | - mkl=2021.4.0=h06a4308_640 64 | - mkl-service=2.4.0=py39h7f8727e_0 65 | - mkl_fft=1.3.1=py39hd3c417c_0 66 | - mkl_random=1.2.2=py39h51133e4_0 67 | - multidict=6.0.2=py39hb9d737c_1 68 | - ncurses=6.3=h5eee18b_3 69 | - nettle=3.7.3=hbbd107a_1 70 | - numpy=1.23.1=py39h6c91a56_0 71 | - numpy-base=1.23.1=py39ha15fc14_0 72 | - oauthlib=3.2.1=pyhd8ed1ab_0 73 | - openh264=2.1.1=h4ff587b_0 74 | - openssl=1.1.1q=h7f8727e_0 75 | - packaging=21.3=pyhd8ed1ab_0 76 | - pillow=9.2.0=py39hace64e9_1 77 | - pip=22.1.2=py39h06a4308_0 78 | - protobuf=3.15.8=py39he80948d_0 79 | - pyasn1=0.4.8=py_0 80 | - pyasn1-modules=0.2.7=py_0 81 | - pycparser=2.21=pyhd3eb1b0_0 82 | - pydeprecate=0.3.2=pyhd8ed1ab_0 83 | - pyjwt=2.5.0=pyhd8ed1ab_0 84 | - pyopenssl=22.0.0=pyhd3eb1b0_0 85 | - pyparsing=3.0.9=pyhd8ed1ab_0 86 | - pysocks=1.7.1=py39h06a4308_0 87 | - python=3.9.13=haa1d7c7_1 88 | - python_abi=3.9=2_cp39 89 | - pytorch=1.12.1=py3.9_cuda11.3_cudnn8.3.2_0 90 | - pytorch-lightning=1.7.7=pyhd8ed1ab_0 91 | - pytorch-mutex=1.0=cuda 92 | - pyu2f=0.1.5=pyhd8ed1ab_0 93 | - pyyaml=6.0=py39hb9d737c_4 94 | - readline=8.1.2=h7f8727e_1 95 | - 
requests=2.28.1=py39h06a4308_0 96 | - requests-oauthlib=1.3.1=pyhd8ed1ab_0 97 | - rsa=4.9=pyhd8ed1ab_0 98 | - setuptools=63.4.1=py39h06a4308_0 99 | - six=1.16.0=pyhd3eb1b0_1 100 | - sqlite=3.39.3=h5082296_0 101 | - tensorboard=2.10.1=pyhd8ed1ab_0 102 | - tensorboard-data-server=0.6.0=py39hd97740a_2 103 | - tensorboard-plugin-wit=1.8.1=pyhd8ed1ab_0 104 | - tk=8.6.12=h1ccaba5_0 105 | - torchaudio=0.12.1=py39_cu113 106 | - torchmetrics=0.9.3=pyhd8ed1ab_0 107 | - torchvision=0.13.1=py39_cu113 108 | - tqdm=4.64.1=pyhd8ed1ab_0 109 | - typing-extensions=4.3.0=py39h06a4308_0 110 | - typing_extensions=4.3.0=py39h06a4308_0 111 | - tzdata=2022c=h04d1e81_0 112 | - urllib3=1.26.11=py39h06a4308_0 113 | - werkzeug=2.2.2=pyhd8ed1ab_0 114 | - wheel=0.37.1=pyhd3eb1b0_0 115 | - xz=5.2.6=h5eee18b_0 116 | - yaml=0.2.5=h7f98852_2 117 | - yarl=1.7.2=py39hb9d737c_2 118 | - zipp=3.8.1=pyhd8ed1ab_0 119 | - zlib=1.2.12=h5eee18b_3 120 | - zstd=1.5.2=ha4553b6_0 121 | - pip: 122 | - adal==1.2.7 123 | - alembic==1.8.1 124 | - antlr4-python3-runtime==4.9.3 125 | - argcomplete==2.0.0 126 | - azure-common==1.1.28 127 | - azure-core==1.25.1 128 | - azure-graphrbac==0.61.1 129 | - azure-identity==1.11.0 130 | - azure-mgmt-authorization==2.0.0 131 | - azure-mgmt-containerregistry==10.0.0 132 | - azure-mgmt-core==1.3.2 133 | - azure-mgmt-keyvault==10.1.0 134 | - azure-mgmt-resource==21.1.0 135 | - azure-mgmt-storage==20.0.0 136 | - azure-storage-blob==12.13.0 137 | - azureml-core==1.45.0.post2 138 | - azureml-mlflow==1.45.0 139 | - backports-tempfile==1.0 140 | - backports-weakref==1.0.post1 141 | - bcrypt==4.0.0 142 | - black==22.8.0 143 | - cfgv==3.3.1 144 | - cloudpickle==2.2.0 145 | - commonmark==0.9.1 146 | - contextlib2==21.6.0 147 | - databricks-cli==0.17.3 148 | - distlib==0.3.6 149 | - docker==5.0.3 150 | - filelock==3.8.0 151 | - flake8==5.0.4 152 | - flask==2.2.2 153 | - gitdb==4.0.9 154 | - gitpython==3.1.27 155 | - greenlet==1.1.3 156 | - gunicorn==20.1.0 157 | - humanfriendly==10.0 158 | - identify==2.5.5 159 | - iniconfig==1.1.1 160 | - isodate==0.6.1 161 | - isort==5.10.1 162 | - itsdangerous==2.1.2 163 | - jeepney==0.8.0 164 | - jmespath==1.0.1 165 | - jsonpickle==2.2.0 166 | - knack==0.9.0 167 | - mako==1.2.3 168 | - mccabe==0.7.0 169 | - mlflow==1.29.0 170 | - mlflow-skinny==1.29.0 171 | - msal==1.19.0 172 | - msal-extensions==1.0.0 173 | - msrest==0.7.1 174 | - msrestazure==0.6.4 175 | - mypy-extensions==0.4.3 176 | - ndg-httpsclient==0.5.1 177 | - nodeenv==1.7.0 178 | - omegaconf==2.2.3 179 | - opencv-python==4.6.0.66 180 | - pandas==1.5.0 181 | - paramiko==2.11.0 182 | - pathspec==0.10.1 183 | - pkginfo==1.8.3 184 | - platformdirs==2.5.2 185 | - pluggy==1.0.0 186 | - portalocker==2.5.1 187 | - pre-commit==2.20.0 188 | - prometheus-flask-exporter==0.20.3 189 | - psutil==5.9.2 190 | - py==1.11.0 191 | - pycodestyle==2.9.1 192 | - pyflakes==2.5.0 193 | - pynacl==1.5.0 194 | - pytest==7.1.3 195 | - pytz==2022.2.1 196 | - querystring-parser==1.2.4 197 | - rich==12.5.1 198 | - scipy==1.9.1 199 | - secretstorage==3.3.3 200 | - sh==1.14.3 201 | - smmap==5.0.0 202 | - sqlalchemy==1.4.41 203 | - sqlparse==0.4.3 204 | - tabulate==0.8.10 205 | - toml==0.10.2 206 | - tomli==2.0.1 207 | - torch-tb-profiler==0.4.0 208 | - types-cryptography==3.3.23 209 | - virtualenv==20.16.5 210 | - websocket-client==1.4.1 211 | -------------------------------------------------------------------------------- /docker/environment_cu116.yml: -------------------------------------------------------------------------------- 1 | name: 
ml_template_cu_116 2 | channels: 3 | - pytorch 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - _libgcc_mutex=0.1=main 8 | - _openmp_mutex=5.1=1_gnu 9 | - absl-py=1.2.0=pyhd8ed1ab_0 10 | - aiohttp=3.8.1=py39hb9d737c_1 11 | - aiosignal=1.2.0=pyhd8ed1ab_0 12 | - async-timeout=4.0.2=pyhd8ed1ab_0 13 | - attrs=22.1.0=pyh71513ae_1 14 | - blas=1.0=mkl 15 | - blinker=1.4=py_1 16 | - brotlipy=0.7.0=py39hb9d737c_1004 17 | - bzip2=1.0.8=h7f98852_4 18 | - c-ares=1.18.1=h7f98852_0 19 | - ca-certificates=2022.9.24=ha878542_0 20 | - cachetools=5.2.0=pyhd8ed1ab_0 21 | - certifi=2022.9.24=pyhd8ed1ab_0 22 | - cffi=1.14.6=py39he32792d_0 23 | - charset-normalizer=2.1.1=pyhd8ed1ab_0 24 | - click=8.1.3=py39hf3d152e_0 25 | - colorama=0.4.5=pyhd8ed1ab_0 26 | - cryptography=37.0.2=py39hd97740a_0 27 | - cudatoolkit=11.6.0=hecad31d_10 28 | - ffmpeg=4.3=hf484d3e_0 29 | - freetype=2.10.4=h0708190_1 30 | - frozenlist=1.2.0=py39h7f8727e_0 31 | - fsspec=2022.8.2=pyhd8ed1ab_0 32 | - gmp=6.2.1=h58526e2_0 33 | - gnutls=3.6.13=h85f3911_1 34 | - google-auth=2.11.1=pyh1a96a4e_0 35 | - google-auth-oauthlib=0.4.6=pyhd8ed1ab_0 36 | - grpcio=1.42.0=py39hce63b2e_0 37 | - idna=3.4=pyhd8ed1ab_0 38 | - importlib-metadata=4.11.4=py39hf3d152e_0 39 | - intel-openmp=2021.4.0=h06a4308_3561 40 | - jpeg=9e=h166bdaf_1 41 | - lame=3.100=h7f98852_1001 42 | - lcms2=2.12=hddcbb42_0 43 | - ld_impl_linux-64=2.38=h1181459_1 44 | - libffi=3.3=he6710b0_2 45 | - libgcc-ng=11.2.0=h1234567_1 46 | - libgomp=11.2.0=h1234567_1 47 | - libiconv=1.17=h166bdaf_0 48 | - libpng=1.6.37=h21135ba_2 49 | - libprotobuf=3.15.8=h780b84a_1 50 | - libstdcxx-ng=11.2.0=h1234567_1 51 | - libtiff=4.2.0=hf544144_3 52 | - libwebp-base=1.2.2=h7f98852_1 53 | - lz4-c=1.9.3=h9c3ff4c_1 54 | - markdown=3.4.1=pyhd8ed1ab_0 55 | - markupsafe=2.1.1=py39hb9d737c_1 56 | - mkl=2021.4.0=h06a4308_640 57 | - mkl-service=2.4.0=py39h7e14d7c_0 58 | - mkl_fft=1.3.1=py39h0c7bc48_1 59 | - mkl_random=1.2.2=py39hde0f152_0 60 | - multidict=6.0.2=py39hb9d737c_1 61 | - ncurses=6.3=h5eee18b_3 62 | - nettle=3.6=he412f7d_0 63 | - numpy=1.23.1=py39h6c91a56_0 64 | - numpy-base=1.23.1=py39ha15fc14_0 65 | - oauthlib=3.2.1=pyhd8ed1ab_0 66 | - olefile=0.46=pyh9f0ad1d_1 67 | - openh264=2.1.1=h780b84a_0 68 | - openjpeg=2.4.0=hb52868f_1 69 | - openssl=1.1.1q=h7f8727e_0 70 | - packaging=21.3=pyhd8ed1ab_0 71 | - pillow=8.2.0=py39hf95b381_1 72 | - pip=22.1.2=py39h06a4308_0 73 | - pyasn1=0.4.8=py_0 74 | - pyasn1-modules=0.2.7=py_0 75 | - pycparser=2.21=pyhd8ed1ab_0 76 | - pydeprecate=0.3.2=pyhd8ed1ab_0 77 | - pyjwt=2.5.0=pyhd8ed1ab_0 78 | - pyopenssl=22.0.0=pyhd8ed1ab_1 79 | - pyparsing=3.0.9=pyhd8ed1ab_0 80 | - pysocks=1.7.1=pyha2e5f31_6 81 | - python=3.9.13=haa1d7c7_1 82 | - python_abi=3.9=2_cp39 83 | - pytorch=1.12.1=py3.9_cuda11.6_cudnn8.3.2_0 84 | - pytorch-lightning=1.7.7=pyhd8ed1ab_0 85 | - pytorch-mutex=1.0=cuda 86 | - pyu2f=0.1.5=pyhd8ed1ab_0 87 | - pyyaml=6.0=py39hb9d737c_4 88 | - readline=8.1.2=h7f8727e_1 89 | - requests=2.28.1=pyhd8ed1ab_1 90 | - requests-oauthlib=1.3.1=pyhd8ed1ab_0 91 | - rsa=4.9=pyhd8ed1ab_0 92 | - setuptools=63.4.1=py39h06a4308_0 93 | - six=1.16.0=pyh6c4a22f_0 94 | - sqlite=3.39.2=h5082296_0 95 | - tensorboard=2.10.1=pyhd8ed1ab_0 96 | - tensorboard-data-server=0.6.0=py39hd97740a_2 97 | - tensorboard-plugin-wit=1.8.1=pyhd8ed1ab_0 98 | - tk=8.6.12=h1ccaba5_0 99 | - torchaudio=0.12.1=py39_cu116 100 | - torchmetrics=0.9.3=pyhd8ed1ab_0 101 | - torchvision=0.13.1=py39_cu116 102 | - tqdm=4.64.1=pyhd8ed1ab_0 103 | - typing-extensions=4.3.0=hd8ed1ab_0 104 | - 
typing_extensions=4.3.0=pyha770c72_0 105 | - tzdata=2022c=h04d1e81_0 106 | - urllib3=1.26.11=pyhd8ed1ab_0 107 | - werkzeug=2.2.2=pyhd8ed1ab_0 108 | - wheel=0.37.1=pyhd3eb1b0_0 109 | - xz=5.2.6=h5eee18b_0 110 | - yaml=0.2.5=h7f98852_2 111 | - yarl=1.7.2=py39hb9d737c_2 112 | - zipp=3.8.1=pyhd8ed1ab_0 113 | - zlib=1.2.12=h5eee18b_3 114 | - zstd=1.5.0=ha95c52a_0 115 | - pip: 116 | - adal==1.2.7 117 | - alembic==1.8.1 118 | - antlr4-python3-runtime==4.9.3 119 | - argcomplete==2.0.0 120 | - azure-common==1.1.28 121 | - azure-core==1.25.1 122 | - azure-graphrbac==0.61.1 123 | - azure-identity==1.11.0 124 | - azure-mgmt-authorization==2.0.0 125 | - azure-mgmt-containerregistry==10.0.0 126 | - azure-mgmt-core==1.3.2 127 | - azure-mgmt-keyvault==10.1.0 128 | - azure-mgmt-resource==21.1.0 129 | - azure-mgmt-storage==20.0.0 130 | - azure-storage-blob==12.13.0 131 | - azureml-core==1.45.0.post2 132 | - azureml-mlflow==1.45.0 133 | - backports-tempfile==1.0 134 | - backports-weakref==1.0.post1 135 | - bcrypt==4.0.0 136 | - black==22.8.0 137 | - cfgv==3.3.1 138 | - cloudpickle==2.2.0 139 | - commonmark==0.9.1 140 | - contextlib2==21.6.0 141 | - contourpy==1.0.5 142 | - databricks-cli==0.17.3 143 | - distlib==0.3.6 144 | - docker==5.0.3 145 | - docstring-parser==0.15 146 | - filelock==3.8.0 147 | - flake8==5.0.4 148 | - flask==2.2.2 149 | - fonttools==4.37.3 150 | - gcsfs==2022.8.2 151 | - gitdb==4.0.9 152 | - gitpython==3.1.27 153 | - google-api-core==2.10.1 154 | - google-cloud-core==2.3.2 155 | - google-cloud-storage==2.5.0 156 | - google-crc32c==1.5.0 157 | - google-resumable-media==2.3.3 158 | - googleapis-common-protos==1.56.4 159 | - greenlet==1.1.3 160 | - gunicorn==20.1.0 161 | - humanfriendly==10.0 162 | - hydra-core==1.2.0 163 | - identify==2.5.5 164 | - iniconfig==1.1.1 165 | - isodate==0.6.1 166 | - isort==5.10.1 167 | - itsdangerous==2.1.2 168 | - jeepney==0.8.0 169 | - jmespath==1.0.1 170 | - jsonargparse==4.15.0 171 | - jsonpickle==2.2.0 172 | - kiwisolver==1.4.4 173 | - knack==0.9.0 174 | - mako==1.2.3 175 | - matplotlib==3.6.0 176 | - mccabe==0.7.0 177 | - mlflow==1.29.0 178 | - mlflow-skinny==1.29.0 179 | - msal==1.19.0 180 | - msal-extensions==1.0.0 181 | - msrest==0.7.1 182 | - msrestazure==0.6.4 183 | - mypy-extensions==0.4.3 184 | - ndg-httpsclient==0.5.1 185 | - nodeenv==1.7.0 186 | - omegaconf==2.2.3 187 | - opencv-python==4.6.0.66 188 | - pandas==1.5.0 189 | - paramiko==2.11.0 190 | - pathspec==0.10.1 191 | - pkginfo==1.8.3 192 | - platformdirs==2.5.2 193 | - pluggy==1.0.0 194 | - portalocker==2.5.1 195 | - pre-commit==2.20.0 196 | - prometheus-flask-exporter==0.20.3 197 | - protobuf==3.20.1 198 | - psutil==5.9.2 199 | - py==1.11.0 200 | - pycodestyle==2.9.1 201 | - pyflakes==2.5.0 202 | - pynacl==1.5.0 203 | - pytest==7.1.3 204 | - pytz==2022.2.1 205 | - querystring-parser==1.2.4 206 | - rich==12.5.1 207 | - scipy==1.9.1 208 | - secretstorage==3.3.3 209 | - sh==1.14.3 210 | - smmap==5.0.0 211 | - sqlalchemy==1.4.41 212 | - sqlparse==0.4.3 213 | - tabulate==0.8.10 214 | - toml==0.10.2 215 | - tomli==2.0.1 216 | - torch-tb-profiler==0.4.0 217 | - torchtext==0.13.1 218 | - types-cryptography==3.3.23 219 | - virtualenv==20.16.5 220 | - websocket-client==1.4.1 221 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = asmr_ml_template 3 | version = 0.0.1 4 | author = Example Author 5 | author_email = author@example.com 6 | description = A 
small example package 7 | long_description = file: README.md 8 | long_description_content_type = text/markdown 9 | url = https://github.com/pypa/sampleproject 10 | project_urls = 11 | Bug Tracker = https://github.com/pypa/sampleproject/issues 12 | classifiers = 13 | Programming Language :: Python :: 3 14 | License :: OSI Approved :: MIT License 15 | Operating System :: OS Independent 16 | 17 | [options] 18 | package_dir = 19 | = . 20 | packages = find: 21 | python_requires = >=3.6 22 | 23 | [options.packages.find] 24 | where = . 25 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup() 4 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/auto-sys-ml-template/8884d8f3b826cab300cd8f0e227fd7ad6b887da8/src/__init__.py -------------------------------------------------------------------------------- /src/datamodules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/auto-sys-ml-template/8884d8f3b826cab300cd8f0e227fd7ad6b887da8/src/datamodules/__init__.py -------------------------------------------------------------------------------- /src/datamodules/mnist_datamodule.py: -------------------------------------------------------------------------------- 1 | # credits: https://github.com/ashleve/lightning-hydra-template/tree/main/src/datamodules 2 | from typing import Dict, Optional, Tuple, Union 3 | 4 | import torch 5 | from pytorch_lightning import LightningDataModule 6 | from torch.utils.data import ConcatDataset, DataLoader, Dataset, random_split 7 | from torchvision.datasets import MNIST 8 | from torchvision.transforms import transforms 9 | 10 | 11 | class MNISTDataModule(LightningDataModule): 12 | """Example of a LightningDataModule for the MNIST dataset. 13 | 14 | A DataModule implements 5 key methods: 15 | - prepare_data (things to do on 1 GPU/TPU, not on every GPU/TPU in distributed mode) 16 | - setup (things to do on every accelerator in distributed mode) 17 | - train_dataloader (the training dataloader) 18 | - val_dataloader (the validation dataloader(s)) 19 | - test_dataloader (the test dataloader(s)) 20 | 21 | This allows you to share a full dataset without explaining how to download, 22 | split, transform and process the data. 23 | 24 | Read the docs: 25 | https://pytorch-lightning.readthedocs.io/en/latest/extensions/datamodules.html 26 | """ 27 | 28 | def __init__( 29 | self, file_params: Dict[str, Union[str, Tuple[int, int, int]]], train_params: Dict[str, Union[int, bool]] 30 | ): 31 | super().__init__() 32 | 33 | # this line allows access to init params via the 'self.hparams' attribute 34 | self.save_hyperparameters(logger=False) 35 | 36 | # data transformations 37 | self.transforms = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]) 38 | 39 | self.data_train: Optional[Dataset] = None 40 | self.data_val: Optional[Dataset] = None 41 | self.data_test: Optional[Dataset] = None 42 | 43 | @property 44 | def num_classes(self) -> int: 45 | return 10 46 | 47 | def prepare_data(self): 48 | """Download data if needed. 49 | 50 | This method is called only from a single GPU. 51 | Do not use it to assign state (self.x = y). 
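Downloads belong here; per-process state such as the split datasets belongs in `setup` below.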
52 | """ 53 | MNIST(self.hparams.file_params["base_dir"], train=True, download=True) 54 | MNIST(self.hparams.file_params["base_dir"], train=False, download=True) 55 | 56 | def setup(self, stage: Optional[str] = None): 57 | """Load data. Set variables: `self.data_train`, `self.data_val`, `self.data_test`. 58 | 59 | This method is called by lightning when doing `trainer.fit()` and `trainer.test()`, 60 | so be careful not to execute the random split twice! The `stage` can be used to 61 | differentiate whether it's called before trainer.fit()` or `trainer.test()`. 62 | """ 63 | 64 | # load datasets only if they're not loaded already 65 | if not self.data_train and not self.data_val and not self.data_test: 66 | trainset = MNIST(self.hparams.file_params["base_dir"], train=True, transform=self.transforms) 67 | testset = MNIST(self.hparams.file_params["base_dir"], train=False, transform=self.transforms) 68 | dataset = ConcatDataset(datasets=[trainset, testset]) 69 | self.data_train, self.data_val, self.data_test = random_split( 70 | dataset=dataset, 71 | lengths=self.hparams.file_params["train_val_test_split"], 72 | generator=torch.Generator().manual_seed(42), 73 | ) 74 | 75 | def train_dataloader(self): 76 | return DataLoader( 77 | dataset=self.data_train, 78 | batch_size=self.hparams.train_params["batch_size"], 79 | num_workers=self.hparams.train_params["num_workers"], 80 | pin_memory=self.hparams.train_params["pin_memory"], 81 | shuffle=True, 82 | ) 83 | 84 | def val_dataloader(self): 85 | return DataLoader( 86 | dataset=self.data_val, 87 | batch_size=self.hparams.train_params["batch_size"], 88 | num_workers=self.hparams.train_params["num_workers"], 89 | pin_memory=self.hparams.train_params["pin_memory"], 90 | shuffle=False, 91 | ) 92 | 93 | def test_dataloader(self): 94 | return DataLoader( 95 | dataset=self.data_test, 96 | batch_size=self.hparams.train_params["batch_size"], 97 | num_workers=self.hparams.train_params["num_workers"], 98 | pin_memory=self.hparams.train_params["pin_memory"], 99 | shuffle=False, 100 | ) 101 | -------------------------------------------------------------------------------- /src/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/auto-sys-ml-template/8884d8f3b826cab300cd8f0e227fd7ad6b887da8/src/models/__init__.py -------------------------------------------------------------------------------- /src/models/components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/auto-sys-ml-template/8884d8f3b826cab300cd8f0e227fd7ad6b887da8/src/models/components/__init__.py -------------------------------------------------------------------------------- /src/models/components/simple_dense_net.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | 4 | class SimpleDenseNet(nn.Module): 5 | def __init__( 6 | self, 7 | input_size: int = 784, 8 | lin1_size: int = 256, 9 | lin2_size: int = 256, 10 | lin3_size: int = 256, 11 | output_size: int = 10, 12 | ): 13 | super().__init__() 14 | 15 | self.model = nn.Sequential( 16 | nn.Linear(input_size, lin1_size), 17 | nn.BatchNorm1d(lin1_size), 18 | nn.ReLU(), 19 | nn.Linear(lin1_size, lin2_size), 20 | nn.BatchNorm1d(lin2_size), 21 | nn.ReLU(), 22 | nn.Linear(lin2_size, lin3_size), 23 | nn.BatchNorm1d(lin3_size), 24 | nn.ReLU(), 25 | nn.Linear(lin3_size, output_size), 26 | ) 27 | 28 | def forward(self, x): 
29 | batch_size, channels, width, height = x.size() 30 | 31 | # (batch, 1, width, height) -> (batch, 1*width*height) 32 | x = x.view(batch_size, -1) 33 | 34 | return self.model(x) 35 | -------------------------------------------------------------------------------- /src/models/mnist_module.py: -------------------------------------------------------------------------------- 1 | # credits: https://github.com/ashleve/lightning-hydra-template/blob/main/src/models/mnist_module.py 2 | from typing import Any, Dict, List, Union 3 | 4 | import torch 5 | from pytorch_lightning import LightningModule 6 | from torchmetrics import MaxMetric 7 | from torchmetrics.classification.accuracy import Accuracy 8 | 9 | from src.models.components.simple_dense_net import SimpleDenseNet 10 | 11 | 12 | class MNISTLitModule(LightningModule): 13 | """Example of a LightningModule for MNIST classification. 14 | 15 | A LightningModule organizes your PyTorch code into 5 sections: 16 | - Computations (init) 17 | - Train loop (training_step) 18 | - Validation loop (validation_step) 19 | - Test loop (test_step) 20 | - Optimizers (configure_optimizers) 21 | 22 | Read the docs: 23 | https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html 24 | """ 25 | 26 | def __init__( 27 | self, 28 | mlp_config: Dict[str, dict], 29 | optimizer_config: Dict[str, Union[float, str, list]], 30 | ): 31 | super().__init__() 32 | 33 | # this line allows access to init params via the 'self.hparams' attribute 34 | # it also ensures init params will be stored in the ckpt 35 | self.save_hyperparameters("mlp_config", "optimizer_config") 36 | 37 | self.net = SimpleDenseNet(**mlp_config) 38 | 39 | # loss function 40 | self.criterion = torch.nn.CrossEntropyLoss() 41 | 42 | # use a separate metric instance for the train, val and test steps 43 | # to ensure a proper reduction over the epoch 44 | self.train_acc = Accuracy() 45 | self.val_acc = Accuracy() 46 | self.test_acc = Accuracy() 47 | 48 | # for logging the best-so-far validation accuracy 49 | self.val_acc_best = MaxMetric() 50 | 51 | def forward(self, x: torch.Tensor): 52 | return self.net(x) 53 | 54 | def on_train_start(self): 55 | # by default, Lightning executes validation-step sanity checks before training starts, 56 | # so we need to make sure val_acc_best doesn't store accuracy from these checks 57 | self.val_acc_best.reset() 58 | 59 | def step(self, batch: Any): 60 | x, y = batch 61 | logits = self.forward(x) 62 | loss = self.criterion(logits, y) 63 | preds = torch.argmax(logits, dim=1) 64 | return loss, preds, y 65 | 66 | def training_step(self, batch: Any, batch_idx: int): 67 | loss, preds, targets = self.step(batch) 68 | 69 | # log train metrics 70 | acc = self.train_acc(preds, targets) 71 | self.log("train/loss", loss, on_step=False, on_epoch=True, prog_bar=False) 72 | self.log("train/acc", acc, on_step=False, on_epoch=True, prog_bar=True) 73 | 74 | # we can return a dict with any tensors here 75 | # and then read it in some callback or in `training_epoch_end()` below 76 | # remember to always return the loss from `training_step()`, or else backpropagation will fail! 
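# (for example, a callback could concatenate `preds` and `targets` across batches to build a confusion matrix)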
77 | return {"loss": loss, "preds": preds, "targets": targets} 78 | 79 | def training_epoch_end(self, outputs: List[Any]): 80 | # `outputs` is a list of dicts returned from `training_step()` 81 | self.train_acc.reset() 82 | 83 | def validation_step(self, batch: Any, batch_idx: int): 84 | loss, preds, targets = self.step(batch) 85 | 86 | # log val metrics 87 | acc = self.val_acc(preds, targets) 88 | self.log("val/loss", loss, on_step=False, on_epoch=True, prog_bar=False) 89 | self.log("val/acc", acc, on_step=False, on_epoch=True, prog_bar=True) 90 | 91 | return {"loss": loss, "preds": preds, "targets": targets} 92 | 93 | def validation_epoch_end(self, outputs: List[Any]): 94 | acc = self.val_acc.compute() # get val accuracy from current epoch 95 | self.val_acc_best.update(acc) 96 | self.log("val/acc_best", self.val_acc_best.compute(), on_epoch=True, prog_bar=True) 97 | 98 | self.val_acc.reset() # reset val accuracy for next epoch 99 | 100 | def test_step(self, batch: Any, batch_idx: int): 101 | loss, preds, targets = self.step(batch) 102 | 103 | # log test metrics 104 | acc = self.test_acc(preds, targets) 105 | self.log("test/loss", loss, on_step=False, on_epoch=True) 106 | self.log("test/acc", acc, on_step=False, on_epoch=True) 107 | 108 | return {"loss": loss, "preds": preds, "targets": targets} 109 | 110 | def test_epoch_end(self, outputs: List[Any]): 111 | self.test_acc.reset() 112 | 113 | def configure_optimizers(self): 114 | """Choose what optimizers and learning-rate schedulers to use in your optimization. 115 | Normally you'd need one. But in the case of GANs or similar you might have multiple. 116 | 117 | See examples here: https://pytorch- 118 | lightning.readthedocs.io/en/latest/common/lightning_module.html#configure-optimizers 119 | """ 120 | return torch.optim.Adam( 121 | params=self.parameters(), 122 | lr=self.hparams.optimizer_config["lr"], 123 | weight_decay=self.hparams.optimizer_config["weight_decay"], 124 | ) 125 | -------------------------------------------------------------------------------- /src/train.py: -------------------------------------------------------------------------------- 1 | from omegaconf import OmegaConf 2 | 3 | from utils import pl_utils 4 | 5 | 6 | def main(cfg): 7 | trainer = pl_utils.instantiate_trainer(cfg) 8 | model = pl_utils.instantiate_class(cfg["model"]) 9 | datamodule = pl_utils.instantiate_class(cfg["data"]) 10 | 11 | trainer.fit(model, datamodule) 12 | 13 | 14 | if __name__ == "__main__": 15 | cfg = OmegaConf.from_cli() 16 | 17 | if "base" in cfg: 18 | basecfg = OmegaConf.load(cfg.base) 19 | del cfg.base 20 | cfg = OmegaConf.merge(basecfg, cfg) 21 | cfg = OmegaConf.to_container(cfg, resolve=True) 22 | print(OmegaConf.to_yaml(cfg)) 23 | main(cfg) 24 | else: 25 | raise SystemExit("Base configuration file not specified! Exiting.") 26 | -------------------------------------------------------------------------------- /src/utils/pl_utils.py: -------------------------------------------------------------------------------- 1 | from importlib import import_module 2 | from typing import Any, Dict, List 3 | 4 | import pytorch_lightning as pl 5 | from pytorch_lightning import Callback, Trainer 6 | from pytorch_lightning.loggers import LightningLoggerBase 7 | 8 | 9 | def instantiate_class(init: Dict[str, Any]) -> Any: 10 | """Instantiates a class with the given args and init. 11 | 12 | Args: 13 | todo 14 | 15 | Returns: 16 | The instantiated class object. 
17 | """ 18 | kwargs = {k: init[k] for k in set(list(init.keys())) - {"_target_"}} 19 | 20 | class_module, class_name = init["_target_"].rsplit(".", 1) 21 | module = import_module(class_module, package=class_name) 22 | args_class = getattr(module, class_name) 23 | return args_class(**kwargs) 24 | 25 | 26 | def instantiate_callbacks(callbacks_cfg: dict) -> List[Callback]: 27 | """Instantiates callbacks from config.""" 28 | callbacks: List[Callback] = [] 29 | 30 | if not callbacks_cfg: 31 | return callbacks 32 | 33 | if not isinstance(callbacks_cfg, dict): 34 | raise TypeError("Callbacks config must be a DictConfig!") 35 | 36 | for _, cb_conf in callbacks_cfg.items(): 37 | if isinstance(cb_conf, dict) and "_target_" in cb_conf: 38 | callbacks.append(instantiate_class(cb_conf)) 39 | 40 | return callbacks 41 | 42 | 43 | def instantiate_loggers(logger_cfg: dict) -> List[LightningLoggerBase]: 44 | """Instantiates loggers from config.""" 45 | logger: List[LightningLoggerBase] = [] 46 | 47 | if not logger_cfg: 48 | return logger 49 | 50 | if not isinstance(logger_cfg, dict): 51 | raise TypeError("Logger config must be a Dict!") 52 | 53 | for _, lg_conf in logger_cfg.items(): 54 | if isinstance(lg_conf, dict) and "_target_" in lg_conf: 55 | logger.append(instantiate_class(lg_conf)) 56 | 57 | return logger 58 | 59 | 60 | def instantiate_trainer(cfg: dict): 61 | if cfg.get("seed", None): 62 | pl.seed_everything(cfg["seed"], workers=True) 63 | 64 | callbacks: List[Callback] = instantiate_callbacks(cfg.get("callbacks")) 65 | logger: List[LightningLoggerBase] = instantiate_loggers(cfg.get("logger")) 66 | trainer: Trainer = Trainer(**cfg["trainer"], callbacks=callbacks, logger=logger) 67 | 68 | return trainer 69 | -------------------------------------------------------------------------------- /src/utils/system_monitor.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import subprocess as sp 4 | import time 5 | 6 | import mlflow 7 | import psutil 8 | 9 | 10 | def get_gpu_mem_info(): 11 | output_to_list = lambda x: x.decode("ascii").split("\n")[:-1] 12 | COMMAND = "nvidia-smi --query-gpu=index,utilization.gpu,memory.used,memory.total --format=csv" 13 | 14 | try: 15 | memory_use_info = output_to_list(sp.check_output(COMMAND.split(), stderr=sp.STDOUT))[1:] 16 | except sp.CalledProcessError as e: 17 | raise RuntimeError(f"command '{e.cmd}' return with error (code {e.returncode}): {e.output}") 18 | return memory_use_info 19 | 20 | 21 | def get_dist_info(): 22 | dist_info_config = { 23 | "node_rank": "NODE_RANK", 24 | "local_rank": "LOCAL_RANK", 25 | "world_rank": "RANK", 26 | "world_size": "WORLD_SIZE", 27 | } 28 | 29 | dist_info = {key: os.environ.get(value) for key, value in dist_info_config.items()} 30 | 31 | # Single GPU job 32 | if dist_info["world_size"] is None: 33 | dist_info["node_rank"] = 0 34 | dist_info["world_rank"] = 0 35 | dist_info["local_rank"] = 0 36 | dist_info["world_size"] = 1 37 | 38 | dist_info = {key: int(value) for (key, value) in dist_info.items()} 39 | return dist_info 40 | 41 | 42 | def main(args): 43 | # run on each node but only on one process corresponding to first gpu 44 | dist_info = get_dist_info() 45 | if not (dist_info["local_rank"] == 0): 46 | return 47 | 48 | node_rank = dist_info["node_rank"] + 1 # one index for display 49 | 50 | metrics = {} 51 | metrics = { 52 | f"monitor/node_{node_rank:02}/ram_usage_percent": 0.0, 53 | f"monitor/node_{node_rank:02}/ram_usage_GB": 0.0, 54 | 
f"monitor/node_{node_rank:02}/cpu_usage_percent": 0.0, 55 | f"monitor/node_{node_rank:02}/swap": 0.0, 56 | } 57 | 58 | memory_use_info_list = get_gpu_mem_info() 59 | idx, gpu_percent, gpu_mem_used, gpu_mem_total = memory_use_info_list[0].split(",") 60 | gpu_mem_total = float(gpu_mem_total.split("MiB")[0]) 61 | 62 | for gpu_idx, gpu_info in enumerate(memory_use_info_list, 1): 63 | metrics[f"monitor/node_{node_rank:02}/gpu_{gpu_idx:02}/usage_percent"] = 0.0 64 | metrics[f"monitor/node_{node_rank:02}/gpu_{gpu_idx:02}/mem_used_GB"] = 0.0 65 | metrics[f"monitor/node_{node_rank:02}/gpu_{gpu_idx:02}/mem_used_percent"] = 0.0 66 | 67 | now = 0 68 | dt_sleep = args.watch_every_n_seconds 69 | 70 | while True: 71 | metrics[f"monitor/node_{node_rank:02}/ram_usage_GB"] = psutil.virtual_memory().used / 2**30 72 | metrics[f"monitor/node_{node_rank:02}/ram_usage_percent"] = psutil.virtual_memory().percent 73 | metrics[f"monitor/node_{node_rank:02}/cpu_usage_percent"] = psutil.cpu_percent() 74 | metrics[f"monitor/node_{node_rank:02}/swap"] = psutil.swap_memory().percent 75 | 76 | memory_use_info_list = get_gpu_mem_info() 77 | for gpu_idx, gpu_info in enumerate(memory_use_info_list, 1): 78 | _, gpu_percent, gpu_mem_used, _ = gpu_info.split(",") 79 | gpu_percent = float(gpu_percent.split("%")[0]) 80 | gpu_mem_used = float(gpu_mem_used.split("MiB")[0]) 81 | gpu_mem_percent = gpu_mem_used / gpu_mem_total * 100.0 82 | gpu_mem_used /= 1024.0 83 | 84 | metrics[f"monitor/node_{node_rank:02}/gpu_{gpu_idx:02}/usage_percent"] = gpu_percent 85 | metrics[f"monitor/node_{node_rank:02}/gpu_{gpu_idx:02}/mem_used_GB"] = gpu_mem_used 86 | metrics[f"monitor/node_{node_rank:02}/gpu_{gpu_idx:02}/mem_used_percent"] = gpu_mem_percent 87 | 88 | # for key, value in metrics.items(): 89 | # print(f"{key}: {value}") 90 | mlflow.log_metrics(metrics, step=now) 91 | 92 | time.sleep(dt_sleep) 93 | now += dt_sleep 94 | 95 | 96 | def get_parsed_args(): 97 | parser = argparse.ArgumentParser() 98 | parser.add_argument("--watch_every_n_seconds", type=int, default=5) 99 | args = parser.parse_args() 100 | return args 101 | 102 | 103 | if __name__ == "__main__": 104 | print("system_monitor.py begins") 105 | args = get_parsed_args() 106 | main(args) 107 | print("system_monitor.py done") 108 | -------------------------------------------------------------------------------- /tests/helpers.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import pytest 4 | import sh 5 | 6 | 7 | def run_command(command: List[str]): 8 | """Default method for executing shell commands with pytest.""" 9 | msg = None 10 | try: 11 | sh.python(command) 12 | except sh.ErrorReturnCode as e: 13 | msg = e.stderr.decode() 14 | if msg: 15 | pytest.fail(msg=msg) 16 | -------------------------------------------------------------------------------- /tests/test_dev_fast_run.py: -------------------------------------------------------------------------------- 1 | import pytest # nopycln: import 2 | from helpers import run_command 3 | 4 | 5 | def test_fast_dev_run(): 6 | """Test running for 1 train, val and test batch.""" 7 | command = [ 8 | "src/train.py", 9 | "base=configs/train.yaml", 10 | "trainer.fast_dev_run=true", 11 | ] 12 | run_command(command) 13 | 14 | 15 | # cpu only test for CI 16 | def test_fast_dev_run_cpu(): 17 | """Test running for 1 train, val and test batch.""" 18 | command = [ 19 | "src/train.py", 20 | "base=configs/train.yaml", 21 | "trainer.fast_dev_run=true", 22 | "trainer.accelerator=cpu", 23 | 
"trainer.sync_batchnorm=false", 24 | ] 25 | run_command(command) 26 | --------------------------------------------------------------------------------