├── .gitignore ├── .pre-commit-config.yaml ├── CITATION.cff ├── LICENSE ├── LICENSE.ml-template ├── README.md ├── data └── README.md ├── installation ├── conda-osx-arm64-mps │ ├── README.md │ ├── environment.yml │ └── update-env-file.sh ├── docker-amd64-cuda │ ├── .dockerignore │ ├── CSCS-Clariden-setup │ │ ├── .gitignore │ │ ├── README.md │ │ └── example-submit-scripts │ │ │ ├── README.md │ │ │ ├── edf.toml │ │ │ ├── minimal.sh │ │ │ ├── remote-development.sh │ │ │ ├── test-interactive.sh │ │ │ ├── unattended-distributed.sh │ │ │ └── unattended.sh │ ├── Dockerfile │ ├── Dockerfile-user │ ├── EPFL-SCITAS-setup │ │ ├── .gitignore │ │ ├── README.md │ │ └── example-submit-scripts │ │ │ ├── README.md │ │ │ ├── minimal.sh │ │ │ ├── remote-development.sh │ │ │ ├── unattended-distributed.sh │ │ │ └── unattended.sh │ ├── EPFL-runai-setup │ │ ├── .gitignore │ │ ├── README.md │ │ └── example-submit-scripts │ │ │ ├── minimal.sh │ │ │ ├── remote-development.sh │ │ │ └── unattended.sh │ ├── LICENSE.cresset │ ├── README.md │ ├── apt.txt │ ├── compose-base.yaml │ ├── compose.yaml │ ├── entrypoints │ │ ├── entrypoint.sh │ │ ├── logins-setup.sh │ │ ├── pre-entrypoint.sh │ │ └── remote-development-setup.sh │ ├── from-python-template │ │ ├── Dockerfile │ │ ├── compose-base.yaml │ │ ├── requirements.txt │ │ └── update-env-file.sh │ ├── from-scratch-template │ │ ├── Dockerfile │ │ ├── compose-base.yaml │ │ ├── environment.yml │ │ └── update-env-file.sh │ ├── requirements.txt │ ├── template.sh │ └── update-env-file.sh └── edit-platform-and-acceleration.sh ├── outputs └── README.md ├── pyproject.toml ├── reproducibility-scripts ├── README.md ├── template-experiment.sh └── template-sweep.yaml ├── src └── template_package_name │ ├── __init__.py │ ├── configs │ ├── override │ │ ├── .gitignore │ │ ├── README.md │ │ └── template-experiment.yaml │ ├── setup.yaml │ └── template-experiment.yaml │ ├── template_experiment.py │ └── utils │ ├── __init__.py │ ├── config.py │ └── seeding.py ├── 
template ├── README.md ├── change-project-name.sh └── template-variables.env └── tests └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 
92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | !**/configs/env 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | 132 | # JetBrains 133 | .idea/* 134 | !.idea/runConfigurations/ 135 | 136 | # macOS 137 | .DS_Store 138 | 139 | # Project 140 | data/* 141 | !data/README.md 142 | 143 | outputs/* 144 | !outputs/README.md 145 | 146 | wandb 147 | **/*.out 148 | 149 | third-party 150 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # Pre-commit sets git hooks for identifying simple issues when committing code. 
2 | # See https://pre-commit.com for more information 3 | # See https://pre-commit.com/hooks.html for more hooks 4 | 5 | default_language_version: 6 | python: python3.10 7 | 8 | repos: 9 | - repo: https://github.com/pre-commit/pre-commit-hooks 10 | rev: v5.0.0 11 | hooks: 12 | - id: check-added-large-files 13 | - id: check-symlinks 14 | - id: destroyed-symlinks 15 | - id: trailing-whitespace 16 | - id: end-of-file-fixer 17 | - id: mixed-line-ending 18 | - id: check-yaml 19 | - id: check-toml 20 | - id: check-added-large-files 21 | - id: check-merge-conflict 22 | - id: check-shebang-scripts-are-executable 23 | - id: detect-private-key 24 | - id: debug-statements 25 | - id: check-case-conflict 26 | - repo: https://github.com/python/black 27 | rev: 25.1.0 28 | hooks: 29 | - id: black 30 | - id: black-jupyter 31 | - repo: https://github.com/PyCQA/isort 32 | rev: 6.0.1 33 | hooks: 34 | - id: isort 35 | args: [ "--profile", "black", "--filter-files" ] 36 | - repo: https://github.com/codespell-project/codespell 37 | rev: v2.4.1 38 | hooks: 39 | - id: codespell 40 | args: [ "--skip=*.ipynb" ] 41 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | # This CITATION.cff file was generated with cffinit. 2 | # Visit https://bit.ly/cffinit to generate yours today! 3 | 4 | cff-version: 1.2.0 5 | title: Python Machine Learning Research Template 6 | message: >- 7 | If you use this template or borrow some of its code, please 8 | cite it as shown below. 9 | type: software 10 | authors: 11 | - given-names: Skander 12 | family-names: Moalla 13 | email: skander.moalla@epfl.ch 14 | affiliation: EPFL 15 | orcid: 'https://orcid.org/0000-0002-8494-8071' 16 | repository-code: 'https://github.com/CLAIRE-Labo/python-ml-research-template' 17 | abstract: >- 18 | A template for starting Python machine learning research 19 | projects with hardware acceleration. 
It features reproducible 20 | environments on major platforms, a great development experience, 21 | Python project packaging following PyPA guidelines to avoid 22 | hacky imports, experiment management and tracking with Hydra 23 | and Weights & Biases, checkpointing for research experiments 24 | compatible with Weights & Biases, and code quality enforcement 25 | with pre-commit. 26 | keywords: 27 | - python 28 | - machine learning 29 | - reproducibility 30 | - containers 31 | - template 32 | license: MIT 33 | doi: 10.5281/zenodo.15609829 34 | version: 0.1.0 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Skander Moalla 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /LICENSE.ml-template: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Skander Moalla 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | > [!TIP] 2 | > 🌟 If you like this template, please give it a star! 🌟 3 | > 4 | > 📜 If you use this template, borrow some of its code, or refer to it, please cite it as shown on GitHub! 📜 5 | 6 | > [!NOTE] 7 | > **TEMPLATE TODO:** 8 | > Replace the title below with your project title, then delete this note. 
9 | 10 | # Python Machine Learning Research Template 11 | 12 | ## Overview 13 | 14 | > [!NOTE] 15 | > **TEMPLATE TODO:** 16 | > Replace the description below with a description of your project, then delete this note. 17 | 18 | A template for starting Python machine-learning research 19 | projects with hardware acceleration featuring: 20 | 21 | - ✅ Reproducible environments on major platforms with hardware acceleration with a great development experience 22 | covering multiple use cases: 23 | - 💻 local machines, e.g., macOS (+ Apple Silicon/MPS) and Linux/Windows WSL (+ NVIDIA GPU). 24 | - 🌐 Remote Linux servers with GPUs, e.g., VMs on cloud providers and IC and RCP HaaS at EPFL. 25 | - ☁️ Managed clusters supporting OCI containers with GPUs, e.g., the EPFL IC and RCP Run:ai (Kubernetes) clusters 26 | and the SCITAS Slurm clusters. 27 | - 📦 Python project packaging following the 28 | [PyPA packaging guidelines](https://packaging.python.org/en/latest/tutorials/packaging-projects/) to avoid hacky 29 | imports. 30 | - 📊 Experiment management, tracking, and sharing with [Hydra](https://hydra.cc/) 31 | and [Weights & Biases](https://wandb.ai/site). 32 | - 💾 Checkpointing setup for research experiments compatible with Weights & Biases. 33 | - 🧹 Code quality with [pre-commit](https://pre-commit.com) hooks. 34 | 35 | 🤝 The template makes collaboration and open-sourcing straightforward, avoiding setup issues and 36 | [maximizing impact](https://medium.com/paperswithcode/ml-code-completeness-checklist-e9127b168501#a826). 37 | 38 | 🏆 The practices in this template earned its authors 39 | an [Outstanding Paper (Honorable Mention)](https://openreview.net/forum?id=E0qO5dI5aEn) 40 | at the [ML Reproducibility Challenge 2022](https://paperswithcode.com/rc2022). 
41 | 42 | 📌 Projects made with the template would look like 43 | [this toy project](https://github.com/skandermoalla/pytoych-benchmark) 44 | or [this paper](https://github.com/CLAIRE-Labo/no-representation-no-trust) whose curves have been exactly reproduced 45 | (exact same numbers) on multiple different platforms (EPFL Kubernetes cluster, VM on GCP, HPC cluster with Apptainer). 46 | 47 | 📖 Follow this README to get started with the template. 48 | 49 | For a brief discussion of the template's design choices, features, and a Q&A, check the `template/README.md` file. 50 | 51 | ## Getting started with the template 52 | 53 | > [!NOTE] 54 | > **TEMPLATE TODO:** 55 | > Delete this whole section when you're done with the template getting started. 56 | 57 | Click on the `Use this template` GitHub button to create a new GitHub repository from this template. 58 | Give it a lowercase hyphen-separated name (we will refer to this name as `PROJECT_NAME`), 59 | then follow the instructions below to set up your project. 60 | You can also give your GitHub repo another name format if you prefer, but for the template, you will have to pick 61 | a `PROJECT_NAME` as well. 62 | 63 | It's useful to commit after some checkpoints to be able to go back if you make a mistake. 64 | Some instructions will send you to different READMEs in the template that will compile nicely together in the end. 65 | Remember to get back to this root one after finishing each step. 66 | 67 | 1. Clone the repo with destination `PROJECT_NAME`. See where and how below: 68 | - If you plan to develop on your local computer, clone it there. 69 | - If you plan to develop or deploy on a remote server/cluster without access to a build engine 70 | (e.g., EPFL Run:ai/Kubernetes clusters, SCITAS clusters), clone on your local machine. 71 | (You will build the image on your local machine, then clone on your server for deployment. 72 | Docker allows cross-platform builds with emulation, but it can be slow.
73 | We would recommend that your local machine is of the same platform as the cluster (e.g. `amd64`, `arm64`), 74 | or that you have access to a remote Docker engine running on the same platform as the cluster.) 75 | - If you plan to develop on a remote server/cluster with access to a build engine 76 | (e.g. EPFL HaaS, CSCS Clariden), clone it there. 77 | ```bash 78 | # For your local machine clone anywhere 79 | 80 | # For clusters with scratch filesystems with a cleaning policy, clone in your home directory (no cleaning policy). 81 | # The training artifacts will be later stored on the scratch filesystem and symlinked to this directory. 82 | 83 | # Note the creation of a `dev` instance of the repo (And later `run` instance for unattended jobs) 84 | # This allows to run unattended jobs in the `run` while changing the code in the `dev`. 85 | mkdir PROJECT_NAME 86 | cd PROJECT_NAME 87 | git clone dev 88 | cd dev 89 | # The current directory is referred to as PROJECT_ROOT 90 | ``` 91 | We will refer to the absolute path to the root of the repository as `PROJECT_ROOT`. 92 | 2. Fill the template variables in `template/template-variables.env` by 93 | editing the ones with the `$NEW_` prefix, then run the script 94 | ```bash 95 | # After filling the template variables in template/template-variables.env. 96 | ./template/change-project-name.sh 97 | ``` 98 | Commit. 99 | 3. Initialize the pre-commit hooks as described in the [contributing](#contributing) section. 100 | Update them to their latest version with `pre-commit autoupdate`. 101 | Commit. 102 | 4. Edit the `LICENSE` file, or delete it and remember to add one when open-sourcing your code. 103 | [(Some help here).](https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/licensing-a-repository) 104 | A simple change if you're fine with the MIT license is to replace the `2022 Skander Moalla` with your year and name. 105 | Same for the `CITATION.cff` file. 
106 | Commit. 107 | 5. Set up and edit the development environment instructions for the methods and platforms you will use and support. 108 | Each method supports a group of use cases: 109 | - **Docker**. 110 | This is the preferred method to run on Linux machines (e.g. EPFL HaaS servers), 111 | Windows machines with WSL, clusters running OCI-compliant containers like the EPFL Run:ai (Kubernetes) clusters, 112 | the SCITAS clusters, and other cloud services. 113 | (We provide tutorials for deployment with Docker, on the EPFL runai cluster, and on the SCITAS cluster.) 114 | 115 | The environment is shipped as a Linux Docker image, ensuring the highest level of reproducibility. 116 | You are free to choose the architecture you want to build the image for, 117 | e.g. `amd64` or `arm64`. 118 | By default, this image is set up for `amd64`. 119 | You are also free to choose the hardware acceleration you want to support. 120 | By default, this template allows local deployment with NVIDIA GPUs and can extend 121 | [NGC images](https://catalog.ngc.nvidia.com/containers). 122 | 123 | If you plan to support multiple platforms or hardware accelerations, 124 | you can duplicate this installation method 125 | or adapt it to support multiple platforms at the same time. 126 | 127 | Go to `installation/docker-amd64-cuda/README.md` for the setup. 128 | Come back here after following the instructions there. 129 | 130 | - **Conda**. 131 | The environment is shipped as a conda environment file. 132 | The level of reproducibility is lower than with Docker, as system dependencies will not be strictly recorded. 133 | The only reason this option is available is to leverage hardware acceleration of platforms not compatible with 134 | OCI containers, in particular, [MPS](https://developer.apple.com/metal/pytorch/) 135 | which is [not supported](https://github.com/pytorch/pytorch/issues/81224) 136 | on Docker for macOS with Apple Silicon. 
137 | 138 | By default, this option is set up for `osx-arm64` to run on macOS with Apple Silicon. 139 | This installation method could also be used if you want to settle for a lower level of reproducibility 140 | and do not need to run on container clusters. 141 | In that case, you might support another platform, e.g. `amd64`, and hardware acceleration, e.g., NVIDIA GPUs. 142 | 143 | If you plan to support multiple platforms or hardware accelerations, 144 | you can duplicate this installation method 145 | or adapt it to support multiple platforms at the same time. 146 | 147 | Go to `installation/conda-osx-arm64-mps/README.md` for the setup. 148 | Come back here after following the instructions there. 149 | 150 | Delete the installation directory for the installation method you don't use. 151 | 152 | Naturally, results will be reproducible on machines with the same architecture and hardware acceleration 153 | using the same installation method, 154 | but not necessarily across architectures and installation methods. 155 | This is because dependency versions may vary across platforms. 156 | Try to keep the dependency versions close to ensure an easy replicability of your results. 157 | 158 | 6. Edit this `README.md` file. 159 | 1. Edit the title with the name of your project. 160 | Replace the [Overview](#overview) section with a description of your project. 161 | 2. Delete the installation options you don't support in 162 | the [Getting Started](#getting-started) section. 163 | 3. Have a look at the last paragraph below describing how to keep your project in good shape, 164 | then delete this getting started, to only keep the project [Getting Started](#getting-started) section. 165 | 166 | You're off to a good start! If you made it here, give the template a star! 167 | Here are a few tips for keeping your project in good shape. 168 | 169 | - Keep this README up to date. 
170 | Fill in the rest of the sections after the Getting Started section when releasing your project. 171 | We give a structure and some templates for those. 172 | 173 | If you use datasets, follow `data/README.md` to set them and write the instructions 174 | for the subsequent users there. 175 | Otherwise, delete the [data](#data) section. 176 | 177 | Similarly, you can use the `outputs/README.md` file to share your trained models, logs, etc. 178 | - Remember to pin your dependencies whenever you install new ones. 179 | This is well described in the Maintaining the environment section of the installation instructions. 180 | - Keep your `reproducibility-scripts/` directory up to date. 181 | Commit it regularly and run your jobs with those scripts. 182 | More on this in the [reproducibility](#reproducing-our-results) section. 183 | - Maintain good commit hooks. More on this in the [Contributing](#contributing) section. 184 | - Have a look at the [ML Code Completeness Checklist](https://github.com/paperswithcode/releasing-research-code). 185 | This template facilitates meeting all the checklist items, with a different design. 186 | Have a look at the checklist when you ship your project. 187 | 188 | ## Getting started 189 | 190 | ### Code and development environment 191 | 192 | > [!NOTE] 193 | > **TEMPLATE TODO**: 194 | > Update the installation methods and platforms you support, delete the rest, and delete this note. 195 | > I.e. keep either Docker or Conda, or both, or multiple of each if you support multiple platforms. 196 | > 1. Specify the platform for each option and its description 197 | > e.g., for Docker amd64, arm64, etc., and for conda osx-arm64, linux-amd64, etc. 198 | > 2. Specify the hardware acceleration options for each platform 199 | > e.g., for Docker NVIDIA GPUs, AMD GPUs etc. 200 | > 3. 
Specify the hardware on which you ran your experiments (e.g., type of CPU/GPU and size of memory) and 201 | > the minimum hardware required to run your code if applicable (e.g., NVIDIA GPU with 80GB of memory). 202 | 203 | We support the following methods and platforms for installing the project dependencies and running the code. 204 | 205 | - **Docker/OCI-container for AMD64 machines (+ NVIDIA GPUs)**: 206 | This option works for machines with AMD64 CPUs and NVIDIA GPUs. 207 | E.g. Linux machines (EPFL HaaS servers, VMs on cloud providers), 208 | Windows machines with WSL, and clusters running OCI-compliant containers, 209 | like the EPFL Run:ai (Kubernetes) clusters. 210 | 211 | Follow the instructions in `installation/docker-amd64-cuda/README.md` to install the environment 212 | then get back here for the rest of the instructions to run the experiments. 213 | 214 | We ran our experiments on TODO: FILL IN THE HARDWARE YOU USED. 215 | To run them, you should have at least TODO: FILL IN THE MINIMUM HARDWARE REQS IF APPLICABLE. 216 | 217 | - **Conda for osx-arm64** 218 | This option works for macOS machines with Apple Silicon and can leverage MPS acceleration. 219 | 220 | Follow the instructions in `installation/conda-osx-arm64-mps/README.md` to install the environment 221 | then get back here for the rest of the instructions to run the experiments. 222 | 223 | We ran our experiments on TODO: FILL IN THE HARDWARE YOU USED. 224 | To run them, you should have at least TODO: FILL IN THE MINIMUM HARDWARE REQS IF APPLICABLE. 225 | 226 | ### Data 227 | 228 | > [!NOTE] 229 | > **TEMPLATE TODO**: 230 | > Fill `data/README.md` or delete this section, then delete this note. 231 | 232 | Refer to `data/README.md`. 233 | 234 | ### Logging and tracking experiments 235 | 236 | We use [Weights & Biases](https://wandb.ai/site) to log and track our experiments. 
237 | If you're logged in, your default entity will be used (a fixed entity is not set in the config), 238 | and you can set another entity with the `WANDB_ENTITY` environment variable. 239 | Otherwise, the runs will be anonymous (you don't need to be logged in). 240 | 241 | ## Reproduction and experimentation 242 | 243 | ### Reproducing our results 244 | 245 | > [!NOTE] 246 | > **TEMPLATE TODO**: 247 | > Keep these scripts up to date and run your experiments using them. 248 | > Do provide the W&B runs and trained models or update this section. 249 | > Delete this note when shipping. 250 | 251 | We provide scripts to reproduce our work in the `reproducibility-scripts/` directory. 252 | It has a README at its root describing which scripts reproduce which experiments. 253 | 254 | We share our Weights and Biases runs in [this W&B project](https://wandb.ai/claire-labo/template-project-name). 255 | 256 | Moreover, we make our trained models available. 257 | You can follow the instructions in `outputs/README.md` to download and use them. 258 | 259 | ### Experiment with different configurations 260 | 261 | The default configuration for each script is stored in the `configs/` directory. 262 | They are managed by [Hydra](https://hydra.cc/docs/intro/). 263 | You can experiment with different configurations by passing the relevant arguments. 264 | You can get examples of how to do so in the `reproducibility-scripts/` directory. 265 | 266 | ## Repository structure 267 | 268 | > [!NOTE] 269 | > **TEMPLATE TODO**: 270 | > Provide a quick overview of the main files in the repo for users to understand your code, 271 | > then delete this note. 272 | 273 | Below, we give a description of the main files and directories in this repository. 274 | 275 | ``` 276 | └─── src/ # Source code. 277 | └── template_package_name # Our package. 278 | ├── configs/ # Hydra configuration files. 279 | └── template_experiment.py # A template experiment. 
280 | ``` 281 | 282 | ## Contributing 283 | 284 | We use [`pre-commit`](https://pre-commit.com) hooks to ensure high-quality code. 285 | Make sure it's installed on the system where you're developing 286 | (it is in the dependencies of the project, but you may be editing the code from outside the development environment. 287 | If you have conda you can install it in your base environment, otherwise, you can install it with `brew`). 288 | Install the pre-commit hooks with 289 | 290 | ```bash 291 | # When in the PROJECT_ROOT. 292 | pre-commit install --install-hooks 293 | ``` 294 | 295 | Then every time you commit, the pre-commit hooks will be triggered. 296 | You can also trigger them manually with: 297 | 298 | ```bash 299 | pre-commit run --all-files 300 | ``` 301 | 302 | ## Licenses and acknowledgements 303 | 304 | This project is licensed under the LICENSE file in the root directory of the project. 305 | 306 | The initial code of this repository has been initiated by the [Python Machine Learning Research Template](https://github.com/CLAIRE-Labo/python-ml-research-template) 307 | with the LICENSE.ml-template file. 308 | 309 | Additional LICENSE files may be present in subdirectories of the project. 310 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | # Instructions for the data 2 | 3 | ## [TEMPLATE] Where and how to set up the data 4 | 5 | > [!IMPORTANT] 6 | > **TEMPLATE TODO:** 7 | > Update the instructions below to explain how to obtain the data and delete this section. 8 | 9 | The template provides the `PROJECT_ROOT/data/` directory as a placeholder for the data used in the project. 10 | This allows the experiment code to always refer to the same path for the data independently of the deployment method 11 | and the user configuration for better reproducibility. 
12 | The directory can be accessed in the experiments with `config.data_dir`. 13 | Of course, this doesn't mean that the datasets inside `PROJECT_ROOT/data/` need to be physically in the same directory 14 | as the project. 15 | You can create symlinks to them. 16 | This shifts the data path configuration from the code and config to the installation steps 17 | (which we prefer, as it makes the committed code identical across deployment options). 18 | This is also more convenient than using environment variables to point to individual dataset locations. 19 | 20 | Below, you can instruct the users on how to download or link to the data and preprocess it. 21 | 22 | When the data is small enough (a few MBs), 23 | you can instruct the users (including you) to download it in the `PROJECT_ROOT/data/` directory. 24 | 25 | Otherwise, you can provide hints to them on how to download it (or reuse parts of it) in a separate storage 26 | (likely in a shared storage where some datasets already exist) and then create symlinks to the different parts. 27 | For managed clusters where you need to mount different filesystems, remember to add this to the deployment scripts 28 | and setup files (e.g. `compose.yaml` for deployment with Docker.) 29 | 30 | Here are example instructions: 31 | 32 | To set up the `data` directory, you can download the data anywhere on your system and then symlink to the data from 33 | the `PROJECT_ROOT/data/` directory. 34 | 35 | ```bash 36 | # The dataset already exists at /absolute-path/to/some-dataset 37 | # FROM the PROJECT_ROOT do 38 | ln -s /absolute-path/to/some-dataset data/some-dataset 39 | # Do this for each dataset root. 40 | # TEMPLATE TODO list all dataset roots (it's better to group them and use the groups accordingly in your code). 41 | ``` 42 | 43 | Be mindful that for the different deployment methods with container engines you will have to mount the filesystems 44 | where the data is stored (E.g.
the local deployment option with Docker, and the container deployment on managed clusters) 45 | 46 | `TEMPLATE TODO:` For the local deployment option with Docker you would edit the `../installation/docker-*/compose.yaml` 47 | file for the local deployment option with Docker, 48 | for the managed clusters you would edit the flags of the cluster client (`runai`, `srun`, etc.). 49 | Avoid nested mounts. 50 | It's better to mount the whole "scratch" filesystem and let the symlinks handle the rest. 51 | 52 | ## Description of the data 53 | 54 | ## Instructions to obtain the data 55 | 56 | ## Instructions to process the data 57 | -------------------------------------------------------------------------------- /installation/conda-osx-arm64-mps/README.md: -------------------------------------------------------------------------------- 1 | # Installation with conda 2 | 3 | ## Template getting started 4 | 5 | > [!NOTE] 6 | > **TEMPLATE TODO:** 7 | > Follow the instructions then delete this section. 8 | 9 | This template provides a minimal `environment.yml` file for setting a conda environment. 10 | Follow the steps below to get started. 11 | Some steps will send you to different sections of the document. 12 | It may feel like jumping back and forth, but everything should read nicely after the setup 13 | for your future users (and yourself). 14 | 15 | 1. Choose the platform and hardware acceleration that you will build the environment for. 16 | You have to pick one as fully specified conda environment files are not trivially 17 | portable across platforms and hardware accelerations. 18 | Packages are different for different platforms and hardware accelerations, 19 | so you cannot freeze an environment used for a platform and create it in another. 20 | 21 | The default platform is macOS on Apple Silicon `osx-arm64` to get support for `mps` hardware acceleration 22 | (reflected in the name of the directory `conda-osx-arm64-mps` by default). 
23 | To edit it, run 24 | ```bash 25 | # When in the PROJECT_ROOT directory. 26 | # For examples run: 27 | ./installation/edit-platform-and-acceleration.sh 28 | # To do the change run: 29 | ./installation/edit-platform-and-acceleration.sh change conda CURR_PLATFORM CURR_ACCELERATION NEW_PLATFORM NEW_ACCELERATION 30 | # For a list of available platforms you can see the installers below 31 | # https://anaconda.org/pytorch/pytorch 32 | # The hardware acceleration will be determined by the packages you install. 33 | # E.g. if you install PyTorch with CUDA, set the acceleration to cuda. 34 | # Note: new PyTorch versions are only distributed on PyPI (i.e. with `pip`). 35 | ``` 36 | If you plan to support multiple platforms or hardware accelerations, 37 | you can duplicate this installation method directory 38 | with `./installation/edit-platform-and-acceleration.sh copy ...` 39 | then perform the setup again. 40 | 2. You can try to specify your dependencies if you are sure of how to install them and that they are compatible. 41 | Otherwise, you should build with the default dependencies and install them interactively in the running container 42 | then freeze them in the dependency files once you are sure of which to include and how to include them. 43 | You will find more information in the [instructions to maintain the environment](#from-python-instructions-to-maintain-the-environment). 44 | The Python version and package name have already been filled by the `fill-template.sh` script. 45 | 46 | If you change the dependency files commit so that you can track what worked and what didn't. 47 | 3. Create the environment following the user 48 | [instructions to create the environment](#creating-the-environment) below. 49 | 4. Get familiar with running the environment following the user [instructions to 50 | run the environment](#running-the-code-in-the-environment). 51 | 5. 
If everything works fine, 52 | (we suggest checking that all of your dependencies are there with `mamba list`, 53 | and trying to import the important ones), 54 | then pin the dependencies you got following the [freeze the environment](#freeze-the-environment) section. 55 | You can then add more dependencies as your project grows following 56 | the [instructions to maintain the environment](#maintaining-the-environment). 57 | Commit. 58 | 6. Go back to the root README for the rest of the instructions to set the template up. 59 | 60 | ## Cloning the repository 61 | 62 | Clone the git repository. 63 | 64 | ```bash 65 | # Keep a /dev copy for development and a /run copy for running unattended experiments. 66 | mkdir template-project-name 67 | cd template-project-name 68 | git clone dev 69 | cd dev 70 | ``` 71 | 72 | We will refer the absolute path to the root of the repository as `PROJECT_ROOT`. 73 | 74 | ## Creating the environment 75 | 76 | **Prerequisites** 77 | 78 | - `brew`: [Homebrew](https://brew.sh/). 79 | - `mamba` (or equivalently `conda`): we recommend [Miniforge](https://github.com/conda-forge/miniforge). 80 | 81 | **Installation** 82 | 83 | System dependencies: 84 | 85 | We list below the important system dependencies that are not available in conda, 86 | but it is hard to list all the system dependencies needed to run the code. 87 | We let you install the missing ones when you encounter errors. 88 | 89 | - None. 90 | 91 | The conda environment: 92 | 93 | Create the environment with 94 | 95 | ```bash 96 | # When in the PROJECT_ROOT directory. 97 | mamba env create --file installation/conda-osx-arm64-mps/environment.yml 98 | ``` 99 | 100 | Install the project with 101 | 102 | ```bash 103 | # Activate the environment 104 | mamba activate template-project-name 105 | # When in the PROJECT_ROOT directory. 106 | pip install -e . 
107 | ``` 108 | 109 | ## Running code in the environment 110 | 111 | ```bash 112 | mamba activate template-project-name 113 | ``` 114 | 115 | Run scripts from the `PROJECT_ROOT` directory. 116 | Here are some examples. 117 | 118 | ```bash 119 | # When in the PROJECT_ROOT directory. 120 | # template_experiment is an actual script that you can run. 121 | python -m template_package_name.template_experiment some_arg=some_value 122 | zsh reproducibility-scripts/template-experiment.sh 123 | ``` 124 | 125 | The environment is set up. 126 | Return to the root README for the rest of the instructions to run our experiments. 127 | 128 | ## Maintaining the environment 129 | 130 | System dependencies are managed by conda, or, when not available, by brew. 131 | (We try to keep everything self-contained as much as possible.) 132 | Python dependencies are managed by both conda and pip. 133 | 134 | - Use `conda` for system and non-Python dependencies needed to run the project code (e.g., image libraries, etc.). 135 | If not available on conda, use `brew`. 136 | - Use `conda` for Python dependencies packaged with more than just Python code (e.g. `pytorch`, `numpy`). 137 | These will typically be your main dependencies and will likely not change as your project grows. 138 | Note: new PyTorch versions are only distributed on PyPI (i.e., with `pip`). 139 | - Use `pip` for the rest of the Python dependencies (e.g. `tqdm`). 140 | - For more complex dependencies that may require a custom installation or build, 141 | manually follow their installation steps. 142 | 143 | Here are references and reasons to follow the above claims: 144 | 145 | * [A guide for managing conda + `pip` environments](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#using-pip-in-an-environment). 146 | * [Reasons to use conda for not-Python-only dependencies](https://numpy.org/install/#numpy-packages--accelerated-linear-algebra-libraries). 
147 | * [Ways of combining conda and `pip`](https://medium.com/data-science/conda-essential-concepts-and-tricks-e478ed53b5b#42cb). 148 | 149 | There are two ways to add dependencies to the environment: 150 | 151 | 1. **Manually edit the `environment.yml` file.** 152 | This is used the first time you set up the environment. 153 | It will also be useful if you run into conflicts and have to restart from scratch. 154 | 2. **Add/upgrade dependencies interactively** while running a shell with the environment activated 155 | to experiment with which dependency is needed. 156 | This is probably what you'll be doing after creating the environment for the first time. 157 | 158 | In both cases, after any change, a snapshot of the full environment specification should be saved. 159 | We describe how to do so in the freeze the environment section. 160 | Remember to commit the changes every time you freeze the environment. 161 | 162 | ### Manual editing (before/while building) 163 | 164 | - To edit the conda and pip dependencies, edit the `environment.yml` file. 165 | - For the `brew` and the more complex dependencies, describe the installation steps in the 166 | [Creating the environment](#creating-the-environment) section. 167 | 168 | When manually editing the `environment.yml` file, 169 | you do not need to specify the version of all the dependencies, 170 | these will be written to the file when you freeze the environment. 171 | You should just specify the major versions of specific dependencies you need. 172 | 173 | After manually editing the `environment.yml` file, you need to recreate the environment. 174 | 175 | ```bash 176 | # When in the PROJECT_ROOT directory. 
177 | mamba deactivate 178 | mamba env remove --name template-project-name 179 | mamba env create --file installation/conda-osx-arm64-mps/environment.yml 180 | mamba activate template-project-name 181 | ``` 182 | 183 | ### Interactively (while developing) 184 | 185 | Conda dependencies should all be installed before any `pip` dependency. 186 | This will cause conflicts otherwise as conda doesn't track the `pip` dependencies. 187 | So if you need to add a conda dependency after you already installed some `pip` dependencies, you need to 188 | manually add the dependency to the `environment.yml` file then recreate the environment. 189 | 190 | * To add conda/pip dependencies run `(mamba | pip) install ` 191 | * To add a `brew` dependency run `brew install ` 192 | 193 | ### Freeze the environment 194 | 195 | After any change to the dependencies, a snapshot of the full environment specification should be written to the 196 | `environment.yml` file. 197 | This includes manual changes to the file and changes made interactively. 198 | This is to ensure that the environment is reproducible and that the dependencies are tracked at any point in time. 199 | 200 | To do so, run the following command. 201 | The script overwrites the `environment.yml` file with the current environment specification, 202 | so it's a good idea to commit the changes to the environment file before and after running it. 203 | 204 | ```bash 205 | # When in the PROJECT_ROOT directory. 206 | zsh installation/conda-osx-arm64-mps/update-env-file.sh 207 | ``` 208 | 209 | There are some caveats (e.g., packages installed from GitHub with pip), so have a look at 210 | the output file to make sure it does what you want. 211 | The `update-env-file.sh` gives some hints for what to do, and in any case you can always patch the file manually. 
212 | 213 | For `brew` and more complex dependencies describe how to install them in the system dependencies section of 214 | the [instructions to install the environment](#creating-the-environment). 215 | 216 | If one of the complex dependencies shows in the `environment.yml` after the freeze, 217 | you have to remove it, so that conda does not pick it up, and it is installed later by the user. 218 | 219 | ## Troubleshooting 220 | -------------------------------------------------------------------------------- /installation/conda-osx-arm64-mps/environment.yml: -------------------------------------------------------------------------------- 1 | name: template-project-name 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.10 6 | - pip 7 | - pip: 8 | - hydra-core 9 | - tqdm 10 | - wandb 11 | - pre-commit 12 | - black 13 | -------------------------------------------------------------------------------- /installation/conda-osx-arm64-mps/update-env-file.sh: -------------------------------------------------------------------------------- 1 | # Records the current environment to a file. 2 | # Packages installed from GitHub with pip install will not be recorded 3 | # properly (i.e. the link can be omitted and just replaced with the version). 4 | # In that case, you have to update this file to add commands that 5 | # will fix the environment file. 6 | # (you could also patch the file manually afterwards). 7 | # Similarly the conda channels used to install packages may not be recorded properly 8 | # if you used complex combinations of channels. 9 | # In that case you also have to make the edits here or patch the file manually. 10 | 11 | ENVIR_FILE="installation/conda-osx-arm64-mps/environment.yml" 12 | conda env export --file "$ENVIR_FILE" 13 | 14 | # Delete the path line. 15 | sed -i.deleteme "$ d" "$ENVIR_FILE" 16 | # Set the package to a local installation. 
17 | sed -i.deleteme "/template-project-name==/d" "$ENVIR_FILE" 18 | # .deleteme is a trick to make sed work the same way on both Linux and OSX. 19 | # https://stackoverflow.com/questions/5694228/sed-in-place-flag-that-works-both-on-mac-bsd-and-linux 20 | rm "${ENVIR_FILE}.deleteme" 21 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/.dockerignore: -------------------------------------------------------------------------------- 1 | CSCS-Clariden-setup 2 | EPFL-runai-setup 3 | EPFL-SCITAS-setup 4 | from-python-template 5 | from-scratch-template 6 | README.md 7 | template.sh 8 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/CSCS-Clariden-setup/.gitignore: -------------------------------------------------------------------------------- 1 | # If you want to put your own scripts containing API keys (e.g. W&B). 2 | submit-scripts/ 3 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/CSCS-Clariden-setup/README.md: -------------------------------------------------------------------------------- 1 | # Guide for using the template with the CSCS Clariden cluster 2 | 3 | ## Overview 4 | 5 | This guide will show you how to build and run your image on the CSCS Clariden cluster and use it for 6 | 7 | 1. Remote development. 8 | 2. Running unattended jobs. 9 | 10 | ## Clone your repository in your home directory 11 | 12 | We strongly suggest having two instances of your project repository. 13 | 14 | 1. One for development, which may have uncommitted changes, be in a broken state, etc. 15 | 2. One for running unattended jobs, which is always referring to a commit at a working state of the code. 16 | 17 | The outputs and data directories of those two instances will be symlinked to the scratch storage 18 | and will be shared anyway. 
19 | This guide includes the steps to do it, and there are general details in `data/README.md` and `outputs/README.md`. 20 | 21 | ```bash 22 | # SSH to a cluster. 23 | ssh clariden 24 | mkdir -p $HOME/projects/template-project-name 25 | cd $HOME/projects/template-project-name 26 | git clone dev 27 | git clone run 28 | 29 | # To setup symlinks to the scratch storage you can run the following commands 30 | mkdir -p $SCRATCH/projects/template-project-name/data/dev 31 | mkdir -p $SCRATCH/projects/template-project-name/outputs/dev 32 | for instance in run dev; do 33 | ln -s $SCRATCH/projects/template-project-name/data/dev $HOME/projects/template-project-name/$instance/data/dev 34 | ln -s $SCRATCH/projects/template-project-name/outputs/dev $HOME/projects/template-project-name/$instance/outputs/dev 35 | done 36 | ``` 37 | 38 | The rest of the instructions should be performed on the cluster from the dev instance of the project. 39 | 40 | ```bash 41 | cd dev 42 | # It may also be useful to open a remote code editor on a login node to view the project. 43 | # (The remote development will happen in another IDE in the container.) 44 | cd installation/docker-amd64-cuda 45 | ``` 46 | 47 | ## Building the environment (skip if already have access to the image) 48 | 49 | > [!NOTE] 50 | > **TEMPLATE TODO:** 51 | > After saving your generic image, provide the image location to your teammates. 52 | > Ideally also push it to team registry and later on a public registry if you open-source your project. 53 | > Add it below in the TODO ADD IMAGE PATH. 54 | 55 | ### Prerequisites 56 | 57 | * `podman` (Already installed on the CSCS clusters). 
Configure it as described [here](https://confluence.cscs.ch/display/KB/LLM+Inference) 58 | (step after "To use Podman, we first need to configure some storage ...") 59 | * `podman-compose` (A utility to run Docker compose files with Podman) [Install here](https://github.com/containers/podman-compose/tree/main) 60 | or follow the steps below for an installation from scratch on CSCS. 61 | 62 | ```bash 63 | # Install Miniconda 64 | curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" 65 | bash Miniforge3-$(uname)-$(uname -m).sh 66 | # Follow the instructions 67 | # Close and reopen your terminal 68 | bash 69 | # Create a new conda environment 70 | mamba create -n podman python=3.10 71 | mamba activate podman 72 | pip install podman-compose 73 | 74 | # Activate this environment whenever you use this template. 75 | ``` 76 | 77 | ### Build the images 78 | 79 | All commands should be run from the `installation/docker-amd64-cuda/` directory. 80 | 81 | You should be on a compute node. If not already, get one. 82 | ```bash 83 | # Request a compute node 84 | sbatch --time 4:00:00 -A a-a10 --wrap "sleep infinity" --output=/dev/null --error=/dev/null 85 | # Connect to it 86 | srun --overlap --pty --jobid=GET_THE_JOB_ID bash 87 | tmux 88 | # or if reconnecting 89 | tmux at 90 | ``` 91 | 92 | ```bash 93 | cd installation/docker-amd64-cuda 94 | ``` 95 | 96 | 1. Create an environment file for your personal configuration with 97 | ```bash 98 | ./template.sh env 99 | ``` 100 | This creates a `.env` file with pre-filled values. 101 | - Edit the `DOCKER` variable to `podman` and the `COMPOSE` variable to `podman-compose`. 102 | - The rest of the variables are set correctly (`USR, USRID, GRP, GRPID, and PASSW`, 103 | e.g.`LAB_NAME` will be the first element in name of the local images you get, 104 | it's by default your horizontal/vertical) 105 | - You can ignore the rest of the variables after `## For running locally`. 
106 | 2. Edit the Dockerfile to make it compatible with Podman: 107 | There are commented lines starting with `# Podman` which should be uncommented 108 | to replace the corresponding lines above them. 109 | 3. Build the generic image. 110 | This is the image with root as user. 111 | It will be named according to the image name in your `.env`. 112 | It will be tagged with `-root-latest` and if you're building it, 113 | it will also be tagged with the latest git commit hash `-root-` and `-root-`. 114 | ```bash 115 | # Make sure the Conda environment with podman-compose is activated. 116 | # mamba activate podman 117 | ./template.sh build_generic 118 | ``` 119 | 4. Export the image to a file and move it to a directory where you keep the images. 120 | ```bash 121 | ./template.sh import_from_podman 122 | # Move the images 123 | # Make a directory where you store your images 124 | # Add it to your bashrc as it'll be used often 125 | CONTAINER_IMAGES=$SCRATCH/container-images 126 | mkdir -p $CONTAINER_IMAGES 127 | mv *.sqsh $CONTAINER_IMAGES 128 | ``` 129 | 5. You can run quick checks on the image to check that it has what you expect it to have. 130 | When the example scripts are described later, run the `test-interactive.sh` example script before the other scripts. 131 | 132 | ## Getting your image (if already built, or just built) 133 | 134 | #### From a file 135 | 136 | You will find the image to use for this project in _TODO ADD IMAGE_PATH_. 137 | Copy it or create a symlink to it where you keep your images. 
E.g., 138 | ```bash 139 | # Make a directory where you store your images 140 | # Add it to your bashrc as it'll be used often 141 | CONTAINER_IMAGES=$SCRATCH/container-images 142 | mkdir -p $CONTAINER_IMAGES 143 | # Copy the image with an adapted name with your horizontal/vertical name and username 144 | # (it will be readily-usable by the submit scripts) 145 | cp _TODO ADD IMAGE_PATH_ $CONTAINER_IMAGES/ADAPTED_NAME.sqsh 146 | ``` 147 | 148 | #### From a registry 149 | 150 | > [!NOTE] 151 | > **TEMPLATE TODO:** 152 | > You can push your image to a registry after building and provide the path to your teammates. 153 | 154 | Example submit scripts are provided in the `example-submit-scripts` directory and are used in the following examples. 155 | You can copy them to the directory `submit-scripts` which is not tracked by git and edit them to your needs. 156 | 157 | ### A quick test to understand how the template works 158 | 159 | Adapt the `submit-scripts/minimal.sh` with the name of your image and your cluster storage setup 160 | (should be correct by default). 161 | 162 | The submission script gives an example of how to run containers on Clariden with [`enroot`](https://github.com/NVIDIA/enroot) 163 | and the [`pyxis`](https://github.com/NVIDIA/pyxis) plugin directly integrated in `srun`. 164 | 165 | Run the script to see how the template works. 166 | ```bash 167 | cd installation/docker-amd64-cuda/CSCS-Clariden-setup/submit-scripts 168 | bash minimal.sh 169 | ``` 170 | 171 | When the container starts, its entrypoint does the following: 172 | 173 | - It runs the entrypoint of the base image if you specified it in the `compose-base.yaml` file. 174 | - It expects you to specify `PROJECT_ROOT_AT` 175 | and expects `PROJECT_ROOT_AT` to be the working directory of the container. 176 | Otherwise, it will issue a warning and set it to the default working directory of the container. 177 | - It then tries to install the project in editable mode. 
178 | This is a lightweight installation that allows to avoid all the hacky import path manipulations. 179 | (This will be skipped if `PROJECT_ROOT_AT` has not been specified or if you specify `SKIP_INSTALL_PROJECT=1`.) 180 | - It also handles all the remote development setups (VS Code, Cursor, PyCharm, Jupyter, ...) 181 | that you specify with environment variables. 182 | These are described in the later sections of this README. 183 | - Finally, it executes a provided command (e.g. `bash` here for an interactive job with a connected --pty). 184 | 185 | You need to make sure that this minimal submission works before proceeding. 186 | The logs of the entrypoint are only shown in case there was an error (design from pyxis). 187 | (A current workaround runs the entrypoint as a script at the start instead of as an entrypoint) 188 | 189 | If the entrypoint fails the installation of your project, you can resubmit your job with `export SKIP_INSTALL_PROJECT=1` 190 | which will skip the installation step then you can replay the installation manually in the container to debug it. 191 | 192 | ## Use cases 193 | 194 | The basic configuration for the project's environment is now set up. 195 | You can follow the remaining sections below to see how to run unattended jobs and set up remote development. 196 | After that, return to the root README for the rest of the instructions to run our experiments. 197 | 198 | 199 | ### Running unattended jobs 200 | 201 | By performing the above first steps, you should have all the required setup to run unattended jobs. 202 | The main difference is that the unattended job is run with `sbatch`. 203 | An example of an unattended job can be found in `submit-scripts/unattended.sh` to run with `sbatch`. 204 | Note the emphasis on having a frozen copy `run` of the repository for running unattended jobs. 205 | 206 | ### Weights&Biases 207 | 208 | Your W&B API key should be exposed as the `WANDB_API_KEY` environment variable. 
209 | You can export it or if you're sharing the script with others export a location to a file containing it with 210 | `export WANDB_API_KEY_FILE_AT` and let the template handle it. 211 | 212 | E.g., 213 | 214 | ```bash 215 | echo > $HOME/.wandb-api-key 216 | chmod 600 $HOME/.wandb-api-key 217 | ``` 218 | 219 | 220 | Then `export WANDB_API_KEY_FILE_AT=$HOME/.wandb-api-key` in the submit script. 221 | You should also mount the file in the container. 222 | 223 | ### Hugging Face 224 | 225 | Your HF API key should be exposed as the `HF_TOKEN` environment variable. 226 | You can export it or if you're sharing the script with others export a location to a file containing it with 227 | `export HF_TOKEN_AT` and let the template handle it. 228 | 229 | E.g., 230 | 231 | ```bash 232 | echo > $HOME/.hf-token 233 | chmod 600 $HOME/.hf-token 234 | ``` 235 | 236 | Then `export HF_TOKEN_AT=$HOME/.hf-token` in the submit script. 237 | You should also mount the file in the container. 238 | 239 | ### Remote development 240 | 241 | This would be the typical use case for a researcher at CLAIRE using the cluster as their daily driver to do 242 | development, testing, and debugging. 243 | Your job would be running a remote IDE/code editor on the cluster, and you would only have a lightweight local client 244 | running on your laptop. 245 | 246 | The entrypoint will start an ssh server and a remote development server for your preferred IDE/code editor 247 | when you set some environment variables. 248 | An example of an interactive job submission can be found in `submit-scripts/remote-development.sh` 249 | to run with `sbatch`. 250 | 251 | Below, we list and describe in more detail the tools and IDEs supported for remote development. 252 | 253 | ### SSH Configuration (Necessary for PyCharm, VS Code, and Cursor) 254 | 255 | Your job will open an ssh server when you set the environment variable `SSH_SERVER=1`. 
256 | You also have to mount the authorized keys file from your home directory to the container (done in the example). 257 | The SSH connection is necessary for some remote IDEs like PyCharm to work and can be beneficial 258 | for other things like ssh key forwarding. 259 | The ssh server is configured to run on port 2223 of the container. 260 | 261 | With the ssh connection, you can forward the ssh keys on your local machine (that you use for GitHub, etc.) 262 | to the remote server. 263 | This allows using the ssh keys on the remote server without having to copy them there. 264 | 265 | For that, you need three things: an ssh agent running on your local machine, the key added to the agent, 266 | and a configuration file saying that the agent should be used with the ssh connection to Clariden. 267 | GitHub provides a guide for that 268 | [here (look at the troubleshooting section too)](https://docs.github.com/en/authentication/connecting-to-github-with-ssh/using-ssh-agent-forwarding). 269 | 270 | Use the following configuration in your local `~/.ssh/config` 271 | 272 | ```bash 273 | Host clariden 274 | HostName clariden.cscs.ch 275 | User smoalla 276 | ProxyJump ela 277 | ForwardAgent yes 278 | 279 | # EDIT THIS HOSTNAME WITH EVERY NEW JOB 280 | Host clariden-job 281 | HostName nid007545 282 | User smoalla 283 | ProxyJump clariden 284 | StrictHostKeyChecking no 285 | UserKnownHostsFile=/dev/null 286 | ForwardAgent yes 287 | 288 | Host clariden-container 289 | HostName localhost 290 | ProxyJump clariden-job 291 | Port 2223 292 | User smoalla 293 | StrictHostKeyChecking no 294 | UserKnownHostsFile=/dev/null 295 | ForwardAgent yes 296 | ``` 297 | 298 | To update the hostname of the `clariden-job` you can add this to your `~/.zshrc` on macOS for example: 299 | 300 | ```bash 301 | # Tested on macos with zsh 302 | function update-ssh-config() { 303 | local config_file="$HOME/.ssh/config" # Adjust this path if needed 304 | local host="$1" 305 | local new_hostname="$2" 306 | 
307 | if [[ -z "$host" || -z "$new_hostname" ]]; then 308 | echo "Usage: update-ssh-config " 309 | return 1 310 | fi 311 | 312 | sed -i '' '/Host '"$host"'/,/Host / s/^[[:space:]]*HostName.*/ HostName '"$new_hostname"'/' "$config_file" 313 | echo "Updated HostName for '${host}' to '${new_hostname}' in ~/.ssh/config" 314 | } 315 | ``` 316 | 317 | The `StrictHostKeyChecking no` and `UserKnownHostsFile=/dev/null` allow bypass checking the identity 318 | of the host [(ref)](https://linuxcommando.blogspot.com/2008/10/how-to-disable-ssh-host-key-checking.html) 319 | which keeps changing every time a job is scheduled, 320 | so that you don't have to reset it each time. 321 | 322 | With this config you can then connect to your container with `ssh clariden-container`. 323 | 324 | **Limitations** 325 | 326 | Note that an ssh connection to the container is not like executing a shell on the container. 327 | In particular, the following limitations apply: 328 | 329 | - environment variables in the image sent to the entrypoint of the container and any command exec'ed in it 330 | are not available in ssh connections. 331 | There is a workaround for that in `entrypoints/remote-development-setup.sh` when opening an ssh server 332 | which should work for most cases, but you may still want to adapt it to your needs. 333 | 334 | ### Git config 335 | 336 | You can persist your Git config (username, email, etc.) by mounting it in the container. 337 | This is done in the examples. 338 | 339 | E.g., create your config in your home directory with 340 | 341 | ```bash 342 | cat >$HOME/.gitconfig < Tools -> Terminal. 412 | * When running Run/Debug configurations, set your working directory the project root (`$PROJECT_ROOT_AT`), not the script's directory. 413 | * Your interpreter will be 414 | * the system Python `/usr/bin/python` with the `from-python` option. 
415 | * the Python in your conda environment with the `from-scratch` option, with the conda binary found at `/opt/conda/condabin/conda`. 416 | 417 | **Limitations:** 418 | 419 | - The terminal in PyCharm opens ssh connections to the container, 420 | so the workaround (and its limitations) in the ssh section apply. 421 | If needed, you could just open a separate terminal on your local machine 422 | and directly exec a shell into the container. 423 | - It's not clear which environment variables are passed to the programs run from the IDE like the debugger. 424 | So far, it seems like the SSH env variables workaround works fine for this. 425 | - Support for programs with graphical interfaces (i.g. forwarding their interface) has not been tested yet. 426 | 427 | ### VSCode / Cursor 428 | 429 | We support the [Remote Development using SSH ](https://code.visualstudio.com/docs/remote/ssh) 430 | feature of VS code that runs a remote IDE in the container via SSH. To set this up for Cursor, simply replace `VSCODE` by `CURSOR` and `vscode` by `cursor` in all instructions below. For example, `VSCODE_SERVER_AT` becomes `CURSOR_SERVER_AT`, and `~/.vscode-server` becomes `~/.cursor-server`. 431 | 432 | **Preliminaries: saving the IDE configuration** 433 | 434 | The remote IDE stores its configuration (e.g., the extensions you set up) in `~/.vscode-server`. 435 | To have it preserved between different dev containers, you should specify the 436 | `VSCODE_SERVER_AT` env variable with your submit command 437 | as shown in the examples in `submit-scripts/remote-development.sh`. 438 | The template will use it to store the IDE configuration and cache in a separate directory 439 | per project (defined by its $PROJECT_ROOT_AT). 440 | All the directories will be created automatically. 441 | 442 | **ssh configuration** 443 | 444 | VS Code takes ssh configuration from files. 
445 | Follow the steps in the [SSH configuration section](#ssh-configuration-necessary-for-pycharm-and-vs-code) 446 | to set up your ssh config file. 447 | 448 | **Connecting VS Code to the container**: 449 | 450 | 1. `mkdir $HOME/vscode-server` 451 | 2. In your submit command, set the environment variables for 452 | - Opening an ssh server `SSH_SERVER=1`. 453 | - preserving your config `VSCODE_SERVER_AT`. 454 | And add `VSCODE_SERVER_AT` in the `--container-mounts`. 455 | 3. Have the [Remote - SSH](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-ssh) 456 | extension on your local VS Code. 457 | 4. Connect to the ssh host following the 458 | steps [here](https://code.visualstudio.com/docs/remote/ssh#_connect-to-a-remote-host). 459 | 460 | The directory to add to your VS Code workspace should be the same as the one specified in the `PROJECT_ROOT_AT`. 461 | 462 | **Limitations** 463 | 464 | - The terminal in VS Code opens ssh connections to the container, 465 | so the workaround (and its limitations) in the ssh section apply. 466 | If needed, you could just open a separate terminal on your local machine 467 | and directly exec a shell into the container. 468 | - Support for programs with graphical interfaces (i.g. forwarding their interface) has not been tested yet. 469 | 470 | ### JupyterLab (TODO) 471 | 472 | ### Examples 473 | 474 | We provide examples of how to use the template in the `submit-scripts` directory. 475 | 476 | ### Troubleshooting 477 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/CSCS-Clariden-setup/example-submit-scripts/README.md: -------------------------------------------------------------------------------- 1 | # Tips and Best Practices for Running Jobs with Slurm + pyxis + enroot 2 | 3 | Placeholder. 
4 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/CSCS-Clariden-setup/example-submit-scripts/edf.toml: -------------------------------------------------------------------------------- 1 | [annotations] 2 | com.hooks.aws_ofi_nccl.enabled = "true" 3 | com.hooks.aws_ofi_nccl.variant = "cuda12" 4 | 5 | [env] 6 | NCCL_DEBUG = "INFO" 7 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/CSCS-Clariden-setup/example-submit-scripts/minimal.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Variables used by the entrypoint script 4 | # Change this to the path of your project (can be the /dev or /run copy) 5 | export PROJECT_ROOT_AT=$HOME/projects/template-project-name/dev 6 | export PROJECT_NAME=template-project-name 7 | export PACKAGE_NAME=template_package_name 8 | export SLURM_ONE_ENTRYPOINT_SCRIPT_PER_NODE=1 9 | 10 | # Enroot + Pyxis 11 | 12 | # Limitation: pyxis doesn't send environment variables to the entrypoint so it has to be run manually 13 | # This is fixed in v0.20.0 14 | 15 | srun \ 16 | -J template-minimal \ 17 | --pty \ 18 | --container-image=$CONTAINER_IMAGES/$(id -gn)+$(id -un)+template-project-name+amd64-cuda-root-latest.sqsh \ 19 | --environment="${PROJECT_ROOT_AT}/installation/docker-amd64-cuda/CSCS-Clariden-setup/submit-scripts/edf.toml" \ 20 | --container-mounts=$PROJECT_ROOT_AT,$SCRATCH \ 21 | --container-workdir=$PROJECT_ROOT_AT \ 22 | --no-container-mount-home \ 23 | --no-container-remap-root \ 24 | --no-container-entrypoint \ 25 | --container-writable \ 26 | /opt/template-entrypoints/pre-entrypoint.sh \ 27 | bash 28 | 29 | # additional options for pyxis 30 | # --container-env to override environment variables defined in the container 31 | 32 | exit 0 33 | -------------------------------------------------------------------------------- 
/installation/docker-amd64-cuda/CSCS-Clariden-setup/example-submit-scripts/remote-development.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -J template-remote-development 4 | #SBATCH -t 12:00:00 5 | 6 | # Variables used by the entrypoint script 7 | # Change this to the path of your project (can be the /dev or /run copy) 8 | export PROJECT_ROOT_AT=$HOME/projects/template-project-name/dev 9 | export PROJECT_NAME=template-project-name 10 | export PACKAGE_NAME=template_package_name 11 | export SLURM_ONE_ENTRYPOINT_SCRIPT_PER_NODE=1 12 | export WANDB_API_KEY_FILE_AT=$HOME/.wandb-api-key 13 | # You can remove the Hugging Face variables if you don't use it, also remove them from the container mounts. 14 | export HF_TOKEN_AT=$HOME/.hf-token 15 | export HF_HOME=$SCRATCH/huggingface 16 | 17 | export SSH_SERVER=1 18 | export NO_SUDO_NEEDED=1 19 | # For the first time, mkdir -p $HOME/jetbrains-server, and comment out PYCHARM_IDE_AT 20 | export JETBRAINS_SERVER_AT=$HOME/jetbrains-server 21 | #export PYCHARM_IDE_AT=744eea3d4045b_pycharm-professional-2024.1.6-aarch64 22 | # or 23 | # export VSCODE_SERVER_AT=$HOME/vscode-server 24 | # We use a different path than the default .vscode-server to separate the container installation from the local installation 25 | # and replace JETBRAINS_SERVER_AT in the container-mounts with VSCODE_SERVER_AT 26 | 27 | srun \ 28 | --container-image=$CONTAINER_IMAGES/$(id -gn)+$(id -un)+template-project-name+amd64-cuda-root-latest.sqsh \ 29 | --environment="${PROJECT_ROOT_AT}/installation/docker-amd64-cuda/CSCS-Clariden-setup/submit-scripts/edf.toml" \ 30 | --container-mounts=\ 31 | $PROJECT_ROOT_AT,\ 32 | $SCRATCH,\ 33 | $WANDB_API_KEY_FILE_AT,\ 34 | $HOME/.gitconfig,\ 35 | $HF_TOKEN_AT,\ 36 | $JETBRAINS_SERVER_AT,\ 37 | $HOME/.ssh \ 38 | --container-workdir=$PROJECT_ROOT_AT \ 39 | --no-container-mount-home \ 40 | --no-container-remap-root \ 41 | --no-container-entrypoint \ 42 | 
--container-writable \ 43 | /opt/template-entrypoints/pre-entrypoint.sh \ 44 | sleep infinity 45 | 46 | # additional options 47 | # --container-env to override environment variables defined in the container 48 | 49 | # Draft. 50 | # Here can connect to the container with 51 | # Get the job id (and node id if multinode) 52 | # 53 | # Connect to the allocation 54 | # srun --overlap --pty --jobid=JOBID bash 55 | # Inside the job find the container name 56 | # enroot list -f 57 | # Exec to the container 58 | # enroot exec zsh 59 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/CSCS-Clariden-setup/example-submit-scripts/test-interactive.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Enroot + Pyxis 4 | 5 | srun \ 6 | -J template-test \ 7 | --pty \ 8 | --container-image=$CONTAINER_IMAGES/claire+smoalla+template-project-name+amd64-cuda-root-latest.sqsh \ 9 | --no-container-mount-home \ 10 | --no-container-remap-root \ 11 | --no-container-entrypoint \ 12 | --container-writable \ 13 | bash 14 | 15 | exit 0 16 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/CSCS-Clariden-setup/example-submit-scripts/unattended-distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -J template-untattended-distributed 4 | #SBATCH -t 0:30:00 5 | #SBATCH --nodes 2 6 | #SBATCH --ntasks-per-node 3 7 | 8 | # There is a current limitation in pyxis with the entrypoint and it has to run manually. 9 | # It has to run only once per node and the other tasks in the nodes have to wait for it to finish. 10 | # So you can either limit your jobs to 1 task per node or use a sleep command to wait for the entrypoint to finish. 
11 | 12 | 13 | # Variables used by the entrypoint script 14 | # Change this to the path of your project (can be the /dev or /run copy) 15 | export PROJECT_ROOT_AT=$HOME/projects/template-project-name/run 16 | export PROJECT_NAME=template-project-name 17 | export PACKAGE_NAME=template_package_name 18 | export SLURM_ONE_ENTRYPOINT_SCRIPT_PER_NODE=1 19 | export WANDB_API_KEY_FILE_AT=$HOME/.wandb-api-key 20 | 21 | srun \ 22 | --container-image=$CONTAINER_IMAGES/$(id -gn)+$(id -un)+template-project-name+amd64-cuda-root-latest.sqsh \ 23 | --environment="${PROJECT_ROOT_AT}/installation/docker-amd64-cuda/CSCS-Clariden-setup/submit-scripts/edf.toml" \ 24 | --container-mounts=\ 25 | $PROJECT_ROOT_AT,\ 26 | $SCRATCH,\ 27 | $WANDB_API_KEY_FILE_AT \ 28 | --container-workdir=$PROJECT_ROOT_AT \ 29 | --no-container-mount-home \ 30 | --no-container-remap-root \ 31 | --no-container-entrypoint \ 32 | --container-writable \ 33 | /opt/template-entrypoints/pre-entrypoint.sh \ 34 | bash -c 'sleep 60; python -m template_package_name.template_experiment some_arg=LOCALID-$SLURM_LOCALID-PROCID-$SLURM_PROCID' 35 | 36 | # Sleep to wait for the installation of the project. 
37 | 38 | # additional options 39 | # --container-env to override environment variables defined in the container 40 | 41 | exit 0 42 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/CSCS-Clariden-setup/example-submit-scripts/unattended.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -J template-unattended 4 | #SBATCH -t 0:30:00 5 | 6 | # Variables used by the entrypoint script 7 | # Change this to the path of your project (can be the /dev or /run copy) 8 | export PROJECT_ROOT_AT=$HOME/projects/template-project-name/run 9 | export PROJECT_NAME=template-project-name 10 | export PACKAGE_NAME=template_package_name 11 | export SLURM_ONE_ENTRYPOINT_SCRIPT_PER_NODE=1 12 | # For wandb, huggingface, etc. look at the remote-development.sh 13 | 14 | srun \ 15 | --container-image=$CONTAINER_IMAGES/$(id -gn)+$(id -un)+template-project-name+amd64-cuda-root-latest.sqsh \ 16 | --environment="${PROJECT_ROOT_AT}/installation/docker-amd64-cuda/CSCS-Clariden-setup/submit-scripts/edf.toml" \ 17 | --container-mounts=$PROJECT_ROOT_AT,$SCRATCH \ 18 | --container-workdir=$PROJECT_ROOT_AT \ 19 | --no-container-mount-home \ 20 | --no-container-remap-root \ 21 | --no-container-entrypoint \ 22 | --container-writable \ 23 | /opt/template-entrypoints/pre-entrypoint.sh \ 24 | python -m template_package_name.template_experiment some_arg=some_value wandb.mode=offline 25 | 26 | # additional options for pyxis 27 | # --container-env to override environment variables defined in the container 28 | 29 | exit 0 30 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax = docker/dockerfile:1 2 | 3 | # BASE_IMAGE is the image that will be extended by this Dockerfile. 
# It is assumed to be a well-configured Python installation. 5 | # The remaining packages will be installed with pip.
49 | ENV PIP_CACHE_DIR=/root/.cache/pip 50 | RUN pip freeze > ${DEPENDENCIES_DIR}/requirements-freeze-before-pip-install.txt 51 | RUN pip list --format freeze > ${DEPENDENCIES_DIR}/requirements-list-before-pip-install.txt 52 | COPY requirements.txt ${DEPENDENCIES_DIR}/requirements.txt 53 | RUN --mount=type=cache,target=${PIP_CACHE_DIR},sharing=private \ 54 | pip install -r ${DEPENDENCIES_DIR}/requirements.txt 55 | # Podman: Comment the above and use this instead with podman as it doesn't support sharing mount modes. 56 | #RUN --mount=type=cache,target=${PIP_CACHE_DIR} \ 57 | # pip install -r ${DEPENDENCIES_DIR}/requirements.txt 58 | RUN pip freeze > ${DEPENDENCIES_DIR}/requirements-freeze-after-pip-install.txt 59 | RUN pip list --format freeze > ${DEPENDENCIES_DIR}/requirements-list-after-pip-install.txt 60 | 61 | # For reproducible requirements use the following after getting the requirements-freeze.txt file from the first build. 62 | #COPY requirements-freeze.txt ${DEPENDENCIES_DIR}/requirements-freeze.txt 63 | #RUN --mount=type=cache,target=${PIP_CACHE_DIR},sharing=private \ 64 | # pip install --no-deps -r ${DEPENDENCIES_DIR}/requirements-freeze.txt 65 | # For podman 66 | #RUN --mount=type=cache,target=${PIP_CACHE_DIR} \ 67 | # pip install --no-deps -r ${DEPENDENCIES_DIR}/requirements-freeze.txt 68 | 69 | # Optional optimizations. 70 | # Hack to enable Intel MKL optimizations on AMD CPUs. 71 | # https://danieldk.eu/Posts/2020-08-31-MKL-Zen.html 72 | # ARG FAKEINTEL_PATH=/opt/fakeintel/libfakeintel.so 73 | # ENV FAKEINTEL_PATH=${FAKEINTEL_PATH} 74 | # https://danieldk.eu/Posts/2020-08-31-MKL-Zen.html 75 | # Build. 76 | # RUN echo 'int mkl_serv_intel_cpu_true() {return 1;}' > /tmp/fakeintel.c && \ 77 | # mkdir -p /opt/fakeintel && \ 78 | # gcc -shared -fPIC -o ${FAKEINTEL_PATH} /tmp/fakeintel.c 79 | # Enable. 
80 | # ENV LD_PRELOAD=${FAKEINTEL_PATH}:${LD_PRELOAD} 81 | 82 | ######################################################################## 83 | # Here you can install other software 84 | 85 | 86 | ######################################################################## 87 | # Download Z-Shell enhancements. 88 | 89 | FROM ${GIT_IMAGE} AS get-pure 90 | 91 | ARG PURE_URL=https://github.com/sindresorhus/pure.git 92 | ARG ZSHA_URL=https://github.com/zsh-users/zsh-autosuggestions.git 93 | ARG ZSHS_URL=https://github.com/zsh-users/zsh-syntax-highlighting.git 94 | 95 | RUN git clone --depth 1 ${PURE_URL} /opt/zsh/pure 96 | RUN git clone --depth 1 ${ZSHA_URL} /opt/zsh/zsh-autosuggestions 97 | RUN git clone --depth 1 ${ZSHS_URL} /opt/zsh/zsh-syntax-highlighting 98 | 99 | ######################################################################## 100 | # This stage is the final user-agnostic (generic) stage. 101 | # This layer can be distributed so that subsequent users 102 | 103 | FROM runtime-deps AS runtime-generic 104 | 105 | ENV HYDRA_FULL_ERROR=1 106 | 107 | # A final record of the dependencies from pip freeze. 108 | RUN pip freeze > ${DEPENDENCIES_DIR}/requirements-freeze-final.txt 109 | RUN pip list --format freeze > ${DEPENDENCIES_DIR}/requirements-list-final.txt 110 | 111 | # Shell configuration. 
112 | ENV ZSH_ENHANCE_DIR=/etc/zsh/enhance 113 | ARG PURE_PATH=${ZSH_ENHANCE_DIR}/pure 114 | ARG ZSHA_PATH=${ZSH_ENHANCE_DIR}/zsh-autosuggestions 115 | ARG ZSHS_PATH=${ZSH_ENHANCE_DIR}/zsh-syntax-highlighting 116 | COPY --from=get-pure /opt/zsh/pure ${PURE_PATH} 117 | COPY --from=get-pure /opt/zsh/zsh-autosuggestions ${ZSHA_PATH} 118 | COPY --from=get-pure /opt/zsh/zsh-syntax-highlighting ${ZSHS_PATH} 119 | RUN { echo "fpath+=${PURE_PATH}"; \ 120 | echo "autoload -Uz promptinit; promptinit"; \ 121 | echo "prompt pure"; \ 122 | echo "source ${ZSHA_PATH}/zsh-autosuggestions.zsh"; \ 123 | echo "source ${ZSHS_PATH}/zsh-syntax-highlighting.zsh"; \ 124 | echo "alias ls='ls --color=auto'"; \ 125 | echo "alias ll='ls -lh'"; \ 126 | echo "alias update-env-file='source \${PROJECT_ROOT_AT}/installation/docker-amd64-cuda/update-env-file.sh'"; \ 127 | } >> /etc/zsh/zshrc 128 | 129 | 130 | # Entrypoints. 131 | # Don't overwrite the entrypoint, it is installing the project 132 | # and testing that you correctly mounted the project code. 133 | # It also performs some other important setup depending on the deployment platform. 134 | ARG BASE_ENTRYPOINT 135 | ARG BASE_ENTRYPOINT_EXECS 136 | ENV BASE_ENTRYPOINT=${BASE_ENTRYPOINT} 137 | ENV BASE_ENTRYPOINT_EXECS=${BASE_ENTRYPOINT_EXECS} 138 | ENV ENTRYPOINTS_ROOT=/opt/template-entrypoints 139 | COPY entrypoints ${ENTRYPOINTS_ROOT} 140 | ENTRYPOINT ["/opt/template-entrypoints/pre-entrypoint.sh"] 141 | CMD ["/bin/zsh"] 142 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/Dockerfile-user: -------------------------------------------------------------------------------- 1 | ######################################################################## 2 | # 3. Stages for setting up the user and the development environment. 
3 | ######################################################################## 4 | 5 | ARG GENERIC_IMAGE 6 | ARG IMAGE_PLATFORM 7 | 8 | ######################################################################## 9 | # Final runtime layer for the user. 10 | # Explicitly create a user for Docker Engine interfaces 11 | # which do no support selecting the user at runtime; this is the case for Run:ai. 12 | 13 | FROM ${GENERIC_IMAGE}:${IMAGE_PLATFORM}-root-latest AS runtime-user 14 | 15 | ARG GRPID 16 | ARG USRID 17 | ARG GRP 18 | ARG USR 19 | ARG PASSWD 20 | ENV PASSWD=${PASSWD} 21 | 22 | # Add user to sudoer to be able to install apt packages. 23 | RUN groupadd -f -g ${GRPID} ${GRP} && \ 24 | useradd --shell /bin/zsh --create-home -u ${USRID} -g ${GRP} -p $(openssl passwd -1 ${PASSWD}) ${USR} && \ 25 | usermod -aG sudo ${USR} 26 | 27 | USER ${USR} 28 | RUN touch /home/${USR}/.zshrc 29 | 30 | ######################################################################## 31 | # Final development layer for the user. 32 | 33 | FROM ${GENERIC_IMAGE}:${IMAGE_PLATFORM}-root-latest AS development-user 34 | 35 | ARG GRPID 36 | ARG USRID 37 | ARG GRP 38 | ARG USR 39 | ARG PASSWD 40 | ENV PASSWD=${PASSWD} 41 | 42 | # Add user to sudoer to be able to install apt packages. 43 | RUN groupadd -f -g ${GRPID} ${GRP} && \ 44 | useradd --shell /bin/zsh --create-home -u ${USRID} -g ${GRP} -p $(openssl passwd -1 ${PASSWD}) ${USR} && \ 45 | usermod -aG sudo ${USR} 46 | 47 | USER ${USR} 48 | RUN touch /home/${USR}/.zshrc 49 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/EPFL-SCITAS-setup/.gitignore: -------------------------------------------------------------------------------- 1 | # If you want to put your own scripts contraining API keys (e.g. W&B). 
2 | submit-scripts/ 3 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/EPFL-SCITAS-setup/README.md: -------------------------------------------------------------------------------- 1 | # Guide for using the template with the EPFL SCITAS clusters (Kuma, Izar) 2 | 3 | ## Overview 4 | 5 | At this point, you should have the runtime image that can be deployed on multiple platforms. 6 | This guide will show you how to deploy your image on the EPFL SCITAS clusters supporting containers (Kuma, Izar) 7 | and use it for 8 | 9 | 1. Remote development. 10 | 2. Running unattended jobs. 11 | 12 | ## Prerequisites 13 | 14 | **SCITAS and Slurm**: 15 | 16 | 1. You should have access to the SCITAS clusters using containers (Kuma, Izar). 17 | 2. You should have some knowledge of Slurm. 18 | 19 | CLAIRE lab members can refer to our internal documentation on using the SCITAS clusters 20 | [here](https://prickly-lip-484.notion.site/Compute-and-Storage-CLAIRE-91b4eddcc16c4a95a5ab32a83f3a8294#1402ae1961ac4b3e86a6a3ee2d8602aa). 21 | 22 | ## First steps 23 | 24 | ### Getting your image on the SCITAS clusters 25 | 26 | You only need to pull the generic image as SCITAS mounts namespaces to the containers. 27 | 28 | All the commands should be run on the SCITAS clusters. 29 | ```bash 30 | ssh izar 31 | # or 32 | ssh kuma 33 | ``` 34 | Create an enroot config file in your home directory on the cluster if you don't have one yet. 35 | It will store your credentials for the registries. 36 | ```bash 37 | export ENROOT_CONFIG_PATH=$HOME/.config/enroot/.credentials 38 | mkdir -p $(dirname $ENROOT_CONFIG_PATH) 39 | touch $ENROOT_CONFIG_PATH 40 | # Make sur the file is only readable by you 41 | chmod 600 $ENROOT_CONFIG_PATH 42 | ``` 43 | Write the following to the file. 44 | ```bash 45 | # E.g. 
vim $ENROOT_CONFIG_PATH 46 | machine ic-registry.epfl.ch login password 47 | machine registry.rcp.epfl.ch login password 48 | ``` 49 | 50 | Optionally if you want to use Apptainer 51 | ```bash 52 | apptainer registry login --username docker://registry.rcp.epfl.ch 53 | apptainer registry login --username docker://ic-registry.epfl.ch 54 | ``` 55 | 56 | Then you can pull your image with 57 | ```bash 58 | # On Izar 59 | SCRATCH=/scratch/izar/$USER 60 | # On Kuma 61 | SCRATCH=/scratch/$USER 62 | # Make a directory where you store your images 63 | # Add it to your bashrc as it'll be used often 64 | CONTAINER_IMAGES=$SCRATCH/container-images 65 | mkdir -p $CONTAINER_IMAGES 66 | 67 | # Pull the generic image (with tagged with root) 68 | # E.g., 69 | cd $CONTAINER_IMAGES 70 | # Don't do this on a login node. 71 | # Replace with your image name 72 | 73 | srun --ntasks=1 --cpus-per-task=32 --partition h100 --time=0:30:00 \ 74 | enroot import docker://registry.rcp.epfl.ch#claire/moalla/template-project-name:amd64-cuda-root-latest 75 | # This will create a squashfs file that you'll use to start your jobs. 76 | ``` 77 | 78 | Optionally if you want to use Apptainer 79 | ```bash 80 | # Takes ages to convert to sif. 81 | # Don't do this on a login node. 82 | # In a tmux shell ideally. 83 | srun --ntasks=1 --cpus-per-task=32 --partition h100 --time=1:00:00 \ 84 | apptainer pull docker://registry.rcp.epfl.ch/claire/moalla/template-project-name:amd64-cuda-root-latest 85 | ``` 86 | 87 | ### Clone your repository in your home directory 88 | 89 | We strongly suggest having two instances of your project repository. 90 | 91 | 1. One for development, which may have uncommitted changes, be in a broken state, etc. 92 | 2. One for running unattended jobs, which is always referring to a commit at a working state of the code. 93 | 94 | The outputs and data directories of those two instances will be symlinked to the scratch storage 95 | and will be shared anyway. 
96 | This guide includes the steps to do it, and there are general details in `data/README.md` and `outputs/README.md`. 97 | 98 | ```bash 99 | # SSH to a cluster. 100 | ssh kuma 101 | mkdir -p $HOME/projects/template-project-name 102 | cd $HOME/projects/template-project-name 103 | git clone dev 104 | git clone run 105 | ``` 106 | 107 | The rest of the instructions should be performed on the cluster from the dev instance of the project. 108 | ```bash 109 | cd dev 110 | # It may also be useful to open a remote code editor on a login node to view the project. (The remote development will happen in another IDE in the container.) 111 | # Push what you did on your local machine so far (change project name etc) and pull it on the cluster. 112 | git pull 113 | cd installation/docker-amd64-cuda 114 | ``` 115 | 116 | ### Note about the examples 117 | 118 | The example files were made with username `moalla` and lab-name `claire`. 119 | Adapt them accordingly to your username and lab name. 120 | Run 121 | ```bash 122 | # From the cluster this time. 123 | ./template.sh env 124 | # Edit the .env file with your lab name (you can ignore the rest). 125 | ./template.sh get_scitas_scripts 126 | ``` 127 | to get a copy of the examples in this guide with your username, lab name, etc. 128 | They will be in `./EPFL-SCITAS-setup/submit-scripts`. 129 | 130 | ### A quick test to understand how the template works 131 | 132 | Adapt the `submit-scripts/minimal.sh` with the name of your image and your cluster storage setup. 133 | 134 | The submission script gives two examples of how to run containers on SCITAS. 135 | Either with [`enroot`](https://github.com/NVIDIA/enroo) 136 | and the [`pyxis`](https://github.com/NVIDIA/pyxis) plugin directly integrated in `srun`, 137 | or with `apptainer` inside tasks as a separate command. 138 | We recommend using Pyxis+enroot as it allows more remote development tools to be used. 139 | 140 | Run the script to see how the template works. 
141 | ```bash 142 | cd installation/docker-amd64-cuda/EPFL-SCITAS-setup/submit-scripts 143 | bash minimal.sh 144 | ``` 145 | 146 | When the container starts, its entrypoint does the following: 147 | 148 | - It runs the entrypoint of the base image if you specified it in the `compose-base.yaml` file. 149 | - It expects you specify `PROJECT_ROOT_AT=`. 150 | and `PROJECT_ROOT_AT` to be the working directory of the container. 151 | Otherwise, it will issue a warning and set it to the default working directory of the container. 152 | - It then tries to install the project in editable mode. 153 | This is a lightweight installation that allows to avoid all the hacky import path manipulations. 154 | (This will be skipped if `PROJECT_ROOT_AT` has not been specified or if you specify `SKIP_INSTALL_PROJECT=1`.) 155 | - It also handles all the remote development setups (VS Code, Cursor, PyCharm, Jupyter, ...) 156 | that you specify with environment variables. 157 | These are described in the later sections of this README. 158 | - Finally, it executes a provided command (e.g. `bash` here for an interactive job with a connected --pty). 159 | 160 | You need to make sure that this minimal submission works before proceeding. 161 | The logs of the entrypoint are only shown in case there was an error (design from pyxis). 162 | (A current workaround runs the entrypoint as a script at the start instead of as an entrypoint) 163 | 164 | If the entrypoint fails the installation of your project, you can resubmit your job with `export SKIP_INSTALL_PROJECT=1` 165 | which will skip the installation step then you can replay the installation manually in the container to debug it. 166 | 167 | ## Use cases 168 | 169 | The basic configuration for the project's environment is now set up. 170 | You can follow the remaining sections below to see how to run unattended jobs and set up remote development. 
171 | After that, return to the root README for the rest of the instructions to run our experiments. 172 | 173 | 174 | ### Running unattended jobs 175 | 176 | By performing the above first steps, you should have all the required setup to run unattended jobs. 177 | The main difference is that the unattended job is run with `sbatch`. 178 | An example of an unattended job can be found in `submit-scripts/unattended.sh` to run with `sbatch`. 179 | Note the emphasis on having a frozen copy `run` of the repository for running unattended jobs. 180 | 181 | ### Weights&Biases 182 | 183 | Your W&B API key should be exposed as the `WANDB_API_KEY` environment variable. 184 | You can export it or if you're sharing the script with others export a location to a file containing it with 185 | `export WANDB_API_KEY_FILE_AT` and let the template handle it. 186 | 187 | E.g., 188 | 189 | ```bash 190 | echo > $HOME/.wandb-api-key 191 | chmod 600 $HOME/.wandb-api-key 192 | ``` 193 | 194 | Then `export WANDB_API_KEY_FILE_AT=$HOME/.wandb-api-key` in the submit script. 195 | 196 | ### Remote development 197 | 198 | This would be the typical use case for a researcher at CLAIRE using the cluster as their daily driver to do 199 | development, testing, and debugging. 200 | Your job would be running a remote IDE/code editor on the cluster, and you would only have a lightweight local client 201 | running on your laptop. 202 | 203 | The entrypoint will start an ssh server and a remote development server for your preferred IDE/code editor 204 | when you set some environment variables. 205 | An example of an interactive job submission can be found in `submit-scripts/remote-development.sh` 206 | to run with `sbatch`. 207 | 208 | Below, we list and describe in more detail the tools and IDEs supported for remote development. 209 | 210 | ### SSH Configuration (Necessary for PyCharm, VS Code, and Cursor) 211 | 212 | Your job will open an ssh server when you set the environment variable `SSH_SERVER=1`. 
213 | You also have to mount the authorized keys file from your home directory to the container (done in the example). 214 | The SSH connection is necessary for some remote IDEs like PyCharm to work and can be beneficial 215 | for other things like ssh key forwarding. 216 | The ssh server is configured to run on port 2223 of the container. 217 | 218 | With the ssh connection, you can forward the ssh keys on your local machine (that you use for GitHub, etc.) 219 | on the remote server. 220 | This allows using the ssh keys on the remote server without having to copy them there. 221 | 222 | For that, you need three things: an ssh agent running on your local machine, the key added to the agent, 223 | and a configuration file saying that the agent should be used with the ssh connection to SCITAS. 224 | GitHub provides a guide for that 225 | [here (look at the troubleshooting section too)](https://docs.github.com/en/authentication/connecting-to-github-with-ssh/using-ssh-agent-forwarding). 226 | 227 | Use the following configuration in your local `~/.ssh/config` 228 | 229 | ```bash 230 | Host kuma 231 | HostName kuma.hpc.epfl.ch 232 | User moalla 233 | ForwardAgent yes 234 | 235 | # EDIT THIS HOSTNAME WITH EVERY NEW JOB 236 | Host kuma-job 237 | HostName kh021 238 | User moalla 239 | ProxyJump kuma 240 | StrictHostKeyChecking no 241 | UserKnownHostsFile=/dev/null 242 | ForwardAgent yes 243 | 244 | Host kuma-container 245 | HostName localhost 246 | ProxyJump kuma-job 247 | Port 2223 248 | User moalla 249 | StrictHostKeyChecking no 250 | UserKnownHostsFile=/dev/null 251 | ForwardAgent yes 252 | ``` 253 | To update the hostname of the `clariden-job` you can add this to your `~/.zshrc` on macOS for example: 254 | 255 | ```bash 256 | # Tested on macos with zsh 257 | function update-ssh-config() { 258 | local config_file="$HOME/.ssh/config" # Adjust this path if needed 259 | local host="$1" 260 | local new_hostname="$2" 261 | 262 | if [[ -z "$host" || -z "$new_hostname" ]]; 
then 263 | echo "Usage: update-ssh-config " 264 | return 1 265 | fi 266 | 267 | sed -i '' '/Host '"$host"'/,/Host / s/^[[:space:]]*HostName.*/ HostName '"$new_hostname"'/' "$config_file" 268 | echo "Updated HostName for '${host}' to '${new_hostname}' in ~/.ssh/config" 269 | } 270 | ``` 271 | 272 | The `StrictHostKeyChecking no` and `UserKnownHostsFile=/dev/null` allow bypass checking the identity 273 | of the host [(ref)](https://linuxcommando.blogspot.com/2008/10/how-to-disable-ssh-host-key-checking.html) 274 | which keeps changing every time a job is scheduled, 275 | so that you don't have to reset it each time. 276 | 277 | With this config you can then connect to your container with `ssh clariden-container`. 278 | 279 | **Limitations** 280 | 281 | Note that an ssh connection to the container is not like executing a shell on the container. 282 | In particular, the following limitations apply: 283 | 284 | - environment variables in the image sent to the entrypoint of the container and any command exec'ed in it 285 | are not available in ssh connections. 286 | There is a workaround for that in `entrypoints/remote-development-setup.sh` when opening an ssh server 287 | which should work for most cases, but you may still want to adapt it to your needs. 288 | 289 | ### Git config 290 | 291 | You can persist your Git config (username, email, etc.) by mounting it in the container. 292 | This is done in the examples. 293 | 294 | E.g., create your config in your home directory with 295 | 296 | ```bash 297 | cat >$HOME/.gitconfig < Tools -> Terminal. 367 | * When running Run/Debug configurations, set your working directory the project root (`$PROJECT_ROOT_AT`), not the script's directory. 368 | * Your interpreter will be 369 | * the system Python `/usr/bin/python` with the `from-python` option. 370 | * the Python in your conda environment with the `from-scratch` option, with the conda binary found at `/opt/conda/condabin/conda`. 
371 | 372 | **Limitations:** 373 | 374 | - The terminal in PyCharm opens ssh connections to the container, 375 | so the workaround (and its limitations) in the ssh section apply. 376 | If needed, you could just open a separate terminal on your local machine 377 | and directly exec a shell into the container. 378 | - It's not clear which environment variables are passed to the programs run from the IDE like the debugger. 379 | So far, it seems like the SSH env variables workaround works fine for this. 380 | - Support for programs with graphical interfaces (i.g. forwarding their interface) has not been tested yet. 381 | 382 | ### VSCode / Cursor 383 | 384 | We support the [Remote Development using SSH ](https://code.visualstudio.com/docs/remote/ssh) 385 | feature of VS code that runs a remote IDE in the container via SSH. To set this up for Cursor, simply replace `VSCODE` by `CURSOR` and `vscode` by `cursor` in all instructions below. For example, `VSCODE_SERVER_AT` becomes `CURSOR_SERVER_AT`, and `~/.vscode-server` becomes `~/.cursor-server`. 386 | 387 | **Preliminaries: saving the IDE configuration** 388 | 389 | The remote IDE stores its configuration (e.g., the extensions you set up) in `~/.vscode-server`. 390 | To have it preserved between different dev containers, you should specify the 391 | `VSCODE_SERVER_AT` env variable with your submit command 392 | as shown in the examples in `submit-scripts/remote-development.sh`. 393 | The template will use it to store the IDE configuration and cache in a separate directory 394 | per project (defined by its $PROJECT_ROOT_AT). 395 | All the directories will be created automatically. 396 | 397 | **ssh configuration** 398 | 399 | VS Code takes ssh configuration from files. 400 | Follow the steps in the [SSH configuration section](#ssh-configuration-necessary-for-pycharm-and-vs-code) 401 | to set up your ssh config file. 402 | 403 | **Connecting VS Code to the container**: 404 | 405 | 1. `mkdir $HOME/vscode-server` 406 | 2. 
In your submit command, set the environment variables for 407 | - Opening an ssh server `SSH_SERVER=1`. 408 | - preserving your config `VSCODE_SERVER_AT`. 409 | And add `VSCODE_SERVER_AT` in the `--container-mounts`. 410 | 3. Have the [Remote - SSH](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-ssh) 411 | extension on your local VS Code. 412 | 4. Connect to the ssh host following the 413 | steps [here](https://code.visualstudio.com/docs/remote/ssh#_connect-to-a-remote-host). 414 | 415 | The directory to add to your VS Code workspace should be the same as the one specified in the `PROJECT_ROOT_AT`. 416 | 417 | **Limitations** 418 | 419 | - The terminal in VS Code opens ssh connections to the container, 420 | so the workaround (and its limitations) in the ssh section apply. 421 | If needed, you could just open a separate terminal on your local machine 422 | and directly exec a shell into the container. 423 | - Support for programs with graphical interfaces (i.g. forwarding their interface) has not been tested yet. 424 | 425 | ### JupyterLab (TODO) 426 | 427 | ### Examples 428 | 429 | We provide examples of how to use the template in the `submit-scripts` directory. 430 | 431 | ### Troubleshooting 432 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/EPFL-SCITAS-setup/example-submit-scripts/README.md: -------------------------------------------------------------------------------- 1 | # Tips and Best Practices for Running Jobs with Slurm + pyxis + enroot 2 | 3 | Placeholder. 4 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/EPFL-SCITAS-setup/example-submit-scripts/minimal.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # If not done already in your bashrc (depends on the cluster so better write that logic there.) 
4 | # export SCRATCH=/scratch/moalla 5 | 6 | # Variables used by the entrypoint script 7 | # Change this to the path of your project (can be the /dev or /run copy) 8 | export PROJECT_ROOT_AT=$HOME/projects/template-project-name/dev 9 | export PROJECT_NAME=template-project-name 10 | export PACKAGE_NAME=template_package_name 11 | export SLURM_ONE_ENTRYPOINT_SCRIPT_PER_NODE=1 12 | 13 | # Enroot + Pyxis 14 | 15 | # Limitation: pyxis doesn't send environment variables to the entrypoint so it has to be run manually 16 | # This is fixed in v0.20.0 17 | 18 | srun \ 19 | -J template-minimal \ 20 | -G 1 --partition h100 \ 21 | --pty \ 22 | --container-image=$CONTAINER_IMAGES/claire+moalla+template-project-name+amd64-cuda-root-latest.sqsh \ 23 | --container-mounts=/etc/slurm,$PROJECT_ROOT_AT,$SCRATCH \ 24 | --container-workdir=$PROJECT_ROOT_AT \ 25 | --no-container-mount-home \ 26 | --no-container-remap-root \ 27 | --no-container-entrypoint \ 28 | --container-writable \ 29 | /opt/template-entrypoints/pre-entrypoint.sh \ 30 | bash 31 | 32 | # additional options for pyxis 33 | # --container-env to override environment variables defined in the container 34 | 35 | exit 0 36 | 37 | # Some other possible option 38 | # Apptainer/Singularity 39 | srun \ 40 | -G 1 --partition h100 -J template-minimal \ 41 | --pty \ 42 | apptainer run \ 43 | --contain \ 44 | --bind $SCRATCH:$SCRATCH \ 45 | --cwd $PROJECT_ROOT_AT \ 46 | --no-home \ 47 | --nv \ 48 | --writable-tmpfs \ 49 | $CONTAINER_IMAGES/template-project-name_amd64-cuda-root-latest.sif \ 50 | bash 51 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/EPFL-SCITAS-setup/example-submit-scripts/remote-development.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -J template-remote-development 4 | #SBATCH -t 12:00:00 5 | #SBATCH --partition h100 6 | #SBATCH --gpus 4 7 | #SBATCH --cpus-per-task 60 8 | 9 | # Only 
for Kuma temporarily 10 | 11 | # If not done already in your bashrc (depends on the cluster so better write that logic there.) 12 | # export SCRATCH=/scratch/moalla 13 | 14 | # Variables used by the entrypoint script 15 | # Change this to the path of your project (can be the /dev or /run copy) 16 | export PROJECT_ROOT_AT=$HOME/projects/template-project-name/dev 17 | export PROJECT_NAME=template-project-name 18 | export PACKAGE_NAME=template_package_name 19 | export SLURM_ONE_ENTRYPOINT_SCRIPT_PER_NODE=1 20 | export WANDB_API_KEY_FILE_AT=$HOME/.wandb-api-key 21 | export SSH_SERVER=1 22 | export NO_SUDO_NEEDED=1 23 | export JETBRAINS_SERVER_AT=$HOME/jetbrains-server 24 | #export PYCHARM_IDE_AT=e632f2156c14a_pycharm-professional-2024.1.4 25 | # or 26 | # export VSCODE_SERVER_AT=$SCRATCH/vscode-server 27 | 28 | srun \ 29 | --container-image=$CONTAINER_IMAGES/claire+moalla+template-project-name+amd64-cuda-root-latest.sqsh \ 30 | --container-mounts=\ 31 | /etc/slurm,\ 32 | $PROJECT_ROOT_AT,\ 33 | $SCRATCH,\ 34 | $WANDB_API_KEY_FILE_AT,\ 35 | $JETBRAINS_SERVER_AT,\ 36 | $HOME/.gitconfig,\ 37 | $HOME/.ssh \ 38 | --container-workdir=$PROJECT_ROOT_AT \ 39 | --no-container-mount-home \ 40 | --no-container-remap-root \ 41 | --no-container-entrypoint \ 42 | --container-writable \ 43 | -G 4 -c 60 \ 44 | /opt/template-entrypoints/pre-entrypoint.sh \ 45 | sleep infinity 46 | 47 | # additional options 48 | # --container-env to override environment variables defined in the container 49 | 50 | # Draft. 
51 | # Here can connect to the container with 52 | # Get the job id (and node id if multinode) 53 | # 54 | # Connect to the allocation 55 | # srun --overlap --pty --jobid=JOBID bash 56 | # Inside the job find the container name 57 | # enroot list -f 58 | # Exec to the container 59 | # enroot exec zsh 60 | 61 | # additional options 62 | # --container-env to override environment variables defined in the container 63 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/EPFL-SCITAS-setup/example-submit-scripts/unattended-distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -J template-untattended-distributed 4 | #SBATCH -t 0:30:00 5 | #SBATCH --partition h100 6 | #SBATCH --nodes 2 7 | #SBATCH --ntasks-per-node 3 8 | 9 | # There is a current limitation in pyxis with the entrypoint and it has to run manually. 10 | # It has to run only once per node and the other tasks in the nodes have to wait for it to finish. 11 | # So you can either limit your jobs to 1 task per node or use a sleep command to wait for the entrypoint to finish. 12 | 13 | # Only for Kuma temporarily 14 | 15 | # If not done already in your bashrc (depends on the cluster so better write that logic there.) 
16 | # export SCRATCH=/scratch/moalla 17 | 18 | # Variables used by the entrypoint script 19 | # Change this to the path of your project (can be the /dev or /run copy) 20 | export PROJECT_ROOT_AT=$HOME/projects/template-project-name/run 21 | export PROJECT_NAME=template-project-name 22 | export PACKAGE_NAME=template_package_name 23 | export SLURM_ONE_ENTRYPOINT_SCRIPT_PER_NODE=1 24 | export WANDB_API_KEY_FILE_AT=$HOME/.wandb-api-key 25 | 26 | srun \ 27 | --container-image=$CONTAINER_IMAGES/claire+moalla+template-project-name+amd64-cuda-root-latest.sqsh \ 28 | --container-mounts=\ 29 | /etc/slurm,\ 30 | $PROJECT_ROOT_AT,\ 31 | $SCRATCH,\ 32 | $WANDB_API_KEY_FILE_AT \ 33 | --container-workdir=$PROJECT_ROOT_AT \ 34 | --no-container-mount-home \ 35 | --no-container-remap-root \ 36 | --no-container-entrypoint \ 37 | --container-writable \ 38 | /opt/template-entrypoints/pre-entrypoint.sh \ 39 | bash -c 'sleep 60; python -m template_package_name.template_experiment some_arg=LOCALID-$SLURM_LOCALID-PROCID-$SLURM_PROCID' 40 | 41 | # Sleep to wait for the installation of the project. 42 | 43 | # additional options 44 | # --container-env to override environment variables defined in the container 45 | 46 | exit 0 47 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/EPFL-SCITAS-setup/example-submit-scripts/unattended.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -J template-unattended 4 | #SBATCH -t 0:30:00 5 | #SBATCH --partition h100 6 | #SBATCH --gpus 1 7 | 8 | # Only for Kuma temporarily 9 | 10 | # If not done already in your bashrc (depends on the cluster so better write that logic there.) 
11 | # export SCRATCH=/scratch/moalla 12 | 13 | # Variables used by the entrypoint script 14 | # Change this to the path of your project (can be the /dev or /run copy) 15 | export PROJECT_ROOT_AT=$HOME/projects/template-project-name/run 16 | export PROJECT_NAME=template-project-name 17 | export PACKAGE_NAME=template_package_name 18 | export SLURM_ONE_ENTRYPOINT_SCRIPT_PER_NODE=1 19 | 20 | srun \ 21 | --container-image=$CONTAINER_IMAGES/claire+moalla+template-project-name+amd64-cuda-root-latest.sqsh \ 22 | --container-mounts=/etc/slurm,$PROJECT_ROOT_AT,$SCRATCH \ 23 | --container-workdir=$PROJECT_ROOT_AT \ 24 | --no-container-mount-home \ 25 | --no-container-remap-root \ 26 | --no-container-entrypoint \ 27 | --container-writable \ 28 | -G 1 \ 29 | /opt/template-entrypoints/pre-entrypoint.sh \ 30 | python -m template_package_name.template_experiment some_arg=some_value wandb.mode=offline 31 | 32 | # additional options for pyxis 33 | # --container-env to override environment variables defined in the container 34 | 35 | exit 0 36 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/EPFL-runai-setup/.gitignore: -------------------------------------------------------------------------------- 1 | # If you want to put your own scripts contraining API keys (e.g. W&B). 2 | submit-scripts/ 3 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/EPFL-runai-setup/README.md: -------------------------------------------------------------------------------- 1 | # Guide for using the template with the EPFL IC and RCP Run:ai clusters 2 | 3 | ## Overview 4 | 5 | At this point, you should have the runtime image that can be deployed on multiple platforms. 6 | This guide will show you how to deploy your image on the EPFL IC and RCP Run:ai clusters and use it for: 7 | 8 | 1. Remote development. (At CLAIRE, we use the Run:ai platform as our daily driver.) 9 | 2. 
Running unattended jobs. 10 | 11 | Using the image on HaaS machines falls into the public instructions 12 | using the local deployment option with Docker Compose service and is covered by the 13 | instructions in the `installation/docker-amd64-cuda/README.md` file. 14 | 15 | ## Prerequisites 16 | 17 | **Run:ai**: 18 | 19 | 1. You should have access to a Run:ai project and have some knowledge of the Run:ai platform, e.g., 20 | know the commands to submit jobs and check their status. 21 | 2. You should have one or more PVC(s) (Persistent Volume Claim) connecting some persistent storage 22 | to your Run:ai jobs, typically your lab's shared storage. 23 | (E.g. `runai-claire-gaspar-scratch`, you can run `kubectl get pvc` to list them). 24 | 3. You should have access to a project on the [IC](https://ic-registry.epfl.ch/) or [RCP](https://registry.rcp.epfl.ch/) 25 | image registries 26 | and should be logged in to them (`docker login `). 27 | 28 | EPIC provides an introduction to these tools [here](https://epic-guide.github.io/tools/ic-compute-storage). 29 | We also have a guide at CLAIRE which you can get inspiration from 30 | [here](https://prickly-lip-484.notion.site/Compute-and-Storage-CLAIRE-91b4eddcc16c4a95a5ab32a83f3a8294#1402ae1961ac4b3e86a6a3ee2d8602aa). 31 | 32 | ## First steps 33 | 34 | ### Note about the examples 35 | 36 | The examples in this README were made with username `moalla` and lab-name `claire`. 37 | Adapt them accordingly to your username and lab name. 38 | Run 39 | ```bash 40 | ./template.sh get_runai_scripts 41 | ``` 42 | to get a copy of the examples in this guide with your username, lab name, etc. 43 | They will be in `.EPFL-runai-setup/submit-scripts`. 44 | 45 | ### Clone your repository in your PVC / shared storage 46 | 47 | We strongly suggest having two instances of your project repository on your PVCs. 48 | 49 | 1. One for development, which may have uncommitted changes, be in a broken state, etc. 50 | 2. 
One for running unattended jobs, which is always referring to a commit at a working state of the code. 51 | 52 | You can still have the outputs and data directories of those two instances shared. 53 | This can be done by creating symlinks between them, in the same the way you can read data from another PVC, 54 | say a shared PVC that has model weights, etc. All of this is described in the 55 | `data/README.md` and `outputs/README.md` files of the template and can be done later. 56 | 57 | Follow the steps below to clone your repository in your PVCs / shared storage. 58 | 59 | Typically the storage underlying your PVC is also mounted on a permanent machine that you can access. 60 | CLAIRE members can use the `claire-build-machine` for this to access `claire-rcp-scratch`. 61 | RCP also provides a shared jump host `haas001.rcp.epfl.ch` that mounts most lab's shared storage. 62 | 63 | Setup your SSH configuration so that your keys are forwarded during your ssh connection to machine 64 | so that you can clone your repository easily. 65 | For CLAIRE members you should have the `claire-build-machine` already setup. 66 | For other labs you can copy the config example below for `haas001.rcp.epfl.ch`. 67 | 68 | ```bash 69 | # You need three things for your ssh keys to be forwarded during a connection: 70 | # an ssh agent running on your local machine, 71 | # the key added to the agent, 72 | # and a configuration file saying that the agent should be used with connection. 73 | # GitHub provides a guide for that (look at the troubleshooting section too) 74 | # https://docs.github.com/en/authentication/connecting-to-github-with-ssh/using-ssh-agent-forwarding 75 | # and for the ssh config file you can use the following: 76 | Host rcp-haas 77 | HostName haas001.rcp.epfl.ch 78 | User YOUR-GASPAR 79 | ForwardAgent yes 80 | ``` 81 | 82 | SSH to the machine and clone your repository in your PVC / shared storage. 
83 | (Remember to push the changes you made on your local machine after initializing the template, 84 | to have the latest state of your repo.) 85 | ```bash 86 | # Somewhere in your PVC, say your personal directory there. 87 | mkdir template-project-name 88 | git clone template-project-name/dev 89 | git clone template-project-name/run 90 | ``` 91 | 92 | We also recommend that you make Git ignore the executable bit as the repo is moved across filesystems. 93 | You can do so by running `git config core.filemode false` in both repositories. 94 | 95 | ```bash 96 | cd template-project-name/dev && git config core.filemode false 97 | cd ../run && git config core.filemode false 98 | ``` 99 | 100 | ### A quick test to understand how the template works 101 | 102 | Adapt the `submit-scripts/minimal.sh` with the name of your image, your PVC, 103 | and the correct path to your project in the PVC. 104 | 105 | When the container starts, its entrypoint does the following: 106 | 107 | - It runs the entrypoint of the base image if you specified it in the `compose-base.yaml` file. 108 | - It expects you specify `PROJECT_ROOT_AT=` 109 | and to set `PROJECT_ROOT_AT` as the working directory of the container 110 | and installs the project found at `PROJECT_ROOT_AT` in editable mode. 111 | This is a lightweight installation that allows to avoid all the hacky import path manipulations. 112 | (You can skip this if you have a different project structure, 113 | e.g., 114 | just copied the installation directory of the template by not specifying `PROJECT_ROOT_AT`). 115 | - It also handles all the remote development setups (VS Code, Cursor, PyCharm, Jupyter, ...) 116 | that you specify with environment variables. 117 | These are described in the later sections of this README. 118 | - Finally, it executes a provided command (e.g. `sleep infinity`), otherwise by default will run a shell and stop. 
119 | It runs this command with PID 1 so that it can receive signals from the cluster and gracefully stop when preempted. 120 | You should not have to override the entrypoint, i.e., using `--command` flag with `runai submit` 121 | unless you are debugging the entrypoint itself. 122 | 123 | You need to make sure that this minimal submission works before proceeding. 124 | You can check the logs of the container with `runai logs example-minimal` to see if everything is working as expected. 125 | You should expect to see something like: 126 | 127 | ```text 128 | $ runai logs example-minimal 129 | ... 130 | [TEMPLATE INFO] PROJECT_ROOT_AT is set to /claire-rcp-scratch/home/moalla/template-project-name/dev. 131 | [TEMPLATE INFO] Expecting workdir to be /claire-rcp-scratch/home/moalla/template-project-name/dev. 132 | [TEMPLATE INFO] Installing the project with pip. 133 | [TEMPLATE INFO] Expecting /claire-rcp-scratch/home/moalla/template-project-name/dev to be a Python project. 134 | [TEMPLATE INFO] To skip this installation use the env variable SKIP_INSTALL_PROJECT=1. 135 | Obtaining file:///claire-rcp-scratch/home/moalla/template-project-name/dev 136 | Installing build dependencies: started 137 | ... 138 | Building editable for template-project-name (pyproject.toml): started 139 | ... 140 | Successfully built template-project-name 141 | Installing collected packages: template-project-name 142 | Successfully installed template-project-name-0.0.1 143 | [TEMPLATE INFO] Testing that the package can be imported. 144 | [TEMPLATE INFO] Package imported successfully. 
145 | [TEMPLATE INFO] Executing the command sleep infinity 146 | ```` 147 | 148 | You can then open a shell in the container and check that everything is working as expected: 149 | 150 | ```bash 151 | runai exec -it example-minimal zsh 152 | ``` 153 | 154 | If the entrypoint fails the installation of your project, you can resubmit your job with `-e SKIP_INSTALL_PROJECT=1` 155 | which will skip the installation step then you can replay the installation manually in the container to debug it. 156 | 157 | ## Use cases 158 | 159 | The basic configuration for the project's environment is now set up. 160 | You can follow the remaining sections below to see how to run unattended jobs and set up remote development. 161 | After that, return to the root README for the rest of the instructions to run our experiments. 162 | 163 | 164 | ### Running unattended jobs 165 | 166 | By performing the above first steps, you should have all the required setup to run unattended jobs. 167 | An example of an unattended job can be found in `submit-scripts/unattended.sh`. 168 | Note the emphasis on having a frozen copy `run` of the repository for running unattended jobs. 169 | 170 | 171 | ### Run:ai selectors 172 | 173 | Different clusters have different names for node pools and options to enable `sudo` usage etc. 174 | Refer to the `submit-scripts` for the main options, otherwise to the clusters' respective documentation. 175 | 176 | ### Weights&Biases 177 | 178 | Your W&B API key should be exposed as the `WANDB_API_KEY` environment variable. 179 | Run:ai doesn't support Kubernetes secrets yet, and you don't want to pass it as a clear environment variable 180 | (visible in the Run:ai dashboard), 181 | so an alternative is to have it in your PVC and pass it with the 182 | `-e WANDB_API_KEY_FILE_AT` environment variable in your `runai submit` command and let the template handle it. 183 | 184 | E.g., 185 | 186 | ```bash 187 | 188 | # In my PVC. 
echo <your-wandb-api-key> > /claire-rcp-scratch/home/moalla/.wandb-api-key
233 | 234 | When your container is up, run 235 | 236 | ```bash 237 | # Here 2222 on the local machine is forwarded to 2223 on the pod. 238 | # You can change the local port number to another port number. 239 | kubectl get pods 240 | kubectl port-forward 2222:2223 241 | ``` 242 | 243 | You can then ssh to your container by ssh-ing to that port on your local machine. 244 | Connect with the user and password you specified in your `.env` file when you built the image. 245 | 246 | ```bash 247 | # ssh to local machine is forwarded to the pod. 248 | ssh -p 2222 @localhost 249 | ``` 250 | 251 | As the container will each time be on a different machine, the ssh key for the remote server has to be reset or not stored.. 252 | This is done for you in the ssh config below. If you face issues you can reset the key with: 253 | 254 | ```bash 255 | ssh-keygen -R '[localhost]:2222' 256 | ``` 257 | 258 | With the ssh connection, you can forward the ssh keys on your local machine (that you use for GitHub, etc.) 259 | on the remote server. 260 | This allows using the ssh keys on the remote server without having to copy them there. 261 | (The alternative would be to have them as Kubernetes secrets, 262 | but Run:ai doesn't support that yet with its submit command.) 263 | 264 | For that, you need three things: an ssh agent running on your local machine, the key added to the agent, 265 | and a configuration file saying that the agent should be used with the Run:ai job. 
266 | GitHub provides a guide for that 267 | [here (look at the troubleshooting section too)](https://docs.github.com/en/authentication/connecting-to-github-with-ssh/using-ssh-agent-forwarding) 268 | and for the ssh config file you can use the following: 269 | 270 | ```bash 271 | Host local2222 272 | HostName 127.0.0.1 273 | User 274 | Port 2222 275 | StrictHostKeyChecking no 276 | UserKnownHostsFile=/dev/null 277 | ForwardAgent yes 278 | # If you open multiple projects at the same time, you can forward each of them to a different port. 279 | # And have two entries in your ssh config file. 280 | ``` 281 | 282 | The `StrictHostKeyChecking no` and `UserKnownHostsFile=/dev/null` allow bypass checking the identity 283 | of the host [(ref)](https://linuxcommando.blogspot.com/2008/10/how-to-disable-ssh-host-key-checking.html) 284 | which keeps changing every time a job is scheduled, 285 | so that you don't have to reset it each time. 286 | 287 | With this config you can then simply connect to your container with `ssh local2222` when the port 2222 is forwarded. 288 | 289 | **Limitations** 290 | 291 | Note that an ssh connection to the container is not like executing a shell on the container. 292 | In particular, the following limitations apply: 293 | 294 | - environment variables in the image sent to the entrypoint of the container and any command exec'ed in it 295 | are not available in ssh connections. 296 | There is a workaround for that in `entrypoints/remote-development-setup.sh` when opening an ssh server 297 | which should work for most cases, but you may still want to adapt it to your needs. 298 | 299 | ### Git config 300 | 301 | You can persist your Git config (username, email, etc.) by having it in your PVC and passing its location 302 | with the `GIT_CONFIG_AT` environment variable. 303 | 304 | E.g., create your config in your PVC with 305 | 306 | ```bash 307 | # In my PVC. 308 | cat >/claire-rcp-scratch/home/moalla/remote-development/gitconfig <`. 
377 | The link looks like: 378 | 379 | ```bash 380 | Gateway link: jetbrains-gateway://connect#idePath=%2Fclaire-rcp-scratch%2Fhome%2Fmoalla%2Fremote-development%2Fpycharm&projectPath=%2Fclaire-rcp-scratch%2Fhome%2Fmoalla%2Ftemplate-project-name%2Fdev&host=127.0.0.1&port=2223&user=moalla&type=ssh&deploy=false&newUi=true 381 | ``` 382 | Use it in Gateway to connect to the IDE. 383 | 384 | **Configuration**: 385 | 386 | * PyCharm's default terminal is bash. Change it to zsh in the Settings -> Tools -> Terminal. 387 | * When running Run/Debug configurations, set your working directory the project root (`$PROJECT_ROOT_AT`), not the script's directory. 388 | * Your interpreter will be 389 | * the system Python `/usr/bin/python` with the `from-python` option. 390 | * the Python in your conda environment with the `from-scratch` option, with the conda binary found at `/opt/conda/condabin/conda`. 391 | 392 | **Limitations:** 393 | 394 | - The terminal in PyCharm opens ssh connections to the container, 395 | so the workaround (and its limitations) in the ssh section apply. 396 | If needed, you could just open a separate terminal on your local machine 397 | and directly exec a shell into the container. 398 | - It's not clear which environment variables are passed to the programs run from the IDE like the debugger. 399 | So far, it seems like the SSH env variables workaround works fine for this. 400 | - Support for programs with graphical interfaces (i.g. forwarding their interface) has not been tested yet. 401 | 402 | ### VSCode / Cursor 403 | 404 | We support the [Remote Development using SSH ](https://code.visualstudio.com/docs/remote/ssh) 405 | feature of VS code that runs a remote IDE in the container via SSH. To set this up for Cursor, simply replace `VSCODE` by `CURSOR` and `vscode` by `cursor` in all instructions below. For example, `VSCODE_SERVER_AT` becomes `CURSOR_SERVER_AT`, and `~/.vscode-server` becomes `~/.cursor-server`. 
406 | 407 | 408 | **Preliminaries: saving the IDE configuration** 409 | 410 | The remote IDE stores its configuration (e.g., the extensions you set up) in `~/.vscode-server`. 411 | To have it preserved between different dev containers, you should specify the 412 | `VSCODE_SERVER_AT` env variable with your submit command 413 | as shown in the examples in `submit-scripts/remote-development.sh`. 414 | The template will use it to store the IDE configuration and cache in a separate directory 415 | per project (defined by its $PROJECT_ROOT_AT). 416 | All the directories will be created automatically. 417 | 418 | **ssh configuration** 419 | 420 | VS Code takes ssh configuration from files. 421 | Follow the steps in the [SSH configuration section](#ssh-configuration-necessary-for-pycharm-and-vs-code) 422 | to set up your ssh config file for runai jobs. 423 | 424 | **Connecting VS Code to the container**: 425 | 426 | 1. In your `runai submit` command, set the environment variables for 427 | - Opening an ssh server `SSH_SERVER=1`. 428 | - preserving your config `VSCODE_SERVER_AT`. 429 | 2. Enable port forwarding for the SSH connection. 430 | 3. Have the [Remote - SSH](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-ssh) 431 | extension on your local VS Code. 432 | 4. Connect to the ssh host following the 433 | steps [here](https://code.visualstudio.com/docs/remote/ssh#_connect-to-a-remote-host). 434 | 435 | The directory to add to your VS Code workspace should be the same as the one specified in the `PROJECT_ROOT_AT`. 436 | 437 | **Limitations** 438 | 439 | - The terminal in VS Code opens ssh connections to the container, 440 | so the workaround (and its limitations) in the ssh section apply. 441 | If needed, you could just open a separate terminal on your local machine 442 | and directly exec a shell into the container. 443 | - Support for programs with graphical interfaces (i.g. forwarding their interface) has not been tested yet. 
444 | 445 | ### JupyterLab 446 | 447 | If you have `jupyterlab` in your dependencies, then the template can open a Jupyter Lab server for you when 448 | the container starts. 449 | 450 | To do so, you need to: 451 | 452 | 1. Set the `JUPYTER_SERVER=1` environment variable in your `runai submit` command. 453 | You can find an example in `submit-scripts/remote-development.sh`. 454 | 455 | A Jupyter server will start running with your container. It will print a link to the container logs. 456 | 457 | Get the logs with `runai logs `. 458 | The link looks like: 459 | 460 | ```bash 461 | [C 2023-04-26 17:17:03.072 ServerApp] 462 | 463 | To access the server, open this file in a browser: 464 | ... 465 | Or copy and paste this URL: 466 | http://hostname:8887/?token=1098cadee3ac0c48e0b0a3bf012f8f06bb0d56a6cde7d128 467 | ``` 468 | 469 | 2. Forward the port `8887` on your local machine to the port `8887` on the container. 470 | ```bash 471 | kubectl port-forward 8887:8887 472 | ``` 473 | 474 | 3. Open the link in your browser, replacing `hostname` with `localhost`. 475 | 476 | **Note:** 477 | 478 | Development on Jupyter notebooks can be very useful, e.g., for quick iterations, plotting, etc., however, 479 | it can very easily facilitate bad practices, such as debugging with print statements, prevalence of global variables, 480 | relying on long-living kernel state, and hinder the reproducibility work. 481 | We strongly recommend using an IDE with a proper debugger for development, which would fill the need for quick 482 | iterations, and only use Jupyter notebooks for plotting results 483 | (where data is properly loaded from the output of a training script). 484 | 485 | **Limitations:** 486 | 487 | - We have limited usage of Jupyter so limitations are not known yet. 488 | 489 | ### Examples 490 | 491 | We provide examples of how to use the template in the `submit-scripts` directory. 
492 | We use `submit` commands and not YAML files to specify job configurations because the Run:ai API for kubernetes 493 | resources keeps changing and is not stable yet. 494 | 495 | ### Troubleshooting 496 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/EPFL-runai-setup/example-submit-scripts/minimal.sh: -------------------------------------------------------------------------------- 1 | runai submit \ 2 | --name example-minimal \ 3 | --interactive \ 4 | --image registry.rcp.epfl.ch/claire/moalla/template-project-name:amd64-cuda-moalla-latest \ 5 | --pvc runai-claire-moalla-scratch:/claire-rcp-scratch \ 6 | --working-dir /claire-rcp-scratch/home/moalla/template-project-name/dev \ 7 | -e PROJECT_ROOT_AT=/claire-rcp-scratch/home/moalla/template-project-name/dev \ 8 | -e PROJECT_NAME=template-project-name \ 9 | -e PACKAGE_NAME=template_package_name \ 10 | -g 1 --cpu 8 --cpu-limit 8 --memory 64G --memory-limit 64G \ 11 | -- sleep infinity 12 | 13 | ## Notes: 14 | # This is a minimal example of a working submission. 15 | # You can then attach a shell to this job with: runai exec -it example-minimal zsh 16 | 17 | # The important bits here are: 18 | # 1.The command to mount your pcv. 19 | # --pvc your_pvc_name:/where_to_mount_your_pvc (you can mount it anywhere) 20 | # 2.The environment variables that tell the entrypoint where to find your project. 21 | # -e PROJECT_ROOT_AT= . 22 | # 3.The working directory set to the PROJECT_ROOT_AT. 23 | # --working-dir same as PROJECT_ROOT_AT. 24 | 25 | ## Useful commands. 
26 | # runai describe job example-minimal 27 | # runai logs example-minimal 28 | # runai exec -it example-minimal zsh 29 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/EPFL-runai-setup/example-submit-scripts/remote-development.sh: -------------------------------------------------------------------------------- 1 | ## Go to the end of the file for useful commands and troubleshooting tips. 2 | 3 | # Minimal setup to just ssh into the container. 4 | # For additional options check the readme first, then use from below as examples. 5 | 6 | # For RCP use the --pvc claire-scratch:/claire-rcp-scratch 7 | # For IC use the runai-claire-moalla-scratch:/claire-rcp-scratch 8 | runai submit \ 9 | --name example-remote-development \ 10 | --interactive \ 11 | --image registry.rcp.epfl.ch/claire/moalla/template-project-name:amd64-cuda-moalla-latest \ 12 | --pvc runai-claire-moalla-scratch:/claire-rcp-scratch \ 13 | --working-dir /claire-rcp-scratch/home/moalla/template-project-name/dev \ 14 | -e PROJECT_ROOT_AT=/claire-rcp-scratch/home/moalla/template-project-name/dev \ 15 | -e PROJECT_NAME=template-project-name \ 16 | -e PACKAGE_NAME=template_package_name \ 17 | -e SSH_SERVER=1 \ 18 | --allow-privilege-escalation \ 19 | -g 1 --cpu 8 --cpu-limit 8 --memory 64G --memory-limit 64G --large-shm \ 20 | -- sleep infinity 21 | 22 | # To request more that the interactive quota add --preemptible to the submit command. 
23 | 24 | # To mount your gitconfig 25 | # -e GIT_CONFIG_AT=/claire-rcp-scratch/home/moalla/remote-development/gitconfig \ 26 | 27 | # For PyCharm 28 | # -e JETBRAINS_SERVER_AT=/claire-rcp-scratch/home/moalla/remote-development/jetbrains-server \ 29 | # -e PYCHARM_IDE_AT=e632f2156c14a_pycharm-professional-2024.1.4 \ 30 | 31 | # For VSCode 32 | # -e VSCODE_SERVER_AT=/claire-rcp-scratch/home/moalla/remote-development/vscode-server \ 33 | 34 | # For Jupyter Lab 35 | # -e JUPYTER_SERVER=1 \ 36 | 37 | # For W&B 38 | # -e WANDB_API_KEY_FILE_AT=/claire-rcp-scratch/home/moalla/.wandb-api-key \ 39 | 40 | # For HuggingFace 41 | # -e HF_TOKEN_AT=/claire-rcp-scratch/home/moalla/.hf-token \ 42 | # -e HF_HOME=/claire-rcp-scratch/home/moalla/huggingface \ 43 | 44 | 45 | ## Useful commands. 46 | # runai describe job example-remote-development 47 | # runai logs example-remote-development 48 | # kubectl port-forward example-remote-development-0-0 2222:2223 49 | # ssh runai 50 | # kubectl port-forward example-remote-development-0-0 8888:8888 51 | # runai logs example-remote-development 52 | # Get the link and paste it in your browser, replacing hostname with localhost. 53 | 54 | ## Troubleshooting. 55 | # When you add a new line for an environment variable or a GPU, etc., remember to add a \ at the end of the line. 56 | # ... \ 57 | # -e SOME_ENV_VAR=1 \ 58 | # -g 1 \ 59 | #... 60 | # -- sleep infinity 61 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/EPFL-runai-setup/example-submit-scripts/unattended.sh: -------------------------------------------------------------------------------- 1 | # Minimal setup to just ssh into the container. 2 | # For additional options check the readme first, then use from below as examples. 
3 | 4 | # For RCP use the --pvc claire-scratch:/claire-rcp-scratch 5 | # For IC use the runai-claire-moalla-scratch:/claire-rcp-scratch 6 | 7 | runai submit \ 8 | --name example-unattended \ 9 | --image registry.rcp.epfl.ch/claire/moalla/template-project-name:amd64-cuda-moalla-latest \ 10 | --pvc runai-claire-moalla-scratch:/claire-rcp-scratch \ 11 | --working-dir /claire-rcp-scratch/home/moalla/template-project-name/run \ 12 | -e PROJECT_ROOT_AT=/claire-rcp-scratch/home/moalla/template-project-name/run \ 13 | -e PROJECT_NAME=template-project-name \ 14 | -e PACKAGE_NAME=template_package_name \ 15 | -g 1 --cpu 8 --cpu-limit 8 --memory 64G --memory-limit 64G --large-shm \ 16 | -- python -m template_package_name.template_experiment some_arg=2 wandb.mode=offline 17 | 18 | # template_experiment is an actual script that you can run. 19 | # or -- zsh template_package_name/reproducibility-scripts/template-experiment.sh 20 | 21 | # For W&B 22 | # -e WANDB_API_KEY_FILE_AT=/claire-rcp-scratch/home/moalla/.wandb-api-key \ 23 | 24 | # For HuggingFace 25 | # -e HF_TOKEN_AT=/claire-rcp-scratch/home/moalla/.hf-token \ 26 | # -e HF_HOME=/claire-rcp-scratch/home/moalla/huggingface \ 27 | 28 | 29 | # To separate the dev state of the project from frozen checkouts to be used in unattended jobs you can observe that 30 | # we're pointing to the .../run instance of the repository on the PVC. 31 | # That would be a copy of the template-project-name repo frozen in a commit at a working state to be used in unattended jobs. 32 | # Otherwise while developing we would change the code that would be picked by newly scheduled jobs. 33 | 34 | # Useful commands. 
35 | # runai describe job example-unattended 36 | # runai logs example-unattended 37 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/LICENSE.cresset: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 이준형/李俊炯/Joonhyung Lee/(John Young Lee) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/apt.txt: -------------------------------------------------------------------------------- 1 | # `apt` runtime requirements file. 2 | # These dependencies are required to run your code. 3 | build-essential # Likely needed. 4 | ca-certificates # Likely needed. 5 | curl # Useful. 6 | git # Likely needed. 7 | htop # Useful. 8 | htop # Useful. 9 | netcat # Useful. 
10 | openssh-server # Required for remote development with most IDEs. 11 | openssl # Required. 12 | sudo # Required to open ssh server. 13 | tmux # Useful. 14 | tree # Useful. 15 | vim # Useful. 16 | wget # Useful. 17 | zsh # Required. 18 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/compose-base.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | build-args: 3 | build: 4 | args: 5 | # Pytorch 2.4.0a0+f70bd71a48, NVIDIA CUDA 12.5.0.23, Python 3.10. 6 | # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-06.html 7 | # https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch 8 | BASE_IMAGE: nvcr.io/nvidia/pytorch:24.07-py3 9 | GIT_IMAGE: docker.io/alpine/git:2.40.1 # https://hub.docker.com/r/alpine/git/tags 10 | # You can find the entrypoint by running `docker inspect BASE_IMAGE | grep -A 3 Entrypoint` 11 | # If there is no entrypoint, you can leave it empty. 12 | BASE_ENTRYPOINT: /opt/nvidia/nvidia_entrypoint.sh 13 | # 1 normally, 0 if the entrypoint does not exec its arguments, in rare cases. 14 | BASE_ENTRYPOINT_EXECS: 1 15 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/compose.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | image-root: 3 | extends: 4 | file: compose-base.yaml 5 | service: build-args 6 | image: ${IMAGE_NAME}:${IMAGE_PLATFORM}-root-latest 7 | build: 8 | platforms: 9 | - "linux/amd64" 10 | context: . 
11 | dockerfile: Dockerfile 12 | target: runtime-generic 13 | args: 14 | PROJECT_NAME: ${PROJECT_NAME} 15 | PACKAGE_NAME: ${PACKAGE_NAME} 16 | 17 | image-user: 18 | extends: 19 | service: image-root 20 | image: ${IMAGE_NAME}:${IMAGE_PLATFORM}-${USR}-latest 21 | build: 22 | dockerfile: Dockerfile-user 23 | target: runtime-user 24 | args: 25 | GENERIC_IMAGE: ${IMAGE_NAME} 26 | IMAGE_PLATFORM: ${IMAGE_PLATFORM} 27 | GRPID: ${GRPID} 28 | USRID: ${USRID} 29 | GRP: ${GRP} 30 | USR: ${USR} 31 | PASSWD: ${PASSWD} 32 | 33 | run-local-cpu: # Service to run the image locally with CPU only. 34 | extends: 35 | service: image-user 36 | tty: true 37 | stdin_open: true 38 | volumes: 39 | - ../..:${PROJECT_ROOT_AT} 40 | # Here you can mount other volumes and symlink directories in data and outputs to them. 41 | working_dir: ${PROJECT_ROOT_AT} 42 | environment: 43 | PROJECT_ROOT_AT: ${PROJECT_ROOT_AT} 44 | WANDB_API_KEY: ${WANDB_API_KEY} 45 | PROJECT_NAME: ${PROJECT_NAME} 46 | PACKAGE_NAME: ${PACKAGE_NAME} 47 | ipc: host # Edit as needed (NGC default recommendations, see /opt/nvidia/entrypoint.d/70-shm-check.sh). 48 | ulimits: # Edit as needed (NGC default recommendations, see /opt/nvidia/entrypoint.d/70-shm-check.sh). 49 | memlock: -1 50 | stack: 67108864 51 | network_mode: host # Edit as needed. Default to avoid extra complecity from networking. 52 | 53 | dev-local-cpu: # Service to develop locally with CPU only. 54 | extends: 55 | service: run-local-cpu 56 | volumes: 57 | # To persist IDE settings and cache. 
58 | - ${HOME}/.template-gitconfig:/home/${USR}/.gitconfig 59 | - ${HOME}/.template-dev-vscode-server:/home/${USR}/.dev-vscode-server 60 | - ${HOME}/.template-dev-cursor-server:/home/${USR}/.dev-cursor-server 61 | - ${HOME}/.template-dev-jetbrains-server:/home/${USR}/.jetbrains-server 62 | environment: 63 | PYCHARM_IDE_AT: ${PYCHARM_IDE_AT} 64 | JETBRAINS_SERVER_AT: /home/${USR}/.jetbrains-server 65 | VSCODE_SERVER_AT: /home/${USR}/.dev-vscode-server 66 | CURSOR_SERVER_AT: /home/${USR}/.dev-cursor-server 67 | 68 | run-local-cuda: # Service to run the image locally with NVIDIA GPU. 69 | extends: 70 | service: run-local-cpu 71 | deploy: 72 | resources: 73 | reservations: 74 | devices: 75 | - driver: nvidia 76 | capabilities: [ gpu ] 77 | 78 | dev-local-cuda: # Service to develop locally with NVIDIA GPU. 79 | extends: 80 | service: dev-local-cpu 81 | deploy: 82 | resources: 83 | reservations: 84 | devices: 85 | - driver: nvidia 86 | capabilities: [ gpu ] 87 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/entrypoints/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Halt in case of errors. https://gist.github.com/vncsna/64825d5609c146e80de8b1fd623011ca 3 | set -eo pipefail 4 | echo "[TEMPLATE INFO] Running entrypoint.sh" 5 | 6 | # Check that the PROJECT_ROOT_AT is set. 7 | if [ -z "${PROJECT_ROOT_AT}" ]; then 8 | echo "[TEMPLATE WARNING] PROJECT_ROOT_AT is not set." 9 | echo "[TEMPLATE WARNING] It is expected to point to the location of your mounted project if you plan to run you code." 10 | echo "[TEMPLATE WARNING] Ignore if you only need the development environment." 11 | echo "[TEMPLATE WARNING] PROJECT_ROOT_AT has been defaulted to $(pwd)" 12 | echo "[TEMPLATE WARNING] The project installation will be skipped." 
13 | export PROJECT_ROOT_AT="$(pwd)" 14 | export SKIP_INSTALL_PROJECT=1 15 | else 16 | echo "[TEMPLATE INFO] PROJECT_ROOT_AT is set to ${PROJECT_ROOT_AT}." 17 | fi 18 | echo "[TEMPLATE INFO] Expecting workdir to be ${PROJECT_ROOT_AT}." 19 | 20 | if [ "$(pwd)" != "${PROJECT_ROOT_AT}" ]; then 21 | echo "[TEMPLATE WARNING] The current/working directory $(pwd) is different from PROJECT_ROOT_AT." 22 | echo "[TEMPLATE WARNING] The template expects them to be the same, as it assumes running the experiments from PROJECT_ROOT_AT." 23 | fi 24 | 25 | # Install the package in editable mode. 26 | # Also ensures the code is mounted correctly. 27 | # Because setting the Python path to the project may not be enough. 28 | # https://pip.pypa.io/en/stable/topics/local-project-installs/#editable-installs 29 | if [ -n "${SKIP_INSTALL_PROJECT}" ]; then 30 | # For debugging or other purposes. 31 | # Best practice is to install the project. 32 | echo "[TEMPLATE INFO] Skipping the installation of the project." 33 | else 34 | echo "[TEMPLATE INFO] Installing the project with pip." 35 | echo "[TEMPLATE INFO] Expecting ${PROJECT_ROOT_AT} to be a Python project." 36 | echo "[TEMPLATE INFO] To skip this installation use the env variable SKIP_INSTALL_PROJECT=1." 37 | # The path is relative on purpose. 38 | pip install --user --no-build-isolation -e "${PROJECT_ROOT_AT}" 39 | # Test that the package can be imported. 40 | echo "[TEMPLATE INFO] Testing that the package can be imported." 41 | python -c "import ${PACKAGE_NAME}" 42 | echo "[TEMPLATE INFO] Package imported successfully." 43 | fi 44 | 45 | # Login options, e.g., wandb. 46 | # Doesn't do anything if no option provided. 47 | source "${ENTRYPOINTS_ROOT}"/logins-setup.sh 48 | 49 | # Remote development options (e.g., PyCharm or VS Code configuration, Jupyter etc). 50 | # Doesn't do anything if no option provided. 51 | # Only do them once for SLURM.
52 | if [ -n "${SLURM_ONE_REMOTE_DEV}" ] && [ "${SLURM_PROCID}" -gt 0 ]; then 53 | echo "[TEMPLATE INFO] Running the remote development entrypoint only once." 54 | echo "[TEMPLATE INFO] Skipping remote development setup on SLURM_PROCID ${SLURM_PROCID}." 55 | else 56 | source "${ENTRYPOINTS_ROOT}"/remote-development-setup.sh 57 | fi 58 | 59 | # Exec so that the child process receives the OS signals. 60 | # E.g., signals that the container will be preempted. 61 | # It will be PID 1. 62 | echo "[TEMPLATE INFO] Executing the command" "$@" 63 | exec "$@" 64 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/entrypoints/logins-setup.sh: -------------------------------------------------------------------------------- 1 | # W&B login. 2 | 3 | # This does not need an internet connection. 4 | # OPTION 1: Set WANDB_API_KEY in the environment. 5 | if [ -n "${WANDB_API_KEY}" ]; then 6 | echo "[TEMPLATE INFO] Logging in to W&B." 7 | wandb login "${WANDB_API_KEY}" 8 | fi 9 | # OPTION 2: Set WANDB_API_KEY_FILE_AT in the environment which points to a file containing the key. 10 | if [ -n "${WANDB_API_KEY_FILE_AT}" ]; then 11 | echo "[TEMPLATE INFO] Logging in to W&B." 12 | wandb login "$(cat "${WANDB_API_KEY_FILE_AT}")" 13 | fi 14 | 15 | # Hugging Face login. 16 | if [ -n "${HF_TOKEN_AT}" ]; then 17 | echo "[TEMPLATE INFO] Logging in to Hugging Face." 18 | huggingface-cli login --token "$(cat "${HF_TOKEN_AT}")" 19 | fi 20 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/entrypoints/pre-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # The base entrypoint (from the base image) should exec the command it receives otherwise this will break 4 | # the signal handling. 
5 | # (Otherwise, you should source it, assuming then run with the same shell, then exec /opt/template-entrypoints/entrypoint.sh.) 6 | # In the end all variables exported should be present and the command given by the user should run with PID 1. 7 | 8 | # In distributed jobs the number of times the entrypoint is run should match the number of containers created. 9 | # On Slurm, for example, with Pyxis a single container is created per node, 10 | # and if the entrypoint is called manually after srun, it will run multiple times in the same container (ntasks-per-node) 11 | # so we can skip it with the following variables: 12 | 13 | # If nodes share the same container: 14 | if [ -n "${SLURM_ONE_ENTRYPOINT_SCRIPT_PER_JOB}" ] && [ "${SLURM_PROCID}" -gt 0 ]; then 15 | echo "[TEMPLATE INFO] Running the entrypoint only once for the job." 16 | echo "[TEMPLATE INFO] Skipping entrypoints on SLURM_PROCID ${SLURM_PROCID}." 17 | echo "[TEMPLATE INFO] Executing the command" "$@" 18 | exec "$@" 19 | fi 20 | # If tasks on the same node share the same container: 21 | if [ -n "${SLURM_ONE_ENTRYPOINT_SCRIPT_PER_NODE}" ] && [ "${SLURM_LOCALID}" -gt 0 ]; then 22 | echo "[TEMPLATE INFO] Running the entrypoint once per node." 23 | echo "[TEMPLATE INFO] Skipping entrypoints on SLURM_PROCID ${SLURM_PROCID}." 24 | echo "[TEMPLATE INFO] Executing the command" "$@" 25 | exec "$@" 26 | fi 27 | 28 | # Continue with the entrypoint script. 29 | if [ -n "${SLURM_PROCID}" ]; then 30 | echo "[TEMPLATE INFO] Running the pre-entrypoint.sh for SLURM_PROCID ${SLURM_PROCID}, SLURM_LOCALID ${SLURM_LOCALID}, hostname $(hostname)." 31 | fi 32 | 33 | # Do this if the entrypoint execs the command it receives (every entrypoint should do this). 34 | if [ -n "${BASE_ENTRYPOINT_EXECS}" ] && [ "${BASE_ENTRYPOINT_EXECS}" -eq 1 ] && [ -n "${BASE_ENTRYPOINT}" ]; then 35 | echo "[TEMPLATE INFO] execing the base image's entrypoint ${BASE_ENTRYPOINT} which will then exec the template's entrypoint."
36 | exec "${BASE_ENTRYPOINT}" /opt/template-entrypoints/entrypoint.sh "$@" 37 | else 38 | if [ -n "${BASE_ENTRYPOINT}" ]; then 39 | echo "[TEMPLATE INFO] Sourcing the base image's entrypoint ${BASE_ENTRYPOINT} then execing the template's entrypoint." 40 | source "${BASE_ENTRYPOINT}" || { echo "Failed to source ${BASE_ENTRYPOINT}"; exit 1; } 41 | exec /opt/template-entrypoints/entrypoint.sh "$@" 42 | else 43 | echo "[TEMPLATE INFO] Execing the template's entrypoint." 44 | exec /opt/template-entrypoints/entrypoint.sh "$@" 45 | fi 46 | fi 47 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/entrypoints/remote-development-setup.sh: -------------------------------------------------------------------------------- 1 | #################### 2 | # Git config. 3 | # Workaround using symlinks when clusters do not allow to mount specific directories or files. 4 | 5 | if [ -n "${GIT_CONFIG_AT}" ]; then 6 | mkdir -p $(dirname "${GIT_CONFIG_AT}") 7 | touch "${GIT_CONFIG_AT}" 8 | ln -s "${GIT_CONFIG_AT}" "${HOME}/.gitconfig" 9 | echo "[TEMPLATE INFO] Sym-linked Git config to ${GIT_CONFIG_AT}." 10 | fi 11 | 12 | #################### 13 | # Open ssh server. 14 | 15 | if [ -n "${SSH_SERVER}" ] || [ -n "${SOURCE_ENV_FOR_SSH}" ];then 16 | # Export environment variables lost through ssh connection. 17 | # (Assumes a single user). 18 | # SSH connections don't have the environment variables, so we need to set them. 19 | # Export all the env variables except the ones specific to the current shell. 20 | # Not sure if this is the best way to do it. 21 | env | grep -v -E '^(BASH|SHLVL|PWD|OLDPWD|SHELL|LOGNAME|_| |\}|\{)' |\ 22 | sed -E 's/=(.*)/="\1"/' | sed 's/^/export /' > "${HOME}"/.container-env-vars 23 | # Export to login shells. 
24 | echo "source ${HOME}/.container-env-vars" >> "${HOME}/.bash_profile" 25 | echo "source ${HOME}/.container-env-vars" >> "${HOME}/.zprofile" 26 | echo "[TEMPLATE INFO] Environment variables have been written to ${HOME}/.container-env-vars." 27 | echo "[TEMPLATE INFO] And will be sourced in login shells to preserve environment variables in ssh connections." 28 | echo "[TEMPLATE INFO] If you change one at runtime and want it to be preserved in subsequent shell invocations, you need to write it to ${HOME}/.container-env-vars as well." 29 | fi 30 | 31 | 32 | if [ -n "${SSH_SERVER}" ]; then 33 | # Configuration for ssh server. 34 | # This could be done without sudo if needed. 35 | # check if user is not root 36 | echo "[TEMPLATE INFO] Configuring ssh server on port ${SSH_CONTAINER_PORT:-2223}." 37 | if [ "${EUID}" -eq 0 ] || [ -n "${NO_SUDO_NEEDED}" ]; then 38 | mkdir /var/run/sshd 39 | sed -i 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' /etc/pam.d/sshd 40 | # Change the default port to ${SSH_CONTAINER_PORT}. 41 | sed -i "s/#Port 22/Port ${SSH_CONTAINER_PORT:-2223}/" /etc/ssh/sshd_config 42 | else 43 | echo "${PASSWD}" | sudo -S mkdir /var/run/sshd 44 | echo "${PASSWD}" | sudo -S sed -i \ 45 | 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' /etc/pam.d/sshd 46 | # Change the default port to ${SSH_CONTAINER_PORT}. 47 | echo "${PASSWD}" | sudo -S sed -i "s/#Port 22/Port ${SSH_CONTAINER_PORT:-2223}/" /etc/ssh/sshd_config 48 | fi 49 | 50 | echo "[TEMPLATE INFO] Starting ssh server on port ${SSH_CONTAINER_PORT:-2223}." 51 | # This runs in background, so the script will continue. 52 | if [ "${EUID}" -eq 0 ] || [ -n "${NO_SUDO_NEEDED}" ]; then 53 | /usr/sbin/sshd 54 | else 55 | echo "${PASSWD}" | sudo -S /usr/sbin/sshd 56 | fi 57 | 58 | # Make login shells cd to the project root.
59 | echo "cd ${PROJECT_ROOT_AT}" >> "${HOME}/.bash_profile" 60 | echo "cd ${PROJECT_ROOT_AT}" >> "${HOME}/.zprofile" 61 | fi 62 | 63 | #################### 64 | ## PyCharm remote development server. 65 | # You can set the env variable JETBRAINS_SERVER_AT to persist your JetBrains configuration and cache. 66 | # You can set the env variable PYCHARM_IDE_AT to the location of the PyCharm binaries in your mounted storage. 67 | 68 | # Workaround using symlinks when clusters do not allow to mount specific directories or files. 69 | if [ -n "${JETBRAINS_SERVER_AT}" ]; then 70 | echo "[TEMPLATE INFO] Sym-linking to PyCharm project config files." 71 | # Per-project server. 72 | # Create if doesn't exist. 73 | PROJECT_JETBRAINS_SERVER_AT="${JETBRAINS_SERVER_AT}/projects/${PROJECT_ROOT_AT}" 74 | mkdir -p "${JETBRAINS_SERVER_AT}"/dist 75 | mkdir -p "${PROJECT_JETBRAINS_SERVER_AT}/config" 76 | mkdir -p "${PROJECT_JETBRAINS_SERVER_AT}/local" 77 | mkdir -p "${PROJECT_JETBRAINS_SERVER_AT}/cache" 78 | mkdir -p "${HOME}/.config" 79 | mkdir -p "${HOME}/.local/share" 80 | mkdir -p "${HOME}/.cache" 81 | ln -s "${PROJECT_JETBRAINS_SERVER_AT}/config" "${HOME}/.config/JetBrains" 82 | ln -s "${PROJECT_JETBRAINS_SERVER_AT}/local" "${HOME}/.local/share/JetBrains" 83 | ln -s "${PROJECT_JETBRAINS_SERVER_AT}/cache" "${HOME}/.cache/JetBrains" 84 | fi 85 | 86 | if [ -n "${PYCHARM_IDE_AT}" ]; then 87 | # Check if directory exists. 88 | if [ ! -d "${JETBRAINS_SERVER_AT}/dist/${PYCHARM_IDE_AT}" ]; then 89 | echo "[TEMPLATE WARNING] The PyCharm IDE directory ${JETBRAINS_SERVER_AT}/dist/${PYCHARM_IDE_AT} does not exist." 90 | echo "[TEMPLATE WARNING] The IDE will not be started. This is okay if you're installing an IDE manually." 91 | else 92 | echo "[TEMPLATE INFO] Starting PyCharm remote development server." 
93 | REMOTE_DEV_NON_INTERACTIVE=1 \ 94 | "${JETBRAINS_SERVER_AT}/dist/${PYCHARM_IDE_AT}/bin/remote-dev-server.sh" run "${PROJECT_ROOT_AT}" \ 95 | --ssh-link-host 127.0.0.1 \ 96 | --ssh-link-user "${USER:-$(id -un)}" \ 97 | --ssh-link-port "${SSH_FORWARD_PORT:-2223}" & 98 | fi 99 | fi 100 | 101 | #################### 102 | ## VS Code remote development server. 103 | # Workaround using symlinks when clusters do not allow to mount specific directories or files. 104 | 105 | if [ -n "${VSCODE_SERVER_AT}" ]; then 106 | echo "[TEMPLATE INFO] Sym-linking to VSCode server config files." 107 | # Per-project server. 108 | # Create if doesn't exist. 109 | PROJECT_VSCODE_SERVER_AT="${VSCODE_SERVER_AT}/projects${PROJECT_ROOT_AT}" 110 | mkdir -p "${PROJECT_VSCODE_SERVER_AT}" 111 | ln -s "${PROJECT_VSCODE_SERVER_AT}" "${HOME}/.vscode-server" 112 | fi 113 | 114 | #################### 115 | ## Cursor remote development server. 116 | # Same as VSCode up to naming 117 | 118 | if [ -n "${CURSOR_SERVER_AT}" ]; then 119 | echo "[TEMPLATE INFO] Sym-linking to Cursor server config files." 120 | # Per-project server. 121 | # Create if doesn't exist. 122 | PROJECT_CURSOR_SERVER_AT="${CURSOR_SERVER_AT}/projects${PROJECT_ROOT_AT}" 123 | mkdir -p "${PROJECT_CURSOR_SERVER_AT}" 124 | ln -s "${PROJECT_CURSOR_SERVER_AT}" "${HOME}/.cursor-server" 125 | fi 126 | 127 | ##################### 128 | # Jupyter Lab server. 129 | # Jupyter must be installed in the environment. 130 | 131 | if [ -n "${JUPYTER_SERVER}" ]; then 132 | echo "[TEMPLATE INFO] Starting Jupyter Lab server." 133 | # Workaround to open zsh. 
134 | SHELL=zsh \ 135 | jupyter-lab --no-browser --port="${JUPYTER_PORT:-8887}" --notebook-dir="${PROJECT_ROOT_AT}" & 136 | fi 137 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/from-python-template/Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax = docker/dockerfile:1 2 | 3 | # BASE_IMAGE is the image that will be extended by this Dockerfile. 4 | # It is assumed to be a well-configured Python installation. 5 | # The remaining packages will be installed with pip. 6 | ARG BASE_IMAGE 7 | ARG GIT_IMAGE 8 | 9 | ######################################################################## 10 | # Install apt packages. 11 | 12 | FROM ${BASE_IMAGE} AS runtime-apt-pkgs 13 | 14 | # A directory to record all the dependency files used at multiple stages. 15 | # This is useful for a later inspection or debugging. 16 | ENV DEPENDENCIES_DIR=/opt/template-dependencies 17 | RUN mkdir ${DEPENDENCIES_DIR} 18 | COPY apt.txt ${DEPENDENCIES_DIR}/apt.txt 19 | 20 | # Enable caching for `apt` packages in Docker. 21 | # https://docs.docker.com/engine/reference/builder/#run---mounttypecache 22 | RUN rm -f /etc/apt/apt.conf.d/docker-clean; \ 23 | echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > \ 24 | /etc/apt/apt.conf.d/keep-cache 25 | 26 | ARG DEBIAN_FRONTEND=noninteractive 27 | # sed is only used as a hack to remove comments from the file apt.txt. 28 | RUN --mount=type=cache,target=/var/cache/apt,sharing=private \ 29 | --mount=type=cache,target=/var/lib/apt,sharing=private \ 30 | apt update && \ 31 | sed -e 's/#.*//g' -e 's/\r//g' ${DEPENDENCIES_DIR}/apt.txt | \ 32 | xargs -t apt-get install -y --no-install-recommends && \ 33 | rm -rf /var/lib/apt/lists/* 34 | 35 | # Podman: Comment the above and use this instead with podman as it doesn't support sharing mount modes.
36 | #RUN --mount=type=cache,target=/var/cache/apt \ 37 | # --mount=type=cache,target=/var/lib/apt \ 38 | # apt update && \ 39 | # sed -e 's/#.*//g' -e 's/\r//g' ${DEPENDENCIES_DIR}/apt.txt | \ 40 | # xargs -t apt-get install -y --no-install-recommends && \ 41 | # rm -rf /var/lib/apt/lists/* 42 | 43 | ######################################################################## 44 | # Install dependencies. 45 | 46 | FROM runtime-apt-pkgs AS runtime-deps 47 | 48 | # Install pip packages. 49 | ENV PIP_CACHE_DIR=/root/.cache/pip 50 | RUN pip freeze > ${DEPENDENCIES_DIR}/requirements-freeze-before-pip-install.txt 51 | RUN pip list --format freeze > ${DEPENDENCIES_DIR}/requirements-list-before-pip-install.txt 52 | COPY requirements.txt ${DEPENDENCIES_DIR}/requirements.txt 53 | RUN --mount=type=cache,target=${PIP_CACHE_DIR},sharing=private \ 54 | pip install -r ${DEPENDENCIES_DIR}/requirements.txt 55 | # Podman: Comment the above and use this instead with podman as it doesn't support sharing mount modes. 56 | #RUN --mount=type=cache,target=${PIP_CACHE_DIR} \ 57 | # pip install -r ${DEPENDENCIES_DIR}/requirements.txt 58 | RUN pip freeze > ${DEPENDENCIES_DIR}/requirements-freeze-after-pip-install.txt 59 | RUN pip list --format freeze > ${DEPENDENCIES_DIR}/requirements-list-after-pip-install.txt 60 | 61 | # For reproducible requirements use the following after getting the requirements-freeze.txt file from the first build. 62 | #COPY requirements-freeze.txt ${DEPENDENCIES_DIR}/requirements-freeze.txt 63 | #RUN --mount=type=cache,target=${PIP_CACHE_DIR},sharing=private \ 64 | # pip install --no-deps -r ${DEPENDENCIES_DIR}/requirements-freeze.txt 65 | # For podman 66 | #RUN --mount=type=cache,target=${PIP_CACHE_DIR} \ 67 | # pip install --no-deps -r ${DEPENDENCIES_DIR}/requirements-freeze.txt 68 | 69 | # Optional optimizations. 70 | # Hack to enable Intel MKL optimizations on AMD CPUs. 
71 | # https://danieldk.eu/Posts/2020-08-31-MKL-Zen.html 72 | ARG FAKEINTEL_PATH=/opt/fakeintel/libfakeintel.so 73 | ENV FAKEINTEL_PATH=${FAKEINTEL_PATH} 74 | # https://danieldk.eu/Posts/2020-08-31-MKL-Zen.html 75 | # Build. 76 | RUN echo 'int mkl_serv_intel_cpu_true() {return 1;}' > /tmp/fakeintel.c && \ 77 | mkdir -p /opt/fakeintel && \ 78 | gcc -shared -fPIC -o ${FAKEINTEL_PATH} /tmp/fakeintel.c 79 | # Enable. 80 | ENV LD_PRELOAD=${FAKEINTEL_PATH}:${LD_PRELOAD} 81 | 82 | ######################################################################## 83 | # Here you can install other software 84 | 85 | 86 | ######################################################################## 87 | # Download Z-Shell enhancements. 88 | 89 | FROM ${GIT_IMAGE} AS get-pure 90 | 91 | ARG PURE_URL=https://github.com/sindresorhus/pure.git 92 | ARG ZSHA_URL=https://github.com/zsh-users/zsh-autosuggestions.git 93 | ARG ZSHS_URL=https://github.com/zsh-users/zsh-syntax-highlighting.git 94 | 95 | RUN git clone --depth 1 ${PURE_URL} /opt/zsh/pure 96 | RUN git clone --depth 1 ${ZSHA_URL} /opt/zsh/zsh-autosuggestions 97 | RUN git clone --depth 1 ${ZSHS_URL} /opt/zsh/zsh-syntax-highlighting 98 | 99 | ######################################################################## 100 | # This stage is the final user-agnostic (generic) stage. 101 | # This layer can be distributed so that subsequent users 102 | 103 | FROM runtime-deps AS runtime-generic 104 | 105 | ENV HYDRA_FULL_ERROR=1 106 | 107 | # A final record of the dependencies from pip freeze. 108 | RUN pip freeze > ${DEPENDENCIES_DIR}/requirements-freeze-final.txt 109 | RUN pip list --format freeze > ${DEPENDENCIES_DIR}/requirements-list-final.txt 110 | 111 | # Shell configuration. 
112 | ENV ZSH_ENHANCE_DIR=/etc/zsh/enhance 113 | ARG PURE_PATH=${ZSH_ENHANCE_DIR}/pure 114 | ARG ZSHA_PATH=${ZSH_ENHANCE_DIR}/zsh-autosuggestions 115 | ARG ZSHS_PATH=${ZSH_ENHANCE_DIR}/zsh-syntax-highlighting 116 | COPY --from=get-pure /opt/zsh/pure ${PURE_PATH} 117 | COPY --from=get-pure /opt/zsh/zsh-autosuggestions ${ZSHA_PATH} 118 | COPY --from=get-pure /opt/zsh/zsh-syntax-highlighting ${ZSHS_PATH} 119 | RUN { echo "fpath+=${PURE_PATH}"; \ 120 | echo "autoload -Uz promptinit; promptinit"; \ 121 | echo "prompt pure"; \ 122 | echo "source ${ZSHA_PATH}/zsh-autosuggestions.zsh"; \ 123 | echo "source ${ZSHS_PATH}/zsh-syntax-highlighting.zsh"; \ 124 | echo "alias ls='ls --color=auto'"; \ 125 | echo "alias ll='ls -lh'"; \ 126 | echo "alias update-env-file='source \${PROJECT_ROOT_AT}/installation/docker-amd64-cuda/update-env-file.sh'"; \ 127 | } >> /etc/zsh/zshrc 128 | 129 | 130 | # Entrypoints. 131 | # Don't overwrite the entrypoint, it is installing the project 132 | # and testing that you correctly mounted the project code. 133 | # It also performs some other important setup depending on the deployment platform. 134 | ARG BASE_ENTRYPOINT 135 | ARG BASE_ENTRYPOINT_EXECS 136 | ENV BASE_ENTRYPOINT=${BASE_ENTRYPOINT} 137 | ENV BASE_ENTRYPOINT_EXECS=${BASE_ENTRYPOINT_EXECS} 138 | ENV ENTRYPOINTS_ROOT=/opt/template-entrypoints 139 | COPY entrypoints ${ENTRYPOINTS_ROOT} 140 | ENTRYPOINT ["/opt/template-entrypoints/pre-entrypoint.sh"] 141 | CMD ["/bin/zsh"] 142 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/from-python-template/compose-base.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | build-args: 3 | build: 4 | args: 5 | # Pytorch 2.4.0a0+f70bd71a48, NVIDIA CUDA 12.5.0.23, Python 3.10. 
6 | # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-06.html 7 | # https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch 8 | BASE_IMAGE: nvcr.io/nvidia/pytorch:24.07-py3 9 | GIT_IMAGE: docker.io/alpine/git:2.40.1 # https://hub.docker.com/r/alpine/git/tags 10 | # You can find the entrypoint by running `docker inspect BASE_IMAGE | grep -A 3 Entrypoint` 11 | # If there is no entrypoint, you can leave it empty. 12 | BASE_ENTRYPOINT: /opt/nvidia/nvidia_entrypoint.sh 13 | # 1 normally, 0 if the entrypoint does not exec its arguments, in rare cases. 14 | BASE_ENTRYPOINT_EXECS: 1 15 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/from-python-template/requirements.txt: -------------------------------------------------------------------------------- 1 | hatchling # To build the package without isolation in edit mode for faster startup. 2 | editables # Same as above. 3 | hydra-core 4 | tqdm 5 | wandb 6 | pre-commit 7 | black 8 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/from-python-template/update-env-file.sh: -------------------------------------------------------------------------------- 1 | # Records the current environment to a file. 2 | # Packages installed from GitHub with pip install will not be recorded 3 | # properly (i.e. the link will be omitted and just replaced with the version). 4 | # In that case, you have to update this file to add commands that 5 | # will fix the environment file. (you could also just edit it manually afterwards). 
6 | 7 | ENV_FILE="${PROJECT_ROOT_AT}"/installation/docker-amd64-cuda/requirements-freeze.txt 8 | pip list --exclude-editable --format freeze > "${ENV_FILE}" 9 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/from-scratch-template/Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax = docker/dockerfile:1 2 | 3 | # CURL_IMAGE is used to download from the internet in independent layers. 4 | # GIT_IMAGE is used to clone git repositories in independent layers. 5 | # BASE_IMAGE is the base image for the project, likely the Ubuntu image. 6 | ARG CURL_IMAGE 7 | ARG GIT_IMAGE 8 | ARG BASE_IMAGE 9 | 10 | ######################################################################## 11 | # Download conda. 12 | 13 | FROM ${CURL_IMAGE} AS get-conda 14 | ARG CONDA_URL 15 | RUN mkdir /tmp/conda && \ 16 | curl -fvL -o /tmp/conda/miniconda.sh ${CONDA_URL} 17 | 18 | ######################################################################## 19 | # Install conda. 20 | 21 | FROM ${BASE_IMAGE} AS install-conda 22 | 23 | ARG CONDA_INSTALL_PATH 24 | RUN --mount=type=bind,from=get-conda,source=/tmp/conda,target=/tmp/conda \ 25 | /bin/bash /tmp/conda/miniconda.sh -b -p ${CONDA_INSTALL_PATH} 26 | 27 | ######################################################################## 28 | # Install apt packages. 29 | 30 | FROM ${BASE_IMAGE} AS runtime-apt-pkgs 31 | 32 | # A directory to record all the dependency files used at multiple stages. 33 | # This is useful for a later inspection or debugging. 34 | ENV DEPENDENCIES_DIR=/opt/template-dependencies 35 | RUN mkdir ${DEPENDENCIES_DIR} 36 | COPY apt.txt ${DEPENDENCIES_DIR}/apt.txt 37 | 38 | # Enable caching for `apt` packages in Docker. 
39 | # https://docs.docker.com/engine/reference/builder/#run---mounttypecache 40 | RUN rm -f /etc/apt/apt.conf.d/docker-clean; \ 41 | echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > \ 42 | /etc/apt/apt.conf.d/keep-cache 43 | 44 | ARG DEBIAN_FRONTEND=noninteractive 45 | # sed is only used as a hack to remove comments from the file apt.txt. 46 | RUN --mount=type=cache,target=/var/cache/apt,sharing=private \ 47 | --mount=type=cache,target=/var/lib/apt,sharing=private \ 48 | apt update && \ 49 | sed -e 's/#.*//g' -e 's/\r//g' ${DEPENDENCIES_DIR}/apt.txt | \ 50 | xargs -t apt-get install -y --no-install-recommends && \ 51 | rm -rf /var/lib/apt/lists/* 52 | 53 | ######################################################################## 54 | # Install dependencies. 55 | 56 | FROM runtime-apt-pkgs AS runtime-deps 57 | 58 | ARG PROJECT_NAME 59 | ENV PYTHONDONTWRITEBYTECODE=1 60 | ENV PYTHONUNBUFFERED=1 61 | ENV PIP_CACHE_DIR=/root/.cache/pip 62 | ARG CONDA_INSTALL_PATH 63 | ENV CONDA_INSTALL_PATH=${CONDA_INSTALL_PATH} 64 | ENV CONDA_CACHE_PKGS_DIRS=${CONDA_INSTALL_PATH}/pkgs 65 | ENV PATH=${CONDA_INSTALL_PATH}/condabin:${PATH} 66 | 67 | COPY --link --from=install-conda ${CONDA_INSTALL_PATH} ${CONDA_INSTALL_PATH} 68 | COPY environment.yml ${DEPENDENCIES_DIR}/environment.yml 69 | RUN --mount=type=cache,target=${PIP_CACHE_DIR},sharing=private \ 70 | --mount=type=cache,target=${CONDA_CACHE_PKGS_DIRS},sharing=private \ 71 | mamba env create --file ${DEPENDENCIES_DIR}/environment.yml 72 | 73 | # Record the dependency file after conda install which may be useful. 74 | RUN mamba env export -n ${PROJECT_NAME} > ${DEPENDENCIES_DIR}/environment-mamba-after-env-create.yml 75 | 76 | # Cleaning must be in a separate `RUN` command to preserve the Docker cache. 77 | RUN mamba clean -fya && \ 78 | find ${CONDA_INSTALL_PATH}/envs/${PROJECT_NAME} -name '__pycache__' | xargs rm -rf 79 | 80 | # Make the conda env writeable by anyone so that it can be updated by a user. 
81 | RUN chmod -R 777 ${CONDA_INSTALL_PATH} 82 | 83 | # Add conda env to path. 84 | ENV PATH=${CONDA_INSTALL_PATH}/envs/${PROJECT_NAME}/bin:${PATH} 85 | 86 | # Optional optimizations. 87 | # Hack to enable Intel MKL optimizations on AMD CPUs. 88 | # https://danieldk.eu/Posts/2020-08-31-MKL-Zen.html 89 | ARG FAKEINTEL_PATH=/opt/fakeintel/libfakeintel.so 90 | ENV FAKEINTEL_PATH=${FAKEINTEL_PATH} 91 | # https://danieldk.eu/Posts/2020-08-31-MKL-Zen.html 92 | # Build. 93 | RUN echo 'int mkl_serv_intel_cpu_true() {return 1;}' > /tmp/fakeintel.c && \ 94 | mkdir -p /opt/fakeintel && \ 95 | gcc -shared -fPIC -o ${FAKEINTEL_PATH} /tmp/fakeintel.c 96 | # Enable. 97 | ENV LD_PRELOAD=${FAKEINTEL_PATH}:${LD_PRELOAD} 98 | 99 | ######################################################################## 100 | # Here you can install other software 101 | # You can build and install pip packages 102 | # Just make sure to prefix your pip commands with `mamba run -n ${PROJECT_NAME} pip install ...` 103 | # to have to package installed in the same location as the conda env of the project. 104 | 105 | ######################################################################## 106 | # Download Z-Shell enhancements. 107 | 108 | FROM ${GIT_IMAGE} AS get-pure 109 | 110 | ARG PURE_URL=https://github.com/sindresorhus/pure.git 111 | ARG ZSHA_URL=https://github.com/zsh-users/zsh-autosuggestions.git 112 | ARG ZSHS_URL=https://github.com/zsh-users/zsh-syntax-highlighting.git 113 | 114 | RUN git clone --depth 1 ${PURE_URL} /opt/zsh/pure 115 | RUN git clone --depth 1 ${ZSHA_URL} /opt/zsh/zsh-autosuggestions 116 | RUN git clone --depth 1 ${ZSHS_URL} /opt/zsh/zsh-syntax-highlighting 117 | 118 | ######################################################################## 119 | # This stage is the final user-agnostic (generic) stage. 
# This layer can be distributed so that subsequent users can build their user-specific images on top of it.
160 | # Don't overwrite the entrypoint, it is installing the project 161 | # and testing that you correctly mounted the project code. 162 | # It also performs some other important setup depending on the deployment platform. 163 | ENV ENTRYPOINTS_ROOT=/opt/template-entrypoints 164 | COPY --link entrypoints ${ENTRYPOINTS_ROOT} 165 | ENTRYPOINT ["/opt/template-entrypoints/pre-entrypoint.sh"] 166 | CMD ["/bin/zsh"] 167 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/from-scratch-template/compose-base.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | build-args: 3 | build: 4 | args: 5 | BASE_IMAGE: ubuntu:22.04 # Ubuntu: https://hub.docker.com/_/ubuntu 6 | CURL_IMAGE: curlimages/curl:8.2.1 # https://hub.docker.com/r/curlimages/curl/tags 7 | GIT_IMAGE: docker.io/alpine/git:2.40.1 # https://hub.docker.com/r/alpine/git/tags 8 | CONDA_URL: https://github.com/conda-forge/miniforge/releases/download/23.3.1-1/Miniforge3-23.3.1-1-Linux-x86_64.sh 9 | # Change the link when changing the platform or updating to a new version. 10 | # https://conda-forge.org/miniforge/ 11 | CONDA_INSTALL_PATH: /opt/conda 12 | # Should be the same between stages not to brake linking. 
13 | # https://towardsdatascience.com/conda-essential-concepts-and-tricks-e478ed53b5b#bb7b 14 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/from-scratch-template/environment.yml: -------------------------------------------------------------------------------- 1 | name: template-project-name 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.10 6 | - pip 7 | - pip: 8 | - hatchling 9 | - editables 10 | - hydra-core 11 | - tqdm 12 | - wandb 13 | - pre-commit 14 | - black 15 | prefix: /opt/conda/envs/template-project-name 16 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/from-scratch-template/update-env-file.sh: -------------------------------------------------------------------------------- 1 | # Records the current environment to a file. 2 | # Packages installed from GitHub with pip install will not be recorded 3 | # properly (i.e. the link can be omitted and just replaced with the version). 4 | # In that case, you have to update this file to add commands that 5 | # will fix the environment file. (you could also just edit it manually afterwards). 6 | 7 | ENV_FILE="${PROJECT_ROOT_AT}"/installation/docker-amd64-cuda/environment-freeze.yml 8 | # Export, but delete the package itself as it's installed at runtime. 9 | # This is because it is only available after mounting the code. 10 | mamba env export --no-builds | sed "/${PROJECT_NAME}==.*/d" >"$ENV_FILE" 11 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/requirements.txt: -------------------------------------------------------------------------------- 1 | hatchling # To build the package without isolation in edit mode for faster startup. 2 | editables # Same as above. 
3 | hydra-core 4 | tqdm 5 | wandb 6 | pre-commit 7 | black 8 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/template.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | set -e 4 | 5 | ENV_TEXT=$( 6 | cat <<-EOF 7 | # All user-specific configurations are here. 8 | 9 | ## For building: 10 | # Which docker and compose binary to use 11 | # docker and docker compose in general or podman and podman-compose for CSCS Clariden 12 | DOCKER=docker 13 | COMPOSE="docker compose" 14 | # Use the same USRID and GRPID as on the storage you will be mounting. 15 | # USR is used in the image name and must be lowercase. 16 | # It's fine if your username is not lowercase, jut make it lowercase. 17 | USR=$(id -un | tr "[:upper:]" "[:lower:]") 18 | USRID=$(id -u) 19 | GRPID=$(id -g) 20 | GRP=$(id -gn) 21 | # PASSWD is not secret, 22 | # it is only there to avoid running password-less sudo commands accidentally. 23 | PASSWD=$(id -un) 24 | # LAB_NAME will be the first component in the image path. 25 | # It must be lowercase. 26 | LAB_NAME=$(id -gn | tr "[:upper:]" "[:lower:]") 27 | 28 | #### For running locally 29 | # You can find the acceleration options in the compose.yaml file 30 | # by looking at the services with names dev-local-ACCELERATION. 31 | PROJECT_ROOT_AT=$(realpath "$(pwd)"/../..) 32 | ACCELERATION=cuda 33 | WANDB_API_KEY= 34 | # PyCharm-related. Fill after installing the IDE manually the first time. 35 | PYCHARM_IDE_AT= 36 | 37 | 38 | #################### 39 | # Project-specific environment variables. 40 | ## Used to avoid writing paths multiple times and creating inconsistencies. 41 | ## You should not need to change anything below this line. 
42 | PROJECT_NAME=template-project-name 43 | PACKAGE_NAME=template_package_name 44 | IMAGE_NAME=\${LAB_NAME}/\${USR}/\${PROJECT_NAME} 45 | IMAGE_PLATFORM=amd64-cuda 46 | # The image name includes the USR to separate the images in an image registry. 47 | # Its tag includes the platform for registries that don't hand multi-platform images for the same tag. 48 | # You can also add a suffix to the platform e.g. -jax or -pytorch if you use different images for different environments/models etc. 49 | 50 | EOF 51 | ) 52 | 53 | ## All variables below are read from the `.env` and `.project.env` files. 54 | ENV_FILE=".env" 55 | 56 | env() { 57 | # Creates the `.env` file. 58 | if [[ -f "${ENV_FILE}" ]]; then 59 | echo "[TEMPLATE ERROR] File ${ENV_FILE} already exists. Aborting." 60 | exit 1 61 | fi 62 | echo "${ENV_TEXT}" >"${ENV_FILE}" 63 | echo "Created the ${ENV_FILE} file. Edit it to set your user-specific variables." 64 | } 65 | 66 | check() { 67 | # Checks if the `.env` file exists. 68 | if [[ ! -f "${ENV_FILE}" ]]; then 69 | echo "[TEMPLATE ERROR] File ${ENV_FILE} does not exist. 70 | Run ./template.sh env to create it, then edit it." 71 | exit 1 72 | fi 73 | source "${ENV_FILE}" 74 | COMPOSE_PROJECT="${PROJECT_NAME}-${IMAGE_PLATFORM}-${USR}" 75 | } 76 | 77 | edit_from_base() { 78 | FROM_BASE="${1}" 79 | if [ "${FROM_BASE}" == "from-python" ] || [ "${FROM_BASE}" == "from-scratch" ]; then 80 | rm -f compose-base.yaml Dockerfile requirements.txt environment.yml update-env-file.sh 81 | cp -r "${FROM_BASE}-template"/* . 82 | else 83 | echo "[TEMPLATE ERROR] Please specify a valid from-base: from-python or from-scratch." 84 | exit 1 85 | fi 86 | } 87 | 88 | pull_generic() { 89 | # Pull the generic runtime and dev images. 90 | check 91 | PULL_IMAGE_NAME="${1}" 92 | if [ "${PULL_IMAGE_NAME}" == "" ]; then 93 | echo "[TEMPLATE ERROR] Please specify the name of the image to pull." 
94 | echo "For example: ./template.sh pull ic-registry.epfl.ch/${LAB_NAME}/gaspar/${PROJECT_NAME}" 95 | echo "For example: ./template.sh pull docker.io/docker-username/${PROJECT_NAME}" 96 | exit 1 97 | fi 98 | 99 | $DOCKER pull "${PULL_IMAGE_NAME}:${IMAGE_PLATFORM}-root-latest" 100 | $DOCKER tag "${PULL_IMAGE_NAME}:${IMAGE_PLATFORM}-root-latest" "${IMAGE_NAME}:${IMAGE_PLATFORM}-root-latest" 101 | } 102 | 103 | build_generic() { 104 | # Check that the files in the installation/ directory are all committed to git if running the build command. 105 | # The image uses the git commit as a tag to know which dependencies where installed. 106 | # Error if there are uncommitted changes. 107 | case "$1" in 108 | --ignore-uncommitted) 109 | IGNORE_UNCOMMITTED=1 110 | shift 111 | ;; 112 | esac 113 | 114 | if [[ ${IGNORE_UNCOMMITTED} -ne 1 ]] && \ 115 | [[ $(git status --porcelain | grep "installation/" | grep -v -E "README" -c) -ge 1 ]]; then 116 | echo "[TEMPLATE ERROR] There are uncommitted changes in the installation/ directory. 117 | Please commit them before building your generic and user image. 118 | The image uses the git commit as a tag to keep track of which dependencies where installed. 119 | If these change don't affect the build (e.g. README), 120 | feel free to just commit and ignore the rebuild." 121 | echo "Force ignoring this error with the flag ./template.sh build --ignore-uncommitted." 122 | exit 1 123 | fi 124 | 125 | # Build the generic runtime and dev images and tag them with the current git commit. 126 | check 127 | $COMPOSE -p "${COMPOSE_PROJECT}" build image-root 128 | 129 | # Tag the images with the current git commit. 130 | GIT_COMMIT=$(git rev-parse --short HEAD) 131 | $DOCKER tag "${IMAGE_NAME}:${IMAGE_PLATFORM}-root-latest" "${IMAGE_NAME}:${IMAGE_PLATFORM}-root-${GIT_COMMIT}" 132 | } 133 | 134 | build_user() { 135 | # Check that the files in the installation/ directory are all committed to git if running the build command. 
# The image uses the git commit as a tag to know which dependencies were installed.
176 | enroot import -x mount podman://${IMAGE_NAME}:${IMAGE_PLATFORM}-root-latest || true 177 | GIT_COMMIT=$(git rev-parse --short HEAD) 178 | if [[ $($DOCKER images --format '{{.Repository}}:{{.Tag}}' |\ 179 | grep "root-${GIT_COMMIT}" -c) -ge 1 ]]; then 180 | enroot import -x mount podman://${IMAGE_NAME}:${IMAGE_PLATFORM}-root-${GIT_COMMIT} || true 181 | fi 182 | } 183 | 184 | push_usr_or_root() { 185 | check 186 | USR_OR_ROOT="${1}" 187 | PUSH_IMAGE_NAME="${2}" 188 | if [ "${PUSH_IMAGE_NAME}" == "" ]; then 189 | echo "[TEMPLATE ERROR] Please specify the complete name of the image to push." 190 | echo "For example: ./template.sh push docker.io/docker-username/template-project-name" 191 | echo "EPFL people can just do ./template.sh push IC or ./template.sh push RCP 192 | And it will be pushed to ic-registry.epfl.ch/${IMAGE_NAME} 193 | or registry.rcp.epfl.ch/${IMAGE_NAME}" 194 | exit 1 195 | elif [ "${PUSH_IMAGE_NAME}" == "IC" ]; then 196 | PUSH_IMAGE_NAME="ic-registry.epfl.ch/${IMAGE_NAME}" 197 | elif [ "${PUSH_IMAGE_NAME}" == "RCP" ]; then 198 | PUSH_IMAGE_NAME="registry.rcp.epfl.ch/${IMAGE_NAME}" 199 | fi 200 | 201 | $DOCKER tag "${IMAGE_NAME}:${IMAGE_PLATFORM}-${USR_OR_ROOT}-latest" \ 202 | "${PUSH_IMAGE_NAME}:${IMAGE_PLATFORM}-${USR_OR_ROOT}-latest" 203 | $DOCKER push "${PUSH_IMAGE_NAME}:${IMAGE_PLATFORM}-${USR_OR_ROOT}-latest" 204 | 205 | # If the image has a git tag push it as well. 
206 | GIT_COMMIT=$(git rev-parse --short HEAD) 207 | if [[ $($DOCKER images --format '{{.Repository}}:{{.Tag}}' |\ 208 | grep "${USR_OR_ROOT}-${GIT_COMMIT}" -c) -ge 1 ]]; then 209 | $DOCKER tag "${IMAGE_NAME}:${IMAGE_PLATFORM}-${USR_OR_ROOT}-${GIT_COMMIT}" \ 210 | "${PUSH_IMAGE_NAME}:${IMAGE_PLATFORM}-${USR_OR_ROOT}-${GIT_COMMIT}" 211 | $DOCKER push "${PUSH_IMAGE_NAME}:${IMAGE_PLATFORM}-${USR_OR_ROOT}-${GIT_COMMIT}" 212 | fi 213 | } 214 | 215 | push_generic() { 216 | check 217 | push_usr_or_root "root" "${1}" 218 | } 219 | 220 | push_user() { 221 | check 222 | push_usr_or_root "${USR}" "${1}" 223 | } 224 | 225 | push() { 226 | push_generic "${1}" 227 | push_user "${1}" 228 | } 229 | 230 | list_env() { 231 | # List the conda environment. 232 | check 233 | echo "[TEMPLATE INFO] Listing the dependencies in an empty container (nothing mounted)." 234 | echo "[TEMPLATE INFO] It's normal to see the warnings about missing PROJECT_ROOT_AT or acceleration options." 235 | echo "[TEMPLATE INFO] The idea is to see if all your dependencies have been installed." 236 | $DOCKER run --rm "${IMAGE_NAME}:${IMAGE_PLATFORM}-root-latest" zsh -c \ 237 | "echo '[TEMPLATE INFO] Running mamba list';\ 238 | if command -v mamba >/dev/null 2>&1; then mamba list -n ${PROJECT_NAME}; \ 239 | else echo '[TEMPLATE INFO] conda not in the environment, skipping...'; fi; 240 | echo '[TEMPLATE INFO] Running pip list'; pip list" 241 | } 242 | 243 | empty_interactive() { 244 | # Start an interactive shell in an empty container. 245 | check 246 | echo "[TEMPLATE INFO] Starting an interactive shell in an empty container (nothing mounted)." 247 | echo "[TEMPLATE INFO] It's normal to see the warnings about missing PROJECT_ROOT_AT or acceleration options." 248 | echo "[TEMPLATE INFO] The idea is to see if all your dependencies have been installed." 249 | $DOCKER run --rm -it "${IMAGE_NAME}:${IMAGE_PLATFORM}-root-latest" 250 | } 251 | 252 | run() { 253 | # Run a command in a new runtime container. 
254 | # Usage: 255 | # ./template.sh run -e VAR1=VAL1 -e VAR2=VAL2 ... python -c "print('hello world')" 256 | check 257 | local env_vars=() 258 | local detach=() 259 | 260 | # Catch detach flag 261 | if [[ "$1" == "-d" ]]; then 262 | shift 263 | detach+=("-d") 264 | fi 265 | 266 | # Collect environment variables and commands dynamically 267 | while [[ "$1" == "-e" ]]; do 268 | env_vars+=("$1" "$2") # Store environment variable flags and values as array elements 269 | shift 2 270 | done 271 | 272 | # Execute the docker command using array expansion for environment variables 273 | $COMPOSE -p "${COMPOSE_PROJECT}" run --rm "${detach[@]}" "${env_vars[@]}" "run-local-${ACCELERATION}" "$@" 274 | } 275 | 276 | dev() { 277 | # Run a command in a new development container. 278 | # Usage: 279 | # ./template.sh dev -e VAR1=VAL1 -e VAR2=VAL2 -e SSH_SERVER=1 ... sleep infinity" 280 | check 281 | 282 | # Create the placeholder directories for remote development. 283 | touch ${HOME}/.template-gitconfig 284 | mkdir -p ${HOME}/.template-dev-vscode-server 285 | mkdir -p ${HOME}/.template-dev-jetbrains-server 286 | 287 | local env_vars=() 288 | local detach=() 289 | 290 | # Catch detach flag 291 | if [[ "$1" == "-d" ]]; then 292 | shift 293 | detach+=("-d") 294 | fi 295 | 296 | # Collect environment variables and commands dynamically 297 | while [[ "$1" == "-e" ]]; do 298 | env_vars+=("$1" "$2") # Store environment variable flags and values as array elements 299 | shift 2 300 | done 301 | 302 | # Execute the docker command using array expansion for environment variables 303 | $COMPOSE -p "${COMPOSE_PROJECT}" run --rm "${detach[@]}" "${env_vars[@]}" "dev-local-${ACCELERATION}" "$@" 304 | } 305 | 306 | get_runai_scripts() { 307 | # Rename the runai examples. 
308 | # ./template.sh get_runai_scripts 309 | check 310 | cp -r "./EPFL-runai-setup/example-submit-scripts/" "./EPFL-runai-setup/submit-scripts" 311 | for file in $(find "./EPFL-runai-setup/submit-scripts" -type f); do 312 | sed -i.deleteme "s/moalla/${USR}/g" "$file" && rm "${file}.deleteme" 313 | sed -i.deleteme "s/claire/${LAB_NAME}/g" "$file" && rm "${file}.deleteme" 314 | done 315 | } 316 | 317 | get_scitas_scripts() { 318 | # Rename the scitas examples. 319 | # ./template.sh get_scitas_scripts 320 | check 321 | cp -r "./EPFL-SCITAS-setup/example-submit-scripts/" "./EPFL-SCITAS-setup/submit-scripts" 322 | for file in $(find "./EPFL-SCITAS-setup/submit-scripts" -type f); do 323 | sed -i.deleteme "s/moalla/${USR}/g" "$file" && rm "${file}.deleteme" 324 | sed -i.deleteme "s/claire/${LAB_NAME}/g" "$file" && rm "${file}.deleteme" 325 | done 326 | } 327 | 328 | usage() { 329 | echo "Usage: $0 {env|pull_generic|build_generic|build_user|build|push_generic|push_user|push|list_env|empty_interactive|run|dev|get_runai_scripts}" 330 | 331 | # Describe each function with its arguments. 332 | echo "env: Create the .env file with the user-specific variables." 333 | echo "pull_generic IMAGE_NAME: Pull the generic runtime and dev images." 334 | echo "build_generic: Build the generic runtime and dev images." 335 | echo "build_user: Build the user runtime and dev images." 336 | echo "build: Build the generic and user runtime and dev images." 337 | echo "import_from_podman: Import the podman image to enroot." 338 | echo "push_generic IMAGE_NAME: Push the generic runtime and dev images." 339 | echo "push_user IMAGE_NAME: Push the user runtime and dev images." 340 | echo "push IMAGE_NAME: Push the generic and user runtime and dev images." 341 | echo "list_env: List the pip/conda environment." 342 | echo "empty_interactive: Start an interactive shell in an empty container." 343 | echo "run -e VAR1=VAL1 -e VAR2=VAL2 ... COMMAND: Run a command in a new runtime container." 
344 | echo "dev -e VAR1=VAL1 -e VAR2=VAL2 ... COMMAND: Run a command in a new development container." 345 | echo "get_runai_scripts: Rename the runai examples." 346 | echo "get_scitas_scripts: Rename the scitas examples." 347 | } 348 | 349 | if [ $# -eq 0 ]; then 350 | usage 351 | else 352 | # run the command 353 | case "$1" in 354 | -h|--help) 355 | usage 356 | exit 0 357 | ;; 358 | esac 359 | "$@" 360 | fi 361 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/update-env-file.sh: -------------------------------------------------------------------------------- 1 | # Records the current environment to a file. 2 | # Packages installed from GitHub with pip install will not be recorded 3 | # properly (i.e. the link will be omitted and just replaced with the version). 4 | # In that case, you have to update this file to add commands that 5 | # will fix the environment file. (you could also just edit it manually afterwards). 6 | 7 | ENV_FILE="${PROJECT_ROOT_AT}"/installation/docker-amd64-cuda/requirements-freeze.txt 8 | pip list --exclude-editable --format freeze > "${ENV_FILE}" 9 | -------------------------------------------------------------------------------- /installation/edit-platform-and-acceleration.sh: -------------------------------------------------------------------------------- 1 | # Updates the platform and the hardware-acceleration supported by an installation. 2 | 3 | CHANGE_OR_COPY="${1}" 4 | INSTALL_METHOD="${2}" 5 | 6 | CURR_PLATFORM="${3}" 7 | CURR_ACCELERATION="${4}" 8 | 9 | NEW_PLATFORM="${5}" 10 | NEW_ACCELERATION="${6}" 11 | 12 | # Abort if variables not defined and show usage. 
13 | if [ -z "${CHANGE_OR_COPY}" ] || [ -z "${INSTALL_METHOD}" ] || [ -z "${CURR_PLATFORM}" ] || [ -z "${CURR_ACCELERATION}" ] || [ -z "${NEW_PLATFORM}" ] || [ -z "${NEW_ACCELERATION}" ]; then 14 | echo "Usage: installation/edit-platform-and-acceleration.sh CHANGE_OR_COPY CURR_PLATFORM CURR_ACCELERATION NEW_PLATFORM NEW_ACCELERATION" 15 | echo "Example: installation/edit-platform-and-acceleration.sh change docker amd64 cuda arm64 cuda" 16 | echo "Example: installation/edit-platform-and-acceleration.sh copy docker amd64 cuda arm64 cuda" 17 | echo "Example: installation/edit-platform-and-acceleration.sh change docker amd64 cuda amd64 rocm" 18 | echo "Example: installation/edit-platform-and-acceleration.sh change conda osx-arm64 mps linux-64 cuda" 19 | exit 1 20 | fi 21 | 22 | # Abort if the current installation does not exist. 23 | if [ ! -d installation/"${INSTALL_METHOD}-${CURR_PLATFORM}-${CURR_ACCELERATION}" ]; then 24 | echo installation/"${INSTALL_METHOD}-${CURR_PLATFORM}-${CURR_ACCELERATION} does not exist." 25 | exit 1 26 | fi 27 | 28 | # Abort if the new installation already exists. 29 | if [ -d installation/"${INSTALL_METHOD}-${NEW_PLATFORM}-${NEW_ACCELERATION}" ]; then 30 | echo installation/"${INSTALL_METHOD}-${NEW_PLATFORM}-${NEW_ACCELERATION} already exists." 31 | exit 1 32 | fi 33 | 34 | # Rename the current to the new one. 35 | 36 | if [ "${CHANGE_OR_COPY}" = "change" ]; then 37 | mv installation/"${INSTALL_METHOD}-${CURR_PLATFORM}-${CURR_ACCELERATION}" installation/"${INSTALL_METHOD}-${NEW_PLATFORM}-${NEW_ACCELERATION}" 38 | elif [ "${CHANGE_OR_COPY}" = "copy" ]; then 39 | cp -r installation/"${INSTALL_METHOD}-${CURR_PLATFORM}-${CURR_ACCELERATION}" installation/"${INSTALL_METHOD}-${NEW_PLATFORM}-${NEW_ACCELERATION}" 40 | else 41 | echo "CHANGE_OR_COPY must be either change or copy." 42 | exit 1 43 | fi 44 | 45 | # Rename the installation combination in all the files. 
46 | for file in $(find "installation/${INSTALL_METHOD}-${NEW_PLATFORM}-${NEW_ACCELERATION}" -type f); do 47 | sed -i.deleteme "s/${CURR_PLATFORM}-${CURR_ACCELERATION}/${NEW_PLATFORM}-${NEW_ACCELERATION}/g" "${file}" 48 | sed -i.deleteme "s/${CURR_PLATFORM}/${NEW_PLATFORM}/g" "${file}" 49 | rm "${file}.deleteme" 50 | done 51 | 52 | # Rename the default platform for the docker installation. 53 | if [ "${INSTALL_METHOD}" = "docker" ]; then 54 | if [ "${NEW_ACCELERATION}" != "cuda" ]; then 55 | echo "You have to edit the compose.yaml manually to add services that can leverage 56 | the ${NEW_ACCELERATION} acceleration for the local deployment option with Docker Compose. 57 | Refer to the dev-local-cuda service as an example for using NVIDIA GPUs." 58 | fi 59 | fi 60 | -------------------------------------------------------------------------------- /outputs/README.md: -------------------------------------------------------------------------------- 1 | # Instructions for the outputs (models weights, logs, etc.) 2 | 3 | ## [TEMPLATE] Where and how to set up the outputs 4 | 5 | > [!IMPORTANT] 6 | > **TEMPLATE TODO:** 7 | > Update the instructions below to explain how to obtain the outputs and delete this section. 8 | 9 | The template provides the `PROJECT_ROOT/outputs/` directory as a placeholder for the outputs generated in the project 10 | (model weights, logs, etc.). 11 | This allows the experiment code to always refer to the same path for the outputs independently of the deployment method 12 | for better reproducibility between deployment options. 13 | The directory can be accessed in the experiments with `config.outputs_dir`. 14 | The output directories in `PROJECT_ROOT/outputs/` don't need to be physically in the same directory 15 | as the project, you can create symlinks to them. 16 | The default setup config `src/template_package_name/configs/setup.yaml` defines an outputs subdirectory where it will 17 | save the outputs. 
18 | This is by default `PROJECT_ROOT/outputs/dev` (so you can symlink that location to somewhere else). 19 | This design shifts the outputs' path configuration from the code and config which should be identical across runs 20 | to the installation steps where you will create your symlinks. 21 | This is also more convenient than using environment variables to point to individual output locations. 22 | 23 | Below, you can instruct the users on how to link/download the outputs you generated 24 | to directly use them for reproducibility. 25 | Refer to the [data instructions](../data/README.md) for example instructions. 26 | 27 | ## Description of the outputs 28 | 29 | ## Instructions to obtain the outputs 30 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "template-project-name" 7 | readme = "README.md" 8 | license = { file = "LICENSE" } 9 | version = "0.0.1" 10 | requires-python = ">=3.10" 11 | 12 | [tool.hatch.build.targets.wheel] 13 | packages = ["src/template_package_name"] 14 | 15 | [tool.isort] 16 | known_third_party = ["wandb"] 17 | -------------------------------------------------------------------------------- /reproducibility-scripts/README.md: -------------------------------------------------------------------------------- 1 | > [!NOTE] 2 | > **TEMPLATE TODO** 3 | > This directory should contain the commands run to reproduce the results in the paper. 4 | > E.g. the commands to train, evaluate, and produce the plots in the paper. 5 | > This can also include hyperparameter search commands like wandb sweeps. 6 | > Ideally when you run unattended jobs, your jobs should run scripts in this directory. 
7 | -------------------------------------------------------------------------------- /reproducibility-scripts/template-experiment.sh: -------------------------------------------------------------------------------- 1 | python -m template_package_name.template_experiment 2 | -------------------------------------------------------------------------------- /reproducibility-scripts/template-sweep.yaml: -------------------------------------------------------------------------------- 1 | # Run `wandb sweep reproducibility-scripts/template-sweep.yaml` to generate a sweep. 2 | # Run `wandb agent template-sweep-id` to run the sweep. 3 | 4 | project: template-project-name 5 | name: template-sweep 6 | method: grid 7 | metric: 8 | goal: maximize 9 | name: some_metric 10 | parameters: 11 | wandb.mode: 12 | value: online 13 | wandb.use_global_dir: 14 | value: True 15 | job_subdir: 16 | value: some-special-experiment 17 | seed: 18 | value: 1 19 | resuming.resume: 20 | value: True 21 | resuming.use_commit: 22 | value: True 23 | n: 24 | values: [1, 2, 3] 25 | 26 | command: 27 | - python 28 | - "-m" 29 | - "template_package_name.template_experiment" 30 | - ${args_no_hyphens} 31 | -------------------------------------------------------------------------------- /src/template_package_name/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLAIRE-Labo/python-ml-research-template/280ac60bbef5c740fe21aa6fd388ea2a0093f8d7/src/template_package_name/__init__.py -------------------------------------------------------------------------------- /src/template_package_name/configs/override/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | !README.md 4 | !template-experiment.yaml 5 | -------------------------------------------------------------------------------- /src/template_package_name/configs/override/README.md: 
-------------------------------------------------------------------------------- 1 | The configs in this directory should not be tracked by git. 2 | They are meant to be used as personalized overrides for the configs in the `configs` directory. 3 | You can use them temporarily for development (e.g., disable wandb, reduce the number of epochs, etc.) 4 | or to specify configurations specific to your machine (e.g., the number of GPUs to use). 5 | E.g., you could have an `override/setup.yaml` doing something like: 6 | 7 | ```yaml 8 | wandb: 9 | mode: disabled 10 | 11 | optim: 12 | num_epochs: 1 13 | ``` 14 | 15 | As done for `.../override/template-experiment.yaml` in `configs/template-experiment.yaml`, 16 | put the override config as the last one to be read by the experiment config (the last one in its defaults). 17 | It will override any variable set there. 18 | Remember to remove everything that's not hardware dependent for your reproducible runs, 19 | or even better maintain two different copies of the repo: 20 | one for development and one for unattended runs to avoid edits 21 | while you develop to be picked up by your unattended runs. 22 | -------------------------------------------------------------------------------- /src/template_package_name/configs/override/template-experiment.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | # The above line should appear in the override configs so that they sit at the root of the config tree. 3 | 4 | is_this_key_overridden: yes 5 | -------------------------------------------------------------------------------- /src/template_package_name/configs/setup.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # This config defines a common setup for experiments. 4 | # Add it to the defaults of any experiment config. 5 | 6 | # A random seed is generated for each run if you don't specify one.
7 | # This is good to avoid overfitting your dev runs to specific seeds. 8 | seed: ${generate_random_seed:} 9 | # Cuda deterministic settings. 10 | cuda_deterministic: false 11 | cuda_strong_deterministic: false 12 | 13 | # The root data and output directories to be used in experiments. 14 | data_dir: ${hydra:runtime.cwd}/data 15 | outputs_dir: ${hydra:runtime.cwd}/outputs 16 | 17 | # Outputs of an experiment are stored in a directory created by Hydra of the form 18 | # outputs/${outputs_subdir}/${hydra.job.name}/${job_subdir}/${now:%Y-%m-%d_%H-%M-%S-%f} 19 | # This has been designed to allow flexibility and can be configured in the sections below. 20 | 21 | # Outputs are saved in outputs/${outputs_subdir}/... 22 | # so you can store them in different physical locations if you want 23 | # by symlink-ing outputs_subdir to a different location. 24 | # outputs_subdir can also be useful to separate "dev" and "run" outputs. 25 | outputs_subdir: dev 26 | # Outputs of experiments generated by a script called job_name 27 | # will be saved in outputs/${outputs_subdir}/job_name/${job_subdir}/... 28 | # This can further be useful to tag experiments run from the same script. 29 | job_subdir: dev 30 | 31 | hydra: 32 | run: 33 | # Finally, this is where the outputs of an individual run will be stored. 34 | dir: outputs/${outputs_subdir}/${hydra.job.name}/${job_subdir}/runs/${now:%Y-%m-%d--%H-%M-%S-%f} 35 | job: 36 | chdir: true 37 | verbose: false # Set to true for logging at debug level. 38 | 39 | wandb: 40 | project: template-project-name 41 | mode: online 42 | anonymous: allow 43 | tags: 44 | - development 45 | run_id: null 46 | run_name: null 47 | # Make outputs/wandb a symlink if you want to save the wandb summaries elsewhere. 48 | # Same as with outputs_subdir. 49 | global_dir: ${hydra:runtime.cwd}/outputs 50 | use_global_dir: false # Otherwise, use the cwd of the experiment. 
51 | 52 | run_dir: ${hydra:run.dir} 53 | resuming_dir: null 54 | 55 | resuming: 56 | resume: false 57 | use_commit: false 58 | wandb_cache_bust: 0 # Limitation of wandb. Cannot create runs with the same ID if deleted previously. 59 | # Use this to refresh the id of the run and make it a "new" run. 60 | exclude_keys: # Can be a deep key e.g. model.optimizer.lr 61 | - run_dir 62 | - data_dir # To be able to resume by another user. 63 | - outputs_dir # To be able to resume by another user. 64 | - resuming_dir # To be able to force resume from anywhere. 65 | - wandb # To be able to move a run and resume it. 66 | - resuming.exclude_keys # To be able to add keys on the fly and force resume. 67 | -------------------------------------------------------------------------------- /src/template_package_name/configs/template-experiment.yaml: -------------------------------------------------------------------------------- 1 | # An example config file for an experiment. 2 | # Keep this, it's used as an example to run the code after a user installs the project. 3 | 4 | defaults: 5 | # Common setup. 6 | - setup 7 | # This file. 8 | - _self_ 9 | # Optional override (untracked by git, must not impact reproducibility). 10 | - optional override: template-experiment 11 | 12 | ###################################################################### 13 | 14 | some_arg: "some_default_value" 15 | some_number: 10 16 | n: 10 17 | is_this_key_overridden: no 18 | -------------------------------------------------------------------------------- /src/template_package_name/template_experiment.py: -------------------------------------------------------------------------------- 1 | """An example file to run an experiment. 2 | Keep this, it's used as an example to run the code after a user installs the project. 
3 | """ 4 | 5 | import logging 6 | from pathlib import Path 7 | from time import sleep 8 | 9 | import hydra 10 | import wandb 11 | from omegaconf import DictConfig 12 | 13 | from template_package_name import utils 14 | 15 | # Refers to utils for a description of resolvers 16 | utils.config.register_resolvers() 17 | 18 | # Hydra sets up the logger automatically. 19 | # https://hydra.cc/docs/tutorials/basic/running_your_app/logging/ 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | @hydra.main(version_base=None, config_path="configs", config_name="template-experiment") 24 | def main(config: DictConfig) -> None: 25 | # Using the template provides utilities for experiments: 26 | 27 | # 1. Setting up experiment and resuming directories 28 | config = utils.config.setup_config_and_resuming( 29 | config, postprocess_func=lambda x: x 30 | ) 31 | # The current working directory is a new directory unique to this run made by hydra, accessible by config.run_dir. 32 | # A resuming directory uniquely identified by the config (and optionally the git sha) 33 | # for storing checkpoints of the same experiment can be accessed via config.resuming.dir. 34 | # The current directory will be the resuming directory if config.resuming.resume is True else the run directory. 35 | # You can pass a postprocessing function to postprocess the config. 36 | 37 | # 2. Setting up wandb with resuming and the config logged. 38 | utils.config.setup_wandb(config) 39 | # Use a custom step key when you log so that you can resume logging anywhere. 40 | # For example, if the checkpoint is earlier than the last logged step in the crashed run, you can resume 41 | # from steps already logged, and they will be rewritten (with the same value assuming reproducibility). 42 | # E.g., wandb.log({"my_custom_step": i, "loss": loss}) 43 | 44 | # 3. Seeding for reproducibility 45 | utils.seeding.seed_everything(config) 46 | # Update this function whenever you have a library that needs to be seeded. 
47 | 48 | # Example experiment: 49 | checkpoints = sorted( 50 | Path.cwd().glob("checkpoint_*.txt"), key=lambda x: int(x.stem.split("_")[1]) 51 | ) 52 | if checkpoints: 53 | last_file = checkpoints[-1] 54 | logger.info(f"Resuming from {last_file.stem}") 55 | i = int(last_file.stem.split("_")[1]) + 1 56 | # Important: 57 | # When resuming, you should recover the state of the experiment as it was when it was interrupted. 58 | # I.e., the random state, the state of the model, the optimizer, etc. 59 | else: 60 | i = 0 61 | 62 | steps = 0 63 | while i < 30: 64 | # Compute and log i*n. 65 | logs = {"i": i, "y": i * config.n} 66 | print(logs) 67 | wandb.log(logs) 68 | 69 | # Checkpoint every 5 steps. 70 | if i % 5 == 0: 71 | with open(f"checkpoint_{i}.txt", "w") as f: 72 | f.write(f"y={logs['y']}") 73 | logger.info(f"Checkpointing at {i}") 74 | 75 | sleep(1) 76 | i += 1 77 | steps += 1 78 | 79 | # Preempt every 13 steps. 80 | if steps == 13: 81 | raise InterruptedError("Preempt after 13 steps.") 82 | 83 | logger.info("Finished writing files") 84 | 85 | 86 | if __name__ == "__main__": 87 | main() 88 | -------------------------------------------------------------------------------- /src/template_package_name/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from template_package_name.utils import config, seeding 2 | -------------------------------------------------------------------------------- /src/template_package_name/utils/config.py: -------------------------------------------------------------------------------- 1 | # Resolvers can be used in the config files. 2 | # https://omegaconf.readthedocs.io/en/latest/custom_resolvers.html 3 | # They are useful when you want to make the default values of some config variables 4 | # result from direct computation of other config variables. 
5 | # Only put variables meant to be edited by the user (as opposed to read-only variables described below) 6 | # and avoid making them too complicated, the point is not to write code in the config file. 7 | import logging 8 | import os 9 | import subprocess 10 | import sys 11 | from hashlib import blake2b 12 | from pathlib import Path 13 | 14 | import wandb 15 | from omegaconf import DictConfig, OmegaConf, omegaconf 16 | 17 | from template_package_name import utils 18 | 19 | # Hydra sets up the logger automatically. 20 | # https://hydra.cc/docs/tutorials/basic/running_your_app/logging/ 21 | _logger = logging.getLogger(__name__) 22 | 23 | 24 | def register_resolvers(): 25 | if not OmegaConf.has_resolver("eval"): 26 | # Useful to evaluate expressions in the config file. 27 | OmegaConf.register_new_resolver("eval", eval, use_cache=True) 28 | if not OmegaConf.has_resolver("generate_random_seed"): 29 | # Generate a random seed and record it in the config of the experiment. 30 | OmegaConf.register_new_resolver( 31 | "generate_random_seed", utils.seeding.generate_random_seed, use_cache=True 32 | ) 33 | 34 | 35 | def save_or_check_config(config: DictConfig, path: str) -> None: 36 | """ 37 | Save if it doesn't exist; otherwise (in case of resuming) assert that the 38 | config is the same. If they differ, log the differing key(s). 
39 | """ 40 | path_obj = Path(path) 41 | if not path_obj.exists(): 42 | OmegaConf.save(config, path_obj) 43 | return 44 | 45 | # Copy and remove excluded keys (in-place removal) from both new and existing config 46 | new_config = config.copy() 47 | existing_config = OmegaConf.load(path_obj) 48 | 49 | # Convert both configs to Python dictionaries 50 | OmegaConf.resolve(new_config) 51 | OmegaConf.resolve(existing_config) 52 | 53 | remove_excluded_keys(new_config, config.resuming.exclude_keys) 54 | remove_excluded_keys(existing_config, config.resuming.exclude_keys) 55 | 56 | new_config_dict = OmegaConf.to_container(new_config, resolve=True) 57 | existing_config_dict = OmegaConf.to_container(existing_config, resolve=True) 58 | 59 | # Compare dictionaries 60 | differences = dictionary_diff(new_config_dict, existing_config_dict) 61 | if differences: 62 | diff_msg = "\n".join(differences) 63 | _logger.error( 64 | f"Config to resume is different from the one saved in {path}.\n" 65 | f"Differences:\n{diff_msg}" 66 | ) 67 | raise AssertionError( 68 | f"Config differs from the existing config at {path}. See logs for details." 69 | ) 70 | 71 | _logger.info(f"Configs match the one in {path}. Resuming with the same config.") 72 | 73 | 74 | def remove_excluded_keys(config: DictConfig, exclude_keys: list[str]) -> None: 75 | """ 76 | Remove keys from the config that are specified in exclude_keys. 77 | Exclude keys can be specified as dot-paths, e.g., "key1.key2.key3". 78 | """ 79 | with omegaconf.open_dict(config): 80 | for key in exclude_keys: 81 | try: 82 | path_segments = key.split(".") 83 | node = config 84 | for segment in path_segments[:-1]: 85 | node = node[segment] # drill down 86 | del node[path_segments[-1]] # remove the final key 87 | except KeyError: 88 | pass 89 | 90 | 91 | def dictionary_diff(d1: dict, d2: dict, path: str = "") -> list[str]: 92 | """ 93 | Recursively compare two dictionary (or scalar) structures and return a list 94 | of human-readable differences. 
`path` is carried along to show the nested key path. 95 | """ 96 | differences = [] 97 | 98 | # If both are dict-like, compare keys and recurse 99 | if isinstance(d1, dict) and isinstance(d2, dict): 100 | all_keys = set(d1.keys()).union(d2.keys()) 101 | for key in all_keys: 102 | new_path = f"{path}.{key}" if path else key 103 | if key not in d1: 104 | differences.append(f"Missing in new config: {new_path}") 105 | elif key not in d2: 106 | differences.append(f"Missing in existing config: {new_path}") 107 | else: 108 | # Recurse 109 | differences.extend(dictionary_diff(d1[key], d2[key], new_path)) 110 | else: 111 | # If they are not both dicts, compare values directly 112 | if d1 != d2: 113 | differences.append( 114 | f"Value mismatch at '{path}': new='{d1}' vs existing='{d2}'" 115 | ) 116 | 117 | return differences 118 | 119 | 120 | def setup_resuming_dir(config): 121 | """Create a unique identifier of the experiment used to specify a resuming/checkpoint directory. 122 | The identifier is a hash of the config, excluding keys specified in config.resuming.exclude_keys. 123 | If config.resuming.use_commit is True, the commit hash is appended to the identifier. 124 | I.e. 
the checkpoint directory is defined by: the config - the excluded config keys + the commit hash (if specified) 125 | """ 126 | if config.resuming_dir is not None: 127 | return Path(config.resuming_dir), Path(config.resuming_dir).name 128 | 129 | resuming_hash = "" 130 | config_to_hash = config.copy() 131 | 132 | # resolve config 133 | OmegaConf.resolve(config_to_hash) 134 | remove_excluded_keys(config_to_hash, config.resuming.exclude_keys) 135 | config_hash = blake2b(str(config_to_hash).encode(), digest_size=8).hexdigest() 136 | resuming_hash += config_hash 137 | if config.resuming.use_commit: 138 | commit_hash = ( 139 | subprocess.check_output(["git", "rev-parse", "HEAD"]) 140 | .strip() 141 | .decode("utf-8") 142 | ) 143 | resuming_hash += f"-{commit_hash[:8]}" 144 | 145 | resuming_dir = Path.cwd().parent.parent / "checkpoints" / resuming_hash 146 | resuming_dir.mkdir(parents=True, exist_ok=True) 147 | with omegaconf.open_dict(config): 148 | config.resuming_dir = str(resuming_dir) 149 | config.resuming_hash = resuming_hash 150 | if config.resuming.resume: 151 | if config.wandb.run_id is None: 152 | config.wandb.run_id = config.resuming_hash 153 | if config.wandb.run_name is None: 154 | config.wandb.run_name = config.resuming_hash 155 | 156 | 157 | def setup_config_and_resuming(config, postprocess_func=None, logger=_logger): 158 | logger.info(f"Init directory: {Path.cwd()}") 159 | utils.config.setup_resuming_dir(config) 160 | logger.info(f"Run can be resumed from the directory: {config.resuming_dir}") 161 | if config.resuming.resume: 162 | os.chdir(config.resuming_dir) 163 | logger.info(f"Resuming from the directory: {Path.cwd()}") 164 | 165 | Path(f"config").mkdir(exist_ok=True, parents=True) 166 | utils.config.save_or_check_config( 167 | config, 168 | f"config/config-raw.yaml", 169 | ) 170 | 171 | # Do some optional postprocessing to the config (e.g., checking division of batch size etc.) 
172 | OmegaConf.resolve(config) 173 | if postprocess_func: 174 | config = postprocess_func(config) 175 | 176 | # Save the resolved config. 177 | utils.config.save_or_check_config(config, f"config/config-postprocessed.yaml") 178 | 179 | return config 180 | 181 | 182 | def setup_wandb(config, logger=_logger): 183 | wandb.init( 184 | id=config.wandb.run_id, 185 | name=config.wandb.run_name, 186 | resume="allow" if config.resuming.resume else "never", 187 | config=OmegaConf.to_container(config), 188 | project=config.wandb.project, 189 | tags=config.wandb.tags, 190 | mode=config.wandb.mode, 191 | anonymous=config.wandb.anonymous, 192 | dir=Path.cwd() if not config.wandb.use_global_dir else config.wandb.global_dir, 193 | ) 194 | 195 | # Re-log to capture log with wandb. 196 | logger.info(f"Running command: {subprocess.list2cmdline(sys.argv)}") 197 | logger.info(f"Init directory: {config.run_dir}") 198 | logger.info(f"Run can be resumed from the directory: {config.resuming_dir}") 199 | logger.info(f"Working directory: {Path.cwd()}") 200 | logger.info(f"Running with config: \n{OmegaConf.to_yaml(config)}") 201 | if config.resuming.resume: 202 | logger.info(f"Resuming from the directory: {Path.cwd()}") 203 | -------------------------------------------------------------------------------- /src/template_package_name/utils/seeding.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | # import numpy as np 5 | # import torch 6 | 7 | 8 | def generate_random_seed(): 9 | """Generate a random seed.""" 10 | return random.randint(0, 2**32 - 1) 11 | 12 | 13 | # Update this function whenever you have a library that needs to be seeded. 
14 | def seed_everything(config): 15 | """Seed all random generators.""" 16 | random.seed(config.seed) 17 | 18 | ## For numpy: 19 | # This is for legacy numpy: 20 | # np.random.seed(config.seed) 21 | # New code should make a Generator out of the config.seed directly: 22 | # https://numpy.org/doc/stable/reference/random/generated/numpy.random.seed.html 23 | 24 | ## For PyTorch: 25 | # torch.manual_seed(config.seed) 26 | 27 | # if config.cuda_deterministic: 28 | # # Higher (e.g., on CUDA too) reproducibility with deterministic algorithms: 29 | # # https://pytorch.org/docs/stable/notes/randomness.html 30 | # 31 | # # Not supported for all operations though: 32 | # # https://pytorch.org/docs/stable/generated/torch.use_deterministic_algorithms.html 33 | # if config.cuda_strong_deterministic: 34 | # torch.use_deterministic_algorithms(True) 35 | # 36 | # # A lighter version of the above otherwise as not all algorithms have a deterministic implementation 37 | # torch.backends.cudnn.deterministic = True 38 | # 39 | # # torch.backends.cudnn.benchmark = False 40 | # os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" 41 | -------------------------------------------------------------------------------- /template/README.md: -------------------------------------------------------------------------------- 1 | # Additional Details about the Template 2 | 3 | ## Reproducibility 4 | 5 | This template ensures the reproducibility of your results through 3 artifacts: 6 | 7 | 1. The development environment 8 | - Recorded in the Docker images that you upload and described in Dockerfile and environment files 9 | that you keep up to date with the Docker installation. 10 | - (Less reliably) described in the environment file that you keep up to date for the conda installation. 11 | 2. The project code. 12 | - Recorded in the git repository that you keep up to date. 
13 | - Made reproducible (to a desired degree) by you correctly seeding the random number generators and 14 | optionally removing non-deterministic operations, or replicable by running enough seeds. 15 | 3. The data, outputs, model weights and other artifacts. 16 | - Recorded and uploaded by you. 17 | - (Virtually) placed in the placeholder directories abstracting away the user storage system. 18 | 19 | ## Checkpointing 20 | 21 | The template provides an automatic setup of the checkpointing directory for an experiment. 22 | The unique identifier for the directory is created by hashing the config used and optionally the git commit sha. 23 | Running the same experiment with the same config will thus set its working directory to the same checkpoint directory 24 | every time (if the resuming option is enabled). 25 | 26 | To use this feature, pass `resuming.resume=True` and `resuming.use_commit=True` to your script using a Hydra config 27 | that inherits from the `setup.yaml` config file, like the `template_experiment.py` script. 28 | 29 | Even without using `resuming.use_commit=True`, the path to the checkpoint directory will be computed, and you could, 30 | for example, read from it. 31 | 32 | You can also force a resuming directory by passing `resuming_dir=` to your script. 33 | 34 | ### Compatibility with Weights & Biases 35 | 36 | For a non-sweep run, the run will have the id of the checkpoint directory as its wandb id, therefore your wandb run 37 | will stay the same and resume when your run is resumed. 38 | Make sure to use a custom step key when you log metrics so that you can have full control over when to start rewriting 39 | when you resume (E.g. if you checkpoint less often than you log, you may relog from the last checkpoint), otherwise 40 | the default step key of wandb will resume from the latest step and may be inconsistent with the checkpoint.
41 | 42 | For a sweep run, it already has an id from the sweep, so to resume it you should manually get its id and restart 43 | the script with the same arguments the sweep agent started it with; this way the config and the 44 | checkpoint directory will be the same 45 | (i.e. go to the wandb run UI, copy-paste the command it was run with and add `wandb.run_id=`). 46 | This is a limitation of the wandb sweep system. 47 | See [this issue.](https://github.com/wandb/wandb/issues/9143) 48 | 49 | ## Template Q&A 50 | 51 | ### I started my project from an older version of the template, how do I get updates? 52 | 53 | A project started from a template is different from a fork in that it is not (necessarily) meant to be updated. 54 | The template is free to change and evolve, and it is not guaranteed that it will be compatible with your project. 55 | 56 | Nevertheless, many changes are likely to be compatible with your project. 57 | In that case, there are two ways to incorporate them: 58 | 59 | 1. Manually copy the changes from the template to your project, adapting them when needed (different variable names especially). 60 | 2. Use git to merge the changes from the template to your project. 61 | ```bash 62 | git remote add template https://github.com/CLAIRE-Labo/python-ml-research-template.git 63 | git fetch template 64 | # Cherry pick the commits you want to merge, making sure they are compatible. 65 | # Add the option -n if you want to have the changes staged but not committed so you can edit them. 66 | git cherry-pick -x <commit-sha> 67 | ``` 68 | 69 | ### Why Docker? Why not just Conda? (At least for container-compatible hardware acceleration methods.) 70 | 71 | Conda environments are not so self-contained, some packages can just work on your machine because 72 | they use some of your system libraries not recorded in the conda environment.
73 | An exhaustive and precise list of the system libraries outside conda is hard to record, 74 | and the environment will not run on another machine missing those libraries. 75 | Reinforcement learning (RL) environments usually require system libraries 76 | not recorded by conda, and RL is a big part of our work. 77 | 78 | Moreover, the environment is specified as an `environment.yml` file description, 79 | and does not contain the dependencies themselves. 80 | Some dependencies may actually become unavailable when a user tries to download them later. 81 | 82 | Docker images are self-contained and do contain the dependencies themselves. 83 | 84 | ### Why is the template so complex, e.g., why does it include so many files? 85 | 86 | Part of the reason for the many files and extra code is to be able to provide a generic template 87 | that can be configured, extended, or shortened depending on the needs of the project. 88 | 89 | The other part of the apparent complexity probably comes from unfamiliarity with the tools and practices 90 | used in the template. 91 | These practices, however, (although usually not all combined in a research project, whence this template) 92 | are well established and have been proven to be very useful. 93 | 94 | For example, the `Dockerfile` seems complex because it leverages multi-staging to be very 95 | time and cache-efficient. 96 | Different build stages can run in parallel, so changing your build dependencies, 97 | or installing something in the Dockerfile will cause very few rebuilds. 98 | 99 | Using Docker Compose is also very convenient to define all the build arguments and multiple deployment options 100 | in a single file, avoiding long build and run commands. 101 | 102 | ### Why does the template have so many tools by default (e.g. `hydra`, `wandb`, `black`, etc.)? 103 | 104 | This template is mainly addressed to students and researchers at CLAIRE.
105 | Frequently, students are not aware of the tools and practices that are available to them until they face the problems 106 | we've all faced at some point in our career 107 | (how do I manage my configs? How do I conveniently log my metrics? etc.). 108 | We chose to include these tools by default to help students and researchers avoid these problems from the start, 109 | and to encourage them to use them. 110 | 111 | ### Can I fork a project that used the template and change its name? How do I do that? 112 | 113 | Yes, it seems like filling the `template/template-variables.env` file with your new project name and 114 | running `./template/change-project-name.sh` would work. 115 | 116 | ### Can I use this template for an already existing project? How do I do that? 117 | 118 | The template is mainly designed to start new projects, as it's hard to make assumptions on 119 | the structure of an existing project. 120 | However, it is possible to use it to transfer from an existing project. 121 | 122 | It's likely that your project is a bunch of Python files and a `requirements.txt` or `environment.yml` file. 123 | You can copy those files and potentially refactor the package structures and put all of them under `src`. 124 | You will also have to transfer the dependencies to the `environment.yml` file and identify the system dependencies. 125 | 126 | If your project provides a Docker image, the Docker installation method allows extending it, assuming 127 | it has a well-configured Python environment. 128 | 129 | In the worst case, you can keep the `installation/` directory if that's useful to you and replace all the rest with 130 | your project and adapt the installation as needed. 131 | You could also just get some inspiration from the template and do your own thing.
132 | -------------------------------------------------------------------------------- /template/change-project-name.sh: -------------------------------------------------------------------------------- 1 | # This script allows to replace the template variables with your project ones. 2 | set -eo pipefail 3 | source template/template-variables.env 4 | 5 | # Iterate through all files in the project except dot directories and this directory. 6 | for file in $(find . -type f -not -path './template/*' -not -path '*/\.*' -not -path '*/__*__/*' -not -path './outputs/*'); do 7 | # .deleteme is a trick to make sed work the same way on both Linux and OSX. 8 | # https://stackoverflow.com/questions/5694228/sed-in-place-flag-that-works-both-on-mac-bsd-and-linux 9 | sed -i.deleteme "s/${OLD_PROJECT_NAME}/${NEW_PROJECT_NAME}/g" "${file}" 10 | sed -i.deleteme "s/${OLD_PACKAGE_NAME}/${NEW_PACKAGE_NAME}/g" "$file" 11 | sed -i.deleteme "s/python=${OLD_PYTHON_VERSION}/python=${NEW_PYTHON_VERSION}/g" "$file" 12 | sed -i.deleteme "s/python${OLD_PYTHON_VERSION}/python${NEW_PYTHON_VERSION}/g" "$file" 13 | sed -i.deleteme "s/Python ${OLD_PYTHON_VERSION}/Python ${NEW_PYTHON_VERSION}/g" "$file" 14 | sed -i.deleteme "s/requires-python = \">=${OLD_PYTHON_VERSION}\"/requires-python = \">=${NEW_PYTHON_VERSION}\"/g" "$file" 15 | # Delete the .deleteme file if it exists. 16 | rm -f "$file.deleteme" 17 | done 18 | 19 | if [ "${NEW_PACKAGE_NAME}" != "${OLD_PACKAGE_NAME}" ]; then 20 | mv "src/${OLD_PACKAGE_NAME}" src/"${NEW_PACKAGE_NAME}" 21 | fi 22 | -------------------------------------------------------------------------------- /template/template-variables.env: -------------------------------------------------------------------------------- 1 | # The PROJECT_NAME and PACKAGE_NAME can be the same (replacing hyphens with underscores). 2 | # This is the safest and best option. 3 | # Though, you can make them different if you need. 
4 | 5 | # The variables prefixed with $OLD_ are for the current state of the template 6 | # (correctly filled at the start, can be replaced with the project name later to change it). 7 | # The variables prefixed with $NEW_ are for the state it will be after the script is run. 8 | 9 | # This is the distribution name, used to `pip install PROJECT_NAME`. 10 | # $NEW_PROJECT_NAME will replace $OLD_PROJECT_NAME in all the template. 11 | OLD_PROJECT_NAME="template-project-name" 12 | NEW_PROJECT_NAME="template-project-name" # Hyphen or underscore as word separator. 13 | 14 | # This is the package name used by `import PACKAGE_NAME`. 15 | # $NEW_PACKAGE_NAME will replace $OLD_PACKAGE_NAME in all the template. 16 | OLD_PACKAGE_NAME="template_package_name" 17 | NEW_PACKAGE_NAME="template_package_name" # Underscore as word separator. 18 | 19 | # Will be the Python version used in the environment (if it doesn't have Python already). 20 | # Will be the minimum version of Python supported by the project. 21 | # If you're basing your work on an already existing docker image, use the version in that image. 22 | # $NEW_ will replace "python=$OLD_PYTHON_VERSION", "python$OLD_PYTHON_VERSION", 23 | # "Python $OLD_PYTHON_VERSION", and "requires-python = \">=$OLD_PYTHON_VERSION\"" in all the template. 24 | OLD_PYTHON_VERSION="3.10" 25 | NEW_PYTHON_VERSION="3.10" 26 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | Placeholder. 2 | --------------------------------------------------------------------------------