├── .gitignore ├── .pre-commit-config.yaml ├── CITATION.cff ├── LICENSE ├── LICENSE.ml-template ├── README.md ├── data └── README.md ├── installation ├── conda-osx-arm64-mps │ ├── README.md │ ├── environment.yml │ └── update-env-file.sh ├── docker-amd64-cuda │ ├── .dockerignore │ ├── CSCS-Clariden-setup │ │ ├── .gitignore │ │ ├── README.md │ │ └── example-submit-scripts │ │ │ ├── README.md │ │ │ ├── edf.toml │ │ │ ├── minimal.sh │ │ │ ├── remote-development.sh │ │ │ ├── test-interactive.sh │ │ │ ├── unattended-distributed.sh │ │ │ └── unattended.sh │ ├── Dockerfile │ ├── Dockerfile-user │ ├── EPFL-SCITAS-setup │ │ ├── .gitignore │ │ ├── README.md │ │ └── example-submit-scripts │ │ │ ├── README.md │ │ │ ├── minimal.sh │ │ │ ├── remote-development.sh │ │ │ ├── unattended-distributed.sh │ │ │ └── unattended.sh │ ├── EPFL-runai-setup │ │ ├── .gitignore │ │ ├── README.md │ │ └── example-submit-scripts │ │ │ ├── minimal.sh │ │ │ ├── remote-development.sh │ │ │ └── unattended.sh │ ├── LICENSE.cresset │ ├── README.md │ ├── apt.txt │ ├── compose-base.yaml │ ├── compose.yaml │ ├── entrypoints │ │ ├── entrypoint.sh │ │ ├── logins-setup.sh │ │ ├── pre-entrypoint.sh │ │ └── remote-development-setup.sh │ ├── from-python-template │ │ ├── Dockerfile │ │ ├── compose-base.yaml │ │ ├── requirements.txt │ │ └── update-env-file.sh │ ├── from-scratch-template │ │ ├── Dockerfile │ │ ├── compose-base.yaml │ │ ├── environment.yml │ │ └── update-env-file.sh │ ├── requirements.txt │ ├── template.sh │ └── update-env-file.sh └── edit-platform-and-acceleration.sh ├── outputs └── README.md ├── pyproject.toml ├── reproducibility-scripts ├── README.md ├── template-experiment.sh └── template-sweep.yaml ├── src └── template_package_name │ ├── __init__.py │ ├── configs │ ├── override │ │ ├── .gitignore │ │ ├── README.md │ │ └── template-experiment.yaml │ ├── setup.yaml │ └── template-experiment.yaml │ ├── template_experiment.py │ └── utils │ ├── __init__.py │ ├── config.py │ └── seeding.py ├── 
template ├── README.md ├── change-project-name.sh └── template-variables.env └── tests └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 
92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | !**/configs/env 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | 132 | # JetBrains 133 | .idea/* 134 | !.idea/runConfigurations/ 135 | 136 | # macOS 137 | .DS_Store 138 | 139 | # Project 140 | data/* 141 | !data/README.md 142 | 143 | outputs/* 144 | !outputs/README.md 145 | 146 | wandb 147 | **/*.out 148 | 149 | third-party 150 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # Pre-commit sets git hooks for identifying simple issues when committing code. 
2 | # See https://pre-commit.com for more information 3 | # See https://pre-commit.com/hooks.html for more hooks 4 | 5 | default_language_version: 6 | python: python3.10 7 | 8 | repos: 9 | - repo: https://github.com/pre-commit/pre-commit-hooks 10 | rev: v5.0.0 11 | hooks: 12 | - id: check-added-large-files 13 | - id: check-symlinks 14 | - id: destroyed-symlinks 15 | - id: trailing-whitespace 16 | - id: end-of-file-fixer 17 | - id: mixed-line-ending 18 | - id: check-yaml 19 | - id: check-toml 20 | - id: check-added-large-files 21 | - id: check-merge-conflict 22 | - id: check-shebang-scripts-are-executable 23 | - id: detect-private-key 24 | - id: debug-statements 25 | - id: check-case-conflict 26 | - repo: https://github.com/python/black 27 | rev: 25.1.0 28 | hooks: 29 | - id: black 30 | - id: black-jupyter 31 | - repo: https://github.com/PyCQA/isort 32 | rev: 6.0.1 33 | hooks: 34 | - id: isort 35 | args: [ "--profile", "black", "--filter-files" ] 36 | - repo: https://github.com/codespell-project/codespell 37 | rev: v2.4.1 38 | hooks: 39 | - id: codespell 40 | args: [ "--skip=*.ipynb" ] 41 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | # This CITATION.cff file was generated with cffinit. 2 | # Visit https://bit.ly/cffinit to generate yours today! 3 | 4 | cff-version: 1.2.0 5 | title: Python Machine Learning Research Template 6 | message: >- 7 | If you use this template or borrow some of its code, please 8 | cite it as shown below. 9 | type: software 10 | authors: 11 | - given-names: Skander 12 | family-names: Moalla 13 | email: skander.moalla@epfl.ch 14 | affiliation: EPFL 15 | orcid: 'https://orcid.org/0000-0002-8494-8071' 16 | repository-code: 'https://github.com/CLAIRE-Labo/python-ml-research-template' 17 | abstract: >- 18 | A template for starting Python machine learning research 19 | projects with hardware acceleration. 
It features reproducible 20 | environments on major platforms, a great development experience, 21 | Python project packaging following PyPA guidelines to avoid 22 | hacky imports, experiment management and tracking with Hydra 23 | and Weights & Biases, checkpointing for research experiments 24 | compatible with Weights & Biases, and code quality enforcement 25 | with pre-commit. 26 | keywords: 27 | - python 28 | - machine learning 29 | - reproducibility 30 | - containers 31 | - template 32 | license: MIT 33 | doi: 10.5281/zenodo.15609829 34 | version: 0.1.0 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Skander Moalla 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /LICENSE.ml-template: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Skander Moalla 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | > [!TIP] 2 | > 🌟 If you like this template, please give it a star! 🌟 3 | > 4 | > 📜 If you use this template, borrow some of its code, or refer to it, please cite it as shown on GitHub! 📜 5 | 6 | > [!NOTE] 7 | > **TEMPLATE TODO:** 8 | > Replace the title below with your project title, then delete this note. 
9 | 10 | # Python Machine Learning Research Template 11 | 12 | ## Overview 13 | 14 | > [!NOTE] 15 | > **TEMPLATE TODO:** 16 | > Replace the description below with a description of your project, then delete this note. 17 | 18 | A template for starting Python machine-learning research 19 | projects with hardware acceleration featuring: 20 | 21 | - ✅ Reproducible environments on major platforms with hardware acceleration with a great development experience 22 | covering multiple use cases: 23 | - 💻 local machines, e.g., macOS (+ Apple Silicon/MPS) and Linux/Windows WSL (+ NVIDIA GPU). 24 | - 🌐 Remote Linux servers with GPUs, e.g., VMs on cloud providers and IC and RCP HaaS at EPFL. 25 | - ☁️ Managed clusters supporting OCI containers with GPUs, e.g., the EPFL IC and RCP Run:ai (Kubernetes) clusters 26 | and the SCITAS Slurm clusters. 27 | - 📦 Python project packaging following the 28 | [PyPA packaging guidelines](https://packaging.python.org/en/latest/tutorials/packaging-projects/) to avoid hacky 29 | imports. 30 | - 📊 Experiment management, tracking, and sharing with [Hydra](https://hydra.cc/) 31 | and [Weights & Biases](https://wandb.ai/site). 32 | - 💾 Checkpointing setup for research experiments compatible with Weights & Biases. 33 | - 🧹 Code quality with [pre-commit](https://pre-commit.com) hooks. 34 | 35 | 🤝 The template makes collaboration and open-sourcing straightforward, avoiding setup issues and 36 | [maximizing impact](https://medium.com/paperswithcode/ml-code-completeness-checklist-e9127b168501#a826). 37 | 38 | 🏆 The practices in this template earned its authors 39 | an [Outstanding Paper (Honorable Mention)](https://openreview.net/forum?id=E0qO5dI5aEn) 40 | at the [ML Reproducibility Challenge 2022](https://paperswithcode.com/rc2022). 
41 | 42 | 📌 Projects made with the template would look like 43 | [this toy project](https://github.com/skandermoalla/pytoych-benchmark) 44 | or [this paper](https://github.com/CLAIRE-Labo/no-representation-no-trust) whose curves have been exactly reproduced 45 | (exact same numbers) on multiple different platforms (EPFL Kubernetes cluster, VM on GCP, HPC cluster with Apptainer). 46 | 47 | 📖 Follow this README to get started with the template. 48 | 49 | For a brief discussion of the template's design choices, features, and a Q&A, check the `template/README.md` file. 50 | 51 | ## Getting started with the template 52 | 53 | > [!NOTE] 54 | > **TEMPLATE TODO:** 55 | > Delete this whole section when you're done with the template getting started. 56 | 57 | Click on the `Use this template` GitHub button to create a new GitHub repository from this template. 58 | Give it a lowercase hyphen-separated name (we will refer to this name as `PROJECT_NAME`), 59 | then follow the instructions below to set up your project. 60 | You can also give your GitHub repo another name format if you prefer, but for the template, you will have to pick 61 | a `PROJECT_NAME` as well. 62 | 63 | It's useful to commit after some checkpoints to be able to go back if you make a mistake. 64 | Some instructions will send you to different READMEs in the template that will compile nicely together in the end. 65 | Remember to get back to this root one after finishing each step. 66 | 67 | 1. Clone the repo with destination `PROJECT_NAME`. See where and how below: 68 | - If you plan to develop on your local computer, clone it there. 69 | - If you plan to develop or deploy on a remote server/cluster without access to a build engine 70 | (e.g., EPFL Run:ai/Kubernetes clusters, SCITAS clusters), clone on your local machine. 71 | (You will build the image on your local machine, then clone on your server for deployment. 72 | Docker allows cross-platform builds with emulation, but it can be slow.
73 | We would recommend that your local machine is of the same platform as the cluster (e.g. `amd64`, `arm64`), 74 | or that you have access to a remote Docker engine running on the same platform as the cluster.) 75 | - If you plan to develop on a remote server/cluster with access to a build engine 76 | (e.g. EPFL HaaS, CSCS Clariden), clone it there. 77 | ```bash 78 | # For your local machine clone anywhere 79 | 80 | # For clusters with scratch filesystems with a cleaning policy, clone in your home directory (no cleaning policy). 81 | # The training artifacts will be later stored on the scratch filesystem and symlinked to this directory. 82 | 83 | # Note the creation of a `dev` instance of the repo (And later `run` instance for unattended jobs) 84 | # This allows to run unattended jobs in the `run` while changing the code in the `dev`. 85 | mkdir PROJECT_NAME 86 | cd PROJECT_NAME 87 | git clone dev 88 | cd dev 89 | # The current directory is referred to as PROJECT_ROOT 90 | ``` 91 | We will refer to the absolute path to the root of the repository as `PROJECT_ROOT`. 92 | 2. Fill the template variables in `template/template-variables.env` by 93 | editing the ones with the `$NEW_` prefix, then run the script 94 | ```bash 95 | # After filling the template variables in template/template-variables.env. 96 | ./template/change-project-name.sh 97 | ``` 98 | Commit. 99 | 3. Initialize the pre-commit hooks as described in the [contributing](#contributing) section. 100 | Update them to their latest version with `pre-commit autoupdate`. 101 | Commit. 102 | 4. Edit the `LICENSE` file, or delete it and remember to add one when open-sourcing your code. 103 | [(Some help here).](https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/licensing-a-repository) 104 | A simple change if you're fine with the MIT license is to replace the `2022 Skander Moalla` with your year and name. 105 | Same for the `CITATION.cff` file. 
106 | Commit. 107 | 5. Set up and edit the development environment instructions for the methods and platforms you will use and support. 108 | Each method supports a group of use cases: 109 | - **Docker**. 110 | This is the preferred method to run on Linux machines (e.g. EPFL HaaS servers), 111 | Windows machines with WSL, clusters running OCI-compliant containers like the EPFL Run:ai (Kubernetes) clusters, 112 | the SCITAS clusters, and other cloud services. 113 | (We provide tutorials for deployment with Docker, on the EPFL runai cluster, and on the SCITAS cluster.) 114 | 115 | The environment is shipped as a Linux Docker image, ensuring the highest level of reproducibility. 116 | You are free to choose the architecture you want to build the image for, 117 | e.g. `amd64` or `arm64`. 118 | By default, this image is set up for `amd64`. 119 | You are also free to choose the hardware acceleration you want to support. 120 | By default, this template allows local deployment with NVIDIA GPUs and can extend 121 | [NGC images](https://catalog.ngc.nvidia.com/containers). 122 | 123 | If you plan to support multiple platforms or hardware accelerations, 124 | you can duplicate this installation method 125 | or adapt it to support multiple platforms at the same time. 126 | 127 | Go to `installation/docker-amd64-cuda/README.md` for the setup. 128 | Come back here after following the instructions there. 129 | 130 | - **Conda**. 131 | The environment is shipped as a conda environment file. 132 | The level of reproducibility is lower than with Docker, as system dependencies will not be strictly recorded. 133 | The only reason this option is available is to leverage hardware acceleration of platforms not compatible with 134 | OCI containers, in particular, [MPS](https://developer.apple.com/metal/pytorch/) 135 | which is [not supported](https://github.com/pytorch/pytorch/issues/81224) 136 | on Docker for macOS with Apple Silicon. 
137 | 138 | By default, this option is set up for `osx-arm64` to run on macOS with Apple Silicon. 139 | This installation method could also be used if you want to settle for a lower level of reproducibility 140 | and do not need to run on container clusters. 141 | In that case, you might support another platform, e.g. `amd64`, and hardware acceleration, e.g., NVIDIA GPUs. 142 | 143 | If you plan to support multiple platforms or hardware accelerations, 144 | you can duplicate this installation method 145 | or adapt it to support multiple platforms at the same time. 146 | 147 | Go to `installation/conda-osx-arm64-mps/README.md` for the setup. 148 | Come back here after following the instructions there. 149 | 150 | Delete the installation directory for the installation method you don't use. 151 | 152 | Naturally, results will be reproducible on machines with the same architecture and hardware acceleration 153 | using the same installation method, 154 | but not necessarily across architectures and installation methods. 155 | This is because dependency versions may vary across platforms. 156 | Try to keep the dependency versions close to ensure an easy replicability of your results. 157 | 158 | 6. Edit this `README.md` file. 159 | 1. Edit the title with the name of your project. 160 | Replace the [Overview](#overview) section with a description of your project. 161 | 2. Delete the installation options you don't support in 162 | the [Getting Started](#getting-started) section. 163 | 3. Have a look at the last paragraph below describing how to keep your project in good shape, 164 | then delete this getting started, to only keep the project [Getting Started](#getting-started) section. 165 | 166 | You're off to a good start! If you made it here, give the template a star! 167 | Here are a few tips for keeping your project in good shape. 168 | 169 | - Keep this README up to date. 
170 | Fill in the rest of the sections after the Getting Started section when releasing your project. 171 | We give a structure and some templates for those. 172 | 173 | If you use datasets, follow `data/README.md` to set them and write the instructions 174 | for the subsequent users there. 175 | Otherwise, delete the [data](#data) section. 176 | 177 | Similarly, you can use the `outputs/README.md` file to share your trained models, logs, etc. 178 | - Remember to pin your dependencies whenever you install new ones. 179 | This is well described in the Maintaining the environment section of the installation instructions. 180 | - Keep your `reproducibility-scripts/` directory up to date. 181 | Commit it regularly and run your jobs with those scripts. 182 | More on this in the [reproducibility](#reproducing-our-results) section. 183 | - Maintain good commit hooks. More on this in the [Contributing](#contributing) section. 184 | - Have a look at the [ML Code Completeness Checklist](https://github.com/paperswithcode/releasing-research-code). 185 | This template facilitates meeting all the checklist items, with a different design. 186 | Have a look at the checklist when you ship your project. 187 | 188 | ## Getting started 189 | 190 | ### Code and development environment 191 | 192 | > [!NOTE] 193 | > **TEMPLATE TODO**: 194 | > Update the installation methods and platforms you support, delete the rest, and delete this note. 195 | > I.e. keep either Docker or Conda, or both, or multiple of each if you support multiple platforms. 196 | > 1. Specify the platform for each option and its description 197 | > e.g., for Docker amd64, arm64, etc., and for conda osx-arm64, linux-amd64, etc. 198 | > 2. Specify the hardware acceleration options for each platform 199 | > e.g., for Docker NVIDIA GPUs, AMD GPUs etc. 200 | > 3. 
Specify the hardware on which you ran your experiments (e.g., type of CPU/GPU and size of memory) and 201 | > the minimum hardware required to run your code if applicable (e.g., NVIDIA GPU with 80GB of memory). 202 | 203 | We support the following methods and platforms for installing the project dependencies and running the code. 204 | 205 | - **Docker/OCI-container for AMD64 machines (+ NVIDIA GPUs)**: 206 | This option works for machines with AMD64 CPUs and NVIDIA GPUs. 207 | E.g. Linux machines (EPFL HaaS servers, VMs on cloud providers), 208 | Windows machines with WSL, and clusters running OCI-compliant containers, 209 | like the EPFL Run:ai (Kubernetes) clusters. 210 | 211 | Follow the instructions in `installation/docker-amd64-cuda/README.md` to install the environment 212 | then get back here for the rest of the instructions to run the experiments. 213 | 214 | We ran our experiments on TODO: FILL IN THE HARDWARE YOU USED. 215 | To run them, you should have at least TODO: FILL IN THE MINIMUM HARDWARE REQS IF APPLICABLE. 216 | 217 | - **Conda for osx-arm64** 218 | This option works for macOS machines with Apple Silicon and can leverage MPS acceleration. 219 | 220 | Follow the instructions in `installation/conda-osx-arm64-mps/README.md` to install the environment 221 | then get back here for the rest of the instructions to run the experiments. 222 | 223 | We ran our experiments on TODO: FILL IN THE HARDWARE YOU USED. 224 | To run them, you should have at least TODO: FILL IN THE MINIMUM HARDWARE REQS IF APPLICABLE. 225 | 226 | ### Data 227 | 228 | > [!NOTE] 229 | > **TEMPLATE TODO**: 230 | > Fill `data/README.md` or delete this section, then delete this note. 231 | 232 | Refer to `data/README.md`. 233 | 234 | ### Logging and tracking experiments 235 | 236 | We use [Weights & Biases](https://wandb.ai/site) to log and track our experiments. 
237 | If you're logged in, your default entity will be used (a fixed entity is not set in the config), 238 | and you can set another entity with the `WANDB_ENTITY` environment variable. 239 | Otherwise, the runs will be anonymous (you don't need to be logged in). 240 | 241 | ## Reproduction and experimentation 242 | 243 | ### Reproducing our results 244 | 245 | > [!NOTE] 246 | > **TEMPLATE TODO**: 247 | > Keep these scripts up to date and run your experiments using them. 248 | > Do provide the W&B runs and trained models or update this section. 249 | > Delete this note when shipping. 250 | 251 | We provide scripts to reproduce our work in the `reproducibility-scripts/` directory. 252 | It has a README at its root describing which scripts reproduce which experiments. 253 | 254 | We share our Weights and Biases runs in [this W&B project](https://wandb.ai/claire-labo/template-project-name). 255 | 256 | Moreover, we make our trained models available. 257 | You can follow the instructions in `outputs/README.md` to download and use them. 258 | 259 | ### Experiment with different configurations 260 | 261 | The default configuration for each script is stored in the `configs/` directory. 262 | They are managed by [Hydra](https://hydra.cc/docs/intro/). 263 | You can experiment with different configurations by passing the relevant arguments. 264 | You can get examples of how to do so in the `reproducibility-scripts/` directory. 265 | 266 | ## Repository structure 267 | 268 | > [!NOTE] 269 | > **TEMPLATE TODO**: 270 | > Provide a quick overview of the main files in the repo for users to understand your code, 271 | > then delete this note. 272 | 273 | Below, we give a description of the main files and directories in this repository. 274 | 275 | ``` 276 | └─── src/ # Source code. 277 | └── template_package_name # Our package. 278 | ├── configs/ # Hydra configuration files. 279 | └── template_experiment.py # A template experiment. 
280 | ``` 281 | 282 | ## Contributing 283 | 284 | We use [`pre-commit`](https://pre-commit.com) hooks to ensure high-quality code. 285 | Make sure it's installed on the system where you're developing 286 | (it is in the dependencies of the project, but you may be editing the code from outside the development environment. 287 | If you have conda you can install it in your base environment, otherwise, you can install it with `brew`). 288 | Install the pre-commit hooks with 289 | 290 | ```bash 291 | # When in the PROJECT_ROOT. 292 | pre-commit install --install-hooks 293 | ``` 294 | 295 | Then every time you commit, the pre-commit hooks will be triggered. 296 | You can also trigger them manually with: 297 | 298 | ```bash 299 | pre-commit run --all-files 300 | ``` 301 | 302 | ## Licenses and acknowledgements 303 | 304 | This project is licensed under the LICENSE file in the root directory of the project. 305 | 306 | The initial code of this repository has been initiated by the [Python Machine Learning Research Template](https://github.com/CLAIRE-Labo/python-ml-research-template) 307 | with the LICENSE.ml-template file. 308 | 309 | Additional LICENSE files may be present in subdirectories of the project. 310 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | # Instructions for the data 2 | 3 | ## [TEMPLATE] Where and how to set up the data 4 | 5 | > [!IMPORTANT] 6 | > **TEMPLATE TODO:** 7 | > Update the instructions below to explain how to obtain the data and delete this section. 8 | 9 | The template provides the `PROJECT_ROOT/data/` directory as a placeholder for the data used in the project. 10 | This allows the experiment code to always refer to the same path for the data independently of the deployment method 11 | and the user configuration for better reproducibility. 
12 | The directory can be accessed in the experiments with `config.data_dir`. 13 | Of course, this doesn't mean that the datasets inside `PROJECT_ROOT/data/` need to be physically in the same directory 14 | as the project. 15 | You can create symlinks to them. 16 | This shifts the data path configuration from the code and config to the installation steps 17 | (which we prefer, as it makes the committed code identical across deployment options). 18 | This is also more convenient than using environment variables to point to individual dataset locations. 19 | 20 | Below, you can instruct the users on how to download or link to the data and preprocess it. 21 | 22 | When the data is small enough (a few MBs), 23 | you can instruct the users (including you) to download it in the `PROJECT_ROOT/data/` directory. 24 | 25 | Otherwise, you can provide hints to them on how to download it (or reuse parts of it) in a separate storage 26 | (likely in a shared storage where some datasets already exist) and then create symlinks to the different parts. 27 | For managed clusters where you need to mount different filesystems, remember to add this to the deployment scripts 28 | and setup files (e.g. `compose.yaml` for deployment with Docker.) 29 | 30 | Here are example instructions: 31 | 32 | To set up the `data` directory, you can download the data anywhere on your system and then symlink to the data from 33 | the `PROJECT_ROOT/data/` directory. 34 | 35 | ```bash 36 | # The dataset already exists at /absolute-path/to/some-dataset 37 | # FROM the PROJECT_ROOT do 38 | ln -s /absolute-path/to/some-dataset data/some-dataset 39 | # Do this for each dataset root. 40 | # TEMPLATE TODO list all dataset roots (it's better to group them and use the groups accordingly in your code). 41 | ``` 42 | 43 | Be mindful that for the different deployment methods with container engines you will have to mount the filesystems 44 | where the data is stored (E.g.
the local deployment option with Docker, and the container deployment on managed clusters) 45 | 46 | `TEMPLATE TODO:` For the local deployment option with Docker you would edit the `../installation/docker-*/compose.yaml` 47 | file for the local deployment option with Docker, 48 | for the managed clusters you would edit the flags of the cluster client (`runai`, `srun`, etc.). 49 | Avoid nested mounts. 50 | It's better to mount the whole "scratch" filesystem and let the symlinks handle the rest. 51 | 52 | ## Description of the data 53 | 54 | ## Instructions to obtain the data 55 | 56 | ## Instructions to process the data 57 | -------------------------------------------------------------------------------- /installation/conda-osx-arm64-mps/README.md: -------------------------------------------------------------------------------- 1 | # Installation with conda 2 | 3 | ## Template getting started 4 | 5 | > [!NOTE] 6 | > **TEMPLATE TODO:** 7 | > Follow the instructions then delete this section. 8 | 9 | This template provides a minimal `environment.yml` file for setting a conda environment. 10 | Follow the steps below to get started. 11 | Some steps will send you to different sections of the document. 12 | It may feel like jumping back and forth, but everything should read nicely after the setup 13 | for your future users (and yourself). 14 | 15 | 1. Choose the platform and hardware acceleration that you will build the environment for. 16 | You have to pick one as fully specified conda environment files are not trivially 17 | portable across platforms and hardware accelerations. 18 | Packages are different for different platforms and hardware accelerations, 19 | so you cannot freeze an environment used for a platform and create it in another. 20 | 21 | The default platform is macOS on Apple Silicon `osx-arm64` to get support for `mps` hardware acceleration 22 | (reflected in the name of the directory `conda-osx-arm64-mps` by default). 
23 | To edit it, run 24 | ```bash 25 | # When in the PROJECT_ROOT directory. 26 | # For examples run: 27 | ./installation/edit-platform-and-acceleration.sh 28 | # To do the change run: 29 | ./installation/edit-platform-and-acceleration.sh change conda CURR_PLATFORM CURR_ACCELERATION NEW_PLATFORM NEW_ACCELERATION 30 | # For a list of available platforms you can see the installers below 31 | # https://anaconda.org/pytorch/pytorch 32 | # The hardware acceleration will be determined by the packages you install. 33 | # E.g. if you install PyTorch with CUDA, set the acceleration to cuda. 34 | # Note: new PyTorch versions are only distributed on PyPI (i.e. with `pip`). 35 | ``` 36 | If you plan to support multiple platforms or hardware accelerations, 37 | you can duplicate this installation method directory 38 | with `./installation/edit-platform-and-acceleration.sh copy ...` 39 | then perform the setup again. 40 | 2. You can try to specify your dependencies if you are sure of how to install them and that they are compatible. 41 | Otherwise, you should build with the default dependencies and install them interactively in the running container 42 | then freeze them in the dependency files once you are sure of which to include and how to include them. 43 | You will find more information in the [instructions to maintain the environment](#from-python-instructions-to-maintain-the-environment). 44 | The Python version and package name have already been filled by the `fill-template.sh` script. 45 | 46 | If you change the dependency files commit so that you can track what worked and what didn't. 47 | 3. Create the environment following the user 48 | [instructions to create the environment](#creating-the-environment) below. 49 | 4. Get familiar with running the environment following the user [instructions to 50 | run the environment](#running-the-code-in-the-environment). 51 | 5. 
If everything works fine, 52 | (we suggest checking that all of your dependencies are there with `mamba list`, 53 | and trying to import the important ones), 54 | then pin the dependencies you got following the [freeze the environment](#freeze-the-environment) section. 55 | You can then add more dependencies as your project grows following 56 | the [instructions to maintain the environment](#maintaining-the-environment). 57 | Commit. 58 | 6. Go back to the root README for the rest of the instructions to set the template up. 59 | 60 | ## Cloning the repository 61 | 62 | Clone the git repository. 63 | 64 | ```bash 65 | # Keep a /dev copy for development and a /run copy for running unattended experiments. 66 | mkdir template-project-name 67 | cd template-project-name 68 | git clone dev 69 | cd dev 70 | ``` 71 | 72 | We will refer the absolute path to the root of the repository as `PROJECT_ROOT`. 73 | 74 | ## Creating the environment 75 | 76 | **Prerequisites** 77 | 78 | - `brew`: [Homebrew](https://brew.sh/). 79 | - `mamba` (or equivalently `conda`): we recommend [Miniforge](https://github.com/conda-forge/miniforge). 80 | 81 | **Installation** 82 | 83 | System dependencies: 84 | 85 | We list below the important system dependencies that are not available in conda, 86 | but it is hard to list all the system dependencies needed to run the code. 87 | We let you install the missing ones when you encounter errors. 88 | 89 | - None. 90 | 91 | The conda environment: 92 | 93 | Create the environment with 94 | 95 | ```bash 96 | # When in the PROJECT_ROOT directory. 97 | mamba env create --file installation/conda-osx-arm64-mps/environment.yml 98 | ``` 99 | 100 | Install the project with 101 | 102 | ```bash 103 | # Activate the environment 104 | mamba activate template-project-name 105 | # When in the PROJECT_ROOT directory. 106 | pip install -e . 
107 | ``` 108 | 109 | ## Running code in the environment 110 | 111 | ```bash 112 | mamba activate template-project-name 113 | ``` 114 | 115 | Run scripts from the `PROJECT_ROOT` directory. 116 | Here are some examples. 117 | 118 | ```bash 119 | # When in the PROJECT_ROOT directory. 120 | # template_experiment is an actual script that you can run. 121 | python -m template_package_name.template_experiment some_arg=some_value 122 | zsh reproducibility-scripts/template-experiment.sh 123 | ``` 124 | 125 | The environment is set up. 126 | Return to the root README for the rest of the instructions to run our experiments. 127 | 128 | ## Maintaining the environment 129 | 130 | System dependencies are managed by conda, or, when not available, by brew. 131 | (We try to keep everything self-contained as much as possible.) 132 | Python dependencies are managed by both conda and pip. 133 | 134 | - Use `conda` for system and non-Python dependencies needed to run the project code (e.g., image libraries, etc.). 135 | If not available on conda, use `brew`. 136 | - Use `conda` for Python dependencies packaged with more than just Python code (e.g. `pytorch`, `numpy`). 137 | These will typically be your main dependencies and will likely not change as your project grows. 138 | Note: new PyTorch versions are only distributed on PyPI (i.e., with `pip`). 139 | - Use `pip` for the rest of the Python dependencies (e.g. `tqdm`). 140 | - For more complex dependencies that may require a custom installation or build, 141 | manually follow their installation steps. 142 | 143 | Here are references and reasons to follow the above claims: 144 | 145 | * [A guide for managing conda + `pip` environments](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#using-pip-in-an-environment). 146 | * [Reasons to use conda for not-Python-only dependencies](https://numpy.org/install/#numpy-packages--accelerated-linear-algebra-libraries). 
147 | * [Ways of combining conda and `pip`](https://medium.com/data-science/conda-essential-concepts-and-tricks-e478ed53b5b#42cb). 148 | 149 | There are two ways to add dependencies to the environment: 150 | 151 | 1. **Manually edit the `environment.yml` file.** 152 | This is used the first time you set up the environment. 153 | It will also be useful if you run into conflicts and have to restart from scratch. 154 | 2. **Add/upgrade dependencies interactively** while running a shell with the environment activated 155 | to experiment with which dependency is needed. 156 | This is probably what you'll be doing after creating the environment for the first time. 157 | 158 | In both cases, after any change, a snapshot of the full environment specification should be saved. 159 | We describe how to do so in the freeze the environment section. 160 | Remember to commit the changes every time you freeze the environment. 161 | 162 | ### Manual editing (before/while building) 163 | 164 | - To edit the conda and pip dependencies, edit the `environment.yml` file. 165 | - For the `brew` and the more complex dependencies, describe the installation steps in the 166 | [Creating the environment](#creating-the-environment) section. 167 | 168 | When manually editing the `environment.yml` file, 169 | you do not need to specify the version of all the dependencies, 170 | these will be written to the file when you freeze the environment. 171 | You should just specify the major versions of specific dependencies you need. 172 | 173 | After manually editing the `environment.yml` file, you need to recreate the environment. 174 | 175 | ```bash 176 | # When in the PROJECT_ROOT directory. 
177 | mamba deactivate 178 | mamba env remove --name template-project-name 179 | mamba env create --file installation/conda-osx-arm64-mps/environment.yml 180 | mamba activate template-project-name 181 | ``` 182 | 183 | ### Interactively (while developing) 184 | 185 | Conda dependencies should all be installed before any `pip` dependency. 186 | This will cause conflicts otherwise as conda doesn't track the `pip` dependencies. 187 | So if you need to add a conda dependency after you already installed some `pip` dependencies, you need to 188 | manually add the dependency to the `environment.yml` file then recreate the environment. 189 | 190 | * To add conda/pip dependencies run `(mamba | pip) install ` 191 | * To add a `brew` dependency run `brew install ` 192 | 193 | ### Freeze the environment 194 | 195 | After any change to the dependencies, a snapshot of the full environment specification should be written to the 196 | `environment.yml` file. 197 | This includes manual changes to the file and changes made interactively. 198 | This is to ensure that the environment is reproducible and that the dependencies are tracked at any point in time. 199 | 200 | To do so, run the following command. 201 | The script overwrites the `environment.yml` file with the current environment specification, 202 | so it's a good idea to commit the changes to the environment file before and after running it. 203 | 204 | ```bash 205 | # When in the PROJECT_ROOT directory. 206 | zsh installation/conda-osx-arm64-mps/update-env-file.sh 207 | ``` 208 | 209 | There are some caveats (e.g., packages installed from GitHub with pip), so have a look at 210 | the output file to make sure it does what you want. 211 | The `update-env-file.sh` gives some hints for what to do, and in any case you can always patch the file manually. 
212 | 213 | For `brew` and more complex dependencies describe how to install them in the system dependencies section of 214 | the [instructions to install the environment](#creating-the-environment). 215 | 216 | If one of the complex dependencies shows in the `environment.yml` after the freeze, 217 | you have to remove it, so that conda does not pick it up, and it is installed later by the user. 218 | 219 | ## Troubleshooting 220 | -------------------------------------------------------------------------------- /installation/conda-osx-arm64-mps/environment.yml: -------------------------------------------------------------------------------- 1 | name: template-project-name 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.10 6 | - pip 7 | - pip: 8 | - hydra-core 9 | - tqdm 10 | - wandb 11 | - pre-commit 12 | - black 13 | -------------------------------------------------------------------------------- /installation/conda-osx-arm64-mps/update-env-file.sh: -------------------------------------------------------------------------------- 1 | # Records the current environment to a file. 2 | # Packages installed from GitHub with pip install will not be recorded 3 | # properly (i.e. the link can be omitted and just replaced with the version). 4 | # In that case, you have to update this file to add commands that 5 | # will fix the environment file. 6 | # (you could also patch the file manually afterwards). 7 | # Similarly the conda channels used to install packages may not be recorded properly 8 | # if you used complex combinations of channels. 9 | # In that case you also have to make the edits here or patch the file manually. 10 | 11 | ENVIR_FILE="installation/conda-osx-arm64-mps/environment.yml" 12 | conda env export --file "$ENVIR_FILE" 13 | 14 | # Delete the path line. 15 | sed -i.deleteme "$ d" "$ENVIR_FILE" 16 | # Set the package to a local installation. 
17 | sed -i.deleteme "/template-project-name==/d" "$ENVIR_FILE" 18 | # .deleteme is a trick to make sed work the same way on both Linux and OSX. 19 | # https://stackoverflow.com/questions/5694228/sed-in-place-flag-that-works-both-on-mac-bsd-and-linux 20 | rm "${ENVIR_FILE}.deleteme" 21 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/.dockerignore: -------------------------------------------------------------------------------- 1 | CSCS-Clariden-setup 2 | EPFL-runai-setup 3 | EPFL-SCITAS-setup 4 | from-python-template 5 | from-scratch-template 6 | README.md 7 | template.sh 8 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/CSCS-Clariden-setup/.gitignore: -------------------------------------------------------------------------------- 1 | # If you want to put your own scripts containing API keys (e.g. W&B). 2 | submit-scripts/ 3 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/CSCS-Clariden-setup/README.md: -------------------------------------------------------------------------------- 1 | # Guide for using the template with the CSCS Clariden cluster 2 | 3 | ## Overview 4 | 5 | This guide will show you how to build and run your image on the CSCS Clariden cluster and use it for 6 | 7 | 1. Remote development. 8 | 2. Running unattended jobs. 9 | 10 | ## Clone your repository in your home directory 11 | 12 | We strongly suggest having two instances of your project repository. 13 | 14 | 1. One for development, which may have uncommitted changes, be in a broken state, etc. 15 | 2. One for running unattended jobs, which is always referring to a commit at a working state of the code. 16 | 17 | The outputs and data directories of those two instances will be symlinked to the scratch storage 18 | and will be shared anyway. 
19 | This guide includes the steps to do it, and there are general details in `data/README.md` and `outputs/README.md`. 20 | 21 | ```bash 22 | # SSH to a cluster. 23 | ssh clariden 24 | mkdir -p $HOME/projects/template-project-name 25 | cd $HOME/projects/template-project-name 26 | git clone dev 27 | git clone run 28 | 29 | # To setup symlinks to the scratch storage you can run the following commands 30 | mkdir -p $SCRATCH/projects/template-project-name/data/dev 31 | mkdir -p $SCRATCH/projects/template-project-name/outputs/dev 32 | for instance in run dev; do 33 | ln -s $SCRATCH/projects/template-project-name/data/dev $HOME/projects/template-project-name/$instance/data/dev 34 | ln -s $SCRATCH/projects/template-project-name/outputs/dev $HOME/projects/template-project-name/$instance/outputs/dev 35 | done 36 | ``` 37 | 38 | The rest of the instructions should be performed on the cluster from the dev instance of the project. 39 | 40 | ```bash 41 | cd dev 42 | # It may also be useful to open a remote code editor on a login node to view the project. 43 | # (The remote development will happen in another IDE in the container.) 44 | cd installation/docker-amd64-cuda 45 | ``` 46 | 47 | ## Building the environment (skip if already have access to the image) 48 | 49 | > [!NOTE] 50 | > **TEMPLATE TODO:** 51 | > After saving your generic image, provide the image location to your teammates. 52 | > Ideally also push it to team registry and later on a public registry if you open-source your project. 53 | > Add it below in the TODO ADD IMAGE PATH. 54 | 55 | ### Prerequisites 56 | 57 | * `podman` (Already installed on the CSCS clusters). 
Configure it as described [here](https://confluence.cscs.ch/display/KB/LLM+Inference) 58 | (step after "To use Podman, we first need to configure some storage ...") 59 | * `podman-compose` (A utility to run Docker compose files with Podman) [Install here](https://github.com/containers/podman-compose/tree/main) 60 | or follow the steps below for an installation from scratch on CSCS. 61 | 62 | ```bash 63 | # Install Miniconda 64 | curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" 65 | bash Miniforge3-$(uname)-$(uname -m).sh 66 | # Follow the instructions 67 | # Close and reopen your terminal 68 | bash 69 | # Create a new conda environment 70 | mamba create -n podman python=3.10 71 | mamba activate podman 72 | pip install podman-compose 73 | 74 | # Activate this environment whenever you use this template. 75 | ``` 76 | 77 | ### Build the images 78 | 79 | All commands should be run from the `installation/docker-amd64-cuda/` directory. 80 | 81 | You should be on a compute node. If not already, get one. 82 | ```bash 83 | # Request a compute node 84 | sbatch --time 4:00:00 -A a-a10 --wrap "sleep infinity" --output=/dev/null --error=/dev/null 85 | # Connect to it 86 | srun --overlap --pty --jobid=GET_THE_JOB_ID bash 87 | tmux 88 | # or if reconnecting 89 | tmux at 90 | ``` 91 | 92 | ```bash 93 | cd installation/docker-amd64-cuda 94 | ``` 95 | 96 | 1. Create an environment file for your personal configuration with 97 | ```bash 98 | ./template.sh env 99 | ``` 100 | This creates a `.env` file with pre-filled values. 101 | - Edit the `DOCKER` variable to `podman` and the `COMPOSE` variable to `podman-compose`. 102 | - The rest of the variables are set correctly (`USR, USRID, GRP, GRPID, and PASSW`, 103 | e.g.`LAB_NAME` will be the first element in name of the local images you get, 104 | it's by default your horizontal/vertical) 105 | - You can ignore the rest of the variables after `## For running locally`. 
106 | 2. Edit the Dockerfile to make it compatible with Podman: 107 | There are commented lines starting with `# Podman` which should be uncommented 108 | to replace the corresponding lines above them. 109 | 3. Build the generic image. 110 | This is the image with root as user. 111 | It will be named according to the image name in your `.env`. 112 | It will be tagged with `-root-latest` and if you're building it, 113 | it will also be tagged with the latest git commit hash `-root-` and `-root-`. 114 | ```bash 115 | # Make sure the Conda environment with podman-compose is activated. 116 | # mamba activate podman 117 | ./template.sh build_generic 118 | ``` 119 | 4. Export the image to a file and move it to a directory where you keep the images. 120 | ```bash 121 | ./template.sh import_from_podman 122 | # Move the images 123 | # Make a directory where you store your images 124 | # Add it to your bashrc as it'll be used often 125 | CONTAINER_IMAGES=$SCRATCH/container-images 126 | mkdir -p $CONTAINER_IMAGES 127 | mv *.sqsh $CONTAINER_IMAGES 128 | ``` 129 | 5. You can run quick checks on the image to check that it has what you expect it to have. 130 | When the example scripts are described later, run the `test-interactive.sh` example script before the other scripts. 131 | 132 | ## Getting your image (if already built, or just built) 133 | 134 | #### From a file 135 | 136 | You will find the image to use for this project in _TODO ADD IMAGE_PATH_. 137 | Copy it or create a symlink to it where you keep your images. 
E.g., 138 | ```bash 139 | # Make a directory where you store your images 140 | # Add it to your bashrc as it'll be used often 141 | CONTAINER_IMAGES=$SCRATCH/container-images 142 | mkdir -p $CONTAINER_IMAGES 143 | # Copy the image with an adapted name with your horizontal/vertical name and username 144 | # (it will be readily-usable by the submit scripts) 145 | cp _TODO ADD IMAGE_PATH_ $CONTAINER_IMAGES/ADAPTED_NAME.sqsh 146 | ``` 147 | 148 | #### From a registry 149 | 150 | > [!NOTE] 151 | > **TEMPLATE TODO:** 152 | > You can push your image to a registry after building and provide the path to your teammates. 153 | 154 | Example submit scripts are provided in the `example-submit-scripts` directory and are used in the following examples. 155 | You can copy them to the directory `submit-scripts` which is not tracked by git and edit them to your needs. 156 | 157 | ### A quick test to understand how the template works 158 | 159 | Adapt the `submit-scripts/minimal.sh` with the name of your image and your cluster storage setup 160 | (should be correct by default). 161 | 162 | The submission script gives an example of how to run containers on Clariden with [`enroot`](https://github.com/NVIDIA/enroot) 163 | and the [`pyxis`](https://github.com/NVIDIA/pyxis) plugin directly integrated in `srun`. 164 | 165 | Run the script to see how the template works. 166 | ```bash 167 | cd installation/docker-amd64-cuda/CSCS-Clariden-setup/submit-scripts 168 | bash minimal.sh 169 | ``` 170 | 171 | When the container starts, its entrypoint does the following: 172 | 173 | - It runs the entrypoint of the base image if you specified it in the `compose-base.yaml` file. 174 | - It expects you to specify `PROJECT_ROOT_AT` 175 | and expects `PROJECT_ROOT_AT` to be the working directory of the container. 176 | Otherwise, it will issue a warning and set it to the default working directory of the container. 177 | - It then tries to install the project in editable mode. 
178 | This is a lightweight installation that allows to avoid all the hacky import path manipulations. 179 | (This will be skipped if `PROJECT_ROOT_AT` has not been specified or if you specify `SKIP_INSTALL_PROJECT=1`.) 180 | - It also handles all the remote development setups (VS Code, Cursor, PyCharm, Jupyter, ...) 181 | that you specify with environment variables. 182 | These are described in the later sections of this README. 183 | - Finally, it executes a provided command (e.g. `bash` here for an interactive job with a connected --pty). 184 | 185 | You need to make sure that this minimal submission works before proceeding. 186 | The logs of the entrypoint are only shown in case there was an error (design from pyxis). 187 | (A current workaround runs the entrypoint as a script at the start instead of as an entrypoint) 188 | 189 | If the entrypoint fails the installation of your project, you can resubmit your job with `export SKIP_INSTALL_PROJECT=1` 190 | which will skip the installation step then you can replay the installation manually in the container to debug it. 191 | 192 | ## Use cases 193 | 194 | The basic configuration for the project's environment is now set up. 195 | You can follow the remaining sections below to see how to run unattended jobs and set up remote development. 196 | After that, return to the root README for the rest of the instructions to run our experiments. 197 | 198 | 199 | ### Running unattended jobs 200 | 201 | By performing the above first steps, you should have all the required setup to run unattended jobs. 202 | The main difference is that the unattended job is run with `sbatch`. 203 | An example of an unattended job can be found in `submit-scripts/unattended.sh` to run with `sbatch`. 204 | Note the emphasis on having a frozen copy `run` of the repository for running unattended jobs. 205 | 206 | ### Weights&Biases 207 | 208 | Your W&B API key should be exposed as the `WANDB_API_KEY` environment variable. 
209 | You can export it or if you're sharing the script with others export a location to a file containing it with 210 | `export WANDB_API_KEY_FILE_AT` and let the template handle it. 211 | 212 | E.g., 213 | 214 | ```bash 215 | echo > $HOME/.wandb-api-key 216 | chmod 600 $HOME/.wandb-api-key 217 | ``` 218 | 219 | 220 | Then `export WANDB_API_KEY_FILE_AT=$HOME/.wandb-api-key` in the submit script. 221 | You should also mount the file in the container. 222 | 223 | ### Hugging Face 224 | 225 | Your HF API key should be exposed as the `HF_TOKEN` environment variable. 226 | You can export it or if you're sharing the script with others export a location to a file containing it with 227 | `export HF_TOKEN_AT` and let the template handle it. 228 | 229 | E.g., 230 | 231 | ```bash 232 | echo > $HOME/.hf-token 233 | chmod 600 $HOME/.hf-token 234 | ``` 235 | 236 | Then `export HF_TOKEN_AT=$HOME/.hf-token` in the submit script. 237 | You should also mount the file in the container. 238 | 239 | ### Remote development 240 | 241 | This would be the typical use case for a researcher at CLAIRE using the cluster as their daily driver to do 242 | development, testing, and debugging. 243 | Your job would be running a remote IDE/code editor on the cluster, and you would only have a lightweight local client 244 | running on your laptop. 245 | 246 | The entrypoint will start an ssh server and a remote development server for your preferred IDE/code editor 247 | when you set some environment variables. 248 | An example of an interactive job submission can be found in `submit-scripts/remote-development.sh` 249 | to run with `sbatch`. 250 | 251 | Below, we list and describe in more detail the tools and IDEs supported for remote development. 252 | 253 | ### SSH Configuration (Necessary for PyCharm, VS Code, and Cursor) 254 | 255 | Your job will open an ssh server when you set the environment variable `SSH_SERVER=1`. 
256 | You also have to mount the authorized keys file from your home directory to the container (done in the example). 257 | The SSH connection is necessary for some remote IDEs like PyCharm to work and can be beneficial 258 | for other things like ssh key forwarding. 259 | The ssh server is configured to run on port 2223 of the container. 260 | 261 | With the ssh connection, you can forward the ssh keys on your local machine (that you use for GitHub, etc.) 262 | to the remote server. 263 | This allows using the ssh keys on the remote server without having to copy them there. 264 | 265 | For that, you need three things: an ssh agent running on your local machine, the key added to the agent, 266 | and a configuration file saying that the agent should be used with the ssh connection to Clariden. 267 | GitHub provides a guide for that 268 | [here (look at the troubleshooting section too)](https://docs.github.com/en/authentication/connecting-to-github-with-ssh/using-ssh-agent-forwarding). 269 | 270 | Use the following configuration in your local `~/.ssh/config` 271 | 272 | ```bash 273 | Host clariden 274 | HostName clariden.cscs.ch 275 | User smoalla 276 | ProxyJump ela 277 | ForwardAgent yes 278 | 279 | # EDIT THIS HOSTNAME WITH EVERY NEW JOB 280 | Host clariden-job 281 | HostName nid007545 282 | User smoalla 283 | ProxyJump clariden 284 | StrictHostKeyChecking no 285 | UserKnownHostsFile=/dev/null 286 | ForwardAgent yes 287 | 288 | Host clariden-container 289 | HostName localhost 290 | ProxyJump clariden-job 291 | Port 2223 292 | User smoalla 293 | StrictHostKeyChecking no 294 | UserKnownHostsFile=/dev/null 295 | ForwardAgent yes 296 | ``` 297 | 298 | To update the hostname of the `clariden-job` you can add this to your `~/.zshrc` on macOS for example: 299 | 300 | ```bash 301 | # Tested on macos with zsh 302 | function update-ssh-config() { 303 | local config_file="$HOME/.ssh/config" # Adjust this path if needed 304 | local host="$1" 305 | local new_hostname="$2" 306 | 
307 | if [[ -z "$host" || -z "$new_hostname" ]]; then 308 | echo "Usage: update-ssh-config " 309 | return 1 310 | fi 311 | 312 | sed -i '' '/Host '"$host"'/,/Host / s/^[[:space:]]*HostName.*/ HostName '"$new_hostname"'/' "$config_file" 313 | echo "Updated HostName for '${host}' to '${new_hostname}' in ~/.ssh/config" 314 | } 315 | ``` 316 | 317 | The `StrictHostKeyChecking no` and `UserKnownHostsFile=/dev/null` allow bypass checking the identity 318 | of the host [(ref)](https://linuxcommando.blogspot.com/2008/10/how-to-disable-ssh-host-key-checking.html) 319 | which keeps changing every time a job is scheduled, 320 | so that you don't have to reset it each time. 321 | 322 | With this config you can then connect to your container with `ssh clariden-container`. 323 | 324 | **Limitations** 325 | 326 | Note that an ssh connection to the container is not like executing a shell on the container. 327 | In particular, the following limitations apply: 328 | 329 | - environment variables in the image sent to the entrypoint of the container and any command exec'ed in it 330 | are not available in ssh connections. 331 | There is a workaround for that in `entrypoints/remote-development-setup.sh` when opening an ssh server 332 | which should work for most cases, but you may still want to adapt it to your needs. 333 | 334 | ### Git config 335 | 336 | You can persist your Git config (username, email, etc.) by mounting it in the container. 337 | This is done in the examples. 338 | 339 | E.g., create your config in your home directory with 340 | 341 | ```bash 342 | cat >$HOME/.gitconfig < Tools -> Terminal. 412 | * When running Run/Debug configurations, set your working directory the project root (`$PROJECT_ROOT_AT`), not the script's directory. 413 | * Your interpreter will be 414 | * the system Python `/usr/bin/python` with the `from-python` option. 
415 | * the Python in your conda environment with the `from-scratch` option, with the conda binary found at `/opt/conda/condabin/conda`. 416 | 417 | **Limitations:** 418 | 419 | - The terminal in PyCharm opens ssh connections to the container, 420 | so the workaround (and its limitations) in the ssh section apply. 421 | If needed, you could just open a separate terminal on your local machine 422 | and directly exec a shell into the container. 423 | - It's not clear which environment variables are passed to the programs run from the IDE like the debugger. 424 | So far, it seems like the SSH env variables workaround works fine for this. 425 | - Support for programs with graphical interfaces (i.g. forwarding their interface) has not been tested yet. 426 | 427 | ### VSCode / Cursor 428 | 429 | We support the [Remote Development using SSH ](https://code.visualstudio.com/docs/remote/ssh) 430 | feature of VS code that runs a remote IDE in the container via SSH. To set this up for Cursor, simply replace `VSCODE` by `CURSOR` and `vscode` by `cursor` in all instructions below. For example, `VSCODE_SERVER_AT` becomes `CURSOR_SERVER_AT`, and `~/.vscode-server` becomes `~/.cursor-server`. 431 | 432 | **Preliminaries: saving the IDE configuration** 433 | 434 | The remote IDE stores its configuration (e.g., the extensions you set up) in `~/.vscode-server`. 435 | To have it preserved between different dev containers, you should specify the 436 | `VSCODE_SERVER_AT` env variable with your submit command 437 | as shown in the examples in `submit-scripts/remote-development.sh`. 438 | The template will use it to store the IDE configuration and cache in a separate directory 439 | per project (defined by its $PROJECT_ROOT_AT). 440 | All the directories will be created automatically. 441 | 442 | **ssh configuration** 443 | 444 | VS Code takes ssh configuration from files. 
445 | Follow the steps in the [SSH configuration section](#ssh-configuration-necessary-for-pycharm-and-vs-code) 446 | to set up your ssh config file. 447 | 448 | **Connecting VS Code to the container**: 449 | 450 | 1. `mkdir $HOME/vscode-server` 451 | 2. In your submit command, set the environment variables for 452 | - Opening an ssh server `SSH_SERVER=1`. 453 | - preserving your config `VSCODE_SERVER_AT`. 454 | And add `VSCODE_SERVER_AT` in the `--container-mounts`. 455 | 3. Have the [Remote - SSH](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-ssh) 456 | extension on your local VS Code. 457 | 4. Connect to the ssh host following the 458 | steps [here](https://code.visualstudio.com/docs/remote/ssh#_connect-to-a-remote-host). 459 | 460 | The directory to add to your VS Code workspace should be the same as the one specified in the `PROJECT_ROOT_AT`. 461 | 462 | **Limitations** 463 | 464 | - The terminal in VS Code opens ssh connections to the container, 465 | so the workaround (and its limitations) in the ssh section apply. 466 | If needed, you could just open a separate terminal on your local machine 467 | and directly exec a shell into the container. 468 | - Support for programs with graphical interfaces (i.g. forwarding their interface) has not been tested yet. 469 | 470 | ### JupyterLab (TODO) 471 | 472 | ### Examples 473 | 474 | We provide examples of how to use the template in the `submit-scripts` directory. 475 | 476 | ### Troubleshooting 477 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/CSCS-Clariden-setup/example-submit-scripts/README.md: -------------------------------------------------------------------------------- 1 | # Tips and Best Practices for Running Jobs with Slurm + pyxis + enroot 2 | 3 | Placeholder. 
4 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/CSCS-Clariden-setup/example-submit-scripts/edf.toml: -------------------------------------------------------------------------------- 1 | [annotations] 2 | com.hooks.aws_ofi_nccl.enabled = "true" 3 | com.hooks.aws_ofi_nccl.variant = "cuda12" 4 | 5 | [env] 6 | NCCL_DEBUG = "INFO" 7 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/CSCS-Clariden-setup/example-submit-scripts/minimal.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Variables used by the entrypoint script 4 | # Change this to the path of your project (can be the /dev or /run copy) 5 | export PROJECT_ROOT_AT=$HOME/projects/template-project-name/dev 6 | export PROJECT_NAME=template-project-name 7 | export PACKAGE_NAME=template_package_name 8 | export SLURM_ONE_ENTRYPOINT_SCRIPT_PER_NODE=1 9 | 10 | # Enroot + Pyxis 11 | 12 | # Limitation: pyxis doesn't send environment variables to the entrypoint so it has to be run manually 13 | # This is fixed in v0.20.0 14 | 15 | srun \ 16 | -J template-minimal \ 17 | --pty \ 18 | --container-image=$CONTAINER_IMAGES/$(id -gn)+$(id -un)+template-project-name+amd64-cuda-root-latest.sqsh \ 19 | --environment="${PROJECT_ROOT_AT}/installation/docker-amd64-cuda/CSCS-Clariden-setup/submit-scripts/edf.toml" \ 20 | --container-mounts=$PROJECT_ROOT_AT,$SCRATCH \ 21 | --container-workdir=$PROJECT_ROOT_AT \ 22 | --no-container-mount-home \ 23 | --no-container-remap-root \ 24 | --no-container-entrypoint \ 25 | --container-writable \ 26 | /opt/template-entrypoints/pre-entrypoint.sh \ 27 | bash 28 | 29 | # additional options for pyxis 30 | # --container-env to override environment variables defined in the container 31 | 32 | exit 0 33 | -------------------------------------------------------------------------------- 
/installation/docker-amd64-cuda/CSCS-Clariden-setup/example-submit-scripts/remote-development.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -J template-remote-development 4 | #SBATCH -t 12:00:00 5 | 6 | # Variables used by the entrypoint script 7 | # Change this to the path of your project (can be the /dev or /run copy) 8 | export PROJECT_ROOT_AT=$HOME/projects/template-project-name/dev 9 | export PROJECT_NAME=template-project-name 10 | export PACKAGE_NAME=template_package_name 11 | export SLURM_ONE_ENTRYPOINT_SCRIPT_PER_NODE=1 12 | export WANDB_API_KEY_FILE_AT=$HOME/.wandb-api-key 13 | # You can remove the Hugging Face variables if you don't use it, also remove them from the container mounts. 14 | export HF_TOKEN_AT=$HOME/.hf-token 15 | export HF_HOME=$SCRATCH/huggingface 16 | 17 | export SSH_SERVER=1 18 | export NO_SUDO_NEEDED=1 19 | # For the first time, mkdir -p $HOME/jetbrains-server, and comment out PYCHARM_IDE_AT 20 | export JETBRAINS_SERVER_AT=$HOME/jetbrains-server 21 | #export PYCHARM_IDE_AT=744eea3d4045b_pycharm-professional-2024.1.6-aarch64 22 | # or 23 | # export VSCODE_SERVER_AT=$HOME/vscode-server 24 | # We use a different path than the default .vscode-server to separate the container installation from the local installation 25 | # and replace JETBRAINS_SERVER_AT in the container-mounts with VSCODE_SERVER_AT 26 | 27 | srun \ 28 | --container-image=$CONTAINER_IMAGES/$(id -gn)+$(id -un)+template-project-name+amd64-cuda-root-latest.sqsh \ 29 | --environment="${PROJECT_ROOT_AT}/installation/docker-amd64-cuda/CSCS-Clariden-setup/submit-scripts/edf.toml" \ 30 | --container-mounts=\ 31 | $PROJECT_ROOT_AT,\ 32 | $SCRATCH,\ 33 | $WANDB_API_KEY_FILE_AT,\ 34 | $HOME/.gitconfig,\ 35 | $HF_TOKEN_AT,\ 36 | $JETBRAINS_SERVER_AT,\ 37 | $HOME/.ssh \ 38 | --container-workdir=$PROJECT_ROOT_AT \ 39 | --no-container-mount-home \ 40 | --no-container-remap-root \ 41 | --no-container-entrypoint \ 42 | 
--container-writable \ 43 | /opt/template-entrypoints/pre-entrypoint.sh \ 44 | sleep infinity 45 | 46 | # additional options 47 | # --container-env to override environment variables defined in the container 48 | 49 | # Draft. 50 | # Here can connect to the container with 51 | # Get the job id (and node id if multinode) 52 | # 53 | # Connect to the allocation 54 | # srun --overlap --pty --jobid=JOBID bash 55 | # Inside the job find the container name 56 | # enroot list -f 57 | # Exec to the container 58 | # enroot exec zsh 59 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/CSCS-Clariden-setup/example-submit-scripts/test-interactive.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Enroot + Pyxis 4 | 5 | srun \ 6 | -J template-test \ 7 | --pty \ 8 | --container-image=$CONTAINER_IMAGES/claire+smoalla+template-project-name+amd64-cuda-root-latest.sqsh \ 9 | --no-container-mount-home \ 10 | --no-container-remap-root \ 11 | --no-container-entrypoint \ 12 | --container-writable \ 13 | bash 14 | 15 | exit 0 16 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/CSCS-Clariden-setup/example-submit-scripts/unattended-distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -J template-untattended-distributed 4 | #SBATCH -t 0:30:00 5 | #SBATCH --nodes 2 6 | #SBATCH --ntasks-per-node 3 7 | 8 | # There is a current limitation in pyxis with the entrypoint and it has to run manually. 9 | # It has to run only once per node and the other tasks in the nodes have to wait for it to finish. 10 | # So you can either limit your jobs to 1 task per node or use a sleep command to wait for the entrypoint to finish. 
11 | 12 | 13 | # Variables used by the entrypoint script 14 | # Change this to the path of your project (can be the /dev or /run copy) 15 | export PROJECT_ROOT_AT=$HOME/projects/template-project-name/run 16 | export PROJECT_NAME=template-project-name 17 | export PACKAGE_NAME=template_package_name 18 | export SLURM_ONE_ENTRYPOINT_SCRIPT_PER_NODE=1 19 | export WANDB_API_KEY_FILE_AT=$HOME/.wandb-api-key 20 | 21 | srun \ 22 | --container-image=$CONTAINER_IMAGES/$(id -gn)+$(id -un)+template-project-name+amd64-cuda-root-latest.sqsh \ 23 | --environment="${PROJECT_ROOT_AT}/installation/docker-amd64-cuda/CSCS-Clariden-setup/submit-scripts/edf.toml" \ 24 | --container-mounts=\ 25 | $PROJECT_ROOT_AT,\ 26 | $SCRATCH,\ 27 | $WANDB_API_KEY_FILE_AT \ 28 | --container-workdir=$PROJECT_ROOT_AT \ 29 | --no-container-mount-home \ 30 | --no-container-remap-root \ 31 | --no-container-entrypoint \ 32 | --container-writable \ 33 | /opt/template-entrypoints/pre-entrypoint.sh \ 34 | bash -c 'sleep 60; python -m template_package_name.template_experiment some_arg=LOCALID-$SLURM_LOCALID-PROCID-$SLURM_PROCID' 35 | 36 | # Sleep to wait for the installation of the project. 
37 | 38 | # additional options 39 | # --container-env to override environment variables defined in the container 40 | 41 | exit 0 42 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/CSCS-Clariden-setup/example-submit-scripts/unattended.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -J template-unattended 4 | #SBATCH -t 0:30:00 5 | 6 | # Variables used by the entrypoint script 7 | # Change this to the path of your project (can be the /dev or /run copy) 8 | export PROJECT_ROOT_AT=$HOME/projects/template-project-name/run 9 | export PROJECT_NAME=template-project-name 10 | export PACKAGE_NAME=template_package_name 11 | export SLURM_ONE_ENTRYPOINT_SCRIPT_PER_NODE=1 12 | # For wandb, huggingface, etc. look at the remote-development.sh 13 | 14 | srun \ 15 | --container-image=$CONTAINER_IMAGES/$(id -gn)+$(id -un)+template-project-name+amd64-cuda-root-latest.sqsh \ 16 | --environment="${PROJECT_ROOT_AT}/installation/docker-amd64-cuda/CSCS-Clariden-setup/submit-scripts/edf.toml" \ 17 | --container-mounts=$PROJECT_ROOT_AT,$SCRATCH \ 18 | --container-workdir=$PROJECT_ROOT_AT \ 19 | --no-container-mount-home \ 20 | --no-container-remap-root \ 21 | --no-container-entrypoint \ 22 | --container-writable \ 23 | /opt/template-entrypoints/pre-entrypoint.sh \ 24 | python -m template_package_name.template_experiment some_arg=some_value wandb.mode=offline 25 | 26 | # additional options for pyxis 27 | # --container-env to override environment variables defined in the container 28 | 29 | exit 0 30 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax = docker/dockerfile:1 2 | 3 | # BASE_IMAGE is the image that will be extended by this Dockerfile. 
# It is assumed to be a well-configured Python installation. 5 | # The remaining packages will be installed with pip.
49 | ENV PIP_CACHE_DIR=/root/.cache/pip 50 | RUN pip freeze > ${DEPENDENCIES_DIR}/requirements-freeze-before-pip-install.txt 51 | RUN pip list --format freeze > ${DEPENDENCIES_DIR}/requirements-list-before-pip-install.txt 52 | COPY requirements.txt ${DEPENDENCIES_DIR}/requirements.txt 53 | RUN --mount=type=cache,target=${PIP_CACHE_DIR},sharing=private \ 54 | pip install -r ${DEPENDENCIES_DIR}/requirements.txt 55 | # Podman: Comment the above and use this instead with podman as it doesn't support sharing mount modes. 56 | #RUN --mount=type=cache,target=${PIP_CACHE_DIR} \ 57 | # pip install -r ${DEPENDENCIES_DIR}/requirements.txt 58 | RUN pip freeze > ${DEPENDENCIES_DIR}/requirements-freeze-after-pip-install.txt 59 | RUN pip list --format freeze > ${DEPENDENCIES_DIR}/requirements-list-after-pip-install.txt 60 | 61 | # For reproducible requirements use the following after getting the requirements-freeze.txt file from the first build. 62 | #COPY requirements-freeze.txt ${DEPENDENCIES_DIR}/requirements-freeze.txt 63 | #RUN --mount=type=cache,target=${PIP_CACHE_DIR},sharing=private \ 64 | # pip install --no-deps -r ${DEPENDENCIES_DIR}/requirements-freeze.txt 65 | # For podman 66 | #RUN --mount=type=cache,target=${PIP_CACHE_DIR} \ 67 | # pip install --no-deps -r ${DEPENDENCIES_DIR}/requirements-freeze.txt 68 | 69 | # Optional optimizations. 70 | # Hack to enable Intel MKL optimizations on AMD CPUs. 71 | # https://danieldk.eu/Posts/2020-08-31-MKL-Zen.html 72 | # ARG FAKEINTEL_PATH=/opt/fakeintel/libfakeintel.so 73 | # ENV FAKEINTEL_PATH=${FAKEINTEL_PATH} 74 | # https://danieldk.eu/Posts/2020-08-31-MKL-Zen.html 75 | # Build. 76 | # RUN echo 'int mkl_serv_intel_cpu_true() {return 1;}' > /tmp/fakeintel.c && \ 77 | # mkdir -p /opt/fakeintel && \ 78 | # gcc -shared -fPIC -o ${FAKEINTEL_PATH} /tmp/fakeintel.c 79 | # Enable. 
80 | # ENV LD_PRELOAD=${FAKEINTEL_PATH}:${LD_PRELOAD} 81 | 82 | ######################################################################## 83 | # Here you can install other software 84 | 85 | 86 | ######################################################################## 87 | # Download Z-Shell enhancements. 88 | 89 | FROM ${GIT_IMAGE} AS get-pure 90 | 91 | ARG PURE_URL=https://github.com/sindresorhus/pure.git 92 | ARG ZSHA_URL=https://github.com/zsh-users/zsh-autosuggestions.git 93 | ARG ZSHS_URL=https://github.com/zsh-users/zsh-syntax-highlighting.git 94 | 95 | RUN git clone --depth 1 ${PURE_URL} /opt/zsh/pure 96 | RUN git clone --depth 1 ${ZSHA_URL} /opt/zsh/zsh-autosuggestions 97 | RUN git clone --depth 1 ${ZSHS_URL} /opt/zsh/zsh-syntax-highlighting 98 | 99 | ######################################################################## 100 | # This stage is the final user-agnostic (generic) stage. 101 | # This layer can be distributed so that subsequent users 102 | 103 | FROM runtime-deps AS runtime-generic 104 | 105 | ENV HYDRA_FULL_ERROR=1 106 | 107 | # A final record of the dependencies from pip freeze. 108 | RUN pip freeze > ${DEPENDENCIES_DIR}/requirements-freeze-final.txt 109 | RUN pip list --format freeze > ${DEPENDENCIES_DIR}/requirements-list-final.txt 110 | 111 | # Shell configuration. 
112 | ENV ZSH_ENHANCE_DIR=/etc/zsh/enhance 113 | ARG PURE_PATH=${ZSH_ENHANCE_DIR}/pure 114 | ARG ZSHA_PATH=${ZSH_ENHANCE_DIR}/zsh-autosuggestions 115 | ARG ZSHS_PATH=${ZSH_ENHANCE_DIR}/zsh-syntax-highlighting 116 | COPY --from=get-pure /opt/zsh/pure ${PURE_PATH} 117 | COPY --from=get-pure /opt/zsh/zsh-autosuggestions ${ZSHA_PATH} 118 | COPY --from=get-pure /opt/zsh/zsh-syntax-highlighting ${ZSHS_PATH} 119 | RUN { echo "fpath+=${PURE_PATH}"; \ 120 | echo "autoload -Uz promptinit; promptinit"; \ 121 | echo "prompt pure"; \ 122 | echo "source ${ZSHA_PATH}/zsh-autosuggestions.zsh"; \ 123 | echo "source ${ZSHS_PATH}/zsh-syntax-highlighting.zsh"; \ 124 | echo "alias ls='ls --color=auto'"; \ 125 | echo "alias ll='ls -lh'"; \ 126 | echo "alias update-env-file='source \${PROJECT_ROOT_AT}/installation/docker-amd64-cuda/update-env-file.sh'"; \ 127 | } >> /etc/zsh/zshrc 128 | 129 | 130 | # Entrypoints. 131 | # Don't overwrite the entrypoint, it is installing the project 132 | # and testing that you correctly mounted the project code. 133 | # It also performs some other important setup depending on the deployment platform. 134 | ARG BASE_ENTRYPOINT 135 | ARG BASE_ENTRYPOINT_EXECS 136 | ENV BASE_ENTRYPOINT=${BASE_ENTRYPOINT} 137 | ENV BASE_ENTRYPOINT_EXECS=${BASE_ENTRYPOINT_EXECS} 138 | ENV ENTRYPOINTS_ROOT=/opt/template-entrypoints 139 | COPY entrypoints ${ENTRYPOINTS_ROOT} 140 | ENTRYPOINT ["/opt/template-entrypoints/pre-entrypoint.sh"] 141 | CMD ["/bin/zsh"] 142 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/Dockerfile-user: -------------------------------------------------------------------------------- 1 | ######################################################################## 2 | # 3. Stages for setting up the user and the development environment. 
3 | ######################################################################## 4 | 5 | ARG GENERIC_IMAGE 6 | ARG IMAGE_PLATFORM 7 | 8 | ######################################################################## 9 | # Final runtime layer for the user. 10 | # Explicitly create a user for Docker Engine interfaces 11 | # which do no support selecting the user at runtime; this is the case for Run:ai. 12 | 13 | FROM ${GENERIC_IMAGE}:${IMAGE_PLATFORM}-root-latest AS runtime-user 14 | 15 | ARG GRPID 16 | ARG USRID 17 | ARG GRP 18 | ARG USR 19 | ARG PASSWD 20 | ENV PASSWD=${PASSWD} 21 | 22 | # Add user to sudoer to be able to install apt packages. 23 | RUN groupadd -f -g ${GRPID} ${GRP} && \ 24 | useradd --shell /bin/zsh --create-home -u ${USRID} -g ${GRP} -p $(openssl passwd -1 ${PASSWD}) ${USR} && \ 25 | usermod -aG sudo ${USR} 26 | 27 | USER ${USR} 28 | RUN touch /home/${USR}/.zshrc 29 | 30 | ######################################################################## 31 | # Final development layer for the user. 32 | 33 | FROM ${GENERIC_IMAGE}:${IMAGE_PLATFORM}-root-latest AS development-user 34 | 35 | ARG GRPID 36 | ARG USRID 37 | ARG GRP 38 | ARG USR 39 | ARG PASSWD 40 | ENV PASSWD=${PASSWD} 41 | 42 | # Add user to sudoer to be able to install apt packages. 43 | RUN groupadd -f -g ${GRPID} ${GRP} && \ 44 | useradd --shell /bin/zsh --create-home -u ${USRID} -g ${GRP} -p $(openssl passwd -1 ${PASSWD}) ${USR} && \ 45 | usermod -aG sudo ${USR} 46 | 47 | USER ${USR} 48 | RUN touch /home/${USR}/.zshrc 49 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/EPFL-SCITAS-setup/.gitignore: -------------------------------------------------------------------------------- 1 | # If you want to put your own scripts contraining API keys (e.g. W&B). 
2 | submit-scripts/ 3 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/EPFL-SCITAS-setup/README.md: -------------------------------------------------------------------------------- 1 | # Guide for using the template with the EPFL SCITAS clusters (Kuma, Izar) 2 | 3 | ## Overview 4 | 5 | At this point, you should have the runtime image that can be deployed on multiple platforms. 6 | This guide will show you how to deploy your image on the EPFL SCITAS clusters supporting containers (Kuma, Izar) 7 | and use it for 8 | 9 | 1. Remote development. 10 | 2. Running unattended jobs. 11 | 12 | ## Prerequisites 13 | 14 | **SCITAS and Slurm**: 15 | 16 | 1. You should have access to the SCITAS clusters using containers (Kuma, Izar). 17 | 2. You should have some knowledge of Slurm. 18 | 19 | CLAIRE lab members can refer to our internal documentation on using the SCITAS clusters 20 | [here](https://prickly-lip-484.notion.site/Compute-and-Storage-CLAIRE-91b4eddcc16c4a95a5ab32a83f3a8294#1402ae1961ac4b3e86a6a3ee2d8602aa). 21 | 22 | ## First steps 23 | 24 | ### Getting your image on the SCITAS clusters 25 | 26 | You only need to pull the generic image as SCITAS mounts namespaces to the containers. 27 | 28 | All the commands should be run on the SCITAS clusters. 29 | ```bash 30 | ssh izar 31 | # or 32 | ssh kuma 33 | ``` 34 | Create an enroot config file in your home directory on the cluster if you don't have one yet. 35 | It will store your credentials for the registries. 36 | ```bash 37 | export ENROOT_CONFIG_PATH=$HOME/.config/enroot/.credentials 38 | mkdir -p $(dirname $ENROOT_CONFIG_PATH) 39 | touch $ENROOT_CONFIG_PATH 40 | # Make sur the file is only readable by you 41 | chmod 600 $ENROOT_CONFIG_PATH 42 | ``` 43 | Write the following to the file. 44 | ```bash 45 | # E.g. 
vim $ENROOT_CONFIG_PATH 46 | machine ic-registry.epfl.ch login password 47 | machine registry.rcp.epfl.ch login password 48 | ``` 49 | 50 | Optionally if you want to use Apptainer 51 | ```bash 52 | apptainer registry login --username docker://registry.rcp.epfl.ch 53 | apptainer registry login --username docker://ic-registry.epfl.ch 54 | ``` 55 | 56 | Then you can pull your image with 57 | ```bash 58 | # On Izar 59 | SCRATCH=/scratch/izar/$USER 60 | # On Kuma 61 | SCRATCH=/scratch/$USER 62 | # Make a directory where you store your images 63 | # Add it to your bashrc as it'll be used often 64 | CONTAINER_IMAGES=$SCRATCH/container-images 65 | mkdir -p $CONTAINER_IMAGES 66 | 67 | # Pull the generic image (with tagged with root) 68 | # E.g., 69 | cd $CONTAINER_IMAGES 70 | # Don't do this on a login node. 71 | # Replace with your image name 72 | 73 | srun --ntasks=1 --cpus-per-task=32 --partition h100 --time=0:30:00 \ 74 | enroot import docker://registry.rcp.epfl.ch#claire/moalla/template-project-name:amd64-cuda-root-latest 75 | # This will create a squashfs file that you'll use to start your jobs. 76 | ``` 77 | 78 | Optionally if you want to use Apptainer 79 | ```bash 80 | # Takes ages to convert to sif. 81 | # Don't do this on a login node. 82 | # In a tmux shell ideally. 83 | srun --ntasks=1 --cpus-per-task=32 --partition h100 --time=1:00:00 \ 84 | apptainer pull docker://registry.rcp.epfl.ch/claire/moalla/template-project-name:amd64-cuda-root-latest 85 | ``` 86 | 87 | ### Clone your repository in your home directory 88 | 89 | We strongly suggest having two instances of your project repository. 90 | 91 | 1. One for development, which may have uncommitted changes, be in a broken state, etc. 92 | 2. One for running unattended jobs, which is always referring to a commit at a working state of the code. 93 | 94 | The outputs and data directories of those two instances will be symlinked to the scratch storage 95 | and will be shared anyway. 
96 | This guide includes the steps to do it, and there are general details in `data/README.md` and `outputs/README.md`. 97 | 98 | ```bash 99 | # SSH to a cluster. 100 | ssh kuma 101 | mkdir -p $HOME/projects/template-project-name 102 | cd $HOME/projects/template-project-name 103 | git clone dev 104 | git clone run 105 | ``` 106 | 107 | The rest of the instructions should be performed on the cluster from the dev instance of the project. 108 | ```bash 109 | cd dev 110 | # It may also be useful to open a remote code editor on a login node to view the project. (The remote development will happen in another IDE in the container.) 111 | # Push what you did on your local machine so far (change project name etc) and pull it on the cluster. 112 | git pull 113 | cd installation/docker-amd64-cuda 114 | ``` 115 | 116 | ### Note about the examples 117 | 118 | The example files were made with username `moalla` and lab-name `claire`. 119 | Adapt them accordingly to your username and lab name. 120 | Run 121 | ```bash 122 | # From the cluster this time. 123 | ./template.sh env 124 | # Edit the .env file with your lab name (you can ignore the rest). 125 | ./template.sh get_scitas_scripts 126 | ``` 127 | to get a copy of the examples in this guide with your username, lab name, etc. 128 | They will be in `./EPFL-SCITAS-setup/submit-scripts`. 129 | 130 | ### A quick test to understand how the template works 131 | 132 | Adapt the `submit-scripts/minimal.sh` with the name of your image and your cluster storage setup. 133 | 134 | The submission script gives two examples of how to run containers on SCITAS. 135 | Either with [`enroot`](https://github.com/NVIDIA/enroo) 136 | and the [`pyxis`](https://github.com/NVIDIA/pyxis) plugin directly integrated in `srun`, 137 | or with `apptainer` inside tasks as a separate command. 138 | We recommend using Pyxis+enroot as it allows more remote development tools to be used. 139 | 140 | Run the script to see how the template works. 
141 | ```bash 142 | cd installation/docker-amd64-cuda/EPFL-SCITAS-setup/submit-scripts 143 | bash minimal.sh 144 | ``` 145 | 146 | When the container starts, its entrypoint does the following: 147 | 148 | - It runs the entrypoint of the base image if you specified it in the `compose-base.yaml` file. 149 | - It expects you specify `PROJECT_ROOT_AT=`. 150 | and `PROJECT_ROOT_AT` to be the working directory of the container. 151 | Otherwise, it will issue a warning and set it to the default working directory of the container. 152 | - It then tries to install the project in editable mode. 153 | This is a lightweight installation that allows to avoid all the hacky import path manipulations. 154 | (This will be skipped if `PROJECT_ROOT_AT` has not been specified or if you specify `SKIP_INSTALL_PROJECT=1`.) 155 | - It also handles all the remote development setups (VS Code, Cursor, PyCharm, Jupyter, ...) 156 | that you specify with environment variables. 157 | These are described in the later sections of this README. 158 | - Finally, it executes a provided command (e.g. `bash` here for an interactive job with a connected --pty). 159 | 160 | You need to make sure that this minimal submission works before proceeding. 161 | The logs of the entrypoint are only shown in case there was an error (design from pyxis). 162 | (A current workaround runs the entrypoint as a script at the start instead of as an entrypoint) 163 | 164 | If the entrypoint fails the installation of your project, you can resubmit your job with `export SKIP_INSTALL_PROJECT=1` 165 | which will skip the installation step then you can replay the installation manually in the container to debug it. 166 | 167 | ## Use cases 168 | 169 | The basic configuration for the project's environment is now set up. 170 | You can follow the remaining sections below to see how to run unattended jobs and set up remote development. 
171 | After that, return to the root README for the rest of the instructions to run our experiments. 172 | 173 | 174 | ### Running unattended jobs 175 | 176 | By performing the above first steps, you should have all the required setup to run unattended jobs. 177 | The main difference is that the unattended job is run with `sbatch`. 178 | An example of an unattended job can be found in `submit-scripts/unattended.sh` to run with `sbatch`. 179 | Note the emphasis on having a frozen copy `run` of the repository for running unattended jobs. 180 | 181 | ### Weights&Biases 182 | 183 | Your W&B API key should be exposed as the `WANDB_API_KEY` environment variable. 184 | You can export it or if you're sharing the script with others export a location to a file containing it with 185 | `export WANDB_API_KEY_FILE_AT` and let the template handle it. 186 | 187 | E.g., 188 | 189 | ```bash 190 | echo > $HOME/.wandb-api-key 191 | chmod 600 $HOME/.wandb-api-key 192 | ``` 193 | 194 | Then `export WANDB_API_KEY_FILE_AT=$HOME/.wandb-api-key` in the submit script. 195 | 196 | ### Remote development 197 | 198 | This would be the typical use case for a researcher at CLAIRE using the cluster as their daily driver to do 199 | development, testing, and debugging. 200 | Your job would be running a remote IDE/code editor on the cluster, and you would only have a lightweight local client 201 | running on your laptop. 202 | 203 | The entrypoint will start an ssh server and a remote development server for your preferred IDE/code editor 204 | when you set some environment variables. 205 | An example of an interactive job submission can be found in `submit-scripts/remote-development.sh` 206 | to run with `sbatch`. 207 | 208 | Below, we list and describe in more detail the tools and IDEs supported for remote development. 209 | 210 | ### SSH Configuration (Necessary for PyCharm, VS Code, and Cursor) 211 | 212 | Your job will open an ssh server when you set the environment variable `SSH_SERVER=1`. 
213 | You also have to mount the authorized keys file from your home directory to the container (done in the example). 214 | The SSH connection is necessary for some remote IDEs like PyCharm to work and can be beneficial 215 | for other things like ssh key forwarding. 216 | The ssh server is configured to run on port 2223 of the container. 217 | 218 | With the ssh connection, you can forward the ssh keys on your local machine (that you use for GitHub, etc.) 219 | on the remote server. 220 | This allows using the ssh keys on the remote server without having to copy them there. 221 | 222 | For that, you need three things: an ssh agent running on your local machine, the key added to the agent, 223 | and a configuration file saying that the agent should be used with the ssh connection to SCITAS. 224 | GitHub provides a guide for that 225 | [here (look at the troubleshooting section too)](https://docs.github.com/en/authentication/connecting-to-github-with-ssh/using-ssh-agent-forwarding). 226 | 227 | Use the following configuration in your local `~/.ssh/config` 228 | 229 | ```bash 230 | Host kuma 231 | HostName kuma.hpc.epfl.ch 232 | User moalla 233 | ForwardAgent yes 234 | 235 | # EDIT THIS HOSTNAME WITH EVERY NEW JOB 236 | Host kuma-job 237 | HostName kh021 238 | User moalla 239 | ProxyJump kuma 240 | StrictHostKeyChecking no 241 | UserKnownHostsFile=/dev/null 242 | ForwardAgent yes 243 | 244 | Host kuma-container 245 | HostName localhost 246 | ProxyJump kuma-job 247 | Port 2223 248 | User moalla 249 | StrictHostKeyChecking no 250 | UserKnownHostsFile=/dev/null 251 | ForwardAgent yes 252 | ``` 253 | To update the hostname of the `clariden-job` you can add this to your `~/.zshrc` on macOS for example: 254 | 255 | ```bash 256 | # Tested on macos with zsh 257 | function update-ssh-config() { 258 | local config_file="$HOME/.ssh/config" # Adjust this path if needed 259 | local host="$1" 260 | local new_hostname="$2" 261 | 262 | if [[ -z "$host" || -z "$new_hostname" ]]; 
then 263 | echo "Usage: update-ssh-config " 264 | return 1 265 | fi 266 | 267 | sed -i '' '/Host '"$host"'/,/Host / s/^[[:space:]]*HostName.*/ HostName '"$new_hostname"'/' "$config_file" 268 | echo "Updated HostName for '${host}' to '${new_hostname}' in ~/.ssh/config" 269 | } 270 | ``` 271 | 272 | The `StrictHostKeyChecking no` and `UserKnownHostsFile=/dev/null` allow bypass checking the identity 273 | of the host [(ref)](https://linuxcommando.blogspot.com/2008/10/how-to-disable-ssh-host-key-checking.html) 274 | which keeps changing every time a job is scheduled, 275 | so that you don't have to reset it each time. 276 | 277 | With this config you can then connect to your container with `ssh clariden-container`. 278 | 279 | **Limitations** 280 | 281 | Note that an ssh connection to the container is not like executing a shell on the container. 282 | In particular, the following limitations apply: 283 | 284 | - environment variables in the image sent to the entrypoint of the container and any command exec'ed in it 285 | are not available in ssh connections. 286 | There is a workaround for that in `entrypoints/remote-development-setup.sh` when opening an ssh server 287 | which should work for most cases, but you may still want to adapt it to your needs. 288 | 289 | ### Git config 290 | 291 | You can persist your Git config (username, email, etc.) by mounting it in the container. 292 | This is done in the examples. 293 | 294 | E.g., create your config in your home directory with 295 | 296 | ```bash 297 | cat >$HOME/.gitconfig < Tools -> Terminal. 367 | * When running Run/Debug configurations, set your working directory the project root (`$PROJECT_ROOT_AT`), not the script's directory. 368 | * Your interpreter will be 369 | * the system Python `/usr/bin/python` with the `from-python` option. 370 | * the Python in your conda environment with the `from-scratch` option, with the conda binary found at `/opt/conda/condabin/conda`. 
371 | 372 | **Limitations:** 373 | 374 | - The terminal in PyCharm opens ssh connections to the container, 375 | so the workaround (and its limitations) in the ssh section apply. 376 | If needed, you could just open a separate terminal on your local machine 377 | and directly exec a shell into the container. 378 | - It's not clear which environment variables are passed to the programs run from the IDE like the debugger. 379 | So far, it seems like the SSH env variables workaround works fine for this. 380 | - Support for programs with graphical interfaces (i.g. forwarding their interface) has not been tested yet. 381 | 382 | ### VSCode / Cursor 383 | 384 | We support the [Remote Development using SSH ](https://code.visualstudio.com/docs/remote/ssh) 385 | feature of VS code that runs a remote IDE in the container via SSH. To set this up for Cursor, simply replace `VSCODE` by `CURSOR` and `vscode` by `cursor` in all instructions below. For example, `VSCODE_SERVER_AT` becomes `CURSOR_SERVER_AT`, and `~/.vscode-server` becomes `~/.cursor-server`. 386 | 387 | **Preliminaries: saving the IDE configuration** 388 | 389 | The remote IDE stores its configuration (e.g., the extensions you set up) in `~/.vscode-server`. 390 | To have it preserved between different dev containers, you should specify the 391 | `VSCODE_SERVER_AT` env variable with your submit command 392 | as shown in the examples in `submit-scripts/remote-development.sh`. 393 | The template will use it to store the IDE configuration and cache in a separate directory 394 | per project (defined by its $PROJECT_ROOT_AT). 395 | All the directories will be created automatically. 396 | 397 | **ssh configuration** 398 | 399 | VS Code takes ssh configuration from files. 400 | Follow the steps in the [SSH configuration section](#ssh-configuration-necessary-for-pycharm-and-vs-code) 401 | to set up your ssh config file. 402 | 403 | **Connecting VS Code to the container**: 404 | 405 | 1. `mkdir $HOME/vscode-server` 406 | 2. 
In your submit command, set the environment variables for 407 | - Opening an ssh server `SSH_SERVER=1`. 408 | - preserving your config `VSCODE_SERVER_AT`. 409 | And add `VSCODE_SERVER_AT` in the `--container-mounts`. 410 | 3. Have the [Remote - SSH](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-ssh) 411 | extension on your local VS Code. 412 | 4. Connect to the ssh host following the 413 | steps [here](https://code.visualstudio.com/docs/remote/ssh#_connect-to-a-remote-host). 414 | 415 | The directory to add to your VS Code workspace should be the same as the one specified in the `PROJECT_ROOT_AT`. 416 | 417 | **Limitations** 418 | 419 | - The terminal in VS Code opens ssh connections to the container, 420 | so the workaround (and its limitations) in the ssh section apply. 421 | If needed, you could just open a separate terminal on your local machine 422 | and directly exec a shell into the container. 423 | - Support for programs with graphical interfaces (i.g. forwarding their interface) has not been tested yet. 424 | 425 | ### JupyterLab (TODO) 426 | 427 | ### Examples 428 | 429 | We provide examples of how to use the template in the `submit-scripts` directory. 430 | 431 | ### Troubleshooting 432 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/EPFL-SCITAS-setup/example-submit-scripts/README.md: -------------------------------------------------------------------------------- 1 | # Tips and Best Practices for Running Jobs with Slurm + pyxis + enroot 2 | 3 | Placeholder. 4 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/EPFL-SCITAS-setup/example-submit-scripts/minimal.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # If not done already in your bashrc (depends on the cluster so better write that logic there.) 
4 | # export SCRATCH=/scratch/moalla 5 | 6 | # Variables used by the entrypoint script 7 | # Change this to the path of your project (can be the /dev or /run copy) 8 | export PROJECT_ROOT_AT=$HOME/projects/template-project-name/dev 9 | export PROJECT_NAME=template-project-name 10 | export PACKAGE_NAME=template_package_name 11 | export SLURM_ONE_ENTRYPOINT_SCRIPT_PER_NODE=1 12 | 13 | # Enroot + Pyxis 14 | 15 | # Limitation: pyxis doesn't send environment variables to the entrypoint so it has to be run manually 16 | # This is fixed in v0.20.0 17 | 18 | srun \ 19 | -J template-minimal \ 20 | -G 1 --partition h100 \ 21 | --pty \ 22 | --container-image=$CONTAINER_IMAGES/claire+moalla+template-project-name+amd64-cuda-root-latest.sqsh \ 23 | --container-mounts=/etc/slurm,$PROJECT_ROOT_AT,$SCRATCH \ 24 | --container-workdir=$PROJECT_ROOT_AT \ 25 | --no-container-mount-home \ 26 | --no-container-remap-root \ 27 | --no-container-entrypoint \ 28 | --container-writable \ 29 | /opt/template-entrypoints/pre-entrypoint.sh \ 30 | bash 31 | 32 | # additional options for pyxis 33 | # --container-env to override environment variables defined in the container 34 | 35 | exit 0 36 | 37 | # Some other possible option 38 | # Apptainer/Singularity 39 | srun \ 40 | -G 1 --partition h100 -J template-minimal \ 41 | --pty \ 42 | apptainer run \ 43 | --contain \ 44 | --bind $SCRATCH:$SCRATCH \ 45 | --cwd $PROJECT_ROOT_AT \ 46 | --no-home \ 47 | --nv \ 48 | --writable-tmpfs \ 49 | $CONTAINER_IMAGES/template-project-name_amd64-cuda-root-latest.sif \ 50 | bash 51 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/EPFL-SCITAS-setup/example-submit-scripts/remote-development.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -J template-remote-development 4 | #SBATCH -t 12:00:00 5 | #SBATCH --partition h100 6 | #SBATCH --gpus 4 7 | #SBATCH --cpus-per-task 60 8 | 9 | # Only 
for Kuma temporarily 10 | 11 | # If not done already in your bashrc (depends on the cluster so better write that logic there.) 12 | # export SCRATCH=/scratch/moalla 13 | 14 | # Variables used by the entrypoint script 15 | # Change this to the path of your project (can be the /dev or /run copy) 16 | export PROJECT_ROOT_AT=$HOME/projects/template-project-name/dev 17 | export PROJECT_NAME=template-project-name 18 | export PACKAGE_NAME=template_package_name 19 | export SLURM_ONE_ENTRYPOINT_SCRIPT_PER_NODE=1 20 | export WANDB_API_KEY_FILE_AT=$HOME/.wandb-api-key 21 | export SSH_SERVER=1 22 | export NO_SUDO_NEEDED=1 23 | export JETBRAINS_SERVER_AT=$HOME/jetbrains-server 24 | #export PYCHARM_IDE_AT=e632f2156c14a_pycharm-professional-2024.1.4 25 | # or 26 | # export VSCODE_SERVER_AT=$SCRATCH/vscode-server 27 | 28 | srun \ 29 | --container-image=$CONTAINER_IMAGES/claire+moalla+template-project-name+amd64-cuda-root-latest.sqsh \ 30 | --container-mounts=\ 31 | /etc/slurm,\ 32 | $PROJECT_ROOT_AT,\ 33 | $SCRATCH,\ 34 | $WANDB_API_KEY_FILE_AT,\ 35 | $JETBRAINS_SERVER_AT,\ 36 | $HOME/.gitconfig,\ 37 | $HOME/.ssh \ 38 | --container-workdir=$PROJECT_ROOT_AT \ 39 | --no-container-mount-home \ 40 | --no-container-remap-root \ 41 | --no-container-entrypoint \ 42 | --container-writable \ 43 | -G 4 -c 60 \ 44 | /opt/template-entrypoints/pre-entrypoint.sh \ 45 | sleep infinity 46 | 47 | # additional options 48 | # --container-env to override environment variables defined in the container 49 | 50 | # Draft. 
51 | # Here can connect to the container with 52 | # Get the job id (and node id if multinode) 53 | # 54 | # Connect to the allocation 55 | # srun --overlap --pty --jobid=JOBID bash 56 | # Inside the job find the container name 57 | # enroot list -f 58 | # Exec to the container 59 | # enroot exec zsh 60 | 61 | # additional options 62 | # --container-env to override environment variables defined in the container 63 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/EPFL-SCITAS-setup/example-submit-scripts/unattended-distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -J template-untattended-distributed 4 | #SBATCH -t 0:30:00 5 | #SBATCH --partition h100 6 | #SBATCH --nodes 2 7 | #SBATCH --ntasks-per-node 3 8 | 9 | # There is a current limitation in pyxis with the entrypoint and it has to run manually. 10 | # It has to run only once per node and the other tasks in the nodes have to wait for it to finish. 11 | # So you can either limit your jobs to 1 task per node or use a sleep command to wait for the entrypoint to finish. 12 | 13 | # Only for Kuma temporarily 14 | 15 | # If not done already in your bashrc (depends on the cluster so better write that logic there.) 
16 | # export SCRATCH=/scratch/moalla 17 | 18 | # Variables used by the entrypoint script 19 | # Change this to the path of your project (can be the /dev or /run copy) 20 | export PROJECT_ROOT_AT=$HOME/projects/template-project-name/run 21 | export PROJECT_NAME=template-project-name 22 | export PACKAGE_NAME=template_package_name 23 | export SLURM_ONE_ENTRYPOINT_SCRIPT_PER_NODE=1 24 | export WANDB_API_KEY_FILE_AT=$HOME/.wandb-api-key 25 | 26 | srun \ 27 | --container-image=$CONTAINER_IMAGES/claire+moalla+template-project-name+amd64-cuda-root-latest.sqsh \ 28 | --container-mounts=\ 29 | /etc/slurm,\ 30 | $PROJECT_ROOT_AT,\ 31 | $SCRATCH,\ 32 | $WANDB_API_KEY_FILE_AT \ 33 | --container-workdir=$PROJECT_ROOT_AT \ 34 | --no-container-mount-home \ 35 | --no-container-remap-root \ 36 | --no-container-entrypoint \ 37 | --container-writable \ 38 | /opt/template-entrypoints/pre-entrypoint.sh \ 39 | bash -c 'sleep 60; python -m template_package_name.template_experiment some_arg=LOCALID-$SLURM_LOCALID-PROCID-$SLURM_PROCID' 40 | 41 | # Sleep to wait for the installation of the project. 42 | 43 | # additional options 44 | # --container-env to override environment variables defined in the container 45 | 46 | exit 0 47 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/EPFL-SCITAS-setup/example-submit-scripts/unattended.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -J template-unattended 4 | #SBATCH -t 0:30:00 5 | #SBATCH --partition h100 6 | #SBATCH --gpus 1 7 | 8 | # Only for Kuma temporarily 9 | 10 | # If not done already in your bashrc (depends on the cluster so better write that logic there.) 
11 | # export SCRATCH=/scratch/moalla 12 | 13 | # Variables used by the entrypoint script 14 | # Change this to the path of your project (can be the /dev or /run copy) 15 | export PROJECT_ROOT_AT=$HOME/projects/template-project-name/run 16 | export PROJECT_NAME=template-project-name 17 | export PACKAGE_NAME=template_package_name 18 | export SLURM_ONE_ENTRYPOINT_SCRIPT_PER_NODE=1 19 | 20 | srun \ 21 | --container-image=$CONTAINER_IMAGES/claire+moalla+template-project-name+amd64-cuda-root-latest.sqsh \ 22 | --container-mounts=/etc/slurm,$PROJECT_ROOT_AT,$SCRATCH \ 23 | --container-workdir=$PROJECT_ROOT_AT \ 24 | --no-container-mount-home \ 25 | --no-container-remap-root \ 26 | --no-container-entrypoint \ 27 | --container-writable \ 28 | -G 1 \ 29 | /opt/template-entrypoints/pre-entrypoint.sh \ 30 | python -m template_package_name.template_experiment some_arg=some_value wandb.mode=offline 31 | 32 | # additional options for pyxis 33 | # --container-env to override environment variables defined in the container 34 | 35 | exit 0 36 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/EPFL-runai-setup/.gitignore: -------------------------------------------------------------------------------- 1 | # If you want to put your own scripts contraining API keys (e.g. W&B). 2 | submit-scripts/ 3 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/EPFL-runai-setup/README.md: -------------------------------------------------------------------------------- 1 | # Guide for using the template with the EPFL IC and RCP Run:ai clusters 2 | 3 | ## Overview 4 | 5 | At this point, you should have the runtime image that can be deployed on multiple platforms. 6 | This guide will show you how to deploy your image on the EPFL IC and RCP Run:ai clusters and use it for: 7 | 8 | 1. Remote development. (At CLAIRE, we use the Run:ai platform as our daily driver.) 9 | 2. 
Running unattended jobs. 10 | 11 | Using the image on HaaS machines falls into the public instructions 12 | using the local deployment option with Docker Compose service and is covered by the 13 | instructions in the `installation/docker-amd64-cuda/README.md` file. 14 | 15 | ## Prerequisites 16 | 17 | **Run:ai**: 18 | 19 | 1. You should have access to a Run:ai project and have some knowledge of the Run:ai platform, e.g., 20 | know the commands to submit jobs and check their status. 21 | 2. You should have one or more PVC(s) (Persistent Volume Claim) connecting some persistent storage 22 | to your Run:ai jobs, typically your lab's shared storage. 23 | (E.g. `runai-claire-gaspar-scratch`, you can run `kubectl get pvc` to list them). 24 | 3. You should have access to a project on the [IC](https://ic-registry.epfl.ch/) or [RCP](https://registry.rcp.epfl.ch/) 25 | image registries 26 | and should be logged in to them (`docker login `). 27 | 28 | EPIC provides an introduction to these tools [here](https://epic-guide.github.io/tools/ic-compute-storage). 29 | We also have a guide at CLAIRE which you can get inspiration from 30 | [here](https://prickly-lip-484.notion.site/Compute-and-Storage-CLAIRE-91b4eddcc16c4a95a5ab32a83f3a8294#1402ae1961ac4b3e86a6a3ee2d8602aa). 31 | 32 | ## First steps 33 | 34 | ### Note about the examples 35 | 36 | The examples in this README were made with username `moalla` and lab-name `claire`. 37 | Adapt them accordingly to your username and lab name. 38 | Run 39 | ```bash 40 | ./template.sh get_runai_scripts 41 | ``` 42 | to get a copy of the examples in this guide with your username, lab name, etc. 43 | They will be in `.EPFL-runai-setup/submit-scripts`. 44 | 45 | ### Clone your repository in your PVC / shared storage 46 | 47 | We strongly suggest having two instances of your project repository on your PVCs. 48 | 49 | 1. One for development, which may have uncommitted changes, be in a broken state, etc. 50 | 2. 
One for running unattended jobs, which is always referring to a commit at a working state of the code. 51 | 52 | You can still have the outputs and data directories of those two instances shared. 53 | This can be done by creating symlinks between them, in the same the way you can read data from another PVC, 54 | say a shared PVC that has model weights, etc. All of this is described in the 55 | `data/README.md` and `outputs/README.md` files of the template and can be done later. 56 | 57 | Follow the steps below to clone your repository in your PVCs / shared storage. 58 | 59 | Typically the storage underlying your PVC is also mounted on a permanent machine that you can access. 60 | CLAIRE members can use the `claire-build-machine` for this to access `claire-rcp-scratch`. 61 | RCP also provides a shared jump host `haas001.rcp.epfl.ch` that mounts most lab's shared storage. 62 | 63 | Setup your SSH configuration so that your keys are forwarded during your ssh connection to machine 64 | so that you can clone your repository easily. 65 | For CLAIRE members you should have the `claire-build-machine` already setup. 66 | For other labs you can copy the config example below for `haas001.rcp.epfl.ch`. 67 | 68 | ```bash 69 | # You need three things for your ssh keys to be forwarded during a connection: 70 | # an ssh agent running on your local machine, 71 | # the key added to the agent, 72 | # and a configuration file saying that the agent should be used with connection. 73 | # GitHub provides a guide for that (look at the troubleshooting section too) 74 | # https://docs.github.com/en/authentication/connecting-to-github-with-ssh/using-ssh-agent-forwarding 75 | # and for the ssh config file you can use the following: 76 | Host rcp-haas 77 | HostName haas001.rcp.epfl.ch 78 | User YOUR-GASPAR 79 | ForwardAgent yes 80 | ``` 81 | 82 | SSH to the machine and clone your repository in your PVC / shared storage. 
83 | (Remember to push the changes you made on your local machine after initializing the template, 84 | to have the latest state of your repo.) 85 | ```bash 86 | # Somewhere in your PVC, say your personal directory there. 87 | mkdir template-project-name 88 | git clone template-project-name/dev 89 | git clone template-project-name/run 90 | ``` 91 | 92 | We also recommend that you make Git ignore the executable bit as the repo is moved across filesystems. 93 | You can do so by running `git config core.filemode false` in both repositories. 94 | 95 | ```bash 96 | cd template-project-name/dev && git config core.filemode false 97 | cd ../run && git config core.filemode false 98 | ``` 99 | 100 | ### A quick test to understand how the template works 101 | 102 | Adapt the `submit-scripts/minimal.sh` with the name of your image, your PVC, 103 | and the correct path to your project in the PVC. 104 | 105 | When the container starts, its entrypoint does the following: 106 | 107 | - It runs the entrypoint of the base image if you specified it in the `compose-base.yaml` file. 108 | - It expects you specify `PROJECT_ROOT_AT=` 109 | and to set `PROJECT_ROOT_AT` as the working directory of the container 110 | and installs the project found at `PROJECT_ROOT_AT` in editable mode. 111 | This is a lightweight installation that allows to avoid all the hacky import path manipulations. 112 | (You can skip this if you have a different project structure, 113 | e.g., 114 | just copied the installation directory of the template by not specifying `PROJECT_ROOT_AT`). 115 | - It also handles all the remote development setups (VS Code, Cursor, PyCharm, Jupyter, ...) 116 | that you specify with environment variables. 117 | These are described in the later sections of this README. 118 | - Finally, it executes a provided command (e.g. `sleep infinity`), otherwise by default will run a shell and stop. 
119 | It runs this command with PID 1 so that it can receive signals from the cluster and gracefully stop when preempted. 120 | You should not have to override the entrypoint, i.e., using `--command` flag with `runai submit` 121 | unless you are debugging the entrypoint itself. 122 | 123 | You need to make sure that this minimal submission works before proceeding. 124 | You can check the logs of the container with `runai logs example-minimal` to see if everything is working as expected. 125 | You should expect to see something like: 126 | 127 | ```text 128 | $ runai logs example-minimal 129 | ... 130 | [TEMPLATE INFO] PROJECT_ROOT_AT is set to /claire-rcp-scratch/home/moalla/template-project-name/dev. 131 | [TEMPLATE INFO] Expecting workdir to be /claire-rcp-scratch/home/moalla/template-project-name/dev. 132 | [TEMPLATE INFO] Installing the project with pip. 133 | [TEMPLATE INFO] Expecting /claire-rcp-scratch/home/moalla/template-project-name/dev to be a Python project. 134 | [TEMPLATE INFO] To skip this installation use the env variable SKIP_INSTALL_PROJECT=1. 135 | Obtaining file:///claire-rcp-scratch/home/moalla/template-project-name/dev 136 | Installing build dependencies: started 137 | ... 138 | Building editable for template-project-name (pyproject.toml): started 139 | ... 140 | Successfully built template-project-name 141 | Installing collected packages: template-project-name 142 | Successfully installed template-project-name-0.0.1 143 | [TEMPLATE INFO] Testing that the package can be imported. 144 | [TEMPLATE INFO] Package imported successfully. 
145 | [TEMPLATE INFO] Executing the command sleep infinity 146 | ```` 147 | 148 | You can then open a shell in the container and check that everything is working as expected: 149 | 150 | ```bash 151 | runai exec -it example-minimal zsh 152 | ``` 153 | 154 | If the entrypoint fails the installation of your project, you can resubmit your job with `-e SKIP_INSTALL_PROJECT=1` 155 | which will skip the installation step then you can replay the installation manually in the container to debug it. 156 | 157 | ## Use cases 158 | 159 | The basic configuration for the project's environment is now set up. 160 | You can follow the remaining sections below to see how to run unattended jobs and set up remote development. 161 | After that, return to the root README for the rest of the instructions to run our experiments. 162 | 163 | 164 | ### Running unattended jobs 165 | 166 | By performing the above first steps, you should have all the required setup to run unattended jobs. 167 | An example of an unattended job can be found in `submit-scripts/unattended.sh`. 168 | Note the emphasis on having a frozen copy `run` of the repository for running unattended jobs. 169 | 170 | 171 | ### Run:ai selectors 172 | 173 | Different clusters have different names for node pools and options to enable `sudo` usage etc. 174 | Refer to the `submit-scripts` for the main options, otherwise to the clusters' respective documentation. 175 | 176 | ### Weights&Biases 177 | 178 | Your W&B API key should be exposed as the `WANDB_API_KEY` environment variable. 179 | Run:ai doesn't support Kubernetes secrets yet, and you don't want to pass it as a clear environment variable 180 | (visible in the Run:ai dashboard), 181 | so an alternative is to have it in your PVC and pass it with the 182 | `-e WANDB_API_KEY_FILE_AT` environment variable in your `runai submit` command and let the template handle it. 183 | 184 | E.g., 185 | 186 | ```bash 187 | 188 | # In my PVC. 
echo <your-wandb-api-key> > /claire-rcp-scratch/home/moalla/.wandb-api-key
233 | 234 | When your container is up, run 235 | 236 | ```bash 237 | # Here 2222 on the local machine is forwarded to 2223 on the pod. 238 | # You can change the local port number to another port number. 239 | kubectl get pods 240 | kubectl port-forward 2222:2223 241 | ``` 242 | 243 | You can then ssh to your container by ssh-ing to that port on your local machine. 244 | Connect with the user and password you specified in your `.env` file when you built the image. 245 | 246 | ```bash 247 | # ssh to local machine is forwarded to the pod. 248 | ssh -p 2222 @localhost 249 | ``` 250 | 251 | As the container will each time be on a different machine, the ssh key for the remote server has to be reset or not stored.. 252 | This is done for you in the ssh config below. If you face issues you can reset the key with: 253 | 254 | ```bash 255 | ssh-keygen -R '[localhost]:2222' 256 | ``` 257 | 258 | With the ssh connection, you can forward the ssh keys on your local machine (that you use for GitHub, etc.) 259 | on the remote server. 260 | This allows using the ssh keys on the remote server without having to copy them there. 261 | (The alternative would be to have them as Kubernetes secrets, 262 | but Run:ai doesn't support that yet with its submit command.) 263 | 264 | For that, you need three things: an ssh agent running on your local machine, the key added to the agent, 265 | and a configuration file saying that the agent should be used with the Run:ai job. 
266 | GitHub provides a guide for that 267 | [here (look at the troubleshooting section too)](https://docs.github.com/en/authentication/connecting-to-github-with-ssh/using-ssh-agent-forwarding) 268 | and for the ssh config file you can use the following: 269 | 270 | ```bash 271 | Host local2222 272 | HostName 127.0.0.1 273 | User 274 | Port 2222 275 | StrictHostKeyChecking no 276 | UserKnownHostsFile=/dev/null 277 | ForwardAgent yes 278 | # If you open multiple projects at the same time, you can forward each of them to a different port. 279 | # And have two entries in your ssh config file. 280 | ``` 281 | 282 | The `StrictHostKeyChecking no` and `UserKnownHostsFile=/dev/null` allow bypass checking the identity 283 | of the host [(ref)](https://linuxcommando.blogspot.com/2008/10/how-to-disable-ssh-host-key-checking.html) 284 | which keeps changing every time a job is scheduled, 285 | so that you don't have to reset it each time. 286 | 287 | With this config you can then simply connect to your container with `ssh local2222` when the port 2222 is forwarded. 288 | 289 | **Limitations** 290 | 291 | Note that an ssh connection to the container is not like executing a shell on the container. 292 | In particular, the following limitations apply: 293 | 294 | - environment variables in the image sent to the entrypoint of the container and any command exec'ed in it 295 | are not available in ssh connections. 296 | There is a workaround for that in `entrypoints/remote-development-setup.sh` when opening an ssh server 297 | which should work for most cases, but you may still want to adapt it to your needs. 298 | 299 | ### Git config 300 | 301 | You can persist your Git config (username, email, etc.) by having it in your PVC and passing its location 302 | with the `GIT_CONFIG_AT` environment variable. 303 | 304 | E.g., create your config in your PVC with 305 | 306 | ```bash 307 | # In my PVC. 308 | cat >/claire-rcp-scratch/home/moalla/remote-development/gitconfig <`. 
377 | The link looks like: 378 | 379 | ```bash 380 | Gateway link: jetbrains-gateway://connect#idePath=%2Fclaire-rcp-scratch%2Fhome%2Fmoalla%2Fremote-development%2Fpycharm&projectPath=%2Fclaire-rcp-scratch%2Fhome%2Fmoalla%2Ftemplate-project-name%2Fdev&host=127.0.0.1&port=2223&user=moalla&type=ssh&deploy=false&newUi=true 381 | ``` 382 | Use it in Gateway to connect to the IDE. 383 | 384 | **Configuration**: 385 | 386 | * PyCharm's default terminal is bash. Change it to zsh in the Settings -> Tools -> Terminal. 387 | * When running Run/Debug configurations, set your working directory the project root (`$PROJECT_ROOT_AT`), not the script's directory. 388 | * Your interpreter will be 389 | * the system Python `/usr/bin/python` with the `from-python` option. 390 | * the Python in your conda environment with the `from-scratch` option, with the conda binary found at `/opt/conda/condabin/conda`. 391 | 392 | **Limitations:** 393 | 394 | - The terminal in PyCharm opens ssh connections to the container, 395 | so the workaround (and its limitations) in the ssh section apply. 396 | If needed, you could just open a separate terminal on your local machine 397 | and directly exec a shell into the container. 398 | - It's not clear which environment variables are passed to the programs run from the IDE like the debugger. 399 | So far, it seems like the SSH env variables workaround works fine for this. 400 | - Support for programs with graphical interfaces (i.g. forwarding their interface) has not been tested yet. 401 | 402 | ### VSCode / Cursor 403 | 404 | We support the [Remote Development using SSH ](https://code.visualstudio.com/docs/remote/ssh) 405 | feature of VS code that runs a remote IDE in the container via SSH. To set this up for Cursor, simply replace `VSCODE` by `CURSOR` and `vscode` by `cursor` in all instructions below. For example, `VSCODE_SERVER_AT` becomes `CURSOR_SERVER_AT`, and `~/.vscode-server` becomes `~/.cursor-server`. 
406 | 407 | 408 | **Preliminaries: saving the IDE configuration** 409 | 410 | The remote IDE stores its configuration (e.g., the extensions you set up) in `~/.vscode-server`. 411 | To have it preserved between different dev containers, you should specify the 412 | `VSCODE_SERVER_AT` env variable with your submit command 413 | as shown in the examples in `submit-scripts/remote-development.sh`. 414 | The template will use it to store the IDE configuration and cache in a separate directory 415 | per project (defined by its $PROJECT_ROOT_AT). 416 | All the directories will be created automatically. 417 | 418 | **ssh configuration** 419 | 420 | VS Code takes ssh configuration from files. 421 | Follow the steps in the [SSH configuration section](#ssh-configuration-necessary-for-pycharm-and-vs-code) 422 | to set up your ssh config file for runai jobs. 423 | 424 | **Connecting VS Code to the container**: 425 | 426 | 1. In your `runai submit` command, set the environment variables for 427 | - Opening an ssh server `SSH_SERVER=1`. 428 | - preserving your config `VSCODE_SERVER_AT`. 429 | 2. Enable port forwarding for the SSH connection. 430 | 3. Have the [Remote - SSH](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-ssh) 431 | extension on your local VS Code. 432 | 4. Connect to the ssh host following the 433 | steps [here](https://code.visualstudio.com/docs/remote/ssh#_connect-to-a-remote-host). 434 | 435 | The directory to add to your VS Code workspace should be the same as the one specified in the `PROJECT_ROOT_AT`. 436 | 437 | **Limitations** 438 | 439 | - The terminal in VS Code opens ssh connections to the container, 440 | so the workaround (and its limitations) in the ssh section apply. 441 | If needed, you could just open a separate terminal on your local machine 442 | and directly exec a shell into the container. 443 | - Support for programs with graphical interfaces (i.g. forwarding their interface) has not been tested yet. 
444 | 445 | ### JupyterLab 446 | 447 | If you have `jupyterlab` in your dependencies, then the template can open a Jupyter Lab server for you when 448 | the container starts. 449 | 450 | To do so, you need to: 451 | 452 | 1. Set the `JUPYTER_SERVER=1` environment variable in your `runai submit` command. 453 | You can find an example in `submit-scripts/remote-development.sh`. 454 | 455 | A Jupyter server will start running with your container. It will print a link to the container logs. 456 | 457 | Get the logs with `runai logs `. 458 | The link looks like: 459 | 460 | ```bash 461 | [C 2023-04-26 17:17:03.072 ServerApp] 462 | 463 | To access the server, open this file in a browser: 464 | ... 465 | Or copy and paste this URL: 466 | http://hostname:8887/?token=1098cadee3ac0c48e0b0a3bf012f8f06bb0d56a6cde7d128 467 | ``` 468 | 469 | 2. Forward the port `8887` on your local machine to the port `8887` on the container. 470 | ```bash 471 | kubectl port-forward 8887:8887 472 | ``` 473 | 474 | 3. Open the link in your browser, replacing `hostname` with `localhost`. 475 | 476 | **Note:** 477 | 478 | Development on Jupyter notebooks can be very useful, e.g., for quick iterations, plotting, etc., however, 479 | it can very easily facilitate bad practices, such as debugging with print statements, prevalence of global variables, 480 | relying on long-living kernel state, and hinder the reproducibility work. 481 | We strongly recommend using an IDE with a proper debugger for development, which would fill the need for quick 482 | iterations, and only use Jupyter notebooks for plotting results 483 | (where data is properly loaded from the output of a training script). 484 | 485 | **Limitations:** 486 | 487 | - We have limited usage of Jupyter so limitations are not known yet. 488 | 489 | ### Examples 490 | 491 | We provide examples of how to use the template in the `submit-scripts` directory. 
492 | We use `submit` commands and not YAML files to specify job configurations because the Run:ai API for kubernetes 493 | resources keeps changing and is not stable yet. 494 | 495 | ### Troubleshooting 496 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/EPFL-runai-setup/example-submit-scripts/minimal.sh: -------------------------------------------------------------------------------- 1 | runai submit \ 2 | --name example-minimal \ 3 | --interactive \ 4 | --image registry.rcp.epfl.ch/claire/moalla/template-project-name:amd64-cuda-moalla-latest \ 5 | --pvc runai-claire-moalla-scratch:/claire-rcp-scratch \ 6 | --working-dir /claire-rcp-scratch/home/moalla/template-project-name/dev \ 7 | -e PROJECT_ROOT_AT=/claire-rcp-scratch/home/moalla/template-project-name/dev \ 8 | -e PROJECT_NAME=template-project-name \ 9 | -e PACKAGE_NAME=template_package_name \ 10 | -g 1 --cpu 8 --cpu-limit 8 --memory 64G --memory-limit 64G \ 11 | -- sleep infinity 12 | 13 | ## Notes: 14 | # This is a minimal example of a working submission. 15 | # You can then attach a shell to this job with: runai exec -it example-minimal zsh 16 | 17 | # The important bits here are: 18 | # 1.The command to mount your pcv. 19 | # --pvc your_pvc_name:/where_to_mount_your_pvc (you can mount it anywhere) 20 | # 2.The environment variables that tell the entrypoint where to find your project. 21 | # -e PROJECT_ROOT_AT= . 22 | # 3.The working directory set to the PROJECT_ROOT_AT. 23 | # --working-dir same as PROJECT_ROOT_AT. 24 | 25 | ## Useful commands. 
26 | # runai describe job example-minimal 27 | # runai logs example-minimal 28 | # runai exec -it example-minimal zsh 29 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/EPFL-runai-setup/example-submit-scripts/remote-development.sh: -------------------------------------------------------------------------------- 1 | ## Go to the end of the file for useful commands and troubleshooting tips. 2 | 3 | # Minimal setup to just ssh into the container. 4 | # For additional options check the readme first, then use from below as examples. 5 | 6 | # For RCP use the --pvc claire-scratch:/claire-rcp-scratch 7 | # For IC use the runai-claire-moalla-scratch:/claire-rcp-scratch 8 | runai submit \ 9 | --name example-remote-development \ 10 | --interactive \ 11 | --image registry.rcp.epfl.ch/claire/moalla/template-project-name:amd64-cuda-moalla-latest \ 12 | --pvc runai-claire-moalla-scratch:/claire-rcp-scratch \ 13 | --working-dir /claire-rcp-scratch/home/moalla/template-project-name/dev \ 14 | -e PROJECT_ROOT_AT=/claire-rcp-scratch/home/moalla/template-project-name/dev \ 15 | -e PROJECT_NAME=template-project-name \ 16 | -e PACKAGE_NAME=template_package_name \ 17 | -e SSH_SERVER=1 \ 18 | --allow-privilege-escalation \ 19 | -g 1 --cpu 8 --cpu-limit 8 --memory 64G --memory-limit 64G --large-shm \ 20 | -- sleep infinity 21 | 22 | # To request more that the interactive quota add --preemptible to the submit command. 
23 | 24 | # To mount your gitconfig 25 | # -e GIT_CONFIG_AT=/claire-rcp-scratch/home/moalla/remote-development/gitconfig \ 26 | 27 | # For PyCharm 28 | # -e JETBRAINS_SERVER_AT=/claire-rcp-scratch/home/moalla/remote-development/jetbrains-server \ 29 | # -e PYCHARM_IDE_AT=e632f2156c14a_pycharm-professional-2024.1.4 \ 30 | 31 | # For VSCode 32 | # -e VSCODE_SERVER_AT=/claire-rcp-scratch/home/moalla/remote-development/vscode-server \ 33 | 34 | # For Jupyter Lab 35 | # -e JUPYTER_SERVER=1 \ 36 | 37 | # For W&B 38 | # -e WANDB_API_KEY_FILE_AT=/claire-rcp-scratch/home/moalla/.wandb-api-key \ 39 | 40 | # For HuggingFace 41 | # -e HF_TOKEN_AT=/claire-rcp-scratch/home/moalla/.hf-token \ 42 | # -e HF_HOME=/claire-rcp-scratch/home/moalla/huggingface \ 43 | 44 | 45 | ## Useful commands. 46 | # runai describe job example-remote-development 47 | # runai logs example-remote-development 48 | # kubectl port-forward example-remote-development-0-0 2222:2223 49 | # ssh runai 50 | # kubectl port-forward example-remote-development-0-0 8888:8888 51 | # runai logs example-remote-development 52 | # Get the link and paste it in your browser, replacing hostname with localhost. 53 | 54 | ## Troubleshooting. 55 | # When you add a new line for an environment variable or a GPU, etc., remember to add a \ at the end of the line. 56 | # ... \ 57 | # -e SOME_ENV_VAR=1 \ 58 | # -g 1 \ 59 | #... 60 | # -- sleep infinity 61 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/EPFL-runai-setup/example-submit-scripts/unattended.sh: -------------------------------------------------------------------------------- 1 | # Minimal setup to just ssh into the container. 2 | # For additional options check the readme first, then use from below as examples. 
3 | 4 | # For RCP use the --pvc claire-scratch:/claire-rcp-scratch 5 | # For IC use the runai-claire-moalla-scratch:/claire-rcp-scratch 6 | 7 | runai submit \ 8 | --name example-unattended \ 9 | --image registry.rcp.epfl.ch/claire/moalla/template-project-name:amd64-cuda-moalla-latest \ 10 | --pvc runai-claire-moalla-scratch:/claire-rcp-scratch \ 11 | --working-dir /claire-rcp-scratch/home/moalla/template-project-name/run \ 12 | -e PROJECT_ROOT_AT=/claire-rcp-scratch/home/moalla/template-project-name/run \ 13 | -e PROJECT_NAME=template-project-name \ 14 | -e PACKAGE_NAME=template_package_name \ 15 | -g 1 --cpu 8 --cpu-limit 8 --memory 64G --memory-limit 64G --large-shm \ 16 | -- python -m template_package_name.template_experiment some_arg=2 wandb.mode=offline 17 | 18 | # template_experiment is an actual script that you can run. 19 | # or -- zsh template_package_name/reproducibility-scripts/template-experiment.sh 20 | 21 | # For W&B 22 | # -e WANDB_API_KEY_FILE_AT=/claire-rcp-scratch/home/moalla/.wandb-api-key \ 23 | 24 | # For HuggingFace 25 | # -e HF_TOKEN_AT=/claire-rcp-scratch/home/moalla/.hf-token \ 26 | # -e HF_HOME=/claire-rcp-scratch/home/moalla/huggingface \ 27 | 28 | 29 | # To separate the dev state of the project from frozen checkouts to be used in unattended jobs you can observe that 30 | # we're pointing to the .../run instance of the repository on the PVC. 31 | # That would be a copy of the template-project-name repo frozen in a commit at a working state to be used in unattended jobs. 32 | # Otherwise while developing we would change the code that would be picked by newly scheduled jobs. 33 | 34 | # Useful commands. 
35 | # runai describe job example-unattended 36 | # runai logs example-unattended 37 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/LICENSE.cresset: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 이준형/李俊炯/Joonhyung Lee/(John Young Lee) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/apt.txt: -------------------------------------------------------------------------------- 1 | # `apt` runtime requirements file. 2 | # These dependencies are required to run your code. 3 | build-essential # Likely needed. 4 | ca-certificates # Likely needed. 5 | curl # Useful. 6 | git # Likely needed. 7 | htop # Useful. 8 | htop # Useful. 9 | netcat # Useful. 
10 | openssh-server # Required for remote development with most IDEs. 11 | openssl # Required. 12 | sudo # Required to open ssh server. 13 | tmux # Useful. 14 | tree # Useful. 15 | vim # Useful. 16 | wget # Useful. 17 | zsh # Required. 18 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/compose-base.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | build-args: 3 | build: 4 | args: 5 | # Pytorch 2.4.0a0+f70bd71a48, NVIDIA CUDA 12.5.0.23, Python 3.10. 6 | # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-06.html 7 | # https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch 8 | BASE_IMAGE: nvcr.io/nvidia/pytorch:24.07-py3 9 | GIT_IMAGE: docker.io/alpine/git:2.40.1 # https://hub.docker.com/r/alpine/git/tags 10 | # You can find the entrypoint by running `docker inspect BASE_IMAGE | grep -A 3 Entrypoint` 11 | # If there is no entrypoint, you can leave it empty. 12 | BASE_ENTRYPOINT: /opt/nvidia/nvidia_entrypoint.sh 13 | # 1 normally, 0 if the entrypoint does not exec its arguments, in rare cases. 14 | BASE_ENTRYPOINT_EXECS: 1 15 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/compose.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | image-root: 3 | extends: 4 | file: compose-base.yaml 5 | service: build-args 6 | image: ${IMAGE_NAME}:${IMAGE_PLATFORM}-root-latest 7 | build: 8 | platforms: 9 | - "linux/amd64" 10 | context: . 
11 | dockerfile: Dockerfile 12 | target: runtime-generic 13 | args: 14 | PROJECT_NAME: ${PROJECT_NAME} 15 | PACKAGE_NAME: ${PACKAGE_NAME} 16 | 17 | image-user: 18 | extends: 19 | service: image-root 20 | image: ${IMAGE_NAME}:${IMAGE_PLATFORM}-${USR}-latest 21 | build: 22 | dockerfile: Dockerfile-user 23 | target: runtime-user 24 | args: 25 | GENERIC_IMAGE: ${IMAGE_NAME} 26 | IMAGE_PLATFORM: ${IMAGE_PLATFORM} 27 | GRPID: ${GRPID} 28 | USRID: ${USRID} 29 | GRP: ${GRP} 30 | USR: ${USR} 31 | PASSWD: ${PASSWD} 32 | 33 | run-local-cpu: # Service to run the image locally with CPU only. 34 | extends: 35 | service: image-user 36 | tty: true 37 | stdin_open: true 38 | volumes: 39 | - ../..:${PROJECT_ROOT_AT} 40 | # Here you can mount other volumes and symlink directories in data and outputs to them. 41 | working_dir: ${PROJECT_ROOT_AT} 42 | environment: 43 | PROJECT_ROOT_AT: ${PROJECT_ROOT_AT} 44 | WANDB_API_KEY: ${WANDB_API_KEY} 45 | PROJECT_NAME: ${PROJECT_NAME} 46 | PACKAGE_NAME: ${PACKAGE_NAME} 47 | ipc: host # Edit as needed (NGC default recommendations, see /opt/nvidia/entrypoint.d/70-shm-check.sh). 48 | ulimits: # Edit as needed (NGC default recommendations, see /opt/nvidia/entrypoint.d/70-shm-check.sh). 49 | memlock: -1 50 | stack: 67108864 51 | network_mode: host # Edit as needed. Default to avoid extra complecity from networking. 52 | 53 | dev-local-cpu: # Service to develop locally with CPU only. 54 | extends: 55 | service: run-local-cpu 56 | volumes: 57 | # To persist IDE settings and cache. 
58 | - ${HOME}/.template-gitconfig:/home/${USR}/.gitconfig 59 | - ${HOME}/.template-dev-vscode-server:/home/${USR}/.dev-vscode-server 60 | - ${HOME}/.template-dev-cursor-server:/home/${USR}/.dev-cursor-server 61 | - ${HOME}/.template-dev-jetbrains-server:/home/${USR}/.jetbrains-server 62 | environment: 63 | PYCHARM_IDE_AT: ${PYCHARM_IDE_AT} 64 | JETBRAINS_SERVER_AT: /home/${USR}/.jetbrains-server 65 | VSCODE_SERVER_AT: /home/${USR}/.dev-vscode-server 66 | CURSOR_SERVER_AT: /home/${USR}/.dev-cursor-server 67 | 68 | run-local-cuda: # Service to run the image locally with NVIDIA GPU. 69 | extends: 70 | service: run-local-cpu 71 | deploy: 72 | resources: 73 | reservations: 74 | devices: 75 | - driver: nvidia 76 | capabilities: [ gpu ] 77 | 78 | dev-local-cuda: # Service to develop locally with NVIDIA GPU. 79 | extends: 80 | service: dev-local-cpu 81 | deploy: 82 | resources: 83 | reservations: 84 | devices: 85 | - driver: nvidia 86 | capabilities: [ gpu ] 87 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/entrypoints/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Halt in case of errors. https://gist.github.com/vncsna/64825d5609c146e80de8b1fd623011ca 3 | set -eo pipefail 4 | echo "[TEMPLATE INFO] Running entrypoint.sh" 5 | 6 | # Check that the PROJECT_ROOT_AT is set. 7 | if [ -z "${PROJECT_ROOT_AT}" ]; then 8 | echo "[TEMPLATE WARNING] PROJECT_ROOT_AT is not set." 9 | echo "[TEMPLATE WARNING] It is expected to point to the location of your mounted project if you plan to run you code." 10 | echo "[TEMPLATE WARNING] Ignore if you only need the development environment." 11 | echo "[TEMPLATE WARNING] PROJECT_ROOT_AT has been defaulted to $(pwd)" 12 | echo "[TEMPLATE WARNING] The project installation will be skipped." 
13 | export PROJECT_ROOT_AT="$(pwd)" 14 | export SKIP_INSTALL_PROJECT=1 15 | else 16 | echo "[TEMPLATE INFO] PROJECT_ROOT_AT is set to ${PROJECT_ROOT_AT}." 17 | fi 18 | echo "[TEMPLATE INFO] Expecting workdir to be ${PROJECT_ROOT_AT}." 19 | 20 | if [ "$(pwd)" != "${PROJECT_ROOT_AT}" ]; then 21 | echo "[TEMPLATE WARNING] The current/working directory $(pwd) is different from PROJECT_ROOT_AT." 22 | echo "[TEMPLATE WARNING] The template expects them to be the same, as it assumes running the experiments from PROJECT_ROOT_AT." 23 | fi 24 | 25 | # Install the package in editable mode. 26 | # Also ensures the code is mounted correctly. 27 | # Because setting the Python path to the project may not be enough. 28 | # https://pip.pypa.io/en/stable/topics/local-project-installs/#editable-installs 29 | if [ -n "${SKIP_INSTALL_PROJECT}" ]; then 30 | # For debugging or other purposes. 31 | # Best practice is to install the project. 32 | echo "[TEMPLATE INFO] Skipping the installation of the project." 33 | else 34 | echo "[TEMPLATE INFO] Installing the project with pip." 35 | echo "[TEMPLATE INFO] Expecting ${PROJECT_ROOT_AT} to be a Python project." 36 | echo "[TEMPLATE INFO] To skip this installation use the env variable SKIP_INSTALL_PROJECT=1." 37 | # The path is relative on purpose. 38 | pip install --user --no-build-isolation -e "${PROJECT_ROOT_AT}" 39 | # Test that the package can be imported. 40 | echo "[TEMPLATE INFO] Testing that the package can be imported." 41 | python -c "import ${PACKAGE_NAME}" 42 | echo "[TEMPLATE INFO] Package imported successfully." 43 | fi 44 | 45 | # Login options, e.g., wandb. 46 | # Doesn't do anything if no option provided. 47 | source "${ENTRYPOINTS_ROOT}"/logins-setup.sh 48 | 49 | # Remote development options (e.g., PyCharm or VS Code configuration, Jupyter etc). 50 | # Doesn't do anything if no option provided. 51 | # Only do them once for SLURM.
52 | if [ -n "${SLURM_ONE_REMOTE_DEV}" ] && [ "${SLURM_PROCID}" -gt 0 ]; then 53 | echo "[TEMPLATE INFO] Running the remote development entrypoint only once." 54 | echo "[TEMPLATE INFO] Skipping remote development setup on SLURM_PROCID ${SLURM_PROCID}." 55 | else 56 | source "${ENTRYPOINTS_ROOT}"/remote-development-setup.sh 57 | fi 58 | 59 | # Exec so that the child process receives the OS signals. 60 | # E.g., signals that the container will be preempted. 61 | # It will be PID 1. 62 | echo "[TEMPLATE INFO] Executing the command" "$@" 63 | exec "$@" 64 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/entrypoints/logins-setup.sh: -------------------------------------------------------------------------------- 1 | # W&B login. 2 | 3 | # This does not need an internet connection. 4 | # OPTION 1: Set WANDB_API_KEY in the environment. 5 | if [ -n "${WANDB_API_KEY}" ]; then 6 | echo "[TEMPLATE INFO] Logging in to W&B." 7 | wandb login "${WANDB_API_KEY}" 8 | fi 9 | # OPTION 2: Set WANDB_API_KEY_FILE_AT in the environment which points to a file containing the key. 10 | if [ -n "${WANDB_API_KEY_FILE_AT}" ]; then 11 | echo "[TEMPLATE INFO] Logging in to W&B." 12 | wandb login "$(cat "${WANDB_API_KEY_FILE_AT}")" 13 | fi 14 | 15 | # Hugging Face login. 16 | if [ -n "${HF_TOKEN_AT}" ]; then 17 | echo "[TEMPLATE INFO] Logging in to Hugging Face." 18 | huggingface-cli login --token "$(cat "${HF_TOKEN_AT}")" 19 | fi 20 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/entrypoints/pre-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # The base entrypoint (from the base image) should exec the command it receives otherwise this will break 4 | # the signal handling. 
5 | # (Otherwise, you should source it, assuming then run with the same shell, then exec /opt/template-entrypoints/entrypoint.sh.) 6 | # In the end all variables exported should be present and the command given by the user should run with PID 1. 7 | 8 | # In distributed jobs the number of times the entrypoint is run should match the number of containers created. 9 | # On Slurm, for example, with Pyxis a single container is created per node, 10 | # and if the entrypoint is called manually after srun, it will run multiple times in the same container (ntasks-per-node) 11 | # so we can skip it with the following variables: 12 | 13 | # If nodes share the same container: 14 | if [ -n "${SLURM_ONE_ENTRYPOINT_SCRIPT_PER_JOB}" ] && [ "${SLURM_PROCID}" -gt 0 ]; then 15 | echo "[TEMPLATE INFO] Running the entrypoint only once for the job." 16 | echo "[TEMPLATE INFO] Skipping entrypoints on SLURM_PROCID ${SLURM_PROCID}." 17 | echo "[TEMPLATE INFO] Executing the command" "$@" 18 | exec "$@" 19 | fi 20 | # If tasks on the same node share the same container: 21 | if [ -n "${SLURM_ONE_ENTRYPOINT_SCRIPT_PER_NODE}" ] && [ "${SLURM_LOCALID}" -gt 0 ]; then 22 | echo "[TEMPLATE INFO] Running the entrypoint once per node." 23 | echo "[TEMPLATE INFO] Skipping entrypoints on SLURM_PROCID ${SLURM_PROCID}." 24 | echo "[TEMPLATE INFO] Executing the command" "$@" 25 | exec "$@" 26 | fi 27 | 28 | # Continue with the entrypoint script. 29 | if [ -n "${SLURM_PROCID}" ]; then 30 | echo "[TEMPLATE INFO] Running the pre-entrypoint.sh for SLURM_PROCID ${SLURM_PROCID}, SLURM_LOCALID ${SLURM_LOCALID}, hostname $(hostname)." 31 | fi 32 | 33 | # Do this if the entrypoint execs the command it receives (every entrypoint should do this). 34 | if [ -n "${BASE_ENTRYPOINT_EXECS}" ] && [ "${BASE_ENTRYPOINT_EXECS}" -eq 1 ] && [ -n "${BASE_ENTRYPOINT}" ]; then 35 | echo "[TEMPLATE INFO] execing the base image's entrypoint ${BASE_ENTRYPOINT} which will then exec the template's entrypoint."
36 | exec "${BASE_ENTRYPOINT}" /opt/template-entrypoints/entrypoint.sh "$@" 37 | else 38 | if [ -n "${BASE_ENTRYPOINT}" ]; then 39 | echo "[TEMPLATE INFO] Sourcing the base image's entrypoint ${BASE_ENTRYPOINT} then execing the template's entrypoint." 40 | source "${BASE_ENTRYPOINT}" || { echo "Failed to source ${BASE_ENTRYPOINT}"; exit 1; } 41 | exec /opt/template-entrypoints/entrypoint.sh "$@" 42 | else 43 | echo "[TEMPLATE INFO] Execing the template's entrypoint." 44 | exec /opt/template-entrypoints/entrypoint.sh "$@" 45 | fi 46 | fi 47 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/entrypoints/remote-development-setup.sh: -------------------------------------------------------------------------------- 1 | #################### 2 | # Git config. 3 | # Workaround using symlinks when clusters do not allow to mount specific directories or files. 4 | 5 | if [ -n "${GIT_CONFIG_AT}" ]; then 6 | mkdir -p $(dirname "${GIT_CONFIG_AT}") 7 | touch "${GIT_CONFIG_AT}" 8 | ln -s "${GIT_CONFIG_AT}" "${HOME}/.gitconfig" 9 | echo "[TEMPLATE INFO] Sym-linked Git config to ${GIT_CONFIG_AT}." 10 | fi 11 | 12 | #################### 13 | # Open ssh server. 14 | 15 | if [ -n "${SSH_SERVER}" ] || [ -n "${SOURCE_ENV_FOR_SSH}" ];then 16 | # Export environment variables lost through ssh connection. 17 | # (Assumes a single user). 18 | # SSH connections don't have the environment variables, so we need to set them. 19 | # Export all the env variables except the ones specific to the current shell. 20 | # Not sure if this is the best way to do it. 21 | env | grep -v -E '^(BASH|SHLVL|PWD|OLDPWD|SHELL|LOGNAME|_| |\}|\{)' |\ 22 | sed -E 's/=(.*)/="\1"/' | sed 's/^/export /' > "${HOME}"/.container-env-vars 23 | # Export to login shells. 
24 | echo "source ${HOME}/.container-env-vars" >> "${HOME}/.bash_profile" 25 | echo "source ${HOME}/.container-env-vars" >> "${HOME}/.zprofile" 26 | echo "[TEMPLATE INFO] Environment variables have been written to ${HOME}/.container-env-vars." 27 | echo "[TEMPLATE INFO] And will be sourced in login shells to preserve environment variables in ssh connections." 28 | echo "[TEMPLATE INFO] If you change one at runtime and want it to be preserved in subsequent shell invocations, you need to write it to ${HOME}/.container-env-vars as well." 29 | fi 30 | 31 | 32 | if [ -n "${SSH_SERVER}" ]; then 33 | # Configuration for ssh server. 34 | # This could be done without sudo if needed. 35 | # check if user is not root 36 | echo "[TEMPLATE INFO] Configuring ssh server on port ${SSH_CONTAINER_PORT:-2223}." 37 | if [ "${EUID}" -eq 0 ] || [ -n "${NO_SUDO_NEEDED}" ]; then 38 | mkdir /var/run/sshd 39 | sed -i 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' /etc/pam.d/sshd 40 | # Change the default port to ${SSH_CONTAINER_PORT}. 41 | sed -i "s/#Port 22/Port ${SSH_CONTAINER_PORT:-2223}/" /etc/ssh/sshd_config 42 | else 43 | echo "${PASSWD}" | sudo -S mkdir /var/run/sshd 44 | echo "${PASSWD}" | sudo -S sed -i \ 45 | 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' /etc/pam.d/sshd 46 | # Change the default port to ${SSH_CONTAINER_PORT}. 47 | echo "${PASSWD}" | sudo -S sed -i "s/#Port 22/Port ${SSH_CONTAINER_PORT:-2223}/" /etc/ssh/sshd_config 48 | fi 49 | 50 | echo "[TEMPLATE INFO] Starting ssh server on port ${SSH_CONTAINER_PORT:-2223}." 51 | # This runs in background, so the script will continue. 52 | if [ "${EUID}" -eq 0 ] || [ -n "${NO_SUDO_NEEDED}" ]; then 53 | /usr/sbin/sshd 54 | else 55 | echo "${PASSWD}" | sudo -S /usr/sbin/sshd 56 | fi 57 | 58 | # Make login shells cd to the project root.
59 | echo "cd ${PROJECT_ROOT_AT}" >> "${HOME}/.bash_profile" 60 | echo "cd ${PROJECT_ROOT_AT}" >> "${HOME}/.zprofile" 61 | fi 62 | 63 | #################### 64 | ## PyCharm remote development server. 65 | # You can set the env variable JETBRAINS_SERVER_AT to persist your JetBrains configuration and cache. 66 | # You can set the env variable PYCHARM_IDE_AT to the location of the PyCharm binaries in your mounted storage. 67 | 68 | # Workaround using symlinks when clusters do not allow to mount specific directories or files. 69 | if [ -n "${JETBRAINS_SERVER_AT}" ]; then 70 | echo "[TEMPLATE INFO] Sym-linking to PyCharm project config files." 71 | # Per-project server. 72 | # Create if doesn't exist. 73 | PROJECT_JETBRAINS_SERVER_AT="${JETBRAINS_SERVER_AT}/projects/${PROJECT_ROOT_AT}" 74 | mkdir -p "${JETBRAINS_SERVER_AT}"/dist 75 | mkdir -p "${PROJECT_JETBRAINS_SERVER_AT}/config" 76 | mkdir -p "${PROJECT_JETBRAINS_SERVER_AT}/local" 77 | mkdir -p "${PROJECT_JETBRAINS_SERVER_AT}/cache" 78 | mkdir -p "${HOME}/.config" 79 | mkdir -p "${HOME}/.local/share" 80 | mkdir -p "${HOME}/.cache" 81 | ln -s "${PROJECT_JETBRAINS_SERVER_AT}/config" "${HOME}/.config/JetBrains" 82 | ln -s "${PROJECT_JETBRAINS_SERVER_AT}/local" "${HOME}/.local/share/JetBrains" 83 | ln -s "${PROJECT_JETBRAINS_SERVER_AT}/cache" "${HOME}/.cache/JetBrains" 84 | fi 85 | 86 | if [ -n "${PYCHARM_IDE_AT}" ]; then 87 | # Check if directory exists. 88 | if [ ! -d "${JETBRAINS_SERVER_AT}/dist/${PYCHARM_IDE_AT}" ]; then 89 | echo "[TEMPLATE WARNING] The PyCharm IDE directory ${JETBRAINS_SERVER_AT}/dist/${PYCHARM_IDE_AT} does not exist." 90 | echo "[TEMPLATE WARNING] The IDE will not be started. This is okay if you're installing an IDE manually." 91 | else 92 | echo "[TEMPLATE INFO] Starting PyCharm remote development server." 
93 | REMOTE_DEV_NON_INTERACTIVE=1 \ 94 | "${JETBRAINS_SERVER_AT}/dist/${PYCHARM_IDE_AT}/bin/remote-dev-server.sh" run "${PROJECT_ROOT_AT}" \ 95 | --ssh-link-host 127.0.0.1 \ 96 | --ssh-link-user "${USER:-$(id -un)}" \ 97 | --ssh-link-port "${SSH_FORWARD_PORT:-2223}" & 98 | fi 99 | fi 100 | 101 | #################### 102 | ## VS Code remote development server. 103 | # Workaround using symlinks when clusters do not allow to mount specific directories or files. 104 | 105 | if [ -n "${VSCODE_SERVER_AT}" ]; then 106 | echo "[TEMPLATE INFO] Sym-linking to VSCode server config files." 107 | # Per-project server. 108 | # Create if doesn't exist. 109 | PROJECT_VSCODE_SERVER_AT="${VSCODE_SERVER_AT}/projects${PROJECT_ROOT_AT}" 110 | mkdir -p "${PROJECT_VSCODE_SERVER_AT}" 111 | ln -s "${PROJECT_VSCODE_SERVER_AT}" "${HOME}/.vscode-server" 112 | fi 113 | 114 | #################### 115 | ## Cursor remote development server. 116 | # Same as VSCode up to naming 117 | 118 | if [ -n "${CURSOR_SERVER_AT}" ]; then 119 | echo "[TEMPLATE INFO] Sym-linking to Cursor server config files." 120 | # Per-project server. 121 | # Create if doesn't exist. 122 | PROJECT_CURSOR_SERVER_AT="${CURSOR_SERVER_AT}/projects${PROJECT_ROOT_AT}" 123 | mkdir -p "${PROJECT_CURSOR_SERVER_AT}" 124 | ln -s "${PROJECT_CURSOR_SERVER_AT}" "${HOME}/.cursor-server" 125 | fi 126 | 127 | ##################### 128 | # Jupyter Lab server. 129 | # Jupyter must be installed in the environment. 130 | 131 | if [ -n "${JUPYTER_SERVER}" ]; then 132 | echo "[TEMPLATE INFO] Starting Jupyter Lab server." 133 | # Workaround to open zsh. 
134 | SHELL=zsh \ 135 | jupyter-lab --no-browser --port="${JUPYTER_PORT:-8887}" --notebook-dir="${PROJECT_ROOT_AT}" & 136 | fi 137 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/from-python-template/Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax = docker/dockerfile:1 2 | 3 | # BASE_IMAGE is the image that will be extended by this Dockerfile. 4 | # It is assumed to be a well-configured Python installation. 5 | # The remaining packages will be installed with pip. 6 | ARG BASE_IMAGE 7 | ARG GIT_IMAGE 8 | 9 | ######################################################################## 10 | # Install apt packages. 11 | 12 | FROM ${BASE_IMAGE} AS runtime-apt-pkgs 13 | 14 | # A directory to record all the dependency files used at multiple stages. 15 | # This is useful for a later inspection or debugging. 16 | ENV DEPENDENCIES_DIR=/opt/template-dependencies 17 | RUN mkdir ${DEPENDENCIES_DIR} 18 | COPY apt.txt ${DEPENDENCIES_DIR}/apt.txt 19 | 20 | # Enable caching for `apt` packages in Docker. 21 | # https://docs.docker.com/engine/reference/builder/#run---mounttypecache 22 | RUN rm -f /etc/apt/apt.conf.d/docker-clean; \ 23 | echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > \ 24 | /etc/apt/apt.conf.d/keep-cache 25 | 26 | ARG DEBIAN_FRONTEND=noninteractive 27 | # sed is only used as a hack to remove comments from the file apt.txt. 28 | RUN --mount=type=cache,target=/var/cache/apt,sharing=private \ 29 | --mount=type=cache,target=/var/lib/apt,sharing=private \ 30 | apt update && \ 31 | sed -e 's/#.*//g' -e 's/\r//g' ${DEPENDENCIES_DIR}/apt.txt | \ 32 | xargs -t apt-get install -y --no-install-recommends && \ 33 | rm -rf /var/lib/apt/lists/* 34 | 35 | # Podman: Comment the above and use this instead with podman as it doesn't support sharing mount modes.
36 | #RUN --mount=type=cache,target=/var/cache/apt \ 37 | # --mount=type=cache,target=/var/lib/apt \ 38 | # apt update && \ 39 | # sed -e 's/#.*//g' -e 's/\r//g' ${DEPENDENCIES_DIR}/apt.txt | \ 40 | # xargs -t apt-get install -y --no-install-recommends && \ 41 | # rm -rf /var/lib/apt/lists/* 42 | 43 | ######################################################################## 44 | # Install dependencies. 45 | 46 | FROM runtime-apt-pkgs AS runtime-deps 47 | 48 | # Install pip packages. 49 | ENV PIP_CACHE_DIR=/root/.cache/pip 50 | RUN pip freeze > ${DEPENDENCIES_DIR}/requirements-freeze-before-pip-install.txt 51 | RUN pip list --format freeze > ${DEPENDENCIES_DIR}/requirements-list-before-pip-install.txt 52 | COPY requirements.txt ${DEPENDENCIES_DIR}/requirements.txt 53 | RUN --mount=type=cache,target=${PIP_CACHE_DIR},sharing=private \ 54 | pip install -r ${DEPENDENCIES_DIR}/requirements.txt 55 | # Podman: Comment the above and use this instead with podman as it doesn't support sharing mount modes. 56 | #RUN --mount=type=cache,target=${PIP_CACHE_DIR} \ 57 | # pip install -r ${DEPENDENCIES_DIR}/requirements.txt 58 | RUN pip freeze > ${DEPENDENCIES_DIR}/requirements-freeze-after-pip-install.txt 59 | RUN pip list --format freeze > ${DEPENDENCIES_DIR}/requirements-list-after-pip-install.txt 60 | 61 | # For reproducible requirements use the following after getting the requirements-freeze.txt file from the first build. 62 | #COPY requirements-freeze.txt ${DEPENDENCIES_DIR}/requirements-freeze.txt 63 | #RUN --mount=type=cache,target=${PIP_CACHE_DIR},sharing=private \ 64 | # pip install --no-deps -r ${DEPENDENCIES_DIR}/requirements-freeze.txt 65 | # For podman 66 | #RUN --mount=type=cache,target=${PIP_CACHE_DIR} \ 67 | # pip install --no-deps -r ${DEPENDENCIES_DIR}/requirements-freeze.txt 68 | 69 | # Optional optimizations. 70 | # Hack to enable Intel MKL optimizations on AMD CPUs. 
71 | # https://danieldk.eu/Posts/2020-08-31-MKL-Zen.html 72 | ARG FAKEINTEL_PATH=/opt/fakeintel/libfakeintel.so 73 | ENV FAKEINTEL_PATH=${FAKEINTEL_PATH} 74 | # https://danieldk.eu/Posts/2020-08-31-MKL-Zen.html 75 | # Build. 76 | RUN echo 'int mkl_serv_intel_cpu_true() {return 1;}' > /tmp/fakeintel.c && \ 77 | mkdir -p /opt/fakeintel && \ 78 | gcc -shared -fPIC -o ${FAKEINTEL_PATH} /tmp/fakeintel.c 79 | # Enable. 80 | ENV LD_PRELOAD=${FAKEINTEL_PATH}:${LD_PRELOAD} 81 | 82 | ######################################################################## 83 | # Here you can install other software 84 | 85 | 86 | ######################################################################## 87 | # Download Z-Shell enhancements. 88 | 89 | FROM ${GIT_IMAGE} AS get-pure 90 | 91 | ARG PURE_URL=https://github.com/sindresorhus/pure.git 92 | ARG ZSHA_URL=https://github.com/zsh-users/zsh-autosuggestions.git 93 | ARG ZSHS_URL=https://github.com/zsh-users/zsh-syntax-highlighting.git 94 | 95 | RUN git clone --depth 1 ${PURE_URL} /opt/zsh/pure 96 | RUN git clone --depth 1 ${ZSHA_URL} /opt/zsh/zsh-autosuggestions 97 | RUN git clone --depth 1 ${ZSHS_URL} /opt/zsh/zsh-syntax-highlighting 98 | 99 | ######################################################################## 100 | # This stage is the final user-agnostic (generic) stage. 101 | # This layer can be distributed so that subsequent users 102 | 103 | FROM runtime-deps AS runtime-generic 104 | 105 | ENV HYDRA_FULL_ERROR=1 106 | 107 | # A final record of the dependencies from pip freeze. 108 | RUN pip freeze > ${DEPENDENCIES_DIR}/requirements-freeze-final.txt 109 | RUN pip list --format freeze > ${DEPENDENCIES_DIR}/requirements-list-final.txt 110 | 111 | # Shell configuration. 
112 | ENV ZSH_ENHANCE_DIR=/etc/zsh/enhance 113 | ARG PURE_PATH=${ZSH_ENHANCE_DIR}/pure 114 | ARG ZSHA_PATH=${ZSH_ENHANCE_DIR}/zsh-autosuggestions 115 | ARG ZSHS_PATH=${ZSH_ENHANCE_DIR}/zsh-syntax-highlighting 116 | COPY --from=get-pure /opt/zsh/pure ${PURE_PATH} 117 | COPY --from=get-pure /opt/zsh/zsh-autosuggestions ${ZSHA_PATH} 118 | COPY --from=get-pure /opt/zsh/zsh-syntax-highlighting ${ZSHS_PATH} 119 | RUN { echo "fpath+=${PURE_PATH}"; \ 120 | echo "autoload -Uz promptinit; promptinit"; \ 121 | echo "prompt pure"; \ 122 | echo "source ${ZSHA_PATH}/zsh-autosuggestions.zsh"; \ 123 | echo "source ${ZSHS_PATH}/zsh-syntax-highlighting.zsh"; \ 124 | echo "alias ls='ls --color=auto'"; \ 125 | echo "alias ll='ls -lh'"; \ 126 | echo "alias update-env-file='source \${PROJECT_ROOT_AT}/installation/docker-amd64-cuda/update-env-file.sh'"; \ 127 | } >> /etc/zsh/zshrc 128 | 129 | 130 | # Entrypoints. 131 | # Don't overwrite the entrypoint, it is installing the project 132 | # and testing that you correctly mounted the project code. 133 | # It also performs some other important setup depending on the deployment platform. 134 | ARG BASE_ENTRYPOINT 135 | ARG BASE_ENTRYPOINT_EXECS 136 | ENV BASE_ENTRYPOINT=${BASE_ENTRYPOINT} 137 | ENV BASE_ENTRYPOINT_EXECS=${BASE_ENTRYPOINT_EXECS} 138 | ENV ENTRYPOINTS_ROOT=/opt/template-entrypoints 139 | COPY entrypoints ${ENTRYPOINTS_ROOT} 140 | ENTRYPOINT ["/opt/template-entrypoints/pre-entrypoint.sh"] 141 | CMD ["/bin/zsh"] 142 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/from-python-template/compose-base.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | build-args: 3 | build: 4 | args: 5 | # Pytorch 2.4.0a0+f70bd71a48, NVIDIA CUDA 12.5.0.23, Python 3.10. 
6 | # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-06.html 7 | # https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch 8 | BASE_IMAGE: nvcr.io/nvidia/pytorch:24.07-py3 9 | GIT_IMAGE: docker.io/alpine/git:2.40.1 # https://hub.docker.com/r/alpine/git/tags 10 | # You can find the entrypoint by running `docker inspect BASE_IMAGE | grep -A 3 Entrypoint` 11 | # If there is no entrypoint, you can leave it empty. 12 | BASE_ENTRYPOINT: /opt/nvidia/nvidia_entrypoint.sh 13 | # 1 normally, 0 if the entrypoint does not exec its arguments, in rare cases. 14 | BASE_ENTRYPOINT_EXECS: 1 15 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/from-python-template/requirements.txt: -------------------------------------------------------------------------------- 1 | hatchling # To build the package without isolation in edit mode for faster startup. 2 | editables # Same as above. 3 | hydra-core 4 | tqdm 5 | wandb 6 | pre-commit 7 | black 8 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/from-python-template/update-env-file.sh: -------------------------------------------------------------------------------- 1 | # Records the current environment to a file. 2 | # Packages installed from GitHub with pip install will not be recorded 3 | # properly (i.e. the link will be omitted and just replaced with the version). 4 | # In that case, you have to update this file to add commands that 5 | # will fix the environment file. (you could also just edit it manually afterwards). 
6 | 7 | ENV_FILE="${PROJECT_ROOT_AT}"/installation/docker-amd64-cuda/requirements-freeze.txt 8 | pip list --exclude-editable --format freeze > "${ENV_FILE}" 9 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/from-scratch-template/Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax = docker/dockerfile:1 2 | 3 | # CURL_IMAGE is used to download from the internet in independent layers. 4 | # GIT_IMAGE is used to clone git repositories in independent layers. 5 | # BASE_IMAGE is the base image for the project, likely the Ubuntu image. 6 | ARG CURL_IMAGE 7 | ARG GIT_IMAGE 8 | ARG BASE_IMAGE 9 | 10 | ######################################################################## 11 | # Download conda. 12 | 13 | FROM ${CURL_IMAGE} AS get-conda 14 | ARG CONDA_URL 15 | RUN mkdir /tmp/conda && \ 16 | curl -fvL -o /tmp/conda/miniconda.sh ${CONDA_URL} 17 | 18 | ######################################################################## 19 | # Install conda. 20 | 21 | FROM ${BASE_IMAGE} AS install-conda 22 | 23 | ARG CONDA_INSTALL_PATH 24 | RUN --mount=type=bind,from=get-conda,source=/tmp/conda,target=/tmp/conda \ 25 | /bin/bash /tmp/conda/miniconda.sh -b -p ${CONDA_INSTALL_PATH} 26 | 27 | ######################################################################## 28 | # Install apt packages. 29 | 30 | FROM ${BASE_IMAGE} AS runtime-apt-pkgs 31 | 32 | # A directory to record all the dependency files used at multiple stages. 33 | # This is useful for a later inspection or debugging. 34 | ENV DEPENDENCIES_DIR=/opt/template-dependencies 35 | RUN mkdir ${DEPENDENCIES_DIR} 36 | COPY apt.txt ${DEPENDENCIES_DIR}/apt.txt 37 | 38 | # Enable caching for `apt` packages in Docker. 
39 | # https://docs.docker.com/engine/reference/builder/#run---mounttypecache 40 | RUN rm -f /etc/apt/apt.conf.d/docker-clean; \ 41 | echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > \ 42 | /etc/apt/apt.conf.d/keep-cache 43 | 44 | ARG DEBIAN_FRONTEND=noninteractive 45 | # sed is only used as a hack to remove comments from the file apt.txt. 46 | RUN --mount=type=cache,target=/var/cache/apt,sharing=private \ 47 | --mount=type=cache,target=/var/lib/apt,sharing=private \ 48 | apt update && \ 49 | sed -e 's/#.*//g' -e 's/\r//g' ${DEPENDENCIES_DIR}/apt.txt | \ 50 | xargs -t apt-get install -y --no-install-recommends && \ 51 | rm -rf /var/lib/apt/lists/* 52 | 53 | ######################################################################## 54 | # Install dependencies. 55 | 56 | FROM runtime-apt-pkgs AS runtime-deps 57 | 58 | ARG PROJECT_NAME 59 | ENV PYTHONDONTWRITEBYTECODE=1 60 | ENV PYTHONUNBUFFERED=1 61 | ENV PIP_CACHE_DIR=/root/.cache/pip 62 | ARG CONDA_INSTALL_PATH 63 | ENV CONDA_INSTALL_PATH=${CONDA_INSTALL_PATH} 64 | ENV CONDA_CACHE_PKGS_DIRS=${CONDA_INSTALL_PATH}/pkgs 65 | ENV PATH=${CONDA_INSTALL_PATH}/condabin:${PATH} 66 | 67 | COPY --link --from=install-conda ${CONDA_INSTALL_PATH} ${CONDA_INSTALL_PATH} 68 | COPY environment.yml ${DEPENDENCIES_DIR}/environment.yml 69 | RUN --mount=type=cache,target=${PIP_CACHE_DIR},sharing=private \ 70 | --mount=type=cache,target=${CONDA_CACHE_PKGS_DIRS},sharing=private \ 71 | mamba env create --file ${DEPENDENCIES_DIR}/environment.yml 72 | 73 | # Record the dependency file after conda install which may be useful. 74 | RUN mamba env export -n ${PROJECT_NAME} > ${DEPENDENCIES_DIR}/environment-mamba-after-env-create.yml 75 | 76 | # Cleaning must be in a separate `RUN` command to preserve the Docker cache. 77 | RUN mamba clean -fya && \ 78 | find ${CONDA_INSTALL_PATH}/envs/${PROJECT_NAME} -name '__pycache__' | xargs rm -rf 79 | 80 | # Make the conda env writeable by anyone so that it can be updated by a user. 
81 | RUN chmod -R 777 ${CONDA_INSTALL_PATH} 82 | 83 | # Add conda env to path. 84 | ENV PATH=${CONDA_INSTALL_PATH}/envs/${PROJECT_NAME}/bin:${PATH} 85 | 86 | # Optional optimizations. 87 | # Hack to enable Intel MKL optimizations on AMD CPUs. 88 | # https://danieldk.eu/Posts/2020-08-31-MKL-Zen.html 89 | ARG FAKEINTEL_PATH=/opt/fakeintel/libfakeintel.so 90 | ENV FAKEINTEL_PATH=${FAKEINTEL_PATH} 91 | # https://danieldk.eu/Posts/2020-08-31-MKL-Zen.html 92 | # Build. 93 | RUN echo 'int mkl_serv_intel_cpu_true() {return 1;}' > /tmp/fakeintel.c && \ 94 | mkdir -p /opt/fakeintel && \ 95 | gcc -shared -fPIC -o ${FAKEINTEL_PATH} /tmp/fakeintel.c 96 | # Enable. 97 | ENV LD_PRELOAD=${FAKEINTEL_PATH}:${LD_PRELOAD} 98 | 99 | ######################################################################## 100 | # Here you can install other software 101 | # You can build and install pip packages 102 | # Just make sure to prefix your pip commands with `mamba run -n ${PROJECT_NAME} pip install ...` 103 | # to have to package installed in the same location as the conda env of the project. 104 | 105 | ######################################################################## 106 | # Download Z-Shell enhancements. 107 | 108 | FROM ${GIT_IMAGE} AS get-pure 109 | 110 | ARG PURE_URL=https://github.com/sindresorhus/pure.git 111 | ARG ZSHA_URL=https://github.com/zsh-users/zsh-autosuggestions.git 112 | ARG ZSHS_URL=https://github.com/zsh-users/zsh-syntax-highlighting.git 113 | 114 | RUN git clone --depth 1 ${PURE_URL} /opt/zsh/pure 115 | RUN git clone --depth 1 ${ZSHA_URL} /opt/zsh/zsh-autosuggestions 116 | RUN git clone --depth 1 ${ZSHS_URL} /opt/zsh/zsh-syntax-highlighting 117 | 118 | ######################################################################## 119 | # This stage is the final user-agnostic (generic) stage. 
# This layer can be distributed so that subsequent users can build their user-specific images on top of it.
160 | # Don't overwrite the entrypoint, it is installing the project 161 | # and testing that you correctly mounted the project code. 162 | # It also performs some other important setup depending on the deployment platform. 163 | ENV ENTRYPOINTS_ROOT=/opt/template-entrypoints 164 | COPY --link entrypoints ${ENTRYPOINTS_ROOT} 165 | ENTRYPOINT ["/opt/template-entrypoints/pre-entrypoint.sh"] 166 | CMD ["/bin/zsh"] 167 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/from-scratch-template/compose-base.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | build-args: 3 | build: 4 | args: 5 | BASE_IMAGE: ubuntu:22.04 # Ubuntu: https://hub.docker.com/_/ubuntu 6 | CURL_IMAGE: curlimages/curl:8.2.1 # https://hub.docker.com/r/curlimages/curl/tags 7 | GIT_IMAGE: docker.io/alpine/git:2.40.1 # https://hub.docker.com/r/alpine/git/tags 8 | CONDA_URL: https://github.com/conda-forge/miniforge/releases/download/23.3.1-1/Miniforge3-23.3.1-1-Linux-x86_64.sh 9 | # Change the link when changing the platform or updating to a new version. 10 | # https://conda-forge.org/miniforge/ 11 | CONDA_INSTALL_PATH: /opt/conda 12 | # Should be the same between stages not to brake linking. 
13 | # https://towardsdatascience.com/conda-essential-concepts-and-tricks-e478ed53b5b#bb7b 14 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/from-scratch-template/environment.yml: -------------------------------------------------------------------------------- 1 | name: template-project-name 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.10 6 | - pip 7 | - pip: 8 | - hatchling 9 | - editables 10 | - hydra-core 11 | - tqdm 12 | - wandb 13 | - pre-commit 14 | - black 15 | prefix: /opt/conda/envs/template-project-name 16 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/from-scratch-template/update-env-file.sh: -------------------------------------------------------------------------------- 1 | # Records the current environment to a file. 2 | # Packages installed from GitHub with pip install will not be recorded 3 | # properly (i.e. the link can be omitted and just replaced with the version). 4 | # In that case, you have to update this file to add commands that 5 | # will fix the environment file. (you could also just edit it manually afterwards). 6 | 7 | ENV_FILE="${PROJECT_ROOT_AT}"/installation/docker-amd64-cuda/environment-freeze.yml 8 | # Export, but delete the package itself as it's installed at runtime. 9 | # This is because it is only available after mounting the code. 10 | mamba env export --no-builds | sed "/${PROJECT_NAME}==.*/d" >"$ENV_FILE" 11 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/requirements.txt: -------------------------------------------------------------------------------- 1 | hatchling # To build the package without isolation in edit mode for faster startup. 2 | editables # Same as above. 
3 | hydra-core 4 | tqdm 5 | wandb 6 | pre-commit 7 | black 8 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/template.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | set -e 4 | 5 | ENV_TEXT=$( 6 | cat <<-EOF 7 | # All user-specific configurations are here. 8 | 9 | ## For building: 10 | # Which docker and compose binary to use 11 | # docker and docker compose in general or podman and podman-compose for CSCS Clariden 12 | DOCKER=docker 13 | COMPOSE="docker compose" 14 | # Use the same USRID and GRPID as on the storage you will be mounting. 15 | # USR is used in the image name and must be lowercase. 16 | # It's fine if your username is not lowercase, jut make it lowercase. 17 | USR=$(id -un | tr "[:upper:]" "[:lower:]") 18 | USRID=$(id -u) 19 | GRPID=$(id -g) 20 | GRP=$(id -gn) 21 | # PASSWD is not secret, 22 | # it is only there to avoid running password-less sudo commands accidentally. 23 | PASSWD=$(id -un) 24 | # LAB_NAME will be the first component in the image path. 25 | # It must be lowercase. 26 | LAB_NAME=$(id -gn | tr "[:upper:]" "[:lower:]") 27 | 28 | #### For running locally 29 | # You can find the acceleration options in the compose.yaml file 30 | # by looking at the services with names dev-local-ACCELERATION. 31 | PROJECT_ROOT_AT=$(realpath "$(pwd)"/../..) 32 | ACCELERATION=cuda 33 | WANDB_API_KEY= 34 | # PyCharm-related. Fill after installing the IDE manually the first time. 35 | PYCHARM_IDE_AT= 36 | 37 | 38 | #################### 39 | # Project-specific environment variables. 40 | ## Used to avoid writing paths multiple times and creating inconsistencies. 41 | ## You should not need to change anything below this line. 
42 | PROJECT_NAME=template-project-name 43 | PACKAGE_NAME=template_package_name 44 | IMAGE_NAME=\${LAB_NAME}/\${USR}/\${PROJECT_NAME} 45 | IMAGE_PLATFORM=amd64-cuda 46 | # The image name includes the USR to separate the images in an image registry. 47 | # Its tag includes the platform for registries that don't hand multi-platform images for the same tag. 48 | # You can also add a suffix to the platform e.g. -jax or -pytorch if you use different images for different environments/models etc. 49 | 50 | EOF 51 | ) 52 | 53 | ## All variables below are read from the `.env` and `.project.env` files. 54 | ENV_FILE=".env" 55 | 56 | env() { 57 | # Creates the `.env` file. 58 | if [[ -f "${ENV_FILE}" ]]; then 59 | echo "[TEMPLATE ERROR] File ${ENV_FILE} already exists. Aborting." 60 | exit 1 61 | fi 62 | echo "${ENV_TEXT}" >"${ENV_FILE}" 63 | echo "Created the ${ENV_FILE} file. Edit it to set your user-specific variables." 64 | } 65 | 66 | check() { 67 | # Checks if the `.env` file exists. 68 | if [[ ! -f "${ENV_FILE}" ]]; then 69 | echo "[TEMPLATE ERROR] File ${ENV_FILE} does not exist. 70 | Run ./template.sh env to create it, then edit it." 71 | exit 1 72 | fi 73 | source "${ENV_FILE}" 74 | COMPOSE_PROJECT="${PROJECT_NAME}-${IMAGE_PLATFORM}-${USR}" 75 | } 76 | 77 | edit_from_base() { 78 | FROM_BASE="${1}" 79 | if [ "${FROM_BASE}" == "from-python" ] || [ "${FROM_BASE}" == "from-scratch" ]; then 80 | rm -f compose-base.yaml Dockerfile requirements.txt environment.yml update-env-file.sh 81 | cp -r "${FROM_BASE}-template"/* . 82 | else 83 | echo "[TEMPLATE ERROR] Please specify a valid from-base: from-python or from-scratch." 84 | exit 1 85 | fi 86 | } 87 | 88 | pull_generic() { 89 | # Pull the generic runtime and dev images. 90 | check 91 | PULL_IMAGE_NAME="${1}" 92 | if [ "${PULL_IMAGE_NAME}" == "" ]; then 93 | echo "[TEMPLATE ERROR] Please specify the name of the image to pull." 
94 | echo "For example: ./template.sh pull ic-registry.epfl.ch/${LAB_NAME}/gaspar/${PROJECT_NAME}" 95 | echo "For example: ./template.sh pull docker.io/docker-username/${PROJECT_NAME}" 96 | exit 1 97 | fi 98 | 99 | $DOCKER pull "${PULL_IMAGE_NAME}:${IMAGE_PLATFORM}-root-latest" 100 | $DOCKER tag "${PULL_IMAGE_NAME}:${IMAGE_PLATFORM}-root-latest" "${IMAGE_NAME}:${IMAGE_PLATFORM}-root-latest" 101 | } 102 | 103 | build_generic() { 104 | # Check that the files in the installation/ directory are all committed to git if running the build command. 105 | # The image uses the git commit as a tag to know which dependencies where installed. 106 | # Error if there are uncommitted changes. 107 | case "$1" in 108 | --ignore-uncommitted) 109 | IGNORE_UNCOMMITTED=1 110 | shift 111 | ;; 112 | esac 113 | 114 | if [[ ${IGNORE_UNCOMMITTED} -ne 1 ]] && \ 115 | [[ $(git status --porcelain | grep "installation/" | grep -v -E "README" -c) -ge 1 ]]; then 116 | echo "[TEMPLATE ERROR] There are uncommitted changes in the installation/ directory. 117 | Please commit them before building your generic and user image. 118 | The image uses the git commit as a tag to keep track of which dependencies where installed. 119 | If these change don't affect the build (e.g. README), 120 | feel free to just commit and ignore the rebuild." 121 | echo "Force ignoring this error with the flag ./template.sh build --ignore-uncommitted." 122 | exit 1 123 | fi 124 | 125 | # Build the generic runtime and dev images and tag them with the current git commit. 126 | check 127 | $COMPOSE -p "${COMPOSE_PROJECT}" build image-root 128 | 129 | # Tag the images with the current git commit. 130 | GIT_COMMIT=$(git rev-parse --short HEAD) 131 | $DOCKER tag "${IMAGE_NAME}:${IMAGE_PLATFORM}-root-latest" "${IMAGE_NAME}:${IMAGE_PLATFORM}-root-${GIT_COMMIT}" 132 | } 133 | 134 | build_user() { 135 | # Check that the files in the installation/ directory are all committed to git if running the build command. 
# The image uses the git commit as a tag to know which dependencies were installed.
176 | enroot import -x mount podman://${IMAGE_NAME}:${IMAGE_PLATFORM}-root-latest || true 177 | GIT_COMMIT=$(git rev-parse --short HEAD) 178 | if [[ $($DOCKER images --format '{{.Repository}}:{{.Tag}}' |\ 179 | grep "root-${GIT_COMMIT}" -c) -ge 1 ]]; then 180 | enroot import -x mount podman://${IMAGE_NAME}:${IMAGE_PLATFORM}-root-${GIT_COMMIT} || true 181 | fi 182 | } 183 | 184 | push_usr_or_root() { 185 | check 186 | USR_OR_ROOT="${1}" 187 | PUSH_IMAGE_NAME="${2}" 188 | if [ "${PUSH_IMAGE_NAME}" == "" ]; then 189 | echo "[TEMPLATE ERROR] Please specify the complete name of the image to push." 190 | echo "For example: ./template.sh push docker.io/docker-username/template-project-name" 191 | echo "EPFL people can just do ./template.sh push IC or ./template.sh push RCP 192 | And it will be pushed to ic-registry.epfl.ch/${IMAGE_NAME} 193 | or registry.rcp.epfl.ch/${IMAGE_NAME}" 194 | exit 1 195 | elif [ "${PUSH_IMAGE_NAME}" == "IC" ]; then 196 | PUSH_IMAGE_NAME="ic-registry.epfl.ch/${IMAGE_NAME}" 197 | elif [ "${PUSH_IMAGE_NAME}" == "RCP" ]; then 198 | PUSH_IMAGE_NAME="registry.rcp.epfl.ch/${IMAGE_NAME}" 199 | fi 200 | 201 | $DOCKER tag "${IMAGE_NAME}:${IMAGE_PLATFORM}-${USR_OR_ROOT}-latest" \ 202 | "${PUSH_IMAGE_NAME}:${IMAGE_PLATFORM}-${USR_OR_ROOT}-latest" 203 | $DOCKER push "${PUSH_IMAGE_NAME}:${IMAGE_PLATFORM}-${USR_OR_ROOT}-latest" 204 | 205 | # If the image has a git tag push it as well. 
206 | GIT_COMMIT=$(git rev-parse --short HEAD) 207 | if [[ $($DOCKER images --format '{{.Repository}}:{{.Tag}}' |\ 208 | grep "${USR_OR_ROOT}-${GIT_COMMIT}" -c) -ge 1 ]]; then 209 | $DOCKER tag "${IMAGE_NAME}:${IMAGE_PLATFORM}-${USR_OR_ROOT}-${GIT_COMMIT}" \ 210 | "${PUSH_IMAGE_NAME}:${IMAGE_PLATFORM}-${USR_OR_ROOT}-${GIT_COMMIT}" 211 | $DOCKER push "${PUSH_IMAGE_NAME}:${IMAGE_PLATFORM}-${USR_OR_ROOT}-${GIT_COMMIT}" 212 | fi 213 | } 214 | 215 | push_generic() { 216 | check 217 | push_usr_or_root "root" "${1}" 218 | } 219 | 220 | push_user() { 221 | check 222 | push_usr_or_root "${USR}" "${1}" 223 | } 224 | 225 | push() { 226 | push_generic "${1}" 227 | push_user "${1}" 228 | } 229 | 230 | list_env() { 231 | # List the conda environment. 232 | check 233 | echo "[TEMPLATE INFO] Listing the dependencies in an empty container (nothing mounted)." 234 | echo "[TEMPLATE INFO] It's normal to see the warnings about missing PROJECT_ROOT_AT or acceleration options." 235 | echo "[TEMPLATE INFO] The idea is to see if all your dependencies have been installed." 236 | $DOCKER run --rm "${IMAGE_NAME}:${IMAGE_PLATFORM}-root-latest" zsh -c \ 237 | "echo '[TEMPLATE INFO] Running mamba list';\ 238 | if command -v mamba >/dev/null 2>&1; then mamba list -n ${PROJECT_NAME}; \ 239 | else echo '[TEMPLATE INFO] conda not in the environment, skipping...'; fi; 240 | echo '[TEMPLATE INFO] Running pip list'; pip list" 241 | } 242 | 243 | empty_interactive() { 244 | # Start an interactive shell in an empty container. 245 | check 246 | echo "[TEMPLATE INFO] Starting an interactive shell in an empty container (nothing mounted)." 247 | echo "[TEMPLATE INFO] It's normal to see the warnings about missing PROJECT_ROOT_AT or acceleration options." 248 | echo "[TEMPLATE INFO] The idea is to see if all your dependencies have been installed." 249 | $DOCKER run --rm -it "${IMAGE_NAME}:${IMAGE_PLATFORM}-root-latest" 250 | } 251 | 252 | run() { 253 | # Run a command in a new runtime container. 
254 | # Usage: 255 | # ./template.sh run -e VAR1=VAL1 -e VAR2=VAL2 ... python -c "print('hello world')" 256 | check 257 | local env_vars=() 258 | local detach=() 259 | 260 | # Catch detach flag 261 | if [[ "$1" == "-d" ]]; then 262 | shift 263 | detach+=("-d") 264 | fi 265 | 266 | # Collect environment variables and commands dynamically 267 | while [[ "$1" == "-e" ]]; do 268 | env_vars+=("$1" "$2") # Store environment variable flags and values as array elements 269 | shift 2 270 | done 271 | 272 | # Execute the docker command using array expansion for environment variables 273 | $COMPOSE -p "${COMPOSE_PROJECT}" run --rm "${detach[@]}" "${env_vars[@]}" "run-local-${ACCELERATION}" "$@" 274 | } 275 | 276 | dev() { 277 | # Run a command in a new development container. 278 | # Usage: 279 | # ./template.sh dev -e VAR1=VAL1 -e VAR2=VAL2 -e SSH_SERVER=1 ... sleep infinity" 280 | check 281 | 282 | # Create the placeholder directories for remote development. 283 | touch ${HOME}/.template-gitconfig 284 | mkdir -p ${HOME}/.template-dev-vscode-server 285 | mkdir -p ${HOME}/.template-dev-jetbrains-server 286 | 287 | local env_vars=() 288 | local detach=() 289 | 290 | # Catch detach flag 291 | if [[ "$1" == "-d" ]]; then 292 | shift 293 | detach+=("-d") 294 | fi 295 | 296 | # Collect environment variables and commands dynamically 297 | while [[ "$1" == "-e" ]]; do 298 | env_vars+=("$1" "$2") # Store environment variable flags and values as array elements 299 | shift 2 300 | done 301 | 302 | # Execute the docker command using array expansion for environment variables 303 | $COMPOSE -p "${COMPOSE_PROJECT}" run --rm "${detach[@]}" "${env_vars[@]}" "dev-local-${ACCELERATION}" "$@" 304 | } 305 | 306 | get_runai_scripts() { 307 | # Rename the runai examples. 
308 | # ./template.sh get_runai_scripts 309 | check 310 | cp -r "./EPFL-runai-setup/example-submit-scripts/" "./EPFL-runai-setup/submit-scripts" 311 | for file in $(find "./EPFL-runai-setup/submit-scripts" -type f); do 312 | sed -i.deleteme "s/moalla/${USR}/g" "$file" && rm "${file}.deleteme" 313 | sed -i.deleteme "s/claire/${LAB_NAME}/g" "$file" && rm "${file}.deleteme" 314 | done 315 | } 316 | 317 | get_scitas_scripts() { 318 | # Rename the scitas examples. 319 | # ./template.sh get_scitas_scripts 320 | check 321 | cp -r "./EPFL-SCITAS-setup/example-submit-scripts/" "./EPFL-SCITAS-setup/submit-scripts" 322 | for file in $(find "./EPFL-SCITAS-setup/submit-scripts" -type f); do 323 | sed -i.deleteme "s/moalla/${USR}/g" "$file" && rm "${file}.deleteme" 324 | sed -i.deleteme "s/claire/${LAB_NAME}/g" "$file" && rm "${file}.deleteme" 325 | done 326 | } 327 | 328 | usage() { 329 | echo "Usage: $0 {env|pull_generic|build_generic|build_user|build|push_generic|push_user|push|list_env|empty_interactive|run|dev|get_runai_scripts}" 330 | 331 | # Describe each function with its arguments. 332 | echo "env: Create the .env file with the user-specific variables." 333 | echo "pull_generic IMAGE_NAME: Pull the generic runtime and dev images." 334 | echo "build_generic: Build the generic runtime and dev images." 335 | echo "build_user: Build the user runtime and dev images." 336 | echo "build: Build the generic and user runtime and dev images." 337 | echo "import_from_podman: Import the podman image to enroot." 338 | echo "push_generic IMAGE_NAME: Push the generic runtime and dev images." 339 | echo "push_user IMAGE_NAME: Push the user runtime and dev images." 340 | echo "push IMAGE_NAME: Push the generic and user runtime and dev images." 341 | echo "list_env: List the pip/conda environment." 342 | echo "empty_interactive: Start an interactive shell in an empty container." 343 | echo "run -e VAR1=VAL1 -e VAR2=VAL2 ... COMMAND: Run a command in a new runtime container." 
344 | echo "dev -e VAR1=VAL1 -e VAR2=VAL2 ... COMMAND: Run a command in a new development container." 345 | echo "get_runai_scripts: Rename the runai examples." 346 | echo "get_scitas_scripts: Rename the scitas examples." 347 | } 348 | 349 | if [ $# -eq 0 ]; then 350 | usage 351 | else 352 | # run the command 353 | case "$1" in 354 | -h|--help) 355 | usage 356 | exit 0 357 | ;; 358 | esac 359 | "$@" 360 | fi 361 | -------------------------------------------------------------------------------- /installation/docker-amd64-cuda/update-env-file.sh: -------------------------------------------------------------------------------- 1 | # Records the current environment to a file. 2 | # Packages installed from GitHub with pip install will not be recorded 3 | # properly (i.e. the link will be omitted and just replaced with the version). 4 | # In that case, you have to update this file to add commands that 5 | # will fix the environment file. (you could also just edit it manually afterwards). 6 | 7 | ENV_FILE="${PROJECT_ROOT_AT}"/installation/docker-amd64-cuda/requirements-freeze.txt 8 | pip list --exclude-editable --format freeze > "${ENV_FILE}" 9 | -------------------------------------------------------------------------------- /installation/edit-platform-and-acceleration.sh: -------------------------------------------------------------------------------- 1 | # Updates the platform and the hardware-acceleration supported by an installation. 2 | 3 | CHANGE_OR_COPY="${1}" 4 | INSTALL_METHOD="${2}" 5 | 6 | CURR_PLATFORM="${3}" 7 | CURR_ACCELERATION="${4}" 8 | 9 | NEW_PLATFORM="${5}" 10 | NEW_ACCELERATION="${6}" 11 | 12 | # Abort if variables not defined and show usage. 
13 | if [ -z "${CHANGE_OR_COPY}" ] || [ -z "${INSTALL_METHOD}" ] || [ -z "${CURR_PLATFORM}" ] || [ -z "${CURR_ACCELERATION}" ] || [ -z "${NEW_PLATFORM}" ] || [ -z "${NEW_ACCELERATION}" ]; then 14 | echo "Usage: installation/edit-platform-and-acceleration.sh CHANGE_OR_COPY CURR_PLATFORM CURR_ACCELERATION NEW_PLATFORM NEW_ACCELERATION" 15 | echo "Example: installation/edit-platform-and-acceleration.sh change docker amd64 cuda arm64 cuda" 16 | echo "Example: installation/edit-platform-and-acceleration.sh copy docker amd64 cuda arm64 cuda" 17 | echo "Example: installation/edit-platform-and-acceleration.sh change docker amd64 cuda amd64 rocm" 18 | echo "Example: installation/edit-platform-and-acceleration.sh change conda osx-arm64 mps linux-64 cuda" 19 | exit 1 20 | fi 21 | 22 | # Abort if the current installation does not exist. 23 | if [ ! -d installation/"${INSTALL_METHOD}-${CURR_PLATFORM}-${CURR_ACCELERATION}" ]; then 24 | echo installation/"${INSTALL_METHOD}-${CURR_PLATFORM}-${CURR_ACCELERATION} does not exist." 25 | exit 1 26 | fi 27 | 28 | # Abort if the new installation already exists. 29 | if [ -d installation/"${INSTALL_METHOD}-${NEW_PLATFORM}-${NEW_ACCELERATION}" ]; then 30 | echo installation/"${INSTALL_METHOD}-${NEW_PLATFORM}-${NEW_ACCELERATION} already exists." 31 | exit 1 32 | fi 33 | 34 | # Rename the current to the new one. 35 | 36 | if [ "${CHANGE_OR_COPY}" = "change" ]; then 37 | mv installation/"${INSTALL_METHOD}-${CURR_PLATFORM}-${CURR_ACCELERATION}" installation/"${INSTALL_METHOD}-${NEW_PLATFORM}-${NEW_ACCELERATION}" 38 | elif [ "${CHANGE_OR_COPY}" = "copy" ]; then 39 | cp -r installation/"${INSTALL_METHOD}-${CURR_PLATFORM}-${CURR_ACCELERATION}" installation/"${INSTALL_METHOD}-${NEW_PLATFORM}-${NEW_ACCELERATION}" 40 | else 41 | echo "CHANGE_OR_COPY must be either change or copy." 42 | exit 1 43 | fi 44 | 45 | # Rename the installation combination in all the files. 
46 | for file in $(find "installation/${INSTALL_METHOD}-${NEW_PLATFORM}-${NEW_ACCELERATION}" -type f); do 47 | sed -i.deleteme "s/${CURR_PLATFORM}-${CURR_ACCELERATION}/${NEW_PLATFORM}-${NEW_ACCELERATION}/g" "${file}" 48 | sed -i.deleteme "s/${CURR_PLATFORM}/${NEW_PLATFORM}/g" "${file}" 49 | rm "${file}.deleteme" 50 | done 51 | 52 | # Rename the default platform for the docker installation. 53 | if [ "${INSTALL_METHOD}" = "docker" ]; then 54 | if [ "${NEW_ACCELERATION}" != "cuda" ]; then 55 | echo "You have to edit the compose.yaml manually to add services that can leverage 56 | the ${NEW_ACCELERATION} acceleration for the local deployment option with Docker Compose. 57 | Refer to the dev-local-cuda service as an example for using NVIDIA GPUs." 58 | fi 59 | fi 60 | -------------------------------------------------------------------------------- /outputs/README.md: -------------------------------------------------------------------------------- 1 | # Instructions for the outputs (models weights, logs, etc.) 2 | 3 | ## [TEMPLATE] Where and how to set up the outputs 4 | 5 | > [!IMPORTANT] 6 | > **TEMPLATE TODO:** 7 | > Update the instructions below to explain how to obtain the outputs and delete this section. 8 | 9 | The template provides the `PROJECT_ROOT/outputs/` directory as a placeholder for the outputs generated in the project 10 | (model weights, logs, etc.). 11 | This allows the experiment code to always refer to the same path for the outputs independently of the deployment method 12 | for better reproducibility between deployment options. 13 | The directory can be accessed in the experiments with `config.outputs_dir`. 14 | The output directories in `PROJECT_ROOT/outputs/` don't need to be physically in the same directory 15 | as the project, you can create symlinks to them. 16 | The default setup config `src/template_package_name/configs/setup.yaml` defines an outputs subdirectory where it will 17 | save the outputs. 
18 | This is by default `PROJECT_ROOT/outputs/dev` (so you can symlink that location to somewhere else). 19 | This design shifts the outputs' path configuration from the code and config which should be identical across runs 20 | to the installation steps where you will create your symlinks. 21 | This is also more convenient than using environment variables to point to individual output locations. 22 | 23 | Below, you can instruct the users on how to link/download the outputs you generated 24 | to directly use them for reproducibility. 25 | Refer to the [data instructions](../data/README.md) for example instructions. 26 | 27 | ## Description of the outputs 28 | 29 | ## Instructions to obtain the outputs 30 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "template-project-name" 7 | readme = "README.md" 8 | license = { file = "LICENSE" } 9 | version = "0.0.1" 10 | requires-python = ">=3.10" 11 | 12 | [tool.hatch.build.targets.wheel] 13 | packages = ["src/template_package_name"] 14 | 15 | [tool.isort] 16 | known_third_party = ["wandb"] 17 | -------------------------------------------------------------------------------- /reproducibility-scripts/README.md: -------------------------------------------------------------------------------- 1 | > [!NOTE] 2 | > **TEMPLATE TODO** 3 | > This directory should contain the commands run to reproduce the results in the paper. 4 | > E.g. the commands to train, evaluate, and produce the plots in the paper. 5 | > This can also include hyperparameter search commands like wandb sweeps. 6 | > Ideally when you run unattended jobs, your jobs should run scripts in this directory. 
7 | -------------------------------------------------------------------------------- /reproducibility-scripts/template-experiment.sh: -------------------------------------------------------------------------------- 1 | python -m template_package_name.template_experiment 2 | -------------------------------------------------------------------------------- /reproducibility-scripts/template-sweep.yaml: -------------------------------------------------------------------------------- 1 | # Run `wandb sweep reproducibility-scripts/template-sweep.yaml` to generate a sweep. 2 | # Run `wandb agent template-sweep-id` to run the sweep. 3 | 4 | project: template-project-name 5 | name: template-sweep 6 | method: grid 7 | metric: 8 | goal: maximize 9 | name: some_metric 10 | parameters: 11 | wandb.mode: 12 | value: online 13 | wandb.use_global_dir: 14 | value: True 15 | job_subdir: 16 | value: some-special-experiment 17 | seed: 18 | value: 1 19 | resuming.resume: 20 | value: True 21 | resuming.use_commit: 22 | value: True 23 | n: 24 | values: [1, 2, 3] 25 | 26 | command: 27 | - python 28 | - "-m" 29 | - "template_package_name.template_experiment" 30 | - ${args_no_hyphens} 31 | -------------------------------------------------------------------------------- /src/template_package_name/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLAIRE-Labo/python-ml-research-template/280ac60bbef5c740fe21aa6fd388ea2a0093f8d7/src/template_package_name/__init__.py -------------------------------------------------------------------------------- /src/template_package_name/configs/override/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | !README.md 4 | !template-experiment.yaml 5 | -------------------------------------------------------------------------------- /src/template_package_name/configs/override/README.md: 
-------------------------------------------------------------------------------- 1 | The configs in this directory should not be tracked by git. 2 | They are meant to be used as personalized overrides for the configs in the `configs` directory. 3 | You can use them temporarily for development (e.g., disable wandb, reduce the number of epochs, etc.) 4 | or to specify configurations specific to your machine (e.g., the number of GPUs to use). 5 | E.g., you could have an `override/setup.yaml` doing something like: 6 | 7 | ```yaml 8 | wandb: 9 | mode: disabled 10 | 11 | optim: 12 | num_epochs: 1 13 | ``` 14 | 15 | As done for `.../override/template-experiment.yaml` in `configs/template-experiment.yaml`, 16 | put the override config as the last one to be read by the experiment config (the last one in its defaults). 17 | It will override any variable set there. 18 | Remember to remove everything that's not hardware dependent for your reproducible runs, 19 | or even better maintain two different copies of the repo: 20 | one for development and one for unattended runs to avoid edits 21 | while you develop to be picked up by your unattended runs. 22 | -------------------------------------------------------------------------------- /src/template_package_name/configs/override/template-experiment.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | # The above line should appear in the override configs so that they sit at the root of the config tree. 3 | 4 | is_this_key_overridden: yes 5 | -------------------------------------------------------------------------------- /src/template_package_name/configs/setup.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # This config defines a common setup for experiments. 4 | # Add it to the defaults of any experiment config. 5 | 6 | # A random seed is generated for each run if you don't specify one.
7 | # This is good to avoid overfitting your dev runs to specific seeds. 8 | seed: ${generate_random_seed:} 9 | # Cuda deterministic settings. 10 | cuda_deterministic: false 11 | cuda_strong_deterministic: false 12 | 13 | # The root data and output directories to be used in experiments. 14 | data_dir: ${hydra:runtime.cwd}/data 15 | outputs_dir: ${hydra:runtime.cwd}/outputs 16 | 17 | # Outputs of an experiment are stored in a directory created by Hydra of the form 18 | # outputs/${outputs_subdir}/${hydra.job.name}/${job_subdir}/${now:%Y-%m-%d_%H-%M-%S-%f} 19 | # This has been designed to allow flexibility and can be configured in the sections below. 20 | 21 | # Outputs are saved in outputs/${outputs_subdir}/... 22 | # so you can store them in different physical locations if you want 23 | # by symlink-ing outputs_subdir to a different location. 24 | # outputs_subdir can also be useful to separate "dev" and "run" outputs. 25 | outputs_subdir: dev 26 | # Outputs of experiments generated by a script called job_name 27 | # will be saved in outputs/${outputs_subdir}/job_name/${job_subdir}/... 28 | # This can further be useful to tag experiments run from the same script. 29 | job_subdir: dev 30 | 31 | hydra: 32 | run: 33 | # Finally, this is where the outputs of an individual run will be stored. 34 | dir: outputs/${outputs_subdir}/${hydra.job.name}/${job_subdir}/runs/${now:%Y-%m-%d--%H-%M-%S-%f} 35 | job: 36 | chdir: true 37 | verbose: false # Set to true for logging at debug level. 38 | 39 | wandb: 40 | project: template-project-name 41 | mode: online 42 | anonymous: allow 43 | tags: 44 | - development 45 | run_id: null 46 | run_name: null 47 | # Make outputs/wandb a symlink if you want to save the wandb summaries elsewhere. 48 | # Same as with outputs_subdir. 49 | global_dir: ${hydra:runtime.cwd}/outputs 50 | use_global_dir: false # Otherwise, use the cwd of the experiment. 
51 | 52 | run_dir: ${hydra:run.dir} 53 | resuming_dir: null 54 | 55 | resuming: 56 | resume: false 57 | use_commit: false 58 | wandb_cache_bust: 0 # Limitation of wandb. Cannot create runs with the same ID if deleted previously. 59 | # Use this to refresh the id of the run and make it a "new" run. 60 | exclude_keys: # Can be a deep key e.g. model.optimizer.lr 61 | - run_dir 62 | - data_dir # To be able to resume by another user. 63 | - outputs_dir # To be able to resume by another user. 64 | - resuming_dir # To be able to force resume from anywhere. 65 | - wandb # To be able to move a run and resume it. 66 | - resuming.exclude_keys # To be able to add keys on the fly and force resume. 67 | -------------------------------------------------------------------------------- /src/template_package_name/configs/template-experiment.yaml: -------------------------------------------------------------------------------- 1 | # An example config file for an experiment. 2 | # Keep this, it's used as an example to run the code after a user installs the project. 3 | 4 | defaults: 5 | # Common setup. 6 | - setup 7 | # This file. 8 | - _self_ 9 | # Optional override (untracked by git, must not impact reproducibility). 10 | - optional override: template-experiment 11 | 12 | ###################################################################### 13 | 14 | some_arg: "some_default_value" 15 | some_number: 10 16 | n: 10 17 | is_this_key_overridden: no 18 | -------------------------------------------------------------------------------- /src/template_package_name/template_experiment.py: -------------------------------------------------------------------------------- 1 | """An example file to run an experiment. 2 | Keep this, it's used as an example to run the code after a user installs the project. 
3 | """ 4 | 5 | import logging 6 | from pathlib import Path 7 | from time import sleep 8 | 9 | import hydra 10 | import wandb 11 | from omegaconf import DictConfig 12 | 13 | from template_package_name import utils 14 | 15 | # Refers to utils for a description of resolvers 16 | utils.config.register_resolvers() 17 | 18 | # Hydra sets up the logger automatically. 19 | # https://hydra.cc/docs/tutorials/basic/running_your_app/logging/ 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | @hydra.main(version_base=None, config_path="configs", config_name="template-experiment") 24 | def main(config: DictConfig) -> None: 25 | # Using the template provides utilities for experiments: 26 | 27 | # 1. Setting up experiment and resuming directories 28 | config = utils.config.setup_config_and_resuming( 29 | config, postprocess_func=lambda x: x 30 | ) 31 | # The current working directory is a new directory unique to this run made by hydra, accessible by config.run_dir. 32 | # A resuming directory uniquely identified by the config (and optionally the git sha) 33 | # for storing checkpoints of the same experiment can be accessed via config.resuming.dir. 34 | # The current directory will be the resuming directory if config.resuming.resume is True else the run directory. 35 | # You can pass a postprocessing function to postprocess the config. 36 | 37 | # 2. Setting up wandb with resuming and the config logged. 38 | utils.config.setup_wandb(config) 39 | # Use a custom step key when you log so that you can resume logging anywhere. 40 | # For example, if the checkpoint is earlier than the last logged step in the crashed run, you can resume 41 | # from steps already logged, and they will be rewritten (with the same value assuming reproducibility). 42 | # E.g., wandb.log({"my_custom_step": i, "loss": loss}) 43 | 44 | # 3. Seeding for reproducibility 45 | utils.seeding.seed_everything(config) 46 | # Update this function whenever you have a library that needs to be seeded. 
47 | 48 | # Example experiment: 49 | checkpoints = sorted( 50 | Path.cwd().glob("checkpoint_*.txt"), key=lambda x: int(x.stem.split("_")[1]) 51 | ) 52 | if checkpoints: 53 | last_file = checkpoints[-1] 54 | logger.info(f"Resuming from {last_file.stem}") 55 | i = int(last_file.stem.split("_")[1]) + 1 56 | # Important: 57 | # When resuming, you should recover the state of the experiment as it was when it was interrupted. 58 | # I.e., the random state, the state of the model, the optimizer, etc. 59 | else: 60 | i = 0 61 | 62 | steps = 0 63 | while i < 30: 64 | # Compute and log i*n. 65 | logs = {"i": i, "y": i * config.n} 66 | print(logs) 67 | wandb.log(logs) 68 | 69 | # Checkpoint every 5 steps. 70 | if i % 5 == 0: 71 | with open(f"checkpoint_{i}.txt", "w") as f: 72 | f.write(f"y={logs['y']}") 73 | logger.info(f"Checkpointing at {i}") 74 | 75 | sleep(1) 76 | i += 1 77 | steps += 1 78 | 79 | # Preempt every 13 steps. 80 | if steps == 13: 81 | raise InterruptedError("Preempt after 13 steps.") 82 | 83 | logger.info("Finished writing files") 84 | 85 | 86 | if __name__ == "__main__": 87 | main() 88 | -------------------------------------------------------------------------------- /src/template_package_name/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from template_package_name.utils import config, seeding 2 | -------------------------------------------------------------------------------- /src/template_package_name/utils/config.py: -------------------------------------------------------------------------------- 1 | # Resolvers can be used in the config files. 2 | # https://omegaconf.readthedocs.io/en/latest/custom_resolvers.html 3 | # They are useful when you want to make the default values of some config variables 4 | # result from direct computation of other config variables. 
5 | # Only put variables meant to be edited by the user (as opposed to read-only variables described below) 6 | # and avoid making them too complicated, the point is not to write code in the config file. 7 | import logging 8 | import os 9 | import subprocess 10 | import sys 11 | from hashlib import blake2b 12 | from pathlib import Path 13 | 14 | import wandb 15 | from omegaconf import DictConfig, OmegaConf, omegaconf 16 | 17 | from template_package_name import utils 18 | 19 | # Hydra sets up the logger automatically. 20 | # https://hydra.cc/docs/tutorials/basic/running_your_app/logging/ 21 | _logger = logging.getLogger(__name__) 22 | 23 | 24 | def register_resolvers(): 25 | if not OmegaConf.has_resolver("eval"): 26 | # Useful to evaluate expressions in the config file. 27 | OmegaConf.register_new_resolver("eval", eval, use_cache=True) 28 | if not OmegaConf.has_resolver("generate_random_seed"): 29 | # Generate a random seed and record it in the config of the experiment. 30 | OmegaConf.register_new_resolver( 31 | "generate_random_seed", utils.seeding.generate_random_seed, use_cache=True 32 | ) 33 | 34 | 35 | def save_or_check_config(config: DictConfig, path: str) -> None: 36 | """ 37 | Save if it doesn't exist; otherwise (in case of resuming) assert that the 38 | config is the same. If they differ, log the differing key(s). 
39 | """ 40 | path_obj = Path(path) 41 | if not path_obj.exists(): 42 | OmegaConf.save(config, path_obj) 43 | return 44 | 45 | # Copy and remove excluded keys (in-place removal) from both new and existing config 46 | new_config = config.copy() 47 | existing_config = OmegaConf.load(path_obj) 48 | 49 | # Convert both configs to Python dictionaries 50 | OmegaConf.resolve(new_config) 51 | OmegaConf.resolve(existing_config) 52 | 53 | remove_excluded_keys(new_config, config.resuming.exclude_keys) 54 | remove_excluded_keys(existing_config, config.resuming.exclude_keys) 55 | 56 | new_config_dict = OmegaConf.to_container(new_config, resolve=True) 57 | existing_config_dict = OmegaConf.to_container(existing_config, resolve=True) 58 | 59 | # Compare dictionaries 60 | differences = dictionary_diff(new_config_dict, existing_config_dict) 61 | if differences: 62 | diff_msg = "\n".join(differences) 63 | _logger.error( 64 | f"Config to resume is different from the one saved in {path}.\n" 65 | f"Differences:\n{diff_msg}" 66 | ) 67 | raise AssertionError( 68 | f"Config differs from the existing config at {path}. See logs for details." 69 | ) 70 | 71 | _logger.info(f"Configs match the one in {path}. Resuming with the same config.") 72 | 73 | 74 | def remove_excluded_keys(config: DictConfig, exclude_keys: list[str]) -> None: 75 | """ 76 | Remove keys from the config that are specified in exclude_keys. 77 | Exclude keys can be specified as dot-paths, e.g., "key1.key2.key3". 78 | """ 79 | with omegaconf.open_dict(config): 80 | for key in exclude_keys: 81 | try: 82 | path_segments = key.split(".") 83 | node = config 84 | for segment in path_segments[:-1]: 85 | node = node[segment] # drill down 86 | del node[path_segments[-1]] # remove the final key 87 | except KeyError: 88 | pass 89 | 90 | 91 | def dictionary_diff(d1: dict, d2: dict, path: str = "") -> list[str]: 92 | """ 93 | Recursively compare two dictionary (or scalar) structures and return a list 94 | of human-readable differences. 
`path` is carried along to show the nested key path. 95 | """ 96 | differences = [] 97 | 98 | # If both are dict-like, compare keys and recurse 99 | if isinstance(d1, dict) and isinstance(d2, dict): 100 | all_keys = set(d1.keys()).union(d2.keys()) 101 | for key in all_keys: 102 | new_path = f"{path}.{key}" if path else key 103 | if key not in d1: 104 | differences.append(f"Missing in new config: {new_path}") 105 | elif key not in d2: 106 | differences.append(f"Missing in existing config: {new_path}") 107 | else: 108 | # Recurse 109 | differences.extend(dictionary_diff(d1[key], d2[key], new_path)) 110 | else: 111 | # If they are not both dicts, compare values directly 112 | if d1 != d2: 113 | differences.append( 114 | f"Value mismatch at '{path}': new='{d1}' vs existing='{d2}'" 115 | ) 116 | 117 | return differences 118 | 119 | 120 | def setup_resuming_dir(config): 121 | """Create a unique identifier of the experiment used to specify a resuming/checkpoint directory. 122 | The identifier is a hash of the config, excluding keys specified in config.resuming.exclude_keys. 123 | If config.resuming.use_commit is True, the commit hash is appended to the identifier. 124 | I.e. 
the checkpoint directory is defined by: the config - the excluded config keys + the commit hash (if specified) 125 | """ 126 | if config.resuming_dir is not None: 127 | return Path(config.resuming_dir), Path(config.resuming_dir).name 128 | 129 | resuming_hash = "" 130 | config_to_hash = config.copy() 131 | 132 | # resolve config 133 | OmegaConf.resolve(config_to_hash) 134 | remove_excluded_keys(config_to_hash, config.resuming.exclude_keys) 135 | config_hash = blake2b(str(config_to_hash).encode(), digest_size=8).hexdigest() 136 | resuming_hash += config_hash 137 | if config.resuming.use_commit: 138 | commit_hash = ( 139 | subprocess.check_output(["git", "rev-parse", "HEAD"]) 140 | .strip() 141 | .decode("utf-8") 142 | ) 143 | resuming_hash += f"-{commit_hash[:8]}" 144 | 145 | resuming_dir = Path.cwd().parent.parent / "checkpoints" / resuming_hash 146 | resuming_dir.mkdir(parents=True, exist_ok=True) 147 | with omegaconf.open_dict(config): 148 | config.resuming_dir = str(resuming_dir) 149 | config.resuming_hash = resuming_hash 150 | if config.resuming.resume: 151 | if config.wandb.run_id is None: 152 | config.wandb.run_id = config.resuming_hash 153 | if config.wandb.run_name is None: 154 | config.wandb.run_name = config.resuming_hash 155 | 156 | 157 | def setup_config_and_resuming(config, postprocess_func=None, logger=_logger): 158 | logger.info(f"Init directory: {Path.cwd()}") 159 | utils.config.setup_resuming_dir(config) 160 | logger.info(f"Run can be resumed from the directory: {config.resuming_dir}") 161 | if config.resuming.resume: 162 | os.chdir(config.resuming_dir) 163 | logger.info(f"Resuming from the directory: {Path.cwd()}") 164 | 165 | Path(f"config").mkdir(exist_ok=True, parents=True) 166 | utils.config.save_or_check_config( 167 | config, 168 | f"config/config-raw.yaml", 169 | ) 170 | 171 | # Do some optional postprocessing to the config (e.g., checking division of batch size etc.) 
172 | OmegaConf.resolve(config) 173 | if postprocess_func: 174 | config = postprocess_func(config) 175 | 176 | # Save the resolved config. 177 | utils.config.save_or_check_config(config, f"config/config-postprocessed.yaml") 178 | 179 | return config 180 | 181 | 182 | def setup_wandb(config, logger=_logger): 183 | wandb.init( 184 | id=config.wandb.run_id, 185 | name=config.wandb.run_name, 186 | resume="allow" if config.resuming.resume else "never", 187 | config=OmegaConf.to_container(config), 188 | project=config.wandb.project, 189 | tags=config.wandb.tags, 190 | mode=config.wandb.mode, 191 | anonymous=config.wandb.anonymous, 192 | dir=Path.cwd() if not config.wandb.use_global_dir else config.wandb.global_dir, 193 | ) 194 | 195 | # Re-log to capture log with wandb. 196 | logger.info(f"Running command: {subprocess.list2cmdline(sys.argv)}") 197 | logger.info(f"Init directory: {config.run_dir}") 198 | logger.info(f"Run can be resumed from the directory: {config.resuming_dir}") 199 | logger.info(f"Working directory: {Path.cwd()}") 200 | logger.info(f"Running with config: \n{OmegaConf.to_yaml(config)}") 201 | if config.resuming.resume: 202 | logger.info(f"Resuming from the directory: {Path.cwd()}") 203 | -------------------------------------------------------------------------------- /src/template_package_name/utils/seeding.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | # import numpy as np 5 | # import torch 6 | 7 | 8 | def generate_random_seed(): 9 | """Generate a random seed.""" 10 | return random.randint(0, 2**32 - 1) 11 | 12 | 13 | # Update this function whenever you have a library that needs to be seeded. 
14 | def seed_everything(config): 15 | """Seed all random generators.""" 16 | random.seed(config.seed) 17 | 18 | ## For numpy: 19 | # This is for legacy numpy: 20 | # np.random.seed(config.seed) 21 | # New code should make a Generator out of the config.seed directly: 22 | # https://numpy.org/doc/stable/reference/random/generated/numpy.random.seed.html 23 | 24 | ## For PyTorch: 25 | # torch.manual_seed(config.seed) 26 | 27 | # if config.cuda_deterministic: 28 | # # Higher (e.g., on CUDA too) reproducibility with deterministic algorithms: 29 | # # https://pytorch.org/docs/stable/notes/randomness.html 30 | # 31 | # # Not supported for all operations though: 32 | # # https://pytorch.org/docs/stable/generated/torch.use_deterministic_algorithms.html 33 | # if config.cuda_strong_deterministic: 34 | # torch.use_deterministic_algorithms(True) 35 | # 36 | # # A lighter version of the above otherwise as not all algorithms have a deterministic implementation 37 | # torch.backends.cudnn.deterministic = True 38 | # 39 | # # torch.backends.cudnn.benchmark = False 40 | # os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" 41 | -------------------------------------------------------------------------------- /template/README.md: -------------------------------------------------------------------------------- 1 | # Additional Details about the Template 2 | 3 | ## Reproducibility 4 | 5 | This template ensures the reproducibility of your results through 3 artifacts: 6 | 7 | 1. The development environment 8 | - Recorded in the Docker images that you upload and described in Dockerfile and environment files 9 | that you keep up to date with the Docker installation. 10 | - (Less reliably) described in the environment file that you keep up to date for the conda installation. 11 | 2. The project code. 12 | - Recorded in the git repository that you keep up to date. 
13 | - Made reproducible (to a desired degree) by you correctly seeding the random number generators and 14 | optionally removing non-deterministic operations, or replicable by running enough seeds. 15 | 3. The data, outputs, model weights and other artifacts. 16 | - Recorded and uploaded by you. 17 | - (Virtually) placed in the placeholder directories abstracting away the user storage system. 18 | 19 | ## Checkpointing 20 | 21 | The template provides an automatic setup of the checkpointing directory for an experiment. 22 | The unique identifier for the directory is created by hashing the config used and optionally the git commit sha. 23 | Running the same experiment with the same config will thus set its working directory to the same checkpoint directory 24 | every time (if the resuming option is enabled). 25 | 26 | To use this feature, pass `resuming.resume=True` and `resuming.use_commit=True` to your script using a Hydra config 27 | that inherits from the `setup.yaml` config file, like the `template_experiment.py` script. 28 | 29 | Even without using `resuming.use_commit=True`, the path to the checkpoint directory will be computed, and you could, 30 | for example, read from it. 31 | 32 | You can also force a resuming directory by passing `resuming_dir=` to your script. 33 | 34 | ### Compatibility with Weights & Biases 35 | 36 | For a non-sweep run, the run will have the id of the checkpoint directory as its wandb id, therefore your wandb run 37 | will stay the same and resume when your run is resumed. 38 | Make sure to use a custom step key when you log metrics so that you can have full control over when to start rewriting 39 | when you resume (E.g. if you checkpoint less often than you log, you may relog from the last checkpoint), otherwise 40 | the default step key of wandb will resume from the latest step and may be inconsistent with the checkpoint.
41 | 42 | For a sweep run, it already has an id from the sweep, so to resume it you should manually get its id and restart 43 | the script with the same arguments the sweep agent started it with; this way the config and the 44 | checkpoint directory will be the same 45 | (i.e. go to the wandb run UI, copy-paste the command it was run with and add `wandb.run_id=`). 46 | This is a limitation of the wandb sweep system. 47 | See [this issue.](https://github.com/wandb/wandb/issues/9143) 48 | 49 | ## Template Q&A 50 | 51 | ### I started my project from an older version of the template, how do I get updates? 52 | 53 | A project started from a template is different from a fork in that it is not (necessarily) meant to be updated. 54 | The template is free to change and evolve, and it is not guaranteed that it will be compatible with your project. 55 | 56 | Nevertheless, many changes are likely to be compatible with your project. 57 | In that case, there are two ways to incorporate them: 58 | 59 | 1. Manually copy the changes from the template to your project, adapting them when needed (different variable names especially). 60 | 2. Use git to merge the changes from the template to your project. 61 | ```bash 62 | git remote add template https://github.com/CLAIRE-Labo/python-ml-research-template.git 63 | git fetch template 64 | # Cherry pick the commits you want to merge, making sure they are compatible. 65 | # Add the option -n if you want to have the changes staged but not committed so you can edit them. 66 | git cherry-pick -x <commit-sha> 67 | ``` 68 | 69 | ### Why Docker? Why not just Conda? (At least for container-compatible hardware acceleration methods.) 70 | 71 | Conda environments are not so self-contained, some packages can just work on your machine because 72 | they use some of your system libraries not recorded in the conda environment.
73 | An exhaustive and precise list of the system libraries outside conda is hard to record, 74 | and the environment will not run on another machine missing those libraries. 75 | Reinforcement learning (RL) environments usually require system libraries 76 | not recorded by conda, and RL is a big part of our work. 77 | 78 | Moreover, the environment is specified as an `environment.yml` file description, 79 | and does not contain the dependencies themselves. 80 | Some dependencies may actually become unavailable when a user tries to download them later. 81 | 82 | Docker images are self-contained and do contain the dependencies themselves. 83 | 84 | ### Why is the template so complex, e.g., why does it include so many files? 85 | 86 | Part of the reason for the many files and extra code is to be able to provide a generic template 87 | that can be configured, extended, or shortened depending on the needs of the project. 88 | 89 | The other part of the apparent complexity probably comes from unfamiliarity with the tools and practices 90 | used in the template. 91 | These practices, however, (although usually not all combined in a research project, whence this template) 92 | are well established and have been proven to be very useful. 93 | 94 | For example, the `Dockerfile` seems complex because it leverages multi-staging to be very 95 | time and cache-efficient. 96 | Different build stages can run in parallel, so changing your build dependencies, 97 | or installing something in the Dockerfile will cause very few rebuilds. 98 | 99 | Using Docker Compose is also very convenient to define all the build arguments and multiple deployment options 100 | in a single file, avoiding long build and run commands. 101 | 102 | ### Why does the template have so many tools by default (e.g. `hydra`, `wandb`, `black`, etc.)? 103 | 104 | This template is mainly addressed to students and researchers at CLAIRE.
105 | Frequently, students are not aware of the tools and practices that are available to them until they face the problems 106 | we've all faced at some point in our career 107 | (how do I manage my configs? How do I conveniently log my metrics? etc.). 108 | We chose to include these tools by default to help students and researchers avoid these problems from the start, 109 | and to encourage them to use them. 110 | 111 | ### Can I fork a project that used the template and change its name? How do I do that? 112 | 113 | Yes, it seems like filling the `template/template-variables.env` file with your new project name and 114 | running `./template/change-project-name.sh` would work. 115 | 116 | ### Can I use this template for an already existing project? How do I do that? 117 | 118 | The template is mainly designed to start new projects, as it's hard to make assumptions on 119 | the structure of an existing project. 120 | However, it is possible to use it to transfer from an existing project. 121 | 122 | It's likely that your project is a bunch of Python files and a `requirements.txt` or `environment.yml` file. 123 | You can copy those files and potentially refactor the package structures and put all of them under `src`. 124 | You will also have to transfer the dependencies to the `environment.yml` file and identify the system dependencies. 125 | 126 | If your project provides a Docker image, the Docker installation method allows extending it, assuming 127 | it has a well-configured Python environment. 128 | 129 | In the worst case, you can keep the `installation/` directory if that's useful to you and replace all the rest with 130 | your project and adapt the installation as needed. 131 | You could also just get some inspiration from the template and do your own thing.
132 | -------------------------------------------------------------------------------- /template/change-project-name.sh: -------------------------------------------------------------------------------- 1 | # This script allows to replace the template variables with your project ones. 2 | set -eo pipefail 3 | source template/template-variables.env 4 | 5 | # Iterate through all files in the project except dot directories and this directory. 6 | for file in $(find . -type f -not -path './template/*' -not -path '*/\.*' -not -path '*/__*__/*' -not -path './outputs/*'); do 7 | # .deleteme is a trick to make sed work the same way on both Linux and OSX. 8 | # https://stackoverflow.com/questions/5694228/sed-in-place-flag-that-works-both-on-mac-bsd-and-linux 9 | sed -i.deleteme "s/${OLD_PROJECT_NAME}/${NEW_PROJECT_NAME}/g" "${file}" 10 | sed -i.deleteme "s/${OLD_PACKAGE_NAME}/${NEW_PACKAGE_NAME}/g" "$file" 11 | sed -i.deleteme "s/python=${OLD_PYTHON_VERSION}/python=${NEW_PYTHON_VERSION}/g" "$file" 12 | sed -i.deleteme "s/python${OLD_PYTHON_VERSION}/python${NEW_PYTHON_VERSION}/g" "$file" 13 | sed -i.deleteme "s/Python ${OLD_PYTHON_VERSION}/Python ${NEW_PYTHON_VERSION}/g" "$file" 14 | sed -i.deleteme "s/requires-python = \">=${OLD_PYTHON_VERSION}\"/requires-python = \">=${NEW_PYTHON_VERSION}\"/g" "$file" 15 | # Delete the .deleteme file if it exists. 16 | rm -f "$file.deleteme" 17 | done 18 | 19 | if [ "${NEW_PACKAGE_NAME}" != "${OLD_PACKAGE_NAME}" ]; then 20 | mv "src/${OLD_PACKAGE_NAME}" src/"${NEW_PACKAGE_NAME}" 21 | fi 22 | -------------------------------------------------------------------------------- /template/template-variables.env: -------------------------------------------------------------------------------- 1 | # The PROJECT_NAME and PACKAGE_NAME can be the same (replacing hyphens with underscores). 2 | # This is the safest and best option. 3 | # Though, you can make them different if you need. 
4 | 5 | # The variables prefixed with $OLD_ are for the current state of the template 6 | # (correctly filled at the start, can be replaced with the project name later to change it). 7 | # The variables prefixed with $NEW_ are for the state it will be after the script is run. 8 | 9 | # This is the distribution name, used to `pip install PROJECT_NAME`. 10 | # $NEW_PROJECT_NAME will replace $OLD_PROJECT_NAME in all the template. 11 | OLD_PROJECT_NAME="template-project-name" 12 | NEW_PROJECT_NAME="template-project-name" # Hyphen or underscore as word separator. 13 | 14 | # This is the package name used by `import PACKAGE_NAME`. 15 | # $NEW_PACKAGE_NAME will replace $OLD_PACKAGE_NAME in all the template. 16 | OLD_PACKAGE_NAME="template_package_name" 17 | NEW_PACKAGE_NAME="template_package_name" # Underscore as word separator. 18 | 19 | # Will be the Python version used in the environment (if it doesn't have Python already). 20 | # Will be the minimum version of Python supported by the project. 21 | # If you're basing your work on an already existing docker image, use the version in that image. 22 | # $NEW_ will replace "python=$OLD_PYTHON_VERSION", "python$OLD_PYTHON_VERSION", 23 | # "Python $OLD_PYTHON_VERSION", and "requires-python = \">=$OLD_PYTHON_VERSION\"" in all the template. 24 | OLD_PYTHON_VERSION="3.10" 25 | NEW_PYTHON_VERSION="3.10" 26 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | Placeholder. 2 | --------------------------------------------------------------------------------