├── .editorconfig ├── .github └── workflows │ ├── pandoc.yml │ └── shell_check.yml ├── .gitignore ├── LICENSE ├── README.md ├── README.pdf ├── cleanup.sh ├── config.sh ├── include.sh ├── prepare.sh ├── run.sh └── slurm_utils.sh /.editorconfig: -------------------------------------------------------------------------------- 1 | # More info: https://editorconfig.org 2 | 3 | # top-most EditorConfig file 4 | root = true 5 | 6 | # Use space and 4 indentation style everywhere. 7 | # Also add a newline at the end of every file. 8 | [*.sh] 9 | charset = utf-8 10 | end_of_line = lf 11 | insert_final_newline = true 12 | trim_trailing_whitespace = true 13 | indent_style = space 14 | indent_size = 4 15 | 16 | # YAML files use 2 space indentation for now. 17 | [*.{yml,md}] 18 | indent_style = space 19 | indent_size = 2 20 | 21 | # Makefiles require tabs 22 | [Makefile] 23 | indent_style = tab 24 | -------------------------------------------------------------------------------- /.github/workflows/pandoc.yml: -------------------------------------------------------------------------------- 1 | name: Create README.pdf 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'README.md' 7 | 8 | jobs: 9 | publish_readme_pdf: 10 | if: ${{ github.ref != 'refs/heads/master' }} 11 | name: README as pdf 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout 15 | uses: actions/checkout@v2 16 | - name: Convert 17 | uses: docker://pandoc/latex:2.14.2 18 | with: 19 | args: >- 20 | --output=README.pdf 21 | README.md 22 | - name: Push the README 23 | env: 24 | GITHUB_TOKEN: ${{ secrets.BOT_TOKEN }} 25 | run: | 26 | git config user.name 'ginkgo-bot' 27 | git config user.email 'ginkgo.library@gmail.com' 28 | git add README.pdf 29 | git commit -m "[bot] Update README.pdf" 30 | git pull --rebase 31 | git push 32 | -------------------------------------------------------------------------------- /.github/workflows/shell_check.yml: -------------------------------------------------------------------------------- 1 | name: "Shellcheck" 2 | on: [push, pull_request] 3 | 4 | jobs: 5 | check_script_files: 6 | runs-on: ubuntu-latest 7 | 8 | steps: 9 | - uses: actions/checkout@v2 10 | - name: "Shellcheck" 11 | run: shellcheck --check-sourced *.sh 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .idea 3 | *.log 4 | tmp/ 5 | 6 | .dir-locals.el 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2021, the Ginkgo Project, NHR@KIT Cx Project 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions 6 | are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its 16 | contributors may be used to endorse or promote products derived from 17 | this software without specific prior written permission. 
18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 20 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 22 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | --- 3 | header-includes: | 4 | \usepackage{fancyhdr} 5 | \pagestyle{fancy} 6 | \hypersetup{colorlinks=true, 7 | linkcolor=blue, 8 | allbordercolors={0 0 0}, 9 | pdfborderstyle={/S/U/W 1}} 10 | --- 11 |
12 | 13 | # GitLab runner for HPC systems 14 | 15 | In rootless mode, by relying on ENROOT and SLURM. 16 | 17 | 1. [Overview](#overview) 18 | 1. [Purpose and Features](#purpose-and-features) 19 | 2. [Dependencies](#dependencies) 20 | 3. [Code Structure](#code-structure) 21 | 4. [Configuration Variables](#configuration-variables) 22 | 1. [Global Options](#global-options) 23 | 2. [SLURM Behavior](#slurm-behavior) 24 | 2. [Installation](#installation) 25 | 1. [Installing a gitlab-runner](#installing-a-gitlab-runner) 26 | 2. [Enroot and Cluster Setup](#enroot-and-cluster-setup) 27 | 3. [Volume mounting and Ccache setup](#volume-mounting-and-ccache-setup) 28 | 3. [Usage Example](#usage-example) 29 | 4. [License](#license) 30 | 5. [Links](#links) 31 | 32 | ## Overview 33 | 34 | ### Purpose and Features 35 | 36 | This set of scripts aims at enabling user-level (no root access required) Continuous Integration on HPC clusters by relying on Gitlab runner's [custom executors][gitlab-custom-executors], [ENROOT][enroot-nvidia] as a rootless container solution replacement for docker and the SLURM job scheduler when using computing nodes. It also optionally supports [Ccache][ccache-website] to speed up compilation times on CI jobs. This tool was inspired by the [NHR@KIT Cx Project][nhr-kit-cx] which provides ENROOT and GitLab-runner on their clusters. It is used in production in some of the [Ginkgo Software][ginkgo-software]'s [pipelines][ginkgo-pipelines]. 37 | 38 | SLURM usage is optional in this set of scripts as it is considered that many of the simple CI steps, such as compilation, will happen on a login node to optimize computing time and resource sharing. Currently, the script uses non-interactive job submission and waiting loops to ensure the correct completion of the job on the cluster. 39 | 40 | A typical use case for this series of scripts is the following: 41 | 42 | + build: configure, build, install all the software on the login node. 43 | + test: reuse the previous container to launch tests on a compute node with device access through SLURM. 44 | + benchmark: also reuse the previous container to launch a benchmarking job on a compute node with device access through SLURM, then delete the container. 45 | 46 | See the [usage example](#usage-example) for concrete details. 47 | 48 | ### Dependencies 49 | 50 | There are several standard Linux commands used on top of ENROOT and SLURM commands. For some commands, the script can rely on non-standard/GNU-only options. This is for now not optimized. 51 | 52 | Always required: 53 | 54 | + Gitlab runner (user mode) 55 | + Enroot 56 | + Flock 57 | + Bash 58 | + grep 59 | 60 | With SLURM: 61 | 62 | + sacct, squeue, scancel, sbatch, srun 63 | + GNU ls, wc, head, tr, cut, awk, ... 64 | + option extglob 65 | 66 | ### Code Structure 67 | 68 | The code structure is simple, there are the standard GitLab-runner custom executor scripts: 69 | 70 | + `config.sh`: describes the executor; 71 | + `prepare.sh`: prepares the enroot container from a docker image, uses an image cache, optionally reuses existing container instead; 72 | + `run.sh`: either directly runs the GitLab commands on a local enroot container, or submits a job that executes everything in bulk; 73 | + `cleanup.sh`: delete the container if not requested otherwise, cleanup the SLURM job if needed. 
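For orientation, the following is a simplified sketch (not the runner's actual internals) of how `gitlab-runner` drives these four scripts. The "generated script path, then stage name" argument order is the one `run.sh` expects; the stage list is abbreviated and the paths are placeholders:

``` sh
#!/usr/bin/env bash
# Simplified custom-executor flow; gitlab-runner generates the per-stage
# scripts itself, the paths below are only placeholders.
./config.sh                     # prints the driver description as JSON
./prepare.sh                    # creates (or reuses) the enroot container
for stage in prepare_script get_sources step_script after_script \
             upload_artifacts_on_success cleanup_file_variables; do
    # run.sh receives the generated script as the second-to-last argument
    # and the stage name as the last argument
    ./run.sh "/tmp/generated/${stage}" "${stage}" || break
done
./cleanup.sh                    # removes the container, cancels leftover SLURM jobs
```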
74 | 
75 | The main configuration variables and functions are defined in the following files:
76 | 
77 | + `include.sh`: contains the main (non-SLURM) configuration options;
78 | + `slurm_utils.sh`: contains most of the functions and configuration needed for the SLURM functionality.
79 | 
80 | ### Configuration Variables
81 | 
82 | The following variables control some aspects of the scripts' functionality. They can be set as job variables in the CI script or on the GitLab web pages. Inside these scripts, they are accessed with the prefix `CUSTOM_ENV_`, e.g. `${CUSTOM_ENV_CI_WS}`.
83 | 
84 | #### Global Options
85 | 
86 | These variables are not SLURM specific and can be used in the default, ENROOT-only mode.
87 | 
88 | + `CI_JOB_IMAGE` (YAML script `image:` option): a standard docker image for enroot to instantiate a container from. If it is hosted on GitLab (`CI_REGISTRY` is set), it will be accessed via the default token `CI_REGISTRY_PASSWORD`.
89 | + `CI_WS`: a directory with shared data access across all nodes, used as a workspace.
90 | 
91 | Optional:
92 | 
93 | + `USE_NAME`: instead of an automatically generated name, use a specific name for the container (and SLURM job if applicable). This name needs to be unique! When not specified, the name will be `GitLabRunnerEnrootExecutorID${CUSTOM_ENV_CI_JOB_ID}`.
94 | + `NVIDIA_VISIBLE_DEVICES`: a value passed to the enroot container to control NVIDIA device visibility. When no GPU is available or used, `void` should be passed.
95 | + `CCACHE_MAXSIZE`: sets a custom maximum size for the Ccache directory.
96 | + `KEEP_CONTAINER`: when set to a non-zero value, the container is not deleted after usage, except if an error occurred.
97 | + `ENROOT_REMAP_ROOT`: when set to a non-zero value, adds the `--root` option to `enroot start`.
98 | 
99 | Volumes:
100 | 
101 | + `VOL_NUM`: sets the number of volumes configured to be mounted in the container.
102 | + `VOL_1_SRC`: sets the source directory (on the cluster) for the first volume.
103 | + `VOL_1_DST`: sets the destination directory (in the container) for the first volume.
104 | 
105 | #### SLURM Behavior
106 | 
107 | When any of these variables is set, the job is submitted through SLURM instead of running the container directly on the node where `gitlab-runner` is running. These variables control the SLURM job submission and related behavior.
108 | 
109 | + `SLURM_PARTITION`: the value of the SLURM `--partition` parameter, e.g., `gpu`.
110 | + `SLURM_EXCLUSIVE`: when non-zero, adds the SLURM `--exclusive` parameter.
111 | + `SLURM_TIME`: the value of the SLURM `--time` parameter, e.g. `0:30:00`.
112 | + `SLURM_GRES`: the value of the SLURM `--gres` parameter.
113 | + `SLURM_ACCOUNT`: the value of the SLURM `--account` parameter.
114 | + `USE_SLURM`: if none of the other SLURM variables is set, setting this still enables SLURM execution for this job.
115 | 
116 | These variables control the SLURM job waiting loop behavior:
117 | 
118 | + `SLURM_UPDATE_INTERVAL`: the sleeping time, in seconds, between two job status checks (default: 120).
119 | + `SLURM_PENDING_LIMIT`: the maximum time to wait for a pending job to start; the default is 12 hours.
120 | + `SLURM_TIME`: when specified, the running-time waiting limit is set to that value; the default is 24 hours.
121 | 
122 | ## Installation
123 | The instructions are for a standard Linux system that already supports running the GitLab runner in user mode and has enroot installed (see [dependencies](#dependencies)). Also, refer to the [NHR@KIT CI user documentation][nhr-kit-cx], which details this setup on their systems.
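Before installing anything, the presence of the required tools can be checked on the login node with a short snippet like the following (a suggestion, not part of this repository's scripts):

``` sh
# Check the always-required tools from the Dependencies section.
for tool in gitlab-runner enroot flock bash grep; do
    command -v "${tool}" >/dev/null || echo "missing: ${tool}"
done
# When SLURM mode will be used, also check the scheduler commands.
for tool in sacct squeue scancel sbatch srun; do
    command -v "${tool}" >/dev/null || echo "missing (SLURM mode): ${tool}"
done
```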
124 | 
125 | ### Installing a gitlab-runner
126 | 
127 | The standard `gitlab-runner install` and `gitlab-runner register` commands can be used; when registering, make sure to select the `custom` executor, see the [gitlab runner registration documentation][gitlab-runner-install]. Here is an example of what a runner configuration can look like, usually found in `~/.gitlab-runner/config.toml`:
128 | 
129 | ``` toml
130 | [[runners]]
131 |   name = "enroot executor"
132 |   url = "https://gitlab.com"
133 |   token = ""
134 |   executor = "custom"
135 |   builds_dir = "/workspace/scratch/my-ci-project/gitlab-runner/builds/"
136 |   cache_dir = "/workspace/scratch/my-ci-project/gitlab-runner/cache/"
137 |   environment = ["CI_WS=/workspace/scratch/my-ci-project",
138 |     "VOL_1_SRC=/workspace/scratch/my-ci-project/ccache", "VOL_1_DST=/ccache",
139 |     "VOL_2_SRC=/workspace/scratch/my-ci-project/test_data", "VOL_2_DST=/test_data",
140 |     "VOL_NUM=2", "CCACHE_MAXSIZE=40G"]
141 |   [runners.custom_build_dir]
142 |     enabled = false
143 |   [runners.custom]
144 |     config_exec = "//gitlab-hpc-ci-cb/config.sh"
145 |     prepare_exec = "//gitlab-hpc-ci-cb/prepare.sh"
146 |     run_exec = "//gitlab-hpc-ci-cb/run.sh"
147 |     cleanup_exec = "//gitlab-hpc-ci-cb/cleanup.sh"
148 | ```
149 | 
150 | ### Enroot and Cluster Setup
151 | On machines using `systemd` and `logind`, enable lingering for your user so that the gitlab-runner daemon keeps running after you log off: `loginctl enable-linger ${USER}`. To check that the property is active, use the command `loginctl show-user $USER --property=Linger`, which should output `Linger=yes`.
152 | 
153 | As detailed in [global options](#global-options), the environment variable `CI_WS` must be set, either in the runner configuration or in the script, to a directory used as a workspace for storing enroot containers, caches, and more.
154 | 
155 | Once the new GitLab runner has been configured, lingering is enabled, and the other cluster setup steps are finished, start your runner in user mode with the following commands on a `systemd`-based system:
156 | 
157 | ``` sh
158 | # Enable your own gitlab-runner and start it up
159 | systemctl --user enable --now gitlab-runner
160 | # Check that the gitlab runner is running
161 | systemctl --user status gitlab-runner
162 | ```
163 | 
164 | ### Volume mounting and Ccache setup
165 | A generic volume mounting interface is provided. This is useful for Ccache support but can be used for other purposes as well. It is configured through multiple environment variables:
166 | 1. `VOL_NUM` specifies the number of volumes configured.
167 | 2. `VOL_1_SRC` is the volume source (on the cluster), e.g. `${CI_WS}/ccache`.
168 | 3. `VOL_1_DST` is the volume destination (in the container), e.g. `/ccache`.
169 | 
170 | 
171 | A full example is available in [Installing a gitlab-runner](#installing-a-gitlab-runner).
172 | 
173 | ## Usage Example
174 | 
175 | Assuming that `default_build` contains the commands for compiling your software in the required setting and `default_test` contains the equivalent of `make test` (a hypothetical sketch of both anchors follows the list below), the following gitlab-ci YAML configuration will:
176 | 
177 | + `my_build_job`: build the software on the node running gitlab-runner (no SLURM) and keep the container's state for the next job;
178 | + `slurm_test_job`: test the software on a compute node on the `gpu` SLURM partition with one GPU and a time limit of 30 minutes, then delete the container (no `KEEP_CONTAINER` is set).
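Both jobs pull in the anchors via `<<: *default_build` and `<<: *default_test`. Their definitions are not part of this repository; a minimal, hypothetical sketch of what such hidden jobs could look like in the same `.gitlab-ci.yml` (the actual commands depend on your project and build system):

``` yaml
# Hypothetical hidden jobs providing the anchors merged into the jobs below;
# adjust the commands to your own build system.
.build_template: &default_build
  script:
    - mkdir -p build && cd build
    - cmake .. && make -j$(nproc)
    - make install

.test_template: &default_test
  script:
    - cd build
    - make test
```

With the merge key (`<<:`), each job copies the template's `script` section and only adds its own `variables`, `dependencies`, and `tags`.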
179 | 180 | 181 | Note that this works because both use the same custom name `simple_hpc_ci_job`, which needs to be unique, but shared among the jobs of the same pipeline. 182 | 183 | 184 | ``` yaml 185 | stages: 186 | - build 187 | - test 188 | 189 | my_build_job: 190 | image: ubuntu:xenial 191 | stage: build 192 | <<: *default_build 193 | variables: 194 | USE_NAME: "simple_hpc_ci_job" 195 | KEEP_CONTAINER: "ON" 196 | NVIDIA_VISIBLE_DEVICES: "void" 197 | tags: 198 | - my_enroot_runner 199 | 200 | slurm_test_job: 201 | image: ubuntu:xenial 202 | stage: test 203 | <<: *default_test 204 | variables: 205 | USE_NAME: "simple_hpc_ci_job" 206 | SLURM_PARTITION: "gpu" 207 | SLURM_EXCLUSIVE: "ON" 208 | SLURM_GRES: "gpu:1" 209 | SLURM_TIME: "00:30:00" 210 | dependencies: [ "my_build_job" ] 211 | tags: 212 | - my_enroot_runner 213 | ``` 214 | 215 | #### `after_script` 216 | The `after_script` step is never executed inside a SLURM job, but always 217 | directly executed instead. It is assumed that this script is only used for 218 | cleanup or similar purpose. 219 | 220 | ## License 221 | Licensed under the [BSD 3-Clause license]. 222 | 223 | ## Links 224 | * [NHR@KIT CI User documentation][nhr-kit-cx] 225 | * [Gitlab runner's custom executors][gitlab-custom-executors] 226 | * [Gitlab runner custom executor examples](https://docs.gitlab.com/runner/executors/custom_examples/) 227 | 228 | [gitlab-custom-executors]: https://docs.gitlab.com/runner/executors/custom.html 229 | [gitlab-runner-install]: https://docs.gitlab.com/runner/register/index.html 230 | [enroot-nvidia]: https://github.com/NVIDIA/enroot 231 | [ccache-website]: https://ccache.dev/ 232 | [nhr-kit-cx]: https://www.nhr.kit.edu/userdocs/ci/ 233 | [ginkgo-software]: https://github.com/ginkgo-project/ginkgo 234 | [ginkgo-pipelines]: https://gitlab.com/ginkgo-project/ginkgo-public-ci/-/pipelines 235 | [BSD 3-Clause license]: LICENSE 236 | -------------------------------------------------------------------------------- /README.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ginkgo-project/gitlab-hpc-ci-cb/5a71b78b9acb29f0afa0fddb4d8f9813199f56fd/README.pdf -------------------------------------------------------------------------------- /cleanup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # https://docs.gitlab.com/runner/executors/custom.html#cleanup 3 | 4 | # shellcheck source=./include.sh 5 | source "${BASH_SOURCE[0]%/*}/include.sh" 6 | 7 | 8 | ensure_executable_available enroot 9 | 10 | 11 | # Take care of slurm cleanup if needed 12 | if [[ -f "${SLURM_IDS_PATH}/${CONTAINER_NAME}.txt" ]]; then 13 | ensure_executable_available scancel 14 | ensure_executable_available squeue 15 | 16 | USE_SLURM=1 17 | JOBID=$(cat "${SLURM_IDS_PATH}/${CONTAINER_NAME}.txt") 18 | rm "${SLURM_IDS_PATH}/${CONTAINER_NAME}.txt" # not needed anymore 19 | # If the job isn't finished yet, we still need to cancel it 20 | scancel --quiet "${JOBID}" 21 | fi 22 | 23 | # Somehow, the work dir is leftover, that can indicate a job cancellation. 24 | WORK_DIR="${CI_WS}/${CONTAINER_NAME}" 25 | if [[ -d "${WORK_DIR}" ]]; then 26 | rm -rf "${WORK_DIR}" 27 | fi 28 | 29 | # Delete container root filesystems if it isn't asked to be preserved or there 30 | # was an error in one of the previous step. 
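# The block below appends a short cleanup report (job and pipeline IDs, whether
# the container was removed or kept, the remaining containers and, in SLURM
# mode, the user's queue) to the shared ${LOGFILE}.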
31 | { 32 | echo -e "===============================" 33 | echo -e "Job: ${CUSTOM_ENV_CI_JOB_ID}" 34 | echo -e "Job started at: ${CUSTOM_ENV_CI_JOB_STARTED_AT}" 35 | echo -e "Pipeline: ${CUSTOM_ENV_CI_PIPELINE_ID}" 36 | if [[ -z "${CUSTOM_ENV_KEEP_CONTAINER}" ]]; then 37 | echo -e "Cleaning up container ${CONTAINER_NAME}" 38 | enroot remove --force -- "${CONTAINER_NAME}" 39 | else 40 | echo -e "Keeping container ${CONTAINER_NAME}" 41 | fi 42 | 43 | enroot list --fancy 44 | if [[ "${USE_SLURM}" == 1 ]]; then 45 | squeue -u "${USER}" 46 | fi 47 | } >> "${LOGFILE}" 48 | -------------------------------------------------------------------------------- /config.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # https://docs.gitlab.com/runner/executors/custom.html#config 3 | 4 | # shellcheck source=./include.sh 5 | source "${BASH_SOURCE[0]%/*}/include.sh" 6 | 7 | # Sometimes you might want to set some settings during execution time. 8 | # For example settings a build directory depending on the project ID. 9 | # config_exec reads from STDOUT and expects a valid JSON string with specific keys. 10 | 11 | cat <<'EOF' 12 | { 13 | "driver": { 14 | "name": "ENROOT (SLURM) driver", 15 | "version": "v1.0.0" 16 | } 17 | } 18 | EOF 19 | -------------------------------------------------------------------------------- /include.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Include the slurm utility functions 4 | # shellcheck source=./slurm_utils.sh 5 | source "${BASH_SOURCE[0]%/*}/slurm_utils.sh" 6 | 7 | # Do NOT touch these and make sure they are the same as local environment 8 | # variables!! Otherwise, there can be *duplicate* locations and local containers 9 | # will not see the same as gitlab containers! 10 | export CI_WS="${CUSTOM_ENV_CI_WS}" 11 | export LOGFILE="${CI_WS}/gitlab-runner-enroot.log" 12 | export ENROOT_CACHE_PATH="${CI_WS}/ENROOT_CACHE" 13 | export ENROOT_DATA_PATH="${CI_WS}/ENROOT_DATA" 14 | export SLURM_IDS_PATH="${CI_WS}/SLURM_IDS" 15 | 16 | 17 | # Set a variable CONTAINER_NAME based on the environment variable 18 | # CUSTOM_ENV_USE_NAME 19 | if [[ -z "${CUSTOM_ENV_USE_NAME}" ]]; then 20 | CONTAINER_NAME="GitLabRunnerEnrootExecutorID${CUSTOM_ENV_CI_JOB_ID}" 21 | else 22 | CONTAINER_NAME="${CUSTOM_ENV_USE_NAME}" 23 | fi 24 | export CONTAINER_NAME 25 | 26 | 27 | # Ccache and volume management 28 | ENROOT_MOUNT_OPTIONS=() 29 | if [[ -n "${CUSTOM_ENV_VOL_NUM}" ]]; then 30 | for i in $(seq 1 "${CUSTOM_ENV_VOL_NUM}"); do 31 | VOL_SRC="CUSTOM_ENV_VOL_${i}_SRC" 32 | VOL_DST="CUSTOM_ENV_VOL_${i}_DST" 33 | if [[ ! 
-e "${!VOL_SRC}" ]]; then 34 | mkdir -p "${!VOL_SRC}" 35 | fi 36 | ENROOT_MOUNT_OPTIONS+=("--mount ${!VOL_SRC}:${!VOL_DST}") 37 | done 38 | fi 39 | export ENROOT_MOUNT_OPTIONS 40 | 41 | if [[ -n "${CUSTOM_ENV_ENROOT_REMAP_ROOT}" ]]; then 42 | ENROOT_REMAP_ROOT="--root" 43 | export ENROOT_REMAP_ROOT 44 | fi 45 | 46 | # Propagate these environment variables to the container 47 | PROPAGATED_ENV_VARIABLES=(BENCHMARK 48 | DRY_RUN 49 | EXECUTOR 50 | REPETITIONS 51 | SOLVER_REPETITIONS 52 | SEGMENTS 53 | SEGMENT_ID 54 | PRECONDS 55 | FORMATS 56 | ELL_IMBALANCE_LIMIT 57 | SOLVERS 58 | SOLVERS_PRECISION 59 | SOLVERS_MAX_ITERATIONS 60 | SOLVERS_GMRES_RESTART 61 | SYSTEM_NAME 62 | DEVICE_ID 63 | SOLVERS_JACOBI_MAX_BS 64 | BENCHMARK_PRECISION 65 | SOLVERS_RHS 66 | SOLVERS_RHS_FLAG 67 | SOLVERS_INITIAL_GUESS 68 | GPU_TIMER 69 | DETAILED 70 | MATRIX_LIST_FILE 71 | NVIDIA_VISIBLE_DEVICES 72 | CCACHE_DIR 73 | CCACHE_MAXSIZE 74 | ) 75 | # shellcheck disable=SC2048 76 | for bench_var in ${PROPAGATED_ENV_VARIABLES[*]}; do 77 | check_var="CUSTOM_ENV_${bench_var}" 78 | if [[ -n "${!check_var}" ]]; then 79 | ENROOT_ENV_CONFIG+=("-e ${bench_var}=${!check_var}") 80 | fi 81 | done 82 | export ENROOT_ENV_CONFIG 83 | 84 | # SLURM configuration variables. 85 | # 86 | # If the user sets any slurm variable or the variable USE_SLURM, this container 87 | # will use slurm job submission 88 | USE_SLURM=${CUSTOM_ENV_USE_SLURM} 89 | if [[ -z "${USE_SLURM}" || ${USE_SLURM} -ne 0 ]]; then 90 | SUPPORTED_SLURM_VARIABLES=(SLURM_PARTITION 91 | SLURM_EXCLUSIVE 92 | SLURM_TIME 93 | SLURM_GRES 94 | SLURM_ACCOUNT 95 | SLURM_UPDATE_INTERVAL 96 | SLURM_PENDING_LIMIT 97 | SLURM_RUNNING_LIMIT 98 | USE_SLURM) 99 | # shellcheck disable=SC2048 100 | for slurm_var in ${SUPPORTED_SLURM_VARIABLES[*]}; do 101 | check_var="CUSTOM_ENV_${slurm_var}" 102 | if [[ -n "${!check_var}" ]]; then 103 | USE_SLURM=1 104 | fi 105 | done 106 | fi 107 | export USE_SLURM 108 | # variables from slurm_utils we need to expose outside 109 | export SLURM_UPDATE_INTERVAL 110 | export SLURM_PENDING_LIMIT 111 | export SLURM_RUNNING_LIMIT 112 | export SLURM_GOOD_COMPLETED_STATUS 113 | export SLURM_GOOD_PENDING_STATUS 114 | export SLURM_BAD_STATUS 115 | 116 | 117 | function ensure_executable_available() { 118 | local command=${1} 119 | 120 | if ! type -p "${command}" >/dev/null 2>/dev/null; then 121 | die "No ${command} executable found" 122 | fi 123 | } 124 | -------------------------------------------------------------------------------- /prepare.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # https://docs.gitlab.com/runner/executors/custom.html#prepare 3 | 4 | # shellcheck source=./include.sh 5 | source "${BASH_SOURCE[0]%/*}/include.sh" 6 | 7 | 8 | ensure_executable_available enroot 9 | ensure_executable_available flock 10 | ensure_executable_available grep 11 | 12 | 13 | # Create CI WorkSpace paths if they don't exist 14 | if [[ ! -d "${ENROOT_CACHE_PATH}" ]]; then 15 | mkdir -p "${ENROOT_CACHE_PATH}" 16 | fi 17 | 18 | if [[ ! -d "${ENROOT_DATA_PATH}" ]]; then 19 | mkdir -p "${ENROOT_DATA_PATH}" 20 | fi 21 | 22 | if [[ ! -d "${SLURM_IDS_PATH}" ]]; then 23 | mkdir -p "${SLURM_IDS_PATH}" 24 | fi 25 | 26 | 27 | # Reuse a container if it exists 28 | # shellcheck disable=SC2143 29 | if ! [[ $(enroot list | grep "${CONTAINER_NAME}") ]]; then 30 | echo -e "Preparing the container ${CONTAINER_NAME}." 
31 | 32 | # Check if CI job image: is set 33 | if [[ -z "${CUSTOM_ENV_CI_JOB_IMAGE}" ]]; then 34 | die "No CI job image specified" 35 | fi 36 | 37 | # Import a container image from a specific location to enroot image dir 38 | # Scheme: docker://[USER@][REGISTRY#]IMAGE[:TAG] 39 | IMAGE_DIR="${ENROOT_DATA_PATH}" 40 | if [[ ${CUSTOM_ENV_CI_JOB_IMAGE} == "${CUSTOM_ENV_CI_REGISTRY}"* ]]; then 41 | # shellcheck disable=SC2295 42 | URL="docker://${CUSTOM_ENV_CI_REGISTRY_USER}:${CUSTOM_ENV_CI_REGISTRY_PASSWORD}@${CUSTOM_ENV_CI_REGISTRY}#${CUSTOM_ENV_CI_JOB_IMAGE#*$CUSTOM_ENV_CI_REGISTRY/}" 43 | else 44 | URL="docker://${CUSTOM_ENV_CI_JOB_IMAGE}" 45 | fi 46 | IMAGE_NAME="${CUSTOM_ENV_CI_JOB_IMAGE//[:@#.\/]/-}" 47 | # Utility timestamp and lock files 48 | IMAGE_TIMESTAMP_FILE=${IMAGE_DIR}/TIMESTAMP_${IMAGE_NAME} 49 | IMAGE_LOCK_FILE=${IMAGE_DIR}/LOCK_${IMAGE_NAME} 50 | 51 | # Update the image once every 3 hours. Use a lock to prevent conflicts 52 | exec 100<>"${IMAGE_LOCK_FILE}" 53 | flock -w 120 100 54 | if [[ ! -f ${IMAGE_TIMESTAMP_FILE} || 55 | ($(cat "${IMAGE_TIMESTAMP_FILE}") -le $(date +%s -d '-3 hours')) ]]; then 56 | IMAGE_FILE="${IMAGE_DIR}/${IMAGE_NAME}.sqsh" 57 | if [[ -f ${IMAGE_FILE} ]]; then 58 | rm "${IMAGE_DIR}/${IMAGE_NAME}.sqsh" 59 | fi 60 | 61 | COMMAND=(enroot import \ 62 | --output "${IMAGE_DIR}/${IMAGE_NAME}.sqsh" \ 63 | -- "${URL}") 64 | 65 | "${COMMAND[@]}" || die "Command: ${COMMAND[*]} failed with exit code ${?}" 66 | date +%s > "${IMAGE_TIMESTAMP_FILE}" 67 | fi 68 | flock -u 100 69 | 70 | # Create a container root filesystem from a container image 71 | COMMAND=( 72 | enroot create \ 73 | --name "${CONTAINER_NAME}" \ 74 | -- "${IMAGE_DIR}/${IMAGE_NAME}.sqsh" 75 | ) 76 | "${COMMAND[@]}" || die "Command: ${COMMAND[*]} failed with exit code ${?}" 77 | else 78 | echo -e "Reusing container ${CONTAINER_NAME}" 79 | fi 80 | 81 | 82 | # List all the container root filesystems on the system. 83 | enroot list --fancy 84 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # https://docs.gitlab.com/runner/executors/custom.html#run 3 | 4 | 5 | # shellcheck source=./include.sh 6 | source "${BASH_SOURCE[0]%/*}/include.sh" 7 | 8 | 9 | ensure_executable_available enroot 10 | 11 | 12 | # External args 13 | # Last argument is always the step name. The one before last is the step script 14 | before_last=$(($#-1)) 15 | STEP_NAME_ARG="${!#}" 16 | STEP_SCRIPT_ARG="${!before_last}" 17 | 18 | if [[ "${STEP_NAME_ARG}" == "step_script" || "${STEP_NAME_ARG}" == "build_script" ]]; then 19 | echo -e "VOLUMES configuration:" 20 | printf "\t%s\n" "${ENROOT_MOUNT_OPTIONS[@]}" 21 | echo -e "\n" 22 | fi 23 | 24 | # No slurm requested or required, directly use the login node 25 | if [[ -z "${USE_SLURM}" || ${USE_SLURM} -eq 0 || 26 | # All scripts from after_script onward are executed on the login node 27 | # see https://docs.gitlab.com/runner/executors/custom.html#run 28 | "${STEP_NAME_ARG}" == "after_script" || 29 | "${STEP_NAME_ARG}" == "cleanup_file_variables" || 30 | "${STEP_NAME_ARG}" == *"archive"* || 31 | "${STEP_NAME_ARG}" == *"upload_artifacts"* ]]; then 32 | # Enroot fails when quoting anything or splitting this command. Leave it in 33 | # this format. 
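    # The generated step script is streamed into bash inside the container via
    # stdin (see the redirection on the command below).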
34 | # 35 | # shellcheck disable=SC2206 36 | COMMAND=(enroot start ${ENROOT_REMAP_ROOT} ${ENROOT_MOUNT_OPTIONS[*]} --rw ${ENROOT_ENV_CONFIG[*]} -e "NVIDIA_VISIBLE_DEVICES=void" ${CONTAINER_NAME} /bin/bash) 37 | "${COMMAND[@]}" < "${STEP_SCRIPT_ARG}" || die "Command: ${COMMAND[*]} failed with exit code ${?}" 38 | else # SLURM usage requested 39 | ensure_executable_available sacct 40 | ensure_executable_available scancel 41 | ensure_executable_available sbatch 42 | ensure_executable_available srun 43 | ensure_executable_available squeue 44 | ensure_executable_available wc 45 | ensure_executable_available awk 46 | 47 | # We need to create the temporary files in a directory with filesystem 48 | # access on all nodes. Because we consider ${CONTAINER_NAME} to be unique, 49 | # we use it as storage for this job. 50 | WORK_DIR="${CI_WS}/${CONTAINER_NAME}" 51 | STEP_SCRIPT_DIR="${WORK_DIR}/step_scripts" 52 | if [[ ! -d "${WORK_DIR}" ]]; then 53 | mkdir -p "${WORK_DIR}" 54 | fi 55 | if [[ ! -d "${STEP_SCRIPT_DIR}" ]]; then 56 | mkdir -p "${STEP_SCRIPT_DIR}" 57 | fi 58 | NUM_SCRIPTS="$(find "${STEP_SCRIPT_DIR}" -maxdepth 1 -type f | wc -l)" 59 | STEP_SCRIPT="${STEP_SCRIPT_DIR}/${NUM_SCRIPTS}" 60 | touch "${STEP_SCRIPT}" 61 | JOB_SCRIPT=$(mktemp -p "${WORK_DIR}") 62 | 63 | # Save the step script 64 | cp "${STEP_SCRIPT_ARG}" "${STEP_SCRIPT}" 65 | 66 | # Only store the gitlab scripts until we reach the main {build,step}_script 67 | if [[ ! "${STEP_NAME_ARG}" =~ ^[bs].*"_script" ]]; then 68 | echo -e "Storing the script for step ${STEP_NAME_ARG} for bulk submission." 69 | exit 70 | fi 71 | 72 | # We finally reached the main script, prepare the SLURM job 73 | JOB_LOG=$(mktemp -p "${WORK_DIR}") 74 | JOB_ERR=$(mktemp -p "${WORK_DIR}") 75 | SLURM_CONFIG=("--job-name=${CONTAINER_NAME}") 76 | SLURM_CONFIG+=("--output=${JOB_LOG}") 77 | SLURM_CONFIG+=("--error=${JOB_ERR}") 78 | SLURM_CONFIG+=("--chdir=${WORK_DIR}") 79 | if [[ -n "${CUSTOM_ENV_SLURM_PARTITION}" ]]; then 80 | SLURM_CONFIG+=("--partition=${CUSTOM_ENV_SLURM_PARTITION}") 81 | fi 82 | if [[ -n "${CUSTOM_ENV_SLURM_EXCLUSIVE}" ]]; then 83 | SLURM_CONFIG+=("--exclusive") 84 | fi 85 | if [[ -n "${CUSTOM_ENV_SLURM_TIME}" ]]; then 86 | SLURM_CONFIG+=("--time=${CUSTOM_ENV_SLURM_TIME}") 87 | fi 88 | if [[ -n "${CUSTOM_ENV_SLURM_GRES}" ]]; then 89 | SLURM_CONFIG+=("--gres=${CUSTOM_ENV_SLURM_GRES}") 90 | fi 91 | if [[ -n "${CUSTOM_ENV_SLURM_ACCOUNT}" ]]; then 92 | SLURM_CONFIG+=("--account=${CUSTOM_ENV_SLURM_ACCOUNT}") 93 | fi 94 | 95 | # Log the configuration 96 | echo -e "SLURM configuration:" 97 | printf "\t%s\n" "${SLURM_CONFIG[@]}" 98 | echo -e "\n" 99 | echo -e "ENROOT environment configuration:" 100 | printf "\t%s\n" "${ENROOT_ENV_CONFIG[@]}" 101 | echo -e "\n" 102 | 103 | 104 | # Launch the container through slurm 105 | # Somehow, this script doesn't like if the variables are surrounded by " 106 | echo -e "#!/usr/bin/env bash 107 | 108 | for scriptnum in \$(ls -1v ${STEP_SCRIPT_DIR}); do 109 | srun enroot start ${ENROOT_REMAP_ROOT} ${ENROOT_MOUNT_OPTIONS[*]} --rw ${ENROOT_ENV_CONFIG[*]} \ 110 | ${CONTAINER_NAME} /bin/bash < ${STEP_SCRIPT_DIR}/\${scriptnum} 111 | done 112 | " > "${JOB_SCRIPT}" 113 | chmod +x "${JOB_SCRIPT}" 114 | 115 | 116 | # Submission 117 | # shellcheck disable=SC2206 118 | COMMAND=(sbatch --parsable ${SLURM_CONFIG[*]} ${JOB_SCRIPT}) 119 | JOB_ID=$("${COMMAND[@]}") || \ 120 | die "Command: ${COMMAND[*]} failed with exit code ${?}" "${WORK_DIR}" 121 | echo -e "Job submitted and pending with ID: ${JOB_ID}." 
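    # Print the queue once so the CI log shows the newly submitted job.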
122 | squeue -u "${USER}" 123 | 124 | # Store the JOB_ID so `cleanup.sh` can read it and cancel the job if running 125 | # (e.g., when pressing the cancel button on gitlab). We consider that the 126 | # CONTAINER_NAME is unique at a given time, so we don't use locking or a list 127 | # of ids. 128 | echo "${JOB_ID}" > "${SLURM_IDS_PATH}/${CONTAINER_NAME}.txt" 129 | 130 | slurm_wait_for_status "${JOB_ID}" "${SLURM_PENDING_LIMIT}" \ 131 | "${SLURM_GOOD_PENDING_STATUS}" || die "encountered an error while waiting" \ 132 | "${WORK_DIR}" "${JOB_ID}" "${JOB_LOG}" "${JOB_ERR}" 133 | 134 | echo -e "Job ${JOB_ID} started execution." 135 | slurm_wait_for_status "${JOB_ID}" "${SLURM_RUNNING_LIMIT}" \ 136 | "${SLURM_GOOD_COMPLETED_STATUS}" || die "encountered an error while waiting" \ 137 | "${WORK_DIR}" "${JOB_ID}" "${JOB_LOG}" "${JOB_ERR}" 138 | 139 | test -f "${JOB_ERR}" && test "$(cat "${JOB_ERR}")" != "" && \ 140 | die "encountered an error during execution" "${WORK_DIR}" "${JOB_ID}" "${JOB_LOG}" "${JOB_ERR}" 141 | 142 | echo -e "Job ${JOB_ID} completed." 143 | slurm_print_output "${JOB_ID}" "Log" "${JOB_LOG}" /dev/stdout 144 | slurm_print_output "${JOB_ID}" "Errors" "${JOB_ERR}" /dev/stdout 145 | 146 | # Cleanup the workdir 147 | rm -rf "${WORK_DIR}" 148 | fi 149 | -------------------------------------------------------------------------------- /slurm_utils.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Sections: 3 | # 1. General standalone utility functions 4 | # 2. SLURM variables management (requires previous utility functions) 5 | # 3. Main SLURM loop waiting function (relies on all utilities) 6 | 7 | 8 | ##### 9 | ## 1. Standalone utility functions 10 | ##### 11 | 12 | # Convenient exit function 13 | # This has multiple modes builtin depending on the arguments given: 14 | # + No argument: only exit, no message or anything printed 15 | # + One argument: exit with a message 16 | # + Two argument: exit with message and deletes temporary workdir (useful for 17 | # intermediate SLURM code) 18 | # + More arguments: Full SLURM management, cancels any ongoing job, print job logs, ... 19 | function die() { 20 | # External arguments 21 | local msg="${1}" 22 | # Extra arguments in the SLURM cases 23 | local workdir="${2}" 24 | local jobid=${3} 25 | local joblog="${4}" 26 | local joberr="${5}" 27 | 28 | if [[ -n "${jobid}" ]]; then 29 | msg="${jobid}: ${msg}" 30 | fi 31 | test -n "${msg}" && echo -e "${msg}" > /dev/stderr 32 | test -n "${jobid}" && scancel --quiet "${jobid}" 33 | test -n "${joblog}" && slurm_print_output "${jobid}" "Log" "${joblog}" /dev/stderr 34 | test -n "${joberr}" && slurm_print_output "${jobid}" "Errors" "${joberr}" /dev/stderr 35 | test -n "${workdir}" && test -d "${workdir}" && rm -rf "${workdir}" 36 | # Inform cleanup.sh that we encountered an error 37 | # touch "${CUSTOM_ENV_CI_WS}/${CUSTOM_ENV_CI_JOB_ID}" 38 | exit "${BUILD_FAILURE_EXIT_CODE}" 39 | } 40 | 41 | 42 | # Prints a SLURM job output file to $output 43 | # Does nothing if the file is empty 44 | function slurm_print_output() { 45 | # External arguments 46 | local jobid=${1} 47 | local logtype="${2}" 48 | local slurmlogfile="${3}" 49 | local output=${4} 50 | 51 | if [[ ! 
-f "${slurmlogfile}" || "$(cat "${slurmlogfile}")" == "" ]]; then 52 | return 0 53 | fi 54 | { 55 | echo -e "== SLURM Job ${jobid} ${logtype}" 56 | echo -e "============================" 57 | cat "${slurmlogfile}" 58 | } >> "${output}" 59 | } 60 | 61 | 62 | # Uses awk to convert a string of the form d-hh:min:s and all combinations to 63 | # seconds. 64 | # The result is return as simple echo. 65 | function slurm_time_to_seconds() { 66 | # Parameters 67 | local slurm_time="$1" 68 | 69 | # Local variables 70 | local num_colons="${slurm_time//[!:]/}" 71 | local num_dashes="${slurm_time//[!-]/}" 72 | num_colons=${#num_colons} 73 | num_dashes=${#num_dashes} 74 | local running_limit=0 75 | # We use awk to split the string into sub components. The fields are 76 | # available in $1 to $. If $3, e.g. doesn't exist, its value is 0 so 77 | # optional components at the end of the expression are taken care of 78 | # naturally. We need different cases for optional components at the 79 | # beginning of the expression. 80 | if [[ ${num_dashes} == 1 ]]; then # Suppose d-hh(:min(:s)) where parenthesis show optional components 81 | running_limit=$(echo "${slurm_time}" | awk -F[-:] '{ print ($1 * 86400) + ($2 * 3600) + ($3 * 60) + $4 }') 82 | elif [[ ${num_colons} == 2 ]]; then # Suppose hh:min:s 83 | running_limit=$(echo "${slurm_time}" | awk -F: '{ print ($1 * 3600) + ($2 * 60) + $3 }') 84 | elif [[ ${num_colons} == 1 || ${num_colons} == 0 ]]; then # Suppose min(:s) 85 | running_limit=$(echo "${slurm_time}" | awk -F: '{ print ($1 * 60) + $2 }') 86 | else 87 | return 1 88 | fi 89 | echo "${running_limit}" 90 | } 91 | 92 | 93 | ##### 94 | ## 2. Variables which control the SLURM waiting loop's behavior 95 | ##### 96 | export SLURM_GOOD_COMPLETED_STATUS="COMPLETED" 97 | export SLURM_GOOD_PENDING_STATUS="@(COMPLETED|COMPLETING|RUNNING)*" 98 | export SLURM_BAD_STATUS="@(FAILED|TIMEOUT|OUT_OF_MEMORY|REVOKED|NODE_FAIL|CANCELLED|BOOT_FAIL)*" 99 | 100 | export SLURM_UPDATE_INTERVAL=${CUSTOM_ENV_SLURM_UPDATE_INTERVAL:-120} # 2 minutes 101 | export SLURM_PENDING_LIMIT=${CUSTOM_ENV_SLURM_PENDING_LIMIT:-43200} # 12 hours 102 | SLURM_RUNNING_LIMIT=86400 # 24 hours 103 | if [[ -n ${CUSTOM_ENV_SLURM_TIME} ]]; then 104 | SLURM_RUNNING_LIMIT=$(slurm_time_to_seconds "${CUSTOM_ENV_SLURM_TIME}" || \ 105 | die "Couldn't understand the time format ${CUSTOM_ENV_SLURM_TIME}.") 106 | fi 107 | export SLURM_RUNNING_LIMIT 108 | 109 | 110 | ##### 111 | ## 3. SLURM waiting loop function 112 | ##### 113 | 114 | # A simple waiting loop for a specific SLURM job based on its status. 115 | # Error conditions: 116 | # 1. We waited past the waiting limit 117 | # 2. The job status is one of ${SLURM_BAD_STATUS}. 118 | function slurm_wait_for_status() { 119 | # Get External params 120 | local jobid=${1} 121 | local waiting_limit=${2} 122 | local good_status_expr="${3}" 123 | 124 | # Internal variables 125 | local keep_waiting=1 126 | local waiting_time=0 127 | local jobstatus="" 128 | 129 | echo -e "" 130 | while [[ $keep_waiting == 1 ]]; do 131 | jobstatus="$(sacct -bn -j "${jobid}" | head -1 | tr -s ' ' | cut -d' ' -f 2)" 132 | if [[ $waiting_time -gt $waiting_limit ]]; then 133 | echo -e "\nJob ${jobid} has exceeded the waiting limit\ 134 | of ${waiting_limit}." 
> /dev/stderr 135 | return 1 136 | fi 137 | # We need extglob for the expression variable based cases to work 138 | shopt -s extglob 139 | # shellcheck disable=SC2254 140 | case ${jobstatus} in 141 | ${good_status_expr}) 142 | keep_waiting=0 143 | ;; 144 | ${SLURM_BAD_STATUS}) 145 | echo -e "" 146 | return 1 147 | ;; 148 | *) 149 | echo -n "." 150 | sleep "$SLURM_UPDATE_INTERVAL" 151 | waiting_time=$((waiting_time + SLURM_UPDATE_INTERVAL)) 152 | ;; 153 | esac 154 | shopt -u extglob 155 | done 156 | echo -e "" 157 | } 158 | --------------------------------------------------------------------------------