├── .editorconfig
├── .github
│   └── workflows
│       ├── pandoc.yml
│       └── shell_check.yml
├── .gitignore
├── LICENSE
├── README.md
├── README.pdf
├── cleanup.sh
├── config.sh
├── include.sh
├── prepare.sh
├── run.sh
└── slurm_utils.sh
/.editorconfig:
--------------------------------------------------------------------------------
1 | # More info: https://editorconfig.org
2 |
3 | # top-most EditorConfig file
4 | root = true
5 |
6 | # Use spaces with an indent size of 4 for shell scripts.
7 | # Also add a newline at the end of every file.
8 | [*.sh]
9 | charset = utf-8
10 | end_of_line = lf
11 | insert_final_newline = true
12 | trim_trailing_whitespace = true
13 | indent_style = space
14 | indent_size = 4
15 |
16 | # YAML and Markdown files use 2-space indentation for now.
17 | [*.{yml,md}]
18 | indent_style = space
19 | indent_size = 2
20 |
21 | # Makefiles require tabs
22 | [Makefile]
23 | indent_style = tab
24 |
--------------------------------------------------------------------------------
/.github/workflows/pandoc.yml:
--------------------------------------------------------------------------------
1 | name: Create README.pdf
2 |
3 | on:
4 | push:
5 | paths:
6 | - 'README.md'
7 |
8 | jobs:
9 | publish_readme_pdf:
10 | if: ${{ github.ref != 'refs/heads/master' }}
11 | name: README as pdf
12 | runs-on: ubuntu-latest
13 | steps:
14 | - name: Checkout
15 | uses: actions/checkout@v2
16 | - name: Convert
17 | uses: docker://pandoc/latex:2.14.2
18 | with:
19 | args: >-
20 | --output=README.pdf
21 | README.md
22 | - name: Push the README
23 | env:
24 | GITHUB_TOKEN: ${{ secrets.BOT_TOKEN }}
25 | run: |
26 | git config user.name 'ginkgo-bot'
27 | git config user.email 'ginkgo.library@gmail.com'
28 | git add README.pdf
29 | git commit -m "[bot] Update README.pdf"
30 | git pull --rebase
31 | git push
32 |
--------------------------------------------------------------------------------
/.github/workflows/shell_check.yml:
--------------------------------------------------------------------------------
1 | name: "Shellcheck"
2 | on: [push, pull_request]
3 |
4 | jobs:
5 | check_script_files:
6 | runs-on: ubuntu-latest
7 |
8 | steps:
9 | - uses: actions/checkout@v2
10 | - name: "Shellcheck"
11 | run: shellcheck --check-sourced *.sh
12 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | .idea
3 | *.log
4 | tmp/
5 |
6 | .dir-locals.el
7 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2021, the Ginkgo Project, NHR@KIT Cx Project
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions
6 | are met:
7 |
8 | 1. Redistributions of source code must retain the above copyright
9 | notice, this list of conditions and the following disclaimer.
10 |
11 | 2. Redistributions in binary form must reproduce the above copyright
12 | notice, this list of conditions and the following disclaimer in the
13 | documentation and/or other materials provided with the distribution.
14 |
15 | 3. Neither the name of the copyright holder nor the names of its
16 | contributors may be used to endorse or promote products derived from
17 | this software without specific prior written permission.
18 |
19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
20 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
22 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | ---
3 | header-includes: |
4 | \usepackage{fancyhdr}
5 | \pagestyle{fancy}
6 | \hypersetup{colorlinks=true,
7 | linkcolor=blue,
8 | allbordercolors={0 0 0},
9 | pdfborderstyle={/S/U/W 1}}
10 | ---
11 |
12 |
13 | # GitLab runner for HPC systems
14 |
15 | Runs CI jobs in rootless mode by relying on ENROOT and SLURM.
16 |
17 | 1. [Overview](#overview)
18 | 1. [Purpose and Features](#purpose-and-features)
19 | 2. [Dependencies](#dependencies)
20 | 3. [Code Structure](#code-structure)
21 | 4. [Configuration Variables](#configuration-variables)
22 | 1. [Global Options](#global-options)
23 | 2. [SLURM Behavior](#slurm-behavior)
24 | 2. [Installation](#installation)
25 | 1. [Installing a gitlab-runner](#installing-a-gitlab-runner)
26 | 2. [Enroot and Cluster Setup](#enroot-and-cluster-setup)
27 | 3. [Volume mounting and Ccache setup](#volume-mounting-and-ccache-setup)
28 | 3. [Usage Example](#usage-example)
29 | 4. [License](#license)
30 | 5. [Links](#links)
31 |
32 | ## Overview
33 |
34 | ### Purpose and Features
35 |
36 | This set of scripts enables user-level (no root access required) Continuous Integration on HPC clusters by relying on GitLab runner's [custom executors][gitlab-custom-executors], on [ENROOT][enroot-nvidia] as a rootless container replacement for Docker, and on the SLURM job scheduler when compute nodes are used. It also optionally supports [Ccache][ccache-website] to speed up compilation times in CI jobs. This tool was inspired by the [NHR@KIT Cx Project][nhr-kit-cx], which provides ENROOT and GitLab runner on its clusters. It is used in production in some of the [Ginkgo Software][ginkgo-software]'s [pipelines][ginkgo-pipelines].
37 |
38 | SLURM usage is optional in this set of scripts, since many simple CI steps, such as compilation, are expected to happen on a login node to optimize computing time and resource sharing. Currently, the scripts use non-interactive job submission and waiting loops to ensure the correct completion of the job on the cluster.
39 |
40 | A typical use case for this series of scripts is the following:
41 |
42 | + build: configure, build, install all the software on the login node.
43 | + test: reuse the previous container to launch tests on a compute node with device access through SLURM.
44 | + benchmark: also reuse the previous container to launch a benchmarking job on a compute node with device access through SLURM, then delete the container.
45 |
46 | See the [usage example](#usage-example) for concrete details.
47 |
48 | ### Dependencies
49 |
50 | Several standard Linux commands are used on top of the ENROOT and SLURM commands. For some of them, the scripts rely on non-standard/GNU-only options; this is for now not optimized.
51 |
52 | Always required:
53 |
54 | + Gitlab runner (user mode)
55 | + Enroot
56 | + Flock
57 | + Bash
58 | + grep
59 |
60 | With SLURM:
61 |
62 | + sacct, squeue, scancel, sbatch, srun
63 | + GNU ls, wc, head, tr, cut, awk, ...
64 | + option extglob
65 |
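As a quick sanity check, a sketch along these lines can verify that the required tools are available on the login node (the SLURM commands only matter when SLURM mode is used):

``` sh
# Minimal availability check; adjust the list to the features you actually use.
required="gitlab-runner enroot flock bash grep"
slurm="sacct squeue scancel sbatch srun"
for cmd in ${required} ${slurm}; do
    command -v "${cmd}" >/dev/null || echo "missing: ${cmd}"
done
```
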
66 | ### Code Structure
67 |
68 | The code structure is simple. First, there are the standard GitLab-runner custom executor scripts:
69 |
70 | + `config.sh`: describes the executor;
71 | + `prepare.sh`: prepares the enroot container from a docker image, uses an image cache, and optionally reuses an existing container instead;
72 | + `run.sh`: either directly runs the GitLab commands on a local enroot container, or submits a job that executes everything in bulk;
73 | + `cleanup.sh`: deletes the container unless requested otherwise, and cleans up the SLURM job if needed.
74 |
75 | The main configuration variables and functions are defined in the following files:
76 |
77 | + `include.sh`: contains the main (non slurm) configuration options;
78 | + `slurm_utils.sh`: contains most of the functions and configuration needed for the SLURM functionality.
79 |
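As a rough sketch (not part of this repository), the following shows the order in which gitlab-runner drives these scripts for a single CI job; the exported variables and paths are placeholders:

``` sh
# Hypothetical local dry-run of the executor stages.
export CUSTOM_ENV_CI_JOB_ID=12345
export CUSTOM_ENV_CI_WS=/workspace/scratch/my-ci-project
export CUSTOM_ENV_CI_JOB_IMAGE=ubuntu:xenial
./config.sh                              # describe the driver (JSON on stdout)
./prepare.sh                             # import the image and create the enroot container
./run.sh /tmp/step_script step_script    # called once per GitLab step
./cleanup.sh                             # remove the container, cancel leftover SLURM jobs
```
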
80 | ### Configuration Variables
81 |
82 | The following variables control some aspects of the scripts' functionality. They can be set as CI job variables, either in the `.gitlab-ci.yml` script or on the GitLab web interface. Inside the executor scripts, they are accessed with the `CUSTOM_ENV_` prefix, e.g. `${CUSTOM_ENV_USE_NAME}`.
83 |
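For example, this is roughly how `include.sh` resolves the container name from the `USE_NAME` job variable:

``` sh
# Job variables arrive in the executor scripts with the CUSTOM_ENV_ prefix.
if [[ -z "${CUSTOM_ENV_USE_NAME}" ]]; then
    CONTAINER_NAME="GitLabRunnerEnrootExecutorID${CUSTOM_ENV_CI_JOB_ID}"
else
    CONTAINER_NAME="${CUSTOM_ENV_USE_NAME}"
fi
```
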
84 | #### Global Options
85 |
86 | These variables are not SLURM specific and can be used in the default `ENROOT` only mode.
87 |
88 | + `CI_JOB_IMAGE` (YAML script `image:` option): a standard docker image for enroot to instantiate a container from. If it is hosted on the GitLab registry (`CI_REGISTRY` is set), it is accessed via the default token `CI_REGISTRY_PASSWORD`.
89 | + `CI_WS`: a directory with shared data access across all nodes to be used as a workspace.
90 |
91 | Optional:
92 |
93 | + `USE_NAME`: instead of an automatically generated name, use a specific name for the container (and SLURM job if applicable). This name needs to be unique! When not specified, the name will be `GitLabRunnerEnrootExecutorID${CUSTOM_ENV_CI_JOB_ID}`.
94 | + `NVIDIA_VISIBLE_DEVICES`: a value passed to the enroot container to control NVIDIA device visibility. When no GPU is available or used, `void` should be passed.
95 | + `CCACHE_MAXSIZE`: sets a custom maximum limit to the Ccache directory size.
96 | + `KEEP_CONTAINER`: a non-empty value prevents the container from being deleted after usage, except if an error occurred.
97 | + `ENROOT_REMAP_ROOT`: a non-empty value adds the enroot option `--root`.
98 |
99 | Volumes:
100 |
101 | + `VOL_NUM`: sets the number of volumes configured to be mounted in the container.
102 | + `VOL_1_SRC`: sets the source directory (on the cluster) for the first volume.
103 | + `VOL_1_DST`: sets the destination directory (in the container) for the first volume.
104 |
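With, e.g., `VOL_NUM=1`, `VOL_1_SRC=${CI_WS}/ccache`, and `VOL_1_DST=/ccache`, the executor creates the source directory if it is missing and passes the corresponding mount option to enroot; a rough sketch of the resulting call (see `include.sh` and `run.sh`):

``` sh
# The volume is mounted into the writable (--rw) container when it is started.
enroot start --mount "${CI_WS}/ccache:/ccache" --rw "${CONTAINER_NAME}" /bin/bash
```
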
105 | #### SLURM Behavior
106 |
107 | When any of these variables is set, a SLURM job is submitted instead of directly running the container on the node where `gitlab-runner` is running. These variables control the SLURM job submission and related behavior.
108 |
109 | + `SLURM_PARTITION`: the value of the SLURM `--partition` parameter, e.g., `gpu`.
110 | + `SLURM_EXCLUSIVE`: when non-zero, adds the SLURM `--exclusive` parameter.
111 | + `SLURM_TIME`: the value of the SLURM `--time` parameter, e.g. `0:30:00`.
112 | + `SLURM_GRES`: the value of the SLURM `--gres` parameter.
113 | + `SLURM_ACCOUNT`: the value of the SLURM `--account` parameter.
114 | + `USE_SLURM`: if no other SLURM variable is set, setting this still enables the SLURM execution mode for this job.
115 |
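For illustration, with `SLURM_PARTITION=gpu`, `SLURM_GRES=gpu:1`, and `SLURM_TIME=0:30:00` set as job variables, `run.sh` assembles an `sbatch` call roughly like the following (the job name, log, error, and working directory paths are generated per job):

``` sh
sbatch --parsable \
    --job-name="${CONTAINER_NAME}" \
    --output="${JOB_LOG}" --error="${JOB_ERR}" --chdir="${WORK_DIR}" \
    --partition=gpu --gres=gpu:1 --time=0:30:00 \
    "${JOB_SCRIPT}"
```
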
116 | These variables control the SLURM job waiting loop behavior:
117 |
118 | + `SLURM_UPDATE_INTERVAL`: the sleeping time between two job status checks.
119 | + `SLURM_PENDING_LIMIT`: the waiting limit for a pending job; the default is 12 hours.
120 | + `SLURM_TIME`: when specified, this also sets the running time waiting limit; the default is 24 hours.
121 |
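The defaults used by the waiting loop are defined in `slurm_utils.sh` (all values in seconds):

``` sh
SLURM_UPDATE_INTERVAL=${CUSTOM_ENV_SLURM_UPDATE_INTERVAL:-120} # poll every 2 minutes
SLURM_PENDING_LIMIT=${CUSTOM_ENV_SLURM_PENDING_LIMIT:-43200}   # pending limit: 12 hours
SLURM_RUNNING_LIMIT=86400                                      # running limit: 24 hours, overridden by SLURM_TIME
```
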
122 | ## Installation
123 | The instructions are for a standard Linux system that already supports a user-mode GitLab runner and has enroot installed (see [dependencies](#dependencies)). Also refer to the [NHR@KIT CI user documentation][nhr-kit-cx], which details this setup on their systems.
124 |
125 | ### Installing a gitlab-runner
126 |
127 | The standard `gitlab-runner install` and `gitlab-runner register` commands can be used. Make sure to select the custom executor during registration, see the [gitlab runner registration documentation][gitlab-runner-install]. Here is an example of what a runner configuration can look like, usually found in `~/.gitlab-runner/config.toml`:
128 |
129 | ``` toml
130 | [[runners]]
131 | name = "enroot executor"
132 | url = "https://gitlab.com"
133 | token = ""
134 | executor = "custom"
135 | builds_dir = "/workspace/scratch/my-ci-project/gitlab-runner/builds/"
136 | cache_dir = "/workspace/scratch/my-ci-project/gitlab-runner/cache/"
137 | environment = ["CI_WS=/workspace/scratch/my-ci-project",
138 | "VOL_1_SRC=/workspace/scratch/my-ci-project/ccache", "VOL_1_DST=/ccache",
139 | "VOL_2_SRC=/workspace/scratch/my-ci-project/test_data", "VOL_2_DST=/test_data",
140 | "VOL_NUM=2", "CCACHE_MAXSIZE=40G"]
141 | [runners.custom_build_dir]
142 | enabled = false
143 | [runners.custom]
144 | config_exec = "//gitlab-hpc-ci-cb/config.sh"
145 | prepare_exec = "//gitlab-hpc-ci-cb/prepare.sh"
146 | run_exec = "//gitlab-hpc-ci-cb/run.sh"
147 | cleanup_exec = "//gitlab-hpc-ci-cb/cleanup.sh"
148 | ```
149 |
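Registration itself can also be scripted. Below is a minimal, non-interactive sketch; the URL and token are placeholders, and the `[runners.custom]` entries from the example above still need to be filled in afterwards:

``` sh
gitlab-runner register \
    --non-interactive \
    --url "https://gitlab.com" \
    --registration-token "<project-or-group-registration-token>" \
    --executor custom
```
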
150 | ### Enroot and Cluster Setup
151 | On machines using `systemd` and `logind`, enable lingering for your user so that the gitlab-runner daemon can persist when logged off: `loginctl enable-linger ${USER}`. To check if the property is active, use the command: `loginctl show-user $USER --property=Linger`, which should output `Linger=yes`.
152 |
153 | As detailed in [global options](#global-options), the environment variable `CI_WS` must be set, either in the runner configuration or in the CI script, to point to a workspace used for storing enroot containers, caches, and more.
154 |
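A workspace preparation sketch, assuming a shared scratch filesystem (the path is a placeholder):

``` sh
export CI_WS=/workspace/scratch/my-ci-project
mkdir -p "${CI_WS}"
# prepare.sh later creates ${CI_WS}/ENROOT_CACHE, ${CI_WS}/ENROOT_DATA, and ${CI_WS}/SLURM_IDS,
# and the executor writes its log to ${CI_WS}/gitlab-runner-enroot.log (see include.sh).
```
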
155 | After the new GitLab runner has been configured, lingering is enabled and the other cluster setup steps are finished, start your runner in user mode with the following commands on a `systemd`-based system:
156 |
157 | ``` sh
158 | # Enable your own gitlab-runner and start it up
159 | systemctl --user enable --now gitlab-runner
160 | # Check that the gitlab runner is running
161 | systemctl --user status gitlab-runner
162 | ```
163 |
164 | ### Volume mounting and Ccache setup
165 | A generic volume mounting interface is provided. This is useful for Ccache support but can be used for other purposes as well. It is configured through multiple environment variables:
166 | 1. `VOL_NUM` specifies the number of volumes configured.
167 | 2. `VOL_1_SRC` is the volume source (on the cluster), e.g. `${CI_WS}/ccache`.
168 | 3. `VOL_1_DST` is the volume destination (in the container), e.g. `/ccache`.
169 |
170 |
171 | A full example is available in [Installing a gitlab-runner](#installing-a-gitlab-runner).
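
As a sketch, a build step inside the container could then use the mounted directory like this, assuming the `/ccache` destination from the example above (the CMake launcher flags are just one way to hook in Ccache):

``` sh
export CCACHE_DIR=/ccache          # propagated into the container when set as a job variable
ccache --max-size="${CCACHE_MAXSIZE:-40G}"
cmake -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache ..
make -j"$(nproc)"
ccache --show-stats
```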
172 |
173 | ## Usage Example
174 |
175 | Assuming that `default_build` contains the commands for compiling your software in the required setting, and `default_test` contains the equivalent of `make test`, the following gitlab-ci YAML configuration will:
176 |
177 | + `my_build_job`: build the software on the node running gitlab-runner (no SLURM), keep the container's state for the next job
178 | + `slurm_test_job`: test the software on a compute node on the `gpu` SLURM partition with one GPU and a time limit of 30 minutes, then delete the container (no `KEEP_CONTAINER` is set).
179 |
180 |
181 | Note that this works because both jobs use the same custom name `simple_hpc_ci_job`, which needs to be unique globally but shared among the jobs of the same pipeline.
182 |
183 |
184 | ``` yaml
185 | stages:
186 | - build
187 | - test
188 |
189 | my_build_job:
190 | image: ubuntu:xenial
191 | stage: build
192 | <<: *default_build
193 | variables:
194 | USE_NAME: "simple_hpc_ci_job"
195 | KEEP_CONTAINER: "ON"
196 | NVIDIA_VISIBLE_DEVICES: "void"
197 | tags:
198 | - my_enroot_runner
199 |
200 | slurm_test_job:
201 | image: ubuntu:xenial
202 | stage: test
203 | <<: *default_test
204 | variables:
205 | USE_NAME: "simple_hpc_ci_job"
206 | SLURM_PARTITION: "gpu"
207 | SLURM_EXCLUSIVE: "ON"
208 | SLURM_GRES: "gpu:1"
209 | SLURM_TIME: "00:30:00"
210 | dependencies: [ "my_build_job" ]
211 | tags:
212 | - my_enroot_runner
213 | ```
214 |
215 | ### `after_script`
216 | The `after_script` step is never executed inside a SLURM job, but is always
217 | executed directly on the node running `gitlab-runner` instead. It is assumed
218 | that this script is only used for cleanup or similar purposes.
219 |
220 | ## License
221 | Licensed under the [BSD 3-Clause license].
222 |
223 | ## Links
224 | * [NHR@KIT CI User documentation][nhr-kit-cx]
225 | * [Gitlab runner's custom executors][gitlab-custom-executors]
226 | * [Gitlab runner custom executor examples](https://docs.gitlab.com/runner/executors/custom_examples/)
227 |
228 | [gitlab-custom-executors]: https://docs.gitlab.com/runner/executors/custom.html
229 | [gitlab-runner-install]: https://docs.gitlab.com/runner/register/index.html
230 | [enroot-nvidia]: https://github.com/NVIDIA/enroot
231 | [ccache-website]: https://ccache.dev/
232 | [nhr-kit-cx]: https://www.nhr.kit.edu/userdocs/ci/
233 | [ginkgo-software]: https://github.com/ginkgo-project/ginkgo
234 | [ginkgo-pipelines]: https://gitlab.com/ginkgo-project/ginkgo-public-ci/-/pipelines
235 | [BSD 3-Clause license]: LICENSE
236 |
--------------------------------------------------------------------------------
/README.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ginkgo-project/gitlab-hpc-ci-cb/5a71b78b9acb29f0afa0fddb4d8f9813199f56fd/README.pdf
--------------------------------------------------------------------------------
/cleanup.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # https://docs.gitlab.com/runner/executors/custom.html#cleanup
3 |
4 | # shellcheck source=./include.sh
5 | source "${BASH_SOURCE[0]%/*}/include.sh"
6 |
7 |
8 | ensure_executable_available enroot
9 |
10 |
11 | # Take care of slurm cleanup if needed
12 | if [[ -f "${SLURM_IDS_PATH}/${CONTAINER_NAME}.txt" ]]; then
13 | ensure_executable_available scancel
14 | ensure_executable_available squeue
15 |
16 | USE_SLURM=1
17 | JOBID=$(cat "${SLURM_IDS_PATH}/${CONTAINER_NAME}.txt")
18 | rm "${SLURM_IDS_PATH}/${CONTAINER_NAME}.txt" # not needed anymore
19 | # If the job isn't finished yet, we still need to cancel it
20 | scancel --quiet "${JOBID}"
21 | fi
22 |
23 | # If the work dir is somehow left over, it can indicate a job cancellation.
24 | WORK_DIR="${CI_WS}/${CONTAINER_NAME}"
25 | if [[ -d "${WORK_DIR}" ]]; then
26 | rm -rf "${WORK_DIR}"
27 | fi
28 |
29 | # Delete the container root filesystem if it isn't asked to be preserved or there
30 | # was an error in one of the previous steps.
31 | {
32 | echo -e "==============================="
33 | echo -e "Job: ${CUSTOM_ENV_CI_JOB_ID}"
34 | echo -e "Job started at: ${CUSTOM_ENV_CI_JOB_STARTED_AT}"
35 | echo -e "Pipeline: ${CUSTOM_ENV_CI_PIPELINE_ID}"
36 | if [[ -z "${CUSTOM_ENV_KEEP_CONTAINER}" ]]; then
37 | echo -e "Cleaning up container ${CONTAINER_NAME}"
38 | enroot remove --force -- "${CONTAINER_NAME}"
39 | else
40 | echo -e "Keeping container ${CONTAINER_NAME}"
41 | fi
42 |
43 | enroot list --fancy
44 | if [[ "${USE_SLURM}" == 1 ]]; then
45 | squeue -u "${USER}"
46 | fi
47 | } >> "${LOGFILE}"
48 |
--------------------------------------------------------------------------------
/config.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # https://docs.gitlab.com/runner/executors/custom.html#config
3 |
4 | # shellcheck source=./include.sh
5 | source "${BASH_SOURCE[0]%/*}/include.sh"
6 |
7 | # Sometimes you might want to adjust some settings at execution time,
8 | # for example setting a build directory depending on the project ID.
9 | # gitlab-runner reads config_exec's STDOUT and expects a valid JSON string with specific keys.
10 |
11 | cat <<'EOF'
12 | {
13 | "driver": {
14 | "name": "ENROOT (SLURM) driver",
15 | "version": "v1.0.0"
16 | }
17 | }
18 | EOF
19 |
--------------------------------------------------------------------------------
/include.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # Include the slurm utility functions
4 | # shellcheck source=./slurm_utils.sh
5 | source "${BASH_SOURCE[0]%/*}/slurm_utils.sh"
6 |
7 | # Do NOT touch these, and make sure they match your local environment
8 | # variables!! Otherwise, there can be *duplicate* locations, and locally started
9 | # containers will not see the same data as the gitlab containers!
10 | export CI_WS="${CUSTOM_ENV_CI_WS}"
11 | export LOGFILE="${CI_WS}/gitlab-runner-enroot.log"
12 | export ENROOT_CACHE_PATH="${CI_WS}/ENROOT_CACHE"
13 | export ENROOT_DATA_PATH="${CI_WS}/ENROOT_DATA"
14 | export SLURM_IDS_PATH="${CI_WS}/SLURM_IDS"
15 |
16 |
17 | # Set a variable CONTAINER_NAME based on the environment variable
18 | # CUSTOM_ENV_USE_NAME
19 | if [[ -z "${CUSTOM_ENV_USE_NAME}" ]]; then
20 | CONTAINER_NAME="GitLabRunnerEnrootExecutorID${CUSTOM_ENV_CI_JOB_ID}"
21 | else
22 | CONTAINER_NAME="${CUSTOM_ENV_USE_NAME}"
23 | fi
24 | export CONTAINER_NAME
25 |
26 |
27 | # Ccache and volume management
28 | ENROOT_MOUNT_OPTIONS=()
29 | if [[ -n "${CUSTOM_ENV_VOL_NUM}" ]]; then
30 | for i in $(seq 1 "${CUSTOM_ENV_VOL_NUM}"); do
31 | VOL_SRC="CUSTOM_ENV_VOL_${i}_SRC"
32 | VOL_DST="CUSTOM_ENV_VOL_${i}_DST"
33 | if [[ ! -e "${!VOL_SRC}" ]]; then
34 | mkdir -p "${!VOL_SRC}"
35 | fi
36 | ENROOT_MOUNT_OPTIONS+=("--mount ${!VOL_SRC}:${!VOL_DST}")
37 | done
38 | fi
39 | export ENROOT_MOUNT_OPTIONS
40 |
41 | if [[ -n "${CUSTOM_ENV_ENROOT_REMAP_ROOT}" ]]; then
42 | ENROOT_REMAP_ROOT="--root"
43 | export ENROOT_REMAP_ROOT
44 | fi
45 |
46 | # Propagate these environment variables to the container
47 | PROPAGATED_ENV_VARIABLES=(BENCHMARK
48 | DRY_RUN
49 | EXECUTOR
50 | REPETITIONS
51 | SOLVER_REPETITIONS
52 | SEGMENTS
53 | SEGMENT_ID
54 | PRECONDS
55 | FORMATS
56 | ELL_IMBALANCE_LIMIT
57 | SOLVERS
58 | SOLVERS_PRECISION
59 | SOLVERS_MAX_ITERATIONS
60 | SOLVERS_GMRES_RESTART
61 | SYSTEM_NAME
62 | DEVICE_ID
63 | SOLVERS_JACOBI_MAX_BS
64 | BENCHMARK_PRECISION
65 | SOLVERS_RHS
66 | SOLVERS_RHS_FLAG
67 | SOLVERS_INITIAL_GUESS
68 | GPU_TIMER
69 | DETAILED
70 | MATRIX_LIST_FILE
71 | NVIDIA_VISIBLE_DEVICES
72 | CCACHE_DIR
73 | CCACHE_MAXSIZE
74 | )
75 | # shellcheck disable=SC2048
76 | for bench_var in ${PROPAGATED_ENV_VARIABLES[*]}; do
77 | check_var="CUSTOM_ENV_${bench_var}"
78 | if [[ -n "${!check_var}" ]]; then
79 | ENROOT_ENV_CONFIG+=("-e ${bench_var}=${!check_var}")
80 | fi
81 | done
82 | export ENROOT_ENV_CONFIG
83 |
84 | # SLURM configuration variables.
85 | #
86 | # If the user sets any slurm variable or the variable USE_SLURM, this container
87 | # will use slurm job submission
88 | USE_SLURM=${CUSTOM_ENV_USE_SLURM}
89 | if [[ -z "${USE_SLURM}" || ${USE_SLURM} -ne 0 ]]; then
90 | SUPPORTED_SLURM_VARIABLES=(SLURM_PARTITION
91 | SLURM_EXCLUSIVE
92 | SLURM_TIME
93 | SLURM_GRES
94 | SLURM_ACCOUNT
95 | SLURM_UPDATE_INTERVAL
96 | SLURM_PENDING_LIMIT
97 | SLURM_RUNNING_LIMIT
98 | USE_SLURM)
99 | # shellcheck disable=SC2048
100 | for slurm_var in ${SUPPORTED_SLURM_VARIABLES[*]}; do
101 | check_var="CUSTOM_ENV_${slurm_var}"
102 | if [[ -n "${!check_var}" ]]; then
103 | USE_SLURM=1
104 | fi
105 | done
106 | fi
107 | export USE_SLURM
108 | # variables from slurm_utils we need to expose outside
109 | export SLURM_UPDATE_INTERVAL
110 | export SLURM_PENDING_LIMIT
111 | export SLURM_RUNNING_LIMIT
112 | export SLURM_GOOD_COMPLETED_STATUS
113 | export SLURM_GOOD_PENDING_STATUS
114 | export SLURM_BAD_STATUS
115 |
116 |
117 | function ensure_executable_available() {
118 | local command=${1}
119 |
120 | if ! type -p "${command}" >/dev/null 2>/dev/null; then
121 | die "No ${command} executable found"
122 | fi
123 | }
124 |
--------------------------------------------------------------------------------
/prepare.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # https://docs.gitlab.com/runner/executors/custom.html#prepare
3 |
4 | # shellcheck source=./include.sh
5 | source "${BASH_SOURCE[0]%/*}/include.sh"
6 |
7 |
8 | ensure_executable_available enroot
9 | ensure_executable_available flock
10 | ensure_executable_available grep
11 |
12 |
13 | # Create CI WorkSpace paths if they don't exist
14 | if [[ ! -d "${ENROOT_CACHE_PATH}" ]]; then
15 | mkdir -p "${ENROOT_CACHE_PATH}"
16 | fi
17 |
18 | if [[ ! -d "${ENROOT_DATA_PATH}" ]]; then
19 | mkdir -p "${ENROOT_DATA_PATH}"
20 | fi
21 |
22 | if [[ ! -d "${SLURM_IDS_PATH}" ]]; then
23 | mkdir -p "${SLURM_IDS_PATH}"
24 | fi
25 |
26 |
27 | # Reuse a container if it exists
28 | # shellcheck disable=SC2143
29 | if ! [[ $(enroot list | grep "${CONTAINER_NAME}") ]]; then
30 | echo -e "Preparing the container ${CONTAINER_NAME}."
31 |
32 | # Check if CI job image: is set
33 | if [[ -z "${CUSTOM_ENV_CI_JOB_IMAGE}" ]]; then
34 | die "No CI job image specified"
35 | fi
36 |
37 | # Import a container image from a specific location to enroot image dir
38 | # Scheme: docker://[USER@][REGISTRY#]IMAGE[:TAG]
39 | IMAGE_DIR="${ENROOT_DATA_PATH}"
40 | if [[ ${CUSTOM_ENV_CI_JOB_IMAGE} == "${CUSTOM_ENV_CI_REGISTRY}"* ]]; then
41 | # shellcheck disable=SC2295
42 | URL="docker://${CUSTOM_ENV_CI_REGISTRY_USER}:${CUSTOM_ENV_CI_REGISTRY_PASSWORD}@${CUSTOM_ENV_CI_REGISTRY}#${CUSTOM_ENV_CI_JOB_IMAGE#*$CUSTOM_ENV_CI_REGISTRY/}"
43 | else
44 | URL="docker://${CUSTOM_ENV_CI_JOB_IMAGE}"
45 | fi
46 | IMAGE_NAME="${CUSTOM_ENV_CI_JOB_IMAGE//[:@#.\/]/-}"
47 | # Utility timestamp and lock files
48 | IMAGE_TIMESTAMP_FILE=${IMAGE_DIR}/TIMESTAMP_${IMAGE_NAME}
49 | IMAGE_LOCK_FILE=${IMAGE_DIR}/LOCK_${IMAGE_NAME}
50 |
51 | # Update the image once every 3 hours. Use a lock to prevent conflicts
52 | exec 100<>"${IMAGE_LOCK_FILE}"
53 | flock -w 120 100
54 | if [[ ! -f ${IMAGE_TIMESTAMP_FILE} ||
55 | ($(cat "${IMAGE_TIMESTAMP_FILE}") -le $(date +%s -d '-3 hours')) ]]; then
56 | IMAGE_FILE="${IMAGE_DIR}/${IMAGE_NAME}.sqsh"
57 | if [[ -f ${IMAGE_FILE} ]]; then
58 | rm "${IMAGE_DIR}/${IMAGE_NAME}.sqsh"
59 | fi
60 |
61 | COMMAND=(enroot import \
62 | --output "${IMAGE_DIR}/${IMAGE_NAME}.sqsh" \
63 | -- "${URL}")
64 |
65 | "${COMMAND[@]}" || die "Command: ${COMMAND[*]} failed with exit code ${?}"
66 | date +%s > "${IMAGE_TIMESTAMP_FILE}"
67 | fi
68 | flock -u 100
69 |
70 | # Create a container root filesystem from a container image
71 | COMMAND=(
72 | enroot create \
73 | --name "${CONTAINER_NAME}" \
74 | -- "${IMAGE_DIR}/${IMAGE_NAME}.sqsh"
75 | )
76 | "${COMMAND[@]}" || die "Command: ${COMMAND[*]} failed with exit code ${?}"
77 | else
78 | echo -e "Reusing container ${CONTAINER_NAME}"
79 | fi
80 |
81 |
82 | # List all the container root filesystems on the system.
83 | enroot list --fancy
84 |
--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # https://docs.gitlab.com/runner/executors/custom.html#run
3 |
4 |
5 | # shellcheck source=./include.sh
6 | source "${BASH_SOURCE[0]%/*}/include.sh"
7 |
8 |
9 | ensure_executable_available enroot
10 |
11 |
12 | # External args
13 | # Last argument is always the step name. The one before last is the step script
14 | before_last=$(($#-1))
15 | STEP_NAME_ARG="${!#}"
16 | STEP_SCRIPT_ARG="${!before_last}"
17 |
18 | if [[ "${STEP_NAME_ARG}" == "step_script" || "${STEP_NAME_ARG}" == "build_script" ]]; then
19 | echo -e "VOLUMES configuration:"
20 | printf "\t%s\n" "${ENROOT_MOUNT_OPTIONS[@]}"
21 | echo -e "\n"
22 | fi
23 |
24 | # No slurm requested or required, directly use the login node
25 | if [[ -z "${USE_SLURM}" || ${USE_SLURM} -eq 0 ||
26 | # All scripts from after_script onward are executed on the login node
27 | # see https://docs.gitlab.com/runner/executors/custom.html#run
28 | "${STEP_NAME_ARG}" == "after_script" ||
29 | "${STEP_NAME_ARG}" == "cleanup_file_variables" ||
30 | "${STEP_NAME_ARG}" == *"archive"* ||
31 | "${STEP_NAME_ARG}" == *"upload_artifacts"* ]]; then
32 | # Enroot fails when quoting anything or splitting this command. Leave it in
33 | # this format.
34 | #
35 | # shellcheck disable=SC2206
36 | COMMAND=(enroot start ${ENROOT_REMAP_ROOT} ${ENROOT_MOUNT_OPTIONS[*]} --rw ${ENROOT_ENV_CONFIG[*]} -e "NVIDIA_VISIBLE_DEVICES=void" ${CONTAINER_NAME} /bin/bash)
37 | "${COMMAND[@]}" < "${STEP_SCRIPT_ARG}" || die "Command: ${COMMAND[*]} failed with exit code ${?}"
38 | else # SLURM usage requested
39 | ensure_executable_available sacct
40 | ensure_executable_available scancel
41 | ensure_executable_available sbatch
42 | ensure_executable_available srun
43 | ensure_executable_available squeue
44 | ensure_executable_available wc
45 | ensure_executable_available awk
46 |
47 | # We need to create the temporary files in a directory that is accessible
48 | # from all nodes. Because we consider ${CONTAINER_NAME} to be unique,
49 | # we use it as the storage location for this job.
50 | WORK_DIR="${CI_WS}/${CONTAINER_NAME}"
51 | STEP_SCRIPT_DIR="${WORK_DIR}/step_scripts"
52 | if [[ ! -d "${WORK_DIR}" ]]; then
53 | mkdir -p "${WORK_DIR}"
54 | fi
55 | if [[ ! -d "${STEP_SCRIPT_DIR}" ]]; then
56 | mkdir -p "${STEP_SCRIPT_DIR}"
57 | fi
58 | NUM_SCRIPTS="$(find "${STEP_SCRIPT_DIR}" -maxdepth 1 -type f | wc -l)"
59 | STEP_SCRIPT="${STEP_SCRIPT_DIR}/${NUM_SCRIPTS}"
60 | touch "${STEP_SCRIPT}"
61 | JOB_SCRIPT=$(mktemp -p "${WORK_DIR}")
62 |
63 | # Save the step script
64 | cp "${STEP_SCRIPT_ARG}" "${STEP_SCRIPT}"
65 |
66 | # Only store the gitlab scripts until we reach the main {build,step}_script
67 | if [[ ! "${STEP_NAME_ARG}" =~ ^[bs].*"_script" ]]; then
68 | echo -e "Storing the script for step ${STEP_NAME_ARG} for bulk submission."
69 | exit
70 | fi
71 |
72 | # We finally reached the main script, prepare the SLURM job
73 | JOB_LOG=$(mktemp -p "${WORK_DIR}")
74 | JOB_ERR=$(mktemp -p "${WORK_DIR}")
75 | SLURM_CONFIG=("--job-name=${CONTAINER_NAME}")
76 | SLURM_CONFIG+=("--output=${JOB_LOG}")
77 | SLURM_CONFIG+=("--error=${JOB_ERR}")
78 | SLURM_CONFIG+=("--chdir=${WORK_DIR}")
79 | if [[ -n "${CUSTOM_ENV_SLURM_PARTITION}" ]]; then
80 | SLURM_CONFIG+=("--partition=${CUSTOM_ENV_SLURM_PARTITION}")
81 | fi
82 | if [[ -n "${CUSTOM_ENV_SLURM_EXCLUSIVE}" ]]; then
83 | SLURM_CONFIG+=("--exclusive")
84 | fi
85 | if [[ -n "${CUSTOM_ENV_SLURM_TIME}" ]]; then
86 | SLURM_CONFIG+=("--time=${CUSTOM_ENV_SLURM_TIME}")
87 | fi
88 | if [[ -n "${CUSTOM_ENV_SLURM_GRES}" ]]; then
89 | SLURM_CONFIG+=("--gres=${CUSTOM_ENV_SLURM_GRES}")
90 | fi
91 | if [[ -n "${CUSTOM_ENV_SLURM_ACCOUNT}" ]]; then
92 | SLURM_CONFIG+=("--account=${CUSTOM_ENV_SLURM_ACCOUNT}")
93 | fi
94 |
95 | # Log the configuration
96 | echo -e "SLURM configuration:"
97 | printf "\t%s\n" "${SLURM_CONFIG[@]}"
98 | echo -e "\n"
99 | echo -e "ENROOT environment configuration:"
100 | printf "\t%s\n" "${ENROOT_ENV_CONFIG[@]}"
101 | echo -e "\n"
102 |
103 |
104 | # Launch the container through slurm
105 | # Note: somehow this script breaks if the variables are surrounded by double quotes, so leave them unquoted.
106 | echo -e "#!/usr/bin/env bash
107 |
108 | for scriptnum in \$(ls -1v ${STEP_SCRIPT_DIR}); do
109 | srun enroot start ${ENROOT_REMAP_ROOT} ${ENROOT_MOUNT_OPTIONS[*]} --rw ${ENROOT_ENV_CONFIG[*]} \
110 | ${CONTAINER_NAME} /bin/bash < ${STEP_SCRIPT_DIR}/\${scriptnum}
111 | done
112 | " > "${JOB_SCRIPT}"
113 | chmod +x "${JOB_SCRIPT}"
114 |
115 |
116 | # Submission
117 | # shellcheck disable=SC2206
118 | COMMAND=(sbatch --parsable ${SLURM_CONFIG[*]} ${JOB_SCRIPT})
119 | JOB_ID=$("${COMMAND[@]}") || \
120 | die "Command: ${COMMAND[*]} failed with exit code ${?}" "${WORK_DIR}"
121 | echo -e "Job submitted and pending with ID: ${JOB_ID}."
122 | squeue -u "${USER}"
123 |
124 | # Store the JOB_ID so `cleanup.sh` can read it and cancel the job if running
125 | # (e.g., when pressing the cancel button on gitlab). We consider that the
126 | # CONTAINER_NAME is unique at a given time, so we don't use locking or a list
127 | # of ids.
128 | echo "${JOB_ID}" > "${SLURM_IDS_PATH}/${CONTAINER_NAME}.txt"
129 |
130 | slurm_wait_for_status "${JOB_ID}" "${SLURM_PENDING_LIMIT}" \
131 | "${SLURM_GOOD_PENDING_STATUS}" || die "encountered an error while waiting" \
132 | "${WORK_DIR}" "${JOB_ID}" "${JOB_LOG}" "${JOB_ERR}"
133 |
134 | echo -e "Job ${JOB_ID} started execution."
135 | slurm_wait_for_status "${JOB_ID}" "${SLURM_RUNNING_LIMIT}" \
136 | "${SLURM_GOOD_COMPLETED_STATUS}" || die "encountered an error while waiting" \
137 | "${WORK_DIR}" "${JOB_ID}" "${JOB_LOG}" "${JOB_ERR}"
138 |
139 | test -f "${JOB_ERR}" && test "$(cat "${JOB_ERR}")" != "" && \
140 | die "encountered an error during execution" "${WORK_DIR}" "${JOB_ID}" "${JOB_LOG}" "${JOB_ERR}"
141 |
142 | echo -e "Job ${JOB_ID} completed."
143 | slurm_print_output "${JOB_ID}" "Log" "${JOB_LOG}" /dev/stdout
144 | slurm_print_output "${JOB_ID}" "Errors" "${JOB_ERR}" /dev/stdout
145 |
146 | # Cleanup the workdir
147 | rm -rf "${WORK_DIR}"
148 | fi
149 |
--------------------------------------------------------------------------------
/slurm_utils.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Sections:
3 | # 1. General standalone utility functions
4 | # 2. SLURM variables management (requires previous utility functions)
5 | # 3. Main SLURM loop waiting function (relies on all utilities)
6 |
7 |
8 | #####
9 | ## 1. Standalone utility functions
10 | #####
11 |
12 | # Convenient exit function
13 | # This has multiple modes builtin depending on the arguments given:
14 | # + No argument: only exit, no message or anything printed
15 | # + One argument: exit with a message
16 | # + Two arguments: exit with a message and delete the temporary workdir (useful for
17 | # intermediate SLURM code)
18 | # + More arguments: full SLURM management: cancel any ongoing job, print the job logs, ...
19 | function die() {
20 | # External arguments
21 | local msg="${1}"
22 | # Extra arguments in the SLURM cases
23 | local workdir="${2}"
24 | local jobid=${3}
25 | local joblog="${4}"
26 | local joberr="${5}"
27 |
28 | if [[ -n "${jobid}" ]]; then
29 | msg="${jobid}: ${msg}"
30 | fi
31 | test -n "${msg}" && echo -e "${msg}" > /dev/stderr
32 | test -n "${jobid}" && scancel --quiet "${jobid}"
33 | test -n "${joblog}" && slurm_print_output "${jobid}" "Log" "${joblog}" /dev/stderr
34 | test -n "${joberr}" && slurm_print_output "${jobid}" "Errors" "${joberr}" /dev/stderr
35 | test -n "${workdir}" && test -d "${workdir}" && rm -rf "${workdir}"
36 | # Inform cleanup.sh that we encountered an error
37 | # touch "${CUSTOM_ENV_CI_WS}/${CUSTOM_ENV_CI_JOB_ID}"
38 | exit "${BUILD_FAILURE_EXIT_CODE}"
39 | }
40 |
41 |
42 | # Prints a SLURM job output file to $output
43 | # Does nothing if the file is empty
44 | function slurm_print_output() {
45 | # External arguments
46 | local jobid=${1}
47 | local logtype="${2}"
48 | local slurmlogfile="${3}"
49 | local output=${4}
50 |
51 | if [[ ! -f "${slurmlogfile}" || "$(cat "${slurmlogfile}")" == "" ]]; then
52 | return 0
53 | fi
54 | {
55 | echo -e "== SLURM Job ${jobid} ${logtype}"
56 | echo -e "============================"
57 | cat "${slurmlogfile}"
58 | } >> "${output}"
59 | }
60 |
61 |
62 | # Uses awk to convert a string of the form d-hh:min:s and all combinations to
63 | # seconds.
64 | # The result is returned via a simple echo.
65 | function slurm_time_to_seconds() {
66 | # Parameters
67 | local slurm_time="$1"
68 |
69 | # Local variables
70 | local num_colons="${slurm_time//[!:]/}"
71 | local num_dashes="${slurm_time//[!-]/}"
72 | num_colons=${#num_colons}
73 | num_dashes=${#num_dashes}
74 | local running_limit=0
75 | # We use awk to split the string into sub components. The fields are
76 | # available in $1, $2, etc. If a field, e.g. $3, doesn't exist, its value is 0 so
77 | # optional components at the end of the expression are taken care of
78 | # naturally. We need different cases for optional components at the
79 | # beginning of the expression.
80 | if [[ ${num_dashes} == 1 ]]; then # Suppose d-hh(:min(:s)) where parenthesis show optional components
81 | running_limit=$(echo "${slurm_time}" | awk -F[-:] '{ print ($1 * 86400) + ($2 * 3600) + ($3 * 60) + $4 }')
82 | elif [[ ${num_colons} == 2 ]]; then # Suppose hh:min:s
83 | running_limit=$(echo "${slurm_time}" | awk -F: '{ print ($1 * 3600) + ($2 * 60) + $3 }')
84 | elif [[ ${num_colons} == 1 || ${num_colons} == 0 ]]; then # Suppose min(:s)
85 | running_limit=$(echo "${slurm_time}" | awk -F: '{ print ($1 * 60) + $2 }')
86 | else
87 | return 1
88 | fi
89 | echo "${running_limit}"
90 | }
91 |
92 |
93 | #####
94 | ## 2. Variables which control the SLURM waiting loop's behavior
95 | #####
96 | export SLURM_GOOD_COMPLETED_STATUS="COMPLETED"
97 | export SLURM_GOOD_PENDING_STATUS="@(COMPLETED|COMPLETING|RUNNING)*"
98 | export SLURM_BAD_STATUS="@(FAILED|TIMEOUT|OUT_OF_MEMORY|REVOKED|NODE_FAIL|CANCELLED|BOOT_FAIL)*"
99 |
100 | export SLURM_UPDATE_INTERVAL=${CUSTOM_ENV_SLURM_UPDATE_INTERVAL:-120} # 2 minutes
101 | export SLURM_PENDING_LIMIT=${CUSTOM_ENV_SLURM_PENDING_LIMIT:-43200} # 12 hours
102 | SLURM_RUNNING_LIMIT=86400 # 24 hours
103 | if [[ -n ${CUSTOM_ENV_SLURM_TIME} ]]; then
104 | SLURM_RUNNING_LIMIT=$(slurm_time_to_seconds "${CUSTOM_ENV_SLURM_TIME}" || \
105 | die "Couldn't understand the time format ${CUSTOM_ENV_SLURM_TIME}.")
106 | fi
107 | export SLURM_RUNNING_LIMIT
108 |
109 |
110 | #####
111 | ## 3. SLURM waiting loop function
112 | #####
113 |
114 | # A simple waiting loop for a specific SLURM job based on its status.
115 | # Error conditions:
116 | # 1. We waited past the waiting limit
117 | # 2. The job status is one of ${SLURM_BAD_STATUS}.
118 | function slurm_wait_for_status() {
119 | # Get External params
120 | local jobid=${1}
121 | local waiting_limit=${2}
122 | local good_status_expr="${3}"
123 |
124 | # Internal variables
125 | local keep_waiting=1
126 | local waiting_time=0
127 | local jobstatus=""
128 |
129 | echo -e ""
130 | while [[ $keep_waiting == 1 ]]; do
131 | jobstatus="$(sacct -bn -j "${jobid}" | head -1 | tr -s ' ' | cut -d' ' -f 2)"
132 | if [[ $waiting_time -gt $waiting_limit ]]; then
133 | echo -e "\nJob ${jobid} has exceeded the waiting limit\
134 | of ${waiting_limit}." > /dev/stderr
135 | return 1
136 | fi
137 | # We need extglob for the expression variable based cases to work
138 | shopt -s extglob
139 | # shellcheck disable=SC2254
140 | case ${jobstatus} in
141 | ${good_status_expr})
142 | keep_waiting=0
143 | ;;
144 | ${SLURM_BAD_STATUS})
145 | echo -e ""
146 | return 1
147 | ;;
148 | *)
149 | echo -n "."
150 | sleep "$SLURM_UPDATE_INTERVAL"
151 | waiting_time=$((waiting_time + SLURM_UPDATE_INTERVAL))
152 | ;;
153 | esac
154 | shopt -u extglob
155 | done
156 | echo -e ""
157 | }
158 |
--------------------------------------------------------------------------------