├── .gitignore ├── .gitmodules ├── Makefile ├── README.md ├── buildout.sh ├── cloud_monitor.py3 ├── grafana ├── Dockerfile ├── dashboards.yaml ├── grafana.ini └── influxdb.yaml ├── influxdb ├── Dockerfile ├── influxdb.conf └── setup.sh ├── keycloak ├── Dockerfile └── startup.sh ├── open-ondemand ├── Dockerfile ├── entrypoint.sh ├── ood_portal.yml ├── sbatch ├── scaleout.yaml ├── scancel ├── scontrol └── squeue ├── proxy ├── Dockerfile ├── auth.php ├── nginx.conf ├── validate.php └── www.conf ├── scaleout ├── Dockerfile ├── bashrc ├── benchmark │ ├── compile.d │ │ └── utmost.sh │ └── run.d │ │ └── utmost.sh ├── dump_xdmod.sh ├── enroot.conf ├── get_keycloak_jwt.sh ├── globals.local ├── lab_scripts │ ├── PrologSlurmctld.sh │ ├── alloc.c │ ├── arrayjob.py │ ├── associations.json │ ├── canceljob.py │ ├── cgroup.conf │ ├── cleandemos.sh │ ├── cluster.cfg │ ├── dataprocessingjob.sh │ ├── dataprocessingprogram.sh │ ├── demo1.sh │ ├── demo2.sh │ ├── demo3.sh │ ├── depjob.py │ ├── eatmem.c │ ├── gen_jwt.py │ ├── gethostname.sh │ ├── hetjob.py │ ├── job.json │ ├── job.py │ ├── job_submit.lua │ ├── job_submit_spank.lua │ ├── make_notes.sh │ ├── memalloc.c │ ├── memalloc_with_sleep.c │ ├── myprogram.c │ ├── myslurmarray.sh │ ├── node_exporter.service │ ├── pi.c │ ├── plugstack.conf │ ├── prolog.sh │ ├── prometheus-slurm-exporter.service │ ├── prometheus.service │ ├── prometheus.update-1.yml │ ├── prometheus.update-2.yml │ ├── prometheus.yml │ ├── qos.sh │ ├── renice.c │ ├── restart.sh │ ├── sacct.py │ ├── sdiag.py │ ├── showjob.py │ ├── test.sh │ ├── testaccount.json │ ├── testping.sh.fred │ ├── testping.sh.pebbles │ ├── testuser.json │ ├── topology.conf │ ├── verify_jwt.py │ └── whereami.c ├── login.startup.sh ├── msmtprc ├── munge.service ├── my.cnf ├── mysql ├── patch.d │ └── .gitkeep ├── podman-containers │ ├── containers.conf │ ├── policy.json │ ├── registries.conf │ └── storage.conf ├── postfix.service ├── profile.sh ├── resume.node.sh ├── sackd.check.sh ├── sackd.service ├── slurm.bash_profile ├── slurm │ ├── acct_gather.conf │ ├── cgroup.conf │ ├── gres.conf │ ├── job_container.conf │ ├── nodes.conf │ ├── oci.conf │ ├── plugstack.conf │ ├── plugstack.conf.d │ │ └── README │ ├── scrun.lua │ ├── slurm.conf │ ├── slurm.jwt.conf │ ├── slurmdbd.conf │ └── staging.lua ├── slurmctld.service ├── slurmctld.startup.sh ├── slurmctld.startup2.sh ├── slurmd.check.sh ├── slurmd.service ├── slurmd.slice ├── slurmd.startup.sh ├── slurmdbd.service ├── slurmdbd.startup.sh ├── slurmrestd.env ├── slurmrestd.service ├── slurmrestd.startup.sh ├── startup.sh ├── suspend.node.sh ├── test-build.sh ├── testsuite.conf ├── tls_gen_csr.sh ├── tls_get_node_cert_key.sh ├── tls_get_node_token.sh ├── tls_sign_csr.sh ├── tls_validate_node.sh └── valgrind.patch ├── sql_server ├── Dockerfile └── my.cnf └── xdmod ├── Dockerfile ├── resource_specs.json ├── resources.json └── startup.sh /.gitignore: -------------------------------------------------------------------------------- 1 | docker-compose.yml 2 | scaleout/hosts.nodes 3 | scaleout/nodelist 4 | scaleout/patch.d/*.patch 5 | cloud_socket 6 | scaleout.tar 7 | federation 8 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naterini/docker-scale-out/40c8a775c6e5cbb9461eb1cd218dbf9b3f1fe3e3/.gitmodules -------------------------------------------------------------------------------- /Makefile: 
-------------------------------------------------------------------------------- 1 | HOST ?= login 2 | BUILD ?= up --build --remove-orphans -d 3 | DC ?= $(shell docker compose version 2>&1 >/dev/null && echo "docker compose" || echo "docker-compose") 4 | IMAGES := $(shell $(DC) config | awk '{if ($$1 == "image:") print $$2;}' | sort | uniq) 5 | SUBNET ?= 10.11 6 | SUBNET6 ?= 2001:db8:1:1:: 7 | 8 | .EXPORT_ALL_VARIABLES: 9 | 10 | default: ./docker-compose.yml run 11 | 12 | ./docker-compose.yml: buildout.sh 13 | bash buildout.sh > ./docker-compose.yml 14 | 15 | build: ./docker-compose.yml 16 | env COMPOSE_HTTP_TIMEOUT=3000 $(DC) --ansi=never --progress=plain $(BUILD) 17 | 18 | stop: 19 | $(DC) down 20 | 21 | set_nocache: 22 | $(eval BUILD := build --no-cache) 23 | 24 | nocache: set_nocache build 25 | 26 | clean-nodelist: 27 | truncate -s0 scaleout/nodelist 28 | 29 | clean: 30 | test -f ./docker-compose.yml && ($(DC) kill -s SIGKILL; $(DC) down --remove-orphans -t1 -v; unlink ./docker-compose.yml) || true 31 | [ -f cloud_socket ] && unlink cloud_socket || true 32 | 33 | uninstall: 34 | $(DC) down --rmi all --remove-orphans -t1 -v 35 | $(DC) rm -v 36 | 37 | run: ./docker-compose.yml 38 | $(DC) up --remove-orphans -d 39 | 40 | cloud: 41 | test -f cloud_socket && unlink cloud_socket || true 42 | touch cloud_socket 43 | test -f ./docker-compose.yml && unlink ./docker-compose.yml || true 44 | env CLOUD=1 bash buildout.sh > ./docker-compose.yml 45 | python3 ./cloud_monitor.py3 "$(DC)" 46 | test -f ./docker-compose.yml && unlink ./docker-compose.yml || true 47 | test -f cloud_socket && unlink cloud_socket || true 48 | 49 | bash: 50 | $(DC) exec $(HOST) /bin/bash 51 | 52 | save: build 53 | docker save -o scaleout.tar $(IMAGES) 54 | 55 | load: 56 | docker load -i scaleout.tar 57 | 58 | benchmark-%: clean-nodelist clean 59 | $(eval SLURM_BENCHMARK := $(subst benchmark-,,$@)) 60 | env SLURM_BENCHMARK=$(SLURM_BENCHMARK) bash buildout.sh > ./docker-compose.yml 61 | env COMPOSE_HTTP_TIMEOUT=3000 $(DC) --ansi=never --progress=plain $(BUILD) 62 | $(DC) up --remove-orphans -d 63 | $(DC) exec $(HOST) bash -c '(find /root/benchmark/run.d/ -type f -name $(SLURM_BENCHMARK)\*.sh | xargs -i echo bash "{} &"; echo wait) | bash -x' 64 | $(DC) down &>/dev/null 65 | truncate -s0 scaleout/nodelist 66 | 67 | test-build: clean-nodelist clean build 68 | $(DC) exec $(HOST) bash /usr/local/bin/test-build.sh 69 | test -f ./docker-compose.yml && ($(DC) kill -s SIGKILL; $(DC) down --remove-orphans -t1 -v; unlink ./docker-compose.yml) || true 70 | [ -f cloud_socket ] && unlink cloud_socket || true 71 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # slurm-docker-scaleout 2 | Docker compose cluster for testing Slurm 3 | 4 | ## Prerequisites 5 | * docker (25.x.x+ with cgroupsv2 or 24.x.x with cgroupsv1) 6 | * IPv6 must be configured in docker: https://docs.docker.com/config/daemon/ipv6/ 7 | * docker-compose-plugin v2.18.1+ 8 | * ssh (client) 9 | * jq 10 | * python3 11 | * python3-daemon 12 | 13 | ## Changes needed in sysctl.conf: 14 | ``` 15 | net.ipv4.tcp_max_syn_backlog=4096 16 | net.core.netdev_max_backlog=1000 17 | net.core.somaxconn=15000 18 | 19 | # Force gc to clean-up quickly 20 | net.ipv4.neigh.default.gc_interval = 3600 21 | 22 | # Set ARP cache entry timeout 23 | net.ipv4.neigh.default.gc_stale_time = 3600 24 | 25 | # Setup DNS threshold for arp 26 | net.ipv4.neigh.default.gc_thresh3 = 8096 
27 | net.ipv4.neigh.default.gc_thresh2 = 4048
28 | net.ipv4.neigh.default.gc_thresh1 = 1024
29 |
30 | # Increase map count for elasticsearch
31 | vm.max_map_count=262144
32 |
33 | # Avoid running out of file descriptors
34 | fs.file-max=10000000
35 | fs.inotify.max_user_instances=65535
36 | fs.inotify.max_user_watches=1048576
37 |
38 | #Request kernel max number of cgroups
39 | fs.inotify.max_user_instances=65535
40 | ```
41 |
42 | ## Docker configuration required with cgroupsv2
43 |
44 | Make sure the host machine is running CgroupV2 and not hybrid mode:
45 | https://slurm.schedmd.com/faq.html#cgroupv2
46 |
47 | Add these settings to the docker configuration: /etc/docker/daemon.json
48 | ```
49 | {
50 |   "exec-opts": [
51 |     "native.cgroupdriver=systemd"
52 |   ],
53 |   "features": {
54 |     "buildkit": true
55 |   },
56 |   "experimental": true,
57 |   "cgroup-parent": "docker.slice",
58 |   "default-cgroupns-mode": "host",
59 |   "storage-driver": "overlay2"
60 | }
61 | ```
62 |
63 | Configure systemd to allow docker to run in its own slice, to avoid systemd
64 | conflicting with it:
65 |
66 | /etc/systemd/system/docker.slice:
67 | ```
68 | [Unit]
69 | Description=docker slice
70 | Before=slices.target
71 | [Slice]
72 | CPUAccounting=true
73 | CPUWeight=idle
74 | CPUQuota=90%
75 | MemoryAccounting=true
76 | MemoryMax=90%
77 | IOAccounting=true
78 | IOWeight=1
79 | ```
80 |
81 | /etc/systemd/system/docker.service.d/local.conf:
82 | ```
83 | [Unit]
84 | After=docker.slice
85 | Requires=docker.slice
86 | [Service]
87 | Slice=docker.slice
88 | ```
89 |
90 | /usr/lib/systemd/system/docker.service.d/local.conf:
91 | ```
92 | [Service]
93 | LimitNOFILE=infinity
94 | LimitNPROC=infinity
95 | LimitCORE=infinity
96 | TasksMax=infinity
97 | Delegate=yes
98 | ```
99 |
100 | Activate the changes:
101 | ```
102 | make clean
103 | sudo systemctl daemon-reload
104 | sudo systemctl restart docker.slice docker.service
105 | ```
106 |
107 | Verify docker.slice is being used by docker:
108 | ```
109 | make
110 | sudo systemctl status docker.slice docker.service
111 | ```
112 | The container processes should now show up in the docker.slice process tree.
113 |
114 | ## Basic Architecture
115 |
116 | Maria Database Node:
117 | * db
118 |
119 | Slurm Management Nodes:
120 | * mgmtnode
121 | * mgmtnode2
122 | * slurmdbd
123 |
124 | Compute Nodes:
125 | * node[00-09]
126 |
127 | Login Nodes:
128 | * login
129 |
130 | Nginx Proxy node:
131 | * proxy
132 |
133 | Rest API Nodes:
134 | * rest
135 |
136 | Kibana (Only supports IPv4):
137 | * View http://127.0.0.1:5601/
138 |
139 | Elasticsearch:
140 | * View http://localhost:9200/
141 |
142 | Grafana:
143 | * View http://localhost:3000/
144 | * User: admin
145 | * Password: admin
146 |
147 | Open On-Demand:
148 | * View http://localhost:8081/
149 | * User: {user name - "fred" or "wilma"}
150 | * Password: password
151 |
152 | Open XDMoD:
153 | * View http://localhost:8082/
154 |
155 | Proxy:
156 | * Auth REST API http://localhost:8080/auth
157 | * Query REST API http://localhost:8080/slurm/
158 |
159 | Keycloak:
160 | * Admin Console: http://127.0.0.1:8083/
161 | * User: admin
162 | * Password: password
163 |
164 | ## Multiple Instances
165 | Each cluster must have a unique class B subnet.
166 |
167 | Default IPv4 is SUBNET="10.11".
168 | Default IPv6 is SUBNET6="2001:db8:1:1::".
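For example, a second instance could be brought up from its own checkout with custom subnets (the values below are illustrative only; pick any unused class B range):

```
export SUBNET="10.12"
export SUBNET6="2001:db8:1:2::"
make clean
make build
make
```

Note that buildout.sh only publishes most host port mappings (Grafana, Kibana, the proxy, etc.) when the default SUBNET is used, so additional instances are reached via their container IPs.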
169 |
170 | ## Custom Nodes
171 |
172 | Custom node lists may be provided by setting NODELIST to point to a file
173 | containing a list of nodes for the cluster, or by modifying the default generated
174 | "nodelist" file in the scaleout directory.
175 |
176 | The node list uses the following format, with one node per line:
177 | > ${HOSTNAME} ${CLUSTERNAME} ${IPv4} ${IPv6}
178 |
179 | Example line:
180 | > node00 scaleout 10.11.5.0 2001:db8:1:1::5:0
181 |
182 | Note that the service nodes cannot be changed and will always be placed into
183 | the following subnets:
184 | > ${SUBNET}.1.0/24
185 | > ${SUBNET6}1:0/122
186 |
187 | ## Custom Slurm version
188 |
189 | To specify an explicit version of Slurm to be compiled and installed:
190 | > export SLURM_RELEASE=slurm-$version
191 |
192 | Make sure to call `make clean` afterwards to invalidate all caches from the
193 | prior release.
194 |
195 | ## To build images
196 |
197 | ```
198 | git submodule update --init --force --remote --recursive
199 | make build
200 | ```
201 |
202 | ## To run:
203 |
204 | ```
205 | make
206 | ```
207 |
208 | ## To build and run in Cloud mode:
209 |
210 | ```
211 | make clean
212 | make cloud
213 | ```
214 |
215 | Note: cloud mode will run in the foreground.
216 |
217 | ## To build without caching:
218 |
219 | ```
220 | make nocache
221 | ```
222 |
223 | ## To stop:
224 |
225 | ```
226 | make stop
227 | ```
228 |
229 | ## To reverse all changes:
230 |
231 | ```
232 | make clean
233 | ```
234 |
235 | ## To remove all images:
236 |
237 | ```
238 | make uninstall
239 | ```
240 |
241 | ## To control:
242 |
243 | ```
244 | make bash
245 | make HOST=node1 bash
246 | ```
247 |
248 | ## To log in via ssh
249 | ```
250 | ssh-keygen -f "/home/$(whoami)/.ssh/known_hosts" -R "10.11.1.5" 2>/dev/null
251 | ssh -o StrictHostKeyChecking=no -l fred 10.11.1.5 -X #use 'password'
252 | ```
253 |
254 | ## Federation Mode
255 |
256 | Federation mode creates multiple Slurm clusters, each with its own nodes and slurmctld
257 | daemons. Other nodes, such as login and slurmdbd, are shared.
258 |
259 | To create multiple federation clusters:
260 | ```
261 | export FEDERATION="taco burrito quesadilla"
262 | echo "FederationParameters=fed_display" >> scaleout/slurm/slurm.conf
263 | truncate -s0 scaleout/nodelist
264 | make clean
265 | make build
266 | make
267 | ```
268 |
269 | Configure Slurm for multiple federation clusters:
270 | ```
271 | make HOST=quesadilla-mgmtnode bash
272 | sacctmgr add federation scaleout clusters=taco,burrito,quesadilla
273 | ```
274 |
275 | ### Activate Federation mode in Slurm
276 |
277 | Notify slurmdbd to use federation after building the cluster:
278 | ```
279 | export FEDERATION="taco burrito quesadilla"
280 | make HOST=taco-mgmtnode bash
281 | sacctmgr add federation scaleout cluster=taco,burrito,quesadilla
282 | ```
283 |
284 | ### Deactivate Federation mode
285 |
286 | ```
287 | export FEDERATION="taco burrito quesadilla"
288 | make uninstall
289 | truncate -s0 scaleout/nodelist
290 | ```
291 |
292 | ## Caveats
293 |
294 | The number of CPU threads on the host is multiplied by the number of nodes. Do not attempt to run computationally intensive applications.
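A quick way to see this effect (a sketch; assumes the cluster is already running):

```
make bash                        # shell on the login container
sinfo --Node --format="%N %c"    # every node advertises the full host thread count
```

With, say, 16 host threads and 10 compute nodes, Slurm schedules against roughly 160 CPUs that all share the same 16 physical threads.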
295 |
296 | ## Docker work-arounds:
297 |
298 | ```
299 | ERROR: Pool overlaps with other one on this address space
300 | ```
301 | or
302 | ```
303 | failed to prepare ${HASH}: max depth exceeded
304 | ERROR: Service 'slurmdbd' failed to build : Build failed
305 | ```
306 | If either error appears, run:
307 | ```
308 | make clean
309 | docker network prune -f
310 | sudo systemctl restart docker
311 | ```
312 |
313 | ## To save all images to ./scaleout.tar
314 |
315 | ```
316 | make save
317 | ```
318 |
319 | ## To load a saved copy of all images
320 |
321 | ```
322 | make load
323 | ```
324 |
325 | ## To test building
326 |
327 | ```
328 | git submodule update --init --force --remote --recursive
329 | make test-build
330 | ```
331 |
332 | ## How to trigger a manual xdmod data dump:
333 |
334 | ```
335 | make HOST=scaleout_mgmtnode_1 bash
336 | bash /etc/cron.hourly/dump_xdmod.sh
337 | exit
338 | make bash
339 | exec bash /etc/cron.hourly/dump_xdmod.sh
340 | make HOST=xdmod bash
341 | sudo -u xdmod -- /usr/bin/xdmod-shredder -r scaleout -f slurm -i /xdmod/data.csv
342 | sudo -u xdmod -- /usr/bin/xdmod-ingestor
343 | exit
344 | ```
345 |
346 | ## How to disable building the xdmod container
347 |
348 | This will only disable attempts to build and start the container.
349 |
350 | ```
351 | export DISABLE_XDMOD=1
352 | ```
353 |
354 | ## How to disable building gdb
355 |
356 | This will only disable attempts to build gdb from source.
357 |
358 | ```
359 | export DISABLE_GDB_BUILD=1
360 | ```
361 |
362 | ## How to disable building enroot+pyxis
363 |
364 | This will only disable attempts to build enroot and pyxis from source.
365 |
366 | ```
367 | export DISABLE_PYXIS=1
368 | ```
369 |
370 | ## Maxing out kernel cgroups total
371 |
372 | The Linux kernel has a hard limit of 65535 cgroups total. Stacking a large number
373 | of jobs or scaleout instances may result in the following error:
374 |
375 | ```
376 | error: proctrack_g_create: No space left on device
377 | ```
378 |
379 | When this happens, fewer jobs must be run, as this is a kernel limitation.
380 |
-------------------------------------------------------------------------------- /buildout.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | unset MAC
3 | [[ $OSTYPE == 'darwin'* ]] && MAC=1
4 |
5 | #only mount cgroups with v1
6 | #https://github.com/jepsen-io/jepsen/issues/532#issuecomment-1128067136
7 | [ !
-f /sys/fs/cgroup/cgroup.controllers ] && SYSDFSMOUNTS=" 8 | - /etc/localtime:/etc/localtime:ro 9 | - /run/ 10 | - /run/lock/ 11 | - /sys/:/sys/:ro 12 | - /sys/fs/cgroup/:/sys/fs/cgroup/:ro 13 | - /sys/fs/fuse/:/sys/fs/fuse/:rw 14 | - /tmp/ 15 | - /var/lib/journal 16 | " || SYSDFSMOUNTS=" 17 | - /etc/localtime:/etc/localtime:ro 18 | - /run/ 19 | - /run/lock/ 20 | - /sys/ 21 | - /sys/fs/cgroup/:/sys/fs/cgroup/:ro 22 | - /sys/fs/cgroup/docker.slice/:/sys/fs/cgroup/docker.slice/:rw 23 | - /sys/fs/fuse/:/sys/fs/fuse/:rw 24 | - /tmp/ 25 | - /var/lib/journal 26 | " 27 | 28 | CACHE_DESTROYER="$(find scaleout/patch.d -type f -name '*.patch' -print0 | sort -z | xargs -0 cat | sha256sum | cut -b1-20)" 29 | 30 | SLURM_RELEASE="${SLURM_RELEASE:-master}" 31 | DISTRO="almalinux:8" 32 | if [ -z "$SUBNET" -o "$SUBNET" = "10.11" ] 33 | then 34 | ES_PORTS=" 35 | ports: 36 | - 9200:9200 37 | " 38 | KIBANA_PORTS=" 39 | ports: 40 | - 5601:5601 41 | " 42 | PROXY_PORTS=" 43 | ports: 44 | - 8080:8080 45 | " 46 | GRAFANA_PORTS=" 47 | ports: 48 | - 3000:3000 49 | " 50 | ONDEMAND_PORTS=" 51 | ports: 52 | - 8081:80 53 | " 54 | XDMOD_PORTS=" 55 | ports: 56 | - 8082:80 57 | " 58 | KEYCLOAK_PORTS=" 59 | ports: 60 | - 8083:8080 61 | " 62 | else 63 | ES_PORTS= 64 | KIBANA_PORTS= 65 | PROXY_PORTS= 66 | GRAFANA_PORTS= 67 | XDMOD_PORTS= 68 | KEYCLOAK_PORTS= 69 | fi 70 | 71 | SUBNET=${SUBNET:-"10.11"} 72 | SUBNET6=${SUBNET6:-"2001:db8:1:1::"} 73 | NODELIST=${NODELIST:-"scaleout/nodelist"} 74 | 75 | if [ ! -z "$SLURM_BENCHMARK" ] 76 | then 77 | MYSQL_VOLUMES=" 78 | volumes: 79 | - type: tmpfs 80 | target: /var/lib/mysql 81 | " 82 | NODES_COUNT=100 83 | BUILD_ARGS=" 84 | SLURM_BENCHMARK: ${SLURM_BENCHMARK} 85 | " 86 | else 87 | MYSQL_VOLUMES="" 88 | NODES_COUNT=9 89 | BUILD_ARGS="" 90 | fi 91 | 92 | if [ ! -s "$NODELIST" -o "$SLURM_BENCHMARK" ] 93 | then 94 | if [ ! -z "$FEDERATION" ] 95 | then 96 | c_sub=5 97 | [ -f "$NODELIST" ] && unlink "$NODELIST" 2>&1 >/dev/null 98 | for c in $FEDERATION 99 | do 100 | #generate list nodes per cluster 101 | seq 0 $NODES_COUNT | while read i 102 | do 103 | echo "$(printf "$c-node%02d" $i) $c ${SUBNET}.${c_sub}.$((${i} + 10)) ${SUBNET6}${c_sub}:$((${i} + 10))" 104 | done >> $NODELIST 105 | 106 | c_sub=$((c_sub+1)) 107 | done 108 | else 109 | #generate list of 10 nodes 110 | seq 0 $NODES_COUNT | while read i 111 | do 112 | echo "$(printf "node%02d" $i) cluster ${SUBNET}.5.$((${i} + 10)) ${SUBNET6}5:$((${i} + 10))" 113 | done > $NODELIST 114 | fi 115 | fi 116 | 117 | unlink scaleout/hosts.nodes 118 | cat "$NODELIST" | while read name cluster ip4 ip6 119 | do 120 | [ ! -z "$ip4" ] && echo "$ip4 $name" >> scaleout/hosts.nodes 121 | [ ! 
-z "$ip6" ] && echo "$ip6 $name" >> scaleout/hosts.nodes 122 | done 123 | 124 | HOSTLIST=" extra_hosts: 125 | - \"db:${SUBNET}.1.3\" 126 | - \"db:${SUBNET6}1:3\" 127 | - \"slurmdbd:${SUBNET}.1.2\" 128 | - \"slurmdbd:${SUBNET6}1:2\" 129 | - \"login:${SUBNET}.1.5\" 130 | - \"login:${SUBNET6}1:5\" 131 | - \"rest:${SUBNET}.1.6\" 132 | - \"rest:${SUBNET6}1:6\" 133 | - \"proxy:${SUBNET}.1.7\" 134 | - \"proxy:${SUBNET6}1:7\" 135 | - \"es01:${SUBNET}.1.15\" 136 | - \"es01:${SUBNET6}1:15\" 137 | - \"es02:${SUBNET}.1.16\" 138 | - \"es02:${SUBNET6}1:16\" 139 | - \"es03:${SUBNET}.1.17\" 140 | - \"es03:${SUBNET6}1:17\" 141 | - \"kibana:${SUBNET}.1.18\" 142 | - \"kibana:${SUBNET6}1:18\" 143 | - \"influxdb:${SUBNET}.1.19\" 144 | - \"influxdb:${SUBNET6}1:19\" 145 | - \"grafana:${SUBNET}.1.20\" 146 | - \"grafana:${SUBNET6}1:20\" 147 | - \"open-ondemand:${SUBNET}.1.21\" 148 | - \"open-ondemand:${SUBNET6}1:21\" 149 | - \"xdmod:${SUBNET}.1.22\" 150 | - \"xdmod:${SUBNET6}1:22\" 151 | - \"keycloak:${SUBNET}.1.23\" 152 | - \"keycloak:${SUBNET6}1:23\" 153 | " 154 | 155 | if [ ! -z "$FEDERATION" ] 156 | then 157 | FIRST_CLUSTER="$(echo "$FEDERATION" | awk '{print $1}')" 158 | FIRST_MGMTNODE="${FIRST_CLUSTER}-mgmtnode" 159 | c_sub=5 160 | 161 | for c in $FEDERATION 162 | do 163 | HOSTLIST="${HOSTLIST} - \"${c}-mgmtnode:${SUBNET}.${c_sub}.1\""$'\n' 164 | HOSTLIST="${HOSTLIST} - \"${c}-mgmtnode:${SUBNET6}${c_sub}:1\""$'\n' 165 | HOSTLIST="${HOSTLIST} - \"${c}-mgmtnode2:${SUBNET}.${c_sub}.2\""$'\n' 166 | HOSTLIST="${HOSTLIST} - \"${c}-mgmtnode2:${SUBNET6}${c_sub}:2\""$'\n' 167 | 168 | c_sub=$((c_sub + 1)) 169 | done 170 | else 171 | FIRST_CLUSTER="cluster" 172 | FIRST_MGMTNODE="mgmtnode" 173 | HOSTLIST="${HOSTLIST} - \"mgmtnode:${SUBNET}.1.1\""$'\n' 174 | HOSTLIST="${HOSTLIST} - \"mgmtnode:${SUBNET6}1:1\""$'\n' 175 | HOSTLIST="${HOSTLIST} - \"mgmtnode2:${SUBNET}.1.4\""$'\n' 176 | HOSTLIST="${HOSTLIST} - \"mgmtnode2:${SUBNET6}1:4\""$'\n' 177 | fi 178 | 179 | LOGGING=" 180 | tty: true 181 | logging: 182 | driver: local 183 | cap_add: 184 | - SYS_PTRACE 185 | - SYS_ADMIN 186 | - MKNOD 187 | - SYS_NICE 188 | - SYS_RESOURCE 189 | security_opt: 190 | - seccomp:unconfined 191 | - apparmor:unconfined 192 | " 193 | 194 | XDMOD=" 195 | xdmod: 196 | build: 197 | context: ./xdmod 198 | network: host 199 | image: xdmod:latest 200 | environment: 201 | - SUBNET=\"${SUBNET}\" 202 | - SUBNET6=\"${SUBNET6}\" 203 | - container=docker 204 | hostname: xdmod 205 | command: [\"/sbin/startup.sh\"] 206 | networks: 207 | internal: 208 | ipv4_address: ${SUBNET}.1.22 209 | ipv6_address: ${SUBNET6}1:22 210 | volumes: 211 | $SYSDFSMOUNTS 212 | - xdmod:/xdmod/ 213 | $XDMOD_PORTS 214 | $LOGGING 215 | $HOSTLIST 216 | " 217 | 218 | if [ "$DISABLE_XDMOD" ] 219 | then 220 | XDMOD="" 221 | fi 222 | 223 | if [ "$CLOUD" ] 224 | then 225 | CLOUD_MOUNTS=" 226 | - type: bind 227 | source: $(readlink -e $(pwd)/cloud_socket) 228 | target: /run/cloud_socket 229 | " 230 | else 231 | CLOUD_MOUNTS="" 232 | fi 233 | # disable Linux specific options 234 | [ $MAC ] && LOGGING= 235 | 236 | cat < requested hostname 18 | dcompose = sys.argv[1] 19 | 20 | # Make sure the socket does not already exist 21 | try: 22 | os.unlink(server_address) 23 | except OSError: 24 | if os.path.exists(server_address): 25 | raise 26 | 27 | # Create a UDS socket 28 | sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) 29 | sock.bind(server_address) 30 | #allow anyone to write to the socket 31 | os.chmod(server_address, stat.S_IROTH | stat.S_IWOTH) 32 | 33 | # Listen for incoming 
connections 34 | sock.listen(1) 35 | 36 | os.system("%s up --remove-orphans --build --scale cloud=%s --no-recreate -d" % (dcompose, active_nodes)) 37 | 38 | while True: 39 | connection=None 40 | try: 41 | print('waiting for a connection', file=sys.stderr) 42 | connection, client_address = sock.accept() 43 | print('new connection', file=sys.stderr) 44 | 45 | connection.settimeout(10) 46 | data = connection.recv(4096).decode('utf-8').strip() 47 | connection.shutdown(socket.SHUT_RD) 48 | print('received "%s"' % (data), file=sys.stderr) 49 | if data: 50 | op = data.split(":", 1) 51 | if op[0] == "stop": 52 | tag=node_names[op[1]] 53 | 54 | os.system("docker rm -f \"%s\"" % (quote(tag))) 55 | node_names.pop(tag, None) 56 | connection.sendall(b'ACK') 57 | active_nodes -= 1 58 | elif op[0] == "start": 59 | #increase node count by 1 60 | requested_nodes.add(op[1]) 61 | active_nodes += 1 62 | os.system("%s up --scale cloud=%s --no-recreate -d" % (dcompose, active_nodes)) 63 | connection.sendall(b'ACK') 64 | elif op[0] == "whoami": 65 | found=False 66 | 67 | # already known hash 68 | for requested_node, short_node in node_names.items(): 69 | if short_node == op[1]: 70 | found=True 71 | break 72 | 73 | if not found: 74 | short_node=op[1] 75 | requested_node = requested_nodes.pop() 76 | node_names[requested_node]=short_node 77 | 78 | if requested_node: 79 | print("responding: %s=%s" % (requested_node, short_node), file=sys.stderr) 80 | connection.sendall(requested_node.encode('utf-8')) 81 | else: 82 | connection.sendall(b'FAIL') 83 | 84 | print("Active Nodes=%s Known Nodes[%s]=%s" % (active_nodes, len(node_names), node_names), file=sys.stderr) 85 | else: 86 | connection.sendall(b'FAIL') 87 | 88 | connection.close() 89 | except socket.timeout: 90 | print('connection timeout', file=sys.stderr) 91 | except BrokenPipeError: 92 | print('ignoring broken pipe', file=sys.stderr) 93 | except KeyboardInterrupt: 94 | print('shutting down', file=sys.stderr) 95 | break; 96 | 97 | sock.close() 98 | os.unlink(server_address) 99 | 100 | #stop the containers 101 | os.system("make stop") 102 | -------------------------------------------------------------------------------- /grafana/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM grafana/grafana:latest 2 | USER root:root 3 | RUN apk add wget 4 | COPY grafana.ini /etc/grafana/grafana.ini 5 | COPY influxdb.yaml /etc/grafana/provisioning/datasources/influxdb.yaml 6 | COPY dashboards.yaml /etc/grafana/provisioning/dashboards/dashboards.yaml 7 | RUN mkdir -p /var/lib/grafana/dashboards 8 | RUN wget 'https://grafana.com/api/dashboards/11057/revisions/1/download' -O /var/lib/grafana/dashboards/11057.json 9 | -------------------------------------------------------------------------------- /grafana/dashboards.yaml: -------------------------------------------------------------------------------- 1 | - name: 'default' 2 | org_id: 1 3 | folder: '' 4 | type: 'file' 5 | options: 6 | folder: '/var/lib/grafana/dashboards' 7 | -------------------------------------------------------------------------------- /grafana/grafana.ini: -------------------------------------------------------------------------------- 1 | ##################### Grafana Configuration Example ##################### 2 | # 3 | # Everything has defaults so you only need to uncomment things you want to 4 | # change 5 | 6 | # possible values : production, development 7 | ;app_mode = production 8 | 9 | # instance name, defaults to HOSTNAME environment variable value or 
hostname if HOSTNAME var is empty 10 | ;instance_name = ${HOSTNAME} 11 | 12 | #################################### Paths #################################### 13 | [paths] 14 | # Path to where grafana can store temp files, sessions, and the sqlite3 db (if that is used) 15 | ;data = /var/lib/grafana 16 | 17 | # Temporary files in `data` directory older than given duration will be removed 18 | ;temp_data_lifetime = 24h 19 | 20 | # Directory where grafana can store logs 21 | ;logs = /var/log/grafana 22 | 23 | # Directory where grafana will automatically scan and look for plugins 24 | ;plugins = /var/lib/grafana/plugins 25 | 26 | # folder that contains provisioning config files that grafana will apply on startup and while running. 27 | ;provisioning = conf/provisioning 28 | 29 | #################################### Server #################################### 30 | [server] 31 | # Protocol (http, https, h2, socket) 32 | ;protocol = http 33 | 34 | # The ip address to bind to, empty will bind to all interfaces 35 | ;http_addr = 36 | 37 | # The http port to use 38 | ;http_port = 3000 39 | 40 | # The public facing domain name used to access grafana from a browser 41 | ;domain = localhost 42 | 43 | # Redirect to correct domain if host header does not match domain 44 | # Prevents DNS rebinding attacks 45 | ;enforce_domain = false 46 | 47 | # The full public facing url you use in browser, used for redirects and emails 48 | # If you use reverse proxy and sub path specify full url (with sub path) 49 | ;root_url = %(protocol)s://%(domain)s:%(http_port)s/ 50 | 51 | # Serve Grafana from subpath specified in `root_url` setting. By default it is set to `false` for compatibility reasons. 52 | ;serve_from_sub_path = false 53 | 54 | # Log web requests 55 | ;router_logging = false 56 | 57 | # the path relative working path 58 | ;static_root_path = public 59 | 60 | # enable gzip 61 | ;enable_gzip = false 62 | 63 | # https certs & key file 64 | ;cert_file = 65 | ;cert_key = 66 | 67 | # Unix socket path 68 | ;socket = 69 | 70 | # CDN Url 71 | ;cdn_url = 72 | 73 | #################################### Database #################################### 74 | [database] 75 | # You can configure the database connection by specifying type, host, name, user and password 76 | # as separate properties or as on string using the url properties. 77 | 78 | # Either "mysql", "postgres" or "sqlite3", it's your choice 79 | ;type = sqlite3 80 | ;host = 127.0.0.1:3306 81 | ;name = grafana 82 | ;user = root 83 | # If the password contains # or ; you have to wrap it with triple quotes. Ex """#password;""" 84 | ;password = 85 | 86 | # Use either URL or the previous fields to configure the database 87 | # Example: mysql://user:secret@host:port/database 88 | ;url = 89 | 90 | # For "postgres" only, either "disable", "require" or "verify-full" 91 | ;ssl_mode = disable 92 | 93 | ;ca_cert_path = 94 | ;client_key_path = 95 | ;client_cert_path = 96 | ;server_cert_name = 97 | 98 | # For "sqlite3" only, path relative to data_path setting 99 | ;path = grafana.db 100 | 101 | # Max idle conn setting default is 2 102 | ;max_idle_conn = 2 103 | 104 | # Max conn setting default is 0 (mean not set) 105 | ;max_open_conn = 106 | 107 | # Connection Max Lifetime default is 14400 (means 14400 seconds or 4 hours) 108 | ;conn_max_lifetime = 14400 109 | 110 | # Set to true to log the sql calls and execution times. 111 | ;log_queries = 112 | 113 | # For "sqlite3" only. cache mode setting used for connecting to the database. 
(private, shared) 114 | ;cache_mode = private 115 | 116 | ################################### Data sources ######################### 117 | [datasources] 118 | # Upper limit of data sources that Grafana will return. This limit is a temporary configuration and it will be deprecated when pagination will be introduced on the list data sources API. 119 | ;datasource_limit = 5000 120 | 121 | #################################### Cache server ############################# 122 | [remote_cache] 123 | # Either "redis", "memcached" or "database" default is "database" 124 | ;type = database 125 | 126 | # cache connectionstring options 127 | # database: will use Grafana primary database. 128 | # redis: config like redis server e.g. `addr=127.0.0.1:6379,pool_size=100,db=0,ssl=false`. Only addr is required. ssl may be 'true', 'false', or 'insecure'. 129 | # memcache: 127.0.0.1:11211 130 | ;connstr = 131 | 132 | #################################### Data proxy ########################### 133 | [dataproxy] 134 | 135 | # This enables data proxy logging, default is false 136 | ;logging = false 137 | 138 | # How long the data proxy waits before timing out, default is 30 seconds. 139 | # This setting also applies to core backend HTTP data sources where query requests use an HTTP client with timeout set. 140 | ;timeout = 30 141 | 142 | # How many seconds the data proxy waits before sending a keepalive probe request. 143 | ;keep_alive_seconds = 30 144 | 145 | # How many seconds the data proxy waits for a successful TLS Handshake before timing out. 146 | ;tls_handshake_timeout_seconds = 10 147 | 148 | # How many seconds the data proxy will wait for a server's first response headers after 149 | # fully writing the request headers if the request has an "Expect: 100-continue" 150 | # header. A value of 0 will result in the body being sent immediately, without 151 | # waiting for the server to approve. 152 | ;expect_continue_timeout_seconds = 1 153 | 154 | # The maximum number of idle connections that Grafana will keep alive. 155 | ;max_idle_connections = 100 156 | 157 | # How many seconds the data proxy keeps an idle connection open before timing out. 158 | ;idle_conn_timeout_seconds = 90 159 | 160 | # If enabled and user is not anonymous, data proxy will add X-Grafana-User header with username into the request, default is false. 161 | ;send_user_header = false 162 | 163 | #################################### Analytics #################################### 164 | [analytics] 165 | # Server reporting, sends usage counters to stats.grafana.org every 24 hours. 166 | # No ip addresses are being tracked, only simple counters to track 167 | # running instances, dashboard and error counts. It is very helpful to us. 168 | # Change this option to false to disable reporting. 169 | ;reporting_enabled = true 170 | 171 | # The name of the distributor of the Grafana instance. 
Ex hosted-grafana, grafana-labs 172 | ;reporting_distributor = grafana-labs 173 | 174 | # Set to false to disable all checks to https://grafana.net 175 | # for new versions (grafana itself and plugins), check is used 176 | # in some UI views to notify that grafana or plugin update exists 177 | # This option does not cause any auto updates, nor send any information 178 | # only a GET request to http://grafana.com to get latest versions 179 | ;check_for_updates = true 180 | 181 | # Google Analytics universal tracking code, only enabled if you specify an id here 182 | ;google_analytics_ua_id = 183 | 184 | # Google Tag Manager ID, only enabled if you specify an id here 185 | ;google_tag_manager_id = 186 | 187 | #################################### Security #################################### 188 | [security] 189 | # disable creation of admin user on first start of grafana 190 | ;disable_initial_admin_creation = false 191 | 192 | # default admin user, created on startup 193 | ;admin_user = admin 194 | 195 | # default admin password, can be changed before first start of grafana, or in profile settings 196 | ;admin_password = admin 197 | 198 | # used for signing 199 | ;secret_key = SW2YcwTIb9zpOOhoPsMm 200 | 201 | # disable gravatar profile images 202 | ;disable_gravatar = false 203 | 204 | # data source proxy whitelist (ip_or_domain:port separated by spaces) 205 | ;data_source_proxy_whitelist = 206 | 207 | # disable protection against brute force login attempts 208 | ;disable_brute_force_login_protection = false 209 | 210 | # set to true if you host Grafana behind HTTPS. default is false. 211 | ;cookie_secure = false 212 | 213 | # set cookie SameSite attribute. defaults to `lax`. can be set to "lax", "strict", "none" and "disabled" 214 | ;cookie_samesite = lax 215 | 216 | # set to true if you want to allow browsers to render Grafana in a ,